In [1]:
import pandas as pd
import numpy as np
import re
import os
import unicodedata
from numpy import hstack
from nltk import word_tokenize, sent_tokenize
import nltk
import swifter
import string

from matplotlib import pyplot as plt

from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

import ast
from Regex import *

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Uyen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# CONSTANT VARIABLE

In [2]:
PATH_TO_TRAIN = 'data/PRICE_MAY_NOV_2019.csv'
PREDICTION_THRESHOLD = 0.5

from tag_id_by_name import TAG_ID


In [3]:
def special_pattern_sub(doc):
    doc = re.sub(PATTERN_HTMLTAG,'', doc)
    doc = re.sub(PATTERN_NOT_PUNC_WSPACE_ALNUM,'',doc)
    doc = re.sub(PATTERN_URL,'LINK',doc)
    doc = re.sub(PATTERN_NUMBER,'NUM',doc)
    doc = re.sub(PATTERN_PHONENB,'PNUM',doc)
    doc = re.sub(PATTERN_EMAIL, 'EMAIL', doc)
    doc = re.sub(PATTERN_LINEBRK,'',doc)

    doc = re.sub(r"(\b\D+)\.(\D+\b)",r"\1 . \2",doc)
    return doc

def preprocess(doc):
    doc = '' if not doc else doc 
    preprocessed_doc = doc.lower() 
    preprocessed_doc = special_pattern_sub(doc)
    
    preprocessed_doc = __tokenize_single_doc(preprocessed_doc) 
    preprocessed_doc = unicodedata.normalize('NFKC',preprocessed_doc) 
    return doc

def get_right_elem(text_list):
    no_element = len(text_list)
    if no_element > 1:
        return text_list[1]
    elif no_element == 1: 
        return text_list[0]
    else:
        return np.nan

def __tokenize_single_doc(doc):
    return " ".join(doc.split())

In [4]:
#DATA EXPLAINATION

{
    'search_text': 'A List that usually includes 3 items, the main content usually is at 2nd position.\n ',
    'topic_id' : 'An int that determine the topic that the text grouped into based on keyword matching',
    'sentiment': '1: Positive; 0: Neutral; -1: Negative; 10:undetermined',
    'mention_type':'1: Post; 2: Comment; 3: Share',
    'tags' : 'labeled list of tag_id; Use'
}

{'search_text': 'A List that usually includes 3 items, the main content usually is at 2nd position.\n ',
 'topic_id': 'An int that determine the topic that the text grouped into based on keyword matching',
 'sentiment': '1: Positive; 0: Neutral; -1: Negative; 10:undetermined',
 'mention_type': '1: Post; 2: Comment; 3: Share',
 'tags': 'labeled list of tag_id; Use'}

In [6]:
train_df = pd.read_csv(PATH_TO_TRAIN)
train_df = train_df.sample(frac=0.2)

In [7]:
def preprocess_text(df):
    df.fillna({'search_text':'[]'}, inplace=True)
    df['list_text'] = df.search_text.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(ast.literal_eval)
    df['mention'] = df.list_text.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(get_right_elem)
    df['mention'] = df.mention.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(preprocess)
    df['mention'] = df.mention.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(lambda x: unicodedata.normalize('NFKC',x))

df = train_df.copy()
df = preprocess_text(df)

In [14]:
df.drop_duplicates(subset = ['mention'], inplace = True)
# test_df.drop_duplicates(subset = ['mention'], inplace = True)


print('Train Label Distribution: %s\n'%df.label.value_counts())
# print(('Test Label Distribution: %s\n'%test_df.label.value_counts()))


X_train, X_eval, y_train, y_eval = train_test_split(df.mention, df.label, test_size = 0.3)



Train Label Distribution: 0    543315
1      6020
Name: label, dtype: int64



In [None]:
#BASELINE MODEL


In [33]:
text_clf = Pipeline([
                     ('tfidf',TfidfVectorizer()),
                     ('clf',LogisticRegression())
])

In [26]:
param_grid = [
              {
                  'tfidf__ngram_range':[(1, 3)],
                  'tfidf__min_df':[0, 0.2],
                  'tfidf__max_df':[1, 0.8],
                  'tfidf__max_features':[500, 5000],
                  'clf__class_weight':['balanced',{1:10,0:1},{1:5,0:1}],
              }
]

In [28]:
#This step take a long time to run, possibly use RandomizedSearchCV
GridCV = GridSearchCV(text_clf, param_grid, verbose = 1, n_jobs = -1, scoring = 'roc_auc', return_train_score=True)

In [29]:
GridCV.fit(X_train, y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  1.3min finished


GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'clf__class_weight': ['balanced'],
                          'tfidf__max_df': [1], 'tfidf__max_features': [500],
                          'tfidf__min_df': [0],
                          'tfidf__ngram_range': [(1, 3)]}],
             verbose=1)

In [30]:
best_clf = GridCV.best_estimator_
best_clf

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=1, max_features=500, min_df=0,
                                 ngram_range=(1, 3))),
                ('clf', LogisticRegression(class_weight='balanced'))])

In [31]:
pd.DataFrame(GridCV.best_params_)

Unnamed: 0,clf__class_weight,tfidf__max_df,tfidf__max_features,tfidf__min_df,tfidf__ngram_range
0,balanced,1,500,0,1
1,balanced,1,500,0,3


In [33]:
from meilibs import utils

utils.dump_pkl('model/best_cls', GridCV.best_estimator_)

In [32]:
print(classification_report(y_eval, best_clf.predict(X_eval)))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99    163003
           1       0.00      0.00      0.00      1798

    accuracy                           0.99    164801
   macro avg       0.49      0.50      0.50    164801
weighted avg       0.98      0.99      0.98    164801

