In [12]:
import pandas as pd
import numpy as np
import re
import os
import unicodedata
from numpy import hstack
from nltk import word_tokenize, sent_tokenize
import nltk
import swifter
import string

from matplotlib import pyplot as plt

from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

import ast
from Regex import *

nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Uyen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# CONSTANT VARIABLE

In [2]:
PATH_TO_TRAIN = 'data/PRICE_MAY_NOV_2019.csv'
PREDICTION_THRESHOLD = 0.5

from tag_id_by_name import TAG_ID


In [3]:
def special_pattern_sub(doc):
    doc = re.sub(PATTERN_HTMLTAG,'', doc)
    doc = re.sub(PATTERN_NOT_PUNC_WSPACE_ALNUM,'',doc)
    doc = re.sub(PATTERN_URL,'LINK',doc)
    doc = re.sub(PATTERN_NUMBER,'NUM',doc)
    doc = re.sub(PATTERN_PHONENB,'PNUM',doc)
    doc = re.sub(PATTERN_EMAIL, 'EMAIL', doc)
    doc = re.sub(PATTERN_LINEBRK,'',doc)

    doc = re.sub(r"(\b\D+)\.(\D+\b)",r"\1 . \2",doc)
    return doc

def preprocess(doc):
    doc = '' if not doc else doc 
    preprocessed_doc = doc.lower() 
    preprocessed_doc = special_pattern_sub(doc)
    
    preprocessed_doc = __tokenize_single_doc(preprocessed_doc) 
    preprocessed_doc = unicodedata.normalize('NFKC',preprocessed_doc) 
    return doc

def get_right_elem(text_list):
    no_element = len(text_list)
    if no_element > 1:
        return text_list[1]
    elif no_element == 1: 
        return text_list[0]
    else:
        return np.nan

def __tokenize_single_doc(doc):
    return " ".join(doc.split())

In [4]:
#DATA EXPLAINATION

{
    'search_text': 'A List that usually includes 3 items, the main content usually is at 2nd position.\n ',
    'topic_id' : 'An int that determine the topic that the text grouped into based on keyword matching',
    'sentiment': '1: Positive; 0: Neutral; -1: Negative; 10:undetermined',
    'mention_type':'1: Post; 2: Comment; 3: Share',
    'tags' : 'labeled list of tag_id; Use'
}

{'search_text': 'A List that usually includes 3 items, the main content usually is at 2nd position.\n ',
 'topic_id': 'An int that determine the topic that the text grouped into based on keyword matching',
 'sentiment': '1: Positive; 0: Neutral; -1: Negative; 10:undetermined',
 'mention_type': '1: Post; 2: Comment; 3: Share',
 'tags': 'labeled list of tag_id; Use'}

In [5]:
train_df = pd.read_csv(PATH_TO_TRAIN)


In [15]:
def preprocess_text(df):
    df.fillna({'search_text':'[]'}, inplace=True)
    df['list_text'] = df.search_text.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(ast.literal_eval)
    df['mention'] = df.list_text.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(get_right_elem)
    df['mention'] = df.mention.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(preprocess)
    df['mention'] = df.mention.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(lambda x: unicodedata.normalize('NFKC',x))

df = train_df.copy()

In [16]:
df.fillna({'search_text':'[]'}, inplace=True)
df['list_text'] = df.search_text.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(ast.literal_eval)


Dask Apply: 100%|██████████| 20/20 [00:43<00:00,  2.17s/it]


In [17]:
df['mention'] = df.list_text.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(get_right_elem)

Dask Apply: 100%|██████████| 20/20 [00:13<00:00,  1.49it/s]


In [18]:
df['mention_1'] = df.mention.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(preprocess)

Dask Apply: 100%|██████████| 20/20 [01:08<00:00,  3.41s/it]


In [19]:
df['mention_2'] = df.mention_1.swifter.set_npartitions(20).allow_dask_on_strings(enable=True).apply(lambda x: unicodedata.normalize('NFKC',x))

Dask Apply: 100%|██████████| 20/20 [00:10<00:00,  1.82it/s]


In [20]:
df.drop_duplicates(subset = ['mention_2'], inplace = True)
# test_df.drop_duplicates(subset = ['mention'], inplace = True)


print('Train Label Distribution: %s\n'%df.label.value_counts())
# print(('Test Label Distribution: %s\n'%test_df.label.value_counts()))


X_train, X_eval, y_train, y_eval = train_test_split(df.mention_2, df.label, test_size = 0.3)



Train Label Distribution: 0    2686429
1      29261
Name: label, dtype: int64



#BASELINE MODEL


In [22]:
text_clf = Pipeline([
                     ('tfidf',TfidfVectorizer()),
                     ('clf',LogisticRegression())
])

In [23]:
param_grid = [
              {
                  'tfidf__ngram_range':[(1,2)],
                  'tfidf__min_df':[0, 0.05],
                  'tfidf__max_df':[1, 0.9],
                  'tfidf__max_features':[3000, 4000, 5000],
                  'clf__class_weight':['balanced'],
                  'clf__max_iter':[200]
              }
]

In [24]:
#This step take a long time to run, possibly use RandomizedSearchCV
GridCV = GridSearchCV(text_clf, param_grid, verbose = 1, n_jobs = -1, scoring = 'roc_auc', return_train_score=True)

In [25]:
GridCV.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed: 16.7min
[Parallel(n_jobs=-1)]: Done  60 out of  60 | elapsed: 39.9min finished


GridSearchCV(estimator=Pipeline(steps=[('tfidf', TfidfVectorizer()),
                                       ('clf', LogisticRegression())]),
             n_jobs=-1,
             param_grid=[{'clf__class_weight': ['balanced'],
                          'clf__max_iter': [200], 'tfidf__max_df': [1, 0.9],
                          'tfidf__max_features': [3000, 4000, 5000],
                          'tfidf__min_df': [0, 0.05],
                          'tfidf__ngram_range': [(1, 2)]}],
             return_train_score=True, scoring='roc_auc', verbose=1)

In [27]:
best_clf = GridCV.best_estimator_
best_clf

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(max_df=0.9, max_features=5000, min_df=0,
                                 ngram_range=(1, 2))),
                ('clf',
                 LogisticRegression(class_weight='balanced', max_iter=200))])

In [30]:
pd.DataFrame(GridCV.best_params_)

Unnamed: 0,clf__class_weight,clf__max_iter,tfidf__max_df,tfidf__max_features,tfidf__min_df,tfidf__ngram_range
0,balanced,200,0.9,5000,0,1
1,balanced,200,0.9,5000,0,2


In [33]:
from meilibs import utils

utils.dump_pkl('model/best_cls', GridCV.best_estimator_)

In [None]:
print(classification_report(test_df.label, best_clf.predict(test_df.mention)))