# Count Vectorizer, Preprocessing Variations
## In this notebook, our objective is to find the optimal preprocessing steps that would maximize performance on both the train set and validation set using count vectorizer (Attempt to minimize both bias and variance). Often times, the best preprocessing might differ from dataset to dataset, and it may even be the case that if we use lesser preprocessing steps, we might obtain a better outcome. We use multiple preprocessing steps to see how well our model does.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
import hyperopt
import time

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn import metrics
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from lightgbm import LGBMClassifier

In [3]:
# Read in the preprocessed files
train = pd.read_csv("../datasets/preprocessed_train.csv")
test = pd.read_csv("../datasets/preprocessed_test.csv")
train["Comment"] = train["Comment"].apply(lambda x: x.lower())
test["Comment"] = test["Comment"].apply(lambda x: x.lower())

In [4]:
# Helper function to get train, val and test data
def get_train_test(train, test = None, ngram_range = (1,1), max_features=None, random_state=1, test_size=0.2, min_df=50):
    
    if type(test) != pd.core.frame.DataFrame:
        # To check if we want to split into train val, or train test
        
        # Use only the train data for train val split
        X = train.Processed
        y = train.Outcome
        
        # split into train and test set, using random_state so that it is reproducable
        X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state, test_size=test_size)
        
        # We use count vect character level analyser
        count_vect_ngram_chars = CountVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit transform only on the train set, use it to transform the val set
        X_train_dtm =  count_vect_ngram_chars.fit_transform(X_train) 
        X_val_dtm =  count_vect_ngram_chars.transform(X_val) 
        print(f"Operation Took {round(start-time.time(), 2)}s")
        print(X_train_dtm.shape, X_val_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train.loc[X_train.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the validation set
        add_var_df = train.loc[X_val.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_val_dtm = hstack((X_val_dtm, var_sparse))
        
        print("X_train: ", X_train_dtm.shape)
        print("X_val: ", X_val_dtm.shape)
        
        return X_train_dtm, X_val_dtm, y_train, y_val
    else:
        # We use count vect character level analyser
        count_vect_ngram_chars = CountVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit on train, transform train and test
        X_train_dtm =  count_vect_ngram_chars.fit_transform(train.Processed) 
        X_test_dtm =  count_vect_ngram_chars.transform(test.Processed) 
        print(f"Operation Took {time.time()-start}s")
        print(X_train_dtm.shape, X_test_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the test set
        add_var_df = test[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_test_dtm = hstack((X_test_dtm, var_sparse))
        
        print(X_train_dtm.shape, X_test_dtm.shape)
        
        print("X_train: ", X_train_dtm.shape)
        print("X_test: ", X_test_dtm.shape)
        
        return X_train_dtm, X_test_dtm, train.Outcome
    
def train_data(min_df = 50):
    start = time.time()
    # Using ngram_range of 2,5 because it was found to produce one of the best results for the model
    X_train, X_val, y_train, y_val = get_train_test(train, test = None, ngram_range = (2,5), 
                        max_features=None, random_state=1, test_size=0.1, min_df = min_df)
    
    LG = LGBMClassifier(class_weight='balanced')
    %time LG.fit(X_train, y_train)

    from sklearn import metrics
    print("Train")
    y_pred_class = LG.predict(X_train)
    print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
    y_pred_class = LG.predict_proba(X_train)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))

    y_pred_class = LG.predict(X_val)
    print("Validation")
    print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    y_pred_class = LG.predict_proba(X_val)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    print(f"Total Time Elapsed: {round(time.time() - start, 2)}s")
    
    return round(metrics.roc_auc_score(y_val, y_pred_class), 3)

In [5]:
import warnings
# Ignore scipy warnings as it was intended to convert to sparse matrix below
warnings.filterwarnings("ignore", message="Converting data to scipy sparse matrix.")

# Raw

In [6]:
train["Processed"] = train["Comment"]
test["Processed"] = test["Comment"]
train_data()

Fitting...
Operation Took -149.19s
(40013, 96827) (4446, 96827)
X_train:  (40013, 96839)
X_val:  (4446, 96839)
CPU times: user 15min 39s, sys: 3.86 s, total: 15min 43s
Wall time: 4min 32s
Train
Accuracy:  0.7958913353160223
Auroc:  0.7990208217230683
Auroc:  0.8849861586527137
Validation
Accuracy:  0.7345928924876294
Auroc:  0.7365992494135207
Auroc:  0.8148530031175327
Total Time Elapsed: 443.18s


0.815

# Remove Space

In [7]:
train["Processed"] = train["Comment"].apply(lambda x: "".join(x.split()))
test["Processed"] = test["Comment"].apply(lambda x: "".join(x.split()))
train_data()

Fitting...
Operation Took -143.35s
(40013, 108120) (4446, 108120)
X_train:  (40013, 108132)
X_val:  (4446, 108132)
CPU times: user 14min 53s, sys: 2.27 s, total: 14min 56s
Wall time: 4min 15s
Train
Accuracy:  0.7904431059905531
Auroc:  0.7934347247792324
Auroc:  0.8798952104995476
Validation
Accuracy:  0.7181736392262708
Auroc:  0.7211688926575737
Auroc:  0.8038123159311044
Total Time Elapsed: 420.91s


0.804

# Replace Digits

In [8]:
import re
train["Processed"] = train["Comment"].apply(lambda x: re.sub('\d', '1', x))
train_data()

Fitting...
Operation Took -142.97s
(40013, 91675) (4446, 91675)
X_train:  (40013, 91687)
X_val:  (4446, 91687)
CPU times: user 14min 45s, sys: 2.25 s, total: 14min 47s
Wall time: 4min 13s
Train
Accuracy:  0.7967660510334141
Auroc:  0.799772657220535
Auroc:  0.8852668762254632
Validation
Accuracy:  0.7341430499325237
Auroc:  0.7356710473169075
Auroc:  0.8116025467840811
Total Time Elapsed: 416.98s


0.812

# Remove Stopwords

In [9]:
train["Processed"] = train["Comment"].apply(lambda x: remove_stopwords(x))
train_data()

Fitting...
Operation Took -124.57s
(40013, 90352) (4446, 90352)
X_train:  (40013, 90364)
X_val:  (4446, 90364)
CPU times: user 12min 33s, sys: 1.83 s, total: 12min 34s
Wall time: 3min 30s
Train
Accuracy:  0.7918676430160198
Auroc:  0.7951906824275033
Auroc:  0.8791654882214287
Validation
Accuracy:  0.7278452541610436
Auroc:  0.730915014672013
Auroc:  0.8101237198125982
Total Time Elapsed: 350.52s


0.81

# Lemmatize

In [10]:
import en_core_web_sm
nlp = en_core_web_sm.load()
# Helper function to lematize 
def lemm_text(text):
    doc = nlp(text)
    lemma_word1 = [] 
    for token in doc:
        lemma_word1.append(token.lemma_)
    return "".join(lemma_word1)

In [11]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(x))
train_data()

Fitting...
Operation Took -145.81s
(40013, 104201) (4446, 104201)
X_train:  (40013, 104213)
X_val:  (4446, 104213)
CPU times: user 14min 13s, sys: 2.9 s, total: 14min 16s
Wall time: 4min 3s
Train
Accuracy:  0.7893934471296828
Auroc:  0.792565618402093
Auroc:  0.8775167360291246
Validation
Accuracy:  0.7199730094466936
Auroc:  0.7218804251005108
Auroc:  0.8009717417915021
Total Time Elapsed: 411.45s


0.801

# Replace Digits, Remove Stopwords

In [12]:
train["Processed"] = train["Comment"].apply(lambda x: remove_stopwords(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -112.98s
(40013, 85433) (4446, 85433)
X_train:  (40013, 85445)
X_val:  (4446, 85445)
CPU times: user 11min 53s, sys: 1.69 s, total: 11min 54s
Wall time: 3min 18s
Train
Accuracy:  0.7909929272986279
Auroc:  0.7941460726777758
Auroc:  0.8795049802031099
Validation
Accuracy:  0.7307692307692307
Auroc:  0.7333879911513183
Auroc:  0.8108805203693137
Total Time Elapsed: 326.75s


0.811

# Remove Stop Words, Lemmatize

In [13]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(remove_stopwords(x)))
train_data()

Fitting...
Operation Took -125.98s
(40013, 90770) (4446, 90770)
X_train:  (40013, 90782)
X_val:  (4446, 90782)
CPU times: user 10min 45s, sys: 1.86 s, total: 10min 47s
Wall time: 3min 1s
Train
Accuracy:  0.7839702096818534
Auroc:  0.7871799402763369
Auroc:  0.8730315002251199
Validation
Accuracy:  0.7134502923976608
Auroc:  0.7163366243937931
Auroc:  0.7970118106566488
Total Time Elapsed: 321.58s


0.797

# Replace Digits, Lemmatize

In [14]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -135.86s
(40013, 101656) (4446, 101656)
X_train:  (40013, 101668)
X_val:  (4446, 101668)
CPU times: user 14min 12s, sys: 3.24 s, total: 14min 15s
Wall time: 4min 2s
Train
Accuracy:  0.7880688776147752
Auroc:  0.7910003288909877
Auroc:  0.8773255399363092
Validation
Accuracy:  0.7174988753936122
Auroc:  0.7208064391011152
Auroc:  0.8000359264212079
Total Time Elapsed: 400.13s


0.8

# Remove Digits, Stopwords, Lemmatize

In [15]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(remove_stopwords(re.sub('\d', '1', x))))
train_data()

Fitting...
Operation Took -112.4s
(40013, 88374) (4446, 88374)
X_train:  (40013, 88386)
X_val:  (4446, 88386)
CPU times: user 10min 27s, sys: 1.93 s, total: 10min 29s
Wall time: 2min 55s
Train
Accuracy:  0.7838952340489341
Auroc:  0.7868396347860698
Auroc:  0.8731406982060164
Validation
Accuracy:  0.7170490328385065
Auroc:  0.7200547826482382
Auroc:  0.7992169850901236
Total Time Elapsed: 301.85s


0.799

# Stem

In [16]:
stemmer = SnowballStemmer("english")
def stem_text(text):
    text = word_tokenize(text)
    text = [stemmer.stem(word) for word in text]
    return "".join(text)

In [17]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(x))
train_data()

Fitting...
Operation Took -145.17s
(40013, 107611) (4446, 107611)
X_train:  (40013, 107623)
X_val:  (4446, 107623)
CPU times: user 13min 59s, sys: 2.23 s, total: 14min 1s
Wall time: 3min 58s
Train
Accuracy:  0.7912428460750256
Auroc:  0.7944445326062695
Auroc:  0.8798651576394603
Validation
Accuracy:  0.7183985605038237
Auroc:  0.7217212665277484
Auroc:  0.7993692505637424
Total Time Elapsed: 404.75s


0.799

# Stem, Remove Digits

In [18]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -131.37s
(40013, 105020) (4446, 105020)
X_train:  (40013, 105032)
X_val:  (4446, 105032)
CPU times: user 14min, sys: 2.47 s, total: 14min 2s
Wall time: 3min 58s
Train
Accuracy:  0.7905180816234724
Auroc:  0.7937623009541839
Auroc:  0.8791270451815256
Validation
Accuracy:  0.7195231668915879
Auroc:  0.7223057396058752
Auroc:  0.8002814030563796
Total Time Elapsed: 390.66s


0.8

# Stem, Stopwords Removal

In [19]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(remove_stopwords(x)))
train_data()

Fitting...
Operation Took -118.75s
(40013, 91252) (4446, 91252)
X_train:  (40013, 91264)
X_val:  (4446, 91264)
CPU times: user 9min 51s, sys: 1.64 s, total: 9min 53s
Wall time: 2min 46s
Train
Accuracy:  0.7821458026141505
Auroc:  0.7856096440704238
Auroc:  0.8722938281532686
Validation
Accuracy:  0.7165991902834008
Auroc:  0.7194796718390973
Auroc:  0.7961928693255648
Total Time Elapsed: 297.63s


0.796

# Stem, Stopwords Removal, Digits Replace

In [20]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(remove_stopwords(re.sub('\d', '1', x))))
train_data()

Fitting...
Operation Took -107.88s
(40013, 88806) (4446, 88806)
X_train:  (40013, 88818)
X_val:  (4446, 88818)
CPU times: user 9min 50s, sys: 2.71 s, total: 9min 52s
Wall time: 2min 46s
Train
Accuracy:  0.7825956564116662
Auroc:  0.7858120909489095
Auroc:  0.8725080207758469
Validation
Accuracy:  0.7159244264507423
Auroc:  0.7185875813514302
Auroc:  0.795761381895594
Total Time Elapsed: 293.61s


0.796