# Count Vectorizer, Preprocessing Variations, Binary Count
## In this notebook, our objective is to find the optimal preprocessing steps that would maximize performance on both the train set and validation set (Attempt to minimize both bias and variance). Often times, the best preprocessing might differ from dataset to dataset, and it may even be the case that if we use lesser preprocessing steps, we might obtain a better outcome. We use multiple preprocessing steps to see how well our model does.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
import hyperopt
import time

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn import metrics
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from lightgbm import LGBMClassifier

In [3]:
# Read in the preprocessed files
train = pd.read_csv("../datasets/preprocessed_train.csv")
test = pd.read_csv("../datasets/preprocessed_test.csv")
train["Comment"] = train["Comment"].apply(lambda x: x.lower())
test["Comment"] = test["Comment"].apply(lambda x: x.lower())

In [4]:
# Helper function to get train, val and test data
def get_train_test(train, test = None, ngram_range = (1,1), max_features=None, random_state=1, test_size=0.2, min_df=50):
    
    if type(test) != pd.core.frame.DataFrame:
        # To check if we want to split into train val, or train test
        
        # Use only the train data for train val split
        X = train.Processed
        y = train.Outcome
        
        # split into train and test set, using random_state so that it is reproducable
        X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state, test_size=test_size)
        
        # We use count vect character level analyser
        # Binary set to true
        count_vect_ngram_chars = CountVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df, binary = True)

        print("Fitting...")
        start = time.time()
        # Fit transform only on the train set, use it to transform the val set
        X_train_dtm =  count_vect_ngram_chars.fit_transform(X_train) 
        X_val_dtm =  count_vect_ngram_chars.transform(X_val) 
        print(f"Operation Took {round(start-time.time(), 2)}s")
        print(X_train_dtm.shape, X_val_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train.loc[X_train.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the validation set
        add_var_df = train.loc[X_val.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_val_dtm = hstack((X_val_dtm, var_sparse))
        
        print("X_train: ", X_train_dtm.shape)
        print("X_val: ", X_val_dtm.shape)
        
        return X_train_dtm, X_val_dtm, y_train, y_val
    else:
        # We use ccount vect character level analyser
        # Binary set to true
        count_vect_ngram_chars = CountVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df, binary = True)

        print("Fitting...")
        start = time.time()
        # Fit on train, transform train and test
        X_train_dtm =  count_vect_ngram_chars.fit_transform(train.Processed) 
        X_test_dtm =  count_vect_ngram_chars.transform(test.Processed) 
        print(f"Operation Took {time.time()-start}s")
        print(X_train_dtm.shape, X_test_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the test set
        add_var_df = test[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_test_dtm = hstack((X_test_dtm, var_sparse))
        
        print(X_train_dtm.shape, X_test_dtm.shape)
        
        print("X_train: ", X_train_dtm.shape)
        print("X_test: ", X_test_dtm.shape)
        
        return X_train_dtm, X_test_dtm, train.Outcome
    
def train_data(min_df = 50):
    start = time.time()
    # Using ngram_range of 2,5 because it was found to produce one of the best results for the model
    X_train, X_val, y_train, y_val = get_train_test(train, test = None, ngram_range = (2,5), 
                        max_features=None, random_state=1, test_size=0.1, min_df = min_df)
    
    LG = LGBMClassifier(class_weight='balanced')
    %time LG.fit(X_train, y_train)

    from sklearn import metrics
    print("Train")
    y_pred_class = LG.predict(X_train)
    print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
    y_pred_class = LG.predict_proba(X_train)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))

    y_pred_class = LG.predict(X_val)
    print("Validation")
    print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    y_pred_class = LG.predict_proba(X_val)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    print(f"Total Time Elapsed: {round(time.time() - start, 2)}s")
    
    return round(metrics.roc_auc_score(y_val, y_pred_class), 3)

In [5]:
import warnings
# Ignore scipy warnings as it was intended to convert to sparse matrix below
warnings.filterwarnings("ignore", message="Converting data to scipy sparse matrix.")

# Raw

In [6]:
train["Processed"] = train["Comment"]
test["Processed"] = test["Comment"]
train_data()

Fitting...
Operation Took -134.14s
(40013, 96827) (4446, 96827)
X_train:  (40013, 96839)
X_val:  (4446, 96839)
CPU times: user 13min 44s, sys: 2.12 s, total: 13min 46s
Wall time: 3min 56s
Train
Accuracy:  0.7929422937545297
Auroc:  0.7952545917127827
Auroc:  0.8816791574120229
Validation
Accuracy:  0.7343679712100765
Auroc:  0.7361645726391701
Auroc:  0.8135503102614787
Total Time Elapsed: 391.76s


0.814

# Remove Space

In [7]:
train["Processed"] = train["Comment"].apply(lambda x: "".join(x.split()))
test["Processed"] = test["Comment"].apply(lambda x: "".join(x.split()))
train_data()

Fitting...
Operation Took -137.05s
(40013, 108120) (4446, 108120)
X_train:  (40013, 108132)
X_val:  (4446, 108132)
CPU times: user 13min 9s, sys: 1.65 s, total: 13min 11s
Wall time: 3min 47s
Train
Accuracy:  0.7890935445980056
Auroc:  0.7920127079635708
Auroc:  0.8777334719765177
Validation
Accuracy:  0.7285200179937023
Auroc:  0.7322778935429769
Auroc:  0.806675524127596
Total Time Elapsed: 386.28s


0.807

# Replace Digits

In [8]:
import re
train["Processed"] = train["Comment"].apply(lambda x: re.sub('\d', '1', x))
train_data()

Fitting...
Operation Took -126.89s
(40013, 91675) (4446, 91675)
X_train:  (40013, 91687)
X_val:  (4446, 91687)
CPU times: user 12min 43s, sys: 1.65 s, total: 12min 45s
Wall time: 3min 38s
Train
Accuracy:  0.7945167820458351
Auroc:  0.7968764841613839
Auroc:  0.882032042460955
Validation
Accuracy:  0.7379667116509222
Auroc:  0.7392942454144945
Auroc:  0.814373366875634
Total Time Elapsed: 365.53s


0.814

# Remove Stopwords

In [9]:
train["Processed"] = train["Comment"].apply(lambda x: remove_stopwords(x))
train_data()

Fitting...
Operation Took -107.27s
(40013, 90352) (4446, 90352)
X_train:  (40013, 90364)
X_val:  (4446, 90364)
CPU times: user 10min 39s, sys: 1.42 s, total: 10min 41s
Wall time: 2min 58s
Train
Accuracy:  0.7887936420663284
Auroc:  0.7914470682097328
Auroc:  0.8768147136549023
Validation
Accuracy:  0.7285200179937023
Auroc:  0.7318071051596803
Auroc:  0.8109424553795289
Total Time Elapsed: 300.52s


0.811

# Lemmatize

In [10]:
import en_core_web_sm
nlp = en_core_web_sm.load()
# Helper function to lematize 
def lemm_text(text):
    doc = nlp(text)
    lemma_word1 = [] 
    for token in doc:
        lemma_word1.append(token.lemma_)
    return "".join(lemma_word1)

In [11]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(x))
train_data()

Fitting...
Operation Took -126.27s
(40013, 104201) (4446, 104201)
X_train:  (40013, 104213)
X_val:  (4446, 104213)
CPU times: user 12min 42s, sys: 1.74 s, total: 12min 44s
Wall time: 3min 36s
Train
Accuracy:  0.7866693324669483
Auroc:  0.7896420944481894
Auroc:  0.8747179555579423
Validation
Accuracy:  0.7240215924426451
Auroc:  0.726585633999481
Auroc:  0.8023022128082836
Total Time Elapsed: 362.95s


0.802

# Replace Digits, Remove Stopwords

In [12]:
train["Processed"] = train["Comment"].apply(lambda x: remove_stopwords(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -103.55s
(40013, 85433) (4446, 85433)
X_train:  (40013, 85445)
X_val:  (4446, 85445)
CPU times: user 10min 30s, sys: 1.42 s, total: 10min 31s
Wall time: 2min 55s
Train
Accuracy:  0.7877189913278184
Auroc:  0.790294802464962
Auroc:  0.8767804549173518
Validation
Accuracy:  0.728969860548808
Auroc:  0.7310875479147553
Auroc:  0.8087432481065069
Total Time Elapsed: 294.2s


0.809

# Remove Stop Words, Lemmatize

In [13]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(remove_stopwords(x)))
train_data()

Fitting...
Operation Took -108.58s
(40013, 90770) (4446, 90770)
X_train:  (40013, 90782)
X_val:  (4446, 90782)
CPU times: user 9min 1s, sys: 1.17 s, total: 9min 2s
Wall time: 2min 31s
Train
Accuracy:  0.7820458351035914
Auroc:  0.7847888748251318
Auroc:  0.869847581183626
Validation
Accuracy:  0.7161493477282951
Auroc:  0.719198803769517
Auroc:  0.7987066899893476
Total Time Elapsed: 272.61s


0.799

# Replace Digits, Lemmatize

In [14]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -118.01s
(40013, 101656) (4446, 101656)
X_train:  (40013, 101668)
X_val:  (4446, 101668)
CPU times: user 12min 21s, sys: 1.68 s, total: 12min 23s
Wall time: 3min 30s
Train
Accuracy:  0.7861944867917927
Auroc:  0.7891246649137829
Auroc:  0.8746406303611769
Validation
Accuracy:  0.725820962663068
Auroc:  0.7288860772360443
Auroc:  0.800884292026248
Total Time Elapsed: 348.07s


0.801

# Remove Digits, Stopwords, Lemmatize

In [15]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(remove_stopwords(re.sub('\d', '1', x))))
train_data()

Fitting...
Operation Took -97.08s
(40013, 88374) (4446, 88374)
X_train:  (40013, 88386)
X_val:  (4446, 88386)
CPU times: user 8min 45s, sys: 1.4 s, total: 8min 46s
Wall time: 2min 27s
Train
Accuracy:  0.781870891960113
Auroc:  0.7847798031256425
Auroc:  0.8702119911490469
Validation
Accuracy:  0.7123256860098965
Auroc:  0.7152225143844576
Auroc:  0.796711189228329
Total Time Elapsed: 256.48s


0.797

# Stem

In [16]:
stemmer = SnowballStemmer("english")
def stem_text(text):
    text = word_tokenize(text)
    text = [stemmer.stem(word) for word in text]
    return "".join(text)

In [17]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(x))
train_data()

Fitting...
Operation Took -121.85s
(40013, 107611) (4446, 107611)
X_train:  (40013, 107623)
X_val:  (4446, 107623)
CPU times: user 12min 3s, sys: 1.64 s, total: 12min 5s
Wall time: 3min 25s
Train
Accuracy:  0.7877189913278184
Auroc:  0.7909758208343514
Auroc:  0.8768177290939043
Validation
Accuracy:  0.7228969860548808
Auroc:  0.7253538268943213
Auroc:  0.8019341007376026
Total Time Elapsed: 346.66s


0.802

# Stem, Remove Digits

In [18]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -117.24s
(40013, 105020) (4446, 105020)
X_train:  (40013, 105032)
X_val:  (4446, 105032)
CPU times: user 12min 18s, sys: 1.82 s, total: 12min 20s
Wall time: 3min 29s
Train
Accuracy:  0.7891185364756454
Auroc:  0.7919712697906522
Auroc:  0.8772379716384561
Validation
Accuracy:  0.7192982456140351
Auroc:  0.7221653055710849
Auroc:  0.8001145283278596
Total Time Elapsed: 346.33s


0.8

# Stem, Stopwords Removal

In [19]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(remove_stopwords(x)))
train_data()

Fitting...
Operation Took -102.74s
(40013, 91252) (4446, 91252)
X_train:  (40013, 91264)
X_val:  (4446, 91264)
CPU times: user 8min 21s, sys: 1.23 s, total: 8min 22s
Wall time: 2min 21s
Train
Accuracy:  0.7825456726563866
Auroc:  0.7858949672947465
Auroc:  0.8707028147645824
Validation
Accuracy:  0.7147998200629779
Auroc:  0.7175323198900067
Auroc:  0.7949173373377781
Total Time Elapsed: 255.45s


0.795

# Stem, Stopwords Removal, Digits Replace

In [20]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(remove_stopwords(re.sub('\d', '1', x))))
train_data()

Fitting...
Operation Took -93.4s
(40013, 88806) (4446, 88806)
X_train:  (40013, 88818)
X_val:  (4446, 88818)
CPU times: user 8min 17s, sys: 1.02 s, total: 8min 18s
Wall time: 2min 19s
Train
Accuracy:  0.7803963711793667
Auroc:  0.7837049996430462
Auroc:  0.8703654536423642
Validation
Accuracy:  0.711650922177238
Auroc:  0.7143892724447023
Auroc:  0.7930860363712833
Total Time Elapsed: 244.77s


0.793