# In this notebook, our objective is to find the optimal preprocessing steps that would maximize performance on both the train set and validation set (Attempt to minimize both bias and variance). Often times, the best preprocessing might differ from dataset to dataset, and it may even be the case that if we use lesser preprocessing steps, we might obtain a better outcome. We use multiple preprocessing steps to see how well our model does.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
import hyperopt
import time

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn import metrics
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from lightgbm import LGBMClassifier

In [3]:
# Read in the preprocessed files
train = pd.read_csv("../datasets/preprocessed_train.csv")
test = pd.read_csv("../datasets/preprocessed_test.csv")
train["Comment"] = train["Comment"].apply(lambda x: x.lower())
test["Comment"] = test["Comment"].apply(lambda x: x.lower())

In [4]:
# Helper function to get train, val and test data
def get_train_test(train, test = None, ngram_range = (1,1), max_features=None, random_state=1, test_size=0.2, min_df=50):
    
    if type(test) != pd.core.frame.DataFrame:
        # To check if we want to split into train val, or train test
        
        # Use only the train data for train val split
        X = train.Processed
        y = train.Outcome
        
        # split into train and test set, using random_state so that it is reproducable
        X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state, test_size=test_size)
        
        # We use tfidf character level analyser
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df, binary = True)

        print("Fitting...")
        start = time.time()
        # Fit transform only on the train set, use it to transform the val set
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(X_train) 
        X_val_dtm =  tfidf_vect_ngram_chars.transform(X_val) 
        print(f"Operation Took {round(start-time.time(), 2)}s")
        print(X_train_dtm.shape, X_val_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train.loc[X_train.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the validation set
        add_var_df = train.loc[X_val.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_val_dtm = hstack((X_val_dtm, var_sparse))
        
        print("X_train: ", X_train_dtm.shape)
        print("X_val: ", X_val_dtm.shape)
        
        return X_train_dtm, X_val_dtm, y_train, y_val
    else:
        # We use tfidf character level analyser
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df, binary = True)

        print("Fitting...")
        start = time.time()
        # Fit on train, transform train and test
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(train.Processed) 
        X_test_dtm =  tfidf_vect_ngram_chars.transform(test.Processed) 
        print(f"Operation Took {time.time()-start}s")
        print(X_train_dtm.shape, X_test_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the test set
        add_var_df = test[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_test_dtm = hstack((X_test_dtm, var_sparse))
        
        print(X_train_dtm.shape, X_test_dtm.shape)
        
        print("X_train: ", X_train_dtm.shape)
        print("X_test: ", X_test_dtm.shape)
        
        return X_train_dtm, X_test_dtm, train.Outcome
    
def train_data(min_df = 50):
    start = time.time()
    # Using ngram_range of 2,5 because it was found to produce one of the best results for the model
    X_train, X_val, y_train, y_val = get_train_test(train, test = None, ngram_range = (2,5), 
                        max_features=None, random_state=1, test_size=0.1, min_df = min_df)
    
    LG = LGBMClassifier(class_weight='balanced')
    %time LG.fit(X_train, y_train)

    from sklearn import metrics
    print("Train")
    y_pred_class = LG.predict(X_train)
    print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
    y_pred_class = LG.predict_proba(X_train)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))

    y_pred_class = LG.predict(X_val)
    print("Validation")
    print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    y_pred_class = LG.predict_proba(X_val)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    print(f"Total Time Elapsed: {round(time.time() - start, 2)}s")
    
    return round(metrics.roc_auc_score(y_val, y_pred_class), 3)

In [5]:
import warnings
# Ignore scipy warnings as it was intended to convert to sparse matrix below
warnings.filterwarnings("ignore", message="Converting data to scipy sparse matrix.")

# Raw

In [6]:
train["Processed"] = train["Comment"]
test["Processed"] = test["Comment"]
train_data()

Fitting...
Operation Took -150.42s
(40013, 96827) (4446, 96827)
X_train:  (40013, 96839)
X_val:  (4446, 96839)
CPU times: user 32min 28s, sys: 14.4 s, total: 32min 42s
Wall time: 8min 55s
Train
Accuracy:  0.8099117786719316
Auroc:  0.8118934389240573
Auroc:  0.8967581748843243
Validation
Accuracy:  0.7327935222672065
Auroc:  0.7349461402039905
Auroc:  0.8127328092794694
Total Time Elapsed: 708.35s


0.813

# Remove Space

In [7]:
train["Processed"] = train["Comment"].apply(lambda x: "".join(x.split()))
test["Processed"] = test["Comment"].apply(lambda x: "".join(x.split()))
train_data()

Fitting...
Operation Took -144.01s
(40013, 108120) (4446, 108120)
X_train:  (40013, 108132)
X_val:  (4446, 108132)
CPU times: user 32min 6s, sys: 6.47 s, total: 32min 12s
Wall time: 8min 45s
Train
Accuracy:  0.8050633544098168
Auroc:  0.8072222231275308
Auroc:  0.8929065480476218
Validation
Accuracy:  0.7215474583895637
Auroc:  0.7239815857543714
Auroc:  0.802298920581827
Total Time Elapsed: 691.37s


0.802

# Replace Digits

In [8]:
import re
train["Processed"] = train["Comment"].apply(lambda x: re.sub('\d', '1', x))
train_data()

Fitting...
Operation Took -139.35s
(40013, 91675) (4446, 91675)
X_train:  (40013, 91687)
X_val:  (4446, 91687)
CPU times: user 31min 39s, sys: 6.48 s, total: 31min 46s
Wall time: 8min 38s
Train
Accuracy:  0.8099867543048509
Auroc:  0.8119091468737744
Auroc:  0.8968964484997057
Validation
Accuracy:  0.7305443094916779
Auroc:  0.7325413745415831
Auroc:  0.8095990212210743
Total Time Elapsed: 679.66s


0.81

# Remove Stopwords

In [9]:
train["Processed"] = train["Comment"].apply(lambda x: remove_stopwords(x))
train_data()

Fitting...
Operation Took -119.86s
(40013, 90352) (4446, 90352)
X_train:  (40013, 90364)
X_val:  (4446, 90364)
CPU times: user 26min 42s, sys: 5.53 s, total: 26min 47s
Wall time: 7min 8s
Train
Accuracy:  0.8042136305700647
Auroc:  0.8062698330156995
Auroc:  0.8916762514730533
Validation
Accuracy:  0.7402159244264508
Auroc:  0.7417578596248138
Auroc:  0.8102819524466696
Total Time Elapsed: 564.46s


0.81

# Lemmatize

In [10]:
import en_core_web_sm
nlp = en_core_web_sm.load()
# Helper function to lematize 
def lemm_text(text):
    doc = nlp(text)
    lemma_word1 = [] 
    for token in doc:
        lemma_word1.append(token.lemma_)
    return "".join(lemma_word1)

In [11]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(x))
train_data()

Fitting...
Operation Took -143.38s
(40013, 104201) (4446, 104201)
X_train:  (40013, 104213)
X_val:  (4446, 104213)
CPU times: user 31min 12s, sys: 13.6 s, total: 31min 25s
Wall time: 8min 29s
Train
Accuracy:  0.8025641666458401
Auroc:  0.8045303980948557
Auroc:  0.8905412077431216
Validation
Accuracy:  0.7154745838956366
Auroc:  0.7170120452277841
Auroc:  0.798011001386233
Total Time Elapsed: 675.67s


0.798

# Replace Digits, Remove Stopwords

In [12]:
train["Processed"] = train["Comment"].apply(lambda x: remove_stopwords(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -118.4s
(40013, 85433) (4446, 85433)
X_train:  (40013, 85445)
X_val:  (4446, 85445)
CPU times: user 26min 2s, sys: 5.81 s, total: 26min 8s
Wall time: 6min 57s
Train
Accuracy:  0.8050883462874566
Auroc:  0.807193514269928
Auroc:  0.8918406766608569
Validation
Accuracy:  0.7343679712100765
Auroc:  0.7354583900642252
Auroc:  0.807373681900553
Total Time Elapsed: 552.39s


0.807

# Remove Stop Words, Lemmatize

In [13]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(remove_stopwords(x)))
train_data()

Fitting...
Operation Took -123.67s
(40013, 90770) (4446, 90770)
X_train:  (40013, 90782)
X_val:  (4446, 90782)
CPU times: user 23min 12s, sys: 10.9 s, total: 23min 23s
Wall time: 6min 14s
Train
Accuracy:  0.7982155799365206
Auroc:  0.8001569632453978
Auroc:  0.8854646400358018
Validation
Accuracy:  0.7215474583895637
Auroc:  0.722981160439866
Auroc:  0.7992211003731944
Total Time Elapsed: 512.76s


0.799

# Replace Digits, Lemmatize

In [14]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -139.2s
(40013, 101656) (4446, 101656)
X_train:  (40013, 101668)
X_val:  (4446, 101668)
CPU times: user 30min 55s, sys: 13.1 s, total: 31min 8s
Wall time: 8min 24s
Train
Accuracy:  0.8025641666458401
Auroc:  0.8046895145363019
Auroc:  0.8899540677569042
Validation
Accuracy:  0.7244714349977508
Auroc:  0.725042197083787
Auroc:  0.7993285092613417
Total Time Elapsed: 666.37s


0.799

# Remove Digits, Stopwords, Lemmatize

In [15]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(remove_stopwords(re.sub('\d', '1', x))))
train_data()

Fitting...
Operation Took -116.68s
(40013, 88374) (4446, 88374)
X_train:  (40013, 88386)
X_val:  (4446, 88386)
CPU times: user 22min 45s, sys: 10.6 s, total: 22min 56s
Wall time: 6min 7s
Train
Accuracy:  0.7975907829955264
Auroc:  0.799652670415162
Auroc:  0.8845944701326174
Validation
Accuracy:  0.7208726945569051
Auroc:  0.7218536757605506
Auroc:  0.7977056473823816
Total Time Elapsed: 498.42s


0.798

# Stem

In [16]:
stemmer = SnowballStemmer("english")
def stem_text(text):
    text = word_tokenize(text)
    text = [stemmer.stem(word) for word in text]
    return "".join(text)

In [17]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(x))
train_data()

Fitting...
Operation Took -144.56s
(40013, 107611) (4446, 107611)
X_train:  (40013, 107623)
X_val:  (4446, 107623)
CPU times: user 31min 34s, sys: 8.87 s, total: 31min 43s
Wall time: 8min 34s
Train
Accuracy:  0.8048134356334191
Auroc:  0.8068983045684058
Auroc:  0.8923512706925427
Validation
Accuracy:  0.7233468286099866
Auroc:  0.7248108152931327
Auroc:  0.8028919328723256
Total Time Elapsed: 681.42s


0.803

# Stem, Remove Digits

In [18]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -138.49s
(40013, 105020) (4446, 105020)
X_train:  (40013, 105032)
X_val:  (4446, 105032)
CPU times: user 30min 34s, sys: 6.8 s, total: 30min 41s
Wall time: 8min 17s
Train
Accuracy:  0.8046634843675805
Auroc:  0.8067014075698676
Auroc:  0.8918943174625651
Validation
Accuracy:  0.7197480881691408
Auroc:  0.7212103541345116
Auroc:  0.8015594042140087
Total Time Elapsed: 657.98s


0.802

# Stem, Stopwords Removal

In [19]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(remove_stopwords(x)))
train_data()

Fitting...
Operation Took -133.52s
(40013, 91252) (4446, 91252)
X_train:  (40013, 91264)
X_val:  (4446, 91264)
CPU times: user 22min 6s, sys: 4.65 s, total: 22min 10s
Wall time: 5min 55s
Train
Accuracy:  0.7973408642191288
Auroc:  0.7999270296758744
Auroc:  0.8843194966158119
Validation
Accuracy:  0.7199730094466936
Auroc:  0.7234104873462248
Auroc:  0.7969309453443083
Total Time Elapsed: 502.54s


0.797

# Stem, Stopwords Removal, Digits Replace

In [20]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(remove_stopwords(re.sub('\d', '1', x))))
train_data()

Fitting...
Operation Took -111.76s
(40013, 88806) (4446, 88806)
X_train:  (40013, 88818)
X_val:  (4446, 88818)
CPU times: user 21min 43s, sys: 4.7 s, total: 21min 48s
Wall time: 5min 49s
Train
Accuracy:  0.7973908479744083
Auroc:  0.7996659429156177
Auroc:  0.884612264522452
Validation
Accuracy:  0.7222222222222222
Auroc:  0.7238732509275334
Auroc:  0.7959654999359044
Total Time Elapsed: 474.71s


0.796