# In this notebook, our objective is to find the optimal preprocessing steps that would maximize performance on both the train set and validation set (Attempt to minimize both bias and variance). Often times, the best preprocessing might differ from dataset to dataset, and it may even be the case that if we use lesser preprocessing steps, we might obtain a better outcome. We use multiple preprocessing steps to see how well our model does.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
import hyperopt
import time

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn import metrics
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from lightgbm import LGBMClassifier

In [3]:
# Read in the preprocessed files
train = pd.read_csv("../datasets/preprocessed_train.csv")
test = pd.read_csv("../datasets/preprocessed_test.csv")
train["Comment"] = train["Comment"].apply(lambda x: x.lower())
test["Comment"] = test["Comment"].apply(lambda x: x.lower())

In [4]:
# Helper function to get train, val and test data
def get_train_test(train, test = None, ngram_range = (1,1), max_features=None, random_state=1, test_size=0.2, min_df=50):
    
    if type(test) != pd.core.frame.DataFrame:
        # To check if we want to split into train val, or train test
        
        # Use only the train data for train val split
        X = train.Processed
        y = train.Outcome
        
        # split into train and test set, using random_state so that it is reproducable
        X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state, test_size=test_size)
        
        # We use tfidf character level analyser
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit transform only on the train set, use it to transform the val set
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(X_train) 
        X_val_dtm =  tfidf_vect_ngram_chars.transform(X_val) 
        print(f"Operation Took {round(start-time.time(), 2)}s")
        print(X_train_dtm.shape, X_val_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train.loc[X_train.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the validation set
        add_var_df = train.loc[X_val.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_val_dtm = hstack((X_val_dtm, var_sparse))
        
        print("X_train: ", X_train_dtm.shape)
        print("X_val: ", X_val_dtm.shape)
        
        return X_train_dtm, X_val_dtm, y_train, y_val
    else:
        # We use tfidf character level analyser
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit on train, transform train and test
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(train.Processed) 
        X_test_dtm =  tfidf_vect_ngram_chars.transform(test.Processed) 
        print(f"Operation Took {time.time()-start}s")
        print(X_train_dtm.shape, X_test_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the test set
        add_var_df = test[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_test_dtm = hstack((X_test_dtm, var_sparse))
        
        print(X_train_dtm.shape, X_test_dtm.shape)
        
        print("X_train: ", X_train_dtm.shape)
        print("X_test: ", X_test_dtm.shape)
        
        return X_train_dtm, X_test_dtm, train.Outcome
    
def train_data(min_df = 50):
    start = time.time()
    # Using ngram_range of 2,5 because it was found to produce one of the best results for the model
    X_train, X_val, y_train, y_val = get_train_test(train, test = None, ngram_range = (2,5), 
                        max_features=None, random_state=1, test_size=0.1, min_df = min_df)
    
    LG = LGBMClassifier(class_weight='balanced')
    %time LG.fit(X_train, y_train)

    from sklearn import metrics
    print("Train")
    y_pred_class = LG.predict(X_train)
    print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
    y_pred_class = LG.predict_proba(X_train)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))

    y_pred_class = LG.predict(X_val)
    print("Validation")
    print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    y_pred_class = LG.predict_proba(X_val)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    print(f"Total Time Elapsed: {round(time.time() - start, 2)}s")
    
    return round(metrics.roc_auc_score(y_val, y_pred_class), 3)

In [5]:
import warnings
# Ignore scipy warnings as it was intended to convert to sparse matrix below
warnings.filterwarnings("ignore", message="Converting data to scipy sparse matrix.")

# Raw

In [6]:
train["Processed"] = train["Comment"]
test["Processed"] = test["Comment"]
train_data()

Fitting...
Operation Took -155.85s
(40013, 96827) (4446, 96827)
X_train:  (40013, 96839)
X_val:  (4446, 96839)
CPU times: user 32min 18s, sys: 13.9 s, total: 32min 32s
Wall time: 8min 56s
Train
Accuracy:  0.8104615999800066
Auroc:  0.8129102903901771
Auroc:  0.8986492561818136
Validation
Accuracy:  0.7361673414304993
Auroc:  0.7384650158757331
Auroc:  0.813580146063742
Total Time Elapsed: 715.02s


0.814

# Remove Space

In [7]:
train["Processed"] = train["Comment"].apply(lambda x: "".join(x.split()))
test["Processed"] = test["Comment"].apply(lambda x: "".join(x.split()))
train_data()

Fitting...
Operation Took -156.31s
(40013, 108120) (4446, 108120)
X_train:  (40013, 108132)
X_val:  (4446, 108132)
CPU times: user 31min 51s, sys: 6.34 s, total: 31min 57s
Wall time: 8min 42s
Train
Accuracy:  0.8065878589458426
Auroc:  0.809124296309362
Auroc:  0.8944642469271319
Validation
Accuracy:  0.728969860548808
Auroc:  0.7311463964626673
Auroc:  0.8053403205352831
Total Time Elapsed: 700.74s


0.805

# Replace Digits

In [8]:
import re
train["Processed"] = train["Comment"].apply(lambda x: re.sub('\d', '1', x))
train_data()

Fitting...
Operation Took -145.74s
(40013, 91675) (4446, 91675)
X_train:  (40013, 91687)
X_val:  (4446, 91687)
CPU times: user 31min 36s, sys: 6.31 s, total: 31min 43s
Wall time: 8min 42s
Train
Accuracy:  0.8099617624272112
Auroc:  0.8124661223169783
Auroc:  0.8985171048213758
Validation
Accuracy:  0.7388663967611336
Auroc:  0.7400325271973915
Auroc:  0.8138027828778709
Total Time Elapsed: 689.41s


0.814

# Remove Stopwords

In [9]:
train["Processed"] = train["Comment"].apply(lambda x: remove_stopwords(x))
train_data()

Fitting...
Operation Took -121.96s
(40013, 90352) (4446, 90352)
X_train:  (40013, 90364)
X_val:  (4446, 90364)
CPU times: user 26min 31s, sys: 5.9 s, total: 26min 37s
Wall time: 7min 8s
Train
Accuracy:  0.8069877289880789
Auroc:  0.8093777962453956
Auroc:  0.8923867985542524
Validation
Accuracy:  0.7307692307692307
Auroc:  0.7332114455075822
Auroc:  0.8113714736396571
Total Time Elapsed: 566.39s


0.811

# Lemmatize

In [10]:
import en_core_web_sm
nlp = en_core_web_sm.load()
# Helper function to lematize 
def lemm_text(text):
    doc = nlp(text)
    lemma_word1 = [] 
    for token in doc:
        lemma_word1.append(token.lemma_)
    return "".join(lemma_word1)

In [11]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(x))
train_data()

Fitting...
Operation Took -146.53s
(40013, 104201) (4446, 104201)
X_train:  (40013, 104213)
X_val:  (4446, 104213)
CPU times: user 30min 48s, sys: 12.7 s, total: 31min 1s
Wall time: 8min 22s
Train
Accuracy:  0.8037137930172694
Auroc:  0.8064239427623381
Auroc:  0.8917881501501606
Validation
Accuracy:  0.7190733243364822
Auroc:  0.7205536578384928
Auroc:  0.8016059069127084
Total Time Elapsed: 671.41s


0.802

# Replace Digits, Remove Stopwords

In [12]:
train["Processed"] = train["Comment"].apply(lambda x: remove_stopwords(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -114.96s
(40013, 85433) (4446, 85433)
X_train:  (40013, 85445)
X_val:  (4446, 85445)
CPU times: user 25min 48s, sys: 5.27 s, total: 25min 54s
Wall time: 6min 54s
Train
Accuracy:  0.8057131432284508
Auroc:  0.8081051451902659
Auroc:  0.8923377545202484
Validation
Accuracy:  0.7350427350427351
Auroc:  0.7369978145789253
Auroc:  0.8108541825576607
Total Time Elapsed: 545.39s


0.811

# Remove Stop Words, Lemmatize

In [13]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(remove_stopwords(x)))
train_data()

Fitting...
Operation Took -125.08s
(40013, 90770) (4446, 90770)
X_train:  (40013, 90782)
X_val:  (4446, 90782)
CPU times: user 22min 53s, sys: 10.2 s, total: 23min 3s
Wall time: 6min 9s
Train
Accuracy:  0.7985904581011172
Auroc:  0.8011010964354499
Auroc:  0.8857950971225945
Validation
Accuracy:  0.7188484030589294
Auroc:  0.7208840121869993
Auroc:  0.8003377824344492
Total Time Elapsed: 508.4s


0.8

# Replace Digits, Lemmatize

In [14]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -139.31s
(40013, 101656) (4446, 101656)
X_train:  (40013, 101668)
X_val:  (4446, 101668)
CPU times: user 30min 13s, sys: 13.1 s, total: 30min 26s
Wall time: 8min 12s
Train
Accuracy:  0.8043385899582636
Auroc:  0.8067754838087855
Auroc:  0.8920877979788671
Validation
Accuracy:  0.725820962663068
Auroc:  0.7276502577298906
Auroc:  0.797522517285732
Total Time Elapsed: 653.76s


0.798

# Remove Digits, Stopwords, Lemmatize

In [15]:
train["Processed"] = train["Comment"].apply(lambda x: lemm_text(remove_stopwords(re.sub('\d', '1', x))))
train_data()

Fitting...
Operation Took -116.54s
(40013, 88374) (4446, 88374)
X_train:  (40013, 88386)
X_val:  (4446, 88386)
CPU times: user 22min 29s, sys: 9.79 s, total: 22min 39s
Wall time: 6min 3s
Train
Accuracy:  0.7986654337340364
Auroc:  0.8014286726104015
Auroc:  0.886496283143996
Validation
Accuracy:  0.7235717498875394
Auroc:  0.7253043406153953
Auroc:  0.7984040109194921
Total Time Elapsed: 493.33s


0.798

# Stem

In [16]:
stemmer = SnowballStemmer("english")
def stem_text(text):
    text = word_tokenize(text)
    text = [stemmer.stem(word) for word in text]
    return "".join(text)

In [17]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(x))
train_data()

Fitting...
Operation Took -142.89s
(40013, 107611) (4446, 107611)
X_train:  (40013, 107623)
X_val:  (4446, 107623)
CPU times: user 31min 47s, sys: 9.35 s, total: 31min 56s
Wall time: 8min 36s
Train
Accuracy:  0.8063129482918051
Auroc:  0.8087654400312615
Auroc:  0.8937823376325827
Validation
Accuracy:  0.7217723796671165
Auroc:  0.7241808683370736
Auroc:  0.8022929534213743
Total Time Elapsed: 681.69s


0.802

# Stem, Remove Digits

In [18]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(re.sub('\d', '1', x)))
train_data()

Fitting...
Operation Took -131.78s
(40013, 105020) (4446, 105020)
X_train:  (40013, 105032)
X_val:  (4446, 105032)
CPU times: user 31min 4s, sys: 6.69 s, total: 31min 11s
Wall time: 8min 23s
Train
Accuracy:  0.8081123634818684
Auroc:  0.8104280916713559
Auroc:  0.8939054299847731
Validation
Accuracy:  0.7235717498875394
Auroc:  0.7248335522320986
Auroc:  0.8023050935064331
Total Time Elapsed: 657.14s


0.802

# Stem, Stopwords Removal

In [19]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(remove_stopwords(x)))
train_data()

Fitting...
Operation Took -123.22s
(40013, 91252) (4446, 91252)
X_train:  (40013, 91264)
X_val:  (4446, 91264)
CPU times: user 21min 57s, sys: 4.83 s, total: 22min 2s
Wall time: 5min 56s
Train
Accuracy:  0.798065628670682
Auroc:  0.80093385886851
Auroc:  0.8867457828006802
Validation
Accuracy:  0.7147998200629779
Auroc:  0.7173557742462705
Auroc:  0.7939500400519925
Total Time Elapsed: 493.39s


0.794

# Stem, Stopwords Removal, Digits Replace

In [20]:
train["Processed"] = train["Comment"].apply(lambda x: stem_text(remove_stopwords(re.sub('\d', '1', x))))
train_data()

Fitting...
Operation Took -112.2s
(40013, 88806) (4446, 88806)
X_train:  (40013, 88818)
X_val:  (4446, 88818)
CPU times: user 21min 49s, sys: 5.08 s, total: 21min 54s
Wall time: 5min 54s
Train
Accuracy:  0.7980906205483218
Auroc:  0.8008415034343286
Auroc:  0.886016343537249
Validation
Accuracy:  0.717948717948718
Auroc:  0.7204399731436627
Auroc:  0.7968336188996844
Total Time Elapsed: 479.95s


0.797