In this notebook, our objective is to find the optimal ngram and min_df range.

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
import hyperopt
import time

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn import metrics
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from lightgbm import LGBMClassifier

In [3]:
# Read in the preprocessed files
train = pd.read_csv("../datasets/preprocessed_train.csv")
test = pd.read_csv("../datasets/preprocessed_test.csv")

In [4]:
train.head()

Unnamed: 0,Comment,Outcome,Id,num_numbers,prop_numbers,num_words,num_punctuation,prop_punctuation,nchar,word_density,pos_tags,noun_count,verb_count,adj_count,adv_count,pron_count,lda
0,combining lindelof's and gregg lind's ideas: l...,1,15086,3,0.006897,80,51,0.117241,435,5.37037,"[('combining', 'VBG'), ('lindelof', 'NN'), (""'...",30,9,9,6,1,[0.00076923 0.00076923 0.00076923 0.00076923 0...
1,in most cases r is an interpreted language tha...,1,41061,0,0.0,39,4,0.017094,234,5.85,"[('in', 'IN'), ('most', 'JJS'), ('cases', 'NNS...",10,8,7,1,0,[0.00131579 0.00131579 0.00131579 0.00131579 0...
2,"i don't know r at all, but a bit of creative g...",1,34417,12,0.013423,164,49,0.05481,894,5.418182,"[('i', 'NNS'), ('do', 'VBP'), (""n't"", 'RB'), (...",49,30,14,10,4,[3.18471347e-04 3.18471350e-04 3.18471338e-04 ...
3,if you don't want to modify the list in-place ...,1,30549,12,0.021164,102,92,0.162257,567,5.504854,"[('if', 'IN'), ('you', 'PRP'), ('do', 'VBP'), ...",54,18,8,6,2,[6.57894739e-04 6.57894745e-04 6.57894738e-04 ...
4,i assume it helps if the matrix is sparse? yes...,1,8496,0,0.0,23,14,0.084848,165,6.875,"[('i', 'NN'), ('assume', 'VBP'), ('it', 'PRP')...",9,6,0,2,1,[0.00172414 0.00172414 0.00172414 0.00172414 0...


In [5]:
import re
# Lowering and replacing only digits was found to be one of the best preprocessing steps for character level model
# We replace all digits by 1 to represent all digits, and lowercase the text
train["Comment"] = train["Comment"].apply(lambda x: re.sub('\d', '1', x.lower()))
test["Comment"] = test["Comment"].apply(lambda x: re.sub('\d', '1', x.lower()))

In [6]:
# Helper function to get train, val and test data
def get_train_test(train, test = None, ngram_range = (1,1), max_features=None, random_state=1, test_size=0.2, min_df=1):
    
    if type(test) != pd.core.frame.DataFrame:
        # To check if we want to split into train val, or train test
        
        # Use only the train data for train val split
        X = train.Comment
        y = train.Outcome
        
        # split into train and test set, using random_state so that it is reproducable
        X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state, test_size=test_size)
        
        # We use tfidf character level analyser
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit transform only on the train set, use it to transform the val set
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(X_train) 
        X_val_dtm =  tfidf_vect_ngram_chars.transform(X_val) 
        print(f"Operation Took {round(start-time.time(), 2)}s")
        print(X_train_dtm.shape, X_val_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train.loc[X_train.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the validation set
        add_var_df = train.loc[X_val.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_val_dtm = hstack((X_val_dtm, var_sparse))
        
        print("X_train: ", X_train_dtm.shape)
        print("X_val: ", X_val_dtm.shape)
        
        return X_train_dtm, X_val_dtm, y_train, y_val
    else:
        # We use tfidf character level analyser
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit on train, transform train and test
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(train.Comment) 
        X_test_dtm =  tfidf_vect_ngram_chars.transform(test.Comment) 
        print(f"Operation Took {time.time()-start}s")
        print(X_train_dtm.shape, X_test_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the test set
        add_var_df = test[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_test_dtm = hstack((X_test_dtm, var_sparse))
        
        print(X_train_dtm.shape, X_test_dtm.shape)
        
        print("X_train: ", X_train_dtm.shape)
        print("X_test: ", X_test_dtm.shape)
        
        return X_train_dtm, X_test_dtm, train.Outcome

In [7]:
def train_data(train = train, test = None, ngram_range = (2,2), min_df = 1):
    start = time.time()
    
    # Vary only the character ngram_range for this notebook
    X_train, X_val, y_train, y_val = get_train_test(train = train, test = test, ngram_range = ngram_range, 
                        max_features=None, random_state=1, test_size=0.1, min_df = min_df)
    
    LG = LGBMClassifier(class_weight='balanced')
    %time LG.fit(X_train, y_train)
    
    # Metrics for train and validation sets
    from sklearn import metrics
    print("Train")
    y_pred_class = LG.predict(X_train)
    print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_class))
    # Predict using binary outcomes for additional datapoint
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
    y_pred_class = LG.predict_proba(X_train)[:, 1]
    # Actual AUROC best done using proba
    print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
    
    y_pred_class = LG.predict(X_val)
    print("Validation")
    print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    y_pred_class = LG.predict_proba(X_val)[:, 1]
    print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
    print(f"Total Time Elapsed: {round(time.time() - start, 2)}s")
    print(round(metrics.roc_auc_score(y_val, y_pred_class), 3))

In [8]:
import warnings
# Ignore scipy warnings as it was intended to convert to sparse matrix below
warnings.filterwarnings("ignore", message="Converting data to scipy sparse matrix.")

# First, we optimise for the best ngram range to use

In [9]:
train_data(ngram_range = (1,1))

Fitting...
Operation Took -12.4s
(40013, 1163) (4446, 1163)
X_train:  (40013, 1175)
X_val:  (4446, 1175)
CPU times: user 7.5 s, sys: 60.4 ms, total: 7.56 s
Wall time: 2.13 s
Train
Accuracy:  0.743983205458226
Auroc:  0.7460519603391796
Auroc:  0.8274411947132776
Validation
Accuracy:  0.6675663517768781
Auroc:  0.6692684858001128
Auroc:  0.7386151208257397
Total Time Elapsed: 15.44s
0.739


In [10]:
train_data(ngram_range = (2,2))

Fitting...
Operation Took -25.71s
(40013, 7121) (4446, 7121)
X_train:  (40013, 7133)
X_val:  (4446, 7133)
CPU times: user 1min 36s, sys: 653 ms, total: 1min 37s
Wall time: 25.7 s
Train
Accuracy:  0.7912928298303051
Auroc:  0.7933687696658086
Auroc:  0.8764090736529251
Validation
Accuracy:  0.7147998200629779
Auroc:  0.7158257120005564
Auroc:  0.7866412973182964
Total Time Elapsed: 54.64s
0.787


In [11]:
train_data(ngram_range = (3,3))

Fitting...
Operation Took -30.85s
(40013, 86113) (4446, 86113)
X_train:  (40013, 86125)
X_val:  (4446, 86125)
CPU times: user 5min 47s, sys: 2.16 s, total: 5min 49s
Wall time: 1min 32s
Train
Accuracy:  0.8014395321520505
Auroc:  0.8039956399391809
Auroc:  0.8888623265432805
Validation
Accuracy:  0.7206477732793523
Auroc:  0.7208305135070793
Auroc:  0.7959212606428937
Total Time Elapsed: 129.16s
0.796


In [12]:
train_data(ngram_range = (4,4))

Fitting...
Operation Took -38.12s
(40013, 475254) (4446, 475254)
X_train:  (40013, 475266)
X_val:  (4446, 475266)
CPU times: user 10min 34s, sys: 3.9 s, total: 10min 38s
Wall time: 2min 51s
Train
Accuracy:  0.7980906205483218
Auroc:  0.8012042889208257
Auroc:  0.8868079076973918
Validation
Accuracy:  0.7251461988304093
Auroc:  0.7269347128859595
Auroc:  0.8080123738331372
Total Time Elapsed: 221.02s
0.808


In [13]:
train_data(ngram_range = (5,5))

Fitting...
Operation Took -48.44s
(40013, 1359094) (4446, 1359094)
X_train:  (40013, 1359106)
X_val:  (4446, 1359106)
CPU times: user 13min 39s, sys: 4.49 s, total: 13min 43s
Wall time: 3min 46s
Train
Accuracy:  0.7915177567290631
Auroc:  0.794765200938423
Auroc:  0.8812606472219484
Validation
Accuracy:  0.7235717498875394
Auroc:  0.7257751289986919
Auroc:  0.8028956366270893
Total Time Elapsed: 288.25s
0.803


In [14]:
train_data(ngram_range = (2,5))

Fitting...
Operation Took -153.94s
(40013, 1927582) (4446, 1927582)
X_train:  (40013, 1927594)
X_val:  (4446, 1927594)
CPU times: user 31min 16s, sys: 14.1 s, total: 31min 31s
Wall time: 8min 45s
Train
Accuracy:  0.8095868842626146
Auroc:  0.812094808316132
Auroc:  0.8986435933498158
Validation
Accuracy:  0.7393162393162394
Auroc:  0.7416080633210376
Auroc:  0.8130581224062142
Total Time Elapsed: 714.82s
0.813


### Ngram boosts performance from (1,1) to (4,4), with slight decrease when using (5,5). (1,1) does poorly so we stick to using (2,5) for range

# After finding the optimal ngram range to use, we then optimise for min_df

In [15]:
train_data(ngram_range = (2,5), min_df = 3)

Fitting...
Operation Took -146.44s
(40013, 646000) (4446, 646000)
X_train:  (40013, 646012)
X_val:  (4446, 646012)
CPU times: user 31min 20s, sys: 6.53 s, total: 31min 27s
Wall time: 8min 39s
Train
Accuracy:  0.8093119736085772
Auroc:  0.811793233956952
Auroc:  0.8982669325288872
Validation
Accuracy:  0.7341430499325237
Auroc:  0.7352591074815228
Auroc:  0.8137719182548401
Total Time Elapsed: 698.73s
0.814


In [16]:
train_data(ngram_range = (2,5), min_df = 5)

Fitting...
Operation Took -146.3s
(40013, 441293) (4446, 441293)
X_train:  (40013, 441305)
X_val:  (4446, 441305)
CPU times: user 31min 30s, sys: 6.33 s, total: 31min 37s
Wall time: 8min 41s
Train
Accuracy:  0.8101117136930498
Auroc:  0.8125293615047017
Auroc:  0.8982312929842518
Validation
Accuracy:  0.7341430499325237
Auroc:  0.7357298958648195
Auroc:  0.810908915822502
Total Time Elapsed: 699.02s
0.811


In [17]:
train_data(ngram_range = (2,5), min_df = 10)

Fitting...
Operation Took -144.55s
(40013, 273048) (4446, 273048)
X_train:  (40013, 273060)
X_val:  (4446, 273060)
CPU times: user 31min 17s, sys: 5.92 s, total: 31min 23s
Wall time: 8min 37s
Train
Accuracy:  0.8124609501911879
Auroc:  0.8147251506289197
Auroc:  0.8989429086679926
Validation
Accuracy:  0.7348178137651822
Auroc:  0.7366219863524868
Auroc:  0.8130891927933986
Total Time Elapsed: 691.35s
0.813


In [18]:
train_data(ngram_range = (2,5), min_df = 20)

Fitting...
Operation Took -145.16s
(40013, 171098) (4446, 171098)
X_train:  (40013, 171110)
X_val:  (4446, 171110)
CPU times: user 31min 18s, sys: 6.33 s, total: 31min 24s
Wall time: 8min 36s
Train
Accuracy:  0.8110364131657212
Auroc:  0.813293790521199
Auroc:  0.8981564501943418
Validation
Accuracy:  0.7368421052631579
Auroc:  0.7377681955697742
Auroc:  0.8092483991034445
Total Time Elapsed: 687.17s
0.809


In [19]:
train_data(ngram_range = (2,5), min_df = 50)

Fitting...
Operation Took -141.61s
(40013, 91675) (4446, 91675)
X_train:  (40013, 91687)
X_val:  (4446, 91687)
CPU times: user 31min 7s, sys: 5.59 s, total: 31min 13s
Wall time: 8min 32s
Train
Accuracy:  0.8099617624272112
Auroc:  0.8124661223169783
Auroc:  0.8985171048213758
Validation
Accuracy:  0.7388663967611336
Auroc:  0.7400325271973915
Auroc:  0.8138027828778709
Total Time Elapsed: 673.25s
0.814


### The min df doesn't seem to matter much! min_df of 3 and min_df of 50 does the best with 0.814 AUC. We can use min_df of 50 to reduce the dimensionality of the data