In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import string
import hyperopt
import time

In [2]:
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn import metrics
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, StratifiedKFold
from gensim.parsing.preprocessing import remove_stopwords
from lightgbm import LGBMClassifier

In [3]:
import warnings
# Ignore scipy warnings as it was intended to convert to sparse matrix below
warnings.filterwarnings("ignore", message="Converting data to scipy sparse matrix.")

In [4]:
train = pd.read_csv("../datasets/preprocessed_train.csv")
test = pd.read_csv("../datasets/preprocessed_test.csv")

In [5]:
import re
# Lower and place with digits since this was found to be one of the better combinations
train["Processed"] = train["Comment"].apply(lambda x: re.sub('\d', '1', x.lower()))
test["Processed"] = test["Comment"].apply(lambda x: re.sub('\d', '1', x.lower()))

In [6]:
# Helper function to get train, val and test data
def get_train_test(train, test = None, ngram_range = (1,1), max_features=None, random_state=1, test_size=0.1, min_df=5):
    
    if type(test) != pd.core.frame.DataFrame:
        # To check if we want to split into train val, or train test
        
        # Use only the train data for train val split
        X = train.Processed
        y = train.Outcome
        
        # split into train and test set, using random_state so that it is reproducable
        X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state, test_size=test_size)
        
        # We use tfidf character level analyser
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit transform only on the train set, use it to transform the val set
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(X_train) 
        X_val_dtm =  tfidf_vect_ngram_chars.transform(X_val) 
        print(f"Operation Took {round(start-time.time(), 2)}s")
        print(X_train_dtm.shape, X_val_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train.loc[X_train.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the validation set
        add_var_df = train.loc[X_val.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_val_dtm = hstack((X_val_dtm, var_sparse))
        
        print("X_train: ", X_train_dtm.shape)
        print("X_val: ", X_val_dtm.shape)
        
        return X_train_dtm, X_val_dtm, y_train, y_val
    else:
        # We use tfidf character level analyser
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit on train, transform train and test
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(train.Processed) 
        X_test_dtm =  tfidf_vect_ngram_chars.transform(test.Processed) 
        print(f"Operation Took {time.time()-start}s")
        print(X_train_dtm.shape, X_test_dtm.shape)

        # Next, we need to add in the other variables from EDA, need to use scipy to maintain the sparse matrix or we will run out of memory
        add_var_df = train[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            # Stacks horizontally, effectively increasing columns of features to include our EDA
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        # Repeat the same for the test set
        add_var_df = test[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_test_dtm = hstack((X_test_dtm, var_sparse))
        
        print(X_train_dtm.shape, X_test_dtm.shape)
        
        print("X_train: ", X_train_dtm.shape)
        print("X_test: ", X_test_dtm.shape)
        
        return X_train_dtm, X_test_dtm, train.Outcome

In [7]:
start = time.time()
# ngram range set to 2,5 because this was found to yield the best results, default to 50 min df since it helps to reduce the dimensionality of
# data while not affecting performance too much
X_train, X_val, y_train, y_val = get_train_test(train, test = None, ngram_range = (2,5))

LG = LGBMClassifier(class_weight='balanced')
%time LG.fit(X_train, y_train)

from sklearn import metrics
print("Train")
y_pred_class = LG.predict(X_train)

print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_class))
print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
y_pred_class = LG.predict_proba(X_train)[:, 1]
print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))

y_pred_class = LG.predict(X_val)
print("Validation")
print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
y_pred_class = LG.predict_proba(X_val)[:, 1]
print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
print(f"Total Time Elapsed: {round(time.time() - start, 2)}s")

Fitting...
Operation Took -165.83s
(40013, 441293) (4446, 441293)
X_train:  (40013, 441305)
X_val:  (4446, 441305)
CPU times: user 32min 24s, sys: 16.1 s, total: 32min 40s
Wall time: 9min 3s
Train
Accuracy:  0.8101117136930498
Auroc:  0.8125293615047017
Auroc:  0.8982312929842518
Validation
Accuracy:  0.7341430499325237
Auroc:  0.7357298958648195
Auroc:  0.810908915822502
Total Time Elapsed: 745.45s


In [8]:
# Set maximum number of evaluations
max_evals = 40

# Define spacing to run optimiser on, including regularization that is neccessary due to the nature of the dimentionality of the data
lg_space ={'boosting_type': hp.choice('boosting_type', ['gbdt', 'dart', 'goss']),
           'num_leaves': hp.choice('num_leaves', [20, 25, 30, 35, 40]),
           'max_depth': hp.choice('max_depth', [-1, 50, 100, 150, 200]),
           'learning_rate': 0.1,
           'n_estimators': hp.choice('n_estimators', [50, 100, 150, 200]),
           'objective': 'binary',
           'class_weight': 'balanced',
           'min_child_samples': hp.choice('min_child_weight', [10, 15, 20, 25, 30]),
           'colsample_bytree': hp.uniform ('colsample_bytree', 0.5,0.8),
           'reg_alpha': hp.uniform ('reg_alpha', 0,0.1),
           'reg_lambda':hp.uniform ('reg_lambda', 1.5,3),
           'random_state':1234,
           'metric':'auc' # optimise for auc, the metric we're to be measured against
#            'device':'gpu',
#            'gpu_platform_id':0,
#            'gpu_device_id':0
          }

def lg_objective(space):
    model = LGBMClassifier(boosting_type=space['boosting_type'],
                          num_leaves=space['num_leaves'],
                          max_depth=space['max_depth'],
                          learning_rate=space['learning_rate'],
                          n_estimators=space['n_estimators'],
                          objective=space['objective'],
                          class_weight=space['class_weight'],
                          min_child_samples=space['min_child_samples'],
                          colsample_bytree=space['colsample_bytree'],
                          reg_alpha=space['reg_alpha'],
                          reg_lambda=space['reg_lambda'],
                          random_state=space['random_state'],
                          metric=space['metric'])
#                           device='gpu', 
#                           gpu_platform_id=0,
#                           gpu_device_id=0) # GPU variables
    
    auc = cross_val_score(model, X_train, y_train, cv = 2, scoring='roc_auc').mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -auc, 'status': STATUS_OK }

In [9]:
# Run optimisations
lg_trials = Trials()
lg_best = fmin(fn= lg_objective,
            space= lg_space,
            algo= tpe.suggest,
            max_evals = max_evals,
            trials= lg_trials,
            rstate=np.random.RandomState(1234))

100%|██████████| 40/40 [6:37:31<00:00, 596.28s/trial, best loss: -0.8070446587150484]


In [10]:
# Find optimal params
lg_params = hyperopt.space_eval(lg_space, lg_best)
lg_params

{'boosting_type': 'gbdt',
 'class_weight': 'balanced',
 'colsample_bytree': 0.6370495458782991,
 'learning_rate': 0.1,
 'max_depth': 200,
 'metric': 'auc',
 'min_child_samples': 20,
 'n_estimators': 200,
 'num_leaves': 25,
 'objective': 'binary',
 'random_state': 1234,
 'reg_alpha': 0.0720812229772364,
 'reg_lambda': 1.87246159415014}

In [11]:
start = time.time()

# Use optimal params for prediction
LG = LGBMClassifier(**lg_params)
%time LG.fit(X_train, y_train)

from sklearn import metrics
print("Train")
y_pred_class = LG.predict(X_train)

print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_class))
print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
y_pred_class = LG.predict_proba(X_train)[:, 1]
print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))

y_pred_class = LG.predict(X_val)
print("Validation")
print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))

y_pred_class = LG.predict_proba(X_val)[:, 1]
print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
print(f"Total Time Elapsed: {round(time.time() - start, 2)}s")

CPU times: user 44min 11s, sys: 8.96 s, total: 44min 20s
Wall time: 12min 6s
Train
Accuracy:  0.8401019668607702
Auroc:  0.842641819662503
Auroc:  0.9227164749266271
Validation
Accuracy:  0.7458389563652722
Auroc:  0.7463868329048982
Auroc:  0.8187717814216781
Total Time Elapsed: 750.34s


In [12]:
start = time.time()
X_train, X_test, y_train = get_train_test(train, test = test, ngram_range = (2,5))

Fitting...
Operation Took 293.12916135787964s
(44459, 470340) (27924, 470340)
(44459, 470352) (27924, 470352)
X_train:  (44459, 470352)
X_test:  (27924, 470352)


In [13]:
LG = LGBMClassifier(**lg_params)
%time LG.fit(X_train, y_train)

CPU times: user 47min 30s, sys: 12.5 s, total: 47min 43s
Wall time: 13min


LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=0.6370495458782991, importance_type='split',
               learning_rate=0.1, max_depth=200, metric='auc',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=200, n_jobs=-1, num_leaves=25, objective='binary',
               random_state=1234, reg_alpha=0.0720812229772364,
               reg_lambda=1.87246159415014, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [14]:
y_pred_class = LG.predict_proba(X_test)[:, 1]

In [15]:
test["Outcome"] = y_pred_class

In [16]:
test[["Id", "Outcome"]].to_csv("submission.csv", index=False)