I attempted to use pseudo labelling which does the following:
1. Train on train set
2. Predict values on test set, keep the predicted y values
3. Concatenate train and test set with the y from train and predicted y from test set 
<br><br>
My score did not improve, it deproved from 0.815 on the public leaderboard to 0.811 so I decided not to use it!

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
import warnings
# Ignore scipy warnings as it was intended to convert to sparse matrix below
warnings.filterwarnings("ignore", message="Converting data to scipy sparse matrix.")

In [3]:
import matplotlib.pyplot as plt
import string
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from scipy.sparse import coo_matrix, hstack, vstack
from sklearn import metrics
from xgboost import XGBClassifier
from hyperopt import hp, fmin, tpe, STATUS_OK, Trials
import hyperopt
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from gensim.parsing.preprocessing import remove_stopwords
from lightgbm import LGBMClassifier
import time

In [4]:
train = pd.read_csv("../datasets/preprocessed_train.csv")
test = pd.read_csv("../datasets/preprocessed_test.csv")

In [5]:
import re
# We will just lowercase everything, and sub all digits with 1 as per the results of the hyperparameter tuning notebook
train["Comment"] = train["Comment"].apply(lambda x: re.sub('\d', '1', x.lower()))
test["Comment"] = test["Comment"].apply(lambda x: re.sub('\d', '1', x.lower()))

In [6]:
# Helper function to split into train and test sets
def get_train_test(train, test = None, ngram_range = (1,1), max_features=None, random_state=1, test_size=0.1, min_df=5):
    
    if type(test) != pd.core.frame.DataFrame:
        # Just to check if test is provided, then we'll do train, test instead
        # of train val split
        X = train.Comment
        y = train.Outcome
        
        # We split by using test_size for y_val
        X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=random_state, test_size=test_size)
        
        # We're using tfidf vectorizer for our analysis, character level model
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)
        
        print("Fitting...")
        start = time.time()
        # Fit transform the training, ONLY on training
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(X_train) 
        # Transform the x_val
        X_val_dtm =  tfidf_vect_ngram_chars.transform(X_val) 
        print(f"Operation Took {round(start-time.time(), 2)}s")
        print(X_train_dtm.shape, X_val_dtm.shape)

        # Adding in additional variables from EDA
        add_var_df = train.loc[X_train.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        add_var_df = train.loc[X_val.index].reset_index()[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_val_dtm = hstack((X_val_dtm, var_sparse))
        
        print("X_train: ", X_train_dtm.shape)
        print("X_val: ", X_val_dtm.shape)
        
        return X_train_dtm, X_val_dtm, y_train, y_val
    else:
        # We're using tfidf vectorizer for our analysis, character level model
        tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=ngram_range, max_features=max_features, min_df=min_df)

        print("Fitting...")
        start = time.time()
        # Fit transform the training, ONLY on training
        X_train_dtm =  tfidf_vect_ngram_chars.fit_transform(train.Comment)
        # Transform the test comment
        X_test_dtm =  tfidf_vect_ngram_chars.transform(test.Comment) 
        print(f"Operation Took {time.time()-start}s")
        print(X_train_dtm.shape, X_test_dtm.shape)

        # Add in additional variables from EDA
        add_var_df = train[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]

        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_train_dtm = hstack((X_train_dtm, var_sparse))

        add_var_df = test[['num_numbers', 'prop_numbers', 'num_words',
               'num_punctuation', 'prop_punctuation', 'nchar', 'word_density', 'noun_count', 'verb_count', 'adj_count', 'adv_count', 'pron_count']]
        for column in add_var_df.columns:
            var_sparse = add_var_df[column].values[:, None]
            X_test_dtm = hstack((X_test_dtm, var_sparse))
        
        print(X_train_dtm.shape, X_test_dtm.shape)
        
        print("X_train: ", X_train_dtm.shape)
        print("X_test: ", X_test_dtm.shape)
        
        return X_train_dtm, X_test_dtm, train.Outcome
    

In [7]:
# Hyperparameters from bayesian optimisation
lg_params = {'boosting_type': 'gbdt',
 'class_weight': 'balanced',
 'colsample_bytree': 0.6370495458782991,
 'learning_rate': 0.1,
 'max_depth': 200,
 'metric': 'auc',
 'min_child_samples': 20,
 'n_estimators': 200,
 'num_leaves': 25,
 'objective': 'binary',
 'random_state': 1234,
 'reg_alpha': 0.0720812229772364,
 'reg_lambda': 1.87246159415014}

In [8]:
start = time.time()
X_train, X_val, y_train, y_val = get_train_test(train, test = None, ngram_range = (2,5), 
                    max_features=None, random_state=1, test_size=0.1)

LG = LGBMClassifier(**lg_params)
%time LG.fit(X_train, y_train)

from sklearn import metrics
print("Train")
y_pred_class = LG.predict(X_train)
# Comparison between vanilla roc_auc using predict vs if we use predict_proba
print("Accuracy: ", metrics.accuracy_score(y_train, y_pred_class))
print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class))
y_pred_class = LG.predict_proba(X_train)
print("Auroc: ", metrics.roc_auc_score(y_train, y_pred_class[:, 1]))

print("Validation")
y_pred_class = LG.predict(X_val)
print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
y_pred_class = LG.predict_proba(X_val)
print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class[:, 1]))
end = time.time() - start
print(f"Entire Process Took {round(end,2)}seconds")

Fitting...
Operation Took -138.59s
(40013, 441293) (4446, 441293)
X_train:  (40013, 441305)
X_val:  (4446, 441305)
CPU times: user 40min 52s, sys: 12.8 s, total: 41min 5s
Wall time: 11min 4s
Train
Accuracy:  0.8401019668607702
Auroc:  0.842641819662503
Auroc:  0.9227164749266271
Validation
Accuracy:  0.7458389563652722
Auroc:  0.7463868329048982
Auroc:  0.8187717814216781
Entire Process Took 838.03seconds


### Concatenate the train and val here

In [9]:
# Form new y using the predicted labels from the model
new_y = pd.concat([y_train, pd.Series(LG.predict(X_val))])
# Merge X_train and X_val
new_train = vstack((X_train, X_val))

# Predict using the y_train, and the PREDICTED values of X_val
LG = LGBMClassifier(**lg_params)
%time LG.fit(new_train, new_y)

CPU times: user 45min 6s, sys: 8.01 s, total: 45min 14s
Wall time: 12min 14s


LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=0.6370495458782991, importance_type='split',
               learning_rate=0.1, max_depth=200, metric='auc',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=200, n_jobs=-1, num_leaves=25, objective='binary',
               random_state=1234, reg_alpha=0.0720812229772364,
               reg_lambda=1.87246159415014, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [10]:
from sklearn import metrics
print("New Train")
# Use the new model to predict just X_val
y_pred_class = LG.predict(X_val)
# Comparison between vanilla roc_auc using predict vs if we use predict_proba
# Compare against actual y_val
print("Accuracy: ", metrics.accuracy_score(y_val, y_pred_class))
print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class))
y_pred_class = LG.predict_proba(X_val)
print("Auroc: ", metrics.roc_auc_score(y_val, y_pred_class[:, 1]))

New Train
Accuracy:  0.7435897435897436
Auroc:  0.7446294012695237
Auroc:  0.8206329181904359


### Did better for the train set! But when I tried it on the test set, it didn't work. It seems that we have mixed performance from this method.

In [11]:
X_train, X_test, y_train = get_train_test(train, test = test, ngram_range = (2,5), 
                    max_features=None, random_state=1)

Fitting...
Operation Took 238.39069437980652s
(44459, 470340) (27924, 470340)
(44459, 470352) (27924, 470352)
X_train:  (44459, 470352)
X_test:  (27924, 470352)


In [12]:
LG = LGBMClassifier(**lg_params)
%time LG.fit(X_train, y_train)

CPU times: user 44min 46s, sys: 6.87 s, total: 44min 53s
Wall time: 12min 8s


LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=0.6370495458782991, importance_type='split',
               learning_rate=0.1, max_depth=200, metric='auc',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=200, n_jobs=-1, num_leaves=25, objective='binary',
               random_state=1234, reg_alpha=0.0720812229772364,
               reg_lambda=1.87246159415014, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [13]:
new_y = pd.concat([y_train, pd.Series(LG.predict(X_test))])
new_train = vstack((X_train, X_test))

LG = LGBMClassifier(**lg_params)
%time LG.fit(new_train, new_y)

CPU times: user 1h 8min 42s, sys: 14.5 s, total: 1h 8min 56s
Wall time: 19min 20s


LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
               colsample_bytree=0.6370495458782991, importance_type='split',
               learning_rate=0.1, max_depth=200, metric='auc',
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=200, n_jobs=-1, num_leaves=25, objective='binary',
               random_state=1234, reg_alpha=0.0720812229772364,
               reg_lambda=1.87246159415014, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [14]:
y_pred_class = LG.predict_proba(X_test)[:, 1]

In [15]:
test["Outcome"] = y_pred_class

In [16]:
test[["Id", "Outcome"]].to_csv("submission.csv", index=False)