In [1]:
# importing required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from wordcloud import WordCloud, STOPWORDS
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
from bs4 import BeautifulSoup
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
from sklearn.metrics import roc_auc_score
import tensorflow as tf
from prettytable import PrettyTable
import os
import random as rn

# settings
sns.set_style("whitegrid")
import warnings
warnings.filterwarnings("ignore")

# Modelling

Now that we have performed exploratory analysis on our dataset, applied the necessary processing functions, and loaded the text feature representations, we can go ahead with the modelling phase. In this section we will experiment with various deep learning architectures.

In [None]:
# Load the tokenized splits from the pickle: train and validation features
# with their labels, plus the test features (no test labels are unpacked).
# NOTE(review): later cells also use `max_length`, `vocab_size` and
# `processed_test`, none of which are defined in the visible cells — confirm
# they are created before this point so Restart & Run All works.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open("resources/tokenized_data.pkl","rb") as f:
    x_train, y_train, x_cv, y_cv, x_test = pickle.load(file=f)

## Custom Callbacks 

In [None]:
# custom callback for performance metric: mean column wise AUC
class CustomMetrics(tf.keras.callbacks.Callback):
    '''
    Callback that reports the mean column-wise ROC AUC (macro average over
    the label columns) for the training and validation data after each epoch.
    '''

    def __init__(self,train_data,train_labels,val_data,val_labels):
        '''
        This function initializes callback object to 
        compute custom metric

        train_data, val_data     : inputs handed to model.predict
        train_labels, val_labels : ground truth handed to roc_auc_score
        '''
        # initialise the Keras base class so its own attributes exist
        super().__init__()

        self.train_data = train_data
        self.train_labels = train_labels
        self.val_data = val_data
        self.val_labels = val_labels
    
    def on_epoch_end(self,epoch,logs=None):
        '''
        This function computes the mean column-wise AUC at 
        the end of each epoch, prints it and stores it in `logs`
        under the keys "train_auc" and "val_auc"
        '''
        
        # predicting probabilities for training datapoints
        train_proba = self.model.predict(self.train_data)
        
        # mean column wise auc for train set
        train_auc = roc_auc_score(y_true=self.train_labels,
                                  y_score=train_proba,
                                  average="macro")
        
        # predicting probabilities for val datapoints
        val_proba = self.model.predict(self.val_data)
        
        # mean column wise auc for val set
        val_auc = roc_auc_score(y_true=self.val_labels,
                                  y_score=val_proba,
                                  average="macro")

        # expose the scores so callbacks listed after this one (and the
        # training history) can use them, e.g. monitor="val_auc".
        # NOTE(review): relies on Keras passing the same `logs` dict to every
        # callback — confirm for the TF version in use.
        if logs is not None:
            logs["train_auc"] = train_auc
            logs["val_auc"] = val_auc
        
        print(f"train_auc: {round(train_auc,4)} val_auc: {round(val_auc,4)}")


# custom callback to save model after each epoch
class SaveModel(tf.keras.callbacks.Callback):
    '''
    Callback that writes the model to a fixed file path after every epoch.
    The same path is reused, so the file always holds the most recently
    completed epoch (not necessarily the best one).
    '''

    def __init__(self,file_path):
        '''
        This function initializes callback object to 
        save model

        file_path : destination passed to model.save
        '''
        # initialise the Keras base class so its own attributes exist
        super().__init__()

        self.file_path = file_path
    
    def on_epoch_end(self,epoch,logs=None):
        '''
        Function saves the model for the current epoch via model.save,
        overwriting whatever was stored at file_path before
        '''

        # saving the model to specified file location
        self.model.save(self.file_path)

## Defining Utility Functions 

In [None]:
# importing required modules
from tensorflow.keras import Input
from tensorflow.keras import Model
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.layers import concatenate
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.regularizers import L2
from tensorflow.keras.callbacks import ReduceLROnPlateau 
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ModelCheckpoint

In [None]:
def load_embeddings(embedding_type):
    '''
    Read the pickled embedding matrix for the requested embedding type.

    embedding_type : name used to build the file path
                     resources/<embedding_type>_embedding_matrix.pkl

    Returns the object stored in that pickle file.
    '''

    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files
    with open(f"resources/{embedding_type}_embedding_matrix.pkl","rb") as handle:
        return pickle.load(handle)



def get_cnn_architecture(max_length,vocab_size,embedding_matrix):
    '''
    Function creates CNN architecture with 1d conv layers

    max_length       : number of token ids per input sequence
    vocab_size       : number of rows of the embedding table (input_dim)
    embedding_matrix : pre-trained weights for the frozen Embedding layer;
                       its second dimension sets the embedding width

    Returns an uncompiled Keras Model with 6 sigmoid outputs.
    '''

    # clearing backend session
    tf.keras.backend.clear_session()

    # defining kernel initializer and regularizer
    initializer = HeNormal()
    regularizer = L2(l2=0.01)

    # embedding width is read from the matrix instead of being fixed at 300,
    # so matrices of any width can be plugged in
    embedding_dim = np.shape(embedding_matrix)[1]

    def conv_branches(tensor):
        # three parallel same-padded convolutions (kernel sizes 3, 4, 5) on one input
        return [Conv1D(50,kernel,1,activation='relu',kernel_initializer=initializer,padding='same')(tensor)
                for kernel in (3,4,5)]

    # defining input and embedding layers (pre-trained weights stay frozen)
    input_layer = Input(shape=(max_length,))
    embedding = Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=max_length,weights=[embedding_matrix],trainable=False)(input_layer)

    # first set of conv1d layers, concatenated and max pooled
    concat_a = concatenate(conv_branches(embedding))
    maxpool_a = MaxPooling1D(pool_size=2,strides=1)(concat_a)

    # second set of conv1d layers, concatenated and max pooled
    concat_b = concatenate(conv_branches(maxpool_a))
    maxpool_b = MaxPooling1D(pool_size=2,strides=1)(concat_b)

    # final conv1d layer and dense layers
    conv_c = Conv1D(50,5,1,activation='relu',kernel_initializer=initializer)(maxpool_b)
    flatten = Flatten()(conv_c)
    drop_1 = Dropout(rate=0.5)(flatten)
    dense_1 = Dense(units=32,activation='relu',kernel_initializer=initializer,kernel_regularizer=regularizer)(drop_1)
    output_layer = Dense(units=6,activation='sigmoid',kernel_initializer=initializer)(dense_1)

    # creating the model
    model = Model(inputs=input_layer,outputs=output_layer)

    return model
  

def get_lstm_architecture(max_length,vocab_size,embedding_matrix):
    '''
    Function creates LSTM architecture with the input embedding matrix specified 

    max_length       : number of token ids per input sequence
    vocab_size       : number of rows of the embedding table (input_dim)
    embedding_matrix : pre-trained weights for the frozen Embedding layer;
                       its second dimension sets the embedding width

    Returns an uncompiled Keras Model with 6 sigmoid outputs.
    '''

    # clearing backend session
    tf.keras.backend.clear_session()

    # embedding width is read from the matrix instead of being fixed at 300
    embedding_dim = np.shape(embedding_matrix)[1]

    # defining input and embedding layers (pre-trained weights stay frozen)
    input_layer = Input(shape=(max_length,))
    embedding = Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=max_length,weights=[embedding_matrix],trainable=False)(input_layer)

    # bi-directional lstm layers: the first returns the full sequence so the
    # second can consume it; the second returns only its final state
    lstm_output_1 = Bidirectional(LSTM(units=64,return_sequences=True))(embedding)
    drop = Dropout(rate=0.5)(lstm_output_1)
    lstm_output_2 = Bidirectional(LSTM(units=64,return_sequences=False))(drop)

    # output layer
    output_layer = Dense(units=6,activation='sigmoid')(lstm_output_2)

    # creating the model
    model = Model(inputs=input_layer,outputs=output_layer)

    return model


def get_gru_architecture(max_length,vocab_size,embedding_matrix):
    '''
    Function creates GRU architecture with the input embedding matrix specified 

    max_length       : number of token ids per input sequence
    vocab_size       : number of rows of the embedding table (input_dim)
    embedding_matrix : pre-trained weights for the frozen Embedding layer;
                       its second dimension sets the embedding width

    Returns an uncompiled Keras Model with 6 sigmoid outputs.
    '''

    # clearing backend session
    tf.keras.backend.clear_session()

    # embedding width is read from the matrix instead of being fixed at 300
    embedding_dim = np.shape(embedding_matrix)[1]

    # defining input and embedding layers (pre-trained weights stay frozen)
    input_layer = Input(shape=(max_length,))
    embedding = Embedding(input_dim=vocab_size,output_dim=embedding_dim,input_length=max_length,weights=[embedding_matrix],trainable=False)(input_layer)

    # bi-directional GRU layers with MaxPooling1D in between
    gru_output_1 = Bidirectional(GRU(units=64,return_sequences=True))(embedding)
    max_pool = MaxPooling1D()(gru_output_1)
    drop = Dropout(rate=0.5)(max_pool)
    gru_output_2 = Bidirectional(GRU(units=64,return_sequences=False))(drop)

    # output layer
    output_layer = Dense(units=6,activation='sigmoid')(gru_output_2)

    # creating the model
    model = Model(inputs=input_layer,outputs=output_layer)

    return model


def get_test_predictions(test_ids,test_data,model_type):
    '''
    Function returns predictions of test data set by using input model specified

    test_ids   : ids of the test rows, stored in the "id" column
    test_data  : model inputs for the test rows
    model_type : name used for both files, models/<model_type>.hdf5 (read)
                 and predictions/<model_type>.csv (written)

    Nothing is returned; the predictions are written to disk.
    '''

    # model file path
    fp = f"models/{model_type}.hdf5"

    # loading the model
    model = tf.keras.models.load_model(fp)

    # predicting class probabilities
    pred_proba = model.predict(test_data)

    # dataframe to store results
    pred_df = pd.DataFrame()

    # saving ids
    pred_df["id"] = test_ids

    # adding predicted probability for each class
    class_labels = ["toxic","severe_toxic","obscene","threat","insult","identity_hate"]
    for i,label in enumerate(class_labels):
        pred_df[label] = pred_proba[:,i]

    # make sure the output folder exists before writing
    os.makedirs("predictions",exist_ok=True)

    # filepath to save predictions
    fp = f"predictions/{model_type}.csv"

    # saving to disk; index=False keeps the DataFrame index from being
    # written as an extra unnamed first column
    pred_df.to_csv(fp,index=False)

    print("Predictions saved to disk")

## CNN Model + GloVe

In [None]:
# loading the glove embeddings (read from resources/glove_embedding_matrix.pkl)
word_embedding_matrix = load_embeddings(embedding_type="glove")

In [None]:
# getting model architecture: CNN on top of the frozen GloVe embedding matrix
# NOTE(review): `max_length` and `vocab_size` are not defined in the visible
# cells — confirm they are set before this cell on a fresh kernel.
model = get_cnn_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix)


model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 300)     23813400    input_1[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 200, 50)      45050       embedding[0][0]                  
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 200, 50)      60050       embedding[0][0]                  
______________________________________________________________________________________________

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/cnn-glove.hdf5"

# prints mean column-wise AUC for the train and validation splits every epoch
custom_metric = CustomMetrics(train_data=x_train,
                              train_labels=y_train,
                              val_data=x_cv,
                              val_labels=y_cv)
# stops training once val_loss has gone 2 epochs without improving
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
# rewrites `filepath` after every epoch, so the file ends up holding the
# last completed epoch rather than the best one
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model]


In [None]:
# compiling the model: Adam at learning rate 0.001 with binary cross-entropy,
# the loss applied to the model's 6 sigmoid outputs
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
# training the model: at most 20 epochs with batch size 64, validated on the
# cv split; the EarlyStopping callback in `callbacks` may end training sooner
model.fit(x_train,y_train,validation_data=(x_cv,y_cv),batch_size=64,epochs=20,callbacks=callbacks)

Epoch 1/20
train_auc: 0.9655 val_auc: 0.9653
Epoch 2/20
train_auc: 0.9772 val_auc: 0.9716
Epoch 3/20
train_auc: 0.9835 val_auc: 0.9737
Epoch 4/20
train_auc: 0.9862 val_auc: 0.9725
Epoch 00004: early stopping


<keras.callbacks.History at 0x7f19e02a0650>

In [None]:
# predicting on test data: loads models/cnn-glove.hdf5 and writes
# predictions/cnn-glove.csv
# NOTE(review): `processed_test` is not defined in the visible cells — confirm
# it is loaded earlier.
get_test_predictions(test_ids=processed_test["id"],
                     test_data=x_test,
                     model_type="cnn-glove")

Predictions saved to disk


## CNN Model + FastText

In [None]:
# loading the fasttext embeddings (read from resources/fasttext_embedding_matrix.pkl)
word_embedding_matrix = load_embeddings(embedding_type="fasttext")

In [None]:
# getting model architecture: CNN on top of the frozen FastText embedding matrix
# NOTE(review): `max_length` and `vocab_size` are not defined in the visible
# cells — confirm they are set before this cell on a fresh kernel.
model = get_cnn_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix)


model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 200)]        0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 200, 300)     23813400    input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_7 (Conv1D)               (None, 200, 50)      45050       embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_8 (Conv1D)               (None, 200, 50)      60050       embedding_1[0][0]                
____________________________________________________________________________________________

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/cnn-fasttext.hdf5"

# prints mean column-wise AUC for the train and validation splits every epoch
custom_metric = CustomMetrics(train_data=x_train,
                              train_labels=y_train,
                              val_data=x_cv,
                              val_labels=y_cv)
# stops training once val_loss has gone 2 epochs without improving
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
# rewrites `filepath` after every epoch, so the file ends up holding the
# last completed epoch rather than the best one
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model]


In [None]:
# compiling the model: Adam at learning rate 0.001 with binary cross-entropy,
# the loss applied to the model's 6 sigmoid outputs
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
# training the model: at most 20 epochs with batch size 64, validated on the
# cv split; the EarlyStopping callback in `callbacks` may end training sooner
model.fit(x_train,y_train,validation_data=(x_cv,y_cv),batch_size=64,epochs=20,callbacks=callbacks)

Epoch 1/20
train_auc: 0.9722 val_auc: 0.9684
Epoch 2/20
train_auc: 0.9748 val_auc: 0.9669
Epoch 3/20
train_auc: 0.9811 val_auc: 0.9709
Epoch 4/20
train_auc: 0.983 val_auc: 0.9672
Epoch 5/20
train_auc: 0.9874 val_auc: 0.9707
Epoch 6/20
train_auc: 0.9878 val_auc: 0.9688
Epoch 00006: early stopping


<keras.callbacks.History at 0x7f2f3071c790>

In [None]:
# predicting on test data: loads models/cnn-fasttext.hdf5 and writes
# predictions/cnn-fasttext.csv
# NOTE(review): `processed_test` is not defined in the visible cells — confirm
# it is loaded earlier.
get_test_predictions(test_ids=processed_test["id"],
                     test_data=x_test,
                     model_type="cnn-fasttext")

Predictions saved to disk


## LSTM Model + GloVe

In [None]:
# loading the glove embeddings (read from resources/glove_embedding_matrix.pkl)
word_embedding_matrix = load_embeddings(embedding_type="glove")

In [None]:
# getting model architecture: stacked bidirectional LSTM on the frozen GloVe matrix
# NOTE(review): `max_length` and `vocab_size` are not defined in the visible
# cells — confirm they are set before this cell on a fresh kernel.
model = get_lstm_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix)


model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 300)          23813400  
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 128)          186880    
_________________________________________________________________
dropout (Dropout)            (None, 200, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 6)                 774       
Total params: 24,099,870
Trainable params: 286,470
Non-trainable params: 23,813,400
___________________________________________

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/lstm-glove.hdf5"

# prints mean column-wise AUC for the train and validation splits every epoch
custom_metric = CustomMetrics(train_data=x_train,
                              train_labels=y_train,
                              val_data=x_cv,
                              val_labels=y_cv)
# lowers the learning rate when val_loss has gone 1 epoch without improving
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
# stops training once val_loss has gone 2 epochs without improving
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
# rewrites `filepath` after every epoch, so the file ends up holding the
# last completed epoch rather than the best one
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model,reduced_lr]


In [None]:
# compiling the model: Adam at learning rate 0.001 with binary cross-entropy,
# the loss applied to the model's 6 sigmoid outputs
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
# training the model: at most 10 epochs with batch size 128, validated on the
# cv split; the EarlyStopping callback in `callbacks` may end training sooner
model.fit(x_train,y_train,validation_data=(x_cv,y_cv),batch_size=128,epochs=10,callbacks=callbacks)

Epoch 1/10
train_auc: 0.976 val_auc: 0.9735
Epoch 2/10
train_auc: 0.9833 val_auc: 0.981
Epoch 3/10
train_auc: 0.9871 val_auc: 0.9829
Epoch 4/10
train_auc: 0.9893 val_auc: 0.9838
Epoch 5/10
train_auc: 0.9912 val_auc: 0.9854
Epoch 6/10
train_auc: 0.9925 val_auc: 0.9851
Epoch 7/10
train_auc: 0.9935 val_auc: 0.9839

Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 8/10
train_auc: 0.9944 val_auc: 0.9844

Epoch 00008: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 00008: early stopping


<keras.callbacks.History at 0x7faff0275e10>

In [None]:
# predicting on test data: loads models/lstm-glove.hdf5 and writes
# predictions/lstm-glove.csv
# NOTE(review): `processed_test` is not defined in the visible cells — confirm
# it is loaded earlier.
get_test_predictions(test_ids=processed_test["id"],
                     test_data=x_test,
                     model_type="lstm-glove")

Predictions saved to disk


## LSTM Model + FastText

In [None]:
# loading the fasttext embeddings (read from resources/fasttext_embedding_matrix.pkl)
word_embedding_matrix = load_embeddings(embedding_type="fasttext")

In [None]:
# getting model architecture: stacked bidirectional LSTM on the frozen FastText matrix
# NOTE(review): `max_length` and `vocab_size` are not defined in the visible
# cells — confirm they are set before this cell on a fresh kernel.
model = get_lstm_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix)


model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 300)          23813400  
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 128)          186880    
_________________________________________________________________
dropout (Dropout)            (None, 200, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               98816     
_________________________________________________________________
dense (Dense)                (None, 6)                 774       
Total params: 24,099,870
Trainable params: 286,470
Non-trainable params: 23,813,400
___________________________________________

In [None]:
# defining callbacks

# filepath to save model; must match the "lstm-fasttext" model_type that the
# prediction cell loads (it previously pointed at models/lstm-glove.hdf5,
# which overwrote the GloVe LSTM and left no FastText model to load)
filepath = "models/lstm-fasttext.hdf5"

# prints mean column-wise AUC for the train and validation splits every epoch
custom_metric = CustomMetrics(train_data=x_train,
                              train_labels=y_train,
                              val_data=x_cv,
                              val_labels=y_cv)
# lowers the learning rate when val_loss has gone 1 epoch without improving
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
# stops training once val_loss has gone 2 epochs without improving
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
# keeps only the epoch with the lowest val_loss on disk
model_checkpoint = ModelCheckpoint(filepath=filepath,
                                   monitor="val_loss",
                                   save_best_only=True)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,model_checkpoint,reduced_lr]


In [None]:
# compiling the model: Adam at learning rate 0.001 with binary cross-entropy,
# the loss applied to the model's 6 sigmoid outputs
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
# training the model: at most 10 epochs with batch size 128, validated on the
# cv split; the EarlyStopping callback in `callbacks` may end training sooner
model.fit(x_train,y_train,validation_data=(x_cv,y_cv),batch_size=128,epochs=10,callbacks=callbacks)

Epoch 1/10
train_auc: 0.9695 val_auc: 0.9684
Epoch 2/10
train_auc: 0.974 val_auc: 0.9733
Epoch 3/10
train_auc: 0.9787 val_auc: 0.9779
Epoch 4/10
train_auc: 0.981 val_auc: 0.9795

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 5/10
train_auc: 0.983 val_auc: 0.9814
Epoch 6/10
train_auc: 0.9836 val_auc: 0.9819
Epoch 7/10
train_auc: 0.984 val_auc: 0.9819

Epoch 00007: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 8/10
train_auc: 0.9842 val_auc: 0.9821

Epoch 00008: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 9/10
train_auc: 0.9842 val_auc: 0.9821
Epoch 10/10
train_auc: 0.9842 val_auc: 0.9822

Epoch 00010: ReduceLROnPlateau reducing learning rate to 1.0000001111620805e-07.


<keras.callbacks.History at 0x7fb9704bf4d0>

In [None]:
# predicting on test data: loads models/lstm-fasttext.hdf5 and writes
# predictions/lstm-fasttext.csv
# NOTE(review): the callbacks cell of this section must save to that same
# model path; `processed_test` is not defined in the visible cells — confirm
# it is loaded earlier.
get_test_predictions(test_ids=processed_test["id"],
                     test_data=x_test,
                     model_type="lstm-fasttext")

Predictions saved to disk


## GRU Model + GloVe

In [None]:
# loading the glove embeddings (read from resources/glove_embedding_matrix.pkl)
word_embedding_matrix = load_embeddings(embedding_type="glove")

In [None]:
# getting model architecture: stacked bidirectional GRU on the frozen GloVe matrix
# NOTE(review): `max_length` and `vocab_size` are not defined in the visible
# cells — confirm they are set before this cell on a fresh kernel.
model = get_gru_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix)


model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 300)          23813400  
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 128)          140544    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 100, 128)          0         
_________________________________________________________________
dropout (Dropout)            (None, 100, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               74496     
_________________________________________________________________
dense (Dense)                (None, 6)                 774   

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/gru-glove.hdf5"

# prints mean column-wise AUC for the train and validation splits every epoch
custom_metric = CustomMetrics(train_data=x_train,
                              train_labels=y_train,
                              val_data=x_cv,
                              val_labels=y_cv)
# lowers the learning rate when val_loss has gone 1 epoch without improving
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
# stops training once val_loss has gone 2 epochs without improving
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
# rewrites `filepath` after every epoch, so the file ends up holding the
# last completed epoch rather than the best one
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model,reduced_lr]


In [None]:
# compiling the model: Adam at learning rate 0.001 with binary cross-entropy,
# the loss applied to the model's 6 sigmoid outputs
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
# training the model: at most 10 epochs with batch size 128, validated on the
# cv split; the EarlyStopping callback in `callbacks` may end training sooner
model.fit(x_train,y_train,validation_data=(x_cv,y_cv),batch_size=128,epochs=10,callbacks=callbacks)

Epoch 1/10
train_auc: 0.982 val_auc: 0.9808
Epoch 2/10
train_auc: 0.9868 val_auc: 0.9847
Epoch 3/10
train_auc: 0.9894 val_auc: 0.9862
Epoch 4/10
train_auc: 0.9911 val_auc: 0.9863

Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 5/10
train_auc: 0.9917 val_auc: 0.9862

Epoch 00005: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 00005: early stopping


<keras.callbacks.History at 0x7fb46c7e1bd0>

In [None]:
# predicting on test data: loads models/gru-glove.hdf5 and writes
# predictions/gru-glove.csv
# NOTE(review): `processed_test` is not defined in the visible cells — confirm
# it is loaded earlier.
get_test_predictions(test_ids=processed_test["id"],
                     test_data=x_test,
                     model_type="gru-glove")

Predictions saved to disk


## GRU Model + FastText

In [None]:
# loading the fasttext embeddings (read from resources/fasttext_embedding_matrix.pkl)
word_embedding_matrix = load_embeddings(embedding_type="fasttext")

In [None]:
# getting model architecture: stacked bidirectional GRU on the frozen FastText matrix
# NOTE(review): `max_length` and `vocab_size` are not defined in the visible
# cells — confirm they are set before this cell on a fresh kernel.
model = get_gru_architecture(max_length=max_length,
                             vocab_size=vocab_size,
                             embedding_matrix=word_embedding_matrix)


model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 200)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 200, 300)          23813400  
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 128)          140544    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 100, 128)          0         
_________________________________________________________________
dropout (Dropout)            (None, 100, 128)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128)               74496     
_________________________________________________________________
dense (Dense)                (None, 6)                 774   

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/gru-fasttext.hdf5"

# prints mean column-wise AUC for the train and validation splits every epoch
custom_metric = CustomMetrics(train_data=x_train,
                              train_labels=y_train,
                              val_data=x_cv,
                              val_labels=y_cv)
# lowers the learning rate when val_loss has gone 1 epoch without improving
reduced_lr = ReduceLROnPlateau(monitor="val_loss",
                               patience=1,
                               verbose=1)
# stops training once val_loss has gone 2 epochs without improving
early_stop = EarlyStopping(monitor="val_loss",
                           patience=2,
                           verbose=1)
# rewrites `filepath` after every epoch, so the file ends up holding the
# last completed epoch rather than the best one
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
callbacks = [custom_metric,early_stop,save_model,reduced_lr]


In [None]:
# compiling the model: Adam at learning rate 0.001 with binary cross-entropy,
# the loss applied to the model's 6 sigmoid outputs
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy')

In [None]:
# training the model: at most 10 epochs with batch size 128, validated on the
# cv split; the EarlyStopping callback in `callbacks` may end training sooner
model.fit(x_train,y_train,validation_data=(x_cv,y_cv),batch_size=128,epochs=10,callbacks=callbacks)

Epoch 1/10
train_auc: 0.9784 val_auc: 0.977
Epoch 2/10
train_auc: 0.983 val_auc: 0.9813
Epoch 3/10
train_auc: 0.9867 val_auc: 0.9849
Epoch 4/10
train_auc: 0.9881 val_auc: 0.9856
Epoch 5/10
train_auc: 0.9895 val_auc: 0.9864
Epoch 6/10
train_auc: 0.9904 val_auc: 0.9862

Epoch 00006: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 7/10
train_auc: 0.9908 val_auc: 0.9864
Epoch 8/10
train_auc: 0.991 val_auc: 0.9864

Epoch 00008: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 9/10
train_auc: 0.991 val_auc: 0.9863

Epoch 00009: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.
Epoch 00009: early stopping


<keras.callbacks.History at 0x7fcf400440d0>

In [None]:
# predicting on test data: loads models/gru-fasttext.hdf5 and writes
# predictions/gru-fasttext.csv
# NOTE(review): `processed_test` is not defined in the visible cells — confirm
# it is loaded earlier.
get_test_predictions(test_ids=processed_test["id"],
                     test_data=x_test,
                     model_type="gru-fasttext")

Predictions saved to disk


## Ensemble Predictions

In [7]:
# Taking simple average of predictions on test data from all previously trained models

# predictions folder file path
filepath = "predictions/"

# header of one prediction file; every model file is expected to share it
cols = list(pd.read_csv(filepath + "cnn-glove.csv", nrows =1))

# class label columns only: drop the ids and any unnamed index column that
# DataFrame.to_csv wrote, so row numbers are never averaged as if they were a class
required_cols = [col for col in cols if col != "id" and not col.startswith("Unnamed")]

# loading predictions
cnn_glove = pd.read_csv(filepath + "cnn-glove.csv",usecols=required_cols)
cnn_fasttext = pd.read_csv(filepath + "cnn-fasttext.csv",usecols=required_cols)
lstm_glove = pd.read_csv(filepath + "lstm-glove.csv",usecols=required_cols)
lstm_fasttext = pd.read_csv(filepath + "lstm-fasttext.csv",usecols=required_cols)
gru_glove = pd.read_csv(filepath + "gru-glove.csv",usecols=required_cols)
gru_fasttext = pd.read_csv(filepath + "gru-fasttext.csv",usecols=required_cols)

# taking average of all model predictions
ensemble_predictions = (cnn_glove + cnn_fasttext + lstm_glove + lstm_fasttext + gru_glove + gru_fasttext) / 6

# adding ids: read them from a prediction file (kept as strings) so this cell
# does not depend on `processed_test` being in memory, and insert them
# positionally so a non-default index cannot misalign ids and rows
ids = pd.read_csv(filepath + "cnn-glove.csv",usecols=["id"],dtype={"id": str})["id"]
ensemble_predictions.insert(0,"id",ids.to_numpy())

# saving predictions to disk without the DataFrame index
ensemble_predictions.to_csv(filepath + "ensemble_predictions.csv",index=False)
print("Predictions saved to disk")

Predictions saved to disk


# Transfer Learning with BERT

In this section we will use the pre-trained BERT model to get sentence-level embeddings for our comments data. We will then pass these embeddings through a feed-forward neural network.

In [None]:
# importing required modules
import tensorflow_hub as hub
from tensorflow.keras.models import Model 
from tensorflow.keras.layers import Input, Dense, Activation, Dropout,BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.initializers import HeNormal
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping


## Defining Utility Functions

In [None]:
def create_tokens(corpus,max_len,tokenizer):
    '''
    Turn each text of the corpus into fixed-length BERT style inputs.

    corpus    : iterable of texts
    max_len   : length of every returned row, special tokens included
    tokenizer : object providing tokenize(text) and
                convert_tokens_to_ids(tokens)

    Returns three numpy arrays: token ids, attention mask (1 for real
    tokens, 0 for padding) and an all-zero segment array shaped like the
    token ids.
    '''

    # slots left for word pieces once [CLS] and [SEP] are accounted for
    budget = max_len - 2

    id_rows, mask_rows = [], []

    for sentence in corpus:
        # drop whatever does not fit, then frame with the special tokens
        pieces = tokenizer.tokenize(sentence)[:budget]
        framed = ['[CLS]'] + list(pieces) + ['[SEP]']

        # mask is built before padding so it marks only the real tokens
        n_pad = max_len - len(framed)
        mask_rows.append([1]*len(framed) + [0]*n_pad)

        # pad to max_len and map every token to its vocabulary id
        framed.extend(['[PAD]']*n_pad)
        id_rows.append(tokenizer.convert_tokens_to_ids(framed))

    tokens_array = np.array(id_rows)
    masked_array = np.array(mask_rows)

    # single-segment input, so every segment id is 0
    segment_array = np.zeros_like(tokens_array)

    return tokens_array, masked_array, segment_array

def save_bert_data(x,y,data_type,test=False):
    '''
    Pickle features (and labels, unless test is set) to
    resources/bert/<data_type>.pkl

    x         : features to store (bert embeddings in this notebook)
    y         : class labels; ignored when test is True
    data_type : name used for the file and in the confirmation message
    test      : when True only x is stored, otherwise the tuple (x, y)
    '''

    # the test split carries no labels, so only the features are written
    payload = x if test else (x,y)

    with open(f"resources/bert/{data_type}.pkl",mode="wb") as out_file:
        pickle.dump(payload,out_file)

    print(f"{data_type} data saved to disk")


def load_bert_data(data_type,test=False):
    '''
    Read back what was pickled at resources/bert/<data_type>.pkl

    data_type : name of the file to read
    test      : when True the stored object is returned as is, otherwise
                it is unpacked into features and labels

    Returns x when test is True, else the tuple (x, y).
    '''

    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files
    with open(f"resources/bert/{data_type}.pkl",mode="rb") as in_file:
        payload = pickle.load(in_file)

    if test:
        return payload

    # labelled splits are stored as a (features, labels) pair
    x,y = payload
    return x,y


def get_mlp_architecture(input_dim):
    '''
    Function returns feed forward neural net architecture

    The network maps an input of width input_dim through relu Dense
    layers of 512, 256 and 128 units (each followed by 30% dropout),
    batch normalization and a 64-unit relu Dense layer to 6 sigmoid
    outputs. The returned model is not compiled.

    Parameters
    ----------
    input_dim : int
        Number of features in each input vector.

    Returns
    -------
    model
        Keras Model built from the layers described above.
    '''

    # clearing backend session
    tf.keras.backend.clear_session()

    # setting seed for reproducible results
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so assigning it
    # here does not change hashing in the already running kernel
    os.environ['PYTHONHASHSEED'] = '0'
    np.random.seed(0)
    rn.seed(0)
    # weight initialization and dropout draw from TensorFlow's own random
    # generator, which the seeds above leave untouched
    tf.random.set_seed(0)

    # setting regularizer (shared by all hidden layers)
    regularizer = l2(0.0001)

    # input layer
    inputs = Input(shape=(input_dim,))

    # densely connected layers, each followed by dropout;
    # every layer gets its own initializer instance
    hidden = inputs
    for units in (512,256,128):
        hidden = Dense(units=units,activation='relu',kernel_initializer=HeNormal(),kernel_regularizer=regularizer)(hidden)
        hidden = Dropout(rate=0.3)(hidden)

    # normalization sits between the last dropout and the final hidden layer
    hidden = BatchNormalization()(hidden)
    hidden = Dense(units=64,activation='relu',kernel_initializer=HeNormal(),kernel_regularizer=regularizer)(hidden)

    # output layer (no weight regularization)
    outputs = Dense(units=6,activation='sigmoid',kernel_initializer=HeNormal())(hidden)

    # defining the model
    model = Model(inputs=inputs,outputs=outputs)

    return model



## Defining BERT Architecture 

In [None]:
# starting from a clean Keras session
tf.keras.backend.clear_session()

# every input row fed to BERT has this many positions
max_seq_length = 200

# the three int32 inputs, created in the order tokens -> mask -> segments
input_word_ids, input_mask, segment_ids = [
    tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=input_name)
    for input_name in ("input_word_ids", "input_mask", "segment_ids")
]

# pre-trained BERT layer from TF-Hub, kept frozen (trainable=False)
bert_layer = hub.KerasLayer(
    "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1",
    trainable=False,
)

# the layer returns two tensors; only the first one is used below
pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])

# model that turns the three inputs into the pooled output
bert_model = Model(
    inputs=[input_word_ids, input_mask, segment_ids],
    outputs=pooled_output,
)


## Tokenization

Before the text can be passed through the BERT model to obtain embeddings, it has to be tokenized. We use the tokenization script published by the Google Research team:

https://github.com/google-research/bert/blob/master/tokenization.py

In [None]:
# reading the processed train, val and test data back from disk
# NOTE: this rebinds x_train, y_train, x_cv, y_cv and x_test, which the
# tokenized_data.pkl cell near the top of the notebook also defines
with open("resources/processed_data.pkl", mode="rb") as f:
    _processed_splits = pickle.load(f)

x_train, y_train, x_cv, y_cv, x_test = _processed_splits

In [None]:
# the hub layer exposes its vocabulary file and lower-casing flag;
# both are read from the same resolved object
_bert_assets = bert_layer.resolved_object
vocab_file = _bert_assets.vocab_file.asset_path.numpy()
do_lower_case = _bert_assets.do_lower_case.numpy()

In [None]:
# tokenization script, imported from the local resources package
from resources import tokenization

# tokenizer configured with the vocabulary and casing flag read from the BERT layer
tokenizer = tokenization.FullTokenizer(vocab_file=vocab_file,
                                       do_lower_case=do_lower_case)

In [None]:
# creating tokenized data along with masks and segments
# max_seq_length (set with the BERT input layers) is reused here so the
# encoded rows always match the width the model expects
x_train_tokens, x_train_mask, x_train_segment = create_tokens(corpus=x_train,
                                                              max_len=max_seq_length,
                                                              tokenizer=tokenizer)

x_val_tokens, x_val_mask, x_val_segment = create_tokens(corpus=x_cv,
                                                        max_len=max_seq_length,
                                                        tokenizer=tokenizer)

x_test_tokens, x_test_mask, x_test_segment = create_tokens(corpus=x_test,
                                                           max_len=max_seq_length,
                                                           tokenizer=tokenizer)

## BERT Embeddings

In [None]:
# running each split through bert_model; the inputs are passed in the
# model's declared order: tokens, mask, segments
x_train_bert = bert_model.predict(
    x=[x_train_tokens, x_train_mask, x_train_segment]
)
x_val_bert = bert_model.predict(
    x=[x_val_tokens, x_val_mask, x_val_segment]
)
x_test_bert = bert_model.predict(
    x=[x_test_tokens, x_test_mask, x_test_segment]
)

In [None]:
# writing the embeddings to disk so they can be reloaded later

# labelled splits (train, then val) are stored together with their labels
for _split_name, _embeddings, _labels in (("train", x_train_bert, y_train),
                                          ("val", x_val_bert, y_cv)):
    save_bert_data(x=_embeddings, y=_labels, data_type=_split_name)

# the test split has no labels, so only the embeddings are stored
save_bert_data(x=x_test_bert, y=None, data_type="test", test=True)

train data saved to disk
val data saved to disk
test data saved to disk


## Model Training

In [None]:
# reading the saved embeddings back from disk

# train and val files hold (embeddings, labels) pairs
x_train_bert, y_train = load_bert_data("train")
x_val_bert, y_val = load_bert_data("val")

# the test file holds embeddings only
x_test_bert = load_bert_data("test", test=True)

In [None]:
# getting model architecture; the input width is taken from the loaded
# embeddings instead of being hard-coded, so the two cannot drift apart
model = get_mlp_architecture(input_dim=x_train_bert.shape[1])

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 768)]             0         
_________________________________________________________________
dense (Dense)                (None, 512)               393728    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               131328    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 128)               32896     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128)               0     

In [None]:
# defining callbacks

# filepath to save model
filepath = "models/bert-mlp.hdf5"

# computes the mean column-wise AUC on the train and val embeddings at the
# end of every epoch (see CustomMetrics defined earlier in the notebook)
custom_metric = CustomMetrics(train_data=x_train_bert,
                              train_labels=y_train,
                              val_data=x_val_bert,
                              val_labels=y_val)
# watches val_loss with a patience of 7 epochs; verbose=1 makes it report
# when it triggers
early_stop = EarlyStopping(monitor="val_loss",
                           patience=7,
                           verbose=1)
# NOTE(review): SaveModel is defined outside this section; presumably it
# writes the model to file_path during training — confirm what it saves and when
save_model = SaveModel(file_path=filepath)

# adding callbacks to single list
# the list order is the order in which the callbacks are handed to fit()
callbacks = [custom_metric,early_stop,save_model]


In [None]:
# Adam optimizer with learning rate 1e-4, binary cross-entropy as the loss
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-4)
model.compile(loss='binary_crossentropy', optimizer=optimizer)

In [None]:
# training on the saved BERT embeddings, validating on the val split each epoch
model.fit(x=x_train_bert,
          y=y_train,
          validation_data=(x_val_bert,y_val),
          batch_size=128,
          epochs=30,
          callbacks=callbacks)

Epoch 1/30
train_auc: 0.909 val_auc: 0.9112
Epoch 2/30
train_auc: 0.9352 val_auc: 0.937
Epoch 3/30
train_auc: 0.9411 val_auc: 0.941
Epoch 4/30
train_auc: 0.9387 val_auc: 0.9383
Epoch 5/30
train_auc: 0.9499 val_auc: 0.9506
Epoch 6/30
train_auc: 0.9486 val_auc: 0.9482
Epoch 7/30
train_auc: 0.9518 val_auc: 0.9514
Epoch 8/30
train_auc: 0.9533 val_auc: 0.9531
Epoch 9/30
train_auc: 0.9515 val_auc: 0.9517
Epoch 10/30
train_auc: 0.9535 val_auc: 0.9527
Epoch 11/30
train_auc: 0.9543 val_auc: 0.9545
Epoch 12/30
train_auc: 0.9548 val_auc: 0.9546
Epoch 13/30
train_auc: 0.9545 val_auc: 0.9542
Epoch 14/30
train_auc: 0.9544 val_auc: 0.9544
Epoch 15/30
train_auc: 0.9547 val_auc: 0.9543
Epoch 16/30
train_auc: 0.9555 val_auc: 0.9546
Epoch 17/30
train_auc: 0.9561 val_auc: 0.9553
Epoch 18/30
train_auc: 0.9562 val_auc: 0.9559
Epoch 19/30
train_auc: 0.9569 val_auc: 0.9564
Epoch 20/30
train_auc: 0.9563 val_auc: 0.9556
Epoch 21/30
train_auc: 0.9552 val_auc: 0.9546
Epoch 22/30
train_auc: 0.9568 val_auc: 0.9568


<keras.callbacks.History at 0x7fc02060d790>

In [None]:
# predicting on test data
# NOTE(review): get_test_predictions and processed_test are defined outside
# this section; the printed message suggests the predictions are written to
# disk — confirm the output location and that processed_test["id"] lines up
# row-for-row with x_test_bert
get_test_predictions(test_ids=processed_test["id"],
                     test_data=x_test_bert,
                     model_type="bert-mlp")

Predictions saved to disk


## Summarizing Results

In [4]:
# summary table with one row per model; the scores are hard-coded here
table = PrettyTable(field_names=["Model","Train Score","Val Score","Kaggle Private Score"])

# rows in display order: model name, train score, val score, Kaggle private score
_score_rows = [
    ["CNN-GloVe",0.9862,0.9725,0.9629],
    ["CNN-Fasttext",0.9878,0.9688,0.9595],
    ["LSTM-GloVe",0.9944,0.9844,0.9792],
    ["LSTM-Fasttext",0.9842,0.9822,0.9765],
    ["GRU-GloVe",0.9917,0.9862,0.9799],
    ["GRU-Fasttext",0.9911,0.9863,0.9806],
    ["Ensemble (Simple Average)","--","--",0.9809],
    ["BERT-MLP",0.9585,0.9581,0.9461],
]
for _row in _score_rows:
    table.add_row(_row)

print(table)

+---------------------------+-------------+-----------+----------------------+
|           Model           | Train Score | Val Score | Kaggle Private Score |
+---------------------------+-------------+-----------+----------------------+
|         CNN-GloVe         |    0.9862   |   0.9725  |        0.9629        |
|        CNN-Fasttext       |    0.9878   |   0.9688  |        0.9595        |
|         LSTM-GloVe        |    0.9944   |   0.9844  |        0.9792        |
|       LSTM-Fasttext       |    0.9842   |   0.9822  |        0.9765        |
|         GRU-GloVe         |    0.9917   |   0.9862  |        0.9799        |
|        GRU-Fasttext       |    0.9911   |   0.9863  |        0.9806        |
| Ensemble (Simple Average) |      --     |     --    |        0.9809        |
|          BERT-MLP         |    0.9585   |   0.9581  |        0.9461        |
+---------------------------+-------------+-----------+----------------------+
