In [1]:
import os
import re
import nltk
import pandas as pd 
import numpy as np
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

import torch
import torchtext
from torchtext import data
from torch import nn
from torch.nn import functional as F
from torch import optim

# I. Load dataset and pre-processing

**Text preprocessing** is traditionally an important step for natural language processing (NLP) tasks. 

In the pre-processing task, I did:
- Expand contractions (e.g. `don't` → `do not`)
- Remove punctuation
- Tokenization

I didn't remove `stopwords` because doing so can hurt the deep learning models used later: I expected the context to be harder to learn once the `stopwords` are gone.

## Load dataset

In [2]:
# Root folder of the official competition files.
DATA_PATH = '../input/quora-insincere-questions-classification/'

# The test split comes from the competition folder; the training split comes from
# a separate `out.csv` export.
test_df  = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))
train_df = pd.read_csv('../input/train-dataset-quora-insincere-questions/out.csv')

In [3]:
train_df.head()

Unnamed: 0,qid,question_text,target,cleaned_question_text
0,00002165364db923c7e6,How did Quebec nationalists see their province...,0,how quebec nationalists see province nation 1960s
1,000032939017120e6e44,"Do you have an adopted dog, how would you enco...",0,do adopted dog would encourage people adopt shop
2,0000412ca6e4628ce2cf,Why does velocity affect time? Does velocity a...,0,why velocity affect time does velocity affect ...
3,000042bf85aa498cd78e,How did Otto von Guericke used the Magdeburg h...,0,how otto von guericke used magdeburg hemispheres
4,0000455dfa3e01eae3af,Can I convert montra helicon D to a mountain b...,0,can i convert montra helicon d mountain bike c...


In [None]:
contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", 
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}


punctuation = [',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', 
    '•', '~', '@', '£', '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 
    '█', '…', '“', '★', '”', '–', '●', '►', '−', '¢', '¬', '░', '¡', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', 
    '—', '‹', '─', '▒', '：', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', '¯', '♦', '¤', '▲', '¸', '⋅', '‘', '∞', 
    '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '・', '╦', '╣', '╔', '╗', '▬', '❤', '≤', '‡', '√', '◄', '━', 
    '⇒', '▶', '≥', '╝', '♡', '◊', '。', '✈', '≡', '☺', '✔', '↵', '≈', '✓', '♣', '☎', '℃', '◦', '└', '‟', '～', '！', '○', 
    '◆', '№', '♠', '▌', '✿', '▸', '⁄', '□', '❖', '✦', '．', '÷', '｜', '┃', '／', '￥', '╠', '↩', '✭', '▐', '☼', '☻', '┐', 
    '├', '«', '∼', '┌', '℉', '☮', '฿', '≦', '♬', '✧', '〉', '－', '⌂', '✖', '･', '◕', '※', '‖', '◀', '‰', '\x97', '↺', 
    '∆', '┘', '┬', '╬', '،', '⌘', '⊂', '＞', '〈', '⎙', '？', '☠', '⇐', '▫', '∗', '∈', '≠', '♀', '♔', '˚', '℗', '┗', '＊', 
    '┼', '❀', '＆', '∩', '♂', '‿', '∑', '‣', '➜', '┛', '⇓', '☯', '⊖', '☀', '┳', '；', '∇', '⇑', '✰', '◇', '♯', '☞', '´', 
    '↔', '┏', '｡', '◘', '∂', '✌', '♭', '┣', '┴', '┓', '✨', '\xa0', '˜', '❥', '┫', '℠', '✒', '［', '∫', '\x93', '≧', '］', 
    '\x94', '∀', '♛', '\x96', '∨', '◎', '↻', '⇩', '＜', '≫', '✩', '✪', '♕', '؟', '₤', '☛', '╮', '␊', '＋', '┈', '％', 
    '╋', '▽', '⇨', '┻', '⊗', '￡', '।', '▂', '✯', '▇', '＿', '➤', '✞', '＝', '▷', '△', '◙', '▅', '✝', '∧', '␉', '☭', 
    '┊', '╯', '☾', '➔', '∴', '\x92', '▃', '↳', '＾', '׳', '➢', '╭', '➡', '＠', '⊙', '☢', '˝', '∏', '„', '∥', '❝', '☐', 
    '▆', '╱', '⋙', '๏', '☁', '⇔', '▔', '\x91', '➚', '◡', '╰', '\x85', '♢', '˙', '۞', '✘', '✮', '☑', '⋆', 'ⓘ', '❒', '☣', '✉', '⌊', '➠', '∣', '❑', '◢', 'ⓒ', '\x80', '〒', '∕', '▮', '⦿', '✫', '✚', '⋯', '♩', '☂', '❞', '‗', '܂', '☜', 
    '‾', '✜', '╲', '∘', '⟩', '＼', '⟨', '·', '✗', '♚', '∅', 'ⓔ', '◣', '͡', '‛', '❦', '◠', '✄', '❄', '∃', '␣', '≪', '｢', 
    '≅', '◯', '☽', '∎', '｣', '❧', '̅', 'ⓐ', '↘', '⚓', '▣', '˘', '∪', '⇢', '✍', '⊥', '＃', '⎯', '↠', '۩', '☰', '◥', 
    '⊆', '✽', '⚡', '↪', '❁', '☹', '◼', '☃', '◤', '❏', 'ⓢ', '⊱', '➝', '̣', '✡', '∠', '｀', '▴', '┤', '∝', '♏', 'ⓐ', 
    '✎', ';', '␤', '＇', '❣', '✂', '✤', 'ⓞ', '☪', '✴', '⌒', '˛', '♒', '＄', '✶', '▻', 'ⓔ', '◌', '◈', '❚', '❂', '￦', 
    '◉', '╜', '̃', '✱', '╖', '❉', 'ⓡ', '↗', 'ⓣ', '♻', '➽', '׀', '✲', '✬', '☉', '▉', '≒', '☥', '⌐', '♨', '✕', 'ⓝ', 
    '⊰', '❘', '＂', '⇧', '̵', '➪', '▁', '▏', '⊃', 'ⓛ', '‚', '♰', '́', '✏', '⏑', '̶', 'ⓢ', '⩾', '￠', '❍', '≃', '⋰', '♋', 
    '､', '̂', '❋', '✳', 'ⓤ', '╤', '▕', '⌣', '✸', '℮', '⁺', '▨', '╨', 'ⓥ', '♈', '❃', '☝', '✻', '⊇', '≻', '♘', '♞', 
    '◂', '✟', '⌠', '✠', '☚', '✥', '❊', 'ⓒ', '⌈', '❅', 'ⓡ', '♧', 'ⓞ', '▭', '❱', 'ⓣ', '∟', '☕', '♺', '∵', '⍝', 'ⓑ', 
    '✵', '✣', '٭', '♆', 'ⓘ', '∶', '⚜', '◞', '்', '✹', '➥', '↕', '̳', '∷', '✋', '➧', '∋', '̿', 'ͧ', '┅', '⥤', '⬆', '⋱', 
    '☄', '↖', '⋮', '۔', '♌', 'ⓛ', '╕', '♓', '❯', '♍', '▋', '✺', '⭐', '✾', '♊', '➣', '▿', 'ⓑ', '♉', '⏠', '◾', '▹', 
    '⩽', '↦', '╥', '⍵', '⌋', '։', '➨', '∮', '⇥', 'ⓗ', 'ⓓ', '⁻', '⎝', '⌥', '⌉', '◔', '◑', '✼', '♎', '♐', '╪', '⊚', 
    '☒', '⇤', 'ⓜ', '⎠', '◐', '⚠', '╞', '◗', '⎕', 'ⓨ', '☟', 'ⓟ', '♟', '❈', '↬', 'ⓓ', '◻', '♮', '❙', '♤', '∉', '؛', 
    '⁂', 'ⓝ', '־', '♑', '╫', '╓', '╳', '⬅', '☔', '☸', '┄', '╧', '׃', '⎢', '❆', '⋄', '⚫', '̏', '☏', '➞', '͂', '␙', 'ⓤ', '◟', '̊', '⚐', '✙', '↙', '̾', '℘', '✷', '⍺', '❌', '⊢', '▵', '✅', 'ⓖ', '☨', '▰', '╡', 'ⓜ', '☤', '∽', '╘', 
    '˹', '↨', '♙', '⬇', '♱', '⌡', '⠀', '╛', '❕', '┉', 'ⓟ', '̀', '♖', 'ⓚ', '┆', '⎜', '◜', '⚾', '⤴', '✇', '╟', '⎛', 
    '☩', '➲', '➟', 'ⓥ', 'ⓗ', '⏝', '◃', '╢', '↯', '✆', '˃', '⍴', '❇', '⚽', '╒', '̸', '♜', '☓', '➳', '⇄', '☬', '⚑', 
    '✐', '⌃', '◅', '▢', '❐', '∊', '☈', '॥', '⎮', '▩', 'ு', '⊹', '‵', '␔', '☊', '➸', '̌', '☿', '⇉', '⊳', '╙', 'ⓦ', 
    '⇣', '｛', '̄', '↝', '⎟', '▍', '❗', '״', '΄', '▞', '◁', '⛄', '⇝', '⎪', '♁', '⇠', '☇', '✊', 'ி', '｝', '⭕', '➘', 
    '⁀', '☙', '❛', '❓', '⟲', '⇀', '≲', 'ⓕ', '⎥', '\u06dd', 'ͤ', '₋', '̱', '̎', '♝', '≳', '▙', '➭', '܀', 'ⓖ', '⇛', '▊', 
    '⇗', '̷', '⇱', '℅', 'ⓧ', '⚛', '̐', '̕', '⇌', '␀', '≌', 'ⓦ', '⊤', '̓', '☦', 'ⓕ', '▜', '➙', 'ⓨ', '⌨', '◮', '☷', 
    '◍', 'ⓚ', '≔', '⏩', '⍳', '℞', '┋', '˻', '▚', '≺', 'ْ', '▟', '➻', '̪', '⏪', '̉', '⎞', '┇', '⍟', '⇪', '▎', '⇦', '␝', 
    '⤷', '≖', '⟶', '♗', '̴', '♄', 'ͨ', '̈', '❜', '̡', '▛', '✁', '➩', 'ா', '˂', '↥', '⏎', '⎷', '̲', '➖', '↲', '⩵', '̗', '❢', 
    '≎', '⚔', '⇇', '̑', '⊿', '̖', '☍', '➹', '⥊', '⁁', '✢']


In [None]:
def clean_text(txt, contraction_dict, punctuation, remove_stopwords=False):
    """Normalise one question into a lower-cased, space-separated token string.

    Steps, in order:
      1. expand the contractions listed in ``contraction_dict``
      2. delete every character listed in ``punctuation``, then turn each
         remaining run of non-alphanumeric characters into a single space
      3. tokenise with ``word_tokenize``
      4. optionally drop English stop words (off by default)
      5. join the tokens with single spaces and lower-case the result

    Parameters
    ----------
    txt : str
        Raw question text.
    contraction_dict : dict
        Maps a contraction to its expansion. Matching ignores ASCII letter
        case, so "What's" is expanded by the key "what's". A key that matches
        the text exactly takes precedence over a case-insensitive match.
    punctuation : iterable of str
        Single characters to delete from the text.
    remove_stopwords : bool, optional
        When True, tokens found in NLTK's English stop-word list are dropped.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    # A typographic apostrophe would otherwise hide contractions such as "don’t".
    txt = txt.replace('\u2019', "'")

    # An empty table would compile to a pattern that matches the empty string
    # everywhere, so the expansion step is skipped entirely in that case.
    if contraction_dict:
        lowered = {key.lower(): value for key, value in contraction_dict.items()}
        # Longest alternative first: regex alternation takes the first branch
        # that matches, so "i'd" must not be tried before "i'd've".
        alternatives = sorted(lowered, key=len, reverse=True)
        contraction_re = re.compile(
            '|'.join(re.escape(key) for key in alternatives),
            re.IGNORECASE | re.ASCII,
        )

        def _expand(match):
            found = match.group(0)
            if found in contraction_dict:
                return contraction_dict[found]
            return lowered.get(found.lower(), found)

        txt = contraction_re.sub(_expand, txt)

    # Set membership keeps the per-character test constant-time.
    banned = set(punctuation)
    txt = ''.join(char for char in txt if char not in banned)
    txt = re.sub("[^a-zA-Z0-9]+", ' ', txt)

    words = word_tokenize(txt)
    if remove_stopwords:
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if w.lower() not in stop_words]

    return ' '.join(words).lower()

In [None]:
# Run the cleaning pipeline over both splits, showing a progress bar for each.
tqdm.pandas()
for frame in (train_df, test_df):
    frame['cleaned_question_text'] = frame['question_text'].progress_apply(
        lambda txt: clean_text(txt, contraction_dict, punctuation)
    )

**Notes**: Although the training and test datasets were chosen very carefully, I found that both still contain some non-English text.

We can easily remove rows that contain `non English text` from the training set, but we *cannot* remove them from the test set.

In [4]:
# Show the training rows whose cleaned text is missing (NaN).
missing_mask = train_df['cleaned_question_text'].isna()
nan_rows = train_df.loc[missing_mask]
nan_rows

Unnamed: 0,qid,question_text,target,cleaned_question_text


In [None]:
# Keep only the rows that still have cleaned text.
train_df = train_df.dropna(subset=['cleaned_question_text'])

**Note:** when I was testing my code, I saved a version of preprocessed train_set to save some time

In [None]:
# Write the preprocessed training frame to a plain CSV (index column omitted).
# Alternative kept for reference, a zipped export:
#   train_df.to_csv("./train_processed.zip", index=False,
#                   compression=dict(method='zip', archive_name='out.csv'))
train_df.to_csv(path_or_buf='./train_preprocessed.csv', index=False)

# II. Tokenize the dataset and embed the words

Tokenization and word embedding transform text into a more digestible form so that machine learning algorithms can perform better.

## Tokenize the dataset

In [5]:
# Text field: spaCy tokenisation, batch-first tensors, sequence lengths returned with each batch.
TEXT = data.Field(
    tokenize='spacy',
    include_lengths=True,
    batch_first=True,
)
# Label field: labels stored as 64-bit integers.
LABEL = data.LabelField(batch_first=True, dtype=torch.int64)



In [7]:
# One (name, field) pair per CSV column, in file order; the first two columns get no field.
fields = [
    (None, None),
    (None, None),
    ('target', LABEL),
    ('text', TEXT),
]

# torchtext's TabularDataset only loads from a file on disk, so the CSV path is given directly.
dataset = data.TabularDataset(
    path='../input/train-dataset-quora-insincere-questions/out.csv',
    format='csv',
    fields=fields,
    skip_header=True,
)



In [8]:
# log data example
print(vars(dataset.examples[0]))

{'target': '0', 'text': ['how', 'quebec', 'nationalists', 'see', 'province', 'nation', '1960s']}


In [9]:
import random

SEED = 42
# Seed both generators used in this notebook: torch's and Python's `random`.
torch.manual_seed(SEED)
random.seed(SEED)

# `random.seed()` returns None, so its result cannot serve as `random_state`.
# The split expects a state object as returned by `random.getstate()`.
training_set, valid_set = dataset.split(split_ratio=0.8, random_state=random.getstate())

## Load GloVe word embedding

In [None]:
!unzip ../input/quora-insincere-questions-classification/embeddings.zip -d ./

In [None]:
# Kaggle's working directory has limited space, so the unused embeddings are deleted.
NON_USE_DIR = ['./glove.840B.300d', './GoogleNews-vectors-negative300', './paragram_300_sl999']

import os
import shutil


def remove_dir(path):
    """Delete the directory ``path`` together with everything inside it.

    Entries are removed one at a time; a failure on one entry is printed and
    the remaining entries are still attempted. The final ``os.rmdir`` raises
    if anything could not be removed. A ``path`` that is not an existing
    directory is ignored, so the cell can be re-run after the directories are
    already gone.
    """
    if not os.path.isdir(path):
        return
    for filename in os.listdir(path):
        file_path = os.path.join(path, filename)
        try:
            if os.path.isfile(file_path) or os.path.islink(file_path):
                os.unlink(file_path)
            elif os.path.isdir(file_path):
                shutil.rmtree(file_path)
        except Exception as e:
            print('Failed to delete %s. Reason: %s' % (file_path, e))
    os.rmdir(path)


# The loop variable is not called `dir`, which would shadow the builtin for the
# rest of the kernel session.
for unused_dir in NON_USE_DIR:
    remove_dir(unused_dir)
    

In [10]:
# Load the GloVe word vectors from their text file on disk
# (the file name indicates the 6B-token, 100-dimensional release).
import torchtext.vocab as vocab

custom_embeddings = vocab.Vectors(name = '../input/glove6b/glove.6B.100d.txt')

100%|█████████▉| 399999/400000 [00:27<00:00, 14422.92it/s]


In [11]:
# Build both vocabularies from the training split only; the text vocabulary is
# built with min_freq=3 and has the loaded vectors attached.
TEXT.build_vocab(training_set, vectors=custom_embeddings, min_freq=3)
LABEL.build_vocab(training_set)

# Vocabulary sizes: unique tokens in the text field, then unique labels.
print("Size of TEXT vocabulary:", len(TEXT.vocab))
print("Size of LABEL vocabulary:", len(LABEL.vocab))

# The ten most frequent tokens with their counts.
print(TEXT.vocab.freqs.most_common(10))

Size of TEXT vocabulary: 64853
Size of LABEL vocabulary: 2
[('what', 346962), ('i', 264607), ('how', 210414), ('why', 116259), ('is', 89048), ('would', 50611), ('get', 50099), ('best', 49881), ('people', 44434), ('can', 42469)]


# Model & Training

In this challenge, we build a basic bidirectional RNN that classifies each question as sincere or insincere.

## Bidirectional LSTM Model

In [13]:
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

In [14]:
class BidirectionalLSTM(nn.Module):
    """Bidirectional LSTM classifier over padded batches of token ids.

    The sequence representation is the forward direction's output at the last
    real token concatenated with the backward direction's output at the first
    token. It goes through dropout, a linear layer and a sigmoid.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    embedding_dim : int
        Size of each embedding vector.
    hidden_size : int
        Hidden size of the LSTM, per direction.
    num_layer : int
        Number of stacked LSTM layers.
    output_size : int
        Number of output units of the final linear layer.
    dropout_rate : float, optional
        Dropout probability applied to the sequence representation.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layer, output_size, dropout_rate=0.1):
        super(BidirectionalLSTM, self).__init__()
        self.dimension = hidden_size
        # Define layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layer, batch_first=True, bidirectional=True)
        # The dropout probability follows the constructor argument rather than a fixed 0.1.
        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(hidden_size * 2, output_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input, input_len):
        """Return sigmoid scores for a batch.

        ``input`` holds token ids, batch first; ``input_len`` holds the number
        of real tokens in each row. The result has the size-1 axis at
        position 1 removed, so with ``output_size == 1`` there is one score
        per row.
        """
        # Embed the tokens and pack them so the LSTM skips the padding.
        x = self.embedding(input)
        x = pack_padded_sequence(x, input_len.cpu(), batch_first=True, enforce_sorted=False)

        packed_output, _ = self.lstm(x)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # The first `dimension` features belong to the forward direction and are read
        # at each row's last real step. The rest belong to the backward direction,
        # whose final state sits at step 0.
        batch_index = torch.arange(output.size(0), device=output.device)
        last_step = (input_len - 1).to(output.device)
        out_forward = output[batch_index, last_step, :self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_feature = self.dropout(out_reduced)

        text_feature = self.fc(text_feature).squeeze(1)
        return self.sigmoid(text_feature)

        # Alternative noted by the author (not used): build the representation from
        # the LSTM's final hidden states, torch.cat((hidden[-2], hidden[-1]), dim=1).

In [49]:
class LSTM_GRU(nn.Module):
    """Embedding -> spatial dropout -> BiLSTM -> BiGRU -> mean pooling -> linear -> sigmoid.

    Parameters
    ----------
    vocab_size : int
        Number of rows in the embedding table.
    embedding_dim : int
        Size of each embedding vector.
    hidden_size : int
        Hidden size of the LSTM, per direction. The GRU uses ``hidden_size // 2``.
    num_layer : int
        Number of stacked layers in both the LSTM and the GRU.
    output_size : int
        Number of output units of the final linear layer.
    dropout_rate : float, optional
        Probability used by the spatial dropout on the embeddings.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, num_layer, output_size, dropout_rate=0.1):
        super(LSTM_GRU, self).__init__()
        gru_hidden = hidden_size // 2
        # Define layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, num_layer, batch_first=True, bidirectional=True)
        self.gru = nn.GRU(hidden_size * 2, gru_hidden, num_layer, batch_first=True, bidirectional=True)

        # Width of the concatenated BiLSTM and BiGRU outputs. This equals
        # hidden_size * 3 for an even hidden_size and stays correct for an odd one.
        self.fc = nn.Linear(hidden_size * 2 + gru_hidden * 2, output_size)

        self.emb_dropout = nn.Dropout2d(dropout_rate)
        self.sigmoid = nn.Sigmoid()

    def forward(self, input, input_len):
        """Return sigmoid scores of shape (N, output_size).

        ``input`` holds token ids, batch first; ``input_len`` holds the number
        of real tokens in each row.
        """
        # layer 1: embedding -> (N, T, K)
        x = self.embedding(input)

        # layer 2: spatial dropout 1D. Dropout2d zeroes whole channels, so the
        # embedding axis is moved into the channel position and back.
        x = x.permute(0, 2, 1).unsqueeze(2)   # (N, K, 1, T)
        x = self.emb_dropout(x)               # (N, K, 1, T)
        x = x.squeeze(2).permute(0, 2, 1)     # (N, T, K)

        # layer 3: bidirectional LSTM on the packed batch
        packed = pack_padded_sequence(x, input_len.cpu(), batch_first=True, enforce_sorted=False)
        packed_lstm, _ = self.lstm(packed)

        # layer 4: bidirectional GRU on top of the LSTM output
        packed_gru, _ = self.gru(packed_lstm)

        lstm_out, _ = pad_packed_sequence(packed_lstm, batch_first=True)
        gru_out, _ = pad_packed_sequence(packed_gru, batch_first=True)

        # layer 5: concatenate along the feature axis
        features = torch.cat((gru_out, lstm_out), 2)

        # layer 6: average over the real timesteps only. Padded steps are zero, so
        # the sum is unaffected, but dividing by the padded length would make a
        # row's features depend on the longest sequence in its batch.
        lengths = input_len.to(device=features.device, dtype=features.dtype).unsqueeze(1)
        avg_pool = features.sum(dim=1) / lengths

        # layer 7: fully connected + sigmoid
        x = self.fc(avg_pool)
        return self.sigmoid(x)

## Helper function 

In [15]:
def binary_accuracy(preds, y):
    """Return the share of entries in ``preds`` that equal ``y`` after rounding.

    The number of matches is divided by the length of the first axis.
    """
    hits = torch.eq(torch.round(preds), y).float()
    return hits.sum() / len(hits)

def f1_loss(y_true:torch.Tensor, y_pred:torch.Tensor, is_training=False) -> torch.Tensor:
    '''Compute the F1 score of ``y_pred`` against ``y_true``. Works with gpu tensors.

    Despite the name, the value returned is the score itself, not a loss.
    Based on the implementation written by Michal Haltuf on Kaggle.

    Parameters
    ----------
    y_true : torch.Tensor
        1-D tensor of labels.
    y_pred : torch.Tensor
        1-D tensor of predictions, or a 2-D tensor that is reduced with
        ``argmax`` over dim 1 first.
    is_training : bool
        Accepted but not used.

    Returns
    -------
    torch.Tensor
        Scalar (0-dim) tensor.

    Reference
    ---------
    - https://www.kaggle.com/rejpalcz/best-loss-function-for-f1-score-metric
    - https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
    - https://discuss.pytorch.org/t/calculating-precision-recall-and-f1-score-in-case-of-multi-label-classification/28265/6

    '''
    assert y_true.ndim == 1
    assert y_pred.ndim == 1 or y_pred.ndim == 2

    if y_pred.ndim == 2:
        y_pred = y_pred.argmax(dim=1)

    # Small constant that keeps every denominator non-zero.
    eps = 1e-7
    neg_true, neg_pred = 1 - y_true, 1 - y_pred

    tp = (y_true * y_pred).sum().to(torch.float32)
    fp = (neg_true * y_pred).sum().to(torch.float32)
    fn = (y_true * neg_pred).sum().to(torch.float32)

    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)

    return 2 * (precision * recall) / (precision + recall + eps)

In [53]:
def train(model, device, train_iterator, optimizer, loss_function):
    """Run one training epoch and return ``(mean loss, mean F1)`` as floats.

    Parameters
    ----------
    model : nn.Module
        Called as ``model(tokens, lengths)``; its output is flattened to one
        value per sample.
    device : str or torch.device
        Where the batch tensors are moved before the forward pass.
    train_iterator : sized iterable
        Yields batches exposing ``.text`` (a ``(tokens, lengths)`` pair) and
        ``.target``.
    optimizer : torch.optim.Optimizer
    loss_function : callable
        Called as ``loss_function(predictions, float_targets)``.

    Notes
    -----
    The F1 value comes from ``f1_loss`` applied to the raw model outputs (no
    thresholding), averaged over batches.
    """
    model.train()
    running_loss = 0.0
    running_f1 = 0.0
    for batch in train_iterator:
        # move the batch onto the target device
        tokens, lengths = batch.text
        tokens, lengths = tokens.to(device), lengths.to(device)
        target = batch.target.to(dtype=torch.float32, device=device)

        # forward. reshape(-1) keeps the batch axis even for a batch of one;
        # a bare squeeze() would give a 0-dim tensor that no longer matches the target.
        predict = model(tokens, lengths).reshape(-1)
        loss = loss_function(predict, target)

        # zero the gradient + backpropagation + step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # metrics, accumulated as plain floats so no autograd graph stays
        # referenced across batches
        running_loss += loss.item()
        with torch.no_grad():
            running_f1 += float(f1_loss(target, predict.detach()))

    epoch_loss = running_loss / len(train_iterator)
    epoch_acc = running_f1 / len(train_iterator)

    return epoch_loss, epoch_acc

In [54]:
def test(model, device, test_iterator, loss_function):
    model.eval()
    running_loss = 0
    accuracy     = 0
    
    with torch.no_grad():
        for i, (batch) in enumerate(test_iterator):
            input, input_len = batch.text
            input, input_len = input.to(device), input_len.to(device)

            predict = model(input, input_len).squeeze()
            loss = loss_function(predict, batch.target.to(dtype=torch.float32, device=device))

            running_loss += loss.item()
            accuracy     += f1_loss(predict, batch.target)

    epoch_loss = running_loss/len(test_iterator)
    epoch_acc  = accuracy/len(test_iterator)
    
    return epoch_loss, epoch_acc

## Training process

I used `AdamW` from the `torch` library as the optimizer and `Binary Cross Entropy` as the loss function.

In [60]:
# Device selection: the current CUDA device when one is available, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = f"cuda:{torch.cuda.current_device()}" if use_cuda else "cpu"
device_label = torch.cuda.get_device_name(torch.cuda.current_device()) if use_cuda else "cpu"
print("Training on:", device_label)

# Data and model hyper-parameters
BATCH_SIZE    = 32
VOCAB_SIZE    = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_SIZE   = 128
OUTPUT_SIZE   = 1
NUM_LAYER     = 2

# Number of passes over the training split
N_EPOCH       = 15

Training on: Tesla P100-PCIE-16GB


In [56]:
# Batch iterators for the training and validation splits; examples are
# ordered by token count inside each batch.
def _text_length(example):
    """Sort key: number of tokens in the example's text."""
    return len(example.text)

train_iterator, valid_iterator = data.BucketIterator.splits(
    (training_set, valid_set),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key=_text_length,
    sort_within_batch=True,
)

In [61]:
# init model
model = LSTM_GRU(vocab_size=VOCAB_SIZE,
             embedding_dim=EMBEDDING_DIM,
             hidden_size=HIDDEN_SIZE,
             num_layer=NUM_LAYER,
             output_size=OUTPUT_SIZE).to(device)

# Resume from an earlier checkpoint only when one exists, so a fresh run starts
# from scratch instead of failing on the missing file. `map_location` lets a
# checkpoint saved on a GPU load on whichever device is in use now.
if os.path.exists('./weights.pt'):
    model.load_state_dict(torch.load('./weights.pt', map_location=device))

# loss function & optimizer
# (earlier setting: optim.Adam(model.parameters(), lr=0.0001))
optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.07)

# Binary cross entropy on the sigmoid outputs; the training cell refers to it by this name.
citeration = nn.BCELoss()

In [21]:
# Initialize the embedding layer from the pretrained vectors attached to TEXT's vocabulary.
# NOTE(review): this copy overwrites whatever the embedding layer held before,
# including weights restored by an earlier `load_state_dict` call. Confirm that
# this is the intended order.
pretrained_embeddings = TEXT.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

# Shape of the copied matrix
print(pretrained_embeddings.shape)

torch.Size([64853, 100])


In [None]:
# Start training
best_accuracy = 0

# Per-epoch history. The plotting cell reads these four lists, so they have to
# be filled here.
train_losses, valid_losses = [], []
train_res, valid_res = [], []

for epoch in range(N_EPOCH):
    # train
    train_loss, train_accuracy = train(model, device, train_iterator, optimizer, citeration)

    # evaluate
    test_loss, test_accuracy = test(model, device, valid_iterator, citeration)

    # record as plain floats (this also converts 0-dim tensors) so the history can be plotted
    train_losses.append(float(train_loss))
    valid_losses.append(float(test_loss))
    train_res.append(float(train_accuracy))
    valid_res.append(float(test_accuracy))

    # save the best model, judged by the validation F1 score
    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        torch.save(model.state_dict(), './weights.pt')

    print(f'Epoch {epoch+1} summary ===========================')
    print(f'Train Loss: {train_loss:.3f} | Train F1 score: {train_accuracy*100:.2f}%')
    print(f' Val. Loss: {test_loss:.3f} |  Val. F1 score: {test_accuracy*100:.2f}%')

## Visualize training experiment

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix

In [None]:
# One x position per recorded epoch.
epoch = range(0, len(train_losses))

fig, (ax0, ax1) = plt.subplots(nrows=1, ncols=2, sharex=True, figsize=(24, 6))

# (axis, training series, validation series, metric name for legend/title, y-axis label)
panels = (
    (ax0, train_losses, valid_losses, 'loss', 'Loss'),
    (ax1, train_res, valid_res, 'F1 score', 'F1 score'),
)
for axis, train_series, valid_series, metric, y_label in panels:
    axis.plot(epoch, train_series, 'g', label='Training ' + metric)
    axis.plot(epoch, valid_series, 'b', label='validation ' + metric)
    axis.set_title('Training and Validation ' + metric)
    axis.set_xlabel('Epochs')
    axis.set_ylabel(y_label)
    axis.legend()

plt.show()

# Prediction and submission

The submission file is `csv` format and contains 2 columns: 
- `qid` is the ID of the question
- `prediction` is `1` or `0`, i.e. whether the question is insincere or not.

%load_ext tensorboard
%tensorboard --logdir logs

In [None]:
# test dataset
test_df.head()

In [None]:
# helper function
import spacy
nlp = spacy.load('en')

def predict(model, sentence):
    """Score one cleaned sentence with ``model`` and return the result as a float.

    The sentence is tokenised with the spaCy tokenizer, mapped to ids through
    ``TEXT.vocab.stoi`` and fed to the model as a batch of one.

    Returns ``0.0`` without calling the model when ``sentence`` is not a string
    or yields no tokens. Any other failure is raised instead of being turned
    into a silent score of 0.
    """
    if not isinstance(sentence, str):
        return 0.0

    tokenized = [tok.text for tok in nlp.tokenizer(sentence)]  # tokenize the sentence
    if not tokenized:
        # An empty sequence cannot be packed for the recurrent layers.
        return 0.0

    indexed = [TEXT.vocab.stoi[t] for t in tokenized]          # convert to integer sequence
    tensor = torch.LongTensor(indexed).unsqueeze(0).to(device) # shape (1, number of tokens)
    length_tensor = torch.LongTensor([len(indexed)])           # number of tokens, as a tensor

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        prediction = model(tensor, length_tensor)

    return prediction.item()

In [None]:
# Scores at or above this cut-off are labelled 1 (insincere).
THRES_HOLD = 0.32

# NOTE(review): the training cell saves to './weights.pt', while this cell loads
# './weightsv3.pt'. Confirm which checkpoint is intended.
model.load_state_dict(torch.load('./weightsv3.pt'))
model.eval()

# Raw model score for each cleaned test question, in row order, then the thresholded label.
raw_prediction = [predict(model, text) for text in test_df['cleaned_question_text']]
prediction = [1 if score >= THRES_HOLD else 0 for score in raw_prediction]

print('# of prediction', len(prediction))

In [None]:
# Attach the raw scores and the thresholded labels to the test frame.
for column, values in (('raw_prediction', raw_prediction), ('prediction', prediction)):
    test_df[column] = values

test_df.head(10)

In [None]:
# Drop the text and raw-score columns, then write what is left as the submission file.
prediction_df = test_df.drop(columns=['question_text', 'cleaned_question_text', 'raw_prediction'])
prediction_df.to_csv('./submission.csv', index=False)

prediction_df.head(10)