In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import gc
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score
from nltk.tokenize import WordPunctTokenizer
from collections import Counter

from sklearn.model_selection import train_test_split

# Seed every random number generator this notebook draws from so that
# reruns start from the same state.
SEED = 41
np.random.seed(SEED)              # NumPy global RNG (used by np.random.shuffle when batching)
torch.manual_seed(SEED)           # PyTorch default generator
torch.cuda.manual_seed_all(SEED)  # every CUDA device, not only the current one

In [2]:
# Directory that load_full() reads train.csv, test.csv and test_labels.csv from.
RAW_DATA_PATH        = '../../dl_nlp/data/jigsaw_toxic/raw/'
# Directory that load_sample() reads train_sample.csv from.
PROCESSED_DATA_PATH  = '../../dl_nlp/data/jigsaw_toxic/processed/' 

# Default row width (max_len) of the character-id matrix built by as_matrix().
SEQ_LEN = 512

### Helper Methods

In [3]:
def tokenize_sentences(sentences):
    """Lowercase every sentence and split it with NLTK's WordPunctTokenizer.

    Returns a list holding one token list per input sentence, in input order.
    """
    splitter  = WordPunctTokenizer()
    tokenized = []

    for text in sentences:
        tokenized.append(splitter.tokenize(text.lower()))

    return tokenized

def get_tokens(tokenized_sentences):
    """Flatten a list of token lists into a single list of tokens, keeping order."""
    flat = []
    for sentence_tokens in tokenized_sentences:
        flat.extend(sentence_tokens)
    return flat

def get_chars(tokens):
    """Return the distinct characters that occur in `tokens` as a sorted list.

    The result is sorted because the iteration order of a set of strings
    depends on per-process string hashing; a sorted list is identical on
    every run, so any ids derived from positions in it are reproducible.
    """
    return sorted({char for token in tokens for char in token})

def load_sample():
    """Read train_sample.csv from PROCESSED_DATA_PATH into a DataFrame."""
    sample_csv = os.path.join(PROCESSED_DATA_PATH, 'train_sample.csv')
    return pd.read_csv(sample_csv)

def load_full():
    """Read the three raw CSV files found under RAW_DATA_PATH.

    Returns a tuple of DataFrames in the order (train, test, test_labels).
    """
    file_names = ('train.csv', 'test.csv', 'test_labels.csv')
    return tuple(pd.read_csv(os.path.join(RAW_DATA_PATH, file_name))
                 for file_name in file_names)

In [4]:
# %%time
# train = load_sample()

In [5]:
%%time
# Only the training frame is kept; the test frame and test labels that
# load_full() also returns are discarded here.
train, _, _ = load_full()

CPU times: user 1.37 s, sys: 232 ms, total: 1.6 s
Wall time: 1.6 s


In [6]:
# Label columns; selected later with .loc[:, TARGET_COLS] to build the target arrays.
TARGET_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [7]:
%%time
# Lowercase and tokenize every training comment (one token list per comment).
train_tokenized_comments = tokenize_sentences(train.comment_text)

CPU times: user 4.51 s, sys: 336 ms, total: 4.85 s
Wall time: 4.85 s


In [30]:
# comment_len = [len(comment) for comment in train_tokenized_comments]
# pd.Series(comment_len).describe()

In [9]:
# Fixed character set: any character outside it would be treated as the UNK (unknown) symbol.
# unique_chars = 'abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’"/|_#$%ˆ&*˜‘+=<>()[]{}'
# print(unique_chars)

abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:’"/|_#$%ˆ&*˜‘+=<>()[]{}


In [20]:
%%time
# Character vocabulary of the training comments.
# Sorted on purpose: the iteration order of a set of strings depends on
# per-process string hashing, so an unsorted list would hand out different
# character ids after every kernel restart even with a fixed SEED.
unique_chars = sorted({char.lower()
                       for comment in train_tokenized_comments
                       for token in comment
                       for char in token})
print(len(unique_chars))

2103


In [21]:
# token to index
# Two ids are reserved: UNK_IX for characters missing from the vocabulary
# and PAD_IX for padding positions. Real characters take the ids after them.
UNK, PAD       = 'UNK', 'PAD'
UNK_IX, PAD_IX =  0, 1

char_to_id = {UNK: UNK_IX,
              PAD: PAD_IX
             }

for char in unique_chars:
    # setdefault keeps ids contiguous even if unique_chars contains a duplicate.
    char_to_id.setdefault(char, len(char_to_id))

# The reserved entries must keep the ids that as_matrix() is given (UNK_IX /
# PAD_IX). They are intentionally NOT reassigned after the loop: doing so
# would give them ids >= len(char_to_id), outside an embedding table that has
# len(char_to_id) rows.
assert char_to_id[UNK] == UNK_IX and char_to_id[PAD] == PAD_IX
assert max(char_to_id.values()) == len(char_to_id) - 1

In [22]:
# create a batch out of sentences

def as_matrix(sequences, char_to_id, UNK_IX, PAD_IX, max_len=SEQ_LEN):
    """Encode sequences as a fixed-width matrix of character ids.

    Args:
        sequences: iterable with a length; each item is either a string or a
            list of tokens. Either way it is flattened to single characters.
        char_to_id: mapping from character to integer id. Characters are
            looked up exactly as they appear (no case folding is done here).
        UNK_IX: id used for characters that are missing from `char_to_id`.
        PAD_IX: id used to fill positions past the end of a sequence.
        max_len: number of columns of the result.

    Returns:
        Array of shape (len(sequences), max_len), pre-filled with PAD_IX.
        Row i holds the ids of the first `max_len` characters of sequence i.
    """

    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))

    for i, seq in enumerate(sequences):
        row_ix = [char_to_id.get(char, UNK_IX) for word in seq[:max_len] for char in word]
        # seq[:max_len] bounds the number of items (characters of a string, or
        # tokens of a list), not the number of characters. A token list can
        # still expand to more than max_len characters, which would not fit
        # in the row, so truncate the flattened ids as well.
        row_ix = row_ix[:max_len]
        matrix[i, :len(row_ix)] = row_ix

    return matrix

### Split into train and validation sets

In [23]:
# Hold out 20% of the rows for validation; both parts are re-indexed from 0.
data_train, data_val = (
    part.reset_index(drop=True)
    for part in train_test_split(train, test_size=0.2, random_state=42)
)

print("Train size = ", len(data_train))
print("Validation size = ", len(data_val))

Train size =  127656
Validation size =  31915


In [24]:
def iterate_batches(matrix, labels, batch_size, predict_mode='train'):
    """Yield consecutive mini-batches of `matrix`.

    With predict_mode == 'train' the row order is shuffled with the global
    NumPy RNG and each item is an (X, y) pair; with any other mode rows keep
    their order and only X is yielded (`labels` is not touched).
    """
    n_rows   = len(matrix)
    training = predict_mode == 'train'
    order    = np.arange(n_rows)

    if training:
        np.random.shuffle(order)

    for lo in range(0, n_rows, batch_size):
        # slicing past the end is clipped, so the last batch may be shorter
        picked   = order[lo: lo + batch_size]
        features = matrix[picked]

        if training:
            yield features, labels[picked]
        else:
            yield features

In [25]:
# matrix = as_matrix(data_train.comment_text, char_to_id, UNK_IX=UNK_IX, PAD_IX=PAD_IX)
# labels = data_train.loc[:, TARGET_COLS].values
# X, y   = next(iterate_batches(matrix, labels, batch_size=2))

In [26]:
class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one."""
    def forward(self, input):
        batch_size = input.size(0)
        return input.view(batch_size, -1)
    
class ConvBlock(nn.Module):
    """Conv1d (kernel 3, padding 1) -> BatchNorm1d, plus a shortcut, then ReLU.

    Args:
        num_output_channels: channel count of the block's input. Despite the
            name, it is passed as `in_channels` to both convolutions.
        num_feature_maps: channel count produced by the block.
        upsample: when True the shortcut goes through a 1x1 convolution so its
            channel count matches the main path; when False the input itself
            is added to the main path.
    """
    def __init__(self, num_output_channels, num_feature_maps, upsample=False):
        super().__init__()

        self.num_output_channels = num_output_channels
        self.num_feature_maps    = num_feature_maps
        self.kernel_size         = 3

        # main path
        self.conv1 = nn.Conv1d(in_channels=num_output_channels,
                               out_channels=num_feature_maps,
                               kernel_size=self.kernel_size,
                               padding=1,
                               bias=False)
        self.relu  = nn.ReLU()
        self.bn    = nn.BatchNorm1d(num_feature_maps)

        # shortcut path; the 1x1 convolution is always created but is only
        # applied when `upsample` is set
        self.upsample = upsample
        self.conv1x1  = nn.Conv1d(in_channels=num_output_channels,
                                  out_channels=num_feature_maps,
                                  kernel_size=1,
                                  bias=False)

    def forward(self, x):
        main_path = self.bn(self.conv1(x))

        shortcut = x
        if self.upsample:
            shortcut = self.conv1x1(shortcut)

        # add the shortcut in place, then apply the non-linearity
        main_path += shortcut
        return self.relu(main_path)
    
class KMaxPool(nn.Module):
    """Keep the k largest values along one dimension, in their original order."""
    def __init__(self):
        super().__init__()

    def forward(self, x, dim, k=8):
        # positions of the k largest entries along `dim`
        _, top_positions = x.topk(k, dim=dim)
        # re-sort the positions so the kept values follow the input order
        ordered_positions, _ = top_positions.sort(dim=dim)
        return x.gather(dim, ordered_positions)
        
class VDCNN(nn.Module):
    """Character-level convolutional classifier producing 6 logits per input row.

    Pipeline: character embedding -> Conv1d -> ReLU -> two stages of
    (ConvBlock, max-pool of 2) -> k-max pooling over the sequence axis ->
    flatten -> fully connected layer.

    Args:
        vocab_size: number of rows of the character embedding table.

    Input to `forward` is an integer tensor of shape (batch_size, seq_len).
    The two max-pool layers each halve the sequence and k-max pooling needs
    at least `k_max` positions, so seq_len must be at least 4 * k_max.
    """
    def __init__(self, vocab_size):
        super(VDCNN, self).__init__()

        self.hidden_dim         = 16
        self.num_feature_maps   = 64
        # channels produced by the last active ConvBlock, and the k of the
        # k-max pooling; together they determine the input size of `fc`
        self.final_feature_maps = 128
        self.k_max              = 8

        # define embedding space for characters
        self.char_embedding = nn.Embedding(vocab_size, self.hidden_dim)

        # convolutional layer of fixed kernel size
        self.conv1 = nn.Conv1d(self.hidden_dim,
                               self.num_feature_maps,
                               kernel_size=3,
                               padding=1
                              )

        # relu layer
        self.relu     = nn.ReLU()

        # conv blocks
        self.convbl_1 = ConvBlock(self.num_feature_maps, 64, upsample=False)
        self.convbl_2 = ConvBlock(64, self.final_feature_maps, upsample=True)
#         self.convbl_3 = ConvBlock(128, 256, upsample=True)
#         self.convbl_4 = ConvBlock(256, 512, upsample=True)

        # pooling layer
        self.pool     = nn.MaxPool1d(kernel_size=2)

        # k-max pooling layer
        self.kmax_pool = KMaxPool()

        # flatten any layer
        self.flatten = Flatten()

        # dropout layer
        # NOTE(review): defined but never applied in forward() - confirm
        # whether it was meant to sit in front of `fc`.
        self.dropout = nn.Dropout(0.4)

        # fc
        # The flattened input has final_feature_maps * k_max features. The
        # previous hard-coded 256 * 8 only matched a network with convbl_3
        # enabled; with two blocks the features are 128 * 8, so the old layer
        # size made forward() fail with a shape mismatch.
        # If convbl_3 / convbl_4 are re-enabled, update final_feature_maps.
        self.fc = nn.Linear(self.final_feature_maps * self.k_max, 6)

    def forward(self, x):
        embed = self.char_embedding(x)

        # raw embedding produces (batch_size, seq_len, channels)
        # but pytorch expects (batch_size, channels, seq_len)

        embed = torch.transpose(embed, 1, 2)

        # first layer of convolutions
        out = self.conv1(embed)
        out = self.relu(out)

        ## ConvBlock followed by pooling

        # (Convolutional Block, 3, 64)
        out = self.convbl_1(out)
        out = self.pool(out)

        # (Convolutional Block, 3, 128)
        out =  self.convbl_2(out)
        out =  self.pool(out)

        # (Convolutional Block, 3, 256)
#         out = self.convbl_3(out)
#         out = self.pool(out)

        # (Convolutional Block, 3, 512)
#         out = self.convbl_4(out)
#         out = self.pool(out)

        # k-max pooling at the end: (batch_size, channels, k_max)
        out = self.kmax_pool(out, dim=2, k=self.k_max)

        # flatten to (batch_size, channels * k_max)
        out = self.flatten(out)

        # pass it through fully connected layer
        out = self.fc(out)

        return out

In [15]:
# convert input and output into torch tensors
# X = torch.cuda.LongTensor(X)
# y = torch.cuda.LongTensor(y)

In [16]:
# vocab_size = len(char_to_id)

# model    = VDCNN(vocab_size).cuda()
# logits   = model(X)

In [17]:
# logits.shape

### Training Loop

In [27]:
def do_epoch(model, criterion, data, batch_size, optimizer=None):
    """Run one pass over `data`, updating the model when an optimizer is given.

    Args:
        model: network called as ``model(X_batch)``; returns one logit per label.
        criterion: loss called as ``criterion(logits, y_batch)``.
        data: ``(matrix, labels)`` pair of arrays with matching first dimension.
        batch_size: number of rows per mini-batch.
        optimizer: when provided, the model is put in training mode and stepped
            after every batch. When ``None``, the model is put in evaluation
            mode and gradient tracking is disabled.

    Returns:
        ``(loss, auc)``: the mean of the per-batch losses, and the ROC AUC
        averaged over the label columns (computed on the raw logits).

    Batches come from ``iterate_batches`` in its default mode, so rows are
    shuffled for evaluation passes as well; targets are collected in the same
    order as the predictions, so the metrics do not depend on that order.
    """
    is_train = optimizer is not None
    model.train(is_train)

    matrix, labels = data
    batchs_count   = math.ceil(matrix.shape[0] / batch_size)

    # Build the batches on whatever device the model lives on instead of
    # assuming CUDA, so the same loop also runs on a CPU-only machine.
    device = next(model.parameters()).device

    epoch_loss = 0
    score_chunks, target_chunks = [], []

    with torch.autograd.set_grad_enabled(is_train):
        for i, (X_batch, y_batch) in enumerate(iterate_batches(matrix, labels, batch_size)):
            X_batch = torch.as_tensor(X_batch, dtype=torch.long, device=device)
            y_batch = torch.as_tensor(y_batch, dtype=torch.float32, device=device)

            logits = model(X_batch)
            loss   = criterion(logits, y_batch)

            if is_train:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()

            # keep predictions and targets of this batch for the epoch-level AUC
            target_chunks.append(y_batch.detach().cpu().numpy())
            score_chunks.append(logits.detach().cpu().numpy())

            epoch_loss += loss.item()

            # `end` belongs to print(), not to str.format(): with it the '\r'
            # rewrites a single progress line instead of emitting one output
            # line per batch.
            print('\r[{} / {}]: Loss = {:.4f}'.format(
                  i, batchs_count, loss.item()), end='')

    y_true  = np.concatenate(target_chunks)
    y_score = np.concatenate(score_chunks)

    # one AUC per label column, then averaged
    label_auc = [roc_auc_score(y_true[:, j], y_score[:, j])
                 for j in range(y_true.shape[1])]

    return epoch_loss / batchs_count, np.mean(label_auc)

def fit(model, criterion, optimizer, train_data, epochs_count=1, 
        batch_size=32, val_data=None, val_batch_size=None):
    """Train for `epochs_count` epochs and print one summary line per epoch.

    Each epoch runs `do_epoch` with the optimizer on `train_data`; when
    `val_data` is given, a second pass without optimizer follows and its loss
    and AUC are appended to the summary. `val_batch_size` falls back to
    `batch_size` when it is not given. The reported epoch time covers the
    validation pass as well. Nothing is returned.
    """
    has_val = val_data is not None
    if has_val and val_batch_size is None:
        val_batch_size = batch_size

    for epoch_ix in range(epochs_count):
        started = time.time()

        metrics  = list(do_epoch(model, criterion, train_data, batch_size, optimizer))
        template = '\rEpoch {} / {}, Epoch Time = {:.2f}s: Train Loss = {:.4f}, Train AUC = {:.4f}'

        if has_val:
            metrics.extend(do_epoch(model, criterion, val_data, val_batch_size, None))
            template += ', Val Loss = {:.4f}, Val AUC = {:.4f}'

        elapsed = time.time() - started
        print(template.format(epoch_ix + 1, epochs_count, elapsed, *metrics))

### Train on the full dataset

In [28]:
# The embedding table of VDCNN gets one row per entry of char_to_id.
vocab_size = len(char_to_id)

model      = VDCNN(vocab_size).cuda()

# Loss applied to the raw logits against the TARGET_COLS label columns.
criterion  = nn.BCEWithLogitsLoss().cuda()
# optimizer  = optim.Adam([param for param in model.parameters() if param.requires_grad], lr=0.01)
optimizer  = optim.SGD([param for param in model.parameters() if param.requires_grad], lr=0.03, momentum=0.9)

# Encode the raw comment text of the training split as a character-id matrix.
X_train      = as_matrix(data_train.comment_text, char_to_id, UNK_IX=UNK_IX, PAD_IX=PAD_IX)
train_labels = data_train.loc[:, TARGET_COLS].values 

# NOTE: despite their names, X_test / test_labels hold the validation split (data_val).
X_test       = as_matrix(data_val.comment_text, char_to_id, UNK_IX=UNK_IX, PAD_IX=PAD_IX)
test_labels  = data_val.loc[:, TARGET_COLS].values

# Train for 7 epochs and evaluate on the validation split after each one.
fit(model, criterion, optimizer, train_data=(X_train, train_labels), epochs_count=7, 
    batch_size=512, val_data=(X_test, test_labels), val_batch_size=1024)

[0 / 250]: Loss = 0.5423
[1 / 250]: Loss = 0.1643
[2 / 250]: Loss = 0.2826
[3 / 250]: Loss = 0.4249
[4 / 250]: Loss = 0.3325
[5 / 250]: Loss = 0.3136
[6 / 250]: Loss = 0.2977
[7 / 250]: Loss = 0.1880
[8 / 250]: Loss = 0.3451
[9 / 250]: Loss = 0.3080
[10 / 250]: Loss = 0.1842
[11 / 250]: Loss = 0.2125
[12 / 250]: Loss = 0.2250
[13 / 250]: Loss = 0.2382
[14 / 250]: Loss = 0.1801
[15 / 250]: Loss = 0.2040
[16 / 250]: Loss = 0.2210
[17 / 250]: Loss = 0.1935
[18 / 250]: Loss = 0.1936
[19 / 250]: Loss = 0.1719
[20 / 250]: Loss = 0.1845
[21 / 250]: Loss = 0.1708
[22 / 250]: Loss = 0.1938
[23 / 250]: Loss = 0.1653
[24 / 250]: Loss = 0.1920
[25 / 250]: Loss = 0.1844
[26 / 250]: Loss = 0.1820
[27 / 250]: Loss = 0.1490
[28 / 250]: Loss = 0.1876
[29 / 250]: Loss = 0.1643
[30 / 250]: Loss = 0.1611
[31 / 250]: Loss = 0.1866
[32 / 250]: Loss = 0.1190
[33 / 250]: Loss = 0.1673
[34 / 250]: Loss = 0.1877
[35 / 250]: Loss = 0.1744
[36 / 250]: Loss = 0.1705
[37 / 250]: Loss = 0.1109
[38 / 250]: Loss = 0.1

`Epoch 7 / 7, Epoch Time = 36.67s: Train Loss = 0.0696, Train AUC = 0.9440, Val Loss = 0.0702, Val AUC = 0.9403`

```
BEST RUN:

Epoch 3 / 3, 
Epoch Time = 161.67s: 
Train Loss = 0.0745, 
Train AUC = 0.9344, 
Val Loss = 0.0728, 
Val AUC = 0.9401
=========================================


```