In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import gc
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score
import gensim.models.keyedvectors as word2vec
from nltk.tokenize import WordPunctTokenizer
from collections import Counter

SEED = 41
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [2]:
RAW_DATA_PATH        = '../../dl_nlp/data/jigsaw_toxic/raw/'
PROCESSED_DATA_PATH  = '../../dl_nlp/data/jigsaw_toxic/processed/' 

MAX_LEN = 100

### Load Data

In [3]:
def load_sample():
    return pd.read_csv(os.path.join(PROCESSED_DATA_PATH, 'train_sample.csv'))

def load_full():
    train       = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train.csv'))
    test        = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test.csv'))
    test_labels = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_labels.csv'))
    
    return train, test, test_labels

In [4]:
%%time
train, _, _ = load_full()

CPU times: user 1.42 s, sys: 332 ms, total: 1.75 s
Wall time: 2.39 s


#### Define target columns

In [5]:
TARGET_COLS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

#### Tokenization

In [6]:
# define tokenizer
tokenizer = WordPunctTokenizer()

In [7]:
%%time
train_tokenized_comments = list(map(tokenizer.tokenize, train.comment_text))

CPU times: user 4.8 s, sys: 336 ms, total: 5.14 s
Wall time: 5.14 s


In [8]:
%%time
train.loc[:, 'tokenized_comments'] = list(map(' '.join, map(tokenizer.tokenize, train.comment_text)))

CPU times: user 5.25 s, sys: 4.38 s, total: 9.63 s
Wall time: 9.62 s


### Create vocabulary

#### Create word freq mapping

In [9]:
token_counts = Counter()

for tok_comments in train_tokenized_comments:
    token_counts.update(tok_comments)

In [10]:
# we can put a threshold on the token frequency to reduce the vocabulary
tokens    = {}
min_count = 10

for token, freq in token_counts.items():
    if freq >= min_count:
        tokens[token] = freq
        
print('Size of the vocabulary: {}'.format(len(tokens)))

Size of the vocabulary: 32838


### Load Pretrained Embeddings

In [11]:
def load_wv_embedding_matrix(words):
    word2vec_dict = word2vec.KeyedVectors.load_word2vec_format('../../dl_nlp/data/jigsaw_toxic/processed/word2vec.bin.gz', binary=True)
    embed_size    = 300

    embedding_index = dict()
    for word in word2vec_dict.wv.vocab:
        embedding_index[word] = word2vec_dict.word_vec(word)

    print('Loaded %d word vectors'%(len(embedding_index)))

    all_embs          = np.stack(list(embedding_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    UNK, PAD       = 'UNK', 'PAD'
    UNK_IX, PAD_IX = len(words), len(words) + 1

    nb_words = len(words) + 2

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    embed_cnt = 0
    for i, word in enumerate(list(words.keys()) + [UNK, PAD]):
        embedding_vector = embedding_index.get(word)

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            embed_cnt +=1

    print('total embedded ', embed_cnt, ' common words')
    del embedding_index
    gc.collect()

    return embedding_matrix, UNK, PAD, UNK_IX, PAD_IX

### Token to ID mapping

In [12]:
# token to index (manual)
# UNK, PAD       = 'UNK', 'PAD'
# UNK_IX, PAD_IX =  0, 1

# token_to_id = {UNK: UNK_IX,
#                PAD: PAD_IX
#               }

# for token in tokens.keys():
#     token_to_id[token] = len(token_to_id)

In [13]:
%%time

# token to index ( word2vec embeddings )
embedding_matrix, UNK, PAD, UNK_IX, PAD_IX = load_wv_embedding_matrix(tokens)

  


Loaded 3000000 word vectors
total embedded  29714  common words
CPU times: user 1min 59s, sys: 8.26 s, total: 2min 7s
Wall time: 2min 4s


In [14]:
token_to_id      = {word: index for index, word in enumerate(tokens.keys())}
token_to_id[UNK] = UNK_IX
token_to_id[PAD] = PAD_IX

### Pad Sequences and convert map tokens to indices

In [22]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

def as_matrix(sequences, token_to_id, word_dropout, UNK_IX, PAD_IX, max_len=None):
    """ Convert a list of tokens into a matrix with padding """

    if isinstance(sequences[0], str):
        sequences = list(map(str.split, sequences))

    max_len = min(max(map(len, sequences)), max_len or float('inf'))
    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))

    for i,seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    if word_dropout != 0:
        matrix = apply_word_dropout(matrix, 1 - word_dropout, replace_with=UNK_IX, pad_ix=PAD_IX)

    return matrix

def apply_word_dropout(matrix, keep_prop, replace_with, pad_ix):
    dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1-keep_prop])
    dropout_mask &= matrix != pad_ix
    return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)])

### Split data into training and validation split

In [16]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(train, test_size=0.2, random_state=42)
data_train.index     = range(len(data_train))
data_val.index       = range(len(data_val))

print("Train size = ", len(data_train))
print("Validation size = ", len(data_val))

Train size =  127656
Validation size =  31915


### Data loader

In [17]:
def iterate_batches(matrix, labels, batch_size, predict_mode='train'):
    indices = np.arange(len(matrix))
    if predict_mode == 'train':
        np.random.shuffle(indices)
    
    for start in range(0, len(matrix), batch_size):
        end = min(start + batch_size, len(matrix))
        
        batch_indices = indices[start: end]
        X = matrix[batch_indices]
        
        if predict_mode != 'train': yield X
        else: yield X, labels[batch_indices]

### Model Definition

In [18]:
# class Net(nn.Module):
#     def __init__(self, vocab_size, hidden_dim, PAD_IX):
#         super(Net, self).__init__()
        
#         # num feature maps: It is referred to as the number of output channels
#         # to be produced by the output kernel.
#         self.num_feature_maps = 3 
        
#         # kernel_size = refers to the height of the kernel
#         # by default it the width covers the entire dimension of the word in question
#         # height refers to the number of words are convolving over.
#         # for example if k = 2, it means we are considering bigrams
#         # and k = 3 would indicate that we are moving over trigrams and so on.
        
#         self.kernel_size = 3
        
#         self.embedding = nn.Embedding(vocab_size, hidden_dim, padding_idx=PAD_IX)
#         self.conv1     = nn.Conv1d(hidden_dim, self.num_feature_maps, self.kernel_size)
#         self.relu      = nn.ReLU()
#         self.fc        = nn.Linear(3, 6)
        
#     def forward(self, x):
#         out = self.embedding(x)
        
#         # shape of the output after passing it through embedding layer is
#         # (batch, seq len, channels)
#         # channels here refer to the hidden dim
#         # but we wan't to convert it into (batch, channels, seq len)
#         # so as to make sure our convolution operation works correctly.
        
#         out = torch.transpose(out, 1, 2)
#         out = self.conv1(out)
#         out = self.relu(out)
        
#         # before taking max let's transpose dimension 1 with 2
#         out = torch.transpose(out, 1, 2)
        
#         # take max pooling over time
#         out = out.max(dim=1)[0]
        
#         # pass it to fully connected layer
#         out = self.fc(out)
        
#         return out

In [26]:
class Net2(nn.Module):
    def __init__(self, weights, vocab_size, hidden_dim, PAD_IX):
        super(Net2, self).__init__()
        
        # num feature maps: It is referred to as the number of output channels
        # to be produced by the output kernel.
        self.nfms = [100, 100]
        
        # kernel_size = refers to the height of the kernel
        # by default it the width covers the entire dimension of the word in question
        # height refers to the number of words are convolving over.
        # for example if k = 2, it means we are considering bigrams
        # and k = 3 would indicate that we are moving over trigrams and so on.
        
        self.ks = [4, 5]
        
        self.embedding        = nn.Embedding(vocab_size, hidden_dim)
        self.embedding.weight = nn.Parameter(weights)
        self.embedding.weight.requires_grad = False
        
        self.conv1     = nn.Conv1d(hidden_dim, self.nfms[0], self.ks[0])
        self.conv2     = nn.Conv1d(hidden_dim, self.nfms[1], self.ks[1])
#         self.conv3     = nn.Conv1d(hidden_dim, self.nfms[2], self.ks[2])
        
        
        self.relu      = nn.ReLU()
        self.fc        = nn.Linear(self.nfms[0] + self.nfms[1], 
                                   6)
        self.dropout   = nn.Dropout(0.1)
        
    def forward(self, x):
        out = self.embedding(x)
        
        # shape of the output after passing it through embedding layer is
        # (batch, seq len, channels)
        # channels here refer to the hidden dim
        # but we wan't to convert it into (batch, channels, seq len)
        # so as to make sure our convolution operation works correctly.
        
        out = torch.transpose(out, 1, 2)
        
        out1 = self.conv1(out)
        out1 = self.relu(out1)
        
        out2 = self.conv2(out)
        out2 = self.relu(out2)
        
#         out3 = self.conv2(out)
#         out3 = self.relu(out3)
        
        # before taking max let's transpose dimension 1 with 2
        out1 = torch.transpose(out1, 1, 2)
        out2 = torch.transpose(out2, 1, 2)
#         out3 = torch.transpose(out3, 1, 2)
        
        
        # take max pooling over time
        out1 = out1.max(dim=1)[0]
        out2 = out2.max(dim=1)[0]
#         out3 = out3.max(dim=1)[0]
        
        # concatenate outputs from multiple convolutional layers
        out = torch.cat((out1, out2), dim=1)
        
        # pass it through dropout layer as well
        out = self.dropout(out)
        
        # pass it to fully connected layer
        out = self.fc(out)
        
        return out

### Training Loop

In [20]:
def do_epoch(model, criterion, data, batch_size, optimizer=None):
    epoch_loss, total_size = 0, 0
    per_label_preds = [[], [], [], [], [], []]
    per_label_true  = [[], [], [], [], [], []]
    
    is_train = not optimizer is None
    model.train(is_train)
    
    data, labels = data
    batchs_count = math.ceil(data.shape[0] / batch_size)
    
    with torch.autograd.set_grad_enabled(is_train):
        for i, (X_batch, y_batch) in enumerate(iterate_batches(data, labels, batch_size)):
            X_batch, y_batch = torch.cuda.LongTensor(X_batch), torch.cuda.FloatTensor(y_batch)

            logits = model(X_batch)
            loss   = criterion(logits, y_batch)
            
            if is_train:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            
            # convert true target
            batch_target = y_batch.cpu().detach().numpy()
            logits_cpu   = logits.cpu().detach().numpy()
            
            # per_label_preds
            for j in range(6):
                label_preds     = logits_cpu[:, j]
                per_label_preds[j].extend(label_preds)
                per_label_true[j].extend(batch_target[:, j])
                            
            # calculate log loss
            epoch_loss += loss.item()

            print('\r[{} / {}]: Loss = {:.4f}'.format(
                  i, batchs_count, loss.item(), end=''))
    
    label_auc = []
    
    for i in range(6):
        label_auc.append(roc_auc_score(per_label_true[i], per_label_preds[i]))
    
    return epoch_loss / batchs_count, np.mean(label_auc)

def fit(model, criterion, optimizer, train_data, epochs_count=1, 
        batch_size=32, val_data=None, val_batch_size=None):
    if not val_data is None and val_batch_size is None:
        val_batch_size = batch_size
        
    for epoch in range(epochs_count):
        start_time = time.time()
        train_loss, train_auc = do_epoch(
            model, criterion, train_data, batch_size, optimizer
        )
        
        output_info = '\rEpoch {} / {}, Epoch Time = {:.2f}s: Train Loss = {:.4f}, Train AUC = {:.4f}'
        if not val_data is None:
            val_loss, val_auc   = do_epoch(model, criterion, val_data, val_batch_size, None)
            
            epoch_time   = time.time() - start_time
            output_info += ', Val Loss = {:.4f}, Val AUC = {:.4f}'
            print(output_info.format(epoch+1, epochs_count, epoch_time, 
                                     train_loss,
                                     train_auc,
                                     val_loss,
                                     val_auc
                                    ))
        else:
            epoch_time = time.time() - start_time
            print(output_info.format(epoch+1, epochs_count, epoch_time, train_loss, train_auc))

### Run on a single batch

In [21]:
# matrix = as_matrix(data_train['tokenized_comments'], 
#                    token_to_id, 
#                    word_dropout=0, 
#                    UNK_IX=UNK_IX, 
#                    PAD_IX=PAD_IX
#                   )

# labels = data_train.loc[:, TARGET_COLS].values
# X, y   = next(iterate_batches(matrix, labels, batch_size=2))

In [22]:
# X = torch.cuda.LongTensor(X)
# y = torch.cuda.LongTensor(y)

In [23]:
# vocab_size = len(token_to_id)
# hidden_dim = 300

# model = Net2(torch.FloatTensor(embedding_matrix), 
#              vocab_size, 
#              hidden_dim, 
#              PAD_IX).cuda()

In [31]:
# logits = model(X)
# print(logits)

tensor([[ 0.5611,  0.0108, -0.4054, -0.6412, -0.1093,  0.0689],
        [ 0.5287,  0.0313, -0.4272, -0.6379, -0.0615,  0.0176]],
       device='cuda:0', grad_fn=<AddmmBackward>)


### Run on full batch

In [27]:
vocab_size = len(token_to_id)
hidden_dim = 300

model        = Net2(torch.FloatTensor(embedding_matrix), 
                    vocab_size, 
                    hidden_dim, 
                    PAD_IX
                    ).cuda()

criterion    = nn.BCEWithLogitsLoss().cuda()
optimizer    = optim.Adam([param for param in model.parameters() if param.requires_grad], lr=0.001)

X_train      = as_matrix(data_train['tokenized_comments'], 
                         token_to_id, 
                         word_dropout=0.01, 
                         UNK_IX=UNK_IX, 
                         PAD_IX=PAD_IX,
                         max_len=MAX_LEN
                        )

train_labels = data_train.loc[:, TARGET_COLS].values 

X_test       = as_matrix(data_val['tokenized_comments'],
                         token_to_id, 
                         word_dropout=0.01, 
                         UNK_IX=UNK_IX, 
                         PAD_IX=PAD_IX,
                         max_len=MAX_LEN
                        )

test_labels  = data_val.loc[:, TARGET_COLS].values

fit(model, criterion, optimizer, train_data=(X_train, train_labels), epochs_count=5, 
    batch_size=512, val_data=(X_test, test_labels), val_batch_size=1024)

[0 / 250]: Loss = 0.7095
[1 / 250]: Loss = 0.6211
[2 / 250]: Loss = 0.5331
[3 / 250]: Loss = 0.4665
[4 / 250]: Loss = 0.3823
[5 / 250]: Loss = 0.3307
[6 / 250]: Loss = 0.2959
[7 / 250]: Loss = 0.2467
[8 / 250]: Loss = 0.2100
[9 / 250]: Loss = 0.1956
[10 / 250]: Loss = 0.1514
[11 / 250]: Loss = 0.1775
[12 / 250]: Loss = 0.1930
[13 / 250]: Loss = 0.1665
[14 / 250]: Loss = 0.1472
[15 / 250]: Loss = 0.1498
[16 / 250]: Loss = 0.1461
[17 / 250]: Loss = 0.1749
[18 / 250]: Loss = 0.1723
[19 / 250]: Loss = 0.1508
[20 / 250]: Loss = 0.2100
[21 / 250]: Loss = 0.1978
[22 / 250]: Loss = 0.1897
[23 / 250]: Loss = 0.1356
[24 / 250]: Loss = 0.1435
[25 / 250]: Loss = 0.1727
[26 / 250]: Loss = 0.1372
[27 / 250]: Loss = 0.1774
[28 / 250]: Loss = 0.1718
[29 / 250]: Loss = 0.1323
[30 / 250]: Loss = 0.1235
[31 / 250]: Loss = 0.1527
[32 / 250]: Loss = 0.1413
[33 / 250]: Loss = 0.1606
[34 / 250]: Loss = 0.1312
[35 / 250]: Loss = 0.1230
[36 / 250]: Loss = 0.1231
[37 / 250]: Loss = 0.1394
[38 / 250]: Loss = 0.1

```
Num feature maps : 3
kernel size      : 3

Epoch 5 / 5, Epoch Time = 5.61s: Train Loss = 0.1502, Train AUC = 0.6088, Val Loss = 0.1462, Val AUC = 0.5961

Num feature maps : 100
kernel size      : 3

Epoch 5 / 5, Epoch Time = 10.27s: Train Loss = 0.0633, Train AUC = 0.9024, Val Loss = 0.0653, Val AUC = 0.9299

Num feature maps : 100
kernel size      : 5

Epoch 5 / 5, Epoch Time = 11.90s: Train Loss = 0.0628, Train AUC = 0.9572, Val Loss = 0.0686, Val AUC = 0.9571

Num feature maps : 100
kernel size      : 7

Epoch 5 / 5, Epoch Time = 13.89s: Train Loss = 0.0647, Train AUC = 0.9273, Val Loss = 0.0676, Val AUC = 0.9603


Num feature maps: [100, 100]
kernel sizes    : [5, 5]

Epoch 5 / 5, Epoch Time = 22.96s: Train Loss = 0.0577, Train AUC = 0.9596, Val Loss = 0.0619, Val AUC = 0.9607

Num feature maps: [100, 100]
kernel sizes    : [3, 3]

Epoch 5 / 5, Epoch Time = 19.88s: Train Loss = 0.0591, Train AUC = 0.9488, Val Loss = 0.0591, Val AUC = 0.9558

( Full Dataset )
Num feature maps : [100, 100]
kernel size      : [5, 6]
word dropout     : 0.01
dropout          : 0.1
max len          : 100

Epoch 5 / 5, Epoch Time = 9.96s: Train Loss = 0.0478, Train AUC = 0.9812, Val Loss = 0.0513, Val AUC = 0.9794

Num feature maps : [100, 100]
kernel size      : [5, 6]
word dropout     : 0.01
dropout          : 0.1
max len          : 100
pad_ix ( not considered in embedding layer )

Epoch 5 / 5, Epoch Time = 9.85s: Train Loss = 0.0478, Train AUC = 0.9803, Val Loss = 0.0502, Val AUC = 0.9793

Num feature maps : [100, 100]
kernel size      : [4, 5]
word dropout     : 0.01
dropout          : 0.1
max len          : 100
pad_ix ( not considered in embedding layer )

Epoch 5 / 5, Epoch Time = 9.18s: Train Loss = 0.0467, Train AUC = 0.9819, Val Loss = 0.0492, Val AUC = 0.9815
```