In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import gc
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score
import gensim.models.keyedvectors as word2vec
from nltk.tokenize import WordPunctTokenizer
from collections import Counter

SEED = 41
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [2]:
RAW_DATA_PATH        = '../data/raw/'
PROCESSED_DATA_PATH  = '../data/processed/' 

MAX_LEN = 500

### Load Data

In [3]:
def load_sample():
    return pd.read_csv(os.path.join(PROCESSED_DATA_PATH, 'train_sample.csv'))

def load_full():
    train       = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train.csv'))
    test        = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test.csv'))
    test_labels = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_labels.csv'))
    
    return train, test, test_labels

In [4]:
%%time
train, _, _ = load_full()

CPU times: user 1.54 s, sys: 164 ms, total: 1.7 s
Wall time: 1.7 s


In [5]:
# %%time
# train = load_sample()

#### Define target columns

In [6]:
TARGET_COLS = ['toxic', 
               'severe_toxic', 
               'obscene', 
               'threat', 
               'insult', 
               'identity_hate'
              ]

#### Tokenization

In [7]:
# define tokenizer
tokenizer = WordPunctTokenizer()

In [8]:
%%time
train_tokenized_comments = list(map(tokenizer.tokenize, train.comment_text))

CPU times: user 5.53 s, sys: 296 ms, total: 5.82 s
Wall time: 5.82 s


In [9]:
%%time
train.loc[:, 'tokenized_comments'] = list(map(' '.join, map(tokenizer.tokenize, train.comment_text)))

CPU times: user 5.96 s, sys: 3.89 s, total: 9.85 s
Wall time: 9.85 s


### Create vocabulary

#### Create word freq mapping

In [10]:
token_counts = Counter()

for tok_comments in train_tokenized_comments:
    token_counts.update(tok_comments)

In [11]:
# we can put a threshold on the token frequency to reduce the vocabulary
tokens    = {}
min_count = 10

for token, freq in token_counts.items():
    if freq >= min_count:
        tokens[token] = freq
        
print('Size of the vocabulary: {}'.format(len(tokens)))

Size of the vocabulary: 32838


### Load Pretrained Embeddings

In [12]:
def load_wv_embedding_matrix(words):
    word2vec_dict = word2vec.KeyedVectors.load_word2vec_format(os.path.join(PROCESSED_DATA_PATH, 'word2vec.bin.gz'), binary=True)
    embed_size    = 300

    embedding_index = dict()
    for word in word2vec_dict.wv.vocab:
        embedding_index[word] = word2vec_dict.word_vec(word)

    print('Loaded %d word vectors'%(len(embedding_index)))

    all_embs          = np.stack(list(embedding_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    UNK, PAD       = 'UNK', 'PAD'
    UNK_IX, PAD_IX = len(words), len(words) + 1

    nb_words = len(words) + 2

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    embed_cnt = 0
    for i, word in enumerate(list(words.keys()) + [UNK, PAD]):
        embedding_vector = embedding_index.get(word)

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            embed_cnt +=1

    print('total embedded ', embed_cnt, ' common words')
    del embedding_index
    gc.collect()

    return embedding_matrix, UNK, PAD, UNK_IX, PAD_IX


def load_fake_embedding_matrix(words):
    UNK, PAD       = 'UNK', 'PAD'
    UNK_IX, PAD_IX = len(words), len(words) + 1

    nb_words = len(words) + 2
    emb_mean = .2
    embed_size = 300
    emb_std  = 2.3

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    return embedding_matrix, UNK, PAD, UNK_IX, PAD_IX

### Token to ID mapping

In [13]:
# token to index (manual)
# UNK, PAD       = 'UNK', 'PAD'
# UNK_IX, PAD_IX =  0, 1

# token_to_id = {UNK: UNK_IX,
#                PAD: PAD_IX
#               }

# for token in tokens.keys():
#     token_to_id[token] = len(token_to_id)

In [14]:
%%time

# token to index ( word2vec embeddings )
embedding_matrix, UNK, PAD, UNK_IX, PAD_IX = load_wv_embedding_matrix(tokens)
# embedding_matrix, UNK, PAD, UNK_IX, PAD_IX = load_fake_embedding_matrix(tokens)

  


Loaded 3000000 word vectors
total embedded  29714  common words
CPU times: user 2min 37s, sys: 8.49 s, total: 2min 45s
Wall time: 2min 44s


In [15]:
token_to_id      = {word: index for index, word in enumerate(tokens.keys())}
token_to_id[UNK] = UNK_IX
token_to_id[PAD] = PAD_IX

### Pad Sequences and convert map tokens to indices

In [16]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

def as_matrix(sequences, token_to_id, word_dropout, UNK_IX, PAD_IX, max_len=None):
    """ Convert a list of tokens into a matrix with padding """

    if isinstance(sequences[0], str):
        sequences = list(map(str.split, sequences))

    max_len = min(max(map(len, sequences)), max_len or float('inf'))
    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))

    for i,seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    if word_dropout != 0:
        matrix = apply_word_dropout(matrix, 1 - word_dropout, replace_with=UNK_IX, pad_ix=PAD_IX)

    return matrix

def apply_word_dropout(matrix, keep_prop, replace_with, pad_ix):
    dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1-keep_prop])
    dropout_mask &= matrix != pad_ix
    return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)])

### Split data into training and validation split

In [17]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(train, test_size=0.2, random_state=42)
data_train.index     = range(len(data_train))
data_val.index       = range(len(data_val))

print("Train size = ", len(data_train))
print("Validation size = ", len(data_val))

Train size =  127656
Validation size =  31915


### Data loader

In [18]:
def iterate_batches(matrix, labels, batch_size, predict_mode='train'):
    indices = np.arange(len(matrix))
    if predict_mode == 'train':
        np.random.shuffle(indices)
    
    for start in range(0, len(matrix), batch_size):
        end = min(start + batch_size, len(matrix))
        
        batch_indices = indices[start: end]
        X = matrix[batch_indices]
        
        if predict_mode != 'train': yield X
        else: yield X, labels[batch_indices]

### Model Definition

In [19]:
# class RCNN(nn.Module):
#     def __init__(self, weights, vocab_size, embed_size, hidden_size, num_classes):
#         super(RCNN, self).__init__()
        
#         self.vocab_size  = vocab_size
#         self.embed_size  = embed_size
#         self.hidden_size = hidden_size
#         self.num_classes = num_classes
        
#         self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
#         self.embedding.weight = nn.Parameter(weights)
        
#         self.Wl  = nn.Parameter(data=torch.Tensor(self.hidden_size, self.hidden_size), requires_grad=True)
#         self.Wsl = nn.Parameter(data=torch.Tensor(self.hidden_size, self.embed_size), requires_grad=True) 
        
#         self.Wr  = nn.Parameter(data=torch.Tensor(self.hidden_size, self.hidden_size), requires_grad=True)
#         self.Wsr = nn.Parameter(data=torch.Tensor(self.hidden_size, self.embed_size), requires_grad=True) 
        
        
#         self.cl  = nn.Parameter(data=torch.Tensor(1, self.hidden_size), requires_grad=True)
#         self.cr  = nn.Parameter(data=torch.Tensor(1, self.hidden_size), requires_grad=True)
        
#         self.relu = nn.ReLU() 
#         self.fc   = nn.Linear(self.hidden_size * 2 + self.embed_size, self.num_classes)
        
#         self.reset_parameters()

#     def reset_parameters(self):
#         nn.init.kaiming_uniform_(self.Wl, a=np.sqrt(5))
#         nn.init.kaiming_uniform_(self.Wsl, a=np.sqrt(5))
#         nn.init.kaiming_uniform_(self.Wr, a=np.sqrt(5))
#         nn.init.kaiming_uniform_(self.Wsr, a=np.sqrt(5))
#         nn.init.kaiming_uniform_(self.cl, a=np.sqrt(5))
#         nn.init.kaiming_uniform_(self.cr, a=np.sqrt(5))
        
#     def forward(self, x):
        
#         # create left and right context vectors to be equal to be
#         # equal to (batch_size, hidden_size)
        
#         cl        = self.cl.repeat(x.size(0), 1)
#         cr        = self.cr.repeat(x.size(0), 1)
        
        
#         embed     = self.embedding(x)
#         cxt       = cl.t()
        
#         left_context  = []
#         right_context = []
        
#         # O(n)
#         for i in range(1, x.size(1)):
#             cxt         = self.relu(torch.mm(self.Wl, cxt) + torch.mm(self.Wsl, embed[:, i-1, :].t()))
#             left_context.append(cxt)
        
#         cxt = cr.t()
        
#         # O(n)
#         for i in range(x.size(1)-2, -1, -1):
#             cxt         = self.relu(torch.mm(self.Wr, cxt) + torch.mm(self.Wsr, embed[:, i-1, :].t()))
#             right_context.append(cxt)
        
        
#         left_context  = torch.cat([cl.t()] + left_context, dim=1)
#         left_context  = left_context.view(x.size(0), x.size(1), -1)
        
#         right_context = torch.cat(right_context + [cr.t()], dim=1).t()
#         right_context = right_context.view(x.size(0), x.size(1), -1)
        
#         # word representation
#         word_repr = torch.cat((left_context, embed, right_context), dim=2)
        
# #         print('WORD REPR ', word_repr.shape)
        
# #         out = self.fc1(word_repr)
# #         out = self.relu(out)
        
#         # text representation
#         out = word_repr.max(dim=1)[0]
        
#         # final layer
#         out = self.fc(out)
        
#         return out

In [20]:
class RCNN2(nn.Module):
    def __init__(self, weights, vocab_size, embed_size, hidden_size, num_classes):
        super(RCNN2, self).__init__()
        
        self.vocab_size  = vocab_size
        self.embed_size  = embed_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        
        # define embedding
        self.embedding        = nn.Embedding(self.vocab_size, self.embed_size)
        self.embedding.weight = nn.Parameter(weights)
        
        # lstm
        self.lstm      = nn.LSTM(self.embed_size, self.hidden_size)
        
        # time-distributed dense
        self.td_dense  = nn.Linear(self.hidden_size * 2 + self.embed_size, 32)
        
        # activation layer
        self.relu      = nn.ReLU()
        self.tanh      = nn.Tanh()
        
        # fully connected layer
        self.fc        = nn.Linear(32, self.num_classes)
        
        # spatial dropout
        self.spatial_dropout = nn.Dropout2d(0.5)
        
        # dropout
        self.dropout = nn.Dropout(0.2)
    
    def forward(self, x):
        """
        Args:
        
        x: Batch of sentences
        """
        
        # embedding
        embed = self.embedding(x)
        
        # fwd seq
        fwd_seq = F.pad(embed, (0, 0, 1, 0, 0, 0))[:, :-1, :]
        
        # pass through lstm layer
        lout, _ = self.lstm(fwd_seq)
        
        # rev seq
        rev_seq = F.pad(embed, (0, 0, 1, 0, 0, 0))[:, 1:, :]
        rev_seq = torch.flip(rev_seq, [1])
        
        rout, _ = self.lstm(rev_seq)
        rout    = torch.flip(rout, [1])
        
        # word representation
        w_repr  = torch.cat((lout, embed, rout), dim=2)
        
        # time distributed dense layer
        out     = self.td_dense(w_repr)
        
        # pass it through relu activation
        out     = self.tanh(out)
        
        # text representation
        t_repr, _ = out.max(dim=1)
        
        t_repr    = self.dropout(t_repr)
        
        out       = self.fc(t_repr)
        
        return out

### Training Loop

In [21]:
def do_epoch(model, criterion, data, batch_size, optimizer=None):
    epoch_loss, total_size = 0, 0
    per_label_preds = [[], [], [], [], [], []]
    per_label_true  = [[], [], [], [], [], []]
    
    is_train = not optimizer is None
    model.train(is_train)
    
    data, labels = data
    batchs_count = math.ceil(data.shape[0] / batch_size)
    
    with torch.autograd.set_grad_enabled(is_train):
        for i, (X_batch, y_batch) in enumerate(iterate_batches(data, labels, batch_size)):
            X_batch, y_batch = torch.cuda.LongTensor(X_batch), torch.cuda.FloatTensor(y_batch)

            logits = model(X_batch)
            loss   = criterion(logits, y_batch)
            
            if is_train:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            
            # convert true target
            batch_target = y_batch.cpu().detach().numpy()
            logits_cpu   = logits.cpu().detach().numpy()
            
            # per_label_preds
            for j in range(6):
                label_preds     = logits_cpu[:, j]
                per_label_preds[j].extend(label_preds)
                per_label_true[j].extend(batch_target[:, j])
                            
            # calculate log loss
            epoch_loss += loss.item()

            print('\r[{} / {}]: Loss = {:.4f}'.format(
                  i, batchs_count, loss.item(), end=''))
    
    label_auc = []
    
    for i in range(6):
        label_auc.append(roc_auc_score(per_label_true[i], per_label_preds[i]))
    
    return epoch_loss / batchs_count, np.mean(label_auc)

def fit(model, criterion, optimizer, train_data, epochs_count=1, 
        batch_size=32, val_data=None, val_batch_size=None):
    if not val_data is None and val_batch_size is None:
        val_batch_size = batch_size
        
    for epoch in range(epochs_count):
        start_time = time.time()
        train_loss, train_auc = do_epoch(
            model, criterion, train_data, batch_size, optimizer
        )
        
        output_info = '\rEpoch {} / {}, Epoch Time = {:.2f}s: Train Loss = {:.4f}, Train AUC = {:.4f}'
        if not val_data is None:
            val_loss, val_auc   = do_epoch(model, criterion, val_data, val_batch_size, None)
            
            epoch_time   = time.time() - start_time
            output_info += ', Val Loss = {:.4f}, Val AUC = {:.4f}'
            print(output_info.format(epoch+1, epochs_count, epoch_time, 
                                     train_loss,
                                     train_auc,
                                     val_loss,
                                     val_auc
                                    ))
        else:
            epoch_time = time.time() - start_time
            print(output_info.format(epoch+1, epochs_count, epoch_time, train_loss, train_auc))

### Run on a single batch

In [39]:
matrix = as_matrix(data_train['tokenized_comments'], 
                   token_to_id, 
                   word_dropout=0, 
                   UNK_IX=UNK_IX, 
                   PAD_IX=PAD_IX,
                   max_len=MAX_LEN
                  )

labels = data_train.loc[:, TARGET_COLS].values
X, y   = next(iterate_batches(matrix, labels, batch_size=2))

In [40]:
X = torch.cuda.LongTensor(X)
y = torch.cuda.LongTensor(y)

In [41]:
vocab_size  = len(token_to_id)
embed_size  = 300
hidden_size = 2
num_classes = 6

model =  RCNN2(torch.FloatTensor(embedding_matrix),
              vocab_size, 
              embed_size,
              hidden_size,
              num_classes
              ).cuda()

In [42]:
logits = model(X)
print(logits)

tensor([[ 0.2126,  0.0702,  0.1236, -0.2934, -0.0940,  0.1928],
        [-0.1989,  0.1868,  0.0017, -0.2311,  0.0167,  0.1820]],
       device='cuda:0', grad_fn=<AddmmBackward>)


### Run on full batch

In [22]:
vocab_size  = len(token_to_id)
embed_size  = 300
hidden_size = 64
num_classes = 6


model        = RCNN2(torch.FloatTensor(embedding_matrix),
                  vocab_size, 
                  embed_size,
                  hidden_size,
                  num_classes
                  ).cuda()

criterion    = nn.BCEWithLogitsLoss().cuda()
optimizer    = optim.Adam([param for param in model.parameters() if param.requires_grad], lr=0.001)

X_train      = as_matrix(data_train['tokenized_comments'], 
                         token_to_id, 
                         word_dropout=0.000, 
                         UNK_IX=UNK_IX, 
                         PAD_IX=PAD_IX,
                         max_len=MAX_LEN
                        )

train_labels = data_train.loc[:, TARGET_COLS].values 

X_test       = as_matrix(data_val['tokenized_comments'],
                         token_to_id, 
                         word_dropout=0.000, 
                         UNK_IX=UNK_IX, 
                         PAD_IX=PAD_IX,
                         max_len=MAX_LEN
                        )

test_labels  = data_val.loc[:, TARGET_COLS].values

fit(model, criterion, optimizer, train_data=(X_train, train_labels), epochs_count=3, 
    batch_size=256, val_data=(X_test, test_labels), val_batch_size=1024)

[0 / 499]: Loss = 0.7376
[1 / 499]: Loss = 0.7134
[2 / 499]: Loss = 0.6913
[3 / 499]: Loss = 0.6705
[4 / 499]: Loss = 0.6481
[5 / 499]: Loss = 0.6241
[6 / 499]: Loss = 0.6030
[7 / 499]: Loss = 0.5758
[8 / 499]: Loss = 0.5467
[9 / 499]: Loss = 0.5248
[10 / 499]: Loss = 0.5057
[11 / 499]: Loss = 0.4825
[12 / 499]: Loss = 0.4602
[13 / 499]: Loss = 0.4432
[14 / 499]: Loss = 0.4254
[15 / 499]: Loss = 0.4103
[16 / 499]: Loss = 0.3904
[17 / 499]: Loss = 0.3749
[18 / 499]: Loss = 0.3535
[19 / 499]: Loss = 0.3448
[20 / 499]: Loss = 0.3129
[21 / 499]: Loss = 0.2963
[22 / 499]: Loss = 0.3116
[23 / 499]: Loss = 0.2956
[24 / 499]: Loss = 0.2598
[25 / 499]: Loss = 0.2636
[26 / 499]: Loss = 0.2788
[27 / 499]: Loss = 0.2663
[28 / 499]: Loss = 0.2672
[29 / 499]: Loss = 0.2557
[30 / 499]: Loss = 0.2564
[31 / 499]: Loss = 0.2567
[32 / 499]: Loss = 0.2441
[33 / 499]: Loss = 0.2418
[34 / 499]: Loss = 0.2317
[35 / 499]: Loss = 0.2213
[36 / 499]: Loss = 0.2153
[37 / 499]: Loss = 0.2368
[38 / 499]: Loss = 0.1

In [28]:
# Epoch 2 / 2, Epoch Time = 167.66s: Train Loss = 0.0427, Train AUC = 0.9793, Val Loss = 0.0483, Val AUC = 0.9781

### Playground

In [26]:
x = torch.LongTensor([[[1, 2, 3, 4],
                      [4, 5, 5, 6]],
                      
                      [[1, 1, 1, 1],
                       [10, 10, 10, 10]
                      ]
                     ])

print(F.pad(x, (0, 0, 0, 1, 0, 0))[:, 1:, :])
print()
print(torch.flip(F.pad(x, (0, 0, 0, 1, 0, 0))[:, 1:, :], [1]))

tensor([[[ 4,  5,  5,  6],
         [ 0,  0,  0,  0]],

        [[10, 10, 10, 10],
         [ 0,  0,  0,  0]]])

tensor([[[ 0,  0,  0,  0],
         [ 4,  5,  5,  6]],

        [[ 0,  0,  0,  0],
         [10, 10, 10, 10]]])


In [None]:
F.fl