In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import gc
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score
import gensim.models.keyedvectors as word2vec
from nltk.tokenize import WordPunctTokenizer
from collections import Counter

SEED = 41
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [2]:
RAW_DATA_PATH        = '../data/raw/'
PROCESSED_DATA_PATH  = '../data/processed/' 

MAX_LEN = 100

### Load Data

In [3]:
def load_sample():
    return pd.read_csv(os.path.join(PROCESSED_DATA_PATH, 'train_sample.csv'))

def load_full():
    train       = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train.csv'))
    test        = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test.csv'))
    test_labels = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_labels.csv'))
    
    return train, test, test_labels

In [4]:
%%time
train, _, _ = load_full()

CPU times: user 1.52 s, sys: 236 ms, total: 1.75 s
Wall time: 2.71 s


In [5]:
# %%time
# train = load_sample()

CPU times: user 132 ms, sys: 40 ms, total: 172 ms
Wall time: 487 ms


#### Define target columns

In [5]:
TARGET_COLS = ['toxic', 
               'severe_toxic', 
               'obscene', 
               'threat', 
               'insult', 
               'identity_hate'
              ]

#### Tokenization

In [6]:
# define tokenizer
tokenizer = WordPunctTokenizer()

In [7]:
%%time
train_tokenized_comments = list(map(tokenizer.tokenize, train.comment_text))

CPU times: user 5.25 s, sys: 328 ms, total: 5.58 s
Wall time: 5.58 s


In [8]:
%%time
train.loc[:, 'tokenized_comments'] = list(map(' '.join, map(tokenizer.tokenize, train.comment_text)))

CPU times: user 5.99 s, sys: 3.92 s, total: 9.91 s
Wall time: 9.9 s


### Create vocabulary

#### Create word freq mapping

In [9]:
token_counts = Counter()

for tok_comments in train_tokenized_comments:
    token_counts.update(tok_comments)

In [10]:
# we can put a threshold on the token frequency to reduce the vocabulary
tokens    = {}
min_count = 10

for token, freq in token_counts.items():
    if freq >= min_count:
        tokens[token] = freq
        
print('Size of the vocabulary: {}'.format(len(tokens)))

Size of the vocabulary: 32838


### Load Pretrained Embeddings

In [11]:
def load_wv_embedding_matrix(words):
    word2vec_dict = word2vec.KeyedVectors.load_word2vec_format(os.path.join(PROCESSED_DATA_PATH, 'word2vec.bin.gz'), binary=True)
    embed_size    = 300

    embedding_index = dict()
    for word in word2vec_dict.wv.vocab:
        embedding_index[word] = word2vec_dict.word_vec(word)

    print('Loaded %d word vectors'%(len(embedding_index)))

    all_embs          = np.stack(list(embedding_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    UNK, PAD       = 'UNK', 'PAD'
    UNK_IX, PAD_IX = len(words), len(words) + 1

    nb_words = len(words) + 2

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    embed_cnt = 0
    for i, word in enumerate(list(words.keys()) + [UNK, PAD]):
        embedding_vector = embedding_index.get(word)

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            embed_cnt +=1

    print('total embedded ', embed_cnt, ' common words')
    del embedding_index
    gc.collect()

    return embedding_matrix, UNK, PAD, UNK_IX, PAD_IX

### Token to ID mapping

In [12]:
# token to index (manual)
# UNK, PAD       = 'UNK', 'PAD'
# UNK_IX, PAD_IX =  0, 1

# token_to_id = {UNK: UNK_IX,
#                PAD: PAD_IX
#               }

# for token in tokens.keys():
#     token_to_id[token] = len(token_to_id)

In [13]:
%%time

# token to index ( word2vec embeddings )
embedding_matrix, UNK, PAD, UNK_IX, PAD_IX = load_wv_embedding_matrix(tokens)

  


Loaded 3000000 word vectors
total embedded  29714  common words
CPU times: user 2min 39s, sys: 9.04 s, total: 2min 48s
Wall time: 2min 47s


In [14]:
token_to_id      = {word: index for index, word in enumerate(tokens.keys())}
token_to_id[UNK] = UNK_IX
token_to_id[PAD] = PAD_IX

### Pad Sequences and convert map tokens to indices

In [15]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

def as_matrix(sequences, token_to_id, word_dropout, UNK_IX, PAD_IX, max_len=None):
    """ Convert a list of tokens into a matrix with padding """

    if isinstance(sequences[0], str):
        sequences = list(map(str.split, sequences))

    max_len = min(max(map(len, sequences)), max_len or float('inf'))
    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))

    for i,seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    if word_dropout != 0:
        matrix = apply_word_dropout(matrix, 1 - word_dropout, replace_with=UNK_IX, pad_ix=PAD_IX)

    return matrix

def apply_word_dropout(matrix, keep_prop, replace_with, pad_ix):
    dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1-keep_prop])
    dropout_mask &= matrix != pad_ix
    return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)])

### Split data into training and validation split

In [16]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(train, test_size=0.2, random_state=42)
data_train.index     = range(len(data_train))
data_val.index       = range(len(data_val))

print("Train size = ", len(data_train))
print("Validation size = ", len(data_val))

Train size =  127656
Validation size =  31915


### Data loader

In [17]:
def iterate_batches(matrix, labels, batch_size, predict_mode='train'):
    indices = np.arange(len(matrix))
    if predict_mode == 'train':
        np.random.shuffle(indices)
    
    for start in range(0, len(matrix), batch_size):
        end = min(start + batch_size, len(matrix))
        
        batch_indices = indices[start: end]
        X = matrix[batch_indices]
        
        if predict_mode != 'train': yield X
        else: yield X, labels[batch_indices]

### Model Definition

In [18]:
class RCNN(nn.Module):
    def __init__(self, weights, vocab_size, embed_size, hidden_size, num_classes):
        super(RCNN, self).__init__()
        
        self.vocab_size  = vocab_size
        self.embed_size  = embed_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        
        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        self.embedding.weight = nn.Parameter(weights)
        
        self.Wl  = nn.Parameter(data=torch.Tensor(self.hidden_size, self.hidden_size), requires_grad=True)
        self.Wsl = nn.Parameter(data=torch.Tensor(self.hidden_size, self.embed_size), requires_grad=True) 
        
        self.Wr  = nn.Parameter(data=torch.Tensor(self.hidden_size, self.hidden_size), requires_grad=True)
        self.Wsr = nn.Parameter(data=torch.Tensor(self.hidden_size, self.embed_size), requires_grad=True) 
        
        
        self.cl  = nn.Parameter(data=torch.Tensor(1, self.hidden_size), requires_grad=True)
        self.cr  = nn.Parameter(data=torch.Tensor(1, self.hidden_size), requires_grad=True)
        
        self.relu = nn.ReLU() 
        self.fc   = nn.Linear(self.hidden_size * 2 + self.embed_size, self.num_classes)
        
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.Wl, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.Wsl, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.Wr, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.Wsr, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.cl, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.cr, a=np.sqrt(5))
        
    def forward(self, x):
        
        # create left and right context vectors to be equal to be
        # equal to (batch_size, hidden_size)
        
        cl        = self.cl.repeat(x.size(0), 1)
        cr        = self.cr.repeat(x.size(0), 1)
        
        
        embed     = self.embedding(x)
        cxt       = cl.t()
        
        left_context  = []
        right_context = []
        
        # O(n)
        for i in range(1, x.size(1)):
            cxt         = self.relu(torch.mm(self.Wl, cxt) + torch.mm(self.Wsl, embed[:, i-1, :].t()))
            left_context.append(cxt)
        
        cxt = cr.t()
        
        # O(n)
        for i in range(x.size(1)-2, -1, -1):
            cxt         = self.relu(torch.mm(self.Wr, cxt) + torch.mm(self.Wsr, embed[:, i-1, :].t()))
            right_context.append(cxt)
        
        
        left_context  = torch.cat([cl.t()] + left_context, dim=1)
        left_context  = left_context.view(x.size(0), x.size(1), -1)
        
        right_context = torch.cat(right_context + [cr.t()], dim=1).t()
        right_context = right_context.view(x.size(0), x.size(1), -1)
        
        # word representation
        word_repr = torch.cat((left_context, embed, right_context), dim=2)
        
#         print('WORD REPR ', word_repr.shape)
        
#         out = self.fc1(word_repr)
#         out = self.relu(out)
        
        # text representation
        out = word_repr.max(dim=1)[0]
        
        # final layer
        out = self.fc(out)
        
        return out

### Training Loop

In [19]:
def do_epoch(model, criterion, data, batch_size, optimizer=None):
    epoch_loss, total_size = 0, 0
    per_label_preds = [[], [], [], [], [], []]
    per_label_true  = [[], [], [], [], [], []]
    
    is_train = not optimizer is None
    model.train(is_train)
    
    data, labels = data
    batchs_count = math.ceil(data.shape[0] / batch_size)
    
    with torch.autograd.set_grad_enabled(is_train):
        for i, (X_batch, y_batch) in enumerate(iterate_batches(data, labels, batch_size)):
            X_batch, y_batch = torch.cuda.LongTensor(X_batch), torch.cuda.FloatTensor(y_batch)

            logits = model(X_batch)
            loss   = criterion(logits, y_batch)
            
            if is_train:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            
            # convert true target
            batch_target = y_batch.cpu().detach().numpy()
            logits_cpu   = logits.cpu().detach().numpy()
            
            # per_label_preds
            for j in range(6):
                label_preds     = logits_cpu[:, j]
                per_label_preds[j].extend(label_preds)
                per_label_true[j].extend(batch_target[:, j])
                            
            # calculate log loss
            epoch_loss += loss.item()

            print('\r[{} / {}]: Loss = {:.4f}'.format(
                  i, batchs_count, loss.item(), end=''))
    
    label_auc = []
    
    for i in range(6):
        label_auc.append(roc_auc_score(per_label_true[i], per_label_preds[i]))
    
    return epoch_loss / batchs_count, np.mean(label_auc)

def fit(model, criterion, optimizer, train_data, epochs_count=1, 
        batch_size=32, val_data=None, val_batch_size=None):
    if not val_data is None and val_batch_size is None:
        val_batch_size = batch_size
        
    for epoch in range(epochs_count):
        start_time = time.time()
        train_loss, train_auc = do_epoch(
            model, criterion, train_data, batch_size, optimizer
        )
        
        output_info = '\rEpoch {} / {}, Epoch Time = {:.2f}s: Train Loss = {:.4f}, Train AUC = {:.4f}'
        if not val_data is None:
            val_loss, val_auc   = do_epoch(model, criterion, val_data, val_batch_size, None)
            
            epoch_time   = time.time() - start_time
            output_info += ', Val Loss = {:.4f}, Val AUC = {:.4f}'
            print(output_info.format(epoch+1, epochs_count, epoch_time, 
                                     train_loss,
                                     train_auc,
                                     val_loss,
                                     val_auc
                                    ))
        else:
            epoch_time = time.time() - start_time
            print(output_info.format(epoch+1, epochs_count, epoch_time, train_loss, train_auc))

### Run on a single batch

In [21]:
matrix = as_matrix(data_train['tokenized_comments'], 
                   token_to_id, 
                   word_dropout=0, 
                   UNK_IX=UNK_IX, 
                   PAD_IX=PAD_IX,
                   max_len=MAX_LEN
                  )

labels = data_train.loc[:, TARGET_COLS].values
X, y   = next(iterate_batches(matrix, labels, batch_size=2))

In [22]:
X = torch.cuda.LongTensor(X)
y = torch.cuda.LongTensor(y)

In [36]:
vocab_size  = len(token_to_id)
embed_size  = 300
hidden_size = 2
num_classes = 6

model =  RCNN(torch.FloatTensor(embedding_matrix),
              vocab_size, 
              embed_size,
              hidden_size,
              num_classes,
              batch_size=2
              ).cuda()

In [37]:
logits = model(X)
print(logits)

tensor([[ 0.1889,  0.0636,  0.0409,  0.1613, -0.0313, -0.0770],
        [ 0.1457,  0.0622,  0.0820,  0.1540, -0.0168, -0.1331]],
       device='cuda:0', grad_fn=<AddmmBackward>)


### Run on full batch

In [23]:
vocab_size  = len(token_to_id)
embed_size  = 300
hidden_size = 32
num_classes = 6


model        = RCNN(torch.FloatTensor(embedding_matrix),
                  vocab_size, 
                  embed_size,
                  hidden_size,
                  num_classes
                  ).cuda()

criterion    = nn.BCEWithLogitsLoss().cuda()
optimizer    = optim.Adam([param for param in model.parameters() if param.requires_grad], lr=0.001)

X_train      = as_matrix(data_train['tokenized_comments'], 
                         token_to_id, 
                         word_dropout=0.000, 
                         UNK_IX=UNK_IX, 
                         PAD_IX=PAD_IX,
                         max_len=MAX_LEN
                        )

train_labels = data_train.loc[:, TARGET_COLS].values 

X_test       = as_matrix(data_val['tokenized_comments'],
                         token_to_id, 
                         word_dropout=0.000, 
                         UNK_IX=UNK_IX, 
                         PAD_IX=PAD_IX,
                         max_len=MAX_LEN
                        )

test_labels  = data_val.loc[:, TARGET_COLS].values

fit(model, criterion, optimizer, train_data=(X_train, train_labels), epochs_count=10, 
    batch_size=512, val_data=(X_test, test_labels), val_batch_size=1024)

[0 / 250]: Loss = 0.6359
[1 / 250]: Loss = 0.5850
[2 / 250]: Loss = 0.5344
[3 / 250]: Loss = 0.4948
[4 / 250]: Loss = 0.4465
[5 / 250]: Loss = 0.4094
[6 / 250]: Loss = 0.3817
[7 / 250]: Loss = 0.3427
[8 / 250]: Loss = 0.3005
[9 / 250]: Loss = 0.2863
[10 / 250]: Loss = 0.2658
[11 / 250]: Loss = 0.2426
[12 / 250]: Loss = 0.2279
[13 / 250]: Loss = 0.2014
[14 / 250]: Loss = 0.1907
[15 / 250]: Loss = 0.1702
[16 / 250]: Loss = 0.1697
[17 / 250]: Loss = 0.1988
[18 / 250]: Loss = 0.1495
[19 / 250]: Loss = 0.1577
[20 / 250]: Loss = 0.1743
[21 / 250]: Loss = 0.1117
[22 / 250]: Loss = 0.1423
[23 / 250]: Loss = 0.1430
[24 / 250]: Loss = 0.1455
[25 / 250]: Loss = 0.1314
[26 / 250]: Loss = 0.1390
[27 / 250]: Loss = 0.1486
[28 / 250]: Loss = 0.1444
[29 / 250]: Loss = 0.1695
[30 / 250]: Loss = 0.1426
[31 / 250]: Loss = 0.1579
[32 / 250]: Loss = 0.1402
[33 / 250]: Loss = 0.1616
[34 / 250]: Loss = 0.1551
[35 / 250]: Loss = 0.1378
[36 / 250]: Loss = 0.1475
[37 / 250]: Loss = 0.1342
[38 / 250]: Loss = 0.1

In [28]:
# Epoch 2 / 2, Epoch Time = 167.66s: Train Loss = 0.0427, Train AUC = 0.9793, Val Loss = 0.0483, Val AUC = 0.9781

### Playground

In [16]:
class RCNN(nn.Module):
    def __init__(self, weights, vocab_size, embed_size, hidden_size, num_classes):
        super(RCNN, self).__init__()
        
        self.vocab_size  = vocab_size
        self.embed_size  = embed_size
        self.hidden_size = hidden_size
        self.num_classes = num_classes
        
        self.embedding = nn.Embedding(self.vocab_size, self.embed_size)
        self.embedding.weight = nn.Parameter(weights)
        
        self.Wl  = nn.Parameter(data=torch.Tensor(self.hidden_size, self.hidden_size), requires_grad=True)
        self.Wsl = nn.Parameter(data=torch.Tensor(self.hidden_size, self.embed_size), requires_grad=True) 
        
        self.Wr  = nn.Parameter(data=torch.Tensor(self.hidden_size, self.hidden_size), requires_grad=True)
        self.Wsr = nn.Parameter(data=torch.Tensor(self.hidden_size, self.embed_size), requires_grad=True) 
        
        
        self.cl  = nn.Parameter(data=torch.Tensor(1, self.hidden_size), requires_grad=True)
        self.cr  = nn.Parameter(data=torch.Tensor(1, self.hidden_size), requires_grad=True)
        
        self.relu = nn.ReLU() 
        self.fc   = nn.Linear(self.hidden_size * 2 + self.embed_size, self.num_classes)
        
        self.reset_parameters()

    def reset_parameters(self):
        nn.init.kaiming_uniform_(self.Wl, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.Wsl, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.Wr, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.Wsr, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.cl, a=np.sqrt(5))
        nn.init.kaiming_uniform_(self.cr, a=np.sqrt(5))
        
    def forward(self, x):
        
        # create left and right context vectors to be equal to be
        # equal to (batch_size, hidden_size)
        
        cl        = self.cl.repeat(x.size(0), 1)
        cr        = self.cr.repeat(x.size(0), 1)
        
        
        embed     = self.embedding(x)
        cxt       = cl.t()
        
        left_context  = []
        right_context = []
        
        # O(n)
        for i in range(1, x.size(1)):
            cxt         = torch.mm(self.Wl, cxt) + torch.mm(self.Wsl, embed[:, i-1, :].t())
            left_context.append(cxt)
        
        cxt = cr.t()
        
        # O(n)
        for i in range(x.size(1)-2, -1, -1):
            cxt         = torch.mm(self.Wr, cxt) + torch.mm(self.Wsr, embed[:, i-1, :].t())
            right_context.append(cxt)
        
        
        left_context  = torch.cat([cl.t()] + left_context, dim=1)
        left_context  = left_context.view(x.size(0), x.size(1), -1)
        
        right_context = torch.cat(right_context + [cr.t()], dim=1).t()
        right_context = right_context.view(x.size(0), x.size(1), -1)
        
        # word representation
        word_repr = torch.cat((left_context, embed, right_context), dim=2)
        
#         print('WORD REPR ', word_repr.shape)
        
#         out = self.fc1(word_repr)
#         out = self.relu(out)
        
        # text representation
        out = word_repr.max(dim=1)[0]
        
        # final layer
        out = self.fc(out)
        
        return out

In [17]:
vocab_size  = 3
hidden_size = 2
embed_size  = 2
num_classes = 2

X = torch.LongTensor([[0, 1],
                      [1, 0],
                      [0, 0]
                     ])

model = RCNN(torch.Tensor(vocab_size, embed_size),
             vocab_size,
             embed_size,
             hidden_size,
             num_classes
            )
logits = model(X)

In [18]:
logits

tensor([[-0.0631, -0.4866],
        [-0.1683, -0.3037],
        [-0.1000, -0.3597]], grad_fn=<AddmmBackward>)

In [4]:
class Temp(nn.Module):
    def __init__(self):
        super(Temp, self).__init__()
        self.cr = nn.Parameter(data=torch.Tensor(1, 3), requires_grad=True)     
        
    def forward(self, x):
        cr = self.cr.repeat(x.size(0), 1)
        
        return cr

In [6]:
model  = Temp()
logits = model(torch.Tensor(3, 1, 2))

In [7]:
for params in model.parameters():
    print(params)

Parameter containing:
tensor([[-1.2047e-37,  4.5673e-41,  5.4213e-13]], requires_grad=True)


In [9]:
logits

tensor([[-1.2047e-37,  4.5673e-41,  5.4213e-13],
        [-1.2047e-37,  4.5673e-41,  5.4213e-13],
        [-1.2047e-37,  4.5673e-41,  5.4213e-13]], grad_fn=<RepeatBackward>)