In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import gc
import time
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score
import gensim.models.keyedvectors as word2vec
from nltk.tokenize import WordPunctTokenizer
from collections import Counter

SEED = 41
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)

In [2]:
RAW_DATA_PATH        = '../data/raw/'
PROCESSED_DATA_PATH  = '../data/processed/' 

MAX_LEN = 100

### Load Data

In [3]:
def load_sample():
    return pd.read_csv(os.path.join(PROCESSED_DATA_PATH, 'train_sample.csv'))

def load_full():
    train       = pd.read_csv(os.path.join(RAW_DATA_PATH, 'train.csv'))
    test        = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test.csv'))
    test_labels = pd.read_csv(os.path.join(RAW_DATA_PATH, 'test_labels.csv'))
    
    return train, test, test_labels

In [4]:
%%time
train, _, _ = load_full()

CPU times: user 1.47 s, sys: 224 ms, total: 1.69 s
Wall time: 1.69 s


In [6]:
%%time
train = load_sample()

CPU times: user 180 ms, sys: 12 ms, total: 192 ms
Wall time: 187 ms


#### Define target columns

In [5]:
TARGET_COLS = ['toxic', 
               'severe_toxic', 
               'obscene', 
               'threat', 
               'insult', 
               'identity_hate'
              ]

#### Tokenization

In [6]:
# define tokenizer
tokenizer = WordPunctTokenizer()

In [7]:
%%time
train_tokenized_comments = list(map(tokenizer.tokenize, train.comment_text))

CPU times: user 5.31 s, sys: 348 ms, total: 5.66 s
Wall time: 5.66 s


In [8]:
%%time
train.loc[:, 'tokenized_comments'] = list(map(' '.join, map(tokenizer.tokenize, train.comment_text)))

CPU times: user 5.94 s, sys: 3.92 s, total: 9.87 s
Wall time: 9.86 s


### Create vocabulary

#### Create word freq mapping

In [9]:
token_counts = Counter()

for tok_comments in train_tokenized_comments:
    token_counts.update(tok_comments)

In [10]:
# we can put a threshold on the token frequency to reduce the vocabulary
tokens    = {}
min_count = 10

for token, freq in token_counts.items():
    if freq >= min_count:
        tokens[token] = freq
        
print('Size of the vocabulary: {}'.format(len(tokens)))

Size of the vocabulary: 32838


### Load Pretrained Embeddings

In [11]:
def load_wv_embedding_matrix(words):
    word2vec_dict = word2vec.KeyedVectors.load_word2vec_format(os.path.join(PROCESSED_DATA_PATH, 'word2vec.bin.gz'), binary=True)
    embed_size    = 300

    embedding_index = dict()
    for word in word2vec_dict.wv.vocab:
        embedding_index[word] = word2vec_dict.word_vec(word)

    print('Loaded %d word vectors'%(len(embedding_index)))

    all_embs          = np.stack(list(embedding_index.values()))
    emb_mean, emb_std = all_embs.mean(), all_embs.std()

    UNK, PAD       = 'UNK', 'PAD'
    UNK_IX, PAD_IX = len(words), len(words) + 1

    nb_words = len(words) + 2

    embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

    embed_cnt = 0
    for i, word in enumerate(list(words.keys()) + [UNK, PAD]):
        embedding_vector = embedding_index.get(word)

        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
            embed_cnt +=1

    print('total embedded ', embed_cnt, ' common words')
    del embedding_index
    gc.collect()

    return embedding_matrix, UNK, PAD, UNK_IX, PAD_IX

### Token to ID mapping

In [12]:
# token to index (manual)
# UNK, PAD       = 'UNK', 'PAD'
# UNK_IX, PAD_IX =  0, 1

# token_to_id = {UNK: UNK_IX,
#                PAD: PAD_IX
#               }

# for token in tokens.keys():
#     token_to_id[token] = len(token_to_id)

In [13]:
%%time

# token to index ( word2vec embeddings )
embedding_matrix, UNK, PAD, UNK_IX, PAD_IX = load_wv_embedding_matrix(tokens)

  


Loaded 3000000 word vectors
total embedded  29714  common words
CPU times: user 2min 38s, sys: 8.93 s, total: 2min 47s
Wall time: 2min 45s


In [30]:
token_to_id      = {word: index for index, word in enumerate(tokens.keys())}
token_to_id[UNK] = UNK_IX
token_to_id[PAD] = PAD_IX

### Pad Sequences and convert map tokens to indices

In [31]:
UNK_IX, PAD_IX = map(token_to_id.get, [UNK, PAD])

def as_matrix(sequences, token_to_id, word_dropout, UNK_IX, PAD_IX, max_len=None):
    """ Convert a list of tokens into a matrix with padding """

    if isinstance(sequences[0], str):
        sequences = list(map(str.split, sequences))

    max_len = min(max(map(len, sequences)), max_len or float('inf'))
    matrix = np.full((len(sequences), max_len), np.int32(PAD_IX))

    for i,seq in enumerate(sequences):
        row_ix = [token_to_id.get(word, UNK_IX) for word in seq[:max_len]]
        matrix[i, :len(row_ix)] = row_ix

    if word_dropout != 0:
        matrix = apply_word_dropout(matrix, 1 - word_dropout, replace_with=UNK_IX, pad_ix=PAD_IX)

    return matrix

def apply_word_dropout(matrix, keep_prop, replace_with, pad_ix):
    dropout_mask = np.random.choice(2, np.shape(matrix), p=[keep_prop, 1-keep_prop])
    dropout_mask &= matrix != pad_ix
    return np.choose(dropout_mask, [matrix, np.full_like(matrix, replace_with)])

### Split data into training and validation split

In [32]:
from sklearn.model_selection import train_test_split

data_train, data_val = train_test_split(train, test_size=0.2, random_state=42)
data_train.index     = range(len(data_train))
data_val.index       = range(len(data_val))

print("Train size = ", len(data_train))
print("Validation size = ", len(data_val))

Train size =  127656
Validation size =  31915


### Data loader

In [33]:
def iterate_batches(matrix, labels, batch_size, predict_mode='train'):
    indices = np.arange(len(matrix))
    if predict_mode == 'train':
        np.random.shuffle(indices)
    
    for start in range(0, len(matrix), batch_size):
        end = min(start + batch_size, len(matrix))
        
        batch_indices = indices[start: end]
        X = matrix[batch_indices]
        
        if predict_mode != 'train': yield X
        else: yield X, labels[batch_indices]

### Model Definition

In [48]:
class Flatten(nn.Module):
    def forward(self, input):
        return input.view(input.size(0), -1)
    
class MultiChannel(nn.Module):
    def __init__(self, pre_trained_embedding, vocab_size, embed_size, num_classes):
        super(MultiChannel, self).__init__()
        
        self.embed_size  = embed_size
        self.vocab_size  = vocab_size
        self.num_classes = num_classes
        self.in_channels = 2
        self.nfms        = 32
        self.ks          = [2, 3, 4, 5]
        
        # first embedding layer ( static )
        self.static_embedding        = nn.Embedding(self.vocab_size, self.embed_size)
        self.static_embedding.weight = nn.Parameter(pre_trained_embedding)
        
        # make it non-trainable
        self.static_embedding.weight.requires_grad  = False
        
        # second embedding layer ( non-static )
        self.non_static_embedding        = nn.Embedding(self.vocab_size, self.embed_size)
        self.non_static_embedding.weight = nn.Parameter(pre_trained_embedding)
        
        # define conv layers
        self.conv_layer1 = nn.Conv2d(self.in_channels, self.nfms, kernel_size=(self.ks[0], self.embed_size), padding=1)
        self.conv_layer2 = nn.Conv2d(self.in_channels, self.nfms, kernel_size=(self.ks[1], self.embed_size), padding=1)
        self.conv_layer3 = nn.Conv2d(self.in_channels, self.nfms, kernel_size=(self.ks[2], self.embed_size), padding=1)
        self.conv_layer4 = nn.Conv2d(self.in_channels, self.nfms, kernel_size=(self.ks[3], self.embed_size), padding=1) 
        
        # define activation function
        self.relu        = nn.ReLU()
        
        # define max pooling layer
        self.max_pool1   = nn.MaxPool2d(kernel_size=(MAX_LEN - self.ks[0] + 1, 1))
        self.max_pool2   = nn.MaxPool2d(kernel_size=(MAX_LEN - self.ks[1] + 1, 1))
        self.max_pool3   = nn.MaxPool2d(kernel_size=(MAX_LEN - self.ks[2] + 1, 1))
        self.max_pool4   = nn.MaxPool2d(kernel_size=(MAX_LEN - self.ks[3] + 1, 1))
        
        # fully connected layer
        self.fc = nn.Linear(self.nfms * (3 * len(self.ks)), self.num_classes)
        
        # flatten layer
        self.flatten = Flatten()
        
        # dropout layer
        self.dropout = nn.Dropout(0.1)
        
        # spatial dropout
        self.spatial_dropout = nn.Dropout2d(0.4)
        
        
    def forward(self, x):
        s_embed  = self.static_embedding(x)
        ns_embed = self.non_static_embedding(x)
        
        # batch, seq, embedding -> batch, embedding, seq
        s_embed_t  = torch.transpose(s_embed, 1, 2)
        s_embed_t  = self.spatial_dropout(s_embed_t)
        s_embed    = torch.transpose(s_embed_t, 1, 2)
        
        ns_embed_t = torch.transpose(ns_embed, 1, 2)
        ns_embed_t = self.spatial_dropout(ns_embed_t)
        ns_embed   = torch.transpose(ns_embed_t, 1, 2)
        
        del s_embed_t
        del ns_embed_t
        
        # change embedding to batch, channel, seq and elements
        s_embed  = s_embed.unsqueeze(1)
        ns_embed = ns_embed.unsqueeze(1)
        
        out      = torch.cat((s_embed, ns_embed), dim=1)
        
        # pass through first conv layer
        out1     = self.conv_layer1(out)
        out1     = self.relu(out1)
        
        # pass through second conv layer
        out2     = self.conv_layer2(out)
        out2     = self.relu(out2)
        
        # pass through third conv layer
        out3     = self.conv_layer3(out)
        out3     = self.relu(out3)
        
        # pass through fourth conv layer
        out4     = self.conv_layer4(out)
        out4     = self.relu(out4)
        
#         print('Conv')
#         print('out1 : {}'.format(out1.shape))
#         print('out2 : {}'.format(out2.shape))
#         print('out3 : {}'.format(out3.shape))
#         print()
        
        # max pooling over sequence
        out1      = self.max_pool1(out1)
        out2      = self.max_pool2(out2)
        out3      = self.max_pool3(out3)
        out4      = self.max_pool3(out4)
        
#         print('Max Pooling')
#         print('out1 : {}'.format(out1.shape))
#         print('out2 : {}'.format(out2.shape))
#         print('out3 : {}'.format(out3.shape))
        
        # concatenate along first axis
        out = torch.cat((out1, out2, out3, out4), dim=1)
        
        # flatten channels, seq and element -> batch, channels * seq * element
        out      = self.flatten(out)
        
#         print('shape after flatten ', out.shape)
        
        # dropout
        out      = self.dropout(out)
                
        # pass through fully connected layer
        out      = self.fc(out)
        
        return out

### Training Loop

In [35]:
def do_epoch(model, criterion, data, batch_size, optimizer=None):
    epoch_loss, total_size = 0, 0
    per_label_preds = [[], [], [], [], [], []]
    per_label_true  = [[], [], [], [], [], []]
    
    is_train = not optimizer is None
    model.train(is_train)
    
    data, labels = data
    batchs_count = math.ceil(data.shape[0] / batch_size)
    
    with torch.autograd.set_grad_enabled(is_train):
        for i, (X_batch, y_batch) in enumerate(iterate_batches(data, labels, batch_size)):
            X_batch, y_batch = torch.cuda.LongTensor(X_batch), torch.cuda.FloatTensor(y_batch)

            logits = model(X_batch)
            loss   = criterion(logits, y_batch)
            
            if is_train:
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
            
            # convert true target
            batch_target = y_batch.cpu().detach().numpy()
            logits_cpu   = logits.cpu().detach().numpy()
            
            # per_label_preds
            for j in range(6):
                label_preds     = logits_cpu[:, j]
                per_label_preds[j].extend(label_preds)
                per_label_true[j].extend(batch_target[:, j])
                            
            # calculate log loss
            epoch_loss += loss.item()

            print('\r[{} / {}]: Loss = {:.4f}'.format(
                  i, batchs_count, loss.item(), end=''))
    
    label_auc = []
    
    for i in range(6):
        label_auc.append(roc_auc_score(per_label_true[i], per_label_preds[i]))
    
    return epoch_loss / batchs_count, np.mean(label_auc)

def fit(model, criterion, optimizer, train_data, epochs_count=1, 
        batch_size=32, val_data=None, val_batch_size=None):
    if not val_data is None and val_batch_size is None:
        val_batch_size = batch_size
        
    for epoch in range(epochs_count):
        start_time = time.time()
        train_loss, train_auc = do_epoch(
            model, criterion, train_data, batch_size, optimizer
        )
        
        output_info = '\rEpoch {} / {}, Epoch Time = {:.2f}s: Train Loss = {:.4f}, Train AUC = {:.4f}'
        if not val_data is None:
            val_loss, val_auc   = do_epoch(model, criterion, val_data, val_batch_size, None)
            
            epoch_time   = time.time() - start_time
            output_info += ', Val Loss = {:.4f}, Val AUC = {:.4f}'
            print(output_info.format(epoch+1, epochs_count, epoch_time, 
                                     train_loss,
                                     train_auc,
                                     val_loss,
                                     val_auc
                                    ))
        else:
            epoch_time = time.time() - start_time
            print(output_info.format(epoch+1, epochs_count, epoch_time, train_loss, train_auc))

### Run on a single batch

In [36]:
matrix = as_matrix(data_train['tokenized_comments'], 
                   token_to_id, 
                   word_dropout=0, 
                   UNK_IX=UNK_IX, 
                   PAD_IX=PAD_IX,
                   max_len=MAX_LEN
                  )

labels = data_train.loc[:, TARGET_COLS].values
X, y   = next(iterate_batches(matrix, labels, batch_size=2))

In [37]:
X = torch.cuda.LongTensor(X)
y = torch.cuda.LongTensor(y)

In [46]:
vocab_size  = len(token_to_id)
embed_size  = 300
num_classes = 6

model = MultiChannel(torch.FloatTensor(embedding_matrix), 
                     vocab_size, 
                     embed_size,
                     num_classes
                    ).cuda()

In [47]:
logits = model(X)
print(logits)

out1 : torch.Size([2, 32, 101, 3])
out2 : torch.Size([2, 32, 100, 3])
out3 : torch.Size([2, 32, 99, 3])

out1 : torch.Size([2, 32, 1, 3])
out2 : torch.Size([2, 32, 1, 3])
out3 : torch.Size([2, 32, 1, 3])
shape after flatten  torch.Size([2, 384])
tensor([[ 0.1082, -0.1250,  0.0540,  0.0787,  0.1425, -0.0455],
        [ 0.1440, -0.1389,  0.1897,  0.1586,  0.1045,  0.1465]],
       device='cuda:0', grad_fn=<AddmmBackward>)


### Run on full batch

In [49]:
vocab_size  = len(token_to_id)
embed_size  = 300
num_classes = 6

model        = MultiChannel(torch.FloatTensor(embedding_matrix), 
                     vocab_size, 
                     embed_size,
                     num_classes
                    ).cuda()

criterion    = nn.BCEWithLogitsLoss().cuda()
optimizer    = optim.Adam([param for param in model.parameters() if param.requires_grad], lr=0.001)

X_train      = as_matrix(data_train['tokenized_comments'], 
                         token_to_id, 
                         word_dropout=0.001, 
                         UNK_IX=UNK_IX, 
                         PAD_IX=PAD_IX,
                         max_len=MAX_LEN
                        )

train_labels = data_train.loc[:, TARGET_COLS].values 

X_test       = as_matrix(data_val['tokenized_comments'],
                         token_to_id, 
                         word_dropout=0.001, 
                         UNK_IX=UNK_IX, 
                         PAD_IX=PAD_IX,
                         max_len=MAX_LEN
                        )

test_labels  = data_val.loc[:, TARGET_COLS].values

fit(model, criterion, optimizer, train_data=(X_train, train_labels), epochs_count=5, 
    batch_size=512, val_data=(X_test, test_labels), val_batch_size=1024)

[0 / 250]: Loss = 0.7188
[1 / 250]: Loss = 0.5770
[2 / 250]: Loss = 0.4508
[3 / 250]: Loss = 0.3552
[4 / 250]: Loss = 0.2681
[5 / 250]: Loss = 0.1899
[6 / 250]: Loss = 0.1967
[7 / 250]: Loss = 0.1959
[8 / 250]: Loss = 0.1869
[9 / 250]: Loss = 0.1457
[10 / 250]: Loss = 0.1814
[11 / 250]: Loss = 0.1775
[12 / 250]: Loss = 0.2292
[13 / 250]: Loss = 0.2387
[14 / 250]: Loss = 0.1985
[15 / 250]: Loss = 0.2140
[16 / 250]: Loss = 0.2229
[17 / 250]: Loss = 0.1788
[18 / 250]: Loss = 0.1559
[19 / 250]: Loss = 0.1723
[20 / 250]: Loss = 0.1358
[21 / 250]: Loss = 0.1387
[22 / 250]: Loss = 0.1284
[23 / 250]: Loss = 0.1480
[24 / 250]: Loss = 0.1655
[25 / 250]: Loss = 0.1761
[26 / 250]: Loss = 0.1281
[27 / 250]: Loss = 0.1622
[28 / 250]: Loss = 0.1459
[29 / 250]: Loss = 0.1351
[30 / 250]: Loss = 0.1187
[31 / 250]: Loss = 0.1295
[32 / 250]: Loss = 0.1594
[33 / 250]: Loss = 0.1936
[34 / 250]: Loss = 0.1332
[35 / 250]: Loss = 0.1527
[36 / 250]: Loss = 0.1246
[37 / 250]: Loss = 0.1102
[38 / 250]: Loss = 0.1

In [None]:
Epoch 5 / 5, Epoch Time = 13.93s: Train Loss = 0.0485, Train AUC = 0.9726, Val Loss = 0.0570, Val AUC = 0.9683