In [3]:
%reload_ext autoreload
%autoreload
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import os
import time
import gc

import argparse
import logging
import spacy
import re

from nltk.tokenize import WordPunctTokenizer
from torch.utils.data import Dataset
from collections import Counter, defaultdict
from pathlib import Path
from functools import partial
import gensim.models.keyedvectors as word2vec
from sklearn.metrics import roc_auc_score

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

SEED = 41
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [11]:
text = [['This is a cool story'],
        ['Aruba is in Jamaica.'],
        ['What the hell is going on here?'],
        ['The meanining of a word is its use in the language.'],
        ['What goes up comes down? me amigo'],
        ['This is going to be a sweet sweet ride.'],
        ['Man City quadruple is off.'],
        ['Scousers are bin dippers.'],
        ['Man Utd will be back because we always come back.']
       ]

text  = pd.DataFrame(text)
text.columns = ['comment_text']
text.loc[:, 'toxic']  = [1., 0., 1., 1., 0., 0., 0., 1., 0.]
text.loc[:, 'severe_toxic']  = [0., 1., 0., 0., 1., 1., 1., 1., 0.]
text.loc[:, 'obscene']  = [0., 1., 0., 0., 1., 1., 1., 1., 0.]
text.loc[:, 'threat']  = [0., 1., 0., 0., 1., 1., 1., 1., 0.]
text.loc[:, 'insult']  = [0., 1., 0., 0., 1., 1., 1., 1., 0.]
text.loc[:, 'identity_hate']  = [0., 1., 0., 0., 1., 1., 1., 1., 0.]

In [12]:
def check_labels(y): return all(v is None for v in y)

def load_w2v_embedding(emb_matrix):
    word2vec_dict   = word2vec.KeyedVectors.load_word2vec_format('../data/processed/word2vec.bin.gz', binary=True)
    embedding_index = dict()
    
    for word in word2vec_dict.wv.vocab:
        embedding_index[word] = word2vec_dict.word_vec(word)

    embed_cnt = 0

    for i, word in enumerate(vocab.itos):
        embedding_vector = embedding_index.get(word)

        if embedding_vector is not None:
            emb_matrix[i] = embedding_vector
            embed_cnt +=1

    del embedding_index
    gc.collect()

    # fill pad token with all zeros
    emb_matrix[vocab.stoi['xxxpad']] = np.zeros(embed_size)
    emb_matrix[vocab.stoi['xxxunk']] = np.zeros(embed_size)
    print('total embedded {} common words'.format(embed_cnt))
    
    return emb_matrix
    
    
class Tokenizer():
    def __init__(self, lang='en'):
        self.tok = spacy.blank(lang, disable=['parser', 'tagger', 'ner'])
    
    def tokenizer(self, t):
        return [t.text for t in self.tok.tokenizer(t)]
    
class Vocab():
    def __init__(self, itos):
        self.itos = itos
        self.stoi = defaultdict(int, {v:k for k,v in enumerate(self.itos)})
    
    def numericalize(self, t):
        return [self.stoi.get(w, self.stoi['xxxunk']) for w in t]
    
    def fix_len(self, sent_len, numericalized_tokens):
        return [nt[:sent_len] for nt in numericalized_tokens]
    
    def __getstate__(self):
        return {'itos':self.itos}
    
    def textify(self, nums, sep=' '):
        "Convert a list of `nums` to their tokens."
        return sep.join([self.itos[i] for i in nums]) if sep is not None else [self.itos[i] for i in nums]


    @classmethod
    def create(cls, tokens, max_vocab, min_freq):
        freq = Counter(p for o in tokens for p in o)
        itos = [o for o, c in freq.most_common(max_vocab) if c>= min_freq]
        itos = cls.add_special_symbols(itos)
        return cls(itos)
    
    @classmethod
    def add_special_symbols(cls, itos):
        pad_sym = 'xxxpad' #TODO: make sure we use config to introduce this symbol
        unk_sym = 'xxxunk'
        itos.append(pad_sym)
        itos.append(unk_sym)
        return itos

class TextLMData():
    
    def __init__(self, path, 
                 csv,
                 test_csv,
                 text_col, 
                 label_cols, 
                 max_vocab,
                 min_freq,
                 valid_pct=0.2):
        
        self.path       = path
        self.csv        = csv
        self.test_csv   = test_csv
        self.text_cols  = text_col
        self.label_cols = label_cols
        self.valid_pct  = valid_pct
        self.max_vocab  = max_vocab
        self.min_freq   = min_freq
        
        self.df      = pd.read_csv(Path(self.path)/self.csv)
        if self.test_csv is not None: self.test_df = pd.read_csv(Path(self.path)/self.test_csv) 
        self.cut     = int(valid_pct * len(self.df)) + 1
        
    def process(self):
        tok = Tokenizer()
        
        # consider entire corpus as text ( train + test text columns )
        if self.test_csv:
            text = list(self.df.loc[:, text_col].values) + list(self.test_df.loc[:, text_col])
        else:
            text = list(self.df.loc[:, text_col].values)
        
        self.tokens  = [tok.tokenizer(text) for text in text]
        self.vocab   = Vocab.create(self.tokens, self.max_vocab, self.min_freq)
        
        self.ntokens = [self.vocab.numericalize(t) for t in self.tokens]
        
        # only full training
        if self.valid_pct == 0 and self.test_csv is None:
            self.trn_ds      = (self.ntokens, self.df.loc[:, label_cols].values)
            self.vld_tokens  = ([], [])
            self.test_tokens = ([], [])
        
        # holdout
        elif self.valid_pct > 0 and self.test_csv is None:
            self.trn_ds  = (self.ntokens[self.cut:], self.df.loc[:, label_cols].values[self.cut:])
            self.vld_ds  = (self.ntokens[:self.cut], self.df.loc[:, label_cols].values[:self.cut])
            self.tst_ds  = ([], [])
        
        # holdout and test prediction
        elif self.valid_pct > 0 and self.test_csv is not None:
            self.trn_tokens  = self.ntokens[:len(self.df)]
            self.tst_ds      = (self.ntokens[len(self.df):], [])
            
            trn_tokens  = self.trn_tokens[self.cut:]
            vld_tokens  = self.trn_tokens[:self.cut]
            
            self.trn_ds = (trn_tokens, self.df.loc[:, label_cols].values[self.cut:])
            self.vld_ds = (vld_tokens, self.df.loc[:, label_cols].values[:self.cut])
        
        # full training and test prediction
        else:
            self.trn_ds  = (self.ntokens[:len(self.df)], self.df.loc[:, label_cols].values)
            self.vld_ds  = ([], [])
            self.tst_ds  = (self.ntokens[len(self.df):], [])
            
        return self.vocab, self.trn_ds, self.vld_ds, self.tst_ds
    
    def fill_emb_matrix(self, vocab, emb_type, embed_size):
        emb_matrix = np.random.random(size=(len(self.vocab.itos), embed_size))
        
        if emb_type == 'w2v':
            emb_matrix = load_w2v_embedding(emb_matrix)
            
        return emb_matrix
        
class TextClassData(Dataset):
    def __init__(self, vocab, ds):
        self.vocab       = vocab
        self.ds, self.y  = ds
                            
    def __len__(self):
        return len(self.ds)
                            
    def __getitem__(self, index):
        x = torch.LongTensor(self.ds[index])
        y = None
        if len(self.y) > 0: y = torch.FloatTensor(self.y[index])
        
        return x, y
        
def pad_collate(data, pad_idx, sent_len):
    if len(data) == 1:
        sequences, labels = data[0]
        sequences = sequences.view(1, -1)
        if labels is not None: labels    = labels.view(1, -1)
    else:
        sequences, labels = zip(*data)
        if not check_labels(labels): labels = torch.cat([l.view(-1, 1) for l in labels], dim=1).t()
        sequences = nn.utils.rnn.pad_sequence(sequences, batch_first=True, padding_value=pad_idx)
    
    sent_len  = min(sequences.size(1), sent_len)
    sequences = sequences[:, :sent_len]
    
    return sequences, labels

In [13]:
path       = Path('../data/processed')
csv        = 'sample.csv'
text_col   = 'comment_text'
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
test_csv   = None
max_vocab  = 100000
min_freq   = 3
embed_size = 3
emb_type   = 'glove'
valid_pct  = .2 # change it to 0 for full training modmax_vocab  = 100000

tmp = TextLMData(path, 
                 csv,
                 test_csv,
                 text_col, 
                 label_cols,
                 max_vocab,
                 min_freq,
                 valid_pct=valid_pct
                )

vocab, trn_ds, vld_ds, tst_ds = tmp.process()
emb_matrix                    = tmp.fill_emb_matrix(vocab, emb_type, embed_size)

In [14]:
sent_len   = 10
collate_fn = partial(pad_collate, pad_idx=vocab.stoi['xxxpad'], sent_len=sent_len) #TODO: make sure pad symbol is defined as a constant

trn_ds     = TextClassData(vocab, trn_ds)
vld_ds     = TextClassData(vocab, vld_ds)

In [21]:
trn_dl     = torch.utils.data.DataLoader(trn_ds, batch_size=2, shuffle=True, collate_fn=collate_fn, num_workers=4)
vld_dl     = torch.utils.data.DataLoader(vld_ds, batch_size=2, shuffle=False, collate_fn=collate_fn, num_workers=4)

### Model Definition

In [65]:
class CNN(nn.Module):
    def __init__(self, emb_matrix, vocab_size, embed_size):
        super(CNN, self).__init__()
        
        self.embedding        = nn.Embedding(vocab_size, embed_size)
        self.embedding.weight = nn.Parameter(emb_matrix)
        self.embedding.weight.requires_grad = False
        
        self.input_size  = 3
        self.hidden_size = 5 
        self.nfilters    = 7
        
        self.relu     = nn.ReLU()
        self.tanh     = nn.Tanh()
        self.lstm     = nn.LSTM(self.input_size, self.hidden_size, num_layers=1)
        self.conv     = nn.Conv1d(in_channels=self.hidden_size * 2 + embed_size,
                                  out_channels=self.nfilters,
                                  kernel_size=1,
                                 )
        self.projection_layer = nn.Linear(self.nfilters, 6)
        
        
        
    def forward(self, x):
        embed         = self.embedding(x)
        left_context  = F.pad(embed, (0, 0, 1, 0, 0, 0))[:, :-1, :]
        right_context = F.pad(embed, (0, 0, 0, 1, 0, 0))[:, 1:, :]
        
        left_context  = left_context.permute(1, 0, 2)
        fwd, _        = self.lstm(left_context)
        
        right_context_reversed = torch.flip(right_context, [1])
        right_context_reversed = right_context_reversed.permute(1, 0, 2)
        
        bwd, _   = self.lstm(right_context_reversed)
        bwd      = torch.flip(bwd, [1])
        
        # change from (seq_len, batch, embed_size) -> (batch, seq_len, embed_size)
        fwd       = fwd.permute(1, 0, 2)
        bwd       = bwd.permute(1, 0, 2)
        
        out = torch.cat((fwd, embed, bwd), dim=2)
        
        # change from (batch, seq_len, embed_size) -> (batch, embed_size, seq_len)
        out = out.permute(0, 2, 1)
        out = self.conv(out)
        out = self.tanh(out)
        
        print(embed.size())
        print(out.size())
        
        # do max pooling
        out = out.max(dim=2)[0]
        print('after max pooling: {}'.format(out.size()))
        out = self.projection_layer(out)
        print(out.size())
        
        return -1

### Run on a batch

In [66]:
x, _  = next(iter(trn_dl))
model = CNN(torch.FloatTensor(emb_matrix), len(vocab.itos), embed_size) 
out   = model(x)

torch.Size([2, 10, 3])
torch.Size([2, 7, 10])
after max pooling: torch.Size([2, 7])
torch.Size([2, 6])


### Training Loop

In [4]:
def train(model, iterator, optimizer, criterion):
    
    epoch_loss      = 0
    per_label_preds = [[], [], [], [], [], []]
    per_label_true  = [[], [], [], [], [], []]

    
    model.train()
    
    for i, batch in enumerate(iterator):
        
        optimizer.zero_grad()
        X, y        = batch        
        
        X           = X.to(device)
        y           = y.to(device)
        
        predictions = model(X)
        
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        
        # convert true target
        batch_target = y.cpu().detach().numpy()
        logits_cpu   = predictions.cpu().detach().numpy()

        # per_label_preds
        for j in range(6):
            label_preds     = logits_cpu[:, j]
            per_label_preds[j].extend(label_preds)
            per_label_true[j].extend(batch_target[:, j])

        # calculate log loss
        epoch_loss += loss.item()

        print('\r[{} / {}]: Loss = {:.4f}'.format(
              i, len(iterator), loss.item(), end=''))
    
    label_auc = []

    for i in range(6):
        label_auc.append(roc_auc_score(per_label_true[i], per_label_preds[i]))
    
    return epoch_loss / len(iterator), np.mean(label_auc)


def evaluate(model, iterator, criterion):
    epoch_loss      = 0
    per_label_preds = [[], [], [], [], [], []]
    per_label_true  = [[], [], [], [], [], []]
    preds           = []

    model.eval()
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):
            X, y        = batch
            
            X           = X.to(device)
            predictions = model(X)
            
            # convert true target
            logits_cpu   = predictions.cpu().detach().numpy()

            preds.append(logits_cpu)
            
            if not check_labels(y): 
                y    = y.to(device)
                loss = criterion(predictions, y)
                batch_target = y.cpu().detach().numpy()
                
                
                # per_label_preds
                for j in range(6):
                    label_preds     = logits_cpu[:, j]
                    per_label_preds[j].extend(label_preds)
                    per_label_true[j].extend(batch_target[:, j])

                # calculate log loss
                epoch_loss += loss.item()

                print('\r[{} / {}]: Loss = {:.4f}'.format(
                      i, len(iterator), loss.item(), end=''))
    
    label_auc = []

    if len(per_label_preds[0]) > 0:
        for i in range(6):
            label_auc.append(roc_auc_score(per_label_true[i], per_label_preds[i]))

    return epoch_loss / len(iterator), np.mean(label_auc) if len(label_auc) > 0 else 0, np.vstack(preds)

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
N_EPOCHS = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):

    start_time = time.time()
    
    train_loss, train_auc    = train(model, trn_dl, optimizer, criterion)
    valid_loss, valid_auc, _ = evaluate(model, vld_dl, criterion)
    
    end_time = time.time()

    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
#         torch.save(model.state_dict(), 'tut4-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train AUC: {train_auc:.2f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. AUC: {valid_auc:.2f}')