In [1]:
%%html
<style> table {float:left} </style>

In [2]:
!pip install torch tqdm lazyme nltk gensim
!python -m nltk.downloader punkt



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Z370\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
import numpy as np
from tqdm import tqdm

import pandas as pd

from gensim.corpora import Dictionary

import torch
from torch import nn, optim, tensor, autograd
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

In [4]:
try: # Use the default NLTK tokenizer.
    from nltk import word_tokenize, sent_tokenize 
    # Testing whether it works. 
    # Sometimes it doesn't work on some machines because of setup issues.
    word_tokenize(sent_tokenize("This is a foobar sentence. Yes it is.")[0])
    print("OK")
except: # Use a naive sentence tokenizer and toktok.
    import re
    from nltk.tokenize import ToktokTokenizer
    # See https://stackoverflow.com/a/25736515/610569
    sent_tokenize = lambda x: re.split(r'(?<=[^A-Z].[.?]) +(?=[A-Z])', x)
    # Use the toktok tokenizer that requires no dependencies.
    toktok = ToktokTokenizer()
    word_tokenize = word_tokenize = toktok.tokenize

OK


# Classifying Toxic Comments

Lets apply what we learnt in a realistic task and **fight cyber-abuse with NLP**!

From https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/

> *The threat of abuse and harassment online means that many people stop <br>*
> *expressing themselves and give up on seeking different opinions. <br>*
> *Platforms struggle to effectively facilitate conversations, leading many <br>*
> *communities to limit or completely shut down user comments.*


The goal of the task is to build a model to detect different types of of toxicity:

 - toxic
 - severe toxic
 - threats
 - obscenity
 - insults
 - identity-based hate
 
In this part, you'll be munging the data as how I would be doing it at work. 

Your task is to train a feed-forward network on the toxic comments given the skills we have accomplished thus far.

## Digging into the data...

If you're using linux/Mac you can use these bang commands in the notebook:

```
!pip3 install kaggle
!mkdir -p /content/.kaggle/
!echo '{"username":"natgillin","key":"54ae95ab760b52c3307ed4645c6c9b5d"}' > /content/.kaggle/kaggle.json
!chmod 600 /content/.kaggle/kaggle.json
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
!unzip /content/.kaggle/competitions/jigsaw-toxic-comment-classification-challenge/*
```

Otherwise, download the data from https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/ 

In [5]:
import os
os.chdir("D:/projects/tsundoku-master/data/toxic/")
os.getcwd()

'D:\\projects\\tsundoku-master\\data\\toxic'

In [6]:
# df_train = pd.read_csv("../input/train.csv")
df_train = pd.read_csv("train.csv")
df_train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
df_train['comment_text_tokenzied'] = df_train['comment_text'].apply(word_tokenize)

In [8]:
df_train['comment_text_tokenzied'].head(5)

0    [Explanation, Why, the, edits, made, under, my...
1    [D'aww, !, He, matches, this, background, colo...
2    [Hey, man, ,, I, 'm, really, not, trying, to, ...
3    [``, More, I, ca, n't, make, any, real, sugges...
4    [You, ,, sir, ,, are, my, hero, ., Any, chance...
Name: comment_text_tokenzied, dtype: object

In [9]:
# Just in case your Jupyter kernel dies, save the tokenized text =)

# To save your tokenized text you can do this:
import pickle
with open('train_tokenized_text.pkl', 'wb') as fout:
    pickle.dump(df_train['comment_text_tokenzied'], fout)


In [10]:
# To load it back:
import pickle
with open('train_tokenized_text.pkl', 'rb') as fin:
    text_tokenzied = pickle.load(fin)
    df_train['comment_text_tokenzied'] = text_tokenzied
text_tokenzied[:5]

0    [Explanation, Why, the, edits, made, under, my...
1    [D'aww, !, He, matches, this, background, colo...
2    [Hey, man, ,, I, 'm, really, not, trying, to, ...
3    [``, More, I, ca, n't, make, any, real, sugges...
4    [You, ,, sir, ,, are, my, hero, ., Any, chance...
Name: comment_text_tokenzied, dtype: object

# How to get a one-hot?

There are many variants of how to get your one-hot embeddings from the individual columns.

This is one way:

In [11]:
label_column_names = "toxic	severe_toxic	obscene	threat	insult	identity_hate".split()
y_train = df_train[label_column_names].values

In [12]:
ts_y_train = torch.tensor(y_train).float()

In [13]:
# Convert one-hot to indices of the column.

print(np.argmax(df_train[label_column_names].values, axis=1))

[0 0 0 ... 0 0 0]


In [14]:
class ToxicDataset(Dataset):
    def __init__(self, tokenized_texts, labels):
        self.sents = tokenized_texts
        self.labels = labels
        self.vocab = Dictionary(tokenized_texts)
        special_tokens = {'<pad>': 0, '<unk>':1}
        self.vocab.patch_with_special_tokens(special_tokens)
        self.max_len = max([len(sent) for sent in tokenized_texts])
        self.vocab_size = len(self.vocab)
        self._len = len(tokenized_texts)
        
    def __getitem__(self, sent_index):
        sent = self.sents[sent_index]
        vectorized_sent = self.vectorize(sent)
        sent_len = len(vectorized_sent)
        pad_len = self.max_len - len(vectorized_sent)
        pad_dim = (0, pad_len)
        padded_vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
        return {'x':padded_vectorized_sent, 
                'y':torch.tensor(self.labels[sent_index]), 
                'x_len':sent_len}
    
    
    def __len__(self):
        return self._len
    
    
    def vectorize(self, tokens):
        return torch.tensor(self.vocab.doc2idx(tokens))
        
    def unvectorize(self, indices):
        return [self.vocab[i] for i in indices]


In [15]:
label_column_names = "toxic	severe_toxic	obscene	threat	insult	identity_hate".split()
toxic_data = ToxicDataset(text_tokenzied, df_train[label_column_names].values)

In [16]:

batch_size = 100
dataloader = DataLoader(toxic_data, batch_size=batch_size, shuffle=True)

for data_dict in dataloader:
    # Sort indices of data in batch by lengths.
    sorted_indices = np.array(data_dict['x_len']).argsort()[::-1].tolist()
    data_batch = {name:_tensor[sorted_indices]
                  for name, _tensor in data_dict.items()}
    print(data_batch)
    break


    

{'x': tensor([[   103,   1128,     11,  ...,      0,      0,      0],
        [   103,   6134,     90,  ...,      0,      0,      0],
        [   103,   3914,    409,  ...,      0,      0,      0],
        ...,
        [  4705,    870,    104,  ...,      0,      0,      0],
        [  2062,    165,     82,  ...,      0,      0,      0],
        [ 16971,      2, 116462,  ...,      0,      0,      0]]), 'y': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
  

In [17]:
class FFNet(nn.Module):
    def __init__(self, max_len, num_labels, vocab_size, embedding_size, hidden_dim, output_size):
        super(FFNet, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_size, padding_idx=0)
        self.linear1 = nn.Linear(embedding_size*max_len, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
    
    def forward(self, inputs):
        # We want to flatten the inputs so that we get the matrix of shape.
        # batch_size x no. of tokens in each input * embedding_size
        batch_size, max_len = inputs.shape
        embedded = self.embeddings(inputs).view(batch_size, -1)
        hid = F.relu(self.linear1(embedded))
        out = self.linear2(hid)
        probs = F.sigmoid(out)
        return probs
            

In [18]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

embedding_size = 100
learning_rate = 0.003
hidden_size = 100

criterion = nn.BCELoss()
# Hint: the CBOW model object you've created.
model = FFNet(toxic_data.max_len, 
              len(label_column_names),
              toxic_data.vocab_size, 
              embedding_size=embedding_size, 
              hidden_dim=hidden_size,
             output_size=6).to(device)


optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#model = nn.DataParallel(model)

losses = []
num_epochs = 10
for _e in range(num_epochs):
    epoch_loss = []
    nbatch = 0
    for batch in tqdm(dataloader):
        optimizer.zero_grad()
        x = batch['x'].to(device)
        x_len = batch['x_len'].to(device)
        y = batch['y'].to(device)
        output = model(x)
        loss = criterion(output, y.float())
        loss.backward()
        optimizer.step()
        epoch_loss.append(loss.float().data)
        nbatch = nbatch + 1
        if nbatch % 100 == 0:
            print(sum(epoch_loss)/len(epoch_loss))
        
    losses.append(epoch_loss/nbatch)
     

cuda


  7%|█████▏                                                                         | 104/1596 [00:04<00:54, 27.46it/s]

tensor(0.2120, device='cuda:0')


 13%|██████████                                                                     | 204/1596 [00:07<00:49, 27.84it/s]

tensor(0.1692, device='cuda:0')


 19%|███████████████                                                                | 304/1596 [00:11<00:46, 27.70it/s]

tensor(0.1528, device='cuda:0')


 25%|███████████████████▉                                                           | 404/1596 [00:14<00:43, 27.71it/s]

tensor(0.1415, device='cuda:0')


 32%|████████████████████████▉                                                      | 504/1596 [00:18<00:39, 27.50it/s]

tensor(0.1346, device='cuda:0')


 38%|█████████████████████████████▉                                                 | 604/1596 [00:21<00:36, 27.53it/s]

tensor(0.1298, device='cuda:0')


 44%|██████████████████████████████████▊                                            | 703/1596 [00:25<00:33, 26.96it/s]

tensor(0.1241, device='cuda:0')


 50%|███████████████████████████████████████▊                                       | 805/1596 [00:29<00:29, 27.06it/s]

tensor(0.1203, device='cuda:0')


 57%|████████████████████████████████████████████▊                                  | 905/1596 [00:32<00:25, 27.07it/s]

tensor(0.1173, device='cuda:0')


 63%|█████████████████████████████████████████████████                             | 1003/1596 [00:36<00:22, 26.79it/s]

tensor(0.1145, device='cuda:0')


 69%|█████████████████████████████████████████████████████▉                        | 1104/1596 [00:39<00:18, 26.44it/s]

tensor(0.1117, device='cuda:0')


 75%|██████████████████████████████████████████████████████████▊                   | 1203/1596 [00:43<00:15, 25.90it/s]

tensor(0.1104, device='cuda:0')


 82%|███████████████████████████████████████████████████████████████▋              | 1304/1596 [00:46<00:10, 26.55it/s]

tensor(0.1084, device='cuda:0')


 88%|████████████████████████████████████████████████████████████████████▌         | 1404/1596 [00:50<00:07, 25.59it/s]

tensor(0.1067, device='cuda:0')


 94%|█████████████████████████████████████████████████████████████████████████▌    | 1504/1596 [00:53<00:03, 26.58it/s]

tensor(0.1054, device='cuda:0')


100%|██████████████████████████████████████████████████████████████████████████████| 1596/1596 [00:57<00:00, 27.89it/s]


TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [None]:
def predict(text):
    # Vectorize and Pad.
    vectorized_sent = toxic_data.vectorize(word_tokenize(text))
    pad_dim = (0, toxic_data.max_len - len(vectorized_sent))
    vectorized_sent = F.pad(vectorized_sent, pad_dim, 'constant')
    # Forward Propagation.
    # Unsqueeze because model is expecting `batch_size` x `sequence_len` shape.
    outputs = model(vectorized_sent.unsqueeze(0).to(device))
    # To get the boolean output, we check if outputs are > 0.5
    return [int(l > 0.5) for l in outputs.squeeze()]
    # What happens if you use torch.max instead? =)
    ##return label_column_names[int(torch.max(outputs, dim=1).indices)]

In [None]:
text = "I will kill you."

In [None]:
print(label_column_names)
predict(text)