## Imports 

In [113]:
from IPython.display import clear_output
from collections import Counter

import pandas as pd 
import numpy as np 
import torch
import torch.nn as nn 
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

%matplotlib inline

## RNNs

Please, read about RNNs (Recurrent Neural Networks).  

1. Understand it's difference from the FFNNs. (Write your answer down below)  

https://towardsdatascience.com/recurrent-neural-networks-rnn-explained-the-eli5-way-3956887e8b75

https://towardsdatascience.com/learn-how-recurrent-neural-networks-work-84e975feaaf7

2. Why do we need recurrent neural networks? 
3. For which tasks it would work better? 

In [130]:
### 1. Your answer here 

In [131]:
### 2. Your answer here 

In [132]:
### 3. Your answer here 

## Load data 

In [8]:
# Load the DF created during the previous task

df_binary = pd.read_json("../jigsaw-toxic-comment-classification-challenge/df_binary.json")
df_binary.head()

Unnamed: 0,index,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,cleaned,toxicity
0,60236,b3925e41b823f473,"""\n\nThank you Ian. I knew about WP:NOTCENSORE...",0,0,0,0,0,0,"[``, thank, ian, knew, wp, notcensored, also, ...",0
1,116612,b686d9f97deab4ad,Oh. I never took your comments in any negative...,0,0,0,0,0,0,"[oh, never, took, comment, negative, way, perf...",0
2,72935,d96a1c99002f9cfc,Village pump and newbie \n\nI think your handl...,0,0,0,0,0,0,"[village, pump, newbie, think, handling, newbi...",0
3,30137,59a0576f85786c1f,I didn't change it hence this BS claim of me s...,0,0,0,0,0,0,"[n't, change, hence, b, claim, saying, keep, a...",0
4,148580,0f701c200f54455c,What the hell do you people expect? Wikipedia'...,1,0,1,0,1,1,"[hell, people, expect, wikipedia, 's, controll...",4


In [134]:
# Work with small amount of this data: 
df_sample, _ = train_test_split(df_binary, test_size=0.9, stratify=df_binary['obscene'])

In [164]:
def flat_nested(nested):
    flatten = []
    for item in nested:
        if isinstance(item, list):
            flatten.extend(item)
        else:
            flatten.append(item)
    return flatten

cnt_vocab = Counter(flat_nested(df_sample.cleaned.tolist()))

print("Vocab size before filtering: {}".format(len(cnt_vocab)))

threshold_count_l = 2
threshold_count_h = 500
threshold_len = 2

cleaned_vocab = [token for token, count in cnt_vocab.items() if 
                     threshold_count_h > count > threshold_count_l and len(token) > threshold_len
                ]
print("Vocab size after filtering: {}".format(len(cleaned_vocab)))

Vocab size before filtering: 50787
Vocab size after filtering: 13304


In [165]:
cleaned_vocab.append(" ")
# Convert list to set 
cleaned_vocab = set(cleaned_vocab)

In [166]:
token_to_id = {v: k for k, v in enumerate(sorted(cleaned_vocab))}
id_to_token = {v: k for k, v in token_to_id.items()}

Before passing our raw text to the model we need to represent each raw text by a vector.   
Let's do this by creating an empty list with all of the tokens in it represented by its id. 

In [167]:
def vectorize(data, token_to_id, max_len=None, dtype='int32', batch_first = True):
    """
    Casts a list of tokens into rnn-digestable matrix
        "data" contains only sequences represented by tokens from the dictionary, filter noise before 
    """
    
    max_len = max_len or max(map(len, data))
    # Create a marix with a shape [batch size, max number of tokens in sequence]
    data_ix = np.zeros([len(data), max_len], dtype) + token_to_id[' ']

    for i in range(len(data)):
        line_ix = [token_to_id[c] for c in data[i]]
        data_ix[i, :len(line_ix)] = line_ix
        
    if not batch_first: # convert [batch, time] into [time, batch]
        data_ix = np.transpose(data_ix)

    return data_ix

In [168]:
def filter_noise_tokens(df, cleaned_vocab): 
    df['filtered_tokens'] = df.cleaned.apply(lambda x: [tok for tok in x if tok in cleaned_vocab])
    return df 

In [169]:
# After applying this function there would be sentences with all tokens filtered - empty lists. 
df_sample = filter_noise_tokens(df_sample, cleaned_vocab)

# Remove examples without any tokens assigned 
df_filtered = df_sample[df_sample.astype(str)['filtered_tokens'] != '[]']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [170]:
# Perform train-test split (would be imbalanced)
# df_train, df_test = train_test_split(df_filtered, test_size=0.4, stratify=df_filtered['obscene'])

# Select only obscene texts
df_obscene = df_filtered[df_filtered['obscene'] == 1] 
# Select only clean texts 
df_clean = df_filtered[df_filtered['toxic'] == 0]

In [171]:
# Create a balanced dataset, number of 1 == number of 0 
df_balanced = df_obscene.append(df_clean.sample(df_obscene.shape[0]), ignore_index=True, sort=False)

df_train, df_test = train_test_split(df_balanced, test_size=0.4, stratify=df_balanced['obscene'])

In [172]:
print("Train shape: {}".format(df_train.shape))
print("Test shape: {}".format(df_test.shape))

Train shape: (981, 12)
Test shape: (655, 12)


In [173]:
# Example of vectorization 

vectorize(df_train.sample(4).filtered_tokens.tolist(), token_to_id)

array([[ 5413,  1260,  6763,   482,  7202, 12923,   216,  8349,  3511,
         2812, 10265,  9325,  3294, 11470,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0],
       [ 5668,   728,  3595,  5522,  1404,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     

In [174]:
class RNNLoop(nn.Module):
    
    def __init__(self, num_tokens, emb_size=200, hid_size=128):
        super(self.__class__, self).__init__()
        self.emb = nn.Embedding(num_tokens, emb_size)
        self.rnn = nn.RNN(emb_size, hid_size, batch_first=True)
        self.logits = nn.Linear(hid_size, 1)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        emb = self.emb(x)
        all_hidden_states, hidden = self.rnn(emb)
        logits = self.logits(hidden)
        output = self.sigmoid(logits)
        return output
    
# Initialise the model 
model = RNNLoop(num_tokens=len(cleaned_vocab))
# specify loss function
criterion = nn.BCELoss()
# specify optimizer
optimizer = torch.optim.SGD(model.parameters(), lr = 0.01)
history = []

In [175]:
batch_size = 64
n_epochs = 100 
n_iters = df_train.shape[0] // batch_size
print("Number of iterations for 1 epoch: {}".format(n_iters))

for epoch in range(n_epochs):
    epoch_loss = 0 
    for step in range(n_iters):

        optimizer.zero_grad()    # Forward pass
        # Make a random sample from the dataframe 
        sample = df_train.sample(4)

        # Vectorize the obtained sample 
        batch_ix = vectorize(sample.filtered_tokens.tolist(), token_to_id)
        # Convert vectorized batch to tensor 
        batch_ix = torch.tensor(batch_ix, dtype=torch.int64)

        # Select true labels 
        y_true = sample.obscene.tolist()
        # Convert true labels to tensor 
        y_true = torch.tensor(y_true, dtype=torch.float)

        # Make prediction 
        y_pred = model(batch_ix)

        loss = criterion(y_pred.squeeze(), y_true)

        epoch_loss += loss.item() / n_iters
        loss.backward()   # Backward pass 
        optimizer.step()

#         history.append(loss.data.numpy())
#         if (step + 1) % 100 == 0:
#             clear_output(True)
#             plt.plot(history, label='loss')
#             plt.legend()
#             plt.show()
            
    print('Epoch {}: train loss: {}'.format(epoch, epoch_loss))    
# assert np.mean(history[:25]) > np.mean(history[-25:]), "RNN didn't converge."

Number of iterations for 1 epoch: 15
Epoch 0: train loss: 0.707436223824819
Epoch 1: train loss: 0.7060327967007954
Epoch 2: train loss: 0.7112535278002421
Epoch 3: train loss: 0.6693460702896118
Epoch 4: train loss: 0.6519306083520253
Epoch 5: train loss: 0.712637746334076
Epoch 6: train loss: 0.680217734972636
Epoch 7: train loss: 0.684671096007029
Epoch 8: train loss: 0.6827967087427774
Epoch 9: train loss: 0.708529508113861
Epoch 10: train loss: 0.6483648419380189
Epoch 11: train loss: 0.7002609113852183
Epoch 12: train loss: 0.6998960673809053
Epoch 13: train loss: 0.6939862966537476
Epoch 14: train loss: 0.7067749937375387
Epoch 15: train loss: 0.6987305363019307
Epoch 16: train loss: 0.6728693942228954
Epoch 17: train loss: 0.6675439238548277
Epoch 18: train loss: 0.6990039388338725
Epoch 19: train loss: 0.6623107075691224
Epoch 20: train loss: 0.6954385121663412
Epoch 21: train loss: 0.7057859341303507
Epoch 22: train loss: 0.6943241635958353
Epoch 23: train loss: 0.69175578753