# NLP with RNNs: Toxic Comment Classification

In [1]:
import os

In [2]:
os.environ['KAGGLE_CONFIG_DIR'] = '/content'

In [3]:
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading jigsaw-toxic-comment-classification-challenge.zip to /content
 95% 50.0M/52.6M [00:03<00:00, 22.9MB/s]
100% 52.6M/52.6M [00:03<00:00, 15.0MB/s]


In [4]:
!unzip jigsaw-toxic-comment-classification-challenge.zip -d data

Archive:  jigsaw-toxic-comment-classification-challenge.zip
  inflating: data/sample_submission.csv.zip  
  inflating: data/test.csv.zip       
  inflating: data/test_labels.csv.zip  
  inflating: data/train.csv.zip      


In [5]:
import pandas as pd

In [6]:
train_fname = 'data/train.csv.zip'
test_fname = 'data/test.csv.zip'
sub_fname = 'data/sample_submission.csv.zip'

In [7]:
df = pd.read_csv(train_fname)
test_df = pd.read_csv(test_fname)
sub_df = pd.read_csv(sub_fname)

In [8]:
df.sample(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
17579,2e7bd5dfb7f7b066,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
66118,b0cb480bf62834d5,YOU'RE THE DISEASE OF AMERICA\nSTEALING JOBS F...,1,0,1,0,1,1
30740,51a650bf44d83489,More YouTube\n\nI f*cking hate those chain let...,1,0,1,0,0,0
79229,d4102514f29cdf19,"Only sites critisizing Phelps are listed, none...",0,0,0,0,0,0
159127,f8f20423b1f45a33,get lost \n\nGet lost steve and take your puri...,1,0,0,0,0,0
87752,eabbb835881464bf,"Unblock \n\nJon, after I finish leaving this m...",0,0,0,0,0,0
126437,a44803a9075c3039,Good point. I suppose repetition is okay. My m...,0,0,0,0,0,0
30068,4fd58fbae2a25c1c,"""\n\nThanks\nThanks for your message on my tal...",0,0,0,0,0,0
145145,17e1cd64f68e10e7,advice for teens \n\nhow is that nonsence? i a...,0,0,0,0,0,0
138243,e3c376cd0ed00a95,I wish to issue a sincere apology for what see...,0,0,0,0,0,0


In [9]:
target_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [10]:
for col in target_cols:
  print(df[col].value_counts(normalize=True))

0    0.904156
1    0.095844
Name: toxic, dtype: float64
0    0.990004
1    0.009996
Name: severe_toxic, dtype: float64
0    0.947052
1    0.052948
Name: obscene, dtype: float64
0    0.997004
1    0.002996
Name: threat, dtype: float64
0    0.950636
1    0.049364
Name: insult, dtype: float64
0    0.991195
1    0.008805
Name: identity_hate, dtype: float64


## Create Vocabulary

In [11]:
from torchtext.data.utils import get_tokenizer

In [12]:
tokenizer = get_tokenizer('basic_english')

In [13]:
from torchtext.vocab import build_vocab_from_iterator

In [14]:
VOCAB_SIZE = 1500

In [15]:
comment_tokens = df['comment_text'].map(tokenizer)

In [16]:
unk_token = '<unk>'
pad_token = '<pad>'

In [17]:
vocab = build_vocab_from_iterator(comment_tokens, specials=[unk_token, pad_token],
                                  max_tokens=VOCAB_SIZE)

In [18]:
## Replace the words not found in Vocabulary with Unknown Token
vocab.set_default_index(vocab[unk_token])

In [19]:
vocab['good']

108

### Check Indices of Sample Comments

In [20]:
sample_comment = df['comment_text'].sample(1).values[0]

In [21]:
sample_comment

'Sources\nno source given to support name Territory of the Military Commander in Serbia. show this source here.'

In [22]:
sample_comment_tokens = tokenizer(sample_comment)
sample_comment_tokens[:10]

['sources',
 'no',
 'source',
 'given',
 'to',
 'support',
 'name',
 'territory',
 'of',
 'the']

In [23]:
sample_indices = vocab.lookup_indices(sample_comment_tokens)
sample_indices

[120,
 58,
 131,
 336,
 5,
 349,
 123,
 0,
 7,
 3,
 910,
 0,
 15,
 0,
 2,
 309,
 18,
 131,
 74,
 2]

In [24]:
sample_indices_recovered = vocab.lookup_tokens(sample_indices)
sample_indices_recovered

['sources',
 'no',
 'source',
 'given',
 'to',
 'support',
 'name',
 '<unk>',
 'of',
 'the',
 'military',
 '<unk>',
 'in',
 '<unk>',
 '.',
 'show',
 'this',
 'source',
 'here',
 '.']

## Training and Validation Sets

In [25]:
MAX_LENGTH = 1150

In [26]:
def pad_tokens(tokens):
  
  if (len(tokens) >= MAX_LENGTH):
    return tokens[:MAX_LENGTH]
  else:
    return tokens + [pad_token] * (MAX_LENGTH - len(tokens))

In [27]:
import torch
from torch.utils.data import Dataset

In [28]:
class ToxicDataset(Dataset):
  def __init__(self, df, is_test=False):
    self.df = df
    self.is_test = is_test

  def __getitem__(self, index):
    comment_text = self.df['comment_text'].values[index]
    comment_tokens = pad_tokens(tokenizer(comment_text))
    input = torch.tensor(vocab.lookup_indices(comment_tokens))

    if self.is_test:
      target = torch.tensor([0,0,0,0,0,0]).float()
    else:
      target = torch.tensor(self.df[target_cols].values[index]).float()

    return input, target

  def __len__(self):
    return len(self.df)

In [29]:
dataset = ToxicDataset(df)

In [30]:
dataset[0]

(tensor([667,  85,   3,  ...,   1,   1,   1]),
 tensor([0., 0., 0., 0., 0., 0.]))

In [31]:
from torch.utils.data import random_split

In [32]:
VAL_FRAC = 0.25

In [33]:
train_ds, val_ds = random_split(dataset, [1-VAL_FRAC, VAL_FRAC])

In [34]:
len(dataset), len(train_ds), len(val_ds)

(159571, 119679, 39892)

In [35]:
test_dataset = ToxicDataset(test_df, is_test=True)

In [36]:
test_dataset[0]

(tensor([  0, 636,   0,  ...,   1,   1,   1]),
 tensor([0., 0., 0., 0., 0., 0.]))

## Pytorch DataLoader

In [37]:
from torch.utils.data import DataLoader

In [38]:
BATCH_SIZE = 256

In [39]:
train_dl = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=8, pin_memory=True)
val_dl = DataLoader(val_ds, batch_size=BATCH_SIZE*2, num_workers=8, pin_memory=True)
test_dl = DataLoader(test_dataset, batch_size=BATCH_SIZE*2, num_workers=8, pin_memory=True)



In [40]:
for batch in train_dl:
  b_inputs, b_targets = batch

  print(f'Batch Inputs Shape: {b_inputs.shape}')
  print(f'Batch Targets Shape: {b_targets.shape}')

  break

Batch Inputs Shape: torch.Size([256, 1150])
Batch Targets Shape: torch.Size([256, 6])


## Recurrent Neural Network

In [41]:
!pip install pytorch_lightning --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m720.6/720.6 kB[0m [31m43.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m53.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m63.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m33.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m18.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [42]:
import numpy as np
import pytorch_lightning as pl
import torch.nn.functional as F
import torch.nn as nn

In [43]:
emb_layer = nn.Embedding(VOCAB_SIZE, 256, 1)

In [44]:
rnn_layer = nn.RNN(256, 128, 1, batch_first=True)

In [45]:
for batch in train_dl:
  b_inputs, b_targets = batch

  print(f'Batch Inputs Shape: {b_inputs.shape}')
  print(f'Batch Targets Shape: {b_targets.shape}')

  emb_out = emb_layer(b_inputs)
  print('emb_out.shape', emb_out.shape)

  rnn_out, hn = rnn_layer(emb_out)
  print('rnn_out.shape', rnn_out.shape)
  print('hn.shape', hn.shape)

  break



Batch Inputs Shape: torch.Size([256, 1150])
Batch Targets Shape: torch.Size([256, 6])
emb_out.shape torch.Size([256, 1150, 256])
rnn_out.shape torch.Size([256, 1150, 128])
hn.shape torch.Size([1, 256, 128])


In [46]:
class ToxicModel(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.emb = nn.Embedding(VOCAB_SIZE, 256, 1)
        self.lstm = nn.LSTM(256, 128, 1, batch_first=True)
        self.linear = nn.Linear(128, 6)
        self.learning_rate = 0.001

    def forward(self, x):
        out = self.emb(x)
        out, hn = self.lstm(out)
        out = F.relu(out[:,-1,:])
        out = self.linear(out)
        return out

    def training_step(self, batch, batch_idx):
        inputs, targets = batch
        outputs = self(inputs)
        probs = torch.sigmoid(outputs)
        loss = F.binary_cross_entropy(probs, targets)
        return loss

    def validation_step(self, batch, batch_idx):
        inputs, targets = batch
        outputs = self(inputs)
        probs = torch.sigmoid(outputs)
        loss = F.binary_cross_entropy(probs, targets)
        return loss.item()

    def validation_epoch_end(self, validation_step_outputs):
        loss = np.mean(validation_step_outputs)
        print("Epoch #{}; Loss: {:4f} ".format(self.current_epoch, loss)) 

    def predict_step(self, batch, batch_idx):
        inputs, targets = batch
        outputs = self(inputs)
        probs = torch.sigmoid(outputs)
        return probs       

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.learning_rate)

In [47]:
model = ToxicModel()

In [48]:
for batch in train_dl:
  b_inputs, b_targets = batch

  print(f'Batch Inputs Shape: {b_inputs.shape}')
  print(f'Batch Targets Shape: {b_targets.shape}')

  outputs = model(b_inputs)
  print('outputs.shape', outputs.shape)

  probs = torch.sigmoid(outputs)
  loss = F.binary_cross_entropy(probs, b_targets)
  print('Loss', loss)
  break

Batch Inputs Shape: torch.Size([256, 1150])
Batch Targets Shape: torch.Size([256, 6])
outputs.shape torch.Size([256, 6])
Loss tensor(0.6932, grad_fn=<BinaryCrossEntropyBackward0>)


## Train and Evaluate

In [49]:
from pytorch_lightning.callbacks import lr_finder
trainer = pl.Trainer(max_epochs=3, accelerator='gpu')

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [50]:
trainer.tune(model, train_dl)

AttributeError: ignored

In [None]:
trainer.fit(model, train_dl, val_dl)