In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pdb
import os

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_summary import DataFrameSummary

import torch
import torch.nn.functional as F
import torchtext
from torchtext import vocab, data
from torchtext.datasets import language_modeling

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as spacy_STOPWORDS
spacy_en = spacy.load('en')

from wordcloud import WordCloud, STOPWORDS

# pandas and plotting config
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

In [3]:
PATH = 'data'

os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)
os.makedirs(f'{PATH}/submissions', exist_ok=True)

In [4]:
raw_train_df = pd.read_csv(f'{PATH}/train.csv')
test_df = pd.read_csv(f'{PATH}/test.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

In [106]:
pretrained_vectors = 'fasttext.en.300d'
max_features = 30000
min_freq = 10
max_len = 100
emb_sz = 300

In [107]:
tokenize = lambda x: x.split()

TEXT_fld = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=max_len)

LABEL_fld = data.Field(sequential=False, use_vocab=False, tensor_type=torch.cuda.ByteTensor)

In [108]:
# torchtext cannot read the .csv files correctly if there are newline characters, so replace with " "
raw_train_df.comment_text = raw_train_df.comment_text.str.replace("\n", " ")
test_df.comment_text = test_df.comment_text.str.replace("\n", " ")

In [109]:
# split the training data into a train and validatin dataset
trn, val = train_test_split(raw_train_df, test_size=0.05, random_state=42)
print(len(trn), len(val), len(trn[trn.none != 1]), len(val[val.none != 1]))

# save train, val, and test datasets for torchtext
trn.to_csv(f'{PATH}/train_ds.csv', index=None)
val.to_csv(f'{PATH}/valid_ds.csv', index=None)
test_df.to_csv(f'{PATH}/test_ds.csv', index=None)

151592 7979 15427 798


In [110]:
display(pd.read_csv("data/train_ds.csv").head(2))
display(pd.read_csv("data/valid_ds.csv").head(2))
display(pd.read_csv("data/test_ds.csv").head(2))

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,6e76a143f0fd817f,""" I don't think that is the official chart. Listed at United World Singles Chart: """"The South African sales and airplay chart is merged into one to form the official singles chart which is published weekly"""". From what I've seen, """"Get Together"""" has yet to enter the record chart.""",0,0,0,0,0,0,1
1,f4279ea981109ed7,"Mariusz and I both like to vandalize pages. We simply do it out of anger and frustration. I have kids who drive me nuts, and I take it out on Wikipedia. —",0,0,0,0,0,0,1


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,7ca72b5b9c688e9e,"Geez, are you forgetful! We've already discussed why Marx was not an anarchist, i.e. he wanted to use a State to mold his 'socialist man.' Ergo, he is a statist - the opposite of an anarchist. I know a guy who says that, when he gets old and his teeth fall out, he'll quit eating meat. Would you call him a vegetarian?",0,0,0,0,0,0,1
1,c03f72fd8f8bf54f,"Carioca RFA Thanks for your support on my request for adminship. The final outcome was (31/4/1), so I am now an administrator. If you have any comments or concerns on my actions as an administrator, please let me know. Thank you!",0,0,0,0,0,0,1


Unnamed: 0,id,comment_text
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
1,0000247867823ef7,"== From RfC == The title is fine as it is, IMO."


There are various built-in Datasets in torchtext that handle common use cases. **For csv/tsv files, the TabularDataset class** is convenient. Here’s how we would read data from a csv file using the TabularDataset:

In [111]:
%%time

# train/validation
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    ("comment_text", TEXT_fld), ("toxic", LABEL_fld),
                    ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld),
                    ("identity_hate", LABEL_fld), ("none", None)]

train_ds, valid_ds = data.TabularDataset.splits(PATH, train='train_ds.csv', validation='valid_ds.csv',
                                          format='csv', skip_header=True, fields=train_datafields)

# test
test_datafields = [("id", None), ("comment_text", TEXT_fld)]

test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)

CPU times: user 14.4 s, sys: 736 ms, total: 15.1 s
Wall time: 15 s


In [112]:
train_ds[0]

<torchtext.data.example.Example at 0x7f1344296828>

In [113]:
train_ds[0].__dict__.keys()

dict_keys(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [114]:
train_ds[1].comment_text[:5]

['mariusz', 'and', 'i', 'both', 'like']

In [115]:
TEXT_fld.build_vocab(train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)

In [116]:
# The vocab.freqs is a collections.Counter object, so we can take a look at the most frequent words.
TEXT_fld.vocab.freqs.most_common(10)

[('the', 466417),
 ('to', 279718),
 ('of', 211982),
 ('and', 207824),
 ('a', 201672),
 ('i', 186835),
 ('you', 178267),
 ('is', 162474),
 ('that', 139210),
 ('in', 133885)]

In [117]:
train_iter, val_iter = data.BucketIterator.splits(
    (train_ds, valid_ds), # we pass in the datasets we want the iterator to draw data from
    batch_sizes=(64, 64),
    device=0, # if you want to use the GPU, specify the GPU number here
    sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [118]:
batch = next(train_iter.__iter__()); batch

<torchtext.data.batch.Batch at 0x7f12fa489898>

In [119]:
batch.__dict__.keys()

dict_keys(['batch_size', 'dataset', 'train', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

For the test set, we don't want the data to be shuffled. This is why we'll be using a standard Iterator.

In [120]:
test_iter = data.Iterator(test_ds, batch_size=64, device=0, sort=False, sort_within_batch=False, repeat=False)

In [121]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([ getattr(batch, feat).unsqueeze(1) for feat in self.y_vars ], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [122]:
train_dl = BatchWrapper(train_iter, "comment_text", label_cols)
valid_dl = BatchWrapper(val_iter, "comment_text", label_cols)
test_dl = BatchWrapper(test_iter, "comment_text", None)

## 6. Training the model

Define a simple LSTM

In [123]:
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable

In [124]:
class SimpleBiLSTMBaseline(nn.Module):
    def __init__(self, vocab_sz, emb_sz=300, n_hidden=256, out_sz=1,
                 spatial_dropout=0.2, recurrent_dropout=0.1):
        
        super().__init__() # don't forget to call this!
        
        self.emb = nn.Embedding(vocab_sz, emb_sz)
        self.emb.data = train_ds.fields['comment_text'].vocab.vectors # to use the pretrained vectors
        self.emb_drop = nn.Dropout(spatial_dropout)
        
        self.encoder = nn.LSTM(emb_sz, n_hidden, bidirectional=True, num_layers=1, dropout=recurrent_dropout)
        
        # x4 if bidrectional; els x2
        self.outp = nn.Linear(n_hidden * 4, out_sz)
    
    def forward(self, seq):
        x = self.emb_drop(self.emb(seq))
        
        # output = seq_len, batch, hidden_size * num_directions 
        # h = num_layers * num_directions, batch, hidden_size
        output, h = self.encoder(x)        
        sl, bs, _ = output.size()
  
        avg_pool = F.adaptive_avg_pool1d(output.permute(1,2,0), (1,)).view(bs,-1)   
        max_pool = F.adaptive_max_pool1d(output.permute(1,2,0), (1,)).view(bs,-1) 
        
        conc = torch.cat([avg_pool, max_pool], dim=1)
        # pdb.set_trace()
        outp = F.sigmoid(self.outp(conc))
        return outp

In [125]:
vocab_sz = len(TEXT_fld.vocab)
nh = 80
nl = 1 #3

model = SimpleBiLSTMBaseline(vocab_sz, emb_sz, n_hidden=nh, out_sz=6); model

SimpleBiLSTMBaseline(
  (emb): Embedding(30002, 300)
  (emb_drop): Dropout(p=0.2)
  (encoder): LSTM(300, 80, dropout=0.1, bidirectional=True)
  (outp): Linear(in_features=320, out_features=6, bias=True)
)

In [126]:
# if you're using a GPU, remember to call model.cuda() to move your model to the GPU.
model.cuda()

SimpleBiLSTMBaseline(
  (emb): Embedding(30002, 300)
  (emb_drop): Dropout(p=0.2)
  (encoder): LSTM(300, 80, dropout=0.1, bidirectional=True)
  (outp): Linear(in_features=320, out_features=6, bias=True)
)

### Training Loop

In [127]:
import tqdm

In [128]:
opt = optim.Adam(model.parameters(), lr=1e-3)
loss_func = nn.BCELoss()

# nn.utils.clip_grad_norm(model.parameters(), 0.3)

In [129]:
n_epochs = 4

In [130]:
%%time

for epoch in range(n_epochs):
    print(f'Epoch {epoch}/{n_epochs - 1}')
    print('-' * 10)
    
    running_loss = 0.0
    running_corrects = 0
    
    model.train(True) # turn on training mode
    
    for x, y in tqdm.tqdm(train_dl): # thanks to our wrapper, we can intuitively iterate over our data!
        # zero out gradients
        opt.zero_grad()
        
        # forward
        preds = model(x)
        loss = loss_func(preds, y)
        
        # backprop
        loss.backward()
        opt.step()
        
        running_loss += loss.data[0] * x.size(0)
        
    epoch_loss = running_loss / len(train_ds)
    
    # calculate the validation loss for this epoch
    val_loss = 0.0
    model.eval() # turn on evaluation mode
    
    for x, y in valid_dl:
        preds = model(x)
        loss = loss_func(preds, y)
        val_loss += loss.data[0]  * x.size(0)

    val_loss /= len(valid_ds)
    
    print('Epoch: {}, Training Loss: {:.4f}, Validation Loss: {:.4f}'.format(epoch, epoch_loss, val_loss))

  0%|          | 0/2369 [00:00<?, ?it/s]

Epoch 0/3
----------


  

  0%|          | 0/2369 [00:00<?, ?it/s]

Epoch: 0, Training Loss: 0.1254, Validation Loss: 0.0912
Epoch 1/3
----------


100%|██████████| 2369/2369 [01:30<00:00, 26.26it/s]
  0%|          | 0/2369 [00:00<?, ?it/s]

Epoch: 1, Training Loss: 0.0844, Validation Loss: 0.0822
Epoch 2/3
----------


100%|██████████| 2369/2369 [01:30<00:00, 26.28it/s]
  0%|          | 0/2369 [00:00<?, ?it/s]

Epoch: 2, Training Loss: 0.0736, Validation Loss: 0.0811
Epoch 3/3
----------


100%|██████████| 2369/2369 [01:29<00:00, 26.37it/s]


Epoch: 3, Training Loss: 0.0659, Validation Loss: 0.0819
CPU times: user 4min 43s, sys: 1min 21s, total: 6min 5s
Wall time: 6min 5s


### Predictions

In [131]:
test_preds = []

for x, y in tqdm.tqdm(test_dl):
    preds = model(x)
    
    # if you're data is on the GPU, you need to move the data back to the cpu
    preds = preds.data.cpu().numpy()
    #preds = preds.data.numpy()

    #pdb.set_trace()
    test_preds.append(preds)    

  
100%|██████████| 2394/2394 [00:26<00:00, 89.34it/s]


In [132]:
final_preds = np.concatenate((test_preds[:]), axis=0); final_preds.shape

(153164, 6)

### Prepare submission

In [133]:
subm_df = pd.read_csv("data/test.csv")

for i, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
    subm_df[col] = final_preds[:, i]

subm_df.head(30)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",0.000752,4.1e-05,0.000384,7.4e-05,0.00028,8.6e-05
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.",3.5e-05,2e-06,0.000133,2e-06,2.9e-05,1.2e-05
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """,0.000157,2e-06,0.000101,2e-06,4.3e-05,7e-06
3,00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.",0.001052,1.6e-05,0.000288,1.7e-05,0.000385,6.3e-05
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.466336,0.00431,0.139967,0.000765,0.105867,0.005094
5,0001ea8717f6de06,Thank you for understanding. I think very highly of you and would not revert without discussion.,0.000128,4e-06,0.000134,4e-06,8e-05,4e-06
6,00024115d4cbde0f,"Please do not add nonsense to Wikipedia. Such edits are considered vandalism and quickly undone. If you would like to experiment, please use the sandbox instead. Thank you. -",0.341932,0.000501,0.2103,5.2e-05,0.036544,0.0007
7,000247e83dcc1211,:Dear god this site is horrible.,0.000782,7e-06,0.000345,3e-06,0.000333,2.4e-05
8,00025358d4737918,""" \n Only a fool can believe in such numbers. \n The correct number lies between 10 000 to 15 000. \n Ponder the numbers carefully. \n\n This error will persist for a long time as it continues to reproduce... The latest reproduction I know is from ENCYCLOPÆDIA BRITANNICA ALMANAC 2008 wich states \n Magnittude: 8.7 (fair enough) \n victims: 70 000 (today 10 000 to 15 000 is not """"a lot"""" so I guess people just come out with a number that impresses enough, I don't know. But I know this: it's just a shameless lucky number that they throw in the air. \n GC \n\n """,0.037141,0.002151,0.019343,0.000621,0.014469,0.003317
9,00026d1092fe71cc,"== Double Redirects == \n\n When fixing double redirects, don't just blank the outer one, you need edit it to point it to the final target, unless you think it's inappropriate, in which case, it needs to be nominated at WP:RfD",0.000252,4e-06,0.000128,6e-06,8.7e-05,1.1e-05


In [134]:
# if you want to write the submission file to disk, uncomment and run the below code
subm_df.drop("comment_text", axis=1).to_csv(f'{PATH}/submissions/subm1.csv', index=False)