In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pdb
import os

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_summary import DataFrameSummary

from fastai.model import *
from fastai.dataset import *
from fastai.torch_imports import *

import torchtext
from torchtext import vocab, data

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as spacy_STOPWORDS
spacy_en = spacy.load('en')

from wordcloud import WordCloud, STOPWORDS

# pandas and plotting config
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

In [3]:
PATH = 'data'

os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)
os.makedirs(f'{PATH}/submissions', exist_ok=True)

In [4]:
raw_train_df = pd.read_csv(f'{PATH}/train.csv')
test_df = pd.read_csv(f'{PATH}/test.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

In [5]:
pretrained_vectors = 'fasttext.en.300d'
max_features = 30000
min_freq = 0 #10
max_len = 100
emb_sz = 300

In [6]:
import re, string
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [7]:
TEXT_fld = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=max_len)
LABEL_fld = data.Field(sequential=False, use_vocab=False, tensor_type=torch.cuda.ByteTensor)

In [8]:
# torchtext cannot read the .csv files correctly if there are newline characters, so replace with " "
raw_train_df.comment_text = raw_train_df.comment_text.str.replace("\n", " ")
test_df.comment_text = test_df.comment_text.str.replace("\n", " ")

In [9]:
# split the training data into a train and validatin dataset
trn, val = train_test_split(raw_train_df, test_size=0.05, random_state=42)
print(len(trn), len(val), len(trn[trn.none != 1]), len(val[val.none != 1]))

# save train, val, and test datasets for torchtext
trn.to_csv(f'{PATH}/train_ds.csv', index=None)
val.to_csv(f'{PATH}/valid_ds.csv', index=None)
test_df.to_csv(f'{PATH}/test_ds.csv', index=None)

151592 7979 15427 798


In [10]:
display(pd.read_csv("data/train_ds.csv").head(2))
display(pd.read_csv("data/valid_ds.csv").head(2))
display(pd.read_csv("data/test_ds.csv").head(2))

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,6e76a143f0fd817f,""" I don't think that is the official chart. Listed at United World Singles Chart: """"The South African sales and airplay chart is merged into one to form the official singles chart which is published weekly"""". From what I've seen, """"Get Together"""" has yet to enter the record chart.""",0,0,0,0,0,0,1
1,f4279ea981109ed7,"Mariusz and I both like to vandalize pages. We simply do it out of anger and frustration. I have kids who drive me nuts, and I take it out on Wikipedia. —",0,0,0,0,0,0,1


Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,7ca72b5b9c688e9e,"Geez, are you forgetful! We've already discussed why Marx was not an anarchist, i.e. he wanted to use a State to mold his 'socialist man.' Ergo, he is a statist - the opposite of an anarchist. I know a guy who says that, when he gets old and his teeth fall out, he'll quit eating meat. Would you call him a vegetarian?",0,0,0,0,0,0,1
1,c03f72fd8f8bf54f,"Carioca RFA Thanks for your support on my request for adminship. The final outcome was (31/4/1), so I am now an administrator. If you have any comments or concerns on my actions as an administrator, please let me know. Thank you!",0,0,0,0,0,0,1


Unnamed: 0,id,comment_text
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
1,0000247867823ef7,"== From RfC == The title is fine as it is, IMO."


There are various built-in Datasets in torchtext that handle common use cases. **For csv/tsv files, the TabularDataset class** is convenient. Here’s how we would read data from a csv file using the TabularDataset:

In [11]:
%%time

# train/validation
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    ("comment_text", TEXT_fld), ("toxic", LABEL_fld),
                    ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld),
                    ("identity_hate", LABEL_fld), ("none", None)]

train_ds, valid_ds = data.TabularDataset.splits(PATH, train='train_ds.csv', validation='valid_ds.csv',
                                          format='csv', skip_header=True, fields=train_datafields)

# test
test_datafields = [("id", None), ("comment_text", TEXT_fld)]

test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)

CPU times: user 28.3 s, sys: 1.06 s, total: 29.4 s
Wall time: 29.3 s


In [12]:
train_ds[0]

<torchtext.data.example.Example at 0x7fa4c8a7df28>

In [13]:
train_ds[0].__dict__.keys()

dict_keys(['comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

In [14]:
train_ds[1].comment_text[:5]

['mariusz', 'and', 'i', 'both', 'like']

In [15]:
TEXT_fld.build_vocab(train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)

In [16]:
# The vocab.freqs is a collections.Counter object, so we can take a look at the most frequent words.
TEXT_fld.vocab.freqs.most_common(10)

[('.', 647642),
 ('the', 473074),
 (',', 451266),
 ('"', 373537),
 ('to', 282980),
 ('i', 228391),
 ('of', 214109),
 ('and', 213636),
 ("'", 208535),
 ('you', 207412)]

In [17]:
train_iter, val_iter = data.BucketIterator.splits(
    (train_ds, valid_ds), # we pass in the datasets we want the iterator to draw data from
    batch_sizes=(64, 64),
    device=0, # if you want to use the GPU, specify the GPU number here
    sort_key=lambda x: len(x.comment_text), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [18]:
batch = next(train_iter.__iter__()); batch

<torchtext.data.batch.Batch at 0x7fa469998fd0>

In [19]:
batch.__dict__.keys()

dict_keys(['batch_size', 'dataset', 'train', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

For the test set, we don't want the data to be shuffled. This is why we'll be using a standard Iterator.

In [20]:
test_iter = data.Iterator(test_ds, batch_size=64, device=0, train=False, 
                          shuffle=False, sort=False, sort_within_batch=False, repeat=False)

In [21]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([ getattr(batch, feat).unsqueeze(1) for feat in self.y_vars ], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [22]:
train_dl = BatchWrapper(train_iter, "comment_text", label_cols)
valid_dl = BatchWrapper(val_iter, "comment_text", label_cols)
test_dl = BatchWrapper(test_iter, "comment_text", None)

Construct a fastai ModelData

In [23]:
md = ModelData(PATH, trn_dl=train_dl, val_dl=valid_dl, test_dl=test_dl)

## 6. Training the model

Define a simple LSTM

In [24]:
class SimpleBiLSTMBaseline(nn.Module):
    def __init__(self, vocab_sz, emb_sz=300, n_hidden=256, out_sz=1, linears=[512], linear_drops=[0.4],
                 emb_drop=0.2, recurrent_dropout=0.1, use_bn=False):
        
        super().__init__() # don't forget to call this!
        self.use_bn = use_bn
        
        self.emb = nn.Embedding(vocab_sz, emb_sz)
        self.emb.data = train_ds.fields['comment_text'].vocab.vectors # to use the pretrained vectors
        self.emb_drop = nn.Dropout(emb_drop)
        
        self.encoder = nn.GRU(emb_sz, n_hidden, bidirectional=True, num_layers=1, dropout=recurrent_dropout)
        
        # x6 if bidrectional; els x4
        linears = [n_hidden * 4] + linears
        
        self.linears = nn.ModuleList([
            nn.Linear(linears[idx], linears[idx + 1]) for idx in range(len(linears) - 1)
        ])
        self.linear_bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in linears[1:]
        ])
        self.linear_drops = nn.ModuleList([
            nn.Dropout(drop) for drop in linear_drops
        ])
        
        self.outp = nn.Linear(linears[-1], out_sz)
        
        # initialize weights
        for o in self.linears: kaiming_normal(o.weight.data)
        kaiming_normal(self.outp.weight.data)
    
    def forward(self, seq):
        x = self.emb_drop(self.emb(seq))
        
        # output = seq_len, batch, hidden_size * num_directions 
        # h = num_layers * num_directions, batch, hidden_size
        output, h = self.encoder(x)        
        sl, bs, _ = output.size()
  
        avg_pool = F.adaptive_avg_pool1d(output.permute(1,2,0), (1,)).view(bs,-1)   
        max_pool = F.adaptive_max_pool1d(output.permute(1,2,0), (1,)).view(bs,-1) 
        
        x = torch.cat([avg_pool, max_pool], dim=1)
        #pdb.set_trace()
        
        for l, d, b in zip (self.linears, self.linear_drops, self.linear_bns):
            x = F.relu(l(x))
            if(self.use_bn): x = b(x)
            x = d(x)
        
        outp = F.sigmoid(self.outp(x))
        return outp

In [25]:
vocab_sz = len(TEXT_fld.vocab)
nh = 80
linears = []
linear_drops = []

model = SimpleBiLSTMBaseline(vocab_sz, emb_sz, n_hidden=nh, out_sz=6, 
                             linears=linears, linear_drops=linear_drops, 
                             emb_drop=0.2, recurrent_dropout=0.1, use_bn=True)

model.cuda()

SimpleBiLSTMBaseline(
  (emb): Embedding(30002, 300)
  (emb_drop): Dropout(p=0.2)
  (encoder): GRU(300, 80, dropout=0.1, bidirectional=True)
  (linears): ModuleList(
  )
  (linear_bns): ModuleList(
  )
  (linear_drops): ModuleList(
  )
  (outp): Linear(in_features=320, out_features=6, bias=True)
)

In [26]:
# it = iter(md.trn_dl)
# *xs,yts = next(it)
# t = model(*V(xs))

# xs[0].size(), yts.size()

In [27]:
opt = optim.Adam(model.parameters(), 1e-2)

In [28]:
fit(model, md, 2, opt, F.binary_cross_entropy)

                                                                

  


epoch      trn_loss   val_loss   
    0      0.061424   0.056956  
    1      0.054749   0.057919                                  



[0.057919208]

In [34]:
set_lrs(opt, 1e-4)

In [35]:
fit(model, md, 3, opt, F.binary_cross_entropy)

                                                                

  


epoch      trn_loss   val_loss   
    0      0.045362   0.048833  
    1      0.03977    0.048867                                  
    2      0.040688   0.048688                                  



[0.048688114]

### Predictions

In [36]:
preds = predict(model, test_dl)
preds.shape

  


(153164, 6)

### Prepare submission

In [37]:
subm_df = pd.read_csv("data/test.csv")

for i, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
    subm_df[col] = preds[:, i]

subm_df.head(30)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",0.999937,0.463412,0.986639,0.272552,0.972877,0.368868
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.",0.001707,8.2e-05,0.000397,2.7e-05,0.000212,6.2e-05
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lapland — / """,0.106191,0.000358,0.02481,0.000448,0.010159,0.0025
3,00017563c3f7919a,":If you have a look back at the source, the information I updated was the correct form. I can only guess the source hadn't updated. I shall update the information once again but thank you for your message.",0.001304,7.9e-05,0.002325,0.00018,0.0014,0.000622
4,00017695ad8997eb,I don't anonymously edit articles at all.,0.009693,0.00011,0.002512,0.000264,0.00108,0.000285
5,0001ea8717f6de06,Thank you for understanding. I think very highly of you and would not revert without discussion.,0.001087,8.1e-05,0.000543,8.1e-05,0.000273,0.000114
6,00024115d4cbde0f,"Please do not add nonsense to Wikipedia. Such edits are considered vandalism and quickly undone. If you would like to experiment, please use the sandbox instead. Thank you. -",0.001663,3.1e-05,0.00023,1.5e-05,0.000218,9.2e-05
7,000247e83dcc1211,:Dear god this site is horrible.,0.029637,0.000462,0.004912,0.000247,0.009864,0.000737
8,00025358d4737918,""" \n Only a fool can believe in such numbers. \n The correct number lies between 10 000 to 15 000. \n Ponder the numbers carefully. \n\n This error will persist for a long time as it continues to reproduce... The latest reproduction I know is from ENCYCLOPÆDIA BRITANNICA ALMANAC 2008 wich states \n Magnittude: 8.7 (fair enough) \n victims: 70 000 (today 10 000 to 15 000 is not """"a lot"""" so I guess people just come out with a number that impresses enough, I don't know. But I know this: it's just a shameless lucky number that they throw in the air. \n GC \n\n """,0.008474,0.0012,0.013315,0.000407,0.004331,0.000366
9,00026d1092fe71cc,"== Double Redirects == \n\n When fixing double redirects, don't just blank the outer one, you need edit it to point it to the final target, unless you think it's inappropriate, in which case, it needs to be nominated at WP:RfD",0.000372,3.8e-05,0.000157,1.3e-05,0.000179,2.9e-05


In [38]:
# if you want to write the submission file to disk, uncomment and run the below code
subm_df.drop("comment_text", axis=1).to_csv(f'{PATH}/submissions/subm2.csv', index=False)