In [17]:
# Re-import edited modules automatically before each cell executes.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

## Imports and configuration

In [18]:
import pdb
import os

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_summary import DataFrameSummary

from fastai.model import *
from fastai.dataset import *
from fastai.lm_rnn import *
from fastai.sgdr import *
from fastai.rnn_reg import EmbeddingDropout, WeightDrop, LockedDropout
from fastai.torch_imports import *

import torchtext
from torchtext import vocab, data

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as spacy_STOPWORDS
spacy_en = spacy.load('en')

from wordcloud import WordCloud, STOPWORDS

# pandas display config: show up to 500 rows/columns and do not truncate cell text
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
# NOTE(review): -1 means "no limit" on older pandas; newer releases deprecate it in
# favor of None -- confirm against the installed pandas version before changing.
pd.set_option('display.max_colwidth', -1)

In [19]:
# Root folder for every input and artifact used by this notebook.
PATH = 'data'

# Make sure the output folders exist (no-op when they are already there).
for subdir in ('models', 'tmp', 'submissions'):
    os.makedirs(f'{PATH}/{subdir}', exist_ok=True)

Load data and define labels (ordering is important for competition submission!)

*Note: We are also adding a "none" column, which is 1 for comments that carry none of the six labels.*

## Preprocess data

Clean comments using techniques from other kernels

In [None]:
# Load the raw competition files from PATH: the training set, the test set and the
# sample submission.
raw_train_df = pd.read_csv(f'{PATH}/train.csv')
test_df = pd.read_csv(f'{PATH}/test.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

In [None]:
# Target columns, in the order required by the competition submission file.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# 'none' = 1 - (row-wise max of the labels): 1 when no label is set, 0 otherwise
# (assumes the label columns are 0/1 -- TODO confirm).
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

In [None]:
# Replace missing comments with a sentinel token so that clean() always receives a string.
# Assign the result back instead of calling fillna(inplace=True) on an attribute-accessed
# column: that is chained in-place mutation, which may silently not update the frame when
# pandas copy-on-write is active.
raw_train_df['comment_text'] = raw_train_df['comment_text'].fillna("<na>")
test_df['comment_text'] = test_df['comment_text'].fillna("<na>")

In [None]:
# Token-level replacement table used by clean(): a whitespace-delimited token that is
# exactly equal to a key is swapped for the corresponding value.
#
# The original literal listed many keys twice; Python keeps only the LAST occurrence, so
# the duplicates were dead entries. Each key now appears once with the value that
# actually took effect.
repl = {
    # emoticons / interjections -> sentiment words (values are space-padded on purpose)
    "&lt;3": " heart ",
    ":d": " smile ",
    ":dd": " smile ",
    ":p": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    # NOTE(review): the keys below are written as regex patterns, but clean() performs
    # exact token lookups only, so they can match nothing except a token that is
    # literally spelled like the pattern. The plain-token entries further down cover
    # the same words. Kept so the table contents stay a superset of the original.
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    # chat shorthand and contractions (plain tokens)
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",
}

#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Apostrophe (contraction) lookup dict
appos = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    # "i'd" was listed twice ("I would", then "I had"); the later value was the effective one
    "i'd" : "I had",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    # fixed: was " will", which dropped the pronoun
    "we'll": "we will",
    "tryin'": "trying"
}

repl = { **appos, **repl }  # repl becomes a merged dictionary with values from repl replacing those from appos

# display(repl)

In [None]:
# Kept for backward compatibility with any cell that still reads it; clean() itself
# looks tokens up in the `repl` dict directly (O(1) instead of scanning this list).
repl_keys = list(repl)

# Compiled once so that .apply() over every comment does not rebuild them.
# Dots are escaped: un-escaped, "." matches ANY character, so ordinary digit runs such as
# dates or long numbers were being wiped as if they were IP addresses.
_IPV4_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Non-greedy: the previous greedy pattern deleted everything from the first "[[" up to
# the LAST "]" in the comment, including all ordinary text in between.
_WIKI_LINK_RE = re.compile(r"\[\[.*?\]\]")

def clean(comment):
    """Normalize one raw comment string for tokenization.

    Steps, in order:
      1. lower-case the text, so that "Hi" and "hi" are the same token;
      2. replace newlines with spaces (torchtext cannot read the .csv files correctly
         if fields contain newline characters);
      3. blank out IPv4-looking addresses (leaky identifiers);
      4. drop ``[[...]]`` wiki-link markup (used for user names);
      5. replace every whitespace-delimited token found in ``repl`` by its mapped value.

    Args:
        comment: the raw comment text (must be a ``str``).

    Returns:
        The cleaned comment, with tokens joined by single spaces.
    """
    comment = comment.lower()
    comment = comment.replace("\n", " ")
    comment = _IPV4_RE.sub(" ", comment)
    comment = _WIKI_LINK_RE.sub("", comment)

    # exact-token substitution; unknown tokens pass through unchanged
    return " ".join(repl.get(w, w) for w in comment.split())

In [None]:
%time raw_train_df['comment_text_cleaned'] = raw_train_df.comment_text.apply(lambda x: clean(x))
print('train cleaned ...')

%time test_df['comment_text_cleaned'] = test_df.comment_text.apply(lambda x: clean(x))
print('test cleaned ...')

In [None]:
# Persist the cleaned frames so the preprocessing cells above only need to run once.
# index=None keeps the DataFrame index out of the files.
raw_train_df.to_csv(f'{PATH}/train_preproc.csv', index=None)
test_df.to_csv(f'{PATH}/test_preproc.csv', index=None)

Use the preprocessed datasets for training and evaluation

In [20]:
# Reload the preprocessed frames written by the preprocessing section.
# NOTE(review): read_csv turns empty fields into NaN by default, so a comment that was
# cleaned down to an empty string presumably comes back as NaN here -- verify.
raw_train_df = pd.read_csv(f'{PATH}/train_preproc.csv')
test_df = pd.read_csv(f'{PATH}/test_preproc.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

# Name of the column holding the text the models are trained on.
txt_col = 'comment_text_cleaned'

# Target columns (submission order) plus the derived 'none' indicator
# (1 - row-wise max of the labels).
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

# Columns written to the model-ready .csv files, in this order.
model_cols = ['id', txt_col] + label_cols + ['none']

In [21]:
# Cross-validation layout: number of folds, total training rows, and the floor of
# rows-per-fold (integer division, so there can be a remainder).
n_folds = 10
n_trn = len(raw_train_df)
n_examples_per_fold = n_trn // n_folds

# n_examples_per_fold

## Prepare Data

*Note: This section only needs to be run once (or again whenever you want to regenerate these .csv files).*

Build cross-validation datasets

In [22]:
# Shuffle all rows once; the fixed random_state makes the shuffle (and hence the folds) reproducible.
raw_train_df_rand = raw_train_df.sample(frac=1, random_state=9) # frac=1 = return all rows in random order

In [7]:
# Slice the shuffled frame into n_folds consecutive validation chunks. Every chunk holds
# n_examples_per_fold rows except the last, whose open-ended slice also takes the
# remainder rows.
fold_starts = [fold * n_examples_per_fold for fold in range(n_folds)]
fold_ends = fold_starts[1:] + [None]

val_dfs = [raw_train_df_rand[lo:hi] for lo, hi in zip(fold_starts, fold_ends)]
    
# [ print(idx,len(d)) for idx, d in enumerate(val_dfs) ]

In [8]:
# The training frame of fold k is the concatenation of every validation chunk except
# chunk k, kept in the original chunk order.
trn_dfs = [
    pd.concat(val_dfs[:held_out] + val_dfs[held_out + 1:])
    for held_out in range(len(val_dfs))
]
    
# [ print(idx,len(d)) for idx, d in enumerate(trn_dfs) ]

In [9]:
# Write one train/validation .csv pair per fold (model columns only, no index column);
# the printed line is a size check: fold index, training rows, validation rows.
for idx, [trn_df, val_df] in enumerate(zip(trn_dfs, val_dfs)):
    print(idx, len(trn_df), len(val_df))
    
    trn_df[model_cols].to_csv(f'{PATH}/train_ds_{idx}_of_{n_folds}.csv', index=None)
    val_df[model_cols].to_csv(f'{PATH}/valid_ds_{idx}_of_{n_folds}.csv', index=None)

0 143614 15957
1 143614 15957
2 143614 15957
3 143614 15957
4 143614 15957
5 143614 15957
6 143614 15957
7 143614 15957
8 143614 15957
9 143613 15958


Use the cell below if you want to create a single training/validation split instead of the cross-validation folds

In [23]:
# split the training data into a train and validation dataset (5% held out, fixed seed)
trn, val = train_test_split(raw_train_df, test_size=0.05, random_state=9)
# sanity check: rows per split, then rows per split with at least one label set (none != 1)
print(len(trn), len(val), len(trn[trn.none != 1]), len(val[val.none != 1]))

# save train, val, and test datasets for torchtext
trn[model_cols].to_csv(f'{PATH}/train_ds.csv', index=None)
val[model_cols].to_csv(f'{PATH}/valid_ds.csv', index=None)

# save full cleaned datasets (train+valid and test) as well
raw_train_df[model_cols].to_csv(f'{PATH}/full_train_ds.csv', index=None)
test_df[['id', txt_col]].to_csv(f'{PATH}/test_ds.csv', index=None)

91058 4793 9288 502


In [24]:
# Spot-check the files that were just written. The paths are built from PATH (instead of
# a hard-coded "data/") so the check keeps working when the data folder is changed.
display(pd.read_csv(f'{PATH}/full_train_ds.csv').head(2))
display(pd.read_csv(f'{PATH}/test_ds.csv').head(2))

Unnamed: 0,id,comment_text_cleaned,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,22256635,"nonsense? kiss off, geek. what i said is true. i will have your account terminated.",1,0,0,0,0,0,0
1,27450690,""" please do not vandalize pages, as you did with this edit to w. s. merwin. if you continue to do so, you will be blocked from editing. """,0,0,0,0,0,0,1


Unnamed: 0,id,comment_text_cleaned
0,6044863,==orphaned non-free media (image:41cd1jboevl. ss500 .jpg)==
1,6102620,"::kentuckiana is colloquial. even though the area is often referred to as this, it (in my opinion) has never held the encyclopedic precision of ""louisville metropolitian area"", which has a specific u.s. census definition. also, apparently kentuckiana often refers to the local television viewing area, which is not nearly contiguous with the official metro area. as you indicate, kentuckiana seems to be more of a slang or marketing phenomena than anything we could pin down in encyclopedic terms here. that is why we see wikipedia language like ""the louisville metropolitan area, sometimes referred to as kentuckiana"". that is my take on it. — •"


## Build Datasets and DataLoaders

Define hyperparameters and column that holds text data

In [25]:
# Vocabulary / sequence settings (the trailing comments are previously tried values).
max_features = 100000 #30000   # passed as max_size when the vocab is built
min_freq = 10 #0               # passed as min_freq when the vocab is built
max_len = 175 #100             # passed as fix_length of the text Field

# Name of the pretrained vectors handed to build_vocab, or None for none.
pretrained_vectors = None #'fasttext.en.300d'

# Batch sizes for the (train, validation, test) iterators, in that order.
batch_sizes = (64,64,64)

Configure how we are going to process text and label fields

In [26]:
import re
import string

# Punctuation (ASCII plus a few typographic symbols) that becomes its own token.
# re.escape is required here: string.punctuation contains the sequence `\]`, which an
# un-escaped character class reads as an escaped "]" -- silently leaving the backslash
# itself out of the class, so backslashes were never split off.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    """Split ``s`` into tokens, with every punctuation character as a separate token.

    Each punctuation character is padded with spaces and the result is split on
    whitespace.

    Args:
        s: the text to tokenize.

    Returns:
        A list of token strings (empty for empty or whitespace-only input).
    """
    return re_tok.sub(r' \1 ', s).split()

In [38]:
# Text field: tokenized with tokenize(), lower-cased, and fixed to max_len tokens per example.
TEXT_fld = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=max_len)
# Label field: a single value per example, used as-is (no vocabulary lookup).
# NOTE(review): tensor_type=torch.cuda.ByteTensor ties this cell to a CUDA device and to
# the older torchtext keyword name -- confirm against the installed version.
LABEL_fld = data.Field(sequential=False, use_vocab=False, tensor_type=torch.cuda.ByteTensor)

There are various built-in Datasets in torchtext that handle common use cases. **For csv/tsv files, the TabularDataset class** is convenient. Here’s how we would read data from a csv file using the TabularDataset:

In [39]:
%%time

# Raise the csv module's field size limit so that very long comments can be parsed.
# NOTE(review): `csv` and `sys` are not imported explicitly in this notebook; they
# presumably arrive through the fastai star-imports -- verify.
csv.field_size_limit(sys.maxsize)

# train/validation
# One (column name, Field) pair per .csv column, in file order; None means the column is ignored.
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    (txt_col, TEXT_fld),
                    ("toxic", LABEL_fld), ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld), ("identity_hate", LABEL_fld), ("none", None)]

train_ds, valid_ds = data.TabularDataset.splits(PATH, train='train_ds.csv', validation='valid_ds.csv',
                                          format='csv', skip_header=True, fields=train_datafields)

# test
# The test file only has the id and text columns.
test_datafields = [("id", None), (txt_col, TEXT_fld)]
test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)

# train+val
# The full training set is what the vocabulary is built on.
full_train_ds = data.TabularDataset(f'{PATH}/full_train_ds.csv', 
                                    format='csv', skip_header=True, fields=train_datafields)

CPU times: user 33.8 s, sys: 6.56 s, total: 40.4 s
Wall time: 40.4 s


In [40]:
# Inspect one parsed example: the object, its attribute names (one per non-ignored
# column), and the first five tokens of another example's text.
display(train_ds[0])
display(train_ds[0].__dict__.keys())
display(train_ds[1].comment_text_cleaned[:5])

<torchtext.data.example.Example at 0x1cca16d7b8>

dict_keys(['comment_text_cleaned', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

['"', 'i', 'appreciate', 'your', 'intention']

Build vocab on *full* training data

In [41]:
# Build the vocabulary from the full training set (train + validation), using the
# frequency/size limits and the optional pretrained vectors from the hyperparameter cell.
TEXT_fld.build_vocab(full_train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)

In [42]:
# vocab.freqs is a collections.Counter of token frequencies, so most_common(10) lists
# the ten most frequent tokens.
TEXT_fld.vocab.freqs.most_common(10)

[('.', 389774),
 ('the', 298502),
 (',', 284449),
 ('"', 234054),
 ('to', 178418),
 ('i', 144135),
 ('of', 135293),
 ('you', 134659),
 ('and', 134609),
 ('is', 133857)]

In [43]:
# Iterators for training and validation batches.
train_iter, val_iter = data.BucketIterator.splits(
    (train_ds, valid_ds), # the datasets the iterators draw from, in (train, validation) order
    batch_sizes=(batch_sizes[0], batch_sizes[1]),
    device=0, # GPU number to put batches on; use -1 to stay on the CPU
    sort_key=lambda x: len(x.comment_text_cleaned), # examples are grouped by token count
    sort_within_batch=False,
    repeat=False # single pass per iteration, because the iterators are wrapped by BatchWrapper
)

In [44]:
# Pull a single batch from the training iterator to see which attributes it exposes.
batch = next(iter(train_iter))

display(batch)
display(batch.__dict__.keys())

<torchtext.data.batch.Batch at 0x1c2140c208>

dict_keys(['batch_size', 'dataset', 'train', 'comment_text_cleaned', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

For the test set, we don't want the data to be shuffled. This is why we'll be using a standard Iterator.

In [45]:
# Plain Iterator with shuffling and sorting switched off, so batches follow the order of
# the test .csv rows (the submission cell relies on prediction i belonging to row i).
test_iter = data.Iterator(test_ds, batch_size=batch_sizes[2], device=0, train=False, 
                          shuffle=False, sort=False, sort_within_batch=False, repeat=False)

In [46]:
class BatchWrapper:
    """Adapts an iterable of attribute-style batches into plain ``(x, y)`` pairs.

    ``x`` is the batch attribute named ``x_var``. ``y`` is a float tensor with one
    column per name in ``y_vars`` (each attribute gets a new axis 1 and the results are
    concatenated along it); when ``y_vars`` is None a one-element zero tensor is
    yielded as a placeholder.
    """

    def __init__(self, dl, x_var, y_vars):
        self.dl = dl            # the wrapped batch iterable
        self.x_var = x_var      # name of the single input attribute
        self.y_vars = y_vars    # list of target attribute names, or None

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        for batch in self.dl:
            inputs = getattr(batch, self.x_var)

            if self.y_vars is None:
                targets = torch.zeros((1))
            else:
                columns = [getattr(batch, name).unsqueeze(1) for name in self.y_vars]
                targets = torch.cat(columns, dim=1).float()

            yield (inputs, targets)

In [47]:
# Wrap each iterator so it yields (text, labels) pairs; the test set has no labels,
# so y_vars is None and a placeholder target is produced.
train_dl = BatchWrapper(train_iter, txt_col, label_cols)
valid_dl = BatchWrapper(val_iter, txt_col, label_cols)
test_dl = BatchWrapper(test_iter, txt_col, None)

Construct a fastai ModelData

In [49]:
# Bundle the three wrapped loaders into a fastai ModelData rooted at PATH.
md = ModelData(PATH, trn_dl=train_dl, val_dl=valid_dl, test_dl=test_dl)

In [50]:
# Grab one (inputs, targets) pair from the training loader for inspection.
x,y = next(iter(md.trn_dl))

In [51]:
# Presumably x is (max_len, batch size) and y is (batch size, number of labels) -- TODO confirm.
x.size(), y.size()

(Variable containing:
      9    141    680  ...    8263  19180   1134
    220      9      2  ...      58     88     31
    233     18      9  ...      43    187    364
         ...            ⋱           ...         
      1      1      1  ...       1      1      1
      1      1      1  ...       1      1      1
      1      1      1  ...       1      1      1
 [torch.LongTensor of size 175x64], Variable containing:
     0     0     0     0     0     0
     0     0     0     0     0     0
     1     0     1     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     1     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     0     0     0
     0     0     0     

## Define Models

Define a simple GRU and a simple LSTM

In [19]:
class SimpleGru(nn.Module):
    """GRU text classifier: embedding -> GRU -> [average pool, max pool] -> linear -> sigmoid.

    Args:
        vocab_sz: number of rows in the embedding table.
        emb_sz: embedding width.
        n_rnn_hidden: hidden units per direction.
        n_rnn_layers: number of stacked GRU layers.
        bi_dir: run the GRU in both directions when True.
        out_sz: number of output units (one sigmoid per unit).
        bsz: batch size used to allocate the initial hidden state.
        dropout_rnn: dropout passed to ``nn.GRU``.
        dropout_after_emb: ``LockedDropout`` rate applied to the embedded sequence.
        dropout_emb: embedding dropout rate (applied in training mode only).
        wdrop: ``WeightDrop`` rate for the GRU; a falsy value disables the wrapper.

    Note:
        The constructor reads the notebook globals ``train_ds`` and ``txt_col`` to look
        up pretrained vectors, so both must exist before a model is instantiated.
    """

    def __init__(self, vocab_sz, emb_sz=300, n_rnn_hidden=256, n_rnn_layers=1, bi_dir=True, out_sz=1, bsz=64,
                 dropout_rnn=0.3, dropout_after_emb=0.4, dropout_emb=0.1, wdrop=0.05):

        super().__init__()

        self.bsz = bsz

        # configure embeddings layer
        self.dropout_emb = dropout_emb
        self.dropout_after_emb = LockedDropout(dropout_after_emb)

        self.emb = nn.Embedding(vocab_sz, emb_sz)
        # Load pretrained vectors when the vocab carries any. The previous
        # `self.emb.data = ...` only created an unrelated attribute on the module and
        # never touched the embedding weights.
        pretrained = train_ds.fields[txt_col].vocab.vectors
        if pretrained is not None:
            self.emb.weight.data.copy_(pretrained)
        self.emb_with_drop = EmbeddingDropout(self.emb)

        # configure rnns
        self.n_rnn_hidden, self.n_rnn_layers, self.n_dirs = n_rnn_hidden, n_rnn_layers, 2 if bi_dir else 1
        self.rnn = nn.GRU(emb_sz, self.n_rnn_hidden, self.n_rnn_layers, bidirectional=bi_dir, dropout=dropout_rnn)
        if wdrop: self.rnn = WeightDrop(self.rnn, wdrop)

        # "* 2" because the average-pooled and max-pooled features are concatenated
        self.outp = nn.Linear(n_rnn_hidden * 2 * self.n_dirs, out_sz)

        # initialize weights
        kaiming_normal(self.outp.weight.data)

        # init hidden
        self.init_hidden(self.bsz)

    def forward(self, seq):
        """Return sigmoid scores of shape (batch, out_sz) for ``seq`` (seq_len, batch) token ids."""
        bsz = seq.size(1)
        # A GRU's hidden state is a single tensor shaped (layers * dirs, batch, hidden),
        # so the batch dimension is self.hidden.size(1). The previous check indexed
        # self.hidden[0] first (correct only for an LSTM's (h, c) tuple) and therefore
        # compared the hidden width with the batch size.
        # NOTE(review): as in SimpleLstm, the state is otherwise carried over from the
        # previous batch -- confirm that this is intended for a classifier.
        if (self.hidden.size(1) != bsz): self.init_hidden(bsz)

        x = self.emb_with_drop(seq, dropout=self.dropout_emb if self.training else 0)
        x = self.dropout_after_emb(x)

        output, h = self.rnn(x, self.hidden)
        self.hidden = repackage_var(h)

        bs = output.size(1)

        # (seq_len, batch, features) -> (batch, features, seq_len), then pool over time
        pooled_in = output.permute(1,2,0)
        avg_pool = F.adaptive_avg_pool1d(pooled_in, (1,)).view(bs,-1)
        max_pool = F.adaptive_max_pool1d(pooled_in, (1,)).view(bs,-1)

        x = torch.cat([avg_pool, max_pool], dim=1)
        outp = F.sigmoid(self.outp(x))

        return outp

    def init_hidden(self, bsz):
        """Reset the hidden state to zeros for a batch of ``bsz`` sequences."""
        self.hidden = V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden))

In [20]:
class SimpleLstm(nn.Module):
    """LSTM text classifier: embedding -> LSTM -> [average pool, max pool] -> linear -> sigmoid.

    Args:
        vocab_sz: number of rows in the embedding table.
        emb_sz: embedding width.
        n_rnn_hidden: hidden units per direction.
        n_rnn_layers: number of stacked LSTM layers.
        bi_dir: run the LSTM in both directions when True.
        out_sz: number of output units (one sigmoid per unit).
        bsz: batch size used to allocate the initial hidden state.
        dropout_rnn: dropout passed to ``nn.LSTM``.
        dropout_after_emb: ``LockedDropout`` rate applied to the embedded sequence.
        dropout_emb: embedding dropout rate (applied in training mode only).
        wdrop: ``WeightDrop`` rate for the LSTM; a falsy value disables the wrapper.

    Note:
        The constructor reads the notebook globals ``train_ds`` and ``txt_col`` to look
        up pretrained vectors, so both must exist before a model is instantiated.
    """

    def __init__(self, vocab_sz, emb_sz=300, n_rnn_hidden=256, n_rnn_layers=1, bi_dir=True, out_sz=1, bsz=64,
                 dropout_rnn=0.3, dropout_after_emb=0.4, dropout_emb=0.1, wdrop=0.05):

        super().__init__()

        self.bsz = bsz

        # configure embeddings layer
        self.dropout_emb = dropout_emb
        self.dropout_after_emb = LockedDropout(dropout_after_emb)

        self.emb = nn.Embedding(vocab_sz, emb_sz)
        # Load pretrained vectors when the vocab carries any. The previous
        # `self.emb.data = ...` only created an unrelated attribute on the module and
        # never touched the embedding weights.
        pretrained = train_ds.fields[txt_col].vocab.vectors
        if pretrained is not None:
            self.emb.weight.data.copy_(pretrained)
        self.emb_with_drop = EmbeddingDropout(self.emb)

        # configure rnns
        self.n_rnn_hidden, self.n_rnn_layers, self.n_dirs = n_rnn_hidden, n_rnn_layers, 2 if bi_dir else 1
        self.rnn = nn.LSTM(emb_sz, self.n_rnn_hidden, self.n_rnn_layers, bidirectional=bi_dir, dropout=dropout_rnn)
        if wdrop: self.rnn = WeightDrop(self.rnn, wdrop)

        # "* 2" because the average-pooled and max-pooled features are concatenated
        self.outp = nn.Linear(n_rnn_hidden * 2 * self.n_dirs, out_sz)

        # initialize weights
        kaiming_normal(self.outp.weight.data)

        # init hidden
        self.init_hidden(self.bsz)

    def forward(self, seq):
        """Return sigmoid scores of shape (batch, out_sz) for ``seq`` (seq_len, batch) token ids."""
        bsz = seq.size(1)
        # self.hidden is the (h, c) tuple; h is (layers * dirs, batch, hidden), so
        # dimension 1 of h is the batch size the state was allocated for.
        # NOTE(review): the state is otherwise carried over from the previous batch --
        # confirm that this is intended for a classifier.
        if (self.hidden[0].size(1) != bsz): self.init_hidden(bsz)

        x = self.emb_with_drop(seq, dropout=self.dropout_emb if self.training else 0)
        x = self.dropout_after_emb(x)

        output, h = self.rnn(x, self.hidden)
        self.hidden = repackage_var(h)

        bs = output.size(1)

        # (seq_len, batch, features) -> (batch, features, seq_len), then pool over time
        pooled_in = output.permute(1,2,0)
        avg_pool = F.adaptive_avg_pool1d(pooled_in, (1,)).view(bs,-1)
        max_pool = F.adaptive_max_pool1d(pooled_in, (1,)).view(bs,-1)

        x = torch.cat([avg_pool, max_pool], dim=1)
        outp = F.sigmoid(self.outp(x))

        return outp

    def init_hidden(self, bsz):
        """Reset the (h, c) hidden state to zeros for a batch of ``bsz`` sequences."""
        self.hidden = (V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)),
                       V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)))

Instantiate a `SimpleLstm` model, experimenting with various hyperparameters

In [25]:
# Model dimensions: vocabulary size comes from the built vocab, one output per label.
vocab_sz = len(TEXT_fld.vocab)
emb_sz = 300
out_sz = 6

# Recurrent layer configuration.
n_rnn_hidden = 128
n_rnn_layers = 1
bi_dir = True

model = SimpleLstm(vocab_sz, emb_sz=emb_sz, n_rnn_hidden=n_rnn_hidden, n_rnn_layers=n_rnn_layers,
                   bi_dir=bi_dir, out_sz=out_sz, bsz=batch_sizes[0])
model.cuda()

SimpleLstm(
  (dropouti): LockedDropout(
  )
  (emb): Embedding(26970, 300)
  (emb_with_drop): EmbeddingDropout(
    (embed): Embedding(26970, 300)
  )
  (rnn): WeightDrop(
    (module): LSTM(300, 128, dropout=0.3, bidirectional=True)
  )
  (linears): ModuleList(
  )
  (linear_bns): ModuleList(
  )
  (linear_drops): ModuleList(
  )
  (outp): Linear(in_features=512, out_features=6, bias=True)
)

## Train Model

Utilize fastai callbacks to add weight-decay and SGDR with restarts goodness.  You can experiment with other callbacks here as well

In [84]:
# Adam wrapped in fastai's LayerOptimizer.
# NOTE(review): 1e-2 and 1e-5 are presumably the learning rate and the weight decay --
# confirm against the LayerOptimizer signature.
lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)

In [85]:
# Callback that saves the model under PATH/models, with the cycle number in the file name.
on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/lstm_fit_1_cyc_{cycle}')

In [86]:
# Cosine-annealing callback: the second argument is the number of batches in one pass
# over the training loader; on_end is invoked at the end of each cycle.
# NOTE(review): cycle_mult=2 presumably doubles the cycle length after each restart -- confirm.
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]

Utilize the fastai `fit()` method to train the model.  This is our training/validation loop

In [87]:
# Train for 2**4 - 1 = 15 epochs with binary cross-entropy on the sigmoid outputs.
fit(model, md, 2**4-1, lo.opt, F.binary_cross_entropy, callbacks=cb)

  0%|          | 1/2369 [00:00<14:12,  2.78it/s, loss=0.752]

  result = self.forward(*input, **kwargs)


                                                                

  


epoch      trn_loss   val_loss   
    0      0.051143   0.045428  
    1      0.052452   0.048086                                  
    2      0.048935   0.042793                                  
    3      0.057762   0.053487                                  
    4      0.050881   0.046729                                  
    5      0.049764   0.043361                                  
    6      0.045766   0.041723                                  
    7      0.060439   0.052335                                  
    8      0.060282   0.052392                                  
    9      0.061376   0.049558                                  
    10     0.052035   0.045024                                  
    11     0.050743   0.045635                                  
    12     0.050397   0.042907                                  
    13     0.045974   0.042163                                  
    14     0.04428    0.042109                                  



[0.042109344]

In [97]:
# Restore the weights stored under the cycle-3 file name of the first fit
# (presumably written by the on_end callback above -- verify the file exists).
model.load_state_dict(torch.load(f'{PATH}/models/lstm_fit_1_cyc_3'))

In [103]:
# Second training stage: checkpoints go to a separate "fit_2" file name.
on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/lstm_fit_2_cyc_{cycle}')

# Fresh optimizer and a single annealing cycle whose length is 20 passes over the
# training loader, matching the 20 epochs trained below.
lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)
cb = [CosAnneal(lo, (len(md.trn_dl) * 20), on_cycle_end=on_end)]

fit(model, md, 20, lo.opt, F.binary_cross_entropy, callbacks=cb)

  0%|          | 1/2369 [00:00<12:26,  3.17it/s, loss=0.0313]

  result = self.forward(*input, **kwargs)


                                                                

  


epoch      trn_loss   val_loss   
    0      0.047446   0.042326  
    1      0.060109   0.052738                                  
    2      0.061474   0.052812                                  
    3      0.052966   0.051033                                  
    4      0.053826   0.052875                                  
    5      0.052001   0.048456                                  
    6      0.056809   0.048939                                  
    7      0.058547   0.049012                                  
    8      0.055328   0.045812                                  
    9      0.053131   0.04595                                   
    10     0.053966   0.045393                                  
    11     0.051473   0.045612                                  
    12     0.047847   0.046206                                  
    13     0.051698   0.043556                                  
    14     0.048082   0.043449                                  
    15     0.043666   0

[0.041532446]

## Predictions

In [107]:
# Score the test loader; the displayed shape should be (number of test rows, number of labels).
preds = predict(model, test_dl)
preds.shape

  result = self.forward(*input, **kwargs)
  


(153164, 6)

## Prepare submission

In [108]:
# Start from the raw test file so the ids (and the original text, for eyeballing) line up
# with the predictions. The path is built from PATH and the columns come from label_cols,
# instead of repeating both as literals, so the cell cannot drift from the rest of the
# notebook.
subm_df = pd.read_csv(f'{PATH}/test.csv')

# Column i of preds is the score for label_cols[i]; rows follow the test file order
# because the test iterator neither shuffles nor sorts.
for i, col in enumerate(label_cols):
    subm_df[col] = preds[:, i]

subm_df.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",0.998924,0.333234,0.969354,0.054596,0.945446,0.271051
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.",0.002046,6.5e-05,0.00078,2.9e-05,0.00078,0.000138


In [109]:
# Write the submission file to disk (the raw text column is dropped first).
# Comment this line out if you do not want to write the file.
subm_df.drop(['comment_text'], axis=1).to_csv(f'{PATH}/submissions/lstm_subm.csv', index=False)

## K-Fold Cross Validation

We can perform k-fold cross-validation by following the same steps as for the single training/validation split, except that this time we use the per-fold training/validation datasets created in the "Prepare Data" section

In [49]:
# One (column name, Field) pair per .csv column, in file order; None means the column is ignored.
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    (txt_col, TEXT_fld),
                    ("toxic", LABEL_fld), ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld), ("identity_hate", LABEL_fld), ("none", None)]

# test
test_datafields = [("id", None), (txt_col, TEXT_fld)]

# define test dataset and iterator (no shuffling/sorting, so predictions follow the file order)
test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)
test_iter = data.Iterator(test_ds, batch_size=batch_sizes[2], device=0, train=False, 
                          shuffle=False, sort=False, sort_within_batch=False, repeat=False)
test_dl = BatchWrapper(test_iter, txt_col, None)

# define FULL train dataset for building vocab
full_train_ds = data.TabularDataset(f'{PATH}/full_train_ds.csv', 
                                    format='csv', skip_header=True, fields=train_datafields)

TEXT_fld.build_vocab(full_train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)
# refresh the size after rebuilding, so the embedding table always matches this vocab
vocab_sz = len(TEXT_fld.vocab)

# cv
for i in range(n_folds):
    print('-' * 10)
    print(f'Fold {i} ....')
    
    # train/validation datasets
    train_ds, valid_ds = data.TabularDataset.splits(PATH, 
                                                    train=f'train_ds_{i}_of_{n_folds}.csv', 
                                                    validation=f'valid_ds_{i}_of_{n_folds}.csv',
                                                    format='csv', skip_header=True, fields=train_datafields)

    # train/validation iterators/dataloaders
    train_iter, val_iter = data.BucketIterator.splits(
        (train_ds, valid_ds), # the datasets the iterators draw from
        batch_sizes=(batch_sizes[0], batch_sizes[1]),
        device=0, # GPU number to put batches on
        sort_key=lambda x: len(x.comment_text_cleaned), # examples are grouped by token count
        sort_within_batch=False,
        repeat=False) # single pass per iteration, because the iterators are wrapped below
        
    train_dl = BatchWrapper(train_iter, txt_col, label_cols)
    valid_dl = BatchWrapper(val_iter, txt_col, label_cols)
        
    md = ModelData(PATH, trn_dl=train_dl, val_dl=valid_dl, test_dl=test_dl)
    
    # A fresh model per fold. The previous call also passed linears=, linear_drops= and
    # use_bn=, which SimpleGru does not accept and which are never defined in this
    # notebook, so the cell failed on a fresh kernel.
    model = SimpleGru(vocab_sz, emb_sz, n_rnn_hidden, n_rnn_layers, bi_dir, out_sz, bsz=batch_sizes[0])
    
    model.cuda()
    
    # stage 1: 15 epochs of cosine annealing with restarts
    # (on_end is defined but not registered; add on_cycle_end=on_end to CosAnneal to save checkpoints)
    lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)
    on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/fit_1_cv{i}_cyc_{cycle}')
    cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2)]
    fit(model, md, 2**4-1, lo.opt, F.binary_cross_entropy, callbacks=cb)
        
    # stage 2: 6 epochs in one annealing cycle, with 1e-3 in place of 1e-2
    on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/fit_2_cv{i}_cyc_{cycle}')
    lo = LayerOptimizer(optim.Adam, model, 1e-3, 1e-5)
    cb = [CosAnneal(lo, (len(md.trn_dl) * 6))]
    fit(model, md, 6, lo.opt, F.binary_cross_entropy, callbacks=cb)
        
    preds = predict(model, test_dl)
        
    # build this fold's submission from the raw test file (ids + one column per label)
    subm_df = pd.read_csv(f'{PATH}/test.csv')
    for lbl_idx, col in enumerate(label_cols):
        subm_df[col] = preds[:, lbl_idx]
        
    # write the per-fold submission file to disk
    # NOTE(review): the file name says "lstm" although the model trained here is SimpleGru;
    # left unchanged in case downstream code expects this name.
    subm_df.drop(['comment_text'], axis=1).to_csv(f'{PATH}/submissions/subm_lstm_cv_{i}.csv', index=False)



----------
Fold 0 ....


  0%|          | 1/2244 [00:00<12:44,  2.93it/s, loss=0.593]

  result = self.forward(*input, **kwargs)


                                                                

  


epoch      trn_loss   val_loss   
    0      0.045258   0.046109  
    1      0.058406   0.049222                                  
    2      0.049904   0.042761                                  
    3      0.056364   0.061599                                  
    4      0.052477   0.047066                                  
    5      0.050293   0.042976                                  
    6      0.042117   0.041887                                  
    7      0.053778   0.058154                                  
    8      0.057674   0.052213                                  
    9      0.056211   0.047922                                  
    10     0.049906   0.046597                                  
    11     0.050576   0.045229                                  
    12     0.05148    0.0425                                    
    13     0.045406   0.041833                                  
    14     0.04314    0.041853                                  



epoch      trn_loss   val_loss                                  
    0      0.04807    0.042009  
    1      0.043071   0.042385                                  
    2      0.041938   0.041669                                  
    3      0.039948   0.041536                                  
    4      0.040798   0.041548                                  
    5      0.041751   0.04168                                   

----------
Fold 1 ....


epoch      trn_loss   val_loss                                  
    0      0.0469     0.044532  
    1      0.053637   0.046548                                  
    2      0.04584    0.041699                                  
    3      0.055985   0.052053                                  
    4      0.052709   0.048491                                  
    5      0.047826   0.042662                                  
    6      0.042924   0.041063                                  
    7      0.056542   0.052626                                  
    8      0.057511   0.049231                                  
    9      0.055543   0.048936                                  
    10     0.052002   0.045256                                  
    11     0.052559   0.043447                                  
    12     0.050953   0.042409                                  
    13     0.046146   0.04074                                   
    14     0.042522   0.040535                           

epoch      trn_loss   val_loss                                  
    0      0.048078   0.041007  
    1      0.044074   0.040757                                  
    2      0.041425   0.040185                                  
    3      0.039798   0.040258                                  
    4      0.040758   0.04013                                   
    5      0.04134    0.040319                                  

----------
Fold 2 ....


epoch      trn_loss   val_loss                                  
    0      0.046211   0.046232  
    1      0.054699   0.049172                                  
    2      0.047717   0.043116                                  
    3      0.056392   0.0542                                    
    4      0.050505   0.051808                                  
    5      0.050483   0.046325                                  
    6      0.04397    0.042497                                  
    7      0.057687   0.052988                                  
    8      0.057719   0.052651                                  
    9      0.05681    0.05001                                   
    10     0.051583   0.045737                                  
    11     0.050292   0.046601                                  
    12     0.04866    0.043947                                  
    13     0.047621   0.04177                                   
    14     0.038589   0.041778                           

epoch      trn_loss   val_loss                                  
    0      0.047435   0.042207  
    1      0.044574   0.04222                                   
    2      0.041033   0.04163                                   
    3      0.041096   0.041483                                  
    4      0.038985   0.041551                                  
    5      0.039009   0.041655                                  

----------
Fold 3 ....


epoch      trn_loss   val_loss                                  
    0      0.045842   0.044461  
    1      0.054112   0.046143                                  
    2      0.045601   0.041891                                  
    3      0.057592   0.055015                                  
    4      0.048375   0.047729                                  
    5      0.048961   0.042708                                  
    6      0.043502   0.041123                                  
    7      0.058658   0.050482                                  
    8      0.056168   0.059926                                  
    9      0.05659    0.048221                                  
    10     0.050267   0.045249                                  
    11     0.052742   0.044252                                  
    12     0.047401   0.041722                                  
    13     0.047722   0.040836                                  
    14     0.039114   0.04065                            

epoch      trn_loss   val_loss                                  
    0      0.048837   0.041079  
    1      0.043232   0.04098                                   
    2      0.042975   0.040517                                  
    3      0.043268   0.040325                                  
    4      0.039741   0.040574                                  
    5      0.038668   0.040645                                  

----------
Fold 4 ....


epoch      trn_loss   val_loss                                  
    0      0.047394   0.044547  
    1      0.054482   0.047049                                  
    2      0.043276   0.04153                                   
    3      0.056433   0.051465                                  
    4      0.04939    0.048894                                  
    5      0.047884   0.042169                                  
    6      0.044711   0.040562                                  
    7      0.059609   0.051617                                  
    8      0.052984   0.052207                                  
    9      0.054231   0.047272                                  
    10     0.051272   0.045369                                  
    11     0.05297    0.043546                                  
    12     0.048185   0.041898                                  
    13     0.048386   0.040756                                  
    14     0.040677   0.040414                           

epoch      trn_loss   val_loss                                  
    0      0.045067   0.040743  
    1      0.042674   0.040583                                  
    2      0.038848   0.040784                                  
    3      0.040375   0.040044                                  
    4      0.04045    0.040178                                  
    5      0.036137   0.040224                                  

----------
Fold 5 ....


epoch      trn_loss   val_loss                                  
    0      0.046817   0.047256  
    1      0.054021   0.050298                                  
    2      0.044163   0.043861                                  
    3      0.054303   0.052147                                  
    4      0.051907   0.04782                                   
    5      0.04641    0.044826                                  
    6      0.045651   0.04293                                   
    7      0.055372   0.05412                                   
    8      0.056726   0.059057                                  
    9      0.053099   0.050688                                  
    10     0.053554   0.047085                                  
    11     0.051476   0.045464                                  
    12     0.049338   0.043688                                  
    13     0.045399   0.042582                                  
    14     0.041039   0.042308                           

epoch      trn_loss   val_loss                                  
    0      0.045243   0.043258  
    1      0.040643   0.042408                                  
    2      0.039267   0.042547                                  
    3      0.040848   0.042014                                  
    4      0.040975   0.041901                                  
    5      0.036123   0.042103                                  

----------
Fold 6 ....


epoch      trn_loss   val_loss                                  
    0      0.048828   0.044603  
    1      0.057639   0.047223                                  
    2      0.045096   0.04174                                   
    3      0.056414   0.050528                                  
    4      0.050524   0.046926                                  
    5      0.044799   0.042479                                  
    6      0.043476   0.040853                                  
    7      0.228312   0.215125                                 
    8      0.18473    0.175434                                 
    9      0.191839   0.17328                                  
    10     0.163559   0.171044                                 
    11     0.131054   0.117068                                 
    12     0.129085   0.117278                                 
    13     0.129815   0.113865                                 
    14     0.125669   0.113395                                 


epoch      trn_loss   val_loss                                 
    0      0.132102   0.113707  
    1      0.113833   0.112545                                 
    2      0.120266   0.113575                                 
    3      0.123676   0.112638                                 
    4      0.12268    0.112613                                 
    5      0.122679   0.112709                                 

----------
Fold 7 ....


epoch      trn_loss   val_loss                                  
    0      0.047827   0.045387  
    1      0.058011   0.047186                                  
    2      0.044898   0.042663                                  
    3      0.056477   0.057776                                  
    4      0.049528   0.047493                                  
    5      0.048635   0.043087                                  
    6      0.043139   0.041725                                  
    7      0.064508   0.059641                                  
    8      0.060508   0.052746                                  
    9      0.054367   0.04921                                   
    10     0.050875   0.046835                                  
    11     0.050711   0.045438                                  
    12     0.048188   0.042555                                  
    13     0.049447   0.041567                                  
    14     0.042334   0.041672                           

epoch      trn_loss   val_loss                                  
    0      0.044745   0.042613  
    1      0.041323   0.041684                                  
    2      0.041905   0.041561                                  
    3      0.041196   0.040889                                  
    4      0.040183   0.041134                                  
    5      0.038006   0.041314                                  

----------
Fold 8 ....


epoch      trn_loss   val_loss                                  
    0      0.047918   0.047084  
    1      0.059236   0.053703                                  
    2      0.044655   0.044367                                  
    3      0.057386   0.054183                                  
    4      0.048193   0.049964                                  
    5      0.044946   0.046543                                  
    6      0.045924   0.04411                                   
    7      0.058995   0.055836                                  
    8      0.058722   0.052898                                  
    9      0.054531   0.052112                                  
    10     0.052147   0.048714                                  
    11     0.049246   0.045724                                  
    12     0.049708   0.044345                                  
    13     0.043683   0.043433                                  
    14     0.041986   0.04353                            

epoch      trn_loss   val_loss                                  
    0      0.044418   0.044448  
    1      0.03985    0.043705                                  
    2      0.039774   0.044234                                  
    3      0.041095   0.043214                                  
    4      0.039615   0.043575                                  
    5      0.036174   0.043688                                  

----------
Fold 9 ....


epoch      trn_loss   val_loss                                  
    0      0.048021   0.047052  
    1      0.056743   0.048223                                  
    2      0.044408   0.04363                                   
    3      0.057738   0.051385                                  
    4      0.051514   0.048031                                  
    5      0.048279   0.044213                                  
    6      0.045182   0.042637                                  
    7      0.064643   0.067792                                  
    8      0.056401   0.051193                                  
    9      0.057105   0.049551                                  
    10     0.052294   0.050228                                  
    11     0.05034    0.045985                                  
    12     0.047373   0.044812                                  
    13     0.044682   0.042055                                  
    14     0.040304   0.042011                           

epoch      trn_loss   val_loss                                  
    0      0.042868   0.042617  
    1      0.040834   0.042033                                  
    2      0.041524   0.042581                                  
    3      0.038731   0.041987                                  
    4      0.0412     0.041493                                  
    5      0.037848   0.041736                                  



In [50]:
# Load the per-fold test predictions (one CSV per CV fold) and stack them
# into a single long frame, so each `id` is expected to appear once per fold.
# NOTE(review): presumably these files are written by the fold training loop
# earlier in the notebook -- confirm they all come from the same run.
#
# A comprehension keeps the loop variable scoped, so generic notebook-level
# names such as `df` / `i` are not silently overwritten by this cell.
cv_dfs = [
    pd.read_csv(f'{PATH}/submissions/subm_lstm_cv_{fold}.csv')
    for fold in range(n_folds)
]

# ignore_index=True gives the stacked frame a unique 0..N-1 index instead of
# repeating each file's own row labels once per fold.
final_cv_df = pd.concat(cv_dfs, ignore_index=True)

# Sanity check: total row count and a peek at the first two rows.
display(len(final_cv_df))
display(final_cv_df.head(2))

1531640

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996244,0.393182,0.972876,0.087797,0.93519,0.310491
1,0000247867823ef7,0.00104,5.1e-05,0.001035,2e-05,0.000553,0.000119


In [51]:
# Ensemble the folds: collapse the stacked predictions to one row per `id`
# by taking the mean of every remaining column. `as_index=False` keeps `id`
# as a regular column, so no separate reset_index() step is needed.
# NOTE(review): the training log above shows Fold 6 diverging (final val_loss
# ~0.113 vs ~0.040-0.044 for the other folds), yet its predictions are
# averaged in here with equal weight -- consider retraining or excluding it.
final_cv_df = final_cv_df.groupby('id', as_index=False).mean()
final_cv_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.90028,0.375404,0.875346,0.087246,0.826584,0.242148
1,0000247867823ef7,0.004881,0.000273,0.002911,2.6e-05,0.001958,0.00036
2,00013b17ad220c46,0.030315,0.003812,0.022104,0.000283,0.014694,0.001177
3,00017563c3f7919a,0.017715,0.001505,0.010097,0.001165,0.010401,0.001894
4,00017695ad8997eb,0.012319,0.000488,0.00405,0.0003,0.002884,0.000992


In [52]:
# Write the fold-averaged predictions to the final submission CSV.
# index=False leaves the DataFrame's row index out of the file.
final_cv_df.to_csv(
    path_or_buf=f'{PATH}/submissions/subm_lstm_cv_final.csv',
    index=False,
)