In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pdb
import os

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_summary import DataFrameSummary

from fastai.model import *
from fastai.dataset import *
from fastai.lm_rnn import *
from fastai.sgdr import *
from fastai.rnn_reg import EmbeddingDropout, WeightDrop, LockedDropout
from fastai.torch_imports import *

import torchtext
from torchtext import vocab, data

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as spacy_STOPWORDS
spacy_en = spacy.load('en')

from wordcloud import WordCloud, STOPWORDS

# pandas and plotting config
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

In [3]:
PATH = 'data'

os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)
os.makedirs(f'{PATH}/submissions', exist_ok=True)

Load data and define labels (ordering is important for competition submission!)

*Note: We are also adding a "None" column

In [4]:
raw_train_df = pd.read_csv(f'{PATH}/train_preproc.csv')
test_df = pd.read_csv(f'{PATH}/test_preproc.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

txt_col = 'comment_text_cleaned'

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

model_cols = ['id', txt_col] + label_cols + ['none']

In [5]:
n_folds = 10
n_trn = len(raw_train_df)
n_examples_per_fold = n_trn // n_folds

# n_examples_per_fold

## Prepare Data

*Note: Only need to run 1x (or as desired to regenerate these .csv files)

Build cross-validation datasets

In [6]:
raw_train_df_rand = raw_train_df.sample(frac=1, random_state=9) # frac=1 = return all rows in random order

In [7]:
val_dfs = []

for i in range(0, n_folds):
    start = i * n_examples_per_fold
    end = n_examples_per_fold + start if (i + 1 < n_folds) else None
    val_dfs.append(raw_train_df_rand[start:end])
    
# [ print(idx,len(d)) for idx, d in enumerate(val_dfs) ]

In [8]:
trn_dfs = []

for idx, df in enumerate(val_dfs):
    trn_dfs.append(pd.concat([ val_df for val_idx, val_df in enumerate(val_dfs) if val_idx != idx]))
    
# [ print(idx,len(d)) for idx, d in enumerate(trn_dfs) ]

In [9]:
for idx, [trn_df, val_df] in enumerate(zip(trn_dfs, val_dfs)):
    print(idx, len(trn_df), len(val_df))
    
    trn_df[model_cols].to_csv(f'{PATH}/train_ds_{idx}_of_{n_folds}.csv', index=None)
    val_df[model_cols].to_csv(f'{PATH}/valid_ds_{idx}_of_{n_folds}.csv', index=None)

0 143614 15957
1 143614 15957
2 143614 15957
3 143614 15957
4 143614 15957
5 143614 15957
6 143614 15957
7 143614 15957
8 143614 15957
9 143613 15958


Use the below if you want to create a single training and cv dataset

In [10]:
# split the training data into a train and validatin dataset
trn, val = train_test_split(raw_train_df, test_size=0.05, random_state=9)
print(len(trn), len(val), len(trn[trn.none != 1]), len(val[val.none != 1]))

# save train, val, and test datasets for torchtext
trn[model_cols].to_csv(f'{PATH}/train_ds.csv', index=None)
val[model_cols].to_csv(f'{PATH}/valid_ds.csv', index=None)

# save full cleaned datasets (train+valid and test) as well
raw_train_df[model_cols].to_csv(f'{PATH}/full_train_ds.csv', index=None)
test_df[['id', txt_col]].to_csv(f'{PATH}/test_ds.csv', index=None)

151592 7979 15417 808


In [11]:
display(pd.read_csv("data/full_train_ds.csv").head(2))
display(pd.read_csv("data/test_ds.csv").head(2))

Unnamed: 0,id,comment_text_cleaned,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,0000997932d777bf,"explanation why the edits made under my username hardcore metallica fan were reverted? they were not vandalisms, just closure on some gas after i voted at new york dolls fac. and please do not remove the template from the talk page since i am retired now.",0,0,0,0,0,0,1
1,000103f0d9cfb60f,"d'aww! he matches this background colour i am seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0,1


Unnamed: 0,id,comment_text_cleaned
0,00001cee341fdb12,"yo bitch ja rule is more succesful then you will ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
1,0000247867823ef7,"== from rfc == the title is fine as it is, imo."


## Build Datasets and DataLoaders

Define hyperparameters and column that holds text data

In [6]:
max_features = 100000 #30000
min_freq = 10 #0
max_len = 175 #100

pretrained_vectors = None #'fasttext.en.300d'

batch_sizes = (64,64,64)

Configure how we are going to process text and label fields

In [7]:
import re, string

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [8]:
TEXT_fld = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=max_len)
LABEL_fld = data.Field(sequential=False, use_vocab=False, tensor_type=torch.cuda.ByteTensor)

There are various built-in Datasets in torchtext that handle common use cases. **For csv/tsv files, the TabularDataset class** is convenient. Here’s how we would read data from a csv file using the TabularDataset:

In [9]:
%%time

# train/validation
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    (txt_col, TEXT_fld),
                    ("toxic", LABEL_fld), ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld), ("identity_hate", LABEL_fld), ("none", None)]

train_ds, valid_ds = data.TabularDataset.splits(PATH, train='train_ds.csv', validation='valid_ds.csv',
                                          format='csv', skip_header=True, fields=train_datafields)

# test
test_datafields = [("id", None), (txt_col, TEXT_fld)]

test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)

# train+val
full_train_ds = data.TabularDataset(f'{PATH}/full_train_ds.csv', 
                                    format='csv', skip_header=True, fields=train_datafields)

CPU times: user 40.4 s, sys: 1.32 s, total: 41.7 s
Wall time: 41.8 s


In [10]:
display(train_ds[0])
display(train_ds[0].__dict__.keys())
display(train_ds[1].comment_text_cleaned[:5])

<torchtext.data.example.Example at 0x7fce0e36b470>

dict_keys(['comment_text_cleaned', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

['you', 'cunt']

Build vocab on *full* training data

In [11]:
TEXT_fld.build_vocab(full_train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)

In [12]:
# The vocab.freqs is a collections.Counter object, so we can take a look at the most frequent words.
TEXT_fld.vocab.freqs.most_common(10)

[('.', 645337),
 ('the', 496748),
 (',', 473218),
 ('"', 392156),
 ('to', 297318),
 ('i', 240305),
 ('of', 224837),
 ('and', 224115),
 ('is', 222448),
 ('you', 221861)]

In [13]:
train_iter, val_iter = data.BucketIterator.splits(
    (train_ds, valid_ds), # we pass in the datasets we want the iterator to draw data from
    batch_sizes=(batch_sizes[0], batch_sizes[1]),
    device=0, # if you want to use the GPU, specify the GPU number here
    sort_key=lambda x: len(x.comment_text_cleaned), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [14]:
batch = next(train_iter.__iter__()); 

display(batch)
display(batch.__dict__.keys())

<torchtext.data.batch.Batch at 0x7fceae8ec7f0>

dict_keys(['batch_size', 'dataset', 'train', 'comment_text_cleaned', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

For the test set, we don't want the data to be shuffled. This is why we'll be using a standard Iterator.

In [15]:
test_iter = data.Iterator(test_ds, batch_size=batch_sizes[2], device=0, train=False, 
                          shuffle=False, sort=False, sort_within_batch=False, repeat=False)

In [16]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([ getattr(batch, feat).unsqueeze(1) for feat in self.y_vars ], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [17]:
train_dl = BatchWrapper(train_iter, txt_col, label_cols)
valid_dl = BatchWrapper(val_iter, txt_col, label_cols)
test_dl = BatchWrapper(test_iter, txt_col, None)

Construct a fastai ModelData

In [18]:
md = ModelData(PATH, trn_dl=train_dl, val_dl=valid_dl, test_dl=test_dl)

## Define the Model

Define a simple GRU

In [19]:
class SimpleGru(nn.Module):
    def __init__(self, vocab_sz, emb_sz=300, n_rnn_hidden=256, n_rnn_layers=1, bi_dir=True, out_sz=1, bsz=64,
                 linears=[512], linear_drops=[0.1],
                 dropouth=0.3, dropouti=0.4, dropoute=0.1, wdrop=0.05, use_bn=False):
        
        super().__init__() 
        
        self.bsz = bsz
               
        # configure embeddings layer
        self.dropoute = dropoute
        self.dropouti = LockedDropout(dropouti)
        
        self.emb = nn.Embedding(vocab_sz, emb_sz)
        self.emb.data = train_ds.fields[txt_col].vocab.vectors # to use the pretrained vectors
        #self.emb.weight.requires_grad=False
        self.emb_with_drop = EmbeddingDropout(self.emb)
        
        # configure rnns
        self.n_rnn_hidden, self.n_rnn_layers, self.n_dirs = n_rnn_hidden, n_rnn_layers, 2 if bi_dir else 1
        self.rnn = nn.GRU(emb_sz, self.n_rnn_hidden, self.n_rnn_layers, bidirectional=bi_dir, dropout=dropouth)
        if wdrop: self.rnn = WeightDrop(self.rnn, wdrop)
      
        # configure optional FC layers (3 because we are concat 3 outputs of bsz*n_rnn_hidden into first FC);
        # if not concatentatin outp[-1], then make this 2
        linears = [n_rnn_hidden * 2 * self.n_dirs] + linears
        self.use_bn = use_bn
        
        self.linears = nn.ModuleList([
            nn.Linear(linears[idx], linears[idx + 1]) for idx in range(len(linears) - 1)
        ])
        self.linear_bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in linears[1:]
        ])
        self.linear_drops = nn.ModuleList([
            nn.Dropout(drop) for drop in linear_drops
        ])
        
        self.outp = nn.Linear(linears[-1], out_sz)
        
        # initialize weights
        for o in self.linears: kaiming_normal(o.weight.data)
        kaiming_normal(self.outp.weight.data)
        
        # init hidden
        self.init_hidden(self.bsz)
    
    def forward(self, seq):
        bsz = seq.size(1)
        if (self.hidden[0].size(1) != bsz): self.init_hidden(bsz)
        
        x = self.emb_with_drop(seq, dropout=self.dropoute if self.training else 0)
        x = self.dropouti(x)
        
        #pdb.set_trace()
        output, h = self.rnn(x, self.hidden)        
        self.hidden = repackage_var(h)
        
        sl, bs, _ = output.size()
  
        avg_pool = F.adaptive_avg_pool1d(output.permute(1,2,0), (1,)).view(bs,-1)   
        max_pool = F.adaptive_max_pool1d(output.permute(1,2,0), (1,)).view(bs,-1) 
        
        x = torch.cat([avg_pool, max_pool], dim=1)
        
        for l, d, b in zip (self.linears, self.linear_drops, self.linear_bns):
            x = F.relu(l(x))
            if(self.use_bn): x = b(x)
            x = d(x)
        
        outp = F.sigmoid(self.outp(x))
        return outp

    def init_hidden(self, bsz):
        self.hidden = V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden))

In [20]:
class SimpleLstm(nn.Module):
    def __init__(self, vocab_sz, emb_sz=300, n_rnn_hidden=256, n_rnn_layers=1, bi_dir=True, out_sz=1, bsz=64,
                 linears=[512], linear_drops=[0.1],
                 dropouth=0.3, dropouti=0.4, dropoute=0.1, wdrop=0.05, use_bn=False):
        
        super().__init__() 
        
        self.bsz = bsz
               
        # configure embeddings layer
        self.dropoute = dropoute
        self.dropouti = LockedDropout(dropouti)
        
        self.emb = nn.Embedding(vocab_sz, emb_sz)
        self.emb.data = train_ds.fields[txt_col].vocab.vectors # to use the pretrained vectors
        #self.emb.weight.requires_grad=False
        self.emb_with_drop = EmbeddingDropout(self.emb)
        
        # configure rnns
        self.n_rnn_hidden, self.n_rnn_layers, self.n_dirs = n_rnn_hidden, n_rnn_layers, 2 if bi_dir else 1
        self.rnn = nn.LSTM(emb_sz, self.n_rnn_hidden, self.n_rnn_layers, bidirectional=bi_dir, dropout=dropouth)
        if wdrop: self.rnn = WeightDrop(self.rnn, wdrop)
      
        # configure optional FC layers (3 because we are concat 3 outputs of bsz*n_rnn_hidden into first FC);
        # if not concatentatin outp[-1], then make this 2
        linears = [n_rnn_hidden * 2 * self.n_dirs] + linears
        self.use_bn = use_bn
        
        self.linears = nn.ModuleList([
            nn.Linear(linears[idx], linears[idx + 1]) for idx in range(len(linears) - 1)
        ])
        self.linear_bns = nn.ModuleList([
            nn.BatchNorm1d(sz) for sz in linears[1:]
        ])
        self.linear_drops = nn.ModuleList([
            nn.Dropout(drop) for drop in linear_drops
        ])
        
        self.outp = nn.Linear(linears[-1], out_sz)
        
        # initialize weights
        for o in self.linears: kaiming_normal(o.weight.data)
        kaiming_normal(self.outp.weight.data)
        
        # init hidden
        self.init_hidden(self.bsz)
    
    def forward(self, seq):
        bsz = seq.size(1)
        if (self.hidden[0].size(1) != bsz): self.init_hidden(bsz)
        
        x = self.emb_with_drop(seq, dropout=self.dropoute if self.training else 0)
        x = self.dropouti(x)
        
        #pdb.set_trace()
        output, h = self.rnn(x, self.hidden)        
        self.hidden = repackage_var(h)
        
        sl, bs, _ = output.size()
  
        avg_pool = F.adaptive_avg_pool1d(output.permute(1,2,0), (1,)).view(bs,-1)   
        max_pool = F.adaptive_max_pool1d(output.permute(1,2,0), (1,)).view(bs,-1) 
        
        x = torch.cat([avg_pool, max_pool], dim=1)
        
        for l, d, b in zip (self.linears, self.linear_drops, self.linear_bns):
            x = F.relu(l(x))
            if(self.use_bn): x = b(x)
            x = d(x)
        
        outp = F.sigmoid(self.outp(x))
        return outp

    def init_hidden(self, bsz):
        self.hidden = (V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)),
                       V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)))

In [25]:
vocab_sz = len(TEXT_fld.vocab)
emb_sz = 300

n_rnn_hidden = 128
n_rnn_layers = 1
bi_dir = True

out_sz = 6
linears = []; linear_drops = []; use_bn = False

model = SimpleLstm(vocab_sz, emb_sz, n_rnn_hidden, n_rnn_layers, True, out_sz, bsz=batch_sizes[0],
                             linears=linears, linear_drops=linear_drops, use_bn=use_bn)

model.cuda()

SimpleLstm(
  (dropouti): LockedDropout(
  )
  (emb): Embedding(26970, 300)
  (emb_with_drop): EmbeddingDropout(
    (embed): Embedding(26970, 300)
  )
  (rnn): WeightDrop(
    (module): LSTM(300, 128, dropout=0.3, bidirectional=True)
  )
  (linears): ModuleList(
  )
  (linear_bns): ModuleList(
  )
  (linear_drops): ModuleList(
  )
  (outp): Linear(in_features=512, out_features=6, bias=True)
)

In [82]:
class ConvLstm(nn.Module):
    def __init__(self, vocab_sz, emb_sz=300, n_rnn_hidden=256, n_rnn_layers=1, bi_dir=True, out_sz=1, bsz=64,
                 dropouth=0.3, dropouti=0.4, dropoute=0.1, wdrop=0.05):
        
        super().__init__() 
        
        self.bsz = bsz
               
        # configure embeddings layer
        self.dropoute = dropoute
        self.dropouti = LockedDropout(dropouti)
        
        self.emb = nn.Embedding(vocab_sz, emb_sz)
        self.emb.data = train_ds.fields[txt_col].vocab.vectors # to use the pretrained vectors
        #self.emb.weight.requires_grad=False
        self.emb_with_drop = EmbeddingDropout(self.emb)
        
        # configure rnns
        self.n_rnn_hidden, self.n_rnn_layers, self.n_dirs = n_rnn_hidden, n_rnn_layers, 2 if bi_dir else 1
        self.rnn = nn.LSTM(emb_sz, self.n_rnn_hidden, self.n_rnn_layers, bidirectional=bi_dir, dropout=dropouth)
        if wdrop: self.rnn = WeightDrop(self.rnn, wdrop)
            
        self.conv = nn.Conv1d(n_rnn_hidden * self.n_dirs, 64, 4)
      
        self.outp = nn.Linear(64 * 2, out_sz)
        
        # initialize weights
        kaiming_normal(self.outp.weight.data)
        
        # init hidden
        self.init_hidden(self.bsz)
    
    def forward(self, seq):
        bsz = seq.size(1)
        if (self.hidden[0].size(1) != bsz): self.init_hidden(bsz)
        
        x = self.emb_with_drop(seq, dropout=self.dropoute if self.training else 0)
        x = self.dropouti(x)
        
        #pdb.set_trace()
        output, h = self.rnn(x, self.hidden)        
        self.hidden = repackage_var(h)
     
        sl, bs, _ = output.size()
        x = F.relu(self.conv(output.permute(1,2,0)))
  
        avg_pool = F.adaptive_avg_pool1d(x, (1,)).view(bs,-1)   
        max_pool = F.adaptive_max_pool1d(x, (1,)).view(bs,-1) 
        
        x = torch.cat([avg_pool, max_pool], dim=1)

        outp = F.sigmoid(self.outp(x))
        return outp

    def init_hidden(self, bsz):
        self.hidden = (V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)),
                       V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)))

In [83]:
vocab_sz = len(TEXT_fld.vocab)
emb_sz = 300

n_rnn_hidden = 128
n_rnn_layers = 1
bi_dir = True

out_sz = 6

model = ConvLstm(vocab_sz, emb_sz, n_rnn_hidden, n_rnn_layers, True, out_sz, bsz=batch_sizes[0])

model.cuda()

ConvLstm(
  (dropouti): LockedDropout(
  )
  (emb): Embedding(26970, 300)
  (emb_with_drop): EmbeddingDropout(
    (embed): Embedding(26970, 300)
  )
  (rnn): WeightDrop(
    (module): LSTM(300, 128, dropout=0.3, bidirectional=True)
  )
  (conv): Conv1d(256, 64, kernel_size=(4,), stride=(1,))
  (outp): Linear(in_features=128, out_features=6, bias=True)
)

## Train Model

In [84]:
lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)

In [85]:
on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/lstm_fit_1_cyc_{cycle}')

In [86]:
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]

In [87]:
fit(model, md, 2**4-1, lo.opt, F.binary_cross_entropy, callbacks=cb)

  0%|          | 1/2369 [00:00<14:12,  2.78it/s, loss=0.752]

  result = self.forward(*input, **kwargs)


                                                                

  


epoch      trn_loss   val_loss   
    0      0.051143   0.045428  
    1      0.052452   0.048086                                  
    2      0.048935   0.042793                                  
    3      0.057762   0.053487                                  
    4      0.050881   0.046729                                  
    5      0.049764   0.043361                                  
    6      0.045766   0.041723                                  
    7      0.060439   0.052335                                  
    8      0.060282   0.052392                                  
    9      0.061376   0.049558                                  
    10     0.052035   0.045024                                  
    11     0.050743   0.045635                                  
    12     0.050397   0.042907                                  
    13     0.045974   0.042163                                  
    14     0.04428    0.042109                                  



[0.042109344]

In [97]:
model.load_state_dict(torch.load(f'{PATH}/models/lstm_fit_1_cyc_3'))

In [98]:
# cb[0].plot_lr()
# cb[0].plot_loss()

In [103]:
on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/lstm_fit_2_cyc_{cycle}')

lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)

cb = [CosAnneal(lo, (len(md.trn_dl) * 20), on_cycle_end=on_end)]

fit(model, md, 20, lo.opt, F.binary_cross_entropy, callbacks=cb)

  0%|          | 1/2369 [00:00<12:26,  3.17it/s, loss=0.0313]

  result = self.forward(*input, **kwargs)


                                                                

  


epoch      trn_loss   val_loss   
    0      0.047446   0.042326  
    1      0.060109   0.052738                                  
    2      0.061474   0.052812                                  
    3      0.052966   0.051033                                  
    4      0.053826   0.052875                                  
    5      0.052001   0.048456                                  
    6      0.056809   0.048939                                  
    7      0.058547   0.049012                                  
    8      0.055328   0.045812                                  
    9      0.053131   0.04595                                   
    10     0.053966   0.045393                                  
    11     0.051473   0.045612                                  
    12     0.047847   0.046206                                  
    13     0.051698   0.043556                                  
    14     0.048082   0.043449                                  
    15     0.043666   0

[0.041532446]

In [104]:
# cb[0].plot_lr()
# cb[0].plot_loss()

In [105]:
# it = iter(md.trn_dl)
# *xs,yts = next(it)
# t = model(*V(xs))

# xs[0].size(), yts.size()

In [106]:
# opt = optim.Adam(model.parameters(), 1e-2)
# fit(model, md, 2, opt, F.binary_cross_entropy)
# set_lrs(opt, 1e-3)
# fit(model, md, 3, opt, F.binary_cross_entropy)

## Predictions

In [107]:
preds = predict(model, test_dl)
preds.shape

  result = self.forward(*input, **kwargs)
  


(153164, 6)

## Prepare submission

In [108]:
subm_df = pd.read_csv("data/test.csv")

for i, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
    subm_df[col] = preds[:, i]

subm_df.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",0.998924,0.333234,0.969354,0.054596,0.945446,0.271051
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.",0.002046,6.5e-05,0.00078,2.9e-05,0.00078,0.000138


In [109]:
# if you want to write the submission file to disk, uncomment and run the below code
subm_df.drop(['comment_text'], axis=1).to_csv(f'{PATH}/submissions/lstm+conv_subm_20180319.csv', index=False)

## K-Fold Cross Validation

In [49]:
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    (txt_col, TEXT_fld),
                    ("toxic", LABEL_fld), ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld), ("identity_hate", LABEL_fld), ("none", None)]

# test
test_datafields = [("id", None), (txt_col, TEXT_fld)]

# define test dataset and iterator
test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)
test_iter = data.Iterator(test_ds, batch_size=batch_sizes[2], device=0, train=False, 
                          shuffle=False, sort=False, sort_within_batch=False, repeat=False)
test_dl = BatchWrapper(test_iter, txt_col, None)

# define FULL train dataset for building vocab
full_train_ds = data.TabularDataset(f'{PATH}/full_train_ds.csv', 
                                    format='csv', skip_header=True, fields=train_datafields)

TEXT_fld.build_vocab(full_train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)

# cv
for i in range(n_folds):
    print('-' * 10)
    print(f'Fold {i} ....')
    
    # train/validation datsets
    train_ds, valid_ds = data.TabularDataset.splits(PATH, 
                                                    train=f'train_ds_{i}_of_{n_folds}.csv', 
                                                    validation=f'valid_ds_{i}_of_{n_folds}.csv',
                                                    format='csv', skip_header=True, fields=train_datafields)

    # train/validation iterators/dataloaders
    train_iter, val_iter = data.BucketIterator.splits(
        (train_ds, valid_ds), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(batch_sizes[0], batch_sizes[1]),
        device=0, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.comment_text_cleaned), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False) # we pass repeat=False because we want to wrap this Iterator layer.
        
    train_dl = BatchWrapper(train_iter, txt_col, label_cols)
    valid_dl = BatchWrapper(val_iter, txt_col, label_cols)
        
    md = ModelData(PATH, trn_dl=train_dl, val_dl=valid_dl, test_dl=test_dl)
    
    model = SimpleGru(vocab_sz, emb_sz, n_rnn_hidden, n_rnn_layers, True, out_sz, bsz=batch_sizes[0],
                             linears=linears, linear_drops=linear_drops, use_bn=use_bn)
    
    model.cuda()
    
    lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)
    on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/fit_1_cv{i}_cyc_{cycle}')
    cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2)] #, on_cycle_end=on_end)]
    fit(model, md, 2**4-1, lo.opt, F.binary_cross_entropy, callbacks=cb)
        
    on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/fit_2_cv{i}_cyc_{cycle}')
    lo = LayerOptimizer(optim.Adam, model, 1e-3, 1e-5)
    cb = [CosAnneal(lo, (len(md.trn_dl) * 6))] #, on_cycle_end=on_end)]
    fit(model, md, 6, lo.opt, F.binary_cross_entropy, callbacks=cb)
        
    preds = predict(model, test_dl)
        
    subm_df = pd.read_csv("data/test.csv")
    for lbl_idx, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
        subm_df[col] = preds[:, lbl_idx]
        
    # if you want to write the submission file to disk, uncomment and run the below code
    subm_df.drop(['comment_text'], axis=1).to_csv(f'{PATH}/submissions/subm_20180317_cv_{i}.csv', index=False)



----------
Fold 0 ....


  0%|          | 1/2244 [00:00<12:44,  2.93it/s, loss=0.593]

  result = self.forward(*input, **kwargs)


                                                                

  


epoch      trn_loss   val_loss   
    0      0.045258   0.046109  
    1      0.058406   0.049222                                  
    2      0.049904   0.042761                                  
    3      0.056364   0.061599                                  
    4      0.052477   0.047066                                  
    5      0.050293   0.042976                                  
    6      0.042117   0.041887                                  
    7      0.053778   0.058154                                  
    8      0.057674   0.052213                                  
    9      0.056211   0.047922                                  
    10     0.049906   0.046597                                  
    11     0.050576   0.045229                                  
    12     0.05148    0.0425                                    
    13     0.045406   0.041833                                  
    14     0.04314    0.041853                                  



epoch      trn_loss   val_loss                                  
    0      0.04807    0.042009  
    1      0.043071   0.042385                                  
    2      0.041938   0.041669                                  
    3      0.039948   0.041536                                  
    4      0.040798   0.041548                                  
    5      0.041751   0.04168                                   

----------
Fold 1 ....


epoch      trn_loss   val_loss                                  
    0      0.0469     0.044532  
    1      0.053637   0.046548                                  
    2      0.04584    0.041699                                  
    3      0.055985   0.052053                                  
    4      0.052709   0.048491                                  
    5      0.047826   0.042662                                  
    6      0.042924   0.041063                                  
    7      0.056542   0.052626                                  
    8      0.057511   0.049231                                  
    9      0.055543   0.048936                                  
    10     0.052002   0.045256                                  
    11     0.052559   0.043447                                  
    12     0.050953   0.042409                                  
    13     0.046146   0.04074                                   
    14     0.042522   0.040535                           

epoch      trn_loss   val_loss                                  
    0      0.048078   0.041007  
    1      0.044074   0.040757                                  
    2      0.041425   0.040185                                  
    3      0.039798   0.040258                                  
    4      0.040758   0.04013                                   
    5      0.04134    0.040319                                  

----------
Fold 2 ....


epoch      trn_loss   val_loss                                  
    0      0.046211   0.046232  
    1      0.054699   0.049172                                  
    2      0.047717   0.043116                                  
    3      0.056392   0.0542                                    
    4      0.050505   0.051808                                  
    5      0.050483   0.046325                                  
    6      0.04397    0.042497                                  
    7      0.057687   0.052988                                  
    8      0.057719   0.052651                                  
    9      0.05681    0.05001                                   
    10     0.051583   0.045737                                  
    11     0.050292   0.046601                                  
    12     0.04866    0.043947                                  
    13     0.047621   0.04177                                   
    14     0.038589   0.041778                           

epoch      trn_loss   val_loss                                  
    0      0.047435   0.042207  
    1      0.044574   0.04222                                   
    2      0.041033   0.04163                                   
    3      0.041096   0.041483                                  
    4      0.038985   0.041551                                  
    5      0.039009   0.041655                                  

----------
Fold 3 ....


epoch      trn_loss   val_loss                                  
    0      0.045842   0.044461  
    1      0.054112   0.046143                                  
    2      0.045601   0.041891                                  
    3      0.057592   0.055015                                  
    4      0.048375   0.047729                                  
    5      0.048961   0.042708                                  
    6      0.043502   0.041123                                  
    7      0.058658   0.050482                                  
    8      0.056168   0.059926                                  
    9      0.05659    0.048221                                  
    10     0.050267   0.045249                                  
    11     0.052742   0.044252                                  
    12     0.047401   0.041722                                  
    13     0.047722   0.040836                                  
    14     0.039114   0.04065                            

epoch      trn_loss   val_loss                                  
    0      0.048837   0.041079  
    1      0.043232   0.04098                                   
    2      0.042975   0.040517                                  
    3      0.043268   0.040325                                  
    4      0.039741   0.040574                                  
    5      0.038668   0.040645                                  

----------
Fold 4 ....


epoch      trn_loss   val_loss                                  
    0      0.047394   0.044547  
    1      0.054482   0.047049                                  
    2      0.043276   0.04153                                   
    3      0.056433   0.051465                                  
    4      0.04939    0.048894                                  
    5      0.047884   0.042169                                  
    6      0.044711   0.040562                                  
    7      0.059609   0.051617                                  
    8      0.052984   0.052207                                  
    9      0.054231   0.047272                                  
    10     0.051272   0.045369                                  
    11     0.05297    0.043546                                  
    12     0.048185   0.041898                                  
    13     0.048386   0.040756                                  
    14     0.040677   0.040414                           

epoch      trn_loss   val_loss                                  
    0      0.045067   0.040743  
    1      0.042674   0.040583                                  
    2      0.038848   0.040784                                  
    3      0.040375   0.040044                                  
    4      0.04045    0.040178                                  
    5      0.036137   0.040224                                  

----------
Fold 5 ....


epoch      trn_loss   val_loss                                  
    0      0.046817   0.047256  
    1      0.054021   0.050298                                  
    2      0.044163   0.043861                                  
    3      0.054303   0.052147                                  
    4      0.051907   0.04782                                   
    5      0.04641    0.044826                                  
    6      0.045651   0.04293                                   
    7      0.055372   0.05412                                   
    8      0.056726   0.059057                                  
    9      0.053099   0.050688                                  
    10     0.053554   0.047085                                  
    11     0.051476   0.045464                                  
    12     0.049338   0.043688                                  
    13     0.045399   0.042582                                  
    14     0.041039   0.042308                           

epoch      trn_loss   val_loss                                  
    0      0.045243   0.043258  
    1      0.040643   0.042408                                  
    2      0.039267   0.042547                                  
    3      0.040848   0.042014                                  
    4      0.040975   0.041901                                  
    5      0.036123   0.042103                                  

----------
Fold 6 ....


epoch      trn_loss   val_loss                                  
    0      0.048828   0.044603  
    1      0.057639   0.047223                                  
    2      0.045096   0.04174                                   
    3      0.056414   0.050528                                  
    4      0.050524   0.046926                                  
    5      0.044799   0.042479                                  
    6      0.043476   0.040853                                  
    7      0.228312   0.215125                                 
    8      0.18473    0.175434                                 
    9      0.191839   0.17328                                  
    10     0.163559   0.171044                                 
    11     0.131054   0.117068                                 
    12     0.129085   0.117278                                 
    13     0.129815   0.113865                                 
    14     0.125669   0.113395                                 


epoch      trn_loss   val_loss                                 
    0      0.132102   0.113707  
    1      0.113833   0.112545                                 
    2      0.120266   0.113575                                 
    3      0.123676   0.112638                                 
    4      0.12268    0.112613                                 
    5      0.122679   0.112709                                 

----------
Fold 7 ....


epoch      trn_loss   val_loss                                  
    0      0.047827   0.045387  
    1      0.058011   0.047186                                  
    2      0.044898   0.042663                                  
    3      0.056477   0.057776                                  
    4      0.049528   0.047493                                  
    5      0.048635   0.043087                                  
    6      0.043139   0.041725                                  
    7      0.064508   0.059641                                  
    8      0.060508   0.052746                                  
    9      0.054367   0.04921                                   
    10     0.050875   0.046835                                  
    11     0.050711   0.045438                                  
    12     0.048188   0.042555                                  
    13     0.049447   0.041567                                  
    14     0.042334   0.041672                           

epoch      trn_loss   val_loss                                  
    0      0.044745   0.042613  
    1      0.041323   0.041684                                  
    2      0.041905   0.041561                                  
    3      0.041196   0.040889                                  
    4      0.040183   0.041134                                  
    5      0.038006   0.041314                                  

----------
Fold 8 ....


epoch      trn_loss   val_loss                                  
    0      0.047918   0.047084  
    1      0.059236   0.053703                                  
    2      0.044655   0.044367                                  
    3      0.057386   0.054183                                  
    4      0.048193   0.049964                                  
    5      0.044946   0.046543                                  
    6      0.045924   0.04411                                   
    7      0.058995   0.055836                                  
    8      0.058722   0.052898                                  
    9      0.054531   0.052112                                  
    10     0.052147   0.048714                                  
    11     0.049246   0.045724                                  
    12     0.049708   0.044345                                  
    13     0.043683   0.043433                                  
    14     0.041986   0.04353                            

epoch      trn_loss   val_loss                                  
    0      0.044418   0.044448  
    1      0.03985    0.043705                                  
    2      0.039774   0.044234                                  
    3      0.041095   0.043214                                  
    4      0.039615   0.043575                                  
    5      0.036174   0.043688                                  

----------
Fold 9 ....


epoch      trn_loss   val_loss                                  
    0      0.048021   0.047052  
    1      0.056743   0.048223                                  
    2      0.044408   0.04363                                   
    3      0.057738   0.051385                                  
    4      0.051514   0.048031                                  
    5      0.048279   0.044213                                  
    6      0.045182   0.042637                                  
    7      0.064643   0.067792                                  
    8      0.056401   0.051193                                  
    9      0.057105   0.049551                                  
    10     0.052294   0.050228                                  
    11     0.05034    0.045985                                  
    12     0.047373   0.044812                                  
    13     0.044682   0.042055                                  
    14     0.040304   0.042011                           

epoch      trn_loss   val_loss                                  
    0      0.042868   0.042617  
    1      0.040834   0.042033                                  
    2      0.041524   0.042581                                  
    3      0.038731   0.041987                                  
    4      0.0412     0.041493                                  
    5      0.037848   0.041736                                  



In [50]:
cv_dfs = []
for i in range(n_folds):
    df = pd.read_csv(f'{PATH}/submissions/subm_20180315_cv_{i}.csv')
    cv_dfs.append(df)
    
final_cv_df = pd.concat([ df for df in cv_dfs ])

display(len(final_cv_df))
display(final_cv_df.head(2))

1531640

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996244,0.393182,0.972876,0.087797,0.93519,0.310491
1,0000247867823ef7,0.00104,5.1e-05,0.001035,2e-05,0.000553,0.000119


In [51]:
final_cv_df = final_cv_df.groupby(['id']).mean().reset_index()
final_cv_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.90028,0.375404,0.875346,0.087246,0.826584,0.242148
1,0000247867823ef7,0.004881,0.000273,0.002911,2.6e-05,0.001958,0.00036
2,00013b17ad220c46,0.030315,0.003812,0.022104,0.000283,0.014694,0.001177
3,00017563c3f7919a,0.017715,0.001505,0.010097,0.001165,0.010401,0.001894
4,00017695ad8997eb,0.012319,0.000488,0.00405,0.0003,0.002884,0.000992


In [52]:
final_cv_df.to_csv(f'{PATH}/submissions/subm_20180315_cv_final.csv', index=False)

Can repeat process above with other models/hyperparameters

In [124]:
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    (txt_col, TEXT_fld),
                    ("toxic", LABEL_fld), ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld), ("identity_hate", LABEL_fld), ("none", None)]

# test
test_datafields = [("id", None), (txt_col, TEXT_fld)]

# define test dataset and iterator
test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)
test_iter = data.Iterator(test_ds, batch_size=batch_sizes[2], device=0, train=False, 
                          shuffle=False, sort=False, sort_within_batch=False, repeat=False)
test_dl = BatchWrapper(test_iter, txt_col, None)

# define FULL train dataset for building vocab
full_train_ds = data.TabularDataset(f'{PATH}/full_train_ds.csv', 
                                    format='csv', skip_header=True, fields=train_datafields)

TEXT_fld.build_vocab(full_train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)

# cv
for i in range(n_folds):
    print('-' * 10)
    print(f'Fold {i} ....')
    
    # train/validation datsets
    train_ds, valid_ds = data.TabularDataset.splits(PATH, 
                                                    train=f'train_ds_{i}_of_{n_folds}.csv', 
                                                    validation=f'valid_ds_{i}_of_{n_folds}.csv',
                                                    format='csv', skip_header=True, fields=train_datafields)

    # train/validation iterators/dataloaders
    train_iter, val_iter = data.BucketIterator.splits(
        (train_ds, valid_ds), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(batch_sizes[0], batch_sizes[1]),
        device=0, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.comment_text_cleaned), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False) # we pass repeat=False because we want to wrap this Iterator layer.
        
    train_dl = BatchWrapper(train_iter, txt_col, label_cols)
    valid_dl = BatchWrapper(val_iter, txt_col, label_cols)
        
    md = ModelData(PATH, trn_dl=train_dl, val_dl=valid_dl, test_dl=test_dl)
    
    model = SimpleLstm(vocab_sz, emb_sz, n_rnn_hidden, n_rnn_layers, True, out_sz, bsz=batch_sizes[0],
                             linears=linears, linear_drops=linear_drops, use_bn=use_bn)
    
    model.cuda()
    
    lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)
    on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/lstm_fit_1_cv{i}_cyc_{cycle}')
    cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2)] #, on_cycle_end=on_end)]
    fit(model, md, 2**4-1, lo.opt, F.binary_cross_entropy, callbacks=cb)
        
    on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/lstm_fit_2_cv{i}_cyc_{cycle}')
    lo = LayerOptimizer(optim.Adam, model, 1e-3, 1e-5)
    cb = [CosAnneal(lo, (len(md.trn_dl) * 6))] #, on_cycle_end=on_end)]
    fit(model, md, 6, lo.opt, F.binary_cross_entropy, callbacks=cb)
        
    preds = predict(model, test_dl)
        
    subm_df = pd.read_csv("data/test.csv")
    for lbl_idx, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
        subm_df[col] = preds[:, lbl_idx]
        
    # if you want to write the submission file to disk, uncomment and run the below code
    subm_df.drop(['comment_text'], axis=1).to_csv(f'{PATH}/submissions/lstm_subm_20180317_cv_{i}.csv', index=False)



----------
Fold 0 ....


  0%|          | 1/2244 [00:00<13:46,  2.71it/s, loss=0.69]     

  result = self.forward(*input, **kwargs)


  3%|▎         | 58/2244 [00:04<03:02, 11.99it/s, loss=0.32] 
  3%|▎         | 60/2244 [00:04<03:01, 12.02it/s, loss=0.316]

Exception in thread Thread-152:
Traceback (most recent call last):
  File "/home/ubuntu/anaconda3/envs/fastai/lib/python3.6/threading.py", line 916, in _bootstrap_inner
    self.run()
  File "/home/ubuntu/anaconda3/envs/fastai/lib/python3.6/site-packages/tqdm/_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "/home/ubuntu/anaconda3/envs/fastai/lib/python3.6/_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



                                                                

  


epoch      trn_loss   val_loss   
    0      0.048469   0.047591  
    1      0.054427   0.050112                                  
    2      0.049167   0.043807                                  
    3      0.054559   0.051215                                  
    4      0.050404   0.047052                                  
    5      0.049142   0.043621                                  
    6      0.045538   0.042728                                  
    7      0.053466   0.05362                                   
    8      0.054661   0.050953                                  
    9      0.053422   0.048362                                  
    10     0.05041    0.046913                                  
    11     0.050894   0.045503                                  
    12     0.050088   0.044133                                  
    13     0.047106   0.042753                                  
    14     0.043142   0.042676                                  



epoch      trn_loss   val_loss                                  
    0      0.048003   0.043147  
    1      0.046446   0.042669                                  
    2      0.042571   0.04227                                   
    3      0.041115   0.042636                                  
    4      0.041296   0.042247                                  
    5      0.042504   0.042328                                  

----------
Fold 1 ....


epoch      trn_loss   val_loss                                  
    0      0.048619   0.046738  
    1      0.052916   0.047216                                  
    2      0.048028   0.042725                                  
    3      0.053829   0.050584                                  
    4      0.051012   0.047008                                  
    5      0.050445   0.043103                                  
    6      0.04269    0.042042                                  
    7      0.053801   0.051415                                  
    8      0.056824   0.051967                                  
    9      0.054729   0.047815                                  
    10     0.051573   0.046073                                  
    11     0.051749   0.044473                                  
    12     0.049362   0.042717                                  
    13     0.047756   0.041252                                  
    14     0.041802   0.040948                           

epoch      trn_loss   val_loss                                  
    0      0.048031   0.041553  
    1      0.043701   0.040658                                  
    2      0.041153   0.040662                                  
    3      0.041581   0.04105                                   
    4      0.038941   0.040525                                  
    5      0.040792   0.040363                                  

----------
Fold 2 ....


epoch      trn_loss   val_loss                                  
    0      0.046289   0.048431  
    1      0.055556   0.050417                                  
    2      0.04787    0.04464                                   
    3      0.054491   0.054477                                  
    4      0.049381   0.048836                                  
    5      0.05161    0.044986                                  
    6      0.043357   0.043908                                  
    7      0.052217   0.052241                                  
    8      0.054606   0.052241                                  
    9      0.054588   0.050563                                  
    10     0.053075   0.048252                                  
    11     0.052826   0.046483                                  
    12     0.04999    0.044304                                  
    13     0.047604   0.043817                                  
    14     0.041184   0.043506                           

epoch      trn_loss   val_loss                                  
    0      0.049678   0.043891  
    1      0.045215   0.043311                                  
    2      0.042061   0.043348                                  
    3      0.041302   0.042923                                  
    4      0.041568   0.042893                                  
    5      0.039284   0.042996                                  

----------
Fold 3 ....


epoch      trn_loss   val_loss                                  
    0      0.049618   0.047353  
    1      0.052585   0.047008                                  
    2      0.047208   0.042854                                  
    3      0.052913   0.050838                                  
    4      0.048858   0.048677                                  
    5      0.049728   0.043516                                  
    6      0.044183   0.042253                                  
    7      0.052952   0.051179                                  
    8      0.054275   0.052392                                  
    9      0.054917   0.049228                                  
    10     0.049912   0.046602                                  
    11     0.050874   0.04403                                   
    12     0.049603   0.042491                                  
    13     0.049571   0.041655                                  
    14     0.041866   0.041459                           

epoch      trn_loss   val_loss                                  
    0      0.04805    0.041678  
    1      0.043631   0.041326                                  
    2      0.044845   0.041396                                  
    3      0.041532   0.040807                                  
    4      0.04035    0.040967                                  
    5      0.039495   0.040914                                  

----------
Fold 4 ....


epoch      trn_loss   val_loss                                  
    0      0.049441   0.04764   
    1      0.05384    0.049506                                  
    2      0.046548   0.042918                                  
    3      0.052523   0.052382                                  
    4      0.049889   0.048282                                  
    5      0.049554   0.043258                                  
    6      0.047914   0.04193                                   
    7      0.056912   0.049793                                  
    8      0.055153   0.051211                                  
    9      0.056936   0.048892                                  
    10     0.050674   0.046205                                  
    11     0.050791   0.045044                                  
    12     0.050156   0.042741                                  
    13     0.046342   0.041757                                  
    14     0.044168   0.041481                           

epoch      trn_loss   val_loss                                  
    0      0.046981   0.042073  
    1      0.042722   0.041843                                  
    2      0.038307   0.041989                                  
    3      0.042466   0.040804                                  
    4      0.043105   0.041049                                  
    5      0.039883   0.041297                                  

----------
Fold 5 ....


epoch      trn_loss   val_loss                                  
    0      0.050729   0.049864  
    1      0.054787   0.049899                                  
    2      0.047279   0.044821                                  
    3      0.054432   0.056087                                  
    4      0.04877    0.048953                                  
    5      0.046671   0.04586                                   
    6      0.043642   0.043844                                  
    7      0.055741   0.051079                                  
    8      0.053453   0.053713                                  
    9      0.052143   0.052004                                  
    10     0.050021   0.048621                                  
    11     0.050713   0.046845                                  
    12     0.053555   0.044741                                  
    13     0.046228   0.043288                                  
    14     0.043366   0.043017                           

epoch      trn_loss   val_loss                                  
    0      0.04624    0.043702  
    1      0.044615   0.042844                                  
    2      0.042304   0.043506                                  
    3      0.042479   0.042235                                  
    4      0.040376   0.042538                                  
    5      0.040482   0.042533                                  

----------
Fold 6 ....


epoch      trn_loss   val_loss                                  
    0      0.050033   0.047404  
    1      0.057283   0.047643                                  
    2      0.04643    0.042522                                  
    3      0.054909   0.05029                                   
    4      0.050203   0.047146                                  
    5      0.048014   0.043348                                  
    6      0.043167   0.041732                                  
    7      0.050781   0.051169                                  
    8      0.055856   0.050226                                  
    9      0.055869   0.048312                                  
    10     0.052059   0.046685                                  
    11     0.049855   0.045242                                  
    12     0.048794   0.042593                                  
    13     0.049134   0.041507                                  
    14     0.042655   0.041549                           

epoch      trn_loss   val_loss                                  
    0      0.04653    0.042098  
    1      0.044122   0.041872                                  
    2      0.041489   0.042229                                  
    3      0.042402   0.041216                                  
    4      0.041536   0.041386                                  
    5      0.040191   0.041547                                  

----------
Fold 7 ....


epoch      trn_loss   val_loss                                  
    0      0.049353   0.047714  
    1      0.059125   0.047348                                  
    2      0.044911   0.043605                                  
    3      0.053491   0.052291                                  
    4      0.051705   0.048001                                  
    5      0.047005   0.044597                                  
    6      0.045326   0.04334                                   
    7      0.05801    0.050531                                  
    8      0.05764    0.053919                                  
    9      0.05823    0.049168                                  
    10     0.050083   0.047223                                  
    11     0.050047   0.045092                                  
    12     0.050075   0.044003                                  
    13     0.048338   0.043474                                  
    14     0.043119   0.043152                           

epoch      trn_loss   val_loss                                  
    0      0.047963   0.043416  
    1      0.042645   0.043207                                  
    2      0.043045   0.042689                                  
    3      0.043217   0.042471                                  
    4      0.041595   0.042513                                  
    5      0.037113   0.042658                                  

----------
Fold 8 ....


epoch      trn_loss   val_loss                                  
    0      0.049908   0.04927   
    1      0.058203   0.053482                                  
    2      0.045737   0.045254                                  
    3      0.054174   0.052643                                  
    4      0.048635   0.049117                                  
    5      0.045412   0.046131                                  
    6      0.042726   0.044782                                  
    7      0.058585   0.054006                                  
    8      0.058236   0.052397                                  
    9      0.05258    0.051033                                  
    10     0.051816   0.048607                                  
    11     0.048049   0.046738                                  
    12     0.051307   0.045303                                  
    13     0.047728   0.044081                                  
    14     0.041358   0.044005                           

epoch      trn_loss   val_loss                                  
    0      0.045647   0.044474  
    1      0.042817   0.044228                                  
    2      0.040915   0.045599                                  
    3      0.041809   0.043419                                  
    4      0.040474   0.043428                                  
    5      0.037941   0.043692                                  

----------
Fold 9 ....


epoch      trn_loss   val_loss                                  
    0      0.049396   0.049165  
    1      0.05688    0.050416                                  
    2      0.047768   0.045109                                  
    3      0.058394   0.054652                                  
    4      0.051842   0.048766                                  
    5      0.04766    0.044876                                  
    6      0.046309   0.043491                                  
    7      0.059311   0.051079                                  
    8      0.056766   0.053325                                  
    9      0.060314   0.050524                                  
    10     0.052698   0.0494                                    
    11     0.050007   0.046307                                  
    12     0.048753   0.044859                                  
    13     0.046021   0.043258                                  
    14     0.041682   0.043055                           

epoch      trn_loss   val_loss                                  
    0      0.046627   0.044108  
    1      0.044489   0.042747                                  
    2      0.044756   0.042827                                  
    3      0.043293   0.042333                                  
    4      0.042247   0.042378                                  
    5      0.037156   0.042477                                  



In [125]:
cv_dfs = []
for i in range(n_folds):
    df = pd.read_csv(f'{PATH}/submissions/lstm_subm_20180317_cv_{i}.csv')
    cv_dfs.append(df)
    
cv_dfs.append(pd.read_csv(f'{PATH}/submissions/lstm_subm_20180317.csv'))
    
final_cv_df = pd.concat([ df for df in cv_dfs ])

display(len(final_cv_df))
display(final_cv_df.head(2))

1684804

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.998808,0.53841,0.986663,0.125191,0.959094,0.386088
1,0000247867823ef7,0.0016,7.7e-05,0.000564,0.000135,0.000491,0.000188


In [126]:
final_cv_df = final_cv_df.groupby(['id']).mean().reset_index()
final_cv_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996908,0.410974,0.975145,0.092718,0.940464,0.253616
1,0000247867823ef7,0.002657,0.000136,0.001047,0.00012,0.00059,0.000182
2,00013b17ad220c46,0.035074,0.001136,0.012181,0.000598,0.005512,0.001199
3,00017563c3f7919a,0.001018,5.7e-05,0.000456,6.7e-05,0.000361,6.8e-05
4,00017695ad8997eb,0.00717,0.000335,0.002264,0.00055,0.001165,0.000365


In [127]:
final_cv_df.to_csv(f'{PATH}/submissions/lstm_subm_20180317_cv_final.csv', index=False)

Ensemble a bunch of different model predictions into the grand daddy of all submissions

In [73]:
cv_dfs = []
for i in range(n_folds):
    df = pd.read_csv(f'{PATH}/submissions/lstm_subm_20180315_cv_{i}.csv')
    df2 = pd.read_csv(f'{PATH}/submissions/subm_20180315_cv_{i}.csv')
    cv_dfs.append(df)
    cv_dfs.append(df2)
    
final_cv_df = pd.concat([ df for df in cv_dfs ])

display(len(final_cv_df))
display(final_cv_df.head(2))

3063280

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.996635,0.486019,0.980172,0.184098,0.950054,0.295634
1,0000247867823ef7,0.003302,0.000202,0.001431,0.000136,0.000833,0.000203


In [74]:
final_cv_df = final_cv_df.groupby(['id']).mean().reset_index()
final_cv_df.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.948321,0.389054,0.923985,0.09108,0.882614,0.260359
1,0000247867823ef7,0.004504,0.000229,0.002165,8.3e-05,0.001435,0.000301
2,00013b17ad220c46,0.030084,0.002457,0.015929,0.000577,0.009565,0.001025
3,00017563c3f7919a,0.009609,0.000786,0.005316,0.000637,0.005454,0.001003
4,00017695ad8997eb,0.011514,0.000454,0.003311,0.000428,0.002181,0.000706


In [75]:
final_cv_df.to_csv(f'{PATH}/submissions/subm_gru+lstm_final.csv', index=False)