In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

## Imports and configuration

In [2]:
import pdb
import os

import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

from pandas_summary import DataFrameSummary

from fastai.model import *
from fastai.dataset import *
from fastai.lm_rnn import *
from fastai.sgdr import *
from fastai.rnn_reg import EmbeddingDropout, WeightDrop, LockedDropout
from fastai.torch_imports import *

import torchtext
from torchtext import vocab, data

from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import spacy
from spacy.lang.en.stop_words import STOP_WORDS as spacy_STOPWORDS
spacy_en = spacy.load('en')

from wordcloud import WordCloud, STOPWORDS

# pandas and plotting config
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_colwidth', -1)

In [3]:
PATH = 'data'

os.makedirs(f'{PATH}/models', exist_ok=True)
os.makedirs(f'{PATH}/tmp', exist_ok=True)
os.makedirs(f'{PATH}/submissions', exist_ok=True)

Load data and define labels (ordering is important for competition submission!)

*Note: We are also adding a "None" column

## Preprocess data

Clean comments using techniques from other kernels

In [4]:
raw_train_df = pd.read_csv(f'{PATH}/train.csv')
test_df = pd.read_csv(f'{PATH}/test.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

In [5]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

In [6]:
raw_train_df.comment_text.fillna("<na>", inplace=True)
test_df.comment_text.fillna("<na>", inplace=True)

In [7]:
repl = {
    "&lt;3": " good ",
    ":d": " good ",
    ":dd": " good ",
    ":p": " good ",
    "8)": " good ",
    ":-)": " good ",
    ":)": " good ",
    ";)": " good ",
    "(-:": " good ",
    "(:": " good ",
    "yay!": " good ",
    "yay": " good ",
    "yaay": " good ",
    "yaaay": " good ",
    "yaaaay": " good ",
    "yaaaaay": " good ",
    ":/": " bad ",
    ":&gt;": " sad ",
    ":')": " sad ",
    ":-(": " bad ",
    ":(": " bad ",
    ":s": " bad ",
    ":-s": " bad ",
    "&lt;3": " heart ",
    ":d": " smile ",
    ":p": " smile ",
    ":dd": " smile ",
    "8)": " smile ",
    ":-)": " smile ",
    ":)": " smile ",
    ";)": " smile ",
    "(-:": " smile ",
    "(:": " smile ",
    ":/": " worry ",
    ":&gt;": " angry ",
    ":')": " sad ",
    ":-(": " sad ",
    ":(": " sad ",
    ":s": " sad ",
    ":-s": " sad ",
    r"\br\b": "are",
    r"\bu\b": "you",
    r"\bhaha\b": "ha",
    r"\bhahaha\b": "ha",
    r"\bdon't\b": "do not",
    r"\bdoesn't\b": "does not",
    r"\bdidn't\b": "did not",
    r"\bhasn't\b": "has not",
    r"\bhaven't\b": "have not",
    r"\bhadn't\b": "had not",
    r"\bwon't\b": "will not",
    r"\bwouldn't\b": "would not",
    r"\bcan't\b": "can not",
    r"\bcannot\b": "can not",
    r"\bi'm\b": "i am",
    "m": "am",
    "r": "are",
    "u": "you",
    "haha": "ha",
    "hahaha": "ha",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "haven't": "have not",
    "hadn't": "had not",
    "won't": "will not",
    "wouldn't": "would not",
    "can't": "can not",
    "cannot": "can not",
    "i'm": "i am",
    "m": "am",
    "i'll" : "i will",
    "its" : "it is",
    "it's" : "it is",
    "'s" : " is",
    "that's" : "that is",
    "weren't" : "were not",
}

#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Aphost lookup dict
appos = {
    "aren't" : "are not",
    "can't" : "cannot",
    "couldn't" : "could not",
    "didn't" : "did not",
    "doesn't" : "does not",
    "don't" : "do not",
    "hadn't" : "had not",
    "hasn't" : "has not",
    "haven't" : "have not",
    "he'd" : "he would",
    "he'll" : "he will",
    "he's" : "he is",
    "i'd" : "I would",
    "i'd" : "I had",
    "i'll" : "I will",
    "i'm" : "I am",
    "isn't" : "is not",
    "it's" : "it is",
    "it'll":"it will",
    "i've" : "I have",
    "let's" : "let us",
    "mightn't" : "might not",
    "mustn't" : "must not",
    "shan't" : "shall not",
    "she'd" : "she would",
    "she'll" : "she will",
    "she's" : "she is",
    "shouldn't" : "should not",
    "that's" : "that is",
    "there's" : "there is",
    "they'd" : "they would",
    "they'll" : "they will",
    "they're" : "they are",
    "they've" : "they have",
    "we'd" : "we would",
    "we're" : "we are",
    "weren't" : "were not",
    "we've" : "we have",
    "what'll" : "what will",
    "what're" : "what are",
    "what's" : "what is",
    "what've" : "what have",
    "where's" : "where is",
    "who'd" : "who would",
    "who'll" : "who will",
    "who're" : "who are",
    "who's" : "who is",
    "who've" : "who have",
    "won't" : "will not",
    "wouldn't" : "would not",
    "you'd" : "you would",
    "you'll" : "you will",
    "you're" : "you are",
    "you've" : "you have",
    "'re": " are",
    "wasn't": "was not",
    "we'll":" will",
    "didn't": "did not",
    "tryin'": "trying"
}

repl = { **appos, **repl }  # repl becomes a merged dictionary with values from repl replacing those from appos

# display(repl)

In [8]:
repl_keys = [i for i in repl.keys()]

def clean(comment):
    # convert to lower case , so that Hi and hi are the same
    comment = comment.lower()
    
    # remove \n 
    # torchtext cannot read the .csv files correctly if there are newline characters, so replace with " "
    comment = re.sub("\\n"," ",comment)
    
    # remove leaky elements like ip,user
    comment = re.sub("\d{1,3}.\d{1,3}.\d{1,3}.\d{1,3}"," ",comment)
    
    # removing usernames
    comment = re.sub("\[\[.*\]","",comment)
    
    # do any substitutions
    comment = " ".join([ repl[w] if (w in repl_keys) else w for w in comment.split() ])
    
    return(comment)

In [9]:
%time raw_train_df['comment_text_cleaned'] = raw_train_df.comment_text.apply(lambda x: clean(x))
print('train cleaned ...')

%time test_df['comment_text_cleaned'] = test_df.comment_text.apply(lambda x: clean(x))
print('test cleaned ...')

CPU times: user 19.3 s, sys: 64 ms, total: 19.4 s
Wall time: 19.4 s
train cleaned ...
CPU times: user 17.1 s, sys: 64 ms, total: 17.2 s
Wall time: 17.2 s
test cleaned ...


In [10]:
raw_train_df.to_csv(f'{PATH}/train_preproc.csv', index=None)
test_df.to_csv(f'{PATH}/test_preproc.csv', index=None)

Used the preprocessed datasets for training and evaluation

In [11]:
raw_train_df = pd.read_csv(f'{PATH}/train_preproc.csv')
test_df = pd.read_csv(f'{PATH}/test_preproc.csv')
sample_subm_df = pd.read_csv(f'{PATH}/sample_submission.csv')

txt_col = 'comment_text_cleaned'

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
raw_train_df['none'] = 1 - raw_train_df[label_cols].max(axis=1)

model_cols = ['id', txt_col] + label_cols + ['none']

In [12]:
n_folds = 10
n_trn = len(raw_train_df)
n_examples_per_fold = n_trn // n_folds

# n_examples_per_fold

## Prepare Data

*Note: Only need to run 1x (or as desired to regenerate these .csv files)

Build cross-validation datasets

In [13]:
raw_train_df_rand = raw_train_df.sample(frac=1, random_state=9) # frac=1 = return all rows in random order

In [14]:
val_dfs = []

for i in range(0, n_folds):
    start = i * n_examples_per_fold
    end = n_examples_per_fold + start if (i + 1 < n_folds) else None
    val_dfs.append(raw_train_df_rand[start:end])
    
# [ print(idx,len(d)) for idx, d in enumerate(val_dfs) ]

In [15]:
trn_dfs = []

for idx, df in enumerate(val_dfs):
    trn_dfs.append(pd.concat([ val_df for val_idx, val_df in enumerate(val_dfs) if val_idx != idx]))
    
# [ print(idx,len(d)) for idx, d in enumerate(trn_dfs) ]

In [16]:
for idx, [trn_df, val_df] in enumerate(zip(trn_dfs, val_dfs)):
    print(idx, len(trn_df), len(val_df))
    
    trn_df[model_cols].to_csv(f'{PATH}/train_ds_{idx}_of_{n_folds}.csv', index=None)
    val_df[model_cols].to_csv(f'{PATH}/valid_ds_{idx}_of_{n_folds}.csv', index=None)

0 143614 15957
1 143614 15957
2 143614 15957
3 143614 15957
4 143614 15957
5 143614 15957
6 143614 15957
7 143614 15957
8 143614 15957
9 143613 15958


Use the below if you want to create a single training and cv dataset

In [17]:
# split the training data into a train and validatin dataset
trn, val = train_test_split(raw_train_df, test_size=0.05, random_state=9)
print(len(trn), len(val), len(trn[trn.none != 1]), len(val[val.none != 1]))

# save train, val, and test datasets for torchtext
trn[model_cols].to_csv(f'{PATH}/train_ds.csv', index=None)
val[model_cols].to_csv(f'{PATH}/valid_ds.csv', index=None)

# save full cleaned datasets (train+valid and test) as well
raw_train_df[model_cols].to_csv(f'{PATH}/full_train_ds.csv', index=None)
test_df[['id', txt_col]].to_csv(f'{PATH}/test_ds.csv', index=None)

151592 7979 15417 808


In [18]:
display(pd.read_csv("data/full_train_ds.csv").head(2))
display(pd.read_csv("data/test_ds.csv").head(2))

Unnamed: 0,id,comment_text_cleaned,toxic,severe_toxic,obscene,threat,insult,identity_hate,none
0,0000997932d777bf,"explanation why the edits made under my username hardcore metallica fan were reverted? they were not vandalisms, just closure on some gas after i voted at new york dolls fac. and please do not remove the template from the talk page since i am retired now.",0,0,0,0,0,0,1
1,000103f0d9cfb60f,"d'aww! he matches this background colour i am seemingly stuck with. thanks. (talk) 21:51, january 11, 2016 (utc)",0,0,0,0,0,0,1


Unnamed: 0,id,comment_text_cleaned
0,00001cee341fdb12,"yo bitch ja rule is more succesful then you will ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,"
1,0000247867823ef7,"== from rfc == the title is fine as it is, imo."


## Build Datasets and DataLoaders

Define hyperparameters and column that holds text data

In [19]:
max_features = 100000 #30000
min_freq = 10 #0
max_len = 175 #100

pretrained_vectors = None #'fasttext.en.300d'

batch_sizes = (64,64,64)

Configure how we are going to process text and label fields

In [20]:
import re, string

re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [21]:
TEXT_fld = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=max_len)
LABEL_fld = data.Field(sequential=False, use_vocab=False, tensor_type=torch.cuda.ByteTensor)

There are various built-in Datasets in torchtext that handle common use cases. **For csv/tsv files, the TabularDataset class** is convenient. Here’s how we would read data from a csv file using the TabularDataset:

In [22]:
%%time

csv.field_size_limit(sys.maxsize)

# train/validation
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    (txt_col, TEXT_fld),
                    ("toxic", LABEL_fld), ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld), ("identity_hate", LABEL_fld), ("none", None)]

train_ds, valid_ds = data.TabularDataset.splits(PATH, train='train_ds.csv', validation='valid_ds.csv',
                                          format='csv', skip_header=True, fields=train_datafields)

# test
test_datafields = [("id", None), (txt_col, TEXT_fld)]
test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)

# train+val
full_train_ds = data.TabularDataset(f'{PATH}/full_train_ds.csv', 
                                    format='csv', skip_header=True, fields=train_datafields)

CPU times: user 42.3 s, sys: 1.44 s, total: 43.7 s
Wall time: 43.7 s


In [23]:
display(train_ds[0])
display(train_ds[0].__dict__.keys())
display(train_ds[1].comment_text_cleaned[:5])

<torchtext.data.example.Example at 0x7faaa62d9d30>

dict_keys(['comment_text_cleaned', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

['you', 'cunt']

Build vocab on *full* training data

In [24]:
TEXT_fld.build_vocab(full_train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)

In [25]:
# The vocab.freqs is a collections.Counter object, so we can take a look at the most frequent words.
TEXT_fld.vocab.freqs.most_common(10)

[('.', 645337),
 ('the', 496748),
 (',', 473218),
 ('"', 392156),
 ('to', 297318),
 ('i', 240305),
 ('of', 224837),
 ('and', 224115),
 ('is', 222448),
 ('you', 221861)]

In [26]:
train_iter, val_iter = data.BucketIterator.splits(
    (train_ds, valid_ds), # we pass in the datasets we want the iterator to draw data from
    batch_sizes=(batch_sizes[0], batch_sizes[1]),
    device=0, #-1 if CPU else GPU number if you want to use the GPU
    sort_key=lambda x: len(x.comment_text_cleaned), # the BucketIterator needs to be told what function it should use to group the data.
    sort_within_batch=False,
    repeat=False # we pass repeat=False because we want to wrap this Iterator layer.
)

In [27]:
batch = next(train_iter.__iter__()); 

display(batch)
display(batch.__dict__.keys())

<torchtext.data.batch.Batch at 0x7faaa796f898>

dict_keys(['batch_size', 'dataset', 'train', 'comment_text_cleaned', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'])

For the test set, we don't want the data to be shuffled. This is why we'll be using a standard Iterator.

In [28]:
test_iter = data.Iterator(test_ds, batch_size=batch_sizes[2], device=0, train=False, 
                          shuffle=False, sort=False, sort_within_batch=False, repeat=False)

In [29]:
class BatchWrapper:
    def __init__(self, dl, x_var, y_vars):
        self.dl, self.x_var, self.y_vars = dl, x_var, y_vars # we pass in the list of attributes for x and y
    
    def __iter__(self):
        for batch in self.dl:
            x = getattr(batch, self.x_var) # we assume only one input in this wrapper
            
            if self.y_vars is not None: # we will concatenate y into a single tensor
                y = torch.cat([ getattr(batch, feat).unsqueeze(1) for feat in self.y_vars ], dim=1).float()
            else:
                y = torch.zeros((1))

            yield (x, y)
    
    def __len__(self):
        return len(self.dl)

In [30]:
train_dl = BatchWrapper(train_iter, txt_col, label_cols)
valid_dl = BatchWrapper(val_iter, txt_col, label_cols)
test_dl = BatchWrapper(test_iter, txt_col, None)

Construct a fastai ModelData

In [31]:
md = ModelData(PATH, trn_dl=train_dl, val_dl=valid_dl, test_dl=test_dl)

In [32]:
x,y = next(iter(md.trn_dl))

In [33]:
x.size(), y.size()

(torch.Size([175, 64]), torch.Size([64, 6]))

## Define Models

Define a simple GRU and a simple LSTM

In [34]:
class SimpleGru(nn.Module):
    def __init__(self, vocab_sz, emb_sz=300, n_rnn_hidden=256, n_rnn_layers=1, bi_dir=True, out_sz=1, bsz=64,
                 dropout_rnn=0.3, dropout_after_emb=0.4, dropout_emb=0.1, wdrop=0.05):
        
        super().__init__() 
        
        self.bsz = bsz
               
        # configure embeddings layer
        self.dropout_emb = dropout_emb
        self.dropout_after_emb = LockedDropout(dropout_after_emb)
        
        self.emb = nn.Embedding(vocab_sz, emb_sz)
        self.emb.data = train_ds.fields[txt_col].vocab.vectors # to use the pretrained vectors
        self.emb_with_drop = EmbeddingDropout(self.emb)
        
        # configure rnns
        self.n_rnn_hidden, self.n_rnn_layers, self.n_dirs = n_rnn_hidden, n_rnn_layers, 2 if bi_dir else 1
        self.rnn = nn.GRU(emb_sz, self.n_rnn_hidden, self.n_rnn_layers, bidirectional=bi_dir, dropout=dropout_rnn)
        if wdrop: self.rnn = WeightDrop(self.rnn, wdrop)
      
        self.outp = nn.Linear(n_rnn_hidden * 2 * self.n_dirs, out_sz)
        
        # initialize weights
        kaiming_normal(self.outp.weight.data)
        
        # init hidden
        self.init_hidden(self.bsz)
    
    def forward(self, seq):
        bsz = seq.size(1)
        if (self.hidden[0].size(1) != bsz): self.init_hidden(bsz)
        
        x = self.emb_with_drop(seq, dropout=self.dropout_emb if self.training else 0)
        x = self.dropout_after_emb(x)
        
        output, h = self.rnn(x, self.hidden)        
        self.hidden = repackage_var(h)
        
        sl, bs, _ = output.size()
  
        avg_pool = F.adaptive_avg_pool1d(output.permute(1,2,0), (1,)).view(bs,-1)   
        max_pool = F.adaptive_max_pool1d(output.permute(1,2,0), (1,)).view(bs,-1) 
        
        x = torch.cat([avg_pool, max_pool], dim=1)
        outp = F.sigmoid(self.outp(x))
        
        return outp

    def init_hidden(self, bsz):
        self.hidden = V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden))

In [35]:
class SimpleLstm(nn.Module):
    def __init__(self, vocab_sz, emb_sz=300, n_rnn_hidden=256, n_rnn_layers=1, bi_dir=True, out_sz=1, bsz=64,
                 dropout_rnn=0.3, dropout_after_emb=0.4, dropout_emb=0.1, wdrop=0.05):
        
        super().__init__() 
        
        self.bsz = bsz
               
        # configure embeddings layer
        self.dropout_emb = dropout_emb
        self.dropout_after_emb = LockedDropout(dropout_after_emb)
        
        self.emb = nn.Embedding(vocab_sz, emb_sz)
        self.emb.data = train_ds.fields[txt_col].vocab.vectors # to use the pretrained vectors
        self.emb_with_drop = EmbeddingDropout(self.emb)
        
        # configure rnns
        self.n_rnn_hidden, self.n_rnn_layers, self.n_dirs = n_rnn_hidden, n_rnn_layers, 2 if bi_dir else 1
        self.rnn = nn.LSTM(emb_sz, self.n_rnn_hidden, self.n_rnn_layers, bidirectional=bi_dir, dropout=dropout_rnn)
        if wdrop: self.rnn = WeightDrop(self.rnn, wdrop)
      
        self.outp = nn.Linear(n_rnn_hidden * 2 * self.n_dirs, out_sz)
        
        # initialize weights
        kaiming_normal(self.outp.weight.data)
        
        # init hidden
        self.init_hidden(self.bsz)
    
    def forward(self, seq):
        bsz = seq.size(1)
        if (self.hidden[0].size(1) != bsz): self.init_hidden(bsz)
        
        x = self.emb_with_drop(seq, dropout=self.dropout_emb if self.training else 0)
        x = self.dropout_after_emb(x)
        
        output, h = self.rnn(x, self.hidden)        
        self.hidden = repackage_var(h)
        
        sl, bs, _ = output.size()
  
        avg_pool = F.adaptive_avg_pool1d(output.permute(1,2,0), (1,)).view(bs,-1)   
        max_pool = F.adaptive_max_pool1d(output.permute(1,2,0), (1,)).view(bs,-1) 
        
        x = torch.cat([avg_pool, max_pool], dim=1)
        outp = F.sigmoid(self.outp(x))
        
        return outp

    def init_hidden(self, bsz):
        self.hidden = (V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)),
                       V(torch.zeros(self.n_dirs * self.n_rnn_layers, bsz, self.n_rnn_hidden)))

Instantiate a `SimpleLstm` model, experimenting with various hyperparameters

In [36]:
vocab_sz = len(TEXT_fld.vocab)
emb_sz = 300
out_sz = 6

n_rnn_hidden = 128
n_rnn_layers = 1
bi_dir = True

model = SimpleLstm(vocab_sz, emb_sz, n_rnn_hidden, n_rnn_layers, True, out_sz, bsz=batch_sizes[0])
model.cuda()

SimpleLstm(
  (dropout_after_emb): LockedDropout(
  )
  (emb): Embedding(26970, 300)
  (emb_with_drop): EmbeddingDropout(
    (embed): Embedding(26970, 300)
  )
  (rnn): WeightDrop(
    (module): LSTM(300, 128, dropout=0.3, bidirectional=True)
  )
  (outp): Linear(in_features=512, out_features=6, bias=True)
)

## Train Model

Utilize fastai callbacks to add weight-decay and SGDR with restarts goodness.  You can experiment with other callbacks here as well

In [37]:
lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)

In [38]:
on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/lstm_fit_1_cyc_{cycle}')

In [39]:
cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2, on_cycle_end=on_end)]

Utilize the fastai `fit()` method to train the model.  This is our training/validation loop

In [None]:
fit(model, md, 2**4-1, lo.opt, F.binary_cross_entropy, callbacks=cb)

 16%|█▌        | 381/2369 [00:40<03:28,  9.51it/s, loss=0.0631]

In [None]:
model.load_state_dict(torch.load(f'{PATH}/models/lstm_fit_1_cyc_3'))

In [None]:
on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/lstm_fit_2_cyc_{cycle}')

lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)
cb = [CosAnneal(lo, (len(md.trn_dl) * 20), on_cycle_end=on_end)]

fit(model, md, 20, lo.opt, F.binary_cross_entropy, callbacks=cb)

## Predictions

In [42]:
preds = predict(model, test_dl)
preds.shape

  result = self.forward(*input, **kwargs)
  


(153164, 6)

## Prepare submission

In [43]:
subm_df = pd.read_csv("data/test.csv")

for i, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
    subm_df[col] = preds[:, i]

subm_df.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,"Yo bitch Ja Rule is more succesful then you'll ever be whats up with you and hating you sad mofuckas...i should bitch slap ur pethedic white faces and get you to kiss my ass you guys sicken me. Ja rule is about pride in da music man. dont diss that shit on him. and nothin is wrong bein like tupac he was a brother too...fuckin white boys get things right next time.,",0.991044,0.360929,0.968881,0.031345,0.943214,0.163108
1,0000247867823ef7,"== From RfC == \n\n The title is fine as it is, IMO.",0.007682,0.00056,0.002847,0.000853,0.002909,0.001279


In [44]:
# if you want to write the submission file to disk, uncomment and run the below code
subm_df.drop(['comment_text'], axis=1).to_csv(f'{PATH}/submissions/lstm_subm_001.csv', index=False)

## K-Fold Cross Validation

We can perform a k-fold cross validation by following the same steps on the base training/validation datasets, except this time we use the CV training/validation datasets we created up top

In [None]:
train_datafields = [("id", None), # we won't be needing the id, so we pass in None as the field
                    (txt_col, TEXT_fld),
                    ("toxic", LABEL_fld), ("severe_toxic", LABEL_fld), ("obscene", LABEL_fld),
                    ("threat", LABEL_fld), ("insult", LABEL_fld), ("identity_hate", LABEL_fld), ("none", None)]

# test
test_datafields = [("id", None), (txt_col, TEXT_fld)]

# define test dataset and iterator
test_ds = data.TabularDataset(f'{PATH}/test_ds.csv', format='csv', skip_header=True, fields=test_datafields)
test_iter = data.Iterator(test_ds, batch_size=batch_sizes[2], device=0, train=False, 
                          shuffle=False, sort=False, sort_within_batch=False, repeat=False)
test_dl = BatchWrapper(test_iter, txt_col, None)

# define FULL train dataset for building vocab
full_train_ds = data.TabularDataset(f'{PATH}/full_train_ds.csv', 
                                    format='csv', skip_header=True, fields=train_datafields)

TEXT_fld.build_vocab(full_train_ds, min_freq=min_freq, max_size=max_features, vectors=pretrained_vectors)

# cv
for i in range(n_folds):
    print('-' * 10)
    print(f'Fold {i} ....')
    
    # train/validation datsets
    train_ds, valid_ds = data.TabularDataset.splits(PATH, 
                                                    train=f'train_ds_{i}_of_{n_folds}.csv', 
                                                    validation=f'valid_ds_{i}_of_{n_folds}.csv',
                                                    format='csv', skip_header=True, fields=train_datafields)

    # train/validation iterators/dataloaders
    train_iter, val_iter = data.BucketIterator.splits(
        (train_ds, valid_ds), # we pass in the datasets we want the iterator to draw data from
        batch_sizes=(batch_sizes[0], batch_sizes[1]),
        device=0, # if you want to use the GPU, specify the GPU number here
        sort_key=lambda x: len(x.comment_text_cleaned), # the BucketIterator needs to be told what function it should use to group the data.
        sort_within_batch=False,
        repeat=False) # we pass repeat=False because we want to wrap this Iterator layer.
        
    train_dl = BatchWrapper(train_iter, txt_col, label_cols)
    valid_dl = BatchWrapper(val_iter, txt_col, label_cols)
        
    md = ModelData(PATH, trn_dl=train_dl, val_dl=valid_dl, test_dl=test_dl)
    
    model = SimpleLstm(vocab_sz, emb_sz, n_rnn_hidden, n_rnn_layers, True, out_sz, bsz=batch_sizes[0])    
    model.cuda()
    
    lo = LayerOptimizer(optim.Adam, model, 1e-2, 1e-5)
    on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/fit_1_cv{i}_cyc_{cycle}')
    cb = [CosAnneal(lo, len(md.trn_dl), cycle_mult=2)] #, on_cycle_end=on_end)]
    fit(model, md, 2**4-1, lo.opt, F.binary_cross_entropy, callbacks=cb)
        
    on_end = lambda sched, cycle: save_model(model, f'{PATH}/models/fit_2_cv{i}_cyc_{cycle}')
    lo = LayerOptimizer(optim.Adam, model, 1e-3, 1e-5)
    cb = [CosAnneal(lo, (len(md.trn_dl) * 6))] #, on_cycle_end=on_end)]
    fit(model, md, 6, lo.opt, F.binary_cross_entropy, callbacks=cb)
        
    preds = predict(model, test_dl)
        
    subm_df = pd.read_csv("data/test.csv")
    for lbl_idx, col in enumerate(["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]):
        subm_df[col] = preds[:, lbl_idx]
        
    # if you want to write the submission file to disk, uncomment and run the below code
    subm_df.drop(['comment_text'], axis=1).to_csv(f'{PATH}/submissions/subm_lstm_cv_{i}_001.csv', index=False)



In [None]:
cv_dfs = []
for i in range(n_folds):
    df = pd.read_csv(f'{PATH}/submissions/subm_lstm_cv_{i}_001.csv')
    cv_dfs.append(df)
    
final_cv_df = pd.concat([ df for df in cv_dfs ])

display(len(final_cv_df))
display(final_cv_df.head(2))

In [None]:
final_cv_df = final_cv_df.groupby(['id']).mean().reset_index()
final_cv_df.head()

In [None]:
final_cv_df.to_csv(f'{PATH}/submissions/subm_lstm_cv_final_001.csv', index=False)