In [34]:
import numpy as np
import pandas as pd

In [35]:
from datasets import *
from models import *

In [36]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
# Re-running this cell in a live kernel only prints the
# "already loaded" notice; it is otherwise harmless.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
# Load the raw training table.
# Forward slashes keep this relative path valid on both Windows and POSIX;
# the previous backslash-separated form only resolved on Windows.
df = pd.read_csv('./data/train.csv', sep=',')

In [38]:
# Preview the first five rows of the raw table.
df.head(n=5)

Unnamed: 0,id,target,comment_text,severe_toxicity,obscene,identity_attack,insult,threat,asian,atheist,...,article_id,rating,funny,wow,sad,likes,disagree,sexual_explicit,identity_annotator_count,toxicity_annotator_count
0,59848,0.0,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
1,59849,0.0,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
2,59852,0.0,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
3,59855,0.0,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,,,...,2006,rejected,0,0,0,0,0,0.0,0,4
4,59856,0.893617,haha you guys are a bunch of losers.,0.021277,0.0,0.021277,0.87234,0.0,0.0,0.0,...,2006,rejected,0,0,0,1,0,0.0,4,47


In [39]:
# Column names that are later split off from the frame as training targets.
target_columns = [
    'target',
    'severe_toxicity',
    'obscene',
    'identity_attack',
    'insult',
    'threat',
]

# Positional slice of the column index: skips the first 8 and the last 13 columns.
# NOTE(review): presumably this isolates the identity-group columns (the slice
# contains 'asian') — confirm against the CSV schema, since it silently shifts
# if the column order ever changes.
groups = df.columns[slice(8, -13)]

In [40]:
# Report how many rows have no value in the 'asian' column versus how many do.
# The column is indexed directly: going through `df[groups]` first would copy
# every group column of the full table only to discard all but one of them.
nan_count = df['asian'].isna().sum()
length = len(df)
print('NANs: ', nan_count)
print('data rows: ', length)
print('non-NANs: ', length - nan_count)

NANs:  1399744
data rows:  1804874
non-NANs:  405130


In [41]:
# Sanity check: every column in `groups` must contain the same number of NaNs.
# Each column is read once, straight from `df`; the previous loop rebuilt the
# whole `df[groups]` sub-frame on every iteration just to look at one column.
nan_counts = {group: df[group].isna().sum() for group in groups}

# `s` keeps its earlier meaning: the NaN count of the first group column,
# or None when `groups` is empty.
s = next(iter(nan_counts.values()), None)

# NOTE(review): equal counts do not prove the NaNs sit in the same rows; the
# split further down relies on the first group column only.
assert len(set(nan_counts.values())) <= 1, 'Amount of NANs doesn\'t match'

In [42]:
# Rows where the first `groups` column is NaN go to the network set; the
# remaining rows go to the boosting set.
nan_mask = df.loc[:, groups[0]].isna()

df_train_boost = df.loc[~nan_mask]
df_train_net = df.loc[nan_mask]

print('Net training size: ', len(df_train_net))
print('Boosting training size: ', len(df_train_boost))

Net training size:  1399744
Boosting training size:  405130


In [43]:
# Build the network's targets and inputs straight from `df` and `nan_mask`
# instead of repeatedly overwriting `df_train_net`:
#   * the cell is now idempotent — previously a second run raised KeyError,
#     because the columns to drop were already gone;
#   * the intermediate drops were redundant, since only 'id' and 'comment_text'
#     survive the final selection anyway;
#   * the old code passed the `target_net` DataFrame itself as drop labels,
#     which only worked because iterating a DataFrame yields its column names.
# The resulting frames hold the same rows and columns as before.
target_net = df.loc[nan_mask, target_columns]
df_train_net = df.loc[nan_mask, ['id', 'comment_text']]

In [60]:
# Put the id/text columns and the target columns side by side again and
# write the result to disk without the index column.
# NOTE(review): the path uses Windows-style separators; it has to stay in sync
# with the path handed to the loader cell.
train_df = pd.concat([df_train_net, target_net], axis=1)
train_df.to_csv('.\\data\\train_set.csv', sep=',', index=False)

In [67]:
# Preview the first ten rows of the assembled training frame.
train_df.head(n=10)

Unnamed: 0,id,comment_text,target,severe_toxicity,obscene,identity_attack,insult,threat
0,59848,"This is so cool. It's like, 'would you want yo...",0.0,0.0,0.0,0.0,0.0,0.0
1,59849,Thank you!! This would make my life a lot less...,0.0,0.0,0.0,0.0,0.0,0.0
2,59852,This is such an urgent design problem; kudos t...,0.0,0.0,0.0,0.0,0.0,0.0
3,59855,Is this something I'll be able to install on m...,0.0,0.0,0.0,0.0,0.0,0.0
5,59859,ur a sh*tty comment.,0.666667,0.047619,0.638095,0.0,0.333333,0.0
6,59861,hahahahahahahahhha suck it.,0.457627,0.050847,0.305085,0.0,0.254237,0.0
7,59863,FFFFUUUUUUUUUUUUUUU,0.0,0.0,0.0,0.0,0.0,0.0
8,239575,The ranchers seem motivated by mostly by greed...,0.0,0.0,0.0,0.0,0.0,0.0
9,239576,It was a great show. Not a combo I'd of expect...,0.0,0.0,0.0,0.0,0.0,0.0
10,239578,"Wow, that sounds great.",0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Embedding configuration handed to the data loader.
# 300 agrees with the "300d" in the vector file's name.
emb_size = 300
# Relative, Windows-style path to the embedding vectors file.
emb_path = "..\\Dasha\\embeddings\\crawl-300d-2M.vec"

In [64]:
# Construct the project's DataLoader from the saved training CSV and the
# embedding settings. Per its printed log, construction loads the embeddings,
# loads and edits the text, and turns the text into index sequences.
dataset = DataLoader(
    path='.\\data\\train_set.csv',
    embeddings_path=emb_path,
    embeddings_size=emb_size,
    maxlen=1000000,
)

Loading embeddings from ..\Dasha\embeddings\crawl-300d-2M.vec...
Done.
Loading text...
Index(['id', 'comment_text', 'target', 'severe_toxicity', 'obscene',
       'identity_attack', 'insult', 'threat'],
      dtype='object')
Done.
Perfoming text editing...
Done.
Transforming text into a sequence of indices...
Done.


In [69]:
# Settings passed to the model constructor.
num_layers = 2
hidden_size = 128
num_classes = 6  # presumably one output per entry of target_columns (also six) — confirm

# Settings used by the training call.
num_epochs = 3
batch_size = 128

# NOTE(review): not referenced by any cell visible in this part of the notebook.
learning_rate = 0.0005

In [71]:
# Instantiate the project's predictor from the embedding size, the layer
# settings and the embedding data exposed by the dataset (`dataset.emb`).
model = nnPredictor(
    emb_size,
    hidden_size,
    num_layers,
    num_classes,
    dataset.emb,
)

In [None]:
# Run training for `num_epochs` epochs, with `verbose_step` set to
# 100 * batch_size.
# NOTE(review): the saved log advances in steps of 6400, not 128 * 100 = 12800 —
# the output may come from a run with a different batch_size; confirm.
model.train(
    dataset,
    num_epochs,
    verbose_step=100 * batch_size,
)

Epoch [1 / 3], Step [  6400 / 1399744], Average loss: 0.1135
Epoch [1 / 3], Step [ 12800 / 1399744], Average loss: 0.1247
Epoch [1 / 3], Step [ 19200 / 1399744], Average loss: 0.1285
Epoch [1 / 3], Step [ 25600 / 1399744], Average loss: 0.1305
Epoch [1 / 3], Step [ 32000 / 1399744], Average loss: 0.1317
Epoch [1 / 3], Step [ 38400 / 1399744], Average loss: 0.1215
Epoch [1 / 3], Step [ 44800 / 1399744], Average loss: 0.1297
Epoch [1 / 3], Step [ 51200 / 1399744], Average loss: 0.1213
Epoch [1 / 3], Step [ 57600 / 1399744], Average loss: 0.1287
Epoch [1 / 3], Step [ 64000 / 1399744], Average loss: 0.1287
Epoch [1 / 3], Step [ 70400 / 1399744], Average loss: 0.1335
Epoch [1 / 3], Step [ 76800 / 1399744], Average loss: 0.1323
Epoch [1 / 3], Step [ 83200 / 1399744], Average loss: 0.1370
Epoch [1 / 3], Step [ 89600 / 1399744], Average loss: 0.1278
Epoch [1 / 3], Step [ 96000 / 1399744], Average loss: 0.1312
Epoch [1 / 3], Step [102400 / 1399744], Average loss: 0.1451
Epoch [1 / 3], Step [108