## Load Data

In [1]:
import os

import pandas as pd

In [40]:
import numpy as np
from torch import optim

In [3]:
PATH ='data/'

In [4]:
train = pd.read_csv('data/kaggle/train.csv')
test = pd.read_csv('data/kaggle/test.csv')
sample = pd.read_csv('data/kaggle/sample_submission.csv')

In [5]:
# Per-column mean of the numeric (label) columns, computed through the public
# pandas API. For 0/1 labels this is the share of positive examples.
train.mean(numeric_only=True)

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

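The label classes in the training set are highly imbalanced (for example, only about 0.3% of comments are labelled `threat`), so we need to account for this when training and evaluating the model.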

## Data processing

### CSV File Preprocessing

In [None]:
# Replace the newlines inside comments with spaces so that torchtext can parse the CSV files properly

In [None]:
train['comment_text']=train.comment_text.str.replace('\n',' ')

In [None]:
idx = np.arange(train.shape[0])

In [None]:
np.random.seed(999)

In [None]:
np.random.shuffle(idx)

In [None]:
val_size = int(len(idx)*0.2)

In [None]:
% mkdir cache

In [None]:
train.loc[idx[val_size:],:].to_csv('cache/train.csv',index=False)

In [None]:
train.loc[idx[:val_size],:].to_csv('cache/validation.csv',index=False)

In [None]:
test['comment_text']=test['comment_text'].str.replace('\n',' ')

In [None]:
test.to_csv('cache/test.csv',index=False)

### Tokenization

In [6]:
import re
import spacy
# spaCy pipeline; only its tokenizer is used by the code below.
NLP = spacy.load('en')
# Comments longer than this many characters are truncated before tokenisation.
MAX_CHARS = 10000
def tokenizer(comment):
    """Clean a raw comment and split it into a list of token strings.

    Steps, applied in this order:
      1. every character listed in the first pattern is replaced by a space;
      2. runs of spaces are collapsed to a single space;
      3. runs of '!', ',' and '?' are each collapsed to a single character;
      4. the text is cut to at most MAX_CHARS characters;
      5. the text is tokenised with NLP.tokenizer and tokens whose text is
         empty are dropped.

    `comment` is converted with str() first, so non-string values are accepted.
    """
    substitutions = (
        (r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!\<\>;]", " "),
        (r"[ ]+", " "),
        # NOTE(review): '!' is part of the first character class, so it has
        # already been replaced by a space when this rule runs.
        (r"\!+", "!"),
        (r"\,+", ","),
        (r"\?+", "?"),
    )
    cleaned = str(comment)
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    # Slicing leaves strings that are already short enough unchanged.
    cleaned = cleaned[:MAX_CHARS]
    tokens = []
    for token in NLP.tokenizer(cleaned):
        if token.text != '':
            tokens.append(token.text)
    return tokens

### Load the dataset
Here we will load data from cache files and process all the data.

In [7]:
import torch
from torchtext import data

In [8]:
from torchtext import *

In [9]:
# Options for the comment field.
# NOTE(review): `vectors` is not referenced by any other cell visible in this notebook.
fix_length=100; lower=False; vectors=True

In [10]:
# Text field for the comment column: tokenised with the custom `tokenizer`,
# configured with fix_length / pad_first, and emitted as CUDA LongTensors
# (so a GPU is required to build batches).
comment = data.Field(
    sequential=True,
    fix_length=fix_length,
    tokenize=tokenizer,
    pad_first=True,
    tensor_type=torch.cuda.LongTensor,
    lower=lower
)

In [240]:
# Field shared by all label columns: no vocabulary and non-sequential, i.e. the
# raw column values are used directly, emitted as CUDA FloatTensors.
labels=data.Field(
use_vocab=False, sequential=False,
                tensor_type=torch.cuda.FloatTensor)

In [241]:
# Load the cached train/validation CSV files.
# When `fields` is a list, TabularDataset assigns its entries to the CSV columns
# by position, not by header name, so every column must be listed in file order.
# The cached files contain: id, comment_text, toxic, severe_toxic, obscene,
# threat, insult, identity_hate. Leaving `toxic` out would shift every label
# attribute onto the previous column's values.
# Note: this rebinds `train` from the pandas DataFrame to a torchtext dataset.
train,val = data.TabularDataset.splits(
    path = 'cache/',format='csv',skip_header=True,
    train = 'train.csv',validation='validation.csv',
    fields = [
        ('id',None),
        ('comment_text',comment),
        ('toxic', labels),
        ('severe_toxic', labels),
        ('obscene', labels),
        ('threat', labels),
        ('insult', labels),
        ('identity_hate', labels)
    ]
)

In [242]:
# Load the cached test CSV. Only the comment text is mapped to a field; the id
# column is skipped (its field is None) and there are no label columns.
# Note: this rebinds `test` from the pandas DataFrame to a torchtext dataset.
test = data.TabularDataset(
    path='cache/test.csv', format='csv', 
    skip_header=True,
    fields=[
        ('id', None),
        ('comment_text', comment)
    ])

### Build vocabulary

In [243]:
# Build the vocabulary of the comment field from the train, validation and test
# text together, limited by max_size=10000 and min_freq=50.
comment.build_vocab(train,val,test,max_size=10000,min_freq=50)

In [244]:
# Quick sanity check of the tokenizer + vocabulary on a toy sentence.
ss= "just a test and try"

In [245]:
# numericalize is given a batch, so wrap the single token list in a list.
s = [tokenizer(ss)]

In [246]:
# Map the tokens to vocabulary indices (result shown in the output below).
comment.numericalize(s)

Variable containing:
  59
   9
 865
   8
 328
[torch.cuda.LongTensor of size 5x1 (GPU 0)]

## Create Batches and Iterate Through dataset

In [247]:
#comment.build_vocab(train,val,test,max_size=10000,min_freq=50,vectors="glove.6B.100d")

In [248]:
# Alternatively, we can build the vocabulary without pretrained vectors and generate the embeddings randomly

In [249]:
# Training iterator: shuffled batches of 64 examples, a single pass per
# iteration (repeat=False), placed on device 0.
dataset_iter = data.Iterator(train,batch_size=64,device=0,train=True,shuffle=True,repeat=False, sort=False)

In [250]:
# Walk through one full pass to check that batches can be assembled.
# x and y are overwritten on every iteration, so after the loop they hold the
# last batch only; later cells reuse them.
for examples in dataset_iter:
    x=examples.comment_text
    # Stack the five per-label tensors along dim=1 into one target tensor.
    y = torch.stack([
        examples.severe_toxic, 
        examples.obscene,
        examples.threat, examples.insult, 
        examples.identity_hate
    ], dim=1)

  """Entry point for launching an IPython kernel.


In [None]:
# Test iterator: batch_size=len(test) puts the whole test set into a single
# batch, unshuffled, on device 0.
test_iter = data.Iterator(test,batch_size=len(test),device=0,train=False,shuffle=False,repeat=False, sort=False)

In [None]:
# Overwrites x with the test comments (the single batch produced above).
for examples in test_iter:
    x=examples.comment_text

## Test the data before the LSTM

## Train Model : Try LSTM First

In [None]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Sentiment(nn.Module):
    """Single-layer LSTM classifier that outputs one sigmoid probability per label.

    Token indices are embedded, fed through an LSTM, and the LSTM output of the
    last time step is mapped by a linear layer to `label_size` values which are
    squashed with a sigmoid.

    Args:
        embedding_dim: size of each token embedding (LSTM input size).
        hidden_dim: size of the LSTM hidden state.
        vocab_size: number of rows in the embedding table.
        label_size: number of output values per sequence.
        use_gpu: when true, initial hidden states are created with .cuda().
        batch_size: batch size used to shape the initial hidden state.
        dropout: stored on the instance. NOTE(review): no dropout is applied
            anywhere in this class.
    """

    def __init__(self,embedding_dim,hidden_dim,vocab_size,label_size,use_gpu,batch_size,dropout=0.5):
        super(LSTM_Sentiment,self).__init__()
        self.embeddings = nn.Embedding(vocab_size,embedding_dim)
        self.hidden_dim = hidden_dim
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.dropout= dropout
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim,label_size)
        self.hidden = self.init_hidden()

    def _zero_state(self):
        """Return one all-zero state of shape (1, batch_size, hidden_dim)."""
        state = torch.zeros(1,self.batch_size,self.hidden_dim)
        if self.use_gpu:
            state = state.cuda()
        return Variable(state)

    def init_hidden(self):
        """Return a fresh (h_0, c_0) pair of zero states for the LSTM."""
        return (self._zero_state(), self._zero_state())

    def forward(self,sentence):
        """Return sigmoid probabilities of shape (batch, label_size).

        `sentence` holds token indices with the time steps along the first
        dimension. The hidden state stored on the instance is used as the
        initial LSTM state and is replaced by the final state afterwards.
        """
        seq_len = len(sentence)
        # Elements per time step == number of sequences in this batch.
        batch = sentence.numel() // seq_len if seq_len else self.batch_size
        # Re-create the hidden state when it does not fit the incoming batch
        # (e.g. a smaller final batch); otherwise the reshape below or the LSTM
        # call would fail.
        if batch != self.batch_size or self.hidden[0].size(1) != batch:
            self.batch_size = batch
            self.hidden = self.init_hidden()
        x = self.embeddings(sentence).view(seq_len,batch,-1)
        lstm_out,self.hidden = self.lstm(x,self.hidden)
        # Classify from the LSTM output of the last time step.
        y = self.hidden2label(lstm_out[-1])
        # torch.sigmoid is the non-deprecated equivalent of F.sigmoid.
        probs = torch.sigmoid(y)
        return probs

In [None]:
def get_accuracy(truth, pred):
    """Return the fraction of positions at which `truth` and `pred` agree.

    Both sequences must have the same length (enforced with an assert).
    Elements are compared position by position with ==.
    """
    assert len(truth) == len(pred)
    total = len(truth)
    matches = sum(
        1 for position in range(total) if truth[position] == pred[position]
    )
    return matches/total

In [None]:
# Model hyper-parameters.
embedding_dim=100
hidden_dim=50
# One embedding row per entry of the vocabulary built for the comment field.
vocab_size = len(comment.vocab)
# Five outputs, matching the five label tensors stacked into y.
label_size = 5

In [None]:
# batch_size=64 matches the batch size of dataset_iter.
model=LSTM_Sentiment(embedding_dim=embedding_dim,hidden_dim=hidden_dim,vocab_size=vocab_size,label_size=label_size,use_gpu=True,batch_size=64)

In [None]:
# Move the model parameters to the GPU.
model.cuda()

In [None]:
# Accumulator for the target rows seen so far.
truth=[]

In [None]:
# Appends the rows of y, i.e. of the last batch left over from the batching loop.
truth += list(y.data)

In [None]:
# # word2vector
# word_to_idx = text_field.vocab.stoi
# pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
# pretrained_embeddings[0] = 0
# word2vec = load_bin_vec('./data/GoogleNews-vectors-negative300.bin', word_to_idx)
# for word, vector in word2vec.items():
#     pretrained_embeddings[word_to_idx[word]-1] = vector
# model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))


In [None]:
# Randomly initialise the embedding table with values drawn uniformly from
# [-0.25, 0.25). The width comes from embedding_dim rather than a literal so
# that it always matches the nn.Embedding created by the model.
pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(comment.vocab), embedding_dim))
# copy_ writes the values into the existing weight tensor in place.
model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))

In [None]:
truth += list(y.data)
model.batch_size = len(y.data)
model.hidden = model.init_hidden()
# forward model

In [None]:
pred = model(x)
# compute loss, gradient and update parameters by calling

In [None]:
loss_function=nn.BCELoss()

In [None]:
loss = loss_function(pred,y)
avg_loss+=loss.data[0]
model.zero_grad()    
loss.backward()
optimizer.step()

In [50]:
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [None]:
for epoch in range(1):
    # Per-epoch accumulators, reset at the start of every epoch.
    avg_loss = 0.0
    truth = []
    predicted = []
    for examples in dataset_iter:
        # Inputs and targets of the *current* batch.
        x = examples.comment_text
        y = torch.stack([
            examples.severe_toxic,
            examples.obscene,
            examples.threat, examples.insult,
            examples.identity_hate
        ], dim=1)
        # Batches may differ in size, so size the hidden state per batch and
        # start every batch from a fresh zero state.
        model.batch_size = len(y.data)
        model.hidden = model.init_hidden()
        # forward model
        pred = model(x)
        # compute loss, gradient and update parameters
        loss = loss_function(pred, y)
        avg_loss += loss.data[0]
        model.zero_grad()
        loss.backward()
        optimizer.step()
        # Record one entry per (example, label) pair. Probabilities are turned
        # into 0/1 decisions with a 0.5 threshold so they can be compared with
        # the targets by get_accuracy.
        truth += y.data.cpu().view(-1).tolist()
        predicted += (pred.data > 0.5).float().cpu().view(-1).tolist()

    # Mean loss per batch over the epoch.
    avg_loss /= len(dataset_iter)
    acc = get_accuracy(truth, predicted)
    print (avg_loss)