## Load Data

In [2]:
import pandas as pd

In [3]:
import numpy as np

In [4]:
# Base directory that holds the input data files.
PATH ='data/'

In [8]:
# Read the Kaggle CSVs relative to PATH so the data directory is configured
# in exactly one place.
train = pd.read_csv(PATH + 'kaggle/train.csv')
test = pd.read_csv(PATH + 'kaggle/test.csv')
sample = pd.read_csv(PATH + 'kaggle/sample_submission.csv')

In [9]:
# Mean of every numeric column; for the 0/1 label columns this is the share
# of positive examples. Uses the public `numeric_only` flag rather than the
# private `_get_numeric_data` helper.
train.mean(numeric_only=True)

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

The training classes are highly imbalanced (every label is positive in fewer than 10% of the rows), so we need to pay attention to this.

## Data processing

### CSV File Preprocessing

In [10]:
# Need to remove the newlines from the comments so that torchtext can read the CSV files properly

In [11]:
# Flatten every training comment onto one line by turning newlines into spaces.
train['comment_text'] = train['comment_text'].str.replace('\n', ' ')

In [14]:
# One integer position per training row; shuffled later to pick the split.
idx = np.arange(len(train))

In [16]:
# Fix NumPy's global RNG seed so the shuffle of `idx` is repeatable.
np.random.seed(999)

In [17]:
# Shuffle the row positions in place; the leading `val_size` entries are
# later written out as the validation split.
np.random.shuffle(idx)

In [19]:
# Hold out 20% of the rows (rounded down) for validation.
val_size = int(0.2 * len(idx))

In [22]:
# Create the cache directory in pure Python; `exist_ok=True` keeps the cell
# safe to re-run and avoids depending on a shell `mkdir`.
import os
os.makedirs('cache', exist_ok=True)

In [23]:
# Everything after the first `val_size` shuffled positions is the training
# split. `idx` holds row positions, so select with `.iloc` rather than the
# label-based `.loc`.
train.iloc[idx[val_size:],:].to_csv('cache/train.csv',index=False)

In [24]:
# The first `val_size` shuffled positions are the validation split. `idx`
# holds row positions, so select with `.iloc` rather than the label-based
# `.loc`.
train.iloc[idx[:val_size],:].to_csv('cache/validation.csv',index=False)

In [25]:
# Replace newlines in the test comments with spaces, mirroring the cleaning
# applied to the training comments.
test['comment_text']=test['comment_text'].str.replace('\n',' ')

In [26]:
# Write the cleaned test set to the cache directory without the DataFrame index.
test.to_csv('cache/test.csv',index=False)

### Tokenization

In [1]:
import re
import spacy
# spaCy pipeline loaded through the 'en' shortcut; only its tokenizer is
# used by `tokenizer` below.
NLP = spacy.load('en')
# Cleaned comments longer than this many characters are truncated before
# being tokenized.
MAX_CHARS = 10000
def tokenizer(comment):
    """Clean a raw comment and split it into a list of spaCy token strings.

    The input is converted with ``str()``, a fixed set of punctuation
    characters (and newlines) is replaced by spaces, runs of spaces, ``!``,
    ``,`` and ``?`` are collapsed to a single character, the text is cut to
    at most ``MAX_CHARS`` characters, and the result is tokenized with
    ``NLP.tokenizer``. Empty token strings are dropped.
    """
    text = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!\<\>;]", " ", str(comment))
    # Collapse repeated characters, in the same order as the substitutions
    # are listed here.
    for pattern, single in (
            (r"[ ]+", " "),
            (r"\!+", "!"),
            (r"\,+", ","),
            (r"\?+", "?")):
        text = re.sub(pattern, single, text)
    # Slicing is a no-op for texts that are already short enough.
    text = text[:MAX_CHARS]
    tokens = []
    for token in NLP.tokenizer(text):
        if token.text != '':
            tokens.append(token.text)
    return tokens

### Load the dataset
Here we will load data from cache files and process all the data.

In [5]:
import torch
from torchtext import data

In [6]:
from torchtext import *

In [7]:
??data.TabularDataset

In [8]:
# Settings for the text field: fixed sequence length, lower-casing flag and
# a flag for using pretrained vectors.
fix_length = 100
lower = False
vectors = True

In [9]:
# Field describing the comment text: tokenized with the custom `tokenizer`,
# fixed to `fix_length` tokens with padding placed first, and produced as
# CUDA LongTensors (so a GPU is required as written).
comment = data.Field(
    tokenize=tokenizer,
    sequential=True,
    lower=lower,
    fix_length=fix_length,
    pad_first=True,
    tensor_type=torch.cuda.LongTensor,
)

In [10]:
# Field shared by every label column: non-sequential values used as-is
# (no vocabulary), produced as CUDA ByteTensors.
labels = data.Field(
    sequential=False,
    use_vocab=False,
    tensor_type=torch.cuda.ByteTensor,
)

In [11]:
# Load the cached train/validation CSVs as torchtext datasets.
# A `fields` list is matched to the CSV columns by position, so every column
# has to be listed in file order. The column means printed earlier show the
# label columns in the order toxic, severe_toxic, obscene, threat, insult,
# identity_hate; leaving `toxic` out shifts every label by one column and
# drops `identity_hate`.
# NOTE(review): assumes `id` and `comment_text` are the first two columns of
# the CSV - confirm against the file header.
train,val = data.TabularDataset.splits(
    path = 'cache/',format='csv',skip_header=True,
    train = 'train.csv',validation='validation.csv',
    fields = [
        ('id',None),
        ('comment_text',comment),
        ('toxic', labels),
        ('severe_toxic', labels),
        ('obscene', labels),
        ('threat', labels),
        ('insult', labels),
        ('identity_hate', labels)
    ]
)

In [12]:
# Load the cached test CSV; the `id` column is skipped and there are no
# label columns to read.
test = data.TabularDataset(
    path='cache/test.csv',
    format='csv',
    skip_header=True,
    fields=[('id', None), ('comment_text', comment)],
)

### Build vocabulary

In [13]:
# Build the comment vocabulary from the train, validation and test datasets
# together, limited by max_size=10000 and min_freq=50.
comment.build_vocab(train,val,test,max_size=10000,min_freq=50)

In [18]:
# Short sample sentence for checking the tokenizer and the vocabulary.
ss= "just a test and try"

In [22]:
# Tokenize the sample and wrap it in a list so it is passed on as a
# one-element list of token lists.
s = [tokenizer(ss)]

In [23]:
# Map the sample tokens to vocabulary indices; the output below is a 5x1
# CUDA LongTensor, one index per token.
comment.numericalize(s)

Variable containing:
  59
   9
 865
   8
 328
[torch.cuda.LongTensor of size 5x1 (GPU 0)]

## Create Batches and Iterate Through dataset

In [115]:
# Rebuild the vocabulary with the same limits, this time requesting the
# pretrained "glove.6B.100d" vectors (the output below shows them being
# downloaded into .vector_cache).
comment.build_vocab(train,val,test,max_size=10000,min_freq=50,vectors="glove.6B.100d")

.vector_cache/glove.6B.zip: 862MB [02:27, 5.85MB/s]                             
100%|██████████| 400000/400000 [00:14<00:00, 27018.53it/s]


In [None]:
# Show how many entries the vocabulary ended up with. The dangling
# `comment.vocab.` left in this cell was a syntax error.
len(comment.vocab)

In [None]:
# We could also build the vocabulary with randomly initialised embeddings instead of pretrained vectors

In [None]:
# Shuffled, unsorted, non-repeating iterator over the training dataset in
# batches of 64, placed on device 0.
dataset_iter = data.Iterator(
    train,
    batch_size=64,
    device=0,
    train=True,
    sort=False,
    shuffle=True,
    repeat=False,
)

In [None]:
# Walk the training batches once: `x` is the comment tensor, `y` stacks the
# five label attributes along dim 1.
for examples in dataset_iter:
    x = examples.comment_text
    y = torch.stack(
        [getattr(examples, column) for column in (
            'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')],
        dim=1)

In [175]:
# Unshuffled, unsorted, non-repeating iterator that serves the whole test
# dataset as one batch (batch_size equals the dataset length) on device 0.
test_iter = data.Iterator(
    test,
    batch_size=len(test),
    device=0,
    train=False,
    sort=False,
    shuffle=False,
    repeat=False,
)

In [None]:
# Iterate over the test batches; only the comment tensor is read because the
# test dataset was built without label fields.
for examples in test_iter:
    x=examples.comment_text

## Test the data before the LSTM

## Train Model : Try LSTM First

In [7]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Sentiment(nn.Module):
    """Single-layer LSTM classifier over sequences of token ids.

    The forward pass embeds the token ids, runs them through one ``nn.LSTM``
    layer, feeds the LSTM output of the last time step to a linear layer and
    returns log-probabilities over ``label_size`` classes.

    Args:
        embedding_dim: Size of each token embedding (the LSTM input size).
        hidden_dim: Size of the LSTM hidden state.
        vocab_size: Number of rows in the embedding table.
        label_size: Number of output classes.
        use_gpu: If true, the initial hidden state is created on the GPU.
        batch_size: Batch size used to shape the hidden state and the
            embedded input. Callers must update ``self.batch_size`` and
            reset ``self.hidden`` via ``init_hidden()`` when it changes.
        dropout: Stored on the instance; not applied anywhere in this class.
    """

    def __init__(self,embedding_dim,hidden_dim,vocab_size,label_size,use_gpu,batch_size,dropout=0.5):
        super(LSTM_Sentiment,self).__init__()
        # Keep the integer size and the embedding table under separate names;
        # forward() looks the table up as `self.embeddings`.
        self.embedding_dim = embedding_dim
        self.embeddings = nn.Embedding(vocab_size,embedding_dim)
        self.hidden_dim = hidden_dim
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.dropout = dropout
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim,label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh all-zero ``(h_0, c_0)`` pair for the LSTM.

        Both tensors have shape ``(1, batch_size, hidden_dim)``: the first is
        the initial hidden state, the second the initial cell state. They are
        moved to the GPU when ``use_gpu`` is true.
        """
        def zero_state():
            state = torch.zeros(1,self.batch_size,self.hidden_dim)
            return Variable(state.cuda() if self.use_gpu else state)

        return (zero_state(), zero_state())

    def forward(self,sentence):
        """Return log-probabilities of shape ``(batch_size, label_size)``.

        Args:
            sentence: Token ids laid out as ``(seq_len, batch_size)``.
        """
        x = self.embeddings(sentence).view(len(sentence),self.batch_size,-1)
        lstm_out,self.hidden = self.lstm(x,self.hidden)
        # Classify from the LSTM output of the final time step only.
        y = self.hidden2label(lstm_out[-1])
        # `y` is (batch, label_size); normalise across the label dimension
        # explicitly instead of relying on the deprecated implicit dim.
        log_probs = F.log_softmax(y, dim=1)
        return log_probs

In [12]:
def get_accuracy(truth, pred):
    """Return the fraction of positions where ``truth`` and ``pred`` agree.

    Both sequences must have the same length (checked with ``assert``).
    """
    total = len(truth)
    assert total == len(pred)
    matches = sum(1 for expected, guessed in zip(truth, pred) if expected == guessed)
    return matches / total

In [9]:
# NOTE(review): `train_iter`, `model`, `loss_function` and `optimizer` are not
# defined in any visible cell (hence the NameError recorded below), and the
# datasets built above expose `comment_text` plus one attribute per label
# column rather than `text` / `label` - confirm where these come from before
# running this cell.
for epoch in range(1):
    # Per-epoch accumulators; they must exist before the `+=` updates below.
    avg_loss = 0.0
    truth = []
    preds = []
    for batch in train_iter:

        # Get all the batch files
        sent, label = batch.text,batch.label
        # label.data.sub_(1) not sure about this function
        truth += list(label.data)
        # The model shapes its hidden state from batch_size, so resize and
        # reset it for every batch.
        model.batch_size = len(label.data)
        model.hidden = model.init_hidden()
        # forward model
        pred = model(sent)
        # Predicted class = index of the largest log-probability; collected
        # for every batch so accuracy covers the whole epoch.
        preds += list(pred.data.max(1)[1])
        # compute loss, gradient and update parameters by calling
        loss = loss_function(pred,label)
        avg_loss+=loss.data[0]
        model.zero_grad()
        loss.backward()
        optimizer.step()

    avg_loss /=len(train_iter)
    # Compare like with like: one predicted class per collected truth label.
    acc = get_accuracy(truth,preds)
    print (avg_loss)

NameError: name 'train_iter' is not defined