## Load Data

In [2]:
import pandas as pd

In [3]:
import numpy as np
from torch import optim

In [4]:
PATH ='data/'

In [5]:
train = pd.read_csv('data/kaggle/train.csv')
test = pd.read_csv('data/kaggle/test.csv')
sample = pd.read_csv('data/kaggle/sample_submission.csv')

In [6]:
train._get_numeric_data().mean()

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

The training class is highly unbalanced, need to pay attention to this.

## Data processing

### CSV File Preprocessing

In [None]:
# Need to remvoe the space to make sure we are able to make the torchtext working properly

In [None]:
train['comment_text']=train.comment_text.str.replace('\n',' ')

In [None]:
idx = np.arange(train.shape[0])

In [None]:
np.random.seed(999)

In [None]:
np.random.shuffle(idx)

In [None]:
val_size = int(len(idx)*0.2)

In [None]:
% mkdir cache

In [None]:
train.loc[idx[val_size:],:].to_csv('cache/train.csv',index=False)

In [None]:
train.loc[idx[:val_size],:].to_csv('cache/validation.csv',index=False)

In [None]:
test['comment_text']=test['comment_text'].str.replace('\n',' ')

In [None]:
test.to_csv('cache/test.csv',index=False)

### Tokenization

In [7]:
import re
import spacy
NLP = spacy.load('en')
MAX_CHARS = 10000
def tokenizer(comment):
    comment = re.sub(
    r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!\<\>;]", " ", 
    str(comment))
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\!+", "!", comment)
    comment = re.sub(r"\,+", ",", comment)
    comment = re.sub(r"\?+", "?", comment)
    if (len(comment)>MAX_CHARS):
        comment = comment[:MAX_CHARS]
    return[
        x.text for x in NLP.tokenizer(comment) if x.text!='']

### Load the dataset
Here we will load data from cache files and process all the data.

In [8]:
import torch
from torchtext import data

In [9]:
from torchtext import *

In [10]:
fix_length=100; lower=False; vectors=True

In [11]:
comment = data.Field(
    sequential=True,
    fix_length=fix_length,
    tokenize=tokenizer,
    pad_first=True,
    tensor_type=torch.cuda.LongTensor,
    lower=lower
)

In [12]:
labels=data.Field(
use_vocab=False, sequential=False,
                tensor_type=torch.cuda.FloatTensor)

In [13]:
train,val = data.TabularDataset.splits(
    path = 'cache/',format='csv',skip_header=True,
    train = 'train.csv',validation='validation.csv',
    fields = [
        ('id',None),
        ('comment_text',comment),
        ('severe_toxic', labels),
        ('obscene', labels),
        ('threat', labels),
        ('insult', labels),
        ('identity_hate', labels)
    ]
)

In [14]:
test = data.TabularDataset(
    path='cache/test.csv', format='csv', 
    skip_header=True,
    fields=[
        ('id', None),
        ('comment_text', comment)
    ])

### Build vocabulary

In [15]:
comment.build_vocab(train,val,test,max_size=10000,min_freq=50)

In [16]:
fields = [
('id',None),
('comment_text',comment),
('severe_toxic', labels),
('obscene', labels),
('threat', labels),
('insult', labels),
('identity_hate', labels)
]

In [17]:
test_new=data.Dataset(test.examples, fields[:2])

In [18]:
ss= "just a test and try"

In [19]:
s = [tokenizer(ss)]

In [20]:
comment.numericalize(s)

Variable containing:
  59
   9
 865
   8
 328
[torch.cuda.LongTensor of size 5x1 (GPU 0)]

## Create Batches and Iterate Through dataset

In [21]:
#comment.build_vocab(train,val,test,max_size=10000,min_freq=50,vectors="glove.6B.100d")

In [22]:
# We can also randomly generate embedding and build the vocabulary

In [35]:
dataset_iter = data.Iterator(train,batch_size=64,device=0,train=True,shuffle=True,repeat=False, sort=False)

In [17]:
for examples in dataset_iter:
    x=examples.comment_text
    y = torch.stack([
        examples.severe_toxic, 
        examples.obscene,
        examples.threat, examples.insult, 
        examples.identity_hate
    ], dim=1)

  """Entry point for launching an IPython kernel.


## Test the data before the LSTM

## Train Model : Try LSTM First

In [23]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Sentiment(nn.Module):
    
    def __init__(self,embedding_dim,hidden_dim,vocab_size,label_size,use_gpu,batch_size,dropout=0.5):
        super(LSTM_Sentiment,self).__init__()
        self.embeddings = nn.Embedding(vocab_size,embedding_dim)
        self.hidden_dim = hidden_dim
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.dropout= dropout
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim,label_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        ## Two layer of hidden, first is for the input x, second is for the hidden state
        if self.use_gpu:
            return (Variable(torch.zeros(1,self.batch_size,self.hidden_dim).cuda()),
                    Variable(torch.zeros(1,self.batch_size,self.hidden_dim).cuda()))
        else:
            return (Variable(torch.zeros(1,self.batch_size,self.hidden_dim)),
                    Variable(torch.zeros(1,self.batch_size,self.hidden_dim)))
                    
    def forward(self,sentence):
        x = self.embeddings(sentence).view(len(sentence),self.batch_size,-1)
        lstm_out,self.hidden = self.lstm(x,self.hidden)
        y = self.hidden2label(lstm_out[-1])
        probs = F.sigmoid(y)
        return probs

In [24]:
def get_accuracy(truth, pred):
    assert len(truth) == len(pred)
    right = 0
    for i in range(len(truth)):
        if truth[i] == pred[i]:
            right +=1
    return right/len(truth)

In [25]:
embedding_dim=50
hidden_dim=50
vocab_size = len(comment.vocab)
label_size = 5

In [26]:
model=LSTM_Sentiment(embedding_dim=embedding_dim,hidden_dim=hidden_dim,vocab_size=vocab_size,label_size=label_size,use_gpu=True,batch_size=64)

In [27]:
model.cuda()

LSTM_Sentiment(
  (embeddings): Embedding(10002, 50)
  (lstm): LSTM(50, 50)
  (hidden2label): Linear(in_features=50, out_features=5)
)

In [28]:
truth=[]

In [29]:
# # word2vector
# word_to_idx = text_field.vocab.stoi
# pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
# pretrained_embeddings[0] = 0
# word2vec = load_bin_vec('./data/GoogleNews-vectors-negative300.bin', word_to_idx)
# for word, vector in word2vec.items():
#     pretrained_embeddings[word_to_idx[word]-1] = vector
# model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))


In [30]:
pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(comment.vocab), 50))
model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))


 4.4860e-02  5.1725e-02 -1.7067e-01  ...  -1.1518e-01  1.7021e-01 -2.1969e-01
-2.4033e-02 -1.7740e-01 -1.6071e-01  ...   3.5111e-02  3.5394e-02  2.1418e-01
-2.3380e-02  5.4535e-02  1.8442e-01  ...   1.4856e-01  2.4051e-01 -3.0324e-02
                ...                   ⋱                   ...                
 2.1329e-01  2.4778e-02 -2.3154e-01  ...  -8.3672e-02  1.8862e-01 -7.5230e-02
-2.3654e-01  1.4688e-02  1.3987e-03  ...   1.1681e-01  1.7809e-01  1.6504e-01
-1.9808e-01 -1.7477e-02  1.6744e-01  ...  -7.5114e-02  8.2674e-02  7.1842e-03
[torch.cuda.FloatTensor of size 10002x50 (GPU 0)]

In [31]:
loss_function=nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [36]:
for epoch in range(5):
    model.train()
    avg_loss = 0.0
    truth_res = []
    pred_res = []
    count = 0
    for examples in dataset_iter:
        # Get all the batch files
        x=examples.comment_text
        y = torch.stack([
        examples.severe_toxic, 
        examples.obscene,
        examples.threat, examples.insult, 
        examples.identity_hate
        ], dim=1)
        # label.data.sub_(1) not sure about this function 
        #truth += list(y.data)
        model.batch_size = len(y.data)
        model.hidden = model.init_hidden()
        # forward model
        pred = model(x)
        #predict += list(pred)
        # compute loss, gradient and update parameters by calling
        loss = loss_function(pred,y)
        avg_loss+=loss.data[0]
        model.zero_grad()    
        loss.backward()
        optimizer.step()
        
    avg_loss /=len(dataset_iter)
    #acc = get_accuracy(truth,pred)
    print (avg_loss)

  import sys


0.10487604629070986
0.060547062164559996
0.0526934437325369
0.048503817299749695
0.04509613451624948


In [37]:
model.eval()

LSTM_Sentiment(
  (embeddings): Embedding(10002, 50)
  (lstm): LSTM(50, 50)
  (hidden2label): Linear(in_features=50, out_features=5)
)

## Model Prediction

In [38]:
predict= []

In [50]:
x

Variable containing:
    1     1     1  ...      1   169     1
   17     1     1  ...      1    11     1
    7     1     1  ...      1     2     1
       ...          ⋱          ...       
 1869    66  6782  ...    358     3    32
    2   520   495  ...    250    22    16
    0     2     2  ...      2  2138     2
[torch.cuda.LongTensor of size 100x41 (GPU 0)]

In [49]:
x_test

Variable containing:
 5765
  577
    0
 8671
   10
   67
    0
   98
   11
  165
  373
   19
   54
  293
   82
   26
   11
    8
 6868
   11
 1738
    0
   75
   89
   60
  577
 6082
 1835
    0
  793
 6972
    8
  108
   11
    5
 5783
   40
  494
   11
  804
    0
   44
    2
    0
  757
   10
   42
 5379
   13
 1449
  803
  470
    2
   27
  297
    0
   12
  354
   18
  164
    2
    8
    0
   10
  267
    0
   51
    0
   64
   30
    9
 2397
  170
   75
 2898
  793
 3118
  108
  239
  166
  551
   96
    2
    3
[torch.cuda.LongTensor of size 84x1 (GPU 0)]

In [51]:
x_test

Variable containing:
 5765
  577
    0
 8671
   10
   67
    0
   98
   11
  165
  373
   19
   54
  293
   82
   26
   11
    8
 6868
   11
 1738
    0
   75
   89
   60
  577
 6082
 1835
    0
  793
 6972
    8
  108
   11
    5
 5783
   40
  494
   11
  804
    0
   44
    2
    0
  757
   10
   42
 5379
   13
 1449
  803
  470
    2
   27
  297
    0
   12
  354
   18
  164
    2
    8
    0
   10
  267
    0
   51
    0
   64
   30
    9
 2397
  170
   75
 2898
  793
 3118
  108
  239
  166
  551
   96
    2
    3
[torch.cuda.LongTensor of size 84x1 (GPU 0)]

In [53]:
model.batch_size=41

In [None]:
# need to make some changes on the input data shape to take care of the batch size issue
# or I can also create a batch for test data, but this is to be done.

In [54]:
# Here are just pseudo code and might not run out of box
for test in test_new:
    x_test=comment.numericalize([test.comment_text])
    predict+=list(model(x))

RuntimeError: cuda runtime error (2) : out of memory at /opt/conda/conda-bld/pytorch_1512387374934/work/torch/lib/THC/generic/THCStorage.cu:58