## Load Data

In [1]:
import pandas as pd

In [2]:
import numpy as np
from torch import optim

In [3]:
PATH ='data/'

In [4]:
train = pd.read_csv('data/kaggle/train.csv')
test = pd.read_csv('data/kaggle/test.csv')
sample = pd.read_csv('data/kaggle/sample_submission.csv')

In [5]:
train._get_numeric_data().mean()

toxic            0.095844
severe_toxic     0.009996
obscene          0.052948
threat           0.002996
insult           0.049364
identity_hate    0.008805
dtype: float64

The training class is highly unbalanced, need to pay attention to this.

## Data processing

### CSV File Preprocessing

In [None]:
# Need to remvoe the space to make sure we are able to make the torchtext working properly

In [None]:
train['comment_text']=train.comment_text.str.replace('\n',' ')

In [None]:
idx = np.arange(train.shape[0])

In [None]:
np.random.seed(999)

In [None]:
np.random.shuffle(idx)

In [None]:
val_size = int(len(idx)*0.2)

In [None]:
% mkdir cache

In [None]:
train.loc[idx[val_size:],:].to_csv('cache/train.csv',index=False)

In [None]:
train.loc[idx[:val_size],:].to_csv('cache/validation.csv',index=False)

In [None]:
test['comment_text']=test['comment_text'].str.replace('\n',' ')

In [None]:
test.to_csv('cache/test.csv',index=False)

### Tokenization

In [6]:
import re
import spacy
NLP = spacy.load('en')
MAX_CHARS = 10000
def tokenizer(comment):
    comment = re.sub(
    r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!\<\>;]", " ", 
    str(comment))
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\!+", "!", comment)
    comment = re.sub(r"\,+", ",", comment)
    comment = re.sub(r"\?+", "?", comment)
    if (len(comment)>MAX_CHARS):
        comment = comment[:MAX_CHARS]
    return[
        x.text for x in NLP.tokenizer(comment) if x.text!='']

### Load the dataset
Here we will load data from cache files and process all the data.

In [7]:
import torch
from torchtext import data

In [8]:
from torchtext import *

In [9]:
fix_length=100; lower=False; vectors=True

In [10]:
comment = data.Field(
    sequential=True,
    fix_length=fix_length,
    tokenize=tokenizer,
    pad_first=True,
    tensor_type=torch.cuda.LongTensor,
    lower=lower
)

In [11]:
labels=data.Field(
use_vocab=False, sequential=False,
                tensor_type=torch.cuda.FloatTensor)

In [12]:
train,val = data.TabularDataset.splits(
    path = 'cache/',format='csv',skip_header=True,
    train = 'train.csv',validation='validation.csv',
    fields = [
        ('id',None),
        ('comment_text',comment),
        ('severe_toxic', labels),
        ('obscene', labels),
        ('threat', labels),
        ('insult', labels),
        ('identity_hate', labels)
    ]
)

In [13]:
test = data.TabularDataset(
    path='cache/test.csv', format='csv', 
    skip_header=True,
    fields=[
        ('id', None),
        ('comment_text', comment)
    ])

### Build vocabulary

In [14]:
comment.build_vocab(train,val,test,max_size=10000,min_freq=50)

In [15]:
fields = [
('id',None),
('comment_text',comment),
('severe_toxic', labels),
('obscene', labels),
('threat', labels),
('insult', labels),
('identity_hate', labels)
]

In [None]:
test_new=data.Dataset(test.examples, fields[:2])

In [None]:
ss= "just a test and try"

In [None]:
s = [tokenizer(ss)]

In [None]:
comment.numericalize(s)

## Create Batches and Iterate Through dataset

In [None]:
#comment.build_vocab(train,val,test,max_size=10000,min_freq=50,vectors="glove.6B.100d")

In [None]:
# We can also randomly generate embedding and build the vocabulary

In [16]:
dataset_iter = data.Iterator(train,batch_size=64,device=0,train=True,shuffle=True,repeat=False, sort=False)

In [17]:
for examples in dataset_iter:
    x=examples.comment_text
    y = torch.stack([
        examples.severe_toxic, 
        examples.obscene,
        examples.threat, examples.insult, 
        examples.identity_hate
    ], dim=1)

  """Entry point for launching an IPython kernel.


## Test the data before the LSTM

## Train Model : Try LSTM First

In [18]:
import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Sentiment(nn.Module):
    
    def __init__(self,embedding_dim,hidden_dim,vocab_size,label_size,use_gpu,batch_size,dropout=0.5):
        super(LSTM_Sentiment,self).__init__()
        self.embeddings = nn.Embedding(vocab_size,embedding_dim)
        self.hidden_dim = hidden_dim
        self.use_gpu = use_gpu
        self.batch_size = batch_size
        self.dropout= dropout
        self.lstm = nn.LSTM(input_size=embedding_dim,hidden_size=hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim,label_size)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        ## Two layer of hidden, first is for the input x, second is for the hidden state
        if self.use_gpu:
            return (Variable(torch.zeros(1,self.batch_size,self.hidden_dim).cuda()),
                    Variable(torch.zeros(1,self.batch_size,self.hidden_dim).cuda()))
        else:
            return (Variable(torch.zeros(1,self.batch_size,self.hidden_dim)),
                    Variable(torch.zeros(1,self.batch_size,self.hidden_dim)))
                    
    def forward(self,sentence):
        x = self.embeddings(sentence).view(len(sentence),self.batch_size,-1)
        lstm_out,self.hidden = self.lstm(x,self.hidden)
        y = self.hidden2label(lstm_out[-1])
        probs = F.sigmoid(y)
        return probs

In [19]:
def get_accuracy(truth, pred):
    assert len(truth) == len(pred)
    right = 0
    for i in range(len(truth)):
        if truth[i] == pred[i]:
            right +=1
    return right/len(truth)

In [20]:
embedding_dim=50
hidden_dim=50
vocab_size = len(comment.vocab)
label_size = 5

In [21]:
model=LSTM_Sentiment(embedding_dim=embedding_dim,hidden_dim=hidden_dim,vocab_size=vocab_size,label_size=label_size,use_gpu=True,batch_size=64)

In [22]:
model.cuda()

LSTM_Sentiment(
  (embeddings): Embedding(10002, 50)
  (lstm): LSTM(50, 50)
  (hidden2label): Linear(in_features=50, out_features=5)
)

In [23]:
truth=[]

In [24]:
# # word2vector
# word_to_idx = text_field.vocab.stoi
# pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(text_field.vocab), 300))
# pretrained_embeddings[0] = 0
# word2vec = load_bin_vec('./data/GoogleNews-vectors-negative300.bin', word_to_idx)
# for word, vector in word2vec.items():
#     pretrained_embeddings[word_to_idx[word]-1] = vector
# model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))


In [25]:
pretrained_embeddings = np.random.uniform(-0.25, 0.25, (len(comment.vocab), 50))
model.embeddings.weight.data.copy_(torch.from_numpy(pretrained_embeddings))


-9.6057e-02  1.2684e-01 -7.0991e-02  ...   2.8762e-02 -2.0994e-01 -1.8405e-01
 9.6733e-02  1.7260e-01  1.4135e-01  ...  -1.7217e-02 -1.3952e-01  2.3984e-01
-1.4805e-01 -1.0835e-01  1.3594e-01  ...   1.5753e-01  8.4067e-02  4.6908e-02
                ...                   ⋱                   ...                
 9.2111e-02 -4.0407e-02  1.4952e-01  ...  -4.7937e-02  8.5156e-02  8.0800e-02
-2.4582e-02 -1.1616e-01  1.3713e-01  ...  -1.8092e-01 -1.6539e-02 -8.0761e-02
-9.3745e-03  7.6960e-02 -1.7984e-01  ...  -4.4111e-02 -1.0815e-01 -2.6773e-02
[torch.cuda.FloatTensor of size 10002x50 (GPU 0)]

In [26]:
loss_function=nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [27]:
truth=[]
predict=[]
avg_loss=0
acc=0

In [30]:
for epoch in range(5):
    model.train()
    avg_loss = 0.0
    truth_res = []
    pred_res = []
    count = 0
    for examples in dataset_iter:
        # Get all the batch files
        x=examples.comment_text
        y = torch.stack([
        examples.severe_toxic, 
        examples.obscene,
        examples.threat, examples.insult, 
        examples.identity_hate
        ], dim=1)
        # label.data.sub_(1) not sure about this function 
        #truth += list(y.data)
        model.batch_size = len(y.data)
        model.hidden = model.init_hidden()
        # forward model
        pred = model(x)
        #predict += list(pred)
        # compute loss, gradient and update parameters by calling
        loss = loss_function(pred,y)
        avg_loss+=loss.data[0]
        model.zero_grad()    
        loss.backward()
        optimizer.step()
        
    avg_loss /=len(dataset_iter)
    #acc = get_accuracy(truth,pred)
    print (avg_loss)

  import sys


0.049137196319649246
0.045885174009216045
0.04336156057115682
0.0404809182330051
0.037662031253178914


In [33]:
model.eval()

LSTM_Sentiment(
  (embeddings): Embedding(10002, 50)
  (lstm): LSTM(50, 50)
  (hidden2label): Linear(in_features=50, out_features=5)
)

## Model Prediction

In [None]:
predict= []

In [None]:
for tmp in test.examples:
    print(tmp.comment_text)

['Yo', 'bitch', 'Ja', 'Rule', 'is', 'more', 'succesful', 'then', 'you', "'ll", 'ever', 'be', 'what', 's', 'up', 'with', 'you', 'and', 'hating', 'you', 'sad', 'mofuckas', '...', 'i', 'should', 'bitch', 'slap', 'ur', 'pethedic', 'white', 'faces', 'and', 'get', 'you', 'to', 'kiss', 'my', 'ass', 'you', 'guys', 'sicken', 'me', '.', 'Ja', 'rule', 'is', 'about', 'pride', 'in', 'da', 'music', 'man', '.', 'do', 'nt', 'diss', 'that', 'shit', 'on', 'him', '.', 'and', 'nothin', 'is', 'wrong', 'bein', 'like', 'tupac', 'he', 'was', 'a', 'brother', 'too', '...', 'fuckin', 'white', 'boys', 'get', 'things', 'right', 'next', 'time', '.', ',']
[' ', 'From', 'RfC', 'The', 'title', 'is', 'fine', 'as', 'it', 'is', ',', 'IMO', '.']
[' ', 'Sources', 'Zawe', 'Ashton', 'on', 'Lapland', '—']
[' ', 'If', 'you', 'have', 'a', 'look', 'back', 'at', 'the', 'source', ',', 'the', 'information', 'I', 'updated', 'was', 'the', 'correct', 'form', '.', 'I', 'can', 'only', 'guess', 'the', 'source', 'had', "n't", 'updated', '

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[' ', 'There', 'was', 'no', 'genocide', 'of', 'the', 'old', 'Yishuv', 'the', 'very', 'few', 'isolated', 'cases', 'you', 'brought', 'up', 'were', 'in', 'reference', 'to', 'only', 'two', 'cities', 'in', 'Palestine', 'Safed', 'and', 'Tiberias', 'and', 'in', 'every', 'case', 'only', 'a', 'small', 'amount', 'of', 'people', 'were', 'ever', 'killed', 'and', 'in', 'many', 'cases', 'the', 'articles', 'on', 'this', 'dealt', 'either', 'how', 'the', 'event', 'had', 'been', 'greatly', 'exaggerated', 'by', 'later', 'Zionist', 'hasbara', 'propaganda', 'or', 'how', 'many', 'of', 'the', 'people', 'immigrant', 'Jews', 'from', 'Europe', 'who', 'had', 'left', 'soon', 'returned', 'to', 'communities', 'in', 'Safed', 'and', 'Tiberias', 'such', 'as', 'when', 'Daher', 'el', 'Omar', 'rebuild', 'different', 'damages', 'cities', 'after', 'periods', 'of', 'fighting', '.', 'But', 'again', 'what', 'you', 'would', 'bring', 'up', 'were', 'a', 'few', 'very', 'isolated', 'incidents', 'in', 'the', 'two', 'cities', 'of', 

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




['Okay', 'people', ',', 'time', 'for', 'another', 'rant', '.', 'We', 'as', 'an', 'American', 'society', 'have', 'grown', 'to', 'such', 'a', 'huge', 'population', ',', 'and', 'wide', 'range', 'of', 'diversity', ',', 'that', 'any', 'group', 'or', 'even', 'individual', 'study', ',', 'becomes', 'meaningless', '.', 'I', 'am', 'not', 'racist', 'by', 'any', 'means', ',', 'but', 'in', 'my', 'many', 'travels', ',', 'I', 'have', 'noticed', 'that', 'no', 'matter', 'where', 'you', 'go', ',', 'every', 'race', 'has', 'their', 'own', 'distinct', 'characteristics', 'on', 'the', 'whole', '.', 'Now', 'there', 'are', 'always', 'exceptions', 'to', 'this', ',', 'but', 'as', 'a', 'general', 'rule', 'that', 'would', 'cover', 'a', 'small', 'group', 'of', 'one', 'race', ',', 'they', 'all', 'have', 'their', 'distinct', 'differances', '.', 'Psychology', 'in', 'my', 'mind', 'is', 'a', 'waste', 'of', 'time', '.', 'If', 'you', 'want', 'to', 'understand', 'what', 'makes', 'us', 'tick', ',', 'put', 'down', 'the', 'B

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)




[' ', 'rfuck', 'yuo', 'you', 're', 'a', 'fucking', 'gay', 'homo', 'fag', 'obsessed', 'with', 'homoeroticism', 'you', 'fucking', 'pile', 'of', 'chodes', 'i', 'hate', 'yopu', 'star', 'wars', '2', 'came', 'out', 'in', '2001', 'and', 'that', 's', 'final', 'you', 'stupid', 'fucker⊊⊈∋∃∘∓⊕⊗']
[' ', '{', 'class', 'Talk', 'Notice', 'Did', 'you', 'know', '?', 'has', 'been', 'updated', '.', 'A', 'fact', 'from', 'the', 'article', 'The', 'American', "'s", 'Creed', ',', 'which', 'you', 'recently', 'created', ',', 'has', 'been', 'featured', 'in', 'that', 'section', 'on', 'the', 'Main', 'Page', '.', 'If', 'you', 'know', 'of', 'another', 'interesting', 'fact', 'from', 'a', 'recently', 'created', 'article', ',', 'then', 'please', 'suggest', 'it', 'on', 'the', 'Did', 'you', 'know', '?', 'talk', 'page', '.', '}']
['welcome', '.', 'I', 'want', 'your', 'friend', '.']
[' ', 'Fit', 'Totally', ',', 'well', 'I', 'would', ',', 'any', 'day', 'of', 'the', 'f', 'king', 'week', '.', 'I', 'know', 'your', 'interested

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)

