# Sentiment Analysis

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import os
import re
import sys
import pickle
from glob import glob
from tqdm import tqdm_notebook

import nltk
import spacy
import logging

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import torchtext
import torchtext.datasets as datasets
from torchtext import data, vocab

from IPython.display import display

LOGGER = logging.getLogger("toxic_dataset")

ModuleNotFoundError: No module named 'torchtext'

Helper functions

In [2]:
def prepare_csv(train_csv, test_csv, split=0.2, seed=999):
    if not os.path.exists('data'):
        os.mkdir('data')
        
    # read train csv file
    df_train = pd.read_csv(train_csv)
    df_train["comment_text"] = df_train.comment_text.str.replace("\n", " ")
    
    # create validation data
    idx = np.arange(df_train.shape[0])
    np.random.seed(seed)
    np.random.shuffle(idx)
    val_size = int(len(idx) * split)
    df_train.iloc[idx[val_size:], :].to_csv("data/dataset_train.csv", index=False)
    df_train.iloc[idx[:val_size], :].to_csv("data/dataset_val.csv", index=False)
    
    # read test csv file
    df_test = pd.read_csv(test_csv)
    df_test["comment_text"] = df_test.comment_text.str.replace("\n", " ")
    df_test.to_csv("data/dataset_test.csv", index=False)

## Toxic Comments Dataset

In [3]:
data_dir = 'D:/datasets/kaggle/toxic_comments'

train_csv = f'{data_dir}/train.csv'
test_csv = f'{data_dir}/test.csv'

# batch_size = 2

In [4]:
print(os.listdir(data_dir))

['glove', 'jigsaw', 'sample_submission.csv', 'sample_submission.csv.zip', 'test.csv', 'test.csv.zip', 'train.csv', 'train.csv.zip']


In [5]:
train_df = pd.read_csv(train_csv)

In [6]:
display(train_df.sample(n=5))

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
24837,256828841699,"to be unblocked, its just",0,0,0,0,0,0
91552,956238397504,"34, 21 July 2008 (UTC)",0,0,0,0,0,0
15561,160882777477,"Request on 14:26:36, 27 November 2014 for assi...",0,0,0,0,0,0
25797,267263413007,"""\nI've changed it to """"9/11 Truth movement is...",0,0,0,0,0,0
90555,945944934292,""" —Preceding unsigned comment added by 109.11...",0,0,0,0,0,0


In [7]:
sos_token = 0
eos_token = 1

class Vocabulary(object):
    def __init__(self):
        self.word2index = {"<sos>": 0, "<eos>": 1}
        self.word2count = {}
        self.index2word = {}
        self.count = 2
    
    def add_word(self, word):
        if not word in self.word2index:
            self.word2index[word] = self.count
            self.word2count[word] = 1
            self.index2word[self.count] = word
            self.count += 1
        else:
            self.word2count[word] += 1
    
    def add_sentence(self, sentence):
        for word in sentence.split(" "):
            self.add_word(word)
            
    def __len__(self):
        return self.count

In [8]:
vocab = Vocabulary()
all_comments_text = train_df["comment_text"]
for text in tqdm_notebook(all_comments_text, desc='Building vocabulary'):
    sentences = nltk.sent_tokenize(text)
    for sent in sentences:
        vocab.add_sentence(sent)
        
with open('vocab.pkl', 'bw') as f:
    pickle.dump(vocab, f)




In [None]:
vocab = pickle.load(open('vocab.pkl', 'rb'))

In [9]:
print(len(vocab))

400126


In [10]:
print("All:", len(train_df))
print("toxic:", len(train_df[train_df['toxic'] == 1]))
print("severe_toxic:", len(train_df[train_df['severe_toxic'] == 1]))
print("obscene:", len(train_df[train_df['obscene'] == 1]))
print("threat:", len(train_df[train_df['threat'] == 1]))
print("insult:", len(train_df[train_df['insult'] == 1]))
print("identity_hate:", len(train_df[train_df['identity_hate'] == 1]))

All: 95851
toxic: 9237
severe_toxic: 965
obscene: 5109
threat: 305
insult: 4765
identity_hate: 814


## spaCy

<pre>conda install -c conda-forge spacy</pre>

In [16]:
NLP = spacy.load('en')
MAX_CHARS = 20000

def tokenizer(comment):
    comment = re.sub(
        r"[\*\"“”\n\\…\+\-\/\=\(\)‘•:\[\]\|’\!;]", " ", 
        str(comment))
    comment = re.sub(r"[ ]+", " ", comment)
    comment = re.sub(r"\!+", "!", comment)
    comment = re.sub(r"\,+", ",", comment)
    comment = re.sub(r"\?+", "?", comment)
    if (len(comment) > MAX_CHARS):
        comment = comment[:MAX_CHARS]
    return [x.text for x in NLP.tokenizer(comment) if x.text != " "]

In [17]:
def get_dataset(train_scv, test_csv, split=0.2, fix_length=100, lower=False, vectors=None):
    if vectors is not None:
        # pretrain vectors only supports all lower cases
        lower = True
    
    LOGGER.debug("Preparing CSV files...")
#     prepare_csv(train_csv, test_csv, split)
    
    comment = data.Field(
        sequential=True,
        fix_length=fix_length,
        tokenize=tokenizer,
        pad_first=True,
        tensor_type=torch.cuda.LongTensor,
        lower=lower
    )
    
    print("Reading train csv file...")
    train, val = data.TabularDataset.splits(
        path='data/', format='csv', skip_header=True,
        train='dataset_train.csv', validation='dataset_val.csv',
        fields=[
            ('id', None),
            ('comment_text', comment),
            ('toxic', data.Field(
                use_vocab=False, sequential=False,
                tensor_type=torch.cuda.ByteTensor)),
            ('severe_toxic', data.Field(
                use_vocab=False, sequential=False, 
                tensor_type=torch.cuda.ByteTensor)),
            ('obscene', data.Field(
                use_vocab=False, sequential=False, 
                tensor_type=torch.cuda.ByteTensor)),
            ('threat', data.Field(
                use_vocab=False, sequential=False, 
                tensor_type=torch.cuda.ByteTensor)),
            ('insult', data.Field(
                use_vocab=False, sequential=False, 
                tensor_type=torch.cuda.ByteTensor)),
            ('identity_hate', data.Field(
                use_vocab=False, sequential=False, 
                tensor_type=torch.cuda.ByteTensor)),
        ])
    
    print("Reading test csv file...")
    test = data.TabularDataset(
        path='data/dataset_test.csv', format='csv', 
        skip_header=True,
        fields=[
            ('id', None),
            ('comment_text', comment)
        ])
    
    print("Building vocabulary...")
    comment.build_vocab(
        train, val, test,
        max_size=20000,
        min_freq=50,
        vectors=vectors
    )
    
    print("Done preparing the datasets")
    return train, val, test

In [20]:
%%time
train_ds, valid_ds, test_ds = get_dataset(train_csv, test_csv, split=0.2)

Reading train csv file...
Reading test csv file...
Building vocabulary...
Done preparing the datasets
Wall time: 8min 2s


In [21]:
print(len(train_ds.examples))
print(len(valid_ds.examples))
print(len(test_ds.examples))

76681
19170
153164


In [22]:
train_ds.fields

{'comment_text': <torchtext.data.field.Field at 0x2152f039eb8>,
 'id': None,
 'identity_hate': <torchtext.data.field.Field at 0x2152f039a90>,
 'insult': <torchtext.data.field.Field at 0x2152f039b70>,
 'obscene': <torchtext.data.field.Field at 0x2152f039e48>,
 'severe_toxic': <torchtext.data.field.Field at 0x2152f039c88>,
 'threat': <torchtext.data.field.Field at 0x2152f039b00>,
 'toxic': <torchtext.data.field.Field at 0x2152f039ac8>}

In [23]:
print(len(train_ds.examples))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.





In [38]:
def get_iterator(dataset, batch_size, train=True, 
    shuffle=True, repeat=False):
    dataset_iter = data.Iterator(
        dataset, batch_size=batch_size, device=0,
        train=train, shuffle=shuffle, repeat=repeat,
        sort=False
    )
    return dataset_iter

In [39]:
batch_size = 1
train_iter = get_iterator(train_ds, batch_size, train=True, shuffle=True, repeat=True)

In [40]:
for i, examples in enumerate(train_iter):
    x = examples.comment_text # (fix_length, batch_size) Tensor
    y = torch.stack([
        examples.toxic, 
        examples.severe_toxic, 
        examples.obscene,
        examples.threat, 
        examples.insult, 
        examples.identity_hate
    ], dim=1)
    
    print(x)
    print(y)
    if i >= 1: break

Variable containing:
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
    1
   83
  187
   10
 4613
    0
  932
    2
[torch.cuda.LongTensor of size 100x1 (GPU 0)]

Variable containing:
 0  0  0  0  0  0
[torch.cuda.ByteTensor of size 1x6 (GPU 0)]

Variable containing:
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
     1
   122
    10
    34
  1512
  4203
     2
   154
    20
   151
  1650
  1099
  4

## Loading pre-trained word vectors

In [None]:
# model.word_em.weight.data = train_dataset.fields["comment_text"].vocab.vectors

## Encoder RNN 

In [45]:
use_gpu = torch.cuda.is_available()

def to_var(x, volatile=False):
    x = Variable(x, volatile=volatile)
    return x.cuda() if use_gpu else x

In [109]:
class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes=6, num_layers=1):
        super(EncoderRNN, self).__init__()
        self.num_layers = num_layers
        self.hidden_size = hidden_size
        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)
        self.out = nn.Linear(hidden_size, num_classes)
    
    def forward(self, x, hidden):
        output = self.embedding(x)
        for i in range(self.num_layers):
            output, hidden = self.gru(output, hidden)
        output = self.out(hidden)
        output = F.relu(output)
        output = F.dropout(output, p=0.1)
        output = F.sigmoid(output)
        return output
    
    def init_hidden(self):
        return to_var(torch.zeros((1, 1, self.hidden_size)))

In [115]:
hidden_size = 128
num_layers = 1
vocab = train_ds.fields['comment_text'].vocab
print(len(vocab))
model = EncoderRNN(len(vocab), hidden_size, num_classes=6, num_layers=1)

14636


In [117]:
if use_gpu:
    model = model.cuda()

In [118]:
model

EncoderRNN(
  (embedding): Embedding(14636, 128)
  (gru): GRU(128, 128)
  (out): Linear(in_features=128, out_features=6, bias=True)
)

In [119]:
criterion = nn.BCELoss()
if use_gpu:
    criterion = criterion.cuda()
    
optimizer = optim.Adam(rnn.parameters(), lr=0.002)

## Training LSTM

In [114]:
num_epochs = 1


for epoch in range(num_epochs):
    epoch_loss = 0.0
#     h = to_var(torch.zeros((num_layers, batch_size, hidden_size)))
    h = rnn.init_hidden()
    
    for i, examples in tqdm_notebook(enumerate(train_iter)):
        x = examples.comment_text
        y = torch.stack([examples.toxic, examples.severe_toxic, examples.obscene,
                         examples.threat, examples.insult, examples.identity_hate], dim=1)
        
        # forward step
        output = rnn(x, h)
        
        # loss
        loss = criterion(output, y.float().view(1, 1, -1))
        
        # backward step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # stats
        sys.stdout.flush()
        sys.stdout.write('\r loss = {:.5f}'.format(loss.data[0]))
        
        if i > len(train_ds.examples): break


 loss = 0.69315


Exception in thread Thread-17:
Traceback (most recent call last):
  File "C:\Users\Razavi\Anaconda3\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "C:\Users\Razavi\Anaconda3\lib\site-packages\tqdm\_tqdm.py", line 144, in run
    for instance in self.tqdm_cls._instances:
  File "C:\Users\Razavi\Anaconda3\lib\_weakrefset.py", line 60, in __iter__
    for itemref in self.data:
RuntimeError: Set changed size during iteration



 loss = 0.69315


KeyboardInterrupt: 

In [120]:
num_epochs = 1

In [None]:
class Corpus(object):
    def __init__(self, DATA_DIR, filenames):
        self.vocab = Vocabulary()
        self.data = self.tokenize(DATA_DIR, filenames)

    def tokenize(self, DATA_DIR, filenames):
        for filename in filenames:
            path = os.path.join(DATA_DIR, filename)
            with open(path, 'r') as f:
                tokens = 0
                for line in f:
                    words = line.split() + ['<eos>']
                    tokens += len(words)
                    for word in words:
                        self.vocab.add_word(word)

            # Tokenize file content
            with open(path, 'r') as f:
                ids = torch.LongTensor(tokens)
                token = 0
                for line in f:
                    words = line.split() + ['<eos>']
                    for word in words:
                        ids[token] = self.dictionary.word2idx[word]
                        token += 1

        return ids

class TxtDatasetProcessing(Dataset):
    def __init__(self, data_path, txt_path, txt_filename, label_filename, sen_len, corpus):
        self.txt_path = os.path.join(data_path, txt_path)
        # reading txt file from file
        txt_filepath = os.path.join(data_path, txt_filename)
        fp = open(txt_filepath, 'r')
        self.txt_filename = [x.strip() for x in fp]
        fp.close()
        # reading labels from file
        label_filepath = os.path.join(data_path, label_filename)
        fp_label = open(label_filepath, 'r')
        labels = [int(x.strip()) for x in fp_label]
        fp_label.close()
        self.label = labels
        self.corpus = corpus
        self.sen_len = sen_len


    def __getitem__(self, index):
        filename = os.path.join(self.txt_path, self.txt_filename[index])
        fp = open(filename, 'r')
        txt = torch.LongTensor(np.zeros(self.sen_len, dtype=np.int64))
        count = 0
        clip = False
        for words in fp:
            for word in words.split():
                if word.strip() in self.corpus.dictionary.word2idx:
                    if count > self.sen_len - 1:
                        clip = True
                        break
                    txt[count] = self.corpus.dictionary.word2idx[word.strip()]
                    count += 1
            if clip: break
        label = torch.LongTensor([self.label[index]])
        return txt, label
    
    def __len__(self):
        return len(self.txt_filename)

## LSTM Classifier

In [122]:
class LSTMClassifier(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size, batch_size):
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size

        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        h0 = to_var(torch.zeros(1, self.batch_size, self.hidden_dim))
        c0 = to_var(torch.zeros(1, self.batch_size, self.hidden_dim))
        return h0, c0

    def forward(self, sentence):
        embeds = self.embeddings(sentence)
        x = embeds.view(len(sentence), self.batch_size, -1)
        output, self.hidden = self.lstm(x, self.hidden)
        y = self.hidden2label(output[-1])
        return y

In [123]:
## parameter setting
epochs = 50
batch_size = 5
learning_rate = 0.002

embedding_dim = 100
hidden_dim = 50
seq_len = 100
num_classes = 6

In [None]:
# train dataset

# test dataset

In [124]:
model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim, 
                       vocab_size=len(vocab), label_size=num_classes, 
                       batch_size=batch_size)

if use_gpu:
    model = model.cuda()

In [None]:
criterion = nn.BCELoss()
if use_gpu:
    criterion = criterion.cuda()
    
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [None]:
train_loss_ = []
test_loss_ = []
train_acc_ = []
test_acc_ = []

for epoch in range(num_epochs):
#     optimizer = adjust_learning_rate(optimizer, epoch)

    ## training epoch
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    
    for i, traindata in enumerate(train_loader):
        train_inputs, train_labels = traindata
        train_labels = torch.squeeze(train_labels)

        if use_gpu:
            train_inputs, train_labels = Variable(train_inputs.cuda()), train_labels.cuda()
        else: train_inputs = Variable(train_inputs)

        model.zero_grad()
        model.batch_size = len(train_labels)
        model.hidden = model.init_hidden()
        output = model(train_inputs.t())

        loss = loss_function(output, Variable(train_labels))
        loss.backward()
        optimizer.step()

        # calc training acc
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == train_labels).sum()
        total += len(train_labels)
        total_loss += loss.data[0]

    train_loss_.append(total_loss / total)
    train_acc_.append(total_acc / total)
    ## testing epoch
    total_acc = 0.0
    total_loss = 0.0
    total = 0.0
    for iter, testdata in enumerate(test_loader):
        test_inputs, test_labels = testdata
        test_labels = torch.squeeze(test_labels)

        if use_gpu:
            test_inputs, test_labels = Variable(test_inputs.cuda()), test_labels.cuda()
        else: test_inputs = Variable(test_inputs)

        model.batch_size = len(test_labels)
        model.hidden = model.init_hidden()
        output = model(test_inputs.t())

        loss = loss_function(output, Variable(test_labels))

        # calc testing acc
        _, predicted = torch.max(output.data, 1)
        total_acc += (predicted == test_labels).sum()
        total += len(test_labels)
        total_loss += loss.data[0]
    test_loss_.append(total_loss / total)
    test_acc_.append(total_acc / total)

    print('[Epoch: %3d/%3d] Training Loss: %.3f, Testing Loss: %.3f, Training Acc: %.3f, Testing Acc: %.3f'
          % (epoch, epochs, train_loss_[epoch], test_loss_[epoch], train_acc_[epoch], test_acc_[epoch]))
