In this notebook I have implemented sentiment analysis with PyTorch.

In [2]:
#Import dependencies
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from collections import Counter
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

In [6]:
def accuracy(pred, real):
    '''
    Call this function to measure accuracy of the model for binary classification task.
    
    :param: pred - predictions got from a model
    :param: real - real labels for each sample in the dataset
    '''
    assert len(pred) == len(real)
    correct = 0 
    for i in range(len(pred)):
        if pred[i] >= 0.5:
            if real[i] == 1:
                correct += 1
        else:
            if real[i] == 0:
                correct += 1
    
    return correct / len(pred)

### Step 1. Preprocessing dataset

To work properly with the dataset that I've choosen for this notebook there are several steps that we have to perform.
       
       1. Clean punctuation and lower all characters in the dataset
       2. Delete stopwords
       3. Split each sample to words
       4. Encode each word to vocabulary index
       5. Split training data into training and validation (small_test) set

In [2]:
#if you have just PIP-ed nltk, you will need to download stopwords
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukaa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
#Here are all stopwords for english language
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

#### Step 1.1 Load dataset from csv file and separate it to features and labels

In [4]:
dataset = pd.read_csv('training.csv', sep='\t')

In [5]:
dataset.head()

Unnamed: 0,Class,Sentence
0,1,The Da Vinci Code book is just awesome.
1,1,this was the first clive cussler i've ever rea...
2,1,i liked the Da Vinci Code a lot.
3,1,i liked the Da Vinci Code a lot.
4,1,I liked the Da Vinci Code but it ultimatly did...


In [8]:
labels = dataset.Class.values

In [9]:
labels

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [10]:
features = dataset.Sentence.values

In [11]:
features

array(['The Da Vinci Code book is just awesome.',
       "this was the first clive cussler i've ever read, but even books like Relic, and Da Vinci code were more plausible than this.",
       'i liked the Da Vinci Code a lot.', ...,
       'As I sit here, watching the MTV Movie Awards, I am reminded of how much I despised the movie Brokeback Mountain.',
       'Ok brokeback mountain is such a horrible movie.',
       'Oh, and Brokeback Mountain was a terrible movie.'], dtype=object)

#### Step 1.2 Clean data from all punctuation and change all character to the lower case

In [7]:
def clean_data(dataset):
    '''
        Use this function to lower all samples and delete all punctuations from a dataset.
        
        :param: dataset - unprocessed dataset
        :return: cleaned_dataset - where each sample is lower case and without punctuations
    '''
    cleaned_dataset = []
    for s in dataset:
        cleaned_sample = s.lower() #to lowercase
        cleaned_sample = re.sub(r'[^\w\s]','',cleaned_sample) #delete all punctuation
        cleaned_dataset.append(cleaned_sample)
    return np.array(cleaned_dataset)

In [12]:
cleand_features = clean_data(features)

In [13]:
cleand_features

array(['the da vinci code book is just awesome',
       'this was the first clive cussler ive ever read but even books like relic and da vinci code were more plausible than this',
       'i liked the da vinci code a lot', ...,
       'as i sit here watching the mtv movie awards i am reminded of how much i despised the movie brokeback mountain',
       'ok brokeback mountain is such a horrible movie',
       'oh and brokeback mountain was a terrible movie'], dtype='<U4637')

#### Step 1.3 Delete all stopwords and split each sample in the dataset into words | create vocab, word_to_id and id_to_word

In [16]:
def tokenizer(data):
    '''
    Call this function on cleaned dataset to tokenize it and create vocab for the dataset.
    
    :param: data -  cleaned_dataset
    :return: tokenized_data - dataset where each sample has been splited into words
    :return: word_to_id - dict where each key is word from vocab and value is its index
    :return: id_to_word - dict where each key is index of word from vocab and value is the word on that index
    :return: vocab - list of all unique words in this dataset
    '''
    all_words = []
    tokenized_data = []
    for s in data:
        words = s.split()
        tokenized_sample = []
        for word in words:
            if word not in stopwords.words('english'):
                tokenized_sample.append(word)
                all_words.append(word)
            
        tokenized_data.append(np.array(tokenized_sample))
        
    counter = Counter(all_words)
    vocab = sorted(counter, key=counter.get, reverse=True)
    
    word_to_id = {word:i for i, word in enumerate(vocab)}
    id_to_word = {i:word for i, word in enumerate(vocab)}
    
    return np.array(tokenized_data), word_to_id, id_to_word, vocab

In [15]:
tokenized_data, word_to_id, id_to_word, vocab = tokenizer(cleand_features)

In [17]:
tokenized_data

array([array(['da', 'vinci', 'code', 'book', 'awesome'], dtype='<U7'),
       array(['first', 'clive', 'cussler', 'ive', 'ever', 'read', 'even',
       'books', 'like', 'relic', 'da', 'vinci', 'code', 'plausible'],
      dtype='<U9'),
       array(['liked', 'da', 'vinci', 'code', 'lot'], dtype='<U5'), ...,
       array(['sit', 'watching', 'mtv', 'movie', 'awards', 'reminded', 'much',
       'despised', 'movie', 'brokeback', 'mountain'], dtype='<U9'),
       array(['ok', 'brokeback', 'mountain', 'horrible', 'movie'], dtype='<U9'),
       array(['oh', 'brokeback', 'mountain', 'terrible', 'movie'], dtype='<U9')],
      dtype=object)

#### Step 1.4 Encoded each word into its index in the vocab

In [18]:
def encoder_helper(sample, seq_len):
    '''
    This method is used to for padding samples that are shorter then given sequence_length
    :param: sample - dataset sample that needs to be chacked
    :param: seq_len - wanted number of tokens (words) in each dataset sample
    :return: edited sample
    '''
    if len(sample) > seq_len:
        return sample[:seq_len]
    else:
        return [0]*(seq_len-len(sample)) + sample

In [19]:
def encoder(data, word_to_id, seq_len):
    '''
    Call this method to encode dataset.
    
    :param: data - tokenized data
    :param: word_to_id - dict where each key is word from vocab and value is its index
    :param: seq_len - wanted number of tokens (words) in each dataset sample
    :return: encoded dataset
    '''
    encoded_data = []
    for s in data:
        encoded_sample = []
        for word in s:
            encoded_sample.append(word_to_id[word])
        
        encoded_data.append(np.array(encoder_helper(encoded_sample, seq_len)))
    
    return np.array(encoded_data)

In [20]:
encoded_data = encoder(tokenized_data, word_to_id, 200)

#### Step 1.5 Split dataset into training and testing parts

In [21]:
def train_test_creator(data, in_labels, test_size=0.2):
    '''
    Hlper function used as a wrapper for sklear functions to split a dataset into training and testing parts
    
    :param: data - dataset
    :param: in_labels - labels for the dataset
    :param: test_size - pecentage of a dataset that will be used as a testing dataset
    '''
    shuffled_features, shuffled_labels = shuffle(data, in_labels)
    
    return train_test_split(shuffled_features, shuffled_labels, test_size=test_size)

In [22]:
X_train, X_test, y_train, y_test = train_test_creator(encoded_data, labels)

### Step 2 Create model

In [26]:
#Hyperparams
epochs = 3
batch_size = 128
learning_rate = 0.001
embed_vector = 300
vocab_size = len(word_to_id)
rnn_size = 256
number_of_layers = 2

#### Step 2.1 RNN-Model

In [24]:
class SentimentRNN(nn.Module):
    
    def __init__(self, vocab_size, embed_size, rnn_size, number_of_layers):
    
        super(SentimentRNN, self).__init__()
        
        self.hidden_units = rnn_size
        self.number_of_layers = number_of_layers
        
        self.embed = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, rnn_size, number_of_layers, batch_first=True)
        
        self.out_layer = nn.Linear(self.hidden_units, 1)
        
    def forward(self, X):
        
        h0 = Variable(torch.zeros(self.number_of_layers, X.size(0), self.hidden_units).cuda())
        c0 = Variable(torch.zeros(self.number_of_layers, X.size(0), self.hidden_units).cuda())
        
        out, _ = self.lstm(self.embed(X), (h0, c0))
        
        out = F.sigmoid(self.out_layer(out[:, -1, :]))
        return out

In [27]:
#Create object of the model for wanted hyperparams
model = SentimentRNN(vocab_size, embed_vector, rnn_size, number_of_layers)

In [28]:
#If you have GPU that is cuda supported execute this cell
model.cuda()

SentimentRNN(
  (embed): Embedding(2116, 300)
  (lstm): LSTM(300, 256, num_layers=2, batch_first=True)
  (out_layer): Linear(in_features=256, out_features=1)
)

#### Step 2.2 Setup of loss function

For this tast we will use Binary Cross entropy

In [29]:
criterion = nn.BCELoss()

#### Step 2.3 Setup optimizer

In [30]:
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

### Step 3 Training and testing


#### Step 3.1 Training

In [31]:
for epoch in range(epochs):
    epoch_accuracy = []
    epoch_loss = []
    for i in range(len(X_train) // batch_size):
        starting_id = i * batch_size
        ending_id = starting_id + batch_size
        
        X_batch = Variable(torch.from_numpy(X_train[starting_id:ending_id])).cuda()
        y_batch = Variable(torch.from_numpy(np.float32(y_train[starting_id:ending_id]))).cuda()
        X_batch = X_batch.type(torch.cuda.LongTensor)
        optimizer.zero_grad()
        pred = model(X_batch)
        
        epoch_accuracy.append(accuracy(pred.cpu().data.numpy(), y_batch.cpu().data.numpy()))
        
        loss = criterion(pred, y_batch)
        epoch_loss.append(loss.cpu().data.numpy())
        loss.backward()
        optimizer.step()
        
    print("Epoch: {}/{}".format(epoch, epochs),
          " | Accuracy: {}".format(np.mean(epoch_accuracy)), 
          " | Loss: {}".format(np.mean(epoch_loss)))

  "Please ensure they have the same size.".format(target.size(), input.size()))


Epoch: 0/3  | Accuracy: 0.90625  | Loss: 0.20403729379177094
Epoch: 1/3  | Accuracy: 0.9943677325581395  | Loss: 0.02016657590866089
Epoch: 2/3  | Accuracy: 0.998546511627907  | Loss: 0.0049927267245948315


#### Step 3.2 Testing

In [34]:
test_accuray = []
for i in range(len(X_test) // batch_size):
    starting_id = i * batch_size
    ending_id = starting_id + batch_size

    X_batch = Variable(torch.from_numpy(X_test[starting_id:ending_id])).cuda()
    y_batch = Variable(torch.from_numpy(np.float32(y_test[starting_id:ending_id]))).cuda()
    X_batch = X_batch.type(torch.cuda.LongTensor)

    pred = model(X_batch)

    test_accuray.append(accuracy(pred.cpu().data.numpy(), y_batch.cpu().data.numpy()))

In [35]:
np.mean(test_accuray) # This is nice accuracy for the testing set :-)

0.98828125