# LSTM model for sentiment analysis
### Dataset: IMDb Large Movie Review Dataset https://ai.stanford.edu/~amaas/data/sentiment/

## Download dataset from kaggle

In [0]:

# Colab library to upload files to notebook
from google.colab import files

# Upload kaggle API key file
uploaded = files.upload()

Saving kaggle.json to kaggle.json


In [0]:
!ls

 features.csv	     imdb-dataset-of-50k-movie-reviews.zip   sample_data
'IMDB Dataset.csv'   kaggle.json			     weights.pth


In [0]:
!mkdir .kaggle

In [0]:
!mv kaggle.json .kaggle/kaggle.json

In [0]:
 !mv .kaggle /root/

In [0]:
! chmod 600 /root/.kaggle/kaggle.json

In [0]:
! kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 35% 9.00M/25.7M [00:00<00:00, 36.9MB/s]
100% 25.7M/25.7M [00:00<00:00, 72.8MB/s]


In [0]:
! unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


## Load dataset

In [0]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [0]:
data = pd.read_csv("IMDB Dataset.csv")

In [0]:
reviews = data['review'].values
labels = data['sentiment'].values

## Text preprocessing

In [0]:
from tqdm import tqdm
from time import sleep

import re
from string import punctuation
from sklearn.preprocessing import OneHotEncoder
import itertools

import nltk 
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer 

### Deleting html tags:

In [51]:
!pip install contractions

Collecting contractions
  Downloading https://files.pythonhosted.org/packages/85/41/c3dfd5feb91a8d587ed1a59f553f07c05f95ad4e5d00ab78702fbf8fe48a/contractions-0.0.24-py2.py3-none-any.whl
Collecting textsearch
  Downloading https://files.pythonhosted.org/packages/42/a8/03407021f9555043de5492a2bd7a35c56cc03c2510092b5ec018cae1bbf1/textsearch-0.0.17-py2.py3-none-any.whl
Collecting pyahocorasick
[?25l  Downloading https://files.pythonhosted.org/packages/f4/9f/f0d8e8850e12829eea2e778f1c90e3c53a9a799b7f412082a5d21cd19ae1/pyahocorasick-1.4.0.tar.gz (312kB)
[K     |████████████████████████████████| 317kB 3.7MB/s 
[?25hCollecting Unidecode
[?25l  Downloading https://files.pythonhosted.org/packages/d0/42/d9edfed04228bacea2d824904cae367ee9efd05e6cce7ceaaedd0b0ad964/Unidecode-1.1.1-py2.py3-none-any.whl (238kB)
[K     |████████████████████████████████| 245kB 10.4MB/s 
[?25hBuilding wheels for collected packages: pyahocorasick
  Building wheel for pyahocorasick (setup.py) ... [?25l[?25hdone
  

In [0]:
import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [0]:
def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, ' ', raw_html)
    withoutdoublespaces = re.sub(' +', ' ', cleantext)
    return withoutdoublespaces

In [0]:
def tokenize(data):
    res = []
    words = []
    lem = WordNetLemmatizer()
    for sent in tqdm(data):
        sleep(0.25)
        pre_sent = sent.lower()
        
        # delete punctuation and html tags and numbers
        pre_sent = cleanhtml(pre_sent)
        pre_sent = re.sub('[0-9]+', '', pre_sent)
        pre_sent = re.sub("\'", ' ', pre_sent)
        pre_sent = pre_sent.translate(str.maketrans('','',punctuation))
        
        pre_sent = word_tokenize(pre_sent)
        sent = []
        for word in pre_sent:
            word = lem.lemmatize(word)
            
            # delete stop words and add to all-word-list 
            if word not in stopwords.words('english'):
                sent.append(word)
                words.append(word)
       
        res.append(sent)
    return res, words

In [56]:
  >>> import nltk
  >>> nltk.download('punkt')
  >>> nltk.download('wordnet')
  >>> nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
data, words = tokenize(reviews)

100%|██████████| 50000/50000 [3:58:17<00:00,  3.50it/s]


download to pc tokenized rewiews

In [0]:
data_csv = data.to_csv('data.csv')
from google.colab import files
files.download('labels.csv') 

### Word to numbers

In [0]:
# feel free to use this import 
from collections import Counter

## Build a dictionary that maps words to integers
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab,1)} 

## use the dict to tokenize each review in reviews_split
## store the tokenized reviews in reviews_ints
reviews_ints = []
for review in data:
    reviews_ints.append([vocab_to_int[word] for word in review])

In [0]:
# outlier review stats
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 0
Maximum review length: 1429


download important files

In [0]:
import json
a_file = open("word_to_int.json", "w")
json.dump(vocab_to_int, a_file)
a_file.close()

In [0]:
!ls

'IMDB Dataset.csv'			 sample_data
 imdb-dataset-of-50k-movie-reviews.zip	 word_to_int.json


In [0]:
from google.colab import files
files.download('word_to_int.json') 

### Encoding labels

In [0]:
encoded_labels = [1 if lab == 'positive' else 0 for lab in labels]  

In [0]:
csv_labels = pd.DataFrame(labels)

In [0]:
csv_labels.to_csv('labels.csv')

In [0]:
from google.colab import files
files.download('labels.csv') 

### Padding features

In [0]:
def pad_features(reviews_ints, seq_length=280):
    ## getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)
    
    ## for each review, I grab that review
    for i, row in enumerate(reviews_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [0]:
seq_length = 280
features = pad_features(reviews_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

In [0]:
csv_features = pd.DataFrame(features)

In [0]:
csv_features.to_csv('features.csv')

In [0]:
features = pd.read_csv('features.csv')
labels = pd.read_csv('labels.csv')


In [0]:
labels = np.array(labels)
features = np.array(features)


In [0]:
labels = [each[1] for each in labels]

In [0]:
labels = [1 if x == 'positive' else 0 for x in labels]

In [0]:
labels = np.array(labels)

## Import predtrained weights and vocab

In [202]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
features = pd.read_csv('/content/drive/My Drive/sentiment analysis/features.csv')

In [0]:
labels = pd.read_csv('/content/drive/My Drive/sentiment analysis/labels.csv')

In [0]:
import json
json_file_path = "/content/drive/My Drive/sentiment analysis/word_to_int.json"

with open(json_file_path, 'r') as j:
     word_to_int = json.loads(j.read())

In [206]:
features = np.array(features)
labels = np.array(labels)
print(features.shape)
print(labels.shape)


(50000, 281)
(50000, 2)


In [0]:
labels = np.array(labels)
labels= np.array([l[1] for l in labels])
labels = np.array([1 if x=='positive' else 0 for x in labels])

In [9]:
labels[:10]

array([1, 1, 1, 0, 1, 1, 1, 0, 0, 1])

In [0]:
batch_size = 16

## Dataloaders

In [0]:
features = np.array(features)
labels = np.array(labels)

In [0]:
train_x, test_x, train_y, test_y = train_test_split(features, labels, shuffle=True, test_size=0.2)

In [0]:
batch_size = 64

In [14]:
print(f'shape of train data is {train_x.shape}')
print(f'shape of test data is {test_x.shape}')

shape of train data is (40000, 281)
shape of test data is (10000, 281)


In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader

In [0]:
# datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 16

train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size, num_workers=4)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size, num_workers=4)

## LSTM network on PyTorch

In [0]:
import torch.nn as nn
import torch.functional as F
import torch.optim as optim
import torch

In [210]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

Training on GPU.


In [0]:
class SentimentLSTM(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_lstm_layers, drop_prob=0.5):
        super(SentimentLSTM, self).__init__()
        
        self.vocab_size = vocab_size
        self.output_size = output_size
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.n_lstm_layers = n_lstm_layers
        self.drop_prob = drop_prob
        
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, 
                            num_layers=n_lstm_layers, dropout=drop_prob, batch_first=True)
        
        self.dropout = nn.Dropout(p=0.3)
        
        self.fc = nn.Linear(in_features=hidden_dim, out_features=output_size)
        self.sig = nn.Sigmoid()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        
        embed = self.embedding(x)
        
        lstm_out, hidden = self.lstm(embed, hidden)
        # stack up lstm outputs
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        
        # sigmoid function
        sig_out = self.sig(out)
        
        # reshape to be batch_size first
        sig_out = sig_out.view(batch_size, -1)
        sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    def init_hidden(self, batch_size):
        
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if(train_on_gpu):
          hidden = (weight.new(self.n_lstm_layers, batch_size, self.hidden_dim).zero_().cuda(),
                   weight.new(self.n_lstm_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
          hidden = (weight.new(self.n_lstm_layers, batch_size, self.hidden_dim).zero_(),
                   weight.new(self.n_lstm_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden


* `vocab_size`: Size of our vocabulary or the range of values for our input, word tokens.
* `output_size`: Size of our desired output; the number of class scores we want to output (pos/neg).
* `embedding_dim`: Number of columns in the embedding lookup table; size of our embeddings.
* `hidden_dim`: Number of units in the hidden layers of our LSTM cells. Usually larger is better performance wise. Common values are 128, 256, 512, etc.
* `n_layers`: Number of LSTM layers in the network. Typically between 1-3


In [212]:
vocab_size = len(word_to_int) + 1 # +1 for zero padding + our word tokens
output_size = 1 # prob of positive sentiment
embedding_dim = 400 
hidden_dim = 256
n_lstm_layers = 2

net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_lstm_layers)

print(net)

SentimentLSTM(
  (embedding): Embedding(146156, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [40]:
dataiter = iter(test_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([16, 281])
Sample input: 
 tensor([[11112,     0,     0,  ...,   413,  4552,  3542],
        [45813,     0,     0,  ...,  2501,   551,   146],
        [ 8201,     0,     0,  ...,    32,    32,    32],
        ...,
        [10538,     0,     0,  ...,   157,    70,    18],
        [24023,     0,     0,  ..., 16000,   732, 44228],
        [42844, 14230,   209,  ...,    37,   196,   129]])

Sample label size:  torch.Size([16])
Sample label: 
 tensor([0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1])


## Train

In [0]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


In [0]:
# training params

epochs = 1 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in test_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])

                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

In [0]:
torch.save(net.state_dict(), 'weights_1_epoch.pth')

In [36]:
from google.colab import files
files.download('weights_1_epoch.pth') 

----------------------------------------
Exception happened during processing of request from ('::ffff:127.0.0.1', 49846, 0, 0)
Traceback (most recent call last):
  File "/usr/lib/python3.6/socketserver.py", line 320, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 351, in process_request
    self.finish_request(request, client_address)
  File "/usr/lib/python3.6/socketserver.py", line 364, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/usr/lib/python3.6/socketserver.py", line 724, in __init__
    self.handle()
  File "/usr/lib/python3.6/http/server.py", line 418, in handle
    self.handle_one_request()
  File "/usr/lib/python3.6/http/server.py", line 406, in handle_one_request
    method()
  File "/usr/lib/python3.6/http/server.py", line 639, in do_GET
    self.copyfile(f, self.wfile)
  File "/usr/lib/python3.6/http/server.py", line 800, in copyfile
    shutil.copyfil

## Test

In [0]:
from tqdm import tqdm

In [42]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in tqdm(test_loader):

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.338
Test accuracy: 0.852


## Test loaded weights

In [0]:
loaded_net = SentimentLSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_lstm_layers)

In [214]:
!ls

drive  sample_data  weights_1_epoch.pth


In [215]:
loaded_net.load_state_dict(torch.load('weights_1_epoch.pth'))

<All keys matched successfully>

In [0]:
from tqdm import tqdm

In [49]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = loaded_net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    if(train_on_gpu):
        inputs, labels = inputs.cuda(), labels.cuda()
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.numpy()) if not train_on_gpu else np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.337
Test accuracy: 0.852


## Util functions

In [0]:
loaded_net = loaded_net.cuda()

In [0]:
def tokenize_one_sample(sent):
    
    lem = WordNetLemmatizer()

    pre_sent = sent.lower()
    
    # delete punctuation and html tags and numbers
    pre_sent = cleanhtml(pre_sent)
    pre_sent = re.sub('[0-9]+', '', pre_sent)
    pre_sent = re.sub("\'", ' ', pre_sent)
    pre_sent = pre_sent.translate(str.maketrans('','',punctuation))
    
    pre_sent = word_tokenize(pre_sent)
    sent = []
    for word in pre_sent:
        word = lem.lemmatize(word)
        if word not in stopwords.words('english'):
            sent.append(word)
    
    return sent

In [0]:
def pad_features_one_sample(tweet_int, seq_length=280):
    tweet_int = tweet_int[0]
    ## getting the correct rows x cols shape
    features = np.zeros(seq_length, dtype=int)

    ## for each review, I grab that review
    features[-len(tweet_int):] = np.array(tweet_int)[:seq_length]
    
    return features

In [0]:
from time import sleep

In [0]:
def tweet_to_sentiment(tweet, vocab=word_to_int, model=net): 
    h = loaded_net.init_hidden(batch_size)
    
    tweet = tokenize_one_sample(tweet)

    tweet_int = []

    tweet_int.append([word_to_int[word] if word in word_to_int else 0 for word in tweet])

    inputs = pad_features_one_sample(tweet_int)

    batch = []
    for _ in range(batch_size):
        batch.append(inputs)
    inputs = torch.from_numpy(np.array(batch))

    if(train_on_gpu):
        inputs = inputs.cuda()


    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])
   
    # get predicted outputs
    output, h = loaded_net(inputs, h)
    
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.sum()/batch_size)  # rounds to the nearest integer
      
    return 'positive' if pred.item() else 'negative'

In [0]:
tweet = "First, how does the movie just start with no background information explaining to the audience how this woman even ended up stuck and living in this basement?? Like what happened to her that landed her in this place?? I honestly couldn't get past the first 20 minutes of this movie because it was so boring and offers no explanation for anything. I have no clue how it got such high ratings."

Example: First, how does the movie just start with no background information explaining to the audience how this woman even ended up stuck and living in this basement?? Like what happened to her that landed her in this place?? I honestly couldn't get past the first 20 minutes of this movie because it was so boring and offers no explanation for anything. I have no clue how it got such high ratings.

In [308]:
tweet_to_sentiment(tweet)

'negative'