In [1]:
# This notebook contains implementation of LSTM(using nn.LSTM) and VanillaRNN and Multilayer VanillaRNN
# Reference :https://colab.research.google.com/github/agungsantoso/deep-learning-v2-pytorch/blob/master/sentiment-rnn/Sentiment_RNN_Exercise.ipynb

### Download the dataset

In [2]:
# Download the dataset
# It's a movie review sentiment dataset
#!mkdir data
#!wget -c https://github.com/agungsantoso/deep-learning-v2-pytorch/raw/master/sentiment-rnn/data/labels.txt
#!wget -c https://github.com/agungsantoso/deep-learning-v2-pytorch/raw/master/sentiment-rnn/data/reviews.txt
#!mv *.txt data/

In [3]:
import numpy as np

# read data from text files
# Each review and label is seperated by '\n'
with open('data/reviews.txt', 'r') as f:
    reviews = f.read()
with open('data/labels.txt', 'r') as f:
    labels = f.read()

In [4]:
print(reviews[:2000])
print()
print(labels[:20])

bromwell high is a cartoon comedy . it ran at the same time as some other programs about school life  such as  teachers  . my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers  . the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students . when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled . . . . . . . . . at . . . . . . . . . . high . a classic line inspector i  m here to sack one of your teachers . student welcome to bromwell high . i expect that many adults of my age think that bromwell high is far fetched . what a pity that it isn  t   
story of a man who has unnatural feelings for a pig . starts out with a opening scene that is a terrific example of absurd comedy . a formal orchestra audience is turn

### Data Preprocessing
    -- Removing punctuations
    -- Prepre data for modelling
    -- Removing outliers

In [5]:
from string import punctuation

print(punctuation)

# get rid of punctuation
reviews = reviews.lower() # lowercase, standardize
all_text = ''.join([c for c in reviews if c not in punctuation])

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [6]:
# split by new lines and spaces
reviews_split = all_text.split('\n') # each element is a review
all_text = ' '.join(reviews_split)

# create a list of words in corpus
words = all_text.split()

In [7]:
# First review
reviews_split[0]

'bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   '

In [8]:
# Total unique words
len(set(words))

74072

In [9]:
words[:10]

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', 'it', 'ran', 'at', 'the']

In [10]:
# Converting each word into a corresponding integer
from collections import Counter

## Build a dictionary that maps words to integers
counts = Counter(words)

# Sorting based on decreasing order of frequency of word
vocab = sorted(counts, key=counts.get, reverse=True)

# Most frequent word is given integer 1
vocab_to_int = {word: ii for ii, word in enumerate(vocab,1)} 

## use the dict to tokenize each review in reviews_split
## store the tokenized reviews in reviews_ints
reviews_ints = []
for review in reviews_split:
  reviews_ints.append([vocab_to_int[word] for word in review.split()])

In [11]:
print("Most frequent word :",list(vocab_to_int.keys())[0])
print("Integer :",list(vocab_to_int.values())[0])

Most frequent word : the
Integer : 1


In [12]:
# stats about vocabulary
print('Unique words: ', len((vocab_to_int)))  # should ~ 74000+
print()

# print tokens in first review
print('Tokenized review: \n', reviews_ints[0])

Unique words:  74072

Tokenized review: 
 [21025, 308, 6, 3, 1050, 207, 8, 2138, 32, 1, 171, 57, 15, 49, 81, 5785, 44, 382, 110, 140, 15, 5194, 60, 154, 9, 1, 4975, 5852, 475, 71, 5, 260, 12, 21025, 308, 13, 1978, 6, 74, 2395, 5, 613, 73, 6, 5194, 1, 24103, 5, 1983, 10166, 1, 5786, 1499, 36, 51, 66, 204, 145, 67, 1199, 5194, 19869, 1, 37442, 4, 1, 221, 883, 31, 2988, 71, 4, 1, 5787, 10, 686, 2, 67, 1499, 54, 10, 216, 1, 383, 9, 62, 3, 1406, 3686, 783, 5, 3483, 180, 1, 382, 10, 1212, 13583, 32, 308, 3, 349, 341, 2913, 10, 143, 127, 5, 7690, 30, 4, 129, 5194, 1406, 2326, 5, 21025, 308, 10, 528, 12, 109, 1448, 4, 60, 543, 102, 12, 21025, 308, 6, 227, 4146, 48, 3, 2211, 12, 8, 215, 23]


In [13]:
# 1=positive, 0=negative label conversion
labels_split = labels.split('\n')
encoded_labels = np.array([1 if label == 'positive' else 0 for label in labels_split])
print("Review : ")
print(reviews_split[0])

print("\nSentiment :")
print(labels_split[0])

Review : 
bromwell high is a cartoon comedy  it ran at the same time as some other programs about school life  such as  teachers   my   years in the teaching profession lead me to believe that bromwell high  s satire is much closer to reality than is  teachers   the scramble to survive financially  the insightful students who can see right through their pathetic teachers  pomp  the pettiness of the whole situation  all remind me of the schools i knew and their students  when i saw the episode in which a student repeatedly tried to burn down the school  i immediately recalled          at           high  a classic line inspector i  m here to sack one of your teachers  student welcome to bromwell high  i expect that many adults of my age think that bromwell high is far fetched  what a pity that it isn  t   

Sentiment :
positive


In [14]:
# outlier review stats
# review_lens stores the frequency of integers in increasing order
review_lens = Counter([len(x) for x in reviews_ints])
print("Zero-length reviews: {}".format(review_lens[0]))
print("15_word_ reviews: {}".format(review_lens[15]))
print("Maximum review length: {}".format(max(review_lens)))

Zero-length reviews: 1
15_word_ reviews: 2
Maximum review length: 2514


In [15]:
print('Number of reviews before removing outliers: ', len(reviews_ints))

## remove any reviews/labels with zero length from the reviews_ints list.

## get any indices of any reviews with length 0
non_zero_idx = [ii for ii, review in enumerate(reviews_ints) if len(review) != 0]

# remove 0-length review with their labels
reviews_ints = [reviews_ints[ii] for ii in non_zero_idx]
encoded_labels = np.array([encoded_labels[ii] for ii in non_zero_idx])

print('Number of reviews after removing outliers: ', len(reviews_ints))

Number of reviews before removing outliers:  25001
Number of reviews after removing outliers:  25000


---
## Padding sequences

### We are padding to ensure the each review of same size,(this is necessary if we are sending input to model in batches, each review batch should be of same size).

### We can also chose to not to pad anything, then batch_size hsould be set to 1

To deal with both short and very long reviews, we'll pad or truncate all our reviews to a specific length. For reviews shorter than some `seq_length`, we'll pad with 0s. For reviews longer than `seq_length`, we can truncate them to the first `seq_length` words. A good `seq_length`, in this case, is 200.


* The data should come from `review_ints`, since we want to feed integers to the network. 
* Each row should be `seq_length` elements long. 
* For reviews shorter than `seq_length` words, **left pad** with 0s. That is, if the review is `['best', 'movie', 'ever']`, `[117, 18, 128]` as integers, the row will look like `[0, 0, 0, ..., 0, 117, 18, 128]`. 
* For reviews longer than `seq_length`, use only the first `seq_length` words as the feature vector.

As a small example, if the `seq_length=10` and an input review is: 
```
[117, 18, 128]
```
The resultant, padded sequence should be: 

```
[0, 0, 0, 0, 0, 0, 0, 117, 18, 128]
```

**Your final `features` array should be a 2D array, with as many rows as there are reviews, and as many columns as the specified `seq_length`.**

This isn't trivial and there are a bunch of ways to do this. But, if you're going to be building your own deep learning networks, you're going to have to get used to preparing your data.

In [16]:
def pad_features(reviews_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    ## getting the correct rows x cols shape
    features = np.zeros((len(reviews_ints), seq_length), dtype=int)
    
    ## for each review, I grab that review
    for i, row in enumerate(reviews_ints):
      features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [17]:
# Test your implementation!

seq_length = 200

features = pad_features(reviews_ints, seq_length=seq_length)

## test statements - do not change - ##
assert len(features)==len(reviews_ints), "Your features should have as many rows as reviews."
assert len(features[0])==seq_length, "Each feature row should contain seq_length values."

# print first 10 values of the first 30 batches 
print(features[:30,:10])

[[    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [22382    42 46418    15   706 17139  3389    47    77    35]
 [ 4505   505    15     3  3342   162  8312  1652     6  4819]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [   54    10    14   116    60   798   552    71   364     5]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    0     0     0     0     0     0     0     0     0     0]
 [    1   330   578    34     3   162   748  2731     9   325]
 [    9    11 10171  5305  1946   689   444    22   280   673]
 [    0     0     0     0     0     0     0     0     0

In [18]:
features.shape

(25000, 200)

In [19]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)
split_idx = int(len(features)*0.8)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeatures Shapes:")
print("Train set: \t\t{}".format(train_x.shape),
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))



			Features Shapes:
Train set: 		(20000, 200) 
Validation set: 	(2500, 200) 
Test set: 		(2500, 200)


### Datasets and dataloaders

In [20]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# make sure to SHUFFLE your data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [21]:
# obtain one batch of training data
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 200])
Sample input: 
 tensor([[    0,     0,     0,  ...,    32,     1,  1383],
        [    0,     0,     0,  ..., 23842,   126,   370],
        [   11,    14,   159,  ...,     7,     7,     1],
        ...,
        [    0,     0,     0,  ...,     6,    58,  2058],
        [ 1701,   717,  1339,  ...,     5,  8691,    87],
        [   46,   124,    40,  ...,     1,    95,    12]])

Sample label size:  torch.Size([50])
Sample label: 
 tensor([1, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1,
        0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0,
        0, 0])


---
## Sentiment Network with PyTorch

Below is where you'll define the network.

<img src="https://github.com/agungsantoso/deep-learning-v2-pytorch/blob/master/sentiment-rnn/assets/network_diagram.png?raw=1" width=40%>

The layers are as follows:
1. An [embedding layer](https://pytorch.org/docs/stable/nn.html#embedding) that converts our word tokens (integers) into embeddings of a specific size.
2. An [LSTM layer](https://pytorch.org/docs/stable/nn.html#lstm) defined by a hidden_state size and number of layers
3. A fully-connected output layer that maps the LSTM layer outputs to a desired output_size
4. A sigmoid activation layer which turns all outputs into a value 0-1; return **only the last sigmoid output** as the output of this network.

### The Embedding Layer

We need to add an [embedding layer](https://pytorch.org/docs/stable/nn.html#embedding) because there are 74000+ words in our vocabulary. It is massively inefficient to one-hot encode that many classes. So, instead of one-hot encoding, we can have an embedding layer and use that layer as a lookup table. You could train an embedding layer using Word2Vec, then load it here. But, it's fine to just make a new layer, using it for only dimensionality reduction, and let the network learn the weights.

**nn.embedding** neither CBOW nor Skip-Gram, which are models trained end-to-end to predict the context given the word, or the word given the context. Here, nn.Embedding is optimized as part of your training task. Consequently, after training, you can expect to have embeddings that are specific to your task rather than generic embeddings that generate a more general representation which can be more or less useful depending on your task. To illustrate, you can imagine having a screwdriver that works on some screws but if your screw is very specific, then you will need to change your tool for your specific task.


### The LSTM Layer(s)

We'll create an [LSTM](https://pytorch.org/docs/stable/nn.html#lstm) to use in our recurrent network, which takes in an input_size, a hidden_dim, a number of layers, a dropout probability (for dropout between multiple layers), and a batch_first parameter.

Most of the time, you're network will have better performance with more layers; between 2-3. Adding more layers allows the network to learn really complex relationships. 


Note: `init_hidden` should initialize the hidden and cell state of an lstm layer to all zeros, and move those state to GPU, if available.

In [22]:
device = "cuda:1"


![Image](embedding.png)

### Embedding_layer

In [23]:
import torch.nn as nn

# https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html

# This module is often used to store word embeddings and retrieve 
# them using indices. The input to the module is a list of indices(each index is less than num_embeddings),
#  and the output is the corresponding word embeddings(each word embedding is of "embedding_dim").

# The learnable weights of the module of shape (num_embeddings, embedding_dim)

# An embedding layer is esentially just a Linear layer. So we could define our layer as 
# nn.Linear(num_embeddings, embedding_dim) and represent each word as on-hot encoded vector of size num_embeddings

# An Embedding module containing 10 tensors of size 3
embedding = nn.Embedding(num_embeddings=10, embedding_dim=3)

# a batch of 2 samples of 4 indices each
input = torch.LongTensor([[1, 2, 4, 5], [4, 3, 2, 9]])
embedding(input)

tensor([[[ 1.1305, -1.0397, -0.4556],
         [ 0.3179, -1.7751,  0.2407],
         [-0.0069, -0.5764, -0.4162],
         [-0.8451, -1.6613,  1.0504]],

        [[-0.0069, -0.5764, -0.4162],
         [ 0.3176, -0.9379, -1.5160],
         [ 0.3179, -1.7751,  0.2407],
         [ 1.0496, -1.0179, -0.5858]]], grad_fn=<EmbeddingBackward0>)

In [24]:
# Single integer
embedding(torch.tensor([1]))

tensor([[ 1.1305, -1.0397, -0.4556]], grad_fn=<EmbeddingBackward0>)

### Multilayer RNN


![Image](multi_LSTM2.png)

![Image](multi_LSTM1.png)

In [25]:
rnn = nn.LSTM(input_size = 10,hidden_size =  20,num_layers = 2, batch_first = True)

input = torch.randn(3, 5, 10) # (batch_size, sequence_length, input_shape)
h0 = torch.randn(2, 3, 20) # (num_layers, batch_size, hidden_shape) ==> hidden input at each layer
c0 = torch.randn(2, 3, 20) # (num_layers, batch_size, hidden_shape) ==> cell state input at each layer

output, (hn, cn) = rnn(input, (h0, c0))



print("output_shape :",output.shape) # (batch_size, sequence_lengthm hidden_shape), it's just hidden outputs at last layer for each word
print("hn.shape : ",hn.shape) # same as h0
print("cn.shape : ",cn.shape) # same as c0

output_shape : torch.Size([3, 5, 20])
hn.shape :  torch.Size([2, 3, 20])
cn.shape :  torch.Size([2, 3, 20])


In [26]:
output[0,-1]

tensor([-0.0255,  0.1254, -0.0215, -0.0586, -0.0061,  0.0414, -0.0657,  0.0104,
         0.0603,  0.0929, -0.0661,  0.0013,  0.0058,  0.0781,  0.1237, -0.0237,
        -0.1320,  0.0157,  0.0348, -0.0925], grad_fn=<SelectBackward0>)

In [27]:
hn[-1,0]

tensor([-0.0255,  0.1254, -0.0215, -0.0586, -0.0061,  0.0414, -0.0657,  0.0104,
         0.0603,  0.0929, -0.0661,  0.0013,  0.0058,  0.0781,  0.1237, -0.0237,
        -0.1320,  0.0157,  0.0348, -0.0925], grad_fn=<SelectBackward0>)

In [28]:
import torch

X = torch.tensor([[1, 2, 3], [4, 5, 6]])  # Example input tensor
vocab_size = 10  # Size of the vocabulary

# Convert X to one-hot encoded tensor
X_one_hot = torch.nn.functional.one_hot(X, num_classes=vocab_size)

print("Input Shape:", X.shape)
print("One-Hot Encoded Shape:", X_one_hot.shape)


Input Shape: torch.Size([2, 3])
One-Hot Encoded Shape: torch.Size([2, 3, 10])


In [41]:
import torch.nn as nn

class Sentiment_Vanilla_RNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim,drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(Sentiment_Vanilla_RNN, self).__init__()

        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        # embedding layer
        
        self.embedding = nn.Embedding(num_embeddings= vocab_size,
                                      embedding_dim= embedding_dim)

        #self.input_hidden = nn.Linear(self.embedding_dim,self.hidden_dim)
        #self.hidden_hidden = nn.Linear(self.hidden_dim,self.hidden_dim)

        self.input_hidden_weight = nn.Parameter(torch.randn(self.hidden_dim,self.embedding_dim))
        self.hidden_hidden_weight = nn.Parameter(torch.randn(self.hidden_dim,self.hidden_dim))
        self.bias = nn.Parameter(torch.randn(self.hidden_dim,1))

        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        # x.shape = (batch_size, sequence_length)
        # hidden = (batch_size, hidden_dim)
        batch_size = x.size(0)
        sequence_len = x.size(1)
        
        # embeddings and lstm_out
        embeds = self.embedding(x) # shape = (batch_size, sequence_lenth, embedding_dim)


        lstm_out = torch.zeros(batch_size,sequence_len,self.hidden_dim)
        for time_step in range(embeds.size(1)): # each time step
            input = embeds[:,time_step,:] #(batch_size, embd_dim)
            #hidden = nn.functional.tanh(self.input_hidden(input)+self.hidden_hidden(hidden)) # (batch_size, hidden_dim)
            hidden = nn.functional.tanh(
                                        (self.input_hidden_weight @ input.reshape(-1,batch_size))+
                                        (self.hidden_hidden_weight @ hidden.reshape(-1,batch_size))+
                                        self.bias
                                        ).reshape(batch_size,-1)
            lstm_out[:,time_step,:] = hidden

        # We need to only consider lstm_out[i,-1,:](last hidden) for ith input sentence
        # for prediction
        out = lstm_out[:,-1,:] # (batch_size, hidden_dim)
        out = self.dropout(out)
        out = self.fc(out.to(device))
        sig_out = self.sig(out)
        del out,embeds
        return sig_out,hidden
       
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors(cell_state, hidden_state) with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        #weight = next(self.parameters()).data
        
        #hidden = weight.new(batch_size, self.hidden_dim).zero_().to(device)
        
        hidden = torch.zeros(batch_size,self.hidden_dim).to(device)
        return hidden
        

In [42]:
import torch.nn as nn

class Sentiment_Multilayer_Vanilla_RNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim,n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(Sentiment_Multilayer_Vanilla_RNN, self).__init__()

        self.output_size = output_size
        self.num_layers = n_layers
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        # embedding layer
        self.embedding = nn.Embedding(num_embeddings= vocab_size,
                                      embedding_dim= embedding_dim)

        # Weights at each layer
        self.input_W = []
        self.hidden_U = []
        for i in range(self.num_layers):
            if i == 0:
                self.input_W.append(nn.Linear(self.embedding_dim,self.hidden_dim)) # Weights W
            else:
                self.input_W.append(nn.Linear(self.hidden_dim,self.hidden_dim)) # Weights W
            self.hidden_U.append(nn.Linear(self.hidden_dim,self.hidden_dim)) # Weights U
            self.input_W[-1] = self.input_W[-1].to(device)
            self.hidden_U[-1] = self.hidden_U[-1].to(device) 
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        # x.shape = (batch_size, sequence_length)
        # hidden = (num_layers,batch_size, hidden_dim)
        batch_size = x.size(0)
        
        # embeddings and lstm_out
        embeds = self.embedding(x) # shape = (batch_size, sequence_lenth, embedding_dim)


        # Don't know why after each iteration GPU memory is increasing
        batch_size = embeds.size(0)
        sequence_len = embeds.size(1)
        next_embeds = torch.zeros(batch_size,sequence_len,self.hidden_dim).to(device)
        
        for layer in range(self.num_layers):
            for time_step in range(embeds.size(1)): # each time step
                input = embeds[:,time_step,:] #(batch_size, embd_dim) for layer = 0, (batch_size, hidden_dim) for layer>0
                hidden[layer] = nn.functional.tanh(self.input_W[layer](input)+self.hidden_U[layer](hidden[layer])) # (batch_size, hidden_dim)

                next_embeds[:,time_step,:].copy_(hidden[layer])
            embeds = next_embeds.clone().detach()
        del next_embeds
        #print(torch.cuda.memory_allocated(),torch.cuda.memory_reserved())

        # We need to only consider lstm_out[i,-1,:](last hidden) for ith input sentence
        # for prediction
        out = embeds[:,-1,:] # (batch_size, hidden_dim)
        out = self.dropout(out)
        out = self.fc(out)
        sig_out = self.sig(out)

        del out,embeds
        return sig_out,hidden
       
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors(cell_state, hidden_state) with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        hidden = weight.new(self.num_layers,batch_size, self.hidden_dim).zero_().to(device)
        
        return hidden
        

In [43]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        
        # embedding layer
        
        self.embedding = nn.Embedding(num_embeddings= vocab_size,
                                      embedding_dim= embedding_dim)
        
        self.linear_embedding = nn.Linear(vocab_size,embedding_dim)
        
        # num_layers indicate number of layers in multilayer RNN
        # For 2-layer RNN, just imagine output in the layer-1 is passed to the layer-2 as input
        self.lstm = nn.LSTM(input_size = embedding_dim,hidden_size=  hidden_dim,num_layers = n_layers,
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layer
        self.fc = nn.Linear(hidden_dim, output_size)
        self.sig = nn.Sigmoid()

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        # x.shape = (batch_size, sequence_length)
        batch_size = x.size(0)
        
        # embeddings and lstm_out
        embeds = self.embedding(x) # shape = (batch_size, sequence_lenth, embedding_dim)

        # Implementing embeddings with nn.linear
        # x_one_hot = nn.functional.one_hot(x, num_classes=self.vocab_size) #shape = (batch_size, seq_len,vocab_size)
        # x_one_hot = x_one_hot.float()
        # embeds = self.linear_embedding(x_one_hot.view(-1,self.vocab_size)) # shape = (batch_size*seq_len, embedding_dim)
        # embeds = embeds.view(batch_size,-1,self.embedding_dim)

        lstm_out, hidden = self.lstm(embeds, hidden)
        # lstm_out.shape = (batch_size, sequence_length, self.hidden_dim)
        # We need to only consider lstm_out[i,-1,:](last hidden) for ith input sentence
        # for prediction
        
        out = lstm_out[:,-1,:] # (batch_size, hidden_dim)
        out = self.dropout(out)
        out = self.fc(out)
        sig_out = self.sig(out)

        return sig_out,hidden
    
        # stack up lstm outputs
        #lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        # dropout and fully connected layer
        # out = self.dropout(lstm_out)

        
        # out = self.fc(out)
        
        # # sigmoid function
        # sig_out = self.sig(out)
        
    
        # # reshape to be batch_size first
        # sig_out = sig_out.view(batch_size, -1)
        # sig_out = sig_out[:, -1] # get last batch of labels
        
        # return last sigmoid output and hidden state
        # return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors(cell_state, hidden_state) with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                   weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        
        return hidden
        

In [44]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int) + 1 # +1 for zero padding + our word tokens
output_size = 1
embedding_dim = 400 
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

#net = Sentiment_Multilayer_Vanilla_RNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

net = Sentiment_Vanilla_RNN(vocab_size, output_size, embedding_dim, hidden_dim)
print(net.hidden_hidden_weight.requires_grad)
print(net.input_hidden_weight.requires_grad)
print(net.bias.requires_grad)


print(net)

True
True
True
Sentiment_Vanilla_RNN(
  (embedding): Embedding(74073, 400)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=1, bias=True)
  (sig): Sigmoid()
)


In [45]:
hidden = net.init_hidden(batch_size)
input, labels = next(iter(dataiter))

print(input.shape)
print(hidden.shape)

torch.Size([50, 200])
torch.Size([50, 256])


In [46]:
net = net.to(device)
pred, hidden = net(input.to(device),hidden)
print(pred.shape)
print(hidden.shape)

torch.Size([50, 1])
torch.Size([50, 256])


In [47]:
# Why doing hidden.data
pred, hidden = net(input.to(device),hidden)
print(1,torch.cuda.memory_allocated(),torch.cuda.memory_reserved())

#hidden = tuple([each.data for each in hidden])
hideen = hidden.data
pred, hidden = net(input.to(device),hidden)
print(2,torch.cuda.memory_allocated(),torch.cuda.memory_reserved())

#hidden = tuple([each.data for each in hidden])
hideen = hidden.data
pred, hidden = net(input.to(device),hidden)
print(3,torch.cuda.memory_allocated(),torch.cuda.memory_reserved())


1 0 2097152
2 0 2097152
3 0 2097152


In [48]:
# loss and optimization functions
lr=0.001

criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)


In [50]:
# training params

epochs = 3 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
net = net.to(device)
net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        inputs, labels = inputs.to(device), labels.to(device)

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        #h = tuple([each.data for each in h])
        #h = h.clone().detach()
        h = h.data

        # Move the hidden state back to GPU if needed
        #h = h.to(device)

        
        # zero accumulated gradients
        net.zero_grad()

        # get the output from the model
        output, h = net(inputs, h)

        # calculate the loss and perform backprop
        loss = criterion(output.squeeze(), labels.float())
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                #val_h = tuple([each.data for each in val_h])
                #val_h = val_h.clone().detach()  
                val_h = val_h.data

                # Move the hidden state back to GPU if needed
                #val_h = val_h.to(device)

                inputs, labels = inputs.to(device), labels.to(device)

                output, val_h = net(inputs, val_h)
                val_loss = criterion(output.squeeze(), labels.float())

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [53]:
# Get test data loss and accuracy

test_losses = [] # track loss
num_correct = 0

# init hidden state
h = net.init_hidden(batch_size)

net.eval()
# iterate over test data
for inputs, labels in test_loader:

    # Creating new variables for the hidden state, otherwise
    # we'd backprop through the entire training history
    h = tuple([each.data for each in h])

    inputs, labels = inputs.to(device), labels.to(device)
    
    # get predicted outputs
    output, h = net(inputs, h)
    
    # calculate loss
    test_loss = criterion(output.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())  # rounds to the nearest integer
    
    # compare predictions to true label
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    num_correct += np.sum(correct)


# -- stats! -- ##
# avg test loss
print("Test loss: {:.3f}".format(np.mean(test_losses)))

# accuracy over all test data
test_acc = num_correct/len(test_loader.dataset)
print("Test accuracy: {:.3f}".format(test_acc))

Test loss: 0.459
Test accuracy: 0.794


In [54]:
# positive test review
test_review_pos = 'This movie had the best acting and the dialogue was so good. I loved it.'

# negative test review
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'


In [55]:
from string import punctuation

def tokenize_review(test_review):
    test_review = test_review.lower() # lowercase
    # get rid of punctuatuon
    test_text = ''.join([c for c in test_review if c not in punctuation])
    
    # splitting by spaces
    test_words = test_text.split()
    
    # tokens
    test_ints = []
    test_ints.append([vocab_to_int[word] for word in test_words])
    
    return test_ints
  
# test code and generate tokenized review
test_ints = tokenize_review(test_review_neg)
print(test_ints)

[[1, 247, 18, 10, 28, 108, 113, 14, 388, 2, 10, 181, 60, 273, 144, 11, 18, 68, 76, 113, 2, 1, 410, 14, 539]]


In [56]:
# test sequence padding
seq_length = 200
features = pad_features(test_ints, seq_length)

print(features)

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   1 247  18  10  28
  108 113  14 388   2  10 181  60 273 144  11  18  68  76 113   2   1 410
   14 539]]


In [57]:
# test conversion to tensor and pass it to model
feature_tensor = torch.from_numpy(features)
print(feature_tensor.size())

torch.Size([1, 200])


In [58]:
def predict(net, test_review, sequence_length=200):
    ''' Prints out whether a give review is predicted to be 
        positive or negative in sentiment, using a trained model.
        
        params:
        net - A trained net 
        test_review - a review made of normal text and punctuation
        sequence_length - the padded length of a review
        '''
    
    net.eval()
    
    # tokenize review
    test_ints = tokenize_review(test_review)
    
    # It's not necessary to pad sequence with zeros
    # pad tokenize sequence
    #seq_length = sequence_length
    #features = pad_features(test_ints, seq_length)

    features = np.array(test_ints)
    
    # convert to tensor to pass to model
    feature_tensor = torch.from_numpy(features)
    
    batch_size = feature_tensor.size(0)
    
    # initialize hidden state
    h = net.init_hidden(batch_size)
    
    feature_tensor = feature_tensor.to(device)
      
    # get the output from the model
    output, h = net(feature_tensor, h)
    
    # convert output probabilities to predicted class (0 or 1)
    pred = torch.round(output.squeeze())
    # printing output value, before rounding
    print('Prediction value, pre-rounding: {:.6f}'.format(output.item()))
    
    # print custom response based on whether test_review is pos/neg
    if(pred.item()==1):
      print('Positive review detected!')
    else:
      print('Negative review detected!')
    
        

In [59]:
# positive test review
test_review_pos = 'This movie had the best acting and the dialogue was so good. I loved it.'

# negative test review
test_review_neg = 'The worst movie I have seen; acting was terrible and I want my money back. This movie had bad acting and the dialogue was slow.'


In [60]:
# call function
# try negative and positive reviews!
seq_length=200
predict(net, test_review_neg, seq_length)
predict(net, test_review_pos, seq_length)

Prediction value, pre-rounding: 0.009414
Negative review detected!
Prediction value, pre-rounding: 0.778411
Positive review detected!
