# Long Short-Term Memory (LSTM), Amazon Reviews

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

import bz2
from collections import Counter
import re
import nltk
import numpy as np
# nltk.download('punkt')

In [2]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

In [3]:
import warnings
warnings.filterwarnings('ignore')

## Load Datasets

In [4]:
train_file = bz2.BZ2File('./datasets/amazon-reviews/train.ft.txt.bz2')
test_file = bz2.BZ2File('./datasets/amazon-reviews/test.ft.txt.bz2')

In [5]:
train_file = train_file.readlines()
test_file = test_file.readlines()

In [6]:
print("Number of training reviews: " + str(len(train_file)))
print("Number of test reviews: " + str(len(test_file)))

Number of training reviews: 3600000
Number of test reviews: 400000


In [7]:
# limit datasets will be used for training the network
n_train = 100000 # try 800000 to get much better results
n_test = 20000 # try 200000 to get much better results

train_file = [x.decode('utf-8') for x in train_file[:n_train]]
test_file = [x.decode('utf-8') for x in test_file[:n_test]]

In [8]:
print(train_file[0])

__label__2 Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate vid. game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^



## Pre-process Datasets

In [9]:
# extracting labels from sentences
train_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in train_file]
train_sentences = [x.split(' ', 1)[1][:-1].lower() for x in train_file]

test_labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in test_file]
test_sentences = [x.split(' ', 1)[1][:-1].lower() for x in test_file]

In [10]:
# simply cleaning data
for i in range(len(train_sentences)):
    train_sentences[i] = re.sub('\d', '0', train_sentences[i])
    
for i in range(len(test_sentences)):
    test_sentences[i] = re.sub('\d', '0', test_sentences[i])

In [11]:
# modifying URLs to <url>
for i in range(len(train_sentences)):
    if 'www.' in train_sentences[i] or 'http:' in train_sentences[i] or 'https:' in train_sentences[i] or '.com' in train_sentences[i]:
        train_sentences[i] = re.sub(r'([^ ]+(?<=\.[a-z]{3}))', '<url>', train_sentences[i])
        
for i in range(len(test_sentences)):
    if 'www.' in test_sentences[i] or 'http:' in test_sentences[i] or 'https:' in test_sentences[i] or '.com' in test_sentences[i]:
        test_sentences[i] = re.sub(r'([^ ]+(?<=\.[a-z]{3}))', '<url>', test_sentences[i])

In [12]:
del train_file, test_file

## Tokenize Sentences

In [13]:
# create dictionary that will map a word to the number of times it appeared in all the training sentences
words = Counter()
for i, sentence in enumerate(train_sentences):
    # the sentences will be stored as a list of words/ tokens
    train_sentences[i] = []
    # tokenizing the words
    # task of splitting a sentence into individual tokens, which can be words or punctuation, etc.
    for word in nltk.word_tokenize(sentence):
        words.update([word.lower()])
        train_sentences[i].append(word)
    if i%20000 == 0:
        print(str((i*100)/ n_train) + '%.......................... successfully tokenized.')
print('100%.......................... successfully tokenized.')

0.0%.......................... successfully tokenized.
20.0%.......................... successfully tokenized.
40.0%.......................... successfully tokenized.
60.0%.......................... successfully tokenized.
80.0%.......................... successfully tokenized.
100%.......................... successfully tokenized.


## Remove Unnecessary Words

In [14]:
# removing the words that only appear once (typos, unexisting words)
words = {word:n for word, n in words.items() if n>1}

## Pad Sentences

In [15]:
# sorting the words according to the number of appearances, with the most common word being first
words = sorted(words, key=words.get, reverse=True)

# adding padding and unknown to our vocabulary so that they will be assigned an index
words = ['_PAD', '_UNK'] + words

## Convert Inputs

In [16]:
# dictionaries to store the word to index mappings and vice versa
word2idx = {word:idx for idx,word in enumerate(words)}
idx2word = {idx:word for idx,word in enumerate(words)}

In [17]:
for i, sentence in enumerate(train_sentences):
    # looking up the mapping dictionary and assigning the index to the respective words
    train_sentences[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]
    
for i, sentence in enumerate(test_sentences):
    # for test sentences, we have to tokenize the sentences as well
    test_sentences[i] = [word2idx[word.lower()] if word.lower() in word2idx else 0 for word in nltk.word_tokenize(sentence)]

## Repad Sentences

In [18]:
# defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len), dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

In [19]:
seq_len = 200 # the length that the sentences will be padded/ shortened to
train_sentences = pad_input(train_sentences, seq_len)
test_sentences = pad_input(test_sentences, seq_len)

In [20]:
# converting our labels into numpy array
train_labels = np.array(train_labels)
test_labels = np.array(test_labels)

## Split Datasets

In [21]:
split_frac = 0.5
split_id = int(split_frac * len(test_sentences))
valid_sentences, test_sentences = test_sentences[:split_id], test_sentences[split_id:]
valid_labels, test_labels = test_labels[:split_id], test_labels[split_id:]

## Set Dataloader

In [22]:
BATCH_SIZE = 16

In [23]:
is_cuda = torch.cuda.is_available()

if is_cuda: device = torch.device('cuda')
else: device = torch.device('cpu')

In [24]:
train_dataset = TensorDataset(torch.from_numpy(train_sentences), torch.from_numpy(train_labels))
valid_dataset = TensorDataset(torch.from_numpy(valid_sentences), torch.from_numpy(valid_labels))
test_dataset = TensorDataset(torch.from_numpy(test_sentences), torch.from_numpy(test_labels))

In [25]:
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(valid_dataset, shuffle=True, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, shuffle=True, batch_size=BATCH_SIZE)

In [26]:
dataiter = iter(train_loader)
sample_x, sample_y = dataiter.next()

print('Input shape:' + str(sample_x.shape))
print('Output shape:' + str(sample_y.shape))

Input shape:torch.Size([16, 200])
Output shape:torch.Size([16])


## Set Configs

In [27]:
vocab_size = len(word2idx) + 1
output_size = 1
embedding_dim = 400
hidden_dim = 512
n_layers = 2

LR = 0.005
N_EPOCHS = 2

## Build LSTM Network

<img src='images/lstm-architecture.png' width=50% />

In [28]:
class LSTM(nn.Module):
    
    def __init__(self, input_size, embedding_dim, hidden_dim, output_size, n_layers, drop_prob=0.5):
        super(LSTM, self).__init__()
        self.output_size = output_size
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        
        # optionally, we can use pre-trained word embeddings such as GloVe or fastText
        self.embedding_layer = nn.Embedding(vocab_size, embedding_dim)
        self.lstm_layer = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.sigmoid = nn.Sigmoid()
        self.fc_layer = nn.Linear(hidden_dim, output_size)
        
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(device))
        
        return hidden
    
    def forward(self, x, hidden):
        
        batch_size = x.size(0)
        
        x = x.long()
        embeds = self.embedding_layer(x)
        lstm_out, hidden = self.lstm_layer(embeds, hidden)
        lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        lstm_out = self.dropout(lstm_out)
        output = self.fc_layer(lstm_out)
        output = self.sigmoid(output)
        
        output = output.view(batch_size, -1)
        output = output[:,-1]
        
        return output, hidden

#### Initialize LSTM Network with hyper-parameters

In [29]:
lstm = LSTM(vocab_size, embedding_dim, hidden_dim, output_size, n_layers)
lstm.to(device)

LSTM(
  (embedding_layer): Embedding(62615, 400)
  (lstm_layer): LSTM(400, 512, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
  (sigmoid): Sigmoid()
  (fc_layer): Linear(in_features=512, out_features=1, bias=True)
)

## Set Loss Function

In [30]:
bce_Loss = nn.BCELoss()

## Set Optimizers

In [31]:
optimizer = torch.optim.Adam(lstm.parameters(), lr=LR)

## Train LSTM Network

In [32]:
print_every = 1000
clip = 5
valid_loss_min = np.Inf

lstm.train()
for epoch in range(N_EPOCHS):
    
    counter = 0
    
    hidden = lstm.init_hidden(BATCH_SIZE)
    
    for inputs, labels in train_loader:
        counter += 1
        hidden = tuple([e.data for e in hidden])
        inputs, labels = inputs.to(device), labels.to(device)
        
        optimizer.zero_grad()
        lstm_out, hidden = lstm(inputs.float(), hidden)
        loss = bce_Loss(lstm_out.squeeze(), labels.float())
        
        loss.backward()
        nn.utils.clip_grad_norm_(lstm.parameters(), clip)
        optimizer.step()
        
        if counter%print_every == 0:
            valid_hidden = lstm.init_hidden(BATCH_SIZE)
            valid_losses = []
            
            lstm.eval()
            for inputs, labels in valid_loader:
                hidden = tuple([e.data for e in valid_hidden])
                inputs, labels = inputs.to(device), labels.to(device)
                lstm_out, hidden = lstm(inputs, hidden)
                valid_loss = bce_Loss(lstm_out.squeeze(), labels.float())
                valid_losses.append(valid_loss.item())
                
            lstm.train()
            print('Epoch: {}...'.format(epoch+1),
                  'Step: {}/{}...'.format(counter, len(train_loader)),
                  'Loss: {:.6f}...'.format(loss.item()),
                  'Valid Loss: {:.6f}'.format(np.mean(valid_losses)))
            
            if np.mean(valid_losses) <= valid_loss_min:
                torch.save(lstm.state_dict(), './weights/lstm_epoch{}_loss{}'.format(epoch+1, np.mean(valid_losses)))
                print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model...'.format(valid_loss_min, np.mean(valid_losses)))
                valid_loss_min = np.mean(valid_losses)

Epoch: 1... Step: 1000/6250... Loss: 0.157439... Valid Loss: 0.315429
Validation loss decreased (inf --> 0.315429). Saving model...
Epoch: 1... Step: 2000/6250... Loss: 0.056990... Valid Loss: 0.299483
Validation loss decreased (0.315429 --> 0.299483). Saving model...
Epoch: 1... Step: 3000/6250... Loss: 0.354859... Valid Loss: 0.274866
Validation loss decreased (0.299483 --> 0.274866). Saving model...
Epoch: 1... Step: 4000/6250... Loss: 0.356864... Valid Loss: 0.275680
Epoch: 1... Step: 5000/6250... Loss: 0.243332... Valid Loss: 0.316531
Epoch: 1... Step: 6000/6250... Loss: 0.657070... Valid Loss: 0.275620
Epoch: 2... Step: 1000/6250... Loss: 0.149035... Valid Loss: 0.279540
Epoch: 2... Step: 2000/6250... Loss: 0.526219... Valid Loss: 0.264183
Validation loss decreased (0.274866 --> 0.264183). Saving model...
Epoch: 2... Step: 3000/6250... Loss: 0.034330... Valid Loss: 0.267391
Epoch: 2... Step: 4000/6250... Loss: 1.013978... Valid Loss: 0.361436
Epoch: 2... Step: 5000/6250... Loss: 

## Test LSTM Network

In [36]:
lstm.load_state_dict(torch.load('./weights/lstm_epoch2_loss0.2585601684451103'))

<All keys matched successfully>

In [37]:
n_correct = 0

test_hidden = lstm.init_hidden(BATCH_SIZE)
test_losses = []

lstm.eval()
for inputs, labels in test_loader:
    hidden = tuple([e.data for e in test_hidden])
    inputs, labels = inputs.to(device), labels.to(device)
    lstm_out, hidden = lstm(inputs.float(), hidden)
    test_loss = bce_Loss(lstm_out.squeeze(), labels.float())
    test_losses.append(test_loss.item())
    
    pred = torch.round(lstm_out.squeeze())
    correct_tensor = pred.eq(labels.float().view_as(pred))
    correct = np.squeeze(correct_tensor.cpu().numpy())
    n_correct += np.sum(correct)

In [38]:
print('Test Loss: {:.3f}'.format(np.mean(test_losses)))
test_accuracy = n_correct/ len(test_loader.dataset)
print('Test Accuracy: {:.3f}%'.format(test_accuracy*100))

Test Loss: 0.261
Test Accuracy: 89.570%


---