In [None]:
# keras.datasets.imdb is broken in TensorFlow 1.13 and 1.14 due to numpy 1.16.3,
# so numpy is pinned to 1.16.2 before anything imports it.
!pip install numpy==1.16.2
# gputil, psutil and humanize are imported in the model-definition cell, next to
# the (currently commented-out) mem_report helper.
!pip install gputil
!pip install psutil
!pip install humanize

import os
# os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
import tensorflow as tf 
import numpy as np
from tensorflow.keras.preprocessing import sequence
from numpy import array

# Load the Drive helper and mount Google Drive into the Colab filesystem
from google.colab import drive
drive.mount('/content/drive')

# Working folder on Drive; the GloVe file, the embedding cache and the exported
# model are all read from / written to this folder by later cells.
SRC_DIR = "/content/drive/My Drive/FIT/ZPJa"

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race
os.makedirs(SRC_DIR, exist_ok=True)

# Suppress deprecation warnings.
# NOTE: disabling the logger silences every record emitted through the
# 'tensorflow' logger, not only deprecation warnings.
import logging
logging.getLogger('tensorflow').disabled = True

# Fetch "IMDB Movie Review" data, constraining our reviews to
# the 10000 most commonly used words. vocab_size is reused later to
# cut the embedding matrix down to the same number of rows.
vocab_size = 10000
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.imdb.load_data(num_words=vocab_size)

# Map for readable classnames: the list is indexed by the integer label
# stored in y_train / y_test.
class_names = ["Negative", "Positive"]

Download GloVe embeddings (if not already available on Google Drive)

In [None]:
glove_download = True
if (glove_download):
  !wget http://nlp.stanford.edu/data/glove.6B.zip
  !unzip glove*.zip

Get IMDB dataset word index

In [None]:
def print_first_few_elements(dictionary: dict):
  """Print the first ten key/value pairs of `dictionary`, one pair per line.

  After the tenth pair a line of ten dashes is printed and the function
  stops. A dictionary with fewer than ten entries is printed in full and
  no separator line follows.
  """
  for position, (key, value) in enumerate(dictionary.items(), start=1):
    print(key, value)
    if position == 10:
      print('-' * 10)
      break

# Word -> integer index mapping that ships with the IMDB dataset
word_index = tf.keras.datasets.imdb.get_word_index()

# Shift every index up by 3 so that ids 0-3 are free, then register the
# four special tokens under human-readable names.
word_index = {word: rank + 3 for word, rank in word_index.items()}
word_index.update({
    "<PAD>": 0,
    "<START>": 1,
    "<UNKNOWN>": 2,
    "<UNUSED>": 3,
})

Parse the pretrained GloVe 50d embeddings into a dictionary (unless it was already created in a previous run)


In [None]:
import json

# NOTE(review): the listing is discarded (it is not the cell's last expression);
# kept only in case it is relied on to touch the Drive mount - confirm and remove.
os.listdir(SRC_DIR)

# JSON cache of the parsed embeddings, and the raw GloVe text file it is built from
EMBED_DICT_TEMP_FILE = os.path.join(SRC_DIR, 'embedding_dict_temp_400k.tmp')
GLOVE_EMBEDDINGS_PATH = os.path.join(SRC_DIR, 'glove.6B.50d.txt')

if os.path.isfile(EMBED_DICT_TEMP_FILE):
  # Fast path: reuse the dictionary parsed in a previous run
  with open(EMBED_DICT_TEMP_FILE, encoding="utf-8") as json_file:
    embed_dict = json.load(json_file)
else:
  print("Creating temporary embedding dictionary...")
  embed_dict = {}
  # Stream the file line by line instead of loading it whole with readlines();
  # the encoding is explicit so parsing does not depend on the platform default.
  with open(GLOVE_EMBEDDINGS_PATH, "r", encoding="utf-8") as f:
    for line in f:
      line_tokens = line.split()
      if not line_tokens:
        continue  # tolerate blank lines
      # First token is the word, the remaining tokens are its vector components
      embed_dict[line_tokens[0]] = [float(embed_val) for embed_val in line_tokens[1:]]

  # Write to a side file and rename it into place, so an interrupted run
  # cannot leave a truncated cache that the next run would fail to load.
  partial_path = EMBED_DICT_TEMP_FILE + '.partial'
  with open(partial_path, 'w', encoding="utf-8") as out_f:
    json.dump(embed_dict, out_f)
  os.replace(partial_path, EMBED_DICT_TEMP_FILE)

How many words do our reviews contain? And what do our reviews look like in machine- and human-readable form?

In [None]:
# Human-readable name for each sentiment label
class_names = ["Negative", "Positive"]

# Pool the training and test reviews so the statistics cover the whole dataset
allreviews = np.concatenate((x_train, x_test), axis=0)

# Length of every review, computed once and reused for all three statistics
result = [len(review) for review in allreviews]
print("Maximum review length: {}".format(max(result)))
print("Minimum review length: {}".format(min(result)))
print("Mean review length: {}".format(np.mean(result)))

# Show one review and its class exactly as stored in the dataset.
# Change the index to inspect a different review.
print()
print("Machine readable Review")
print("  Review Text: ", x_train[159], sep="")
print("  Review Sentiment: ", y_train[159], sep="")

# Inverse of word_index: integer id -> word
reverse_word_index = {index: word for word, index in word_index.items()}

def decode_review(text):
    """Turn a sequence of word ids into a space-separated string.

    Ids that have no entry in reverse_word_index are rendered as '?'.
    """
    words = (reverse_word_index.get(token, '?') for token in text)
    return ' '.join(words)

# Show one review and its class in human-readable form.
# Change the index to inspect a different review.
print()
print("Human Readable Review")
print("  Review Text: ", decode_review(x_train[23555]), sep="")
print("  Review Sentiment: ", class_names[y_train[23555]], sep="")

We need to make sure that our reviews are of a uniform length. Some reviews will need to be truncated, while others need to be padded.

In [None]:
# Every review is forced to this many tokens (arbitrary choice)
review_length = 300

# Pad the short reviews and truncate the long ones to review_length
x_train = sequence.pad_sequences(x_train, maxlen=review_length)
x_test = sequence.pad_sequences(x_test, maxlen=review_length)

# Sanity-check the dataset shapes
print("Shape Training Review Data: ", x_train.shape, sep="")
print("Shape Training Class Data: ", y_train.shape, sep="")
print("Shape Test Review Data: ", x_test.shape, sep="")
print("Shape Test Class Data: ", y_test.shape, sep="")

# With the default arguments the padding goes in front of the review, not after it
print()
print("Human Readable Review Text (post padding): ", decode_review(x_train[60]), sep="")

Let's initialize the embeddings for 'unknown' words 

In [None]:
# 50 whitespace-separated components of the vector used for words that have
# no pretrained embedding
UNK_EMBEDDING = '-0.12920076 -0.28866628 -0.01224866 -0.05676644 -0.20210965 -0.08389011 \
0.33359843  0.16045167  0.03867431  0.17833012  0.04696583 -0.00285802 \
0.29099807  0.04613704 -0.20923874 -0.06613114 -0.06822549  0.07665912 \
0.3134014   0.17848536 -0.1225775  -0.09916984 -0.07495987  0.06413227 \
0.14441176  0.60894334  0.17463093  0.05335403 -0.01273871  0.03474107 \
-0.8123879  -0.04688699  0.20193407  0.2031118  -0.03935686  0.06967544 \
-0.01553638 -0.03405238 -0.06528071  0.12250231  0.13991883 -0.17446303 \
-0.08011883  0.0849521  -0.01041659 -0.13705009  0.20127155  0.10069408 \
0.00653003  0.01685157'

# split() with no argument collapses the runs of spaces, so no empty tokens remain
unk_embedding = [float(component) for component in UNK_EMBEDDING.split()]

# Padding is represented by the all-zero vector of the same width
pad_embedding = [0.0] * 50

Now let's build a matrix of weights (pretrained embeddings) that will be loaded into the PyTorch embedding layer. Its shape will be equal to (vocab_size, 50):

In [None]:
# Only the <vocab_size> most frequently occurring words are kept, so the matrix
# is allocated at its final size instead of being built for the whole word
# index and sliced afterwards.
num_rows = min(vocab_size, len(word_index))
vocab_embedded = np.zeros((num_rows, len(unk_embedding)))

# Number of kept words that have a pretrained GloVe vector
words_found = 0

for word, index in word_index.items():
  if index >= num_rows:
    continue  # outside the kept vocabulary
  if word in ('<START>', '<PAD>'):
    vocab_embedded[index] = pad_embedding
  elif word in ('<UNKNOWN>', '<UNUSED>'):
    vocab_embedded[index] = unk_embedding
  elif word in embed_dict:
    vocab_embedded[index] = embed_dict[word]
    words_found += 1
  else:
    # No pretrained vector for this word: fall back to the 'unknown' embedding
    vocab_embedded[index] = unk_embedding

print('Pretrained embeddings found for {} of {} vocabulary entries'.format(words_found, num_rows))

Let's define the CustomLSTM class and implement its *forward* method, together with a hand-written fully connected readout layer

In [None]:
from torch._C import dtype
import torch, math
import torch.nn as nn
import os,sys,humanize,psutil,GPUtil

# def mem_report():
#   print("CPU RAM Free: " + humanize.naturalsize( psutil.virtual_memory().available ))
  
#   GPUs = GPUtil.getGPUs()
#   for i, gpu in enumerate(GPUs):
#     print('GPU {:d} ... Mem Free: {:.0f}MB / {:.0f}MB | Utilization {:3.0f}%'.format(i, gpu.memoryFree, gpu.memoryTotal, gpu.memoryUtil*100))

class CustomLSTM(nn.Module):
    """Single-layer LSTM with every gate written out explicitly.

    Each gate owns an input weight ``U_*`` of shape (size_in, size_hidden),
    a recurrent weight ``V_*`` of shape (size_hidden, size_hidden) and a bias
    ``bias_*`` of shape (size_hidden,). Weights are Xavier-normal initialised,
    biases start at zero.
    """

    def __init__(self, size_in: int, size_hidden: int):
        super().__init__()
        self.size_in = size_in
        self.size_hidden = size_hidden

        # NOTE: parameters are registered in a fixed order (U, V, bias for
        # f, i, o, c). The initialisation loop below draws random numbers in
        # that order, so keep it to get the same weights for a given seed.

        # Forget gate: f_t = sigmoid(x_t U_f + h_{t-1} V_f + b_f)
        self.U_f = nn.Parameter(torch.empty(size_in, size_hidden))
        self.V_f = nn.Parameter(torch.empty(size_hidden, size_hidden))
        self.bias_f = nn.Parameter(torch.empty(size_hidden))

        # Input gate: i_t = sigmoid(x_t U_i + h_{t-1} V_i + b_i)
        self.U_i = nn.Parameter(torch.empty(size_in, size_hidden))
        self.V_i = nn.Parameter(torch.empty(size_hidden, size_hidden))
        self.bias_i = nn.Parameter(torch.empty(size_hidden))

        # Output gate: o_t = sigmoid(x_t U_o + h_{t-1} V_o + b_o)
        self.U_o = nn.Parameter(torch.empty(size_in, size_hidden))
        self.V_o = nn.Parameter(torch.empty(size_hidden, size_hidden))
        self.bias_o = nn.Parameter(torch.empty(size_hidden))

        # Candidate for the long-term memory: g_t = tanh(x_t U_c + h_{t-1} V_c + b_c)
        self.U_c = nn.Parameter(torch.empty(size_in, size_hidden))
        self.V_c = nn.Parameter(torch.empty(size_hidden, size_hidden))
        self.bias_c = nn.Parameter(torch.empty(size_hidden))

        for name, param in self.named_parameters():
            if 'bias' in name:
                nn.init.constant_(param, 0.0)
            else:
                nn.init.xavier_normal_(param)

    def forward(self, x, init_states=None):
        """Run the LSTM over a batch of sequences.

        Per time step:
            c_t = f_t * c_{t-1} + i_t * g_t
            h_t = o_t * tanh(c_t)

        :param x: input of shape (batch_size, seq_size, size_in)
        :param init_states: optional (h_0, c_0) tuple, each of shape
            (batch_size, size_hidden); zeros are used when it is omitted
        :return: ``(h_last, (h_last, c_last))`` - the hidden state after the
            final time step, followed by the final (hidden, cell) state pair.
            For an empty sequence the initial states are returned unchanged.
        """
        batch_size, seq_size, _ = x.size()

        # If h_t and c_t are not provided, start from zeros that match the
        # input's device and dtype (so the layer also works after .double()).
        if not init_states:
            h_t = x.new_zeros(batch_size, self.size_hidden)
            c_t = x.new_zeros(batch_size, self.size_hidden)
        else:
            h_t, c_t = init_states

        # Walk over the time axis; every step handles the whole batch at once
        for t in range(seq_size):
            x_t = x[:, t, :]

            f_t = torch.sigmoid(x_t @ self.U_f + h_t @ self.V_f + self.bias_f)
            i_t = torch.sigmoid(x_t @ self.U_i + h_t @ self.V_i + self.bias_i)
            o_t = torch.sigmoid(x_t @ self.U_o + h_t @ self.V_o + self.bias_o)
            g_t = torch.tanh(x_t @ self.U_c + h_t @ self.V_c + self.bias_c)
            c_t = f_t * c_t + i_t * g_t
            h_t = o_t * torch.tanh(c_t)

        # Only the last hidden state is returned, so the intermediate states
        # are not collected into a (batch, seq, hidden) tensor.
        return h_t, (h_t, c_t)

class MyFullyConnected(nn.Module):
  """Hand-written affine layer computing y = x W^T + b.

  Weight (dim_out, dim_in) and bias (dim_out,) are drawn with numpy from
  U(-k, k), k = sqrt(1 / dim_in), mirroring the initialisation described for
  torch.nn.Linear: https://pytorch.org/docs/stable/generated/torch.nn.Linear.html
  """

  def __init__(self, dim_in, dim_out):
    super().__init__()

    bound = math.sqrt(1.0 / dim_in)
    # The weight is sampled before the bias so the numpy random stream is
    # consumed in a fixed order.
    self.weight = nn.Parameter(self._sample_uniform((dim_out, dim_in), bound))
    self.bias = nn.Parameter(self._sample_uniform(dim_out, bound))

  @staticmethod
  def _sample_uniform(shape, bound):
    """Return a float32 tensor of the given shape sampled from U(-bound, bound)."""
    samples = np.random.uniform(low=-bound, high=bound, size=shape)
    return torch.from_numpy(samples).float()

  def forward(self, x):
    """
    @param x input of shape (batch_size, dim_in); in this notebook it is the last LSTM hidden state
    @returns y = xw^T + b, of shape (batch_size, dim_out)
    """
    return x @ self.weight.t() + self.bias


Create our NN model including the **CustomLSTM layer** and the **pretrained 50d GloVe embeddings**

In [None]:
# Fix the random seeds and ask cuDNN for deterministic behaviour so that
# repeated runs are comparable
np.random.seed(0)
torch.manual_seed(0)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Use the first CUDA device when one is available, otherwise the CPU
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0") if use_cuda else torch.device("cpu")
print(device)


class MyLSTMNet(nn.Module):
  """Sentiment classifier: embedding lookup -> CustomLSTM -> two-way linear readout.

  The embedding layer takes its size from the module-level `vocab_embedded`
  matrix and is initialised with that matrix's values.
  """

  def __init__(self):
    super().__init__()

    vocab_rows, embed_width = vocab_embedded.shape
    print('MyLSTMNet - count of total word embeddings in the embedding layer: ', vocab_rows)
    self.embedding = nn.Embedding(vocab_rows, embed_width)

    # Overwrite the random initialisation with the pretrained vectors.
    # (Comment out the next three lines if pretrained embeddings are not preferred.)
    pretrained = torch.tensor(vocab_embedded, dtype=torch.float)
    with torch.no_grad():
      self.embedding.weight.copy_(pretrained)

    print('MyLSTMNet - dimension of the word embeddings: ', embed_width)
    # The hidden size is set equal to the embedding width
    self.lstm = CustomLSTM(embed_width, embed_width)

    # Readout layer: one output per sentiment class
    self.fully_connected = MyFullyConnected(embed_width, 2)

  def forward(self, x):
    """Map a batch of word-id sequences to per-class scores."""
    last_hidden, _ = self.lstm(self.embedding(x))
    return self.fully_connected(last_hidden)

Instantiate the MyLSTMNet model and run training:

In [None]:
from tqdm.notebook import tqdm

net = MyLSTMNet()
net.to(device)

optimizer = torch.optim.Adam(net.parameters(), lr=0.005)
criterion = nn.CrossEntropyLoss()

# Set up training params
MAX_EPOCHS = 10
TRAIN_BATCH_SIZE = 64
TEST_BATCH_SIZE = 128
# A full pass over the test set is run after every EVAL_EVERY training batches.
# NOTE: this dominates the running time; raise the value for faster epochs.
EVAL_EVERY = 10

x_train_tensor = torch.tensor(x_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)

print('Count of reviews in the training data: ', x_train_tensor.size()[0])

train_params = {'batch_size': TRAIN_BATCH_SIZE,
            'shuffle': True}

ds_train = torch.utils.data.TensorDataset(x_train_tensor, y_train_tensor)
train_loader = torch.utils.data.DataLoader(ds_train, **train_params)


test_params = {'batch_size': TEST_BATCH_SIZE,
            'shuffle': True}

x_test_tensor = torch.tensor(x_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

print('Count of reviews in the testing data: ', x_test_tensor.size()[0])

ds_test = torch.utils.data.TensorDataset(x_test_tensor, y_test_tensor)
test_loader = torch.utils.data.DataLoader(ds_test, **test_params)

print('Number of iterations within one training epoch: {} (batch size {})'.format(len(train_loader), TRAIN_BATCH_SIZE))
print('Number of iterations within one testing pass: {} (batch size {})'.format(len(test_loader), TEST_BATCH_SIZE))
print()


def evaluate_accuracy(model, loader):
  """Return the fraction of samples in `loader` that `model` classifies correctly.

  The denominator is the number of samples actually seen in `loader`, so the
  result stays correct when the evaluated set and the training set differ in size.
  Returns 0.0 for an empty loader.
  """
  correct = 0
  seen = 0
  with torch.no_grad():
    for data, labels in loader:
      predictions = model(data.to(device))
      correct += (predictions.argmax(dim=1) == labels.to(device)).sum().item()
      seen += labels.size(0)
  return correct / seen if seen else 0.0


epoch_bar = tqdm(range(MAX_EPOCHS), desc="LSTM training", position=0, total=MAX_EPOCHS)
accuracy = 0

# Iterating a tqdm object advances its bar, so no manual update() calls are
# made (an extra update() would count every epoch twice).
for epoch in epoch_bar:
  batch_bar = tqdm(train_loader, desc="Epoch: {}".format(str(epoch)), position=1, total=len(train_loader))

  for step, (reviews, sentiments) in enumerate(batch_bar, start=1):
    reviews = reviews.long().to(device)
    y_pred = net(reviews)
    sentiments_gpu = sentiments.to(device)
    loss = criterion(y_pred, sentiments_gpu)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    # Refresh the test accuracy periodically; the last value is shown in between
    if step % EVAL_EVERY == 0:
      accuracy = evaluate_accuracy(net, test_loader)

    batch_bar.set_postfix(loss=loss.item(), accuracy="{:.2f}".format(accuracy), epoch=epoch)

  epoch_bar.set_postfix(loss=loss.item(), accuracy="{:.2f}".format(accuracy), epoch=epoch)

torch.save(net.state_dict(), os.path.join(SRC_DIR, 'exported_model'))