# LSTM for text prediction

## Importing the libraries

In [279]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader
import os
from torch.utils.tensorboard import SummaryWriter
# from torchtext.data.utils import get_tokenizer
# from torchtext.vocab import build_vocab_from_iterator
# import torchtext.transforms as T




# Tensorboard
tensorboard is a visualization tool that can be used to visualize the training process of a deep learning model. The torch.utils.tensorboard.SummaryWriter class is used to write the logs to the tensorboard. The SummaryWriter class takes the log directory as input. The logs are written to the log directory in the form of event files. The event files can be visualized using the tensorboard web interface.

In [182]:
#%tensorboard --logdir runs
log_dir = os.path.join(os.getcwd(), "runs")
os.makedirs(log_dir, exist_ok=True)



# Data Preprocessing
we will train the LSTM model on the Penn  Treebank dataset. The Penn Treebank dataset is a dataset of cleaned and annotated English text. The data is split into training, validation, and testing sets.

## load train, test, and validation data

In [307]:
train_data_raw = open('data/ptb.train.txt', 'r').read()
test_data_raw = open('data/ptb.test.txt', 'r').read()
valid_data_raw = open('data/ptb.valid.txt', 'r').read()
data =  train_data_raw + ' ' + test_data_raw + ' ' + valid_data_raw



TypeError: bad operand type for unary +: 'str'

### sample from the data
 let's see what are the most common words in the data

In [280]:
from collections import Counter

leaderboard = Counter(data.split()).most_common(10)
i = 0
for word, freq in leaderboard:
    i+=1
    print(f'{i}.{word}: appears {freq} times')

1.the: appears 59421 times
2.<unk>: appears 53299 times
3.N: appears 37607 times
4.of: appears 28427 times
5.to: appears 27430 times
6.a: appears 24755 times
7.in: appears 21032 times
8.and: appears 20404 times
9.'s: appears 11555 times
10.for: appears 10436 times


## Tokenizing the data

In [281]:
# Tokenize the data
def tokenize(text):
    return text.replace('\n', '<eos>').split()


### create a vocabulary of words


In [204]:
def build_vocab(text):
    tokens = tokenize(text)
    counter = Counter(tokens)
    vocab = sorted(counter, key=counter.get, reverse=True)

    return vocab


vocab = build_vocab(data)
vocab_size = len(vocab)



print(f'vocab size: {vocab_size}')

vocab size: 9999


### decode and encode the words
 let's create a function that converts a word to token index and vice versa: the function stoi converts a word to a token index and the function itos converts a token index to a word

In [297]:

# decode the token i to a word S
def itos(i):
    return list(vocab.keys())[i-1]

# encode the word S to a token index i
def stoi(s):
    return vocab[s] if s in vocab else vocab['<unk>']

In [299]:
print(stoi('would'))
print(itos(42))

42
would


In [308]:
train_data = [stoi(word) for word in train_data_raw.split()]
valid_data = [stoi(word) for word in valid_data_raw.split()]
test_data = [stoi(word) for word in test_data_raw.split()]

## build a dataset and dataloader

In [340]:
batch_size = 20
seq_length = 32

In [341]:
class PTBDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) // self.seq_length

    def __getitem__(self, idx):
        x = self.data[idx * self.seq_length: (idx + 1) * self.seq_length]
        y = self.data[idx * self.seq_length + 1: (idx + 1) * self.seq_length + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


# Create datasets
train_dataset = PTBDataset(train_data, seq_length)
valid_dataset = PTBDataset(valid_data, seq_length)
test_dataset = PTBDataset(test_data, seq_length)

# Create dataloaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

test_data_iter = iter(test_loader)
train_data_iter = iter(train_loader)
valid_data_iter = iter(valid_loader)

### display a batch of data
note that y is exactly x shifted by one position. meaning that $y_i = x_{i+1} = LSTM(x[0:i-1])$

In [317]:
x, y = next(train_data_iter)
print(f'x: {x.size()}, y: {y.size()}')
print(f'x:{" ".join([itos(i) for i in x[0]])}')
print(f'y:{" ".join([itos(i) for i in y[0]])}')

x: torch.Size([20, 32]), y: torch.Size([20, 32])
x:been among its leading parsow sen. pete parsow r. n.m. the ranking republican on the senate budget committee used his influence to preserve more than $ N in subsidies for air service
y:among its leading parsow sen. pete parsow r. n.m. the ranking republican on the senate budget committee used his influence to preserve more than $ N in subsidies for air service to


# Lstm model Architecture

In [373]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Cell(nn.Module):
    def __init__(self,
                 input_size,
                 hidden_size=200,
                 num_layers=1,
                 ):
        """
        :param input_size:
        :param hidden_size:
        :param num_layers:
        :param num_classes:
        """
        super(LSTM_Cell, self).__init__()

        # size of the hidden state
        self.hidden_size = hidden_size


        # LSTM gates
        # Forget gate
        self.f_gate = nn.Sequential(
            nn.Linear(input_size + hidden_size, hidden_size),
            nn.Sigmoid() )

        # Candidate gate(input modulation in the original paper)
        self.g_gate = nn.Sequential(
            nn.Linear(input_size + hidden_size, hidden_size),
            nn.Tanh())

        # Input gate
        self.i_gate = nn.Sequential(
            nn.Linear(input_size + hidden_size, hidden_size),
            nn.Sigmoid())

        # Output gate
        self.o_gate = nn.Sequential(
            nn.Linear(input_size + hidden_size,hidden_size),
            nn.Sigmoid())
        self.tanh = nn.Tanh()

    def forward(self,x,h,c):
        """
        :param x: input tensor
        :param h: previous hidden state
        :param c: previous cell state
        :return: (h,c) tuple of new cell state and new hidden state
        """
        print(f"x: {x.size()}, h: {h.size()}")
        # Concatenate input and hidden state
        x_h = torch.cat((x,h),dim=1)
        print(f"x_h: {x_h.size()}")

        # Forget
        f = self.f_gate(x_h)
        g = self.g_gate(x_h)
        i = self.i_gate(x_h)
        o = self.o_gate(x_h)

        # update c
        c = c * f + (g*i)
        # THEN, update h
        h = self.tanh(c) * o

        return h,c

class LSTM(nn.Module):
    def __init__(self,
                 input_size=200,
                 hidden_size=32,
                 num_layers=1,
                 vocab_size=vocab_size):
        """
        :param input_size:
        :param hidden_size:
        :param num_layers:
        """
        super(LSTM, self).__init__()

        self.embedding = nn.Embedding(vocab_size, input_size)

        self.LSTM_Cell = LSTM_Cell(input_size, hidden_size, num_layers)
        self.hidden_size = hidden_size
        self.fc = nn.Linear(hidden_size, vocab_size)


    def forward(self,x):
            """
            :param x: input tensor
            :return: (c,h) tuple of new cell state and new hidden state
            """
            batch_size = x.size(0)
            h = [torch.zeros(batch_size,self.hidden_size)]
            outputs = []
            # store the initial hidden and cell states
            ht =torch.zeros(batch_size, self.hidden_size) # store the hidden states (output)
            c = torch.zeros(batch_size,self.hidden_size)

            # store the hidden states (output)

            for i in range(x.size(1)):
                xi = self.embedding(x[:,i])
                print(f"xi: {xi.size()}")

                ht,c = self.LSTM_Cell(xi,ht,c)
                h.append(ht)
                print("done")

                outputs.append(self.fc(h[-1]))

            return torch.stack(outputs, dim=0)


In [374]:

tb_writer = SummaryWriter('runs/LSTM')
lstm_cell = LSTM_Cell(32, 32, 1)
# visualize the model in our tensorboard summary
tb_writer.add_graph(lstm_cell, [torch.rand(32),torch.rand(32),torch.rand(32)])
tb_writer.flush()

x: torch.Size([32]), h: torch.Size([32])


IndexError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

In [377]:
def evaluate(model,
             val_loader,
             criterion = nn.CrossEntropyLoss(),
             seq_length = 32):
    """
    evaluate the model on the validation set
    :param model: model to evaluate
    :param val_loader: validation dataset loader
    :return: (accuracy, loss)
    """
    model.eval()
    running_loss = 0.0
    running_acc = 0
    total = 0
    with torch.no_grad():
        for i, data in enumerate(val_loader, 0):
            inputs, targets = data

            outputs = model(inputs)

            outputs = outputs.view(-1, vocab_size)
            targets = targets.view(-1)

            print(inputs.size())
            print(outputs.size())

            loss = criterion(outputs, targets)

            running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            running_acc += (predicted == targets).sum().item()
            total += targets.size(0)
        avg_loss = running_loss/ len(val_loader)
        perplexity = np.exp(avg_loss)
    return perplexity, loss

# Example

In [378]:
x,y = next(train_data_iter)
print(x.size())
lstm = LSTM(200, 200, 1)

evaluate(lstm, valid_loader)



torch.Size([20, 32])
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
done
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
done
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
done
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
done
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
done
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
done
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
done
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
done
xi: torch.Size([20, 200])
x: torch.Size([20, 200]), h: torch.Size([20, 200])
x_h: torch.Size([20, 400])
don

(9999.38765169298, tensor(9.2071))

In [254]:
from datetime import datetime
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)
epochs = 10

def train(model,
          train_loader=train_loader,
          val_loader=valid_loader,
          criterion = criterion,
          optimizer = optimizer,
          epochs=epochs,
          session = None,
          ):
    #create /models directory IF it does not exist
    if not os.path.exists('/models'):
        # Create the directory
        os.makedirs('/models')

    # track with tensorboard
    session = session or 'LSTM'+datetime.now().strftime('%m-%d-%H-%M')
    #tb_writer = SummaryWriter(f'runs/{session}')
    run_dir = f'{log_dir}/{session}'
    print(run_dir)
    tb_writer = SummaryWriter(run_dir)
    tb_writer.flush()

    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        running_corrects = 0
        train_total = 0
        val_loss_min = np.Inf
        # set the model to train mode
        model.train(True)

        for i, data in enumerate(train_loader, 0):

            # get the input image and labels
            inputs, labels = data

            # start with zero gradients
            optimizer.zero_grad()
            # forward pass
            outputs = model(inputs)
            # calculate the loss
            loss = criterion(outputs, labels)
            loss.backward()
            # update the weights
            optimizer.step()
            running_loss += loss.item()
            _, preds = torch.max(outputs, 1)

            running_corrects += (preds == labels).sum().item()
            train_total += labels.size(0)

            if i % 100 == 99:
                # print gradient statistics
                for name, param in model.named_parameters():
                    tb_writer.add_histogram(name, param.grad, epoch * len(train_loader) +  i)
                # loss of current batch
                avg_train_loss = running_loss / 100
                tb_writer.add_scalar('training loss', avg_train_loss, epoch * len(train_loader) +  i)

                #print(f'[{epoch + 1}, {i + 1}] loss: {avg_train_loss}')
                running_loss = 0.0
        """-----------------
        per epoch evaluation
        -----------------"""
        # set the model to evaluation mode
        model.eval()

        #accuracy
        train_accuracy = running_corrects / train_total
        # validation
        val_acc, val_loss = evaluate(model, val_loader)

        #val_accuracy = (val_preds == val_labels).sum().item() / len(val_loader)
        tb_writer.add_scalars('train vs val loss', {'train': avg_train_loss, 'val': val_loss}, epoch)
        tb_writer.add_scalars('train vs val accuracy', {'train': train_accuracy, 'val': val_acc}, epoch)
        print(f'at epoch {epoch}: \nvalidation loss: {val_loss} \ntraining loss:   {avg_train_loss} ')
        tb_writer.add_scalar('validation loss', val_loss, epoch)
        if val_loss <= val_loss_min:
            print('validation loss decreased({:.6f} -->{:.6f}). Saving Model ...'.format(val_loss_min, val_loss))
            torch.save(model, f'./models/Lenet {session}.pt')
            val_loss_min = val_loss
    print('Finished Training')

NameError: name 'Lenet5' is not defined