# LSTM for text prediction

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import os
from datetime import datetime
from torch.utils.tensorboard import SummaryWriter
# from torchtext.data.utils import get_tokenizer
# from torchtext.vocab import build_vocab_from_iterator
# import torchtext.transforms as T


In [None]:
# Hyperparameters
# -- batching
seq_length = 32
batch_size = 20
# -- model dimensions
input_size = 200
hidden_size = 200
num_layers = 2
# -- optimisation
epochs = 35
lr = 1e-4
criterion = nn.CrossEntropyLoss()

# Tensorboard
TensorBoard is a visualization tool for monitoring the training of a deep learning model. The `torch.utils.tensorboard.SummaryWriter` class writes the logs for it: it takes a log directory as input and writes event files there, which can then be explored in the TensorBoard web interface.

In [None]:
#%tensorboard --logdir runs
# Root folder for event files: a "runs" directory under the current working directory.
working_dir = os.getcwd()
log_dir = os.path.join(working_dir, "runs")
os.makedirs(log_dir, exist_ok=True)



# Data Preprocessing
We will train the LSTM model on the Penn Treebank (PTB) dataset, a corpus of cleaned and annotated English text. The data is split into training, validation, and test sets.

## load train, test, and validation data

In [None]:
def _read_text(path):
    """Return the full contents of the text file at ``path``.

    The file is opened in a ``with`` block so the handle is closed as soon as
    the text has been read, instead of being left for the garbage collector.
    """
    with open(path, 'r') as f:
        return f.read()


train_data_raw = _read_text('data/ptb.train.txt')
test_data_raw = _read_text('data/ptb.test.txt')
valid_data_raw = _read_text('data/ptb.valid.txt')
# All three splits joined into one string; used for corpus-wide statistics
# and for building the vocabulary.
data = train_data_raw + ' ' + test_data_raw + ' ' + valid_data_raw



### sample from the data
Let's see which words are the most common in the data.

In [None]:
from collections import Counter

# Ten most frequent whitespace-separated tokens across the combined corpus.
leaderboard = Counter(data.split()).most_common(10)
i = 0
for i, (word, freq) in enumerate(leaderboard, start=1):
    print(f'{i}.{word}: appears {freq} times')

## Tokenizing the data

In [None]:
# Tokenize the data
def tokenize(text):
    """Split raw corpus text into word tokens, marking each line end with ``<eos>``.

    Every newline is turned into a standalone ``<eos>`` token. The marker is
    padded with spaces before splitting so that it can never fuse with a
    neighbouring word when a line has no leading or trailing whitespace.

    :param text: raw text, one sentence per line
    :return: list of tokens (words and ``<eos>`` markers)
    """
    return text.replace('\n', ' <eos> ').split()


### create a vocabulary of words


In [None]:
def build_vocab(text):
    """Map every token of ``text`` to an integer id, most frequent token first.

    Ids start at 1, so id 0 is never assigned to a word. Tokens with equal
    counts keep the order in which they first appear in the text.

    :param text: raw corpus text, tokenized with ``tokenize``
    :return: dict mapping token -> id
    """
    frequencies = Counter(tokenize(text))
    ranked_words = [word for word, _ in frequencies.most_common()]
    return dict(zip(ranked_words, range(1, len(ranked_words) + 1)))


# Word ids start at 1, so one extra slot is reserved for the unused id 0.
vocab = build_vocab(data)
vocab_size = 1 + len(vocab)

print(f'vocab size: {vocab_size}')

In [None]:
# Notebook display of the full token -> id mapping built above.
vocab

### decode and encode the words
Let's create functions that convert between words and token indices: `stoi` converts a word to its token index, and `itos` converts a token index back to a word.

In [None]:

# decode the token i to a word S
def itos(i):
    """Return the word stored at 1-based position ``i`` of ``vocab``.

    Relies on ``vocab`` keeping its keys in id order. Because the lookup uses
    ``i - 1`` on a list, ``i == 0`` wraps around to the last word.
    """
    words = list(vocab)
    return words[i - 1]

# encode the word S to a token index i
def stoi(s):
    """Return the id of word ``s``, falling back to the id of ``'<unk>'``."""
    try:
        return vocab[s]
    except KeyError:
        return vocab['<unk>']

In [None]:
# Dump the whole mapping, then spot-check one lookup in each direction.
print(vocab)

year_id = stoi('year')
print(year_id)
word_42 = itos(42)
print(word_42)



In [None]:
# Encode each split with the same tokenizer that built the vocabulary, so
# sentence boundaries are kept as `<eos>` ids instead of being dropped by a
# plain whitespace split.
train_data = [stoi(word) for word in tokenize(train_data_raw)]
valid_data = [stoi(word) for word in tokenize(valid_data_raw)]
test_data = [stoi(word) for word in tokenize(test_data_raw)]

In [None]:
# Show only the first ids; echoing the entire encoded training split would
# flood the notebook output.
train_data[:20]

## build a dataset and dataloader

In [None]:
class PTBDataset(Dataset):
    """Non-overlapping next-token windows over a flat sequence of token ids.

    Item ``idx`` is a pair ``(x, y)`` of ``seq_length`` ids each, where ``y``
    is ``x`` shifted one position forward in the underlying sequence.
    """

    def __init__(self, data, seq_length):
        """
        :param data: flat sequence (e.g. list) of integer token ids
        :param seq_length: number of tokens per sample, must be positive
        :raises ValueError: if ``seq_length`` is not positive
        """
        if seq_length <= 0:
            raise ValueError(f"seq_length must be positive, got {seq_length}")
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        # Each window needs one extra trailing token for its shifted target,
        # so windows are counted over len(data) - 1. Without this, the last
        # target comes out one token short whenever len(data) is an exact
        # multiple of seq_length, which breaks batching.
        return max(len(self.data) - 1, 0) // self.seq_length

    def __getitem__(self, idx):
        """Return the ``idx``-th ``(input, target)`` pair as ``torch.long`` tensors.

        :raises IndexError: if ``idx`` is outside ``[0, len(self))``; this also
            lets plain ``for item in dataset`` iteration terminate.
        """
        if not 0 <= idx < len(self):
            raise IndexError(f"index {idx} out of range for {len(self)} samples")
        start = idx * self.seq_length
        stop = start + self.seq_length
        x = self.data[start:stop]
        y = self.data[start + 1:stop + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


# Create datasets
# Each split is cut into (input, target) windows of seq_length tokens.
train_dataset = PTBDataset(train_data, seq_length)
valid_dataset = PTBDataset(valid_data, seq_length)
test_dataset = PTBDataset(test_data, seq_length)

# Create dataloaders
# Only the training loader shuffles. drop_last=True discards a final batch
# that has fewer than batch_size samples.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

# Standalone iterators for pulling single batches with next(...).
# NOTE(review): they are bound to the loaders defined above and are not
# refreshed if the loaders are re-created later.
test_data_iter = iter(test_loader)
train_data_iter = iter(train_loader)
valid_data_iter = iter(valid_loader)

### display a batch of data
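Note that `y` is exactly `x` shifted by one position: $y_i = x_{i+1}$. The model's output at step $i$ depends only on $x_0, \dots, x_i$ and is trained to predict that target: $y_i \approx \mathrm{LSTM}(x_{0:i})$.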

In [None]:
# Pull one training batch and print its first sequence next to its target.
x, y = next(train_data_iter)
print(f'x: {x.size()}, y: {y.size()}')
for label, row in (('x', x[0]), ('y', y[0])):
    print(f'{label}:{" ".join([itos(i) for i in row])}')

# LSTM/GRU Cell definition


In [None]:
class GRU_Cell(nn.Module):
    """Hand-written single-layer GRU unrolled over a batch-first sequence.

    Mirrors ``LSTM_Cell``: ``forward`` takes ``[batch, seq, input_size]`` and
    returns the hidden state of every time step as ``[batch, seq, hidden_size]``.
    """

    def __init__(self, input_size, hidden_size, dropout):
        """
        :param input_size: number of features of each input vector
        :param hidden_size: number of features of the hidden state
        :param dropout: dropout probability applied to each new hidden state
        """
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # Every gate is a linear map over the concatenation [x, h].
        self.reset_gate = nn.Sequential(nn.Linear(input_size + hidden_size, hidden_size), nn.Sigmoid())
        self.update_gate = nn.Sequential(nn.Linear(input_size + hidden_size, hidden_size), nn.Sigmoid())
        self.new_gate = nn.Sequential(nn.Linear(input_size + hidden_size, hidden_size), nn.Tanh())
        self.dropout = nn.Dropout(dropout)

    def gru_step(self, x, h_prev):
        """Advance the hidden state by one time step.

        :param x: input at this step, ``[batch, input_size]``
        :param h_prev: previous hidden state, ``[batch, hidden_size]``
        :return: new hidden state, ``[batch, hidden_size]``
        """
        x_h = torch.cat((x, h_prev), dim=1)
        r = self.reset_gate(x_h)
        z = self.update_gate(x_h)
        # The candidate state sees the reset-scaled history and must pass
        # through new_gate (linear + tanh) to come back to hidden_size.
        n = self.new_gate(torch.cat((x, r * h_prev), dim=1))
        return (1 - z) * n + z * h_prev

    def forward(self, x, states):
        """Run the cell over a whole sequence.

        The explicit loop is required because each step depends on the hidden
        state produced by the previous one.

        :param x: input tensor, ``[batch, seq, input_size]``
        :param states: initial hidden state, ``[batch, hidden_size]``
        :return: hidden states of all steps, ``[batch, seq, hidden_size]``
        """
        h = states
        outputs = []
        for x_t in x.unbind(1):  # iterate over the sequence dimension
            h = self.dropout(self.gru_step(x_t, h))
            outputs.append(h)
        # Stack on dim 1 to stay batch-first, like LSTM_Cell.
        return torch.stack(outputs, dim=1)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM_Cell(nn.Module):
    """Hand-written single-layer LSTM unrolled over a batch-first sequence."""

    def __init__(self,
                 input_size,
                 hidden_size=200,
                 dropout=0,
                 ):
        """
        :param input_size: number of features of each input vector
        :param hidden_size: number of features of the hidden and cell states
        :param dropout: dropout probability applied to every new hidden state
        """
        super().__init__()

        self.hidden_size = hidden_size
        self.dropout = nn.Dropout(dropout)

        # All four gates read the concatenation [x, h].
        joint_size = input_size + hidden_size

        # forget gate: how much of the old cell state survives
        self.f_gate = nn.Sequential(nn.Linear(joint_size, hidden_size), nn.Sigmoid())
        # candidate values (input modulation in the original paper)
        self.g_gate = nn.Sequential(nn.Linear(joint_size, hidden_size), nn.Tanh())
        # input gate: how much of the candidate is written
        self.i_gate = nn.Sequential(nn.Linear(joint_size, hidden_size), nn.Sigmoid())
        # output gate: how much of the cell state is exposed as h
        self.o_gate = nn.Sequential(nn.Linear(joint_size, hidden_size), nn.Sigmoid())
        self.tanh = nn.Tanh()

    def lstm_step(self, x, h, c):
        """Advance the recurrent state by one time step.

        :param x: input at this step
        :param h: previous hidden state
        :param c: previous cell state
        :return: ``(h, c)`` -- new hidden state (after dropout) and new cell state
        """
        joined = torch.cat((x, h), dim=1)

        forget = self.f_gate(joined)
        candidate = self.g_gate(joined)
        write = self.i_gate(joined)
        expose = self.o_gate(joined)

        # The cell state is updated first; the hidden state is derived from it.
        c = c * forget + (candidate * write)
        h = self.dropout(self.tanh(c) * expose)
        return h, c

    def forward(self, x, states):
        """Run the cell over a whole sequence.

        :param x: input tensor; time steps are taken along dimension 1
        :param states: ``(h, c)`` initial hidden and cell states
        :return: hidden states of all steps, batch-first
        """
        h, c = states
        hidden_per_step = []
        for x_t in x.unbind(1):
            h, c = self.lstm_step(x_t, h, c)
            hidden_per_step.append(h)
        # Stacking yields [seq, batch, hidden]; swap to [batch, seq, hidden].
        return torch.stack(hidden_per_step).transpose(0, 1)




In [None]:
class Network(nn.Module):
    """Word-level language model: embedding -> stacked recurrent cells -> vocabulary logits."""

    def __init__(self,
                 input_size=input_size,
                 hidden_size=hidden_size,
                 num_layers=num_layers,
                 batch_size=batch_size,
                 vocab_size=vocab_size,
                 dropout=0, cell_type = "lstm"):
        """
        The default values are read from the notebook globals at the moment
        this class definition is executed.

        :param input_size: embedding dimension (input width of the first layer)
        :param hidden_size: hidden state width of every recurrent layer
        :param num_layers: number of stacked recurrent layers
        :param batch_size: kept for backward compatibility; the forward pass
            sizes its initial states from the actual input batch
        :param vocab_size: number of embedding rows and of output logits
        :param dropout: dropout probability passed to every recurrent cell
        :param cell_type: ``"lstm"`` or ``"gru"``
        :raises ValueError: if ``cell_type`` is neither ``"lstm"`` nor ``"gru"``
        """
        super().__init__()
        if cell_type not in ("lstm", "gru"):
            raise ValueError(f"Cell type {cell_type!r} is not valid; expected 'lstm' or 'gru'")

        self.embedding = nn.Embedding(vocab_size, input_size)
        self.cell_type = cell_type
        self.cell = LSTM_Cell if self.cell_type == "lstm" else GRU_Cell
        self.num_layers = num_layers
        self.batch_size = batch_size
        self.hidden_size = hidden_size
        # nn.ModuleList registers the layers as sub-modules. With a plain
        # Python list their weights are invisible to .parameters(), so the
        # optimizer never updates them and .to(device) / state_dict() skip them.
        # Layers after the first consume the previous layer's hidden states,
        # so their input width is hidden_size rather than input_size.
        self.rnns = nn.ModuleList([
            self.cell(input_size if layer == 0 else hidden_size,
                      hidden_size,
                      dropout=dropout)
            for layer in range(self.num_layers)
        ])
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward_step(self, x):
        """Backward-compatible alias for calling the model on ``x``."""
        return self(x)

    def forward(self, x):
        """
        :param x: token ids, ``[batch, seq]``
        :return: logits over the vocabulary, ``[batch, seq, vocab_size]``
        """
        x = self.embedding(x)  # [batch, seq, input_size]
        # Size the zero initial states from the actual batch so that smaller
        # (e.g. final) batches work, and create them on x's device and dtype.
        batch = x.size(0)
        for rnn in self.rnns:
            h0 = x.new_zeros(batch, self.hidden_size)
            if self.cell_type == "lstm":
                state = (h0, x.new_zeros(batch, self.hidden_size))  # (h, c)
            else:
                state = h0  # a GRU carries only a hidden state
            x = rnn(x, state)  # [batch, seq, hidden_size]
        return self.fc(x)
    






In [None]:

tb_writer = SummaryWriter('runs/LSTM')
# dropout=0 here: a probability of 1 would zero every hidden state.
lstm_cell = LSTM_Cell(32, 32, dropout=0)
# visualize the model in our tensorboard summary
# The example inputs must match forward(x, states): a batch-first sequence
# plus an (h, c) pair of initial states.
example_x = torch.rand(1, 4, 32)
example_states = (torch.zeros(1, 32), torch.zeros(1, 32))
tb_writer.add_graph(lstm_cell, (example_x, example_states))
tb_writer.flush()

In [None]:
def evaluate(model,
             val_loader,
             criterion = nn.CrossEntropyLoss(),
             seq_length = 32):
    """
    Evaluate the model on a validation set.

    The model is switched to eval mode and is left in eval mode on return.

    :param model: model to evaluate; called as ``model(inputs)`` and expected
        to return logits whose last dimension is the number of classes
    :param val_loader: iterable of ``(inputs, targets)`` batches
    :param criterion: loss applied to the flattened logits and targets
    :param seq_length: unused; kept for backward compatibility
    :return: ``(perplexity, avg_loss)`` where ``avg_loss`` is the mean of the
        per-batch losses and ``perplexity = exp(avg_loss)``, both as floats
    :raises ValueError: if ``val_loader`` yields no batches
    """
    model.eval()
    total_loss = 0.0
    num_batches = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            # Flatten to (batch * seq, classes). The class count is taken from
            # the logits themselves rather than from a notebook global.
            logits = outputs.reshape(-1, outputs.size(-1))
            loss = criterion(logits, targets.reshape(-1))
            total_loss += loss.item()
            num_batches += 1

    if num_batches == 0:
        raise ValueError("val_loader produced no batches")

    # Report the average over all batches, not the loss of the last batch.
    avg_loss = total_loss / num_batches
    perplexity = float(np.exp(avg_loss))
    return perplexity, avg_loss

# Example

In [None]:
# Smoke test: evaluate a freshly initialised single-layer model.
x, y = next(train_data_iter)
print(x.size())
lstm = Network(input_size=200, hidden_size=200, num_layers=1)

evaluate(lstm, valid_loader)



In [None]:


def _train_one_epoch(model, criterion, optimizer, train_loader, tb_writer, epoch):
    """Run one optimisation pass over ``train_loader`` and return the mean batch loss."""
    model.train()
    steps_per_epoch = len(train_loader)
    window_loss = 0.0  # loss accumulated since the last progress report
    epoch_loss = 0.0
    num_batches = 0

    for i, (inputs, labels) in enumerate(train_loader):
        # start with zero gradients
        optimizer.zero_grad()
        outputs = model(inputs)

        # (batch, seq, vocab) -> (batch * seq, vocab); the class count is read
        # from the logits instead of a notebook global.
        outputs = outputs.reshape(-1, outputs.size(-1))
        labels = labels.reshape(-1)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        window_loss += batch_loss
        epoch_loss += batch_loss
        num_batches += 1

        if i % 100 == 99:
            step = epoch * steps_per_epoch + i
            # gradient statistics; parameters that received no gradient are skipped
            for name, param in model.named_parameters():
                if param.grad is not None:
                    tb_writer.add_histogram(name, param.grad, step)
            avg_window_loss = window_loss / 100
            cuda_gbs = (torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024
                        if torch.cuda.is_available() else 0.0)
            print("batch no = {:d} / {:d}, ".format(i, steps_per_epoch) +
                  "Avrage train loss = {:.3f}, ".format(avg_window_loss) +
                  # read the rate actually in use; scientific notation keeps
                  # small values such as 1e-4 visible
                  "lr = {:.2e}, ".format(optimizer.param_groups[0]['lr']) +
                  "cuda memory = {:.3f} GBs".format(cuda_gbs))
            tb_writer.add_scalar('training loss', avg_window_loss, step)
            window_loss = 0.0

    if num_batches == 0:
        raise ValueError("train_loader produced no batches")
    return epoch_loss / num_batches


def train(model,
          criterion,
          optimizer,
          train_loader=train_loader,
          val_loader=valid_loader,
          epochs=35,
          session = None,
          start_epoch = 0,
          ):
    """
    Train ``model``, logging to TensorBoard and saving the best model seen.

    After every epoch the model is evaluated on ``val_loader``. Whenever the
    validation loss does not exceed the best value so far, the whole model is
    saved to ``./models/{session}.pt``.

    :param model: model to train
    :param criterion: loss function, used for both training and validation
    :param optimizer: optimizer already bound to ``model``'s parameters
    :param train_loader: training batches; the default is the loader that
        existed when this function was defined
    :param val_loader: validation batches; same note on the default
    :param epochs: number of epochs to run
    :param session: name of the run (TensorBoard sub-directory and checkpoint
        file name); defaults to ``'LSTM'`` plus a timestamp
    :param start_epoch: number of the first epoch, used for logging only
    :raises ValueError: if ``train_loader`` yields no batches
    """
    os.makedirs('./models', exist_ok=True)

    # track with tensorboard
    session = session or 'LSTM' + datetime.now().strftime('%m-%d-%H-%M')
    run_dir = os.path.join(log_dir, session)
    print(run_dir)
    tb_writer = SummaryWriter(run_dir)

    # float('inf') instead of np.Inf, which NumPy 2.0 removed.
    val_loss_min = float('inf')
    last_epoch = start_epoch + epochs - 1
    try:
        for epoch in range(start_epoch, start_epoch + epochs):
            print("Epoch : {:d} out of {:d}".format(epoch, last_epoch))

            # Mean loss over the whole epoch; always defined, even when the
            # loader has fewer than 100 batches.
            avg_train_loss = _train_one_epoch(
                model, criterion, optimizer, train_loader, tb_writer, epoch)
            train_perplexity = np.exp(avg_train_loss)

            # per-epoch validation with the same criterion as training
            val_perplexity, val_loss = evaluate(model, val_loader, criterion)
            val_loss = float(val_loss)  # accept a tensor or a plain number

            tb_writer.add_scalars('train vs val loss', {'train': avg_train_loss, 'val': val_loss}, epoch)
            tb_writer.add_scalars('train vs val perplexity', {'train': train_perplexity, 'val': val_perplexity}, epoch)
            print(f'at epoch {epoch}: \nvalidation loss: {val_loss} \nValidation Preplexity: {val_perplexity}\ntraining loss:   {avg_train_loss} ')
            tb_writer.add_scalar('validation loss', val_loss, epoch)
            if val_loss <= val_loss_min:
                print('validation loss decreased({:.6f} -->{:.6f}). Saving Model ...'.format(val_loss_min, val_loss))
                # no stray space in the file name
                torch.save(model, f'./models/{session}.pt')
                val_loss_min = val_loss
    finally:
        # flush pending events even if training is interrupted
        tb_writer.close()
    print('Finished Training')

# Train the model
LSTM without dropout

In [None]:




# Baseline LSTM language model, sized by the hyperparameter cell, no dropout.
lstm = Network(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    vocab_size=vocab_size,
    cell_type="lstm",
    dropout=0,
)




In [None]:
optimizer = torch.optim.Adam(lstm.parameters(), lr=lr, weight_decay=1e-5)
epochs = 1

# epochs must be passed explicitly; otherwise train() falls back to its own
# default and the value set above has no effect.
train(lstm,
      criterion=criterion,
      optimizer=optimizer,
      train_loader=train_loader,
      val_loader=valid_loader,
      epochs=epochs,
      session='vanilla LSTM',
      start_epoch=1
      )


In [None]:
batch_size = 10
lr = 1e-4
# Create dataloaders
# drop_last=True matches the loaders created earlier, so every batch has
# exactly batch_size sequences.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

# The model class in this notebook is Network; batch_size is passed
# explicitly because the class default was fixed when it was defined.
lstm = Network(input_size=200,
               hidden_size=200,
               num_layers=1,
               batch_size=batch_size,
               vocab_size=vocab_size,
               dropout=0)

optimizer = torch.optim.Adam(lstm.parameters(), lr=lr)
# criterion is a required argument of train()
train(lstm,
      criterion=criterion,
      session='vanilla LSTM no l2 reg',
      optimizer=optimizer,
      val_loader=valid_loader,
      train_loader=train_loader,
      epochs=20
      )

# Train with Dropout

In [None]:

# Build the model first. An optimizer created before this point would still
# hold the previous model's parameters and the new model would never train.
lstm = Network(input_size=200,
               hidden_size=200,
               num_layers=1,
               batch_size=batch_size,
               vocab_size=vocab_size,
               dropout=0.3)

optimizer = torch.optim.Adam(lstm.parameters(), lr=lr, weight_decay=1e-5)

train(lstm,
      criterion=criterion,
      optimizer=optimizer,
      train_loader=train_loader,
      val_loader=valid_loader,
      session='lstm_dropout',
      start_epoch=10
      )



In [None]:
# Take the first batch of the current test loader (it is not shuffled). A
# fresh iterator is used so the batch size matches the loader defined last.
x, y = next(iter(test_loader))

# Inference: disable dropout and gradient tracking.
lstm.eval()
with torch.no_grad():
    # Predictions for sequence 0, the same sequence that is printed as input.
    out = lstm(x)[0]
predicted = out.argmax(dim=1)

print("====================================")
print(x.size())
print(itos(42))
print(itos(x[0][0].item()))
print("input: ", " ".join([itos(i) for i in x[0]]))
print("output: ", " ".join([itos(i) for i in predicted]))

print("===========")
words = [itos(i) for i in x[0]]
for i in range(5, min(15, len(words))):
    # predicted[i] was produced after reading tokens 0..i, so show that
    # whole prefix followed by the predicted next word.
    print(" ".join(words[:i + 1]), itos(predicted[i].item()))

In [None]:
# Save the whole model object (not just a state_dict) under models/.
# NOTE(review): assumes the models/ directory already exists -- confirm.
torch.save(lstm, 'models/lstm_dropout.pt')