# LSTM for time series prediction

## Importing the libraries

In [44]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import Dataset, DataLoader


# Data Preprocessing
We will train the LSTM model on the Penn Treebank dataset, a corpus of cleaned and annotated English text. The data is split into training, validation, and test sets.

In [35]:
# Read each split inside a context manager so the file handle is closed
# as soon as its contents are in memory.
with open('data/ptb.train.txt', 'r', encoding='utf-8') as f:
    train_data = f.read()
with open('data/ptb.test.txt', 'r', encoding='utf-8') as f:
    test_data = f.read()
with open('data/ptb.valid.txt', 'r', encoding='utf-8') as f:
    valid_data = f.read()

# Full corpus (train + test + valid), joined with single spaces.
data = train_data + ' ' + test_data + ' ' + valid_data



Training data:  no it was n't black monday 
 but while the new york stock exchange did n't fall apart friday as the
['no', 'it', 'was', "n't", 'black', 'monday', 'but', 'while', 'the', 'new']


### Let's see what the most common words in the data are

In [23]:
from collections import Counter

# Ten most frequent whitespace-separated tokens in the full corpus.
leaderboard = Counter(data.split()).most_common(10)
# enumerate(start=1) supplies the rank instead of a hand-maintained counter.
for i, (word, freq) in enumerate(leaderboard, start=1):
    print(f'{i}.{word}: appears {freq} times')

1.the: appears 59421 times
2.<unk>: appears 53299 times
3.N: appears 37607 times
4.of: appears 28427 times
5.to: appears 27430 times
6.a: appears 24755 times
7.in: appears 21032 times
8.and: appears 20404 times
9.'s: appears 11555 times
10.for: appears 10436 times


## Tokenizing the data
### Create a vocabulary of words

In [37]:


def build_vocab(txt, max_vocab_size):
    """Build a word -> index vocabulary from the most frequent words.

    :param txt: a single string, or an iterable of strings; each string is
        split on whitespace into words
    :param max_vocab_size: maximum number of entries in the vocabulary,
        including the reserved '<unk>' entry
    :return: dict mapping each word to an integer index; '<unk>' is always
        index 0 and the remaining words get 1, 2, ... in order of decreasing
        frequency
    """
    # A bare string is one document; iterating over it directly would yield
    # single characters and produce a character-level vocabulary.
    documents = [txt] if isinstance(txt, str) else txt

    counter = Counter()
    for document in documents:
        counter.update(document.split())

    # '<unk>' is reserved for index 0. If it also occurs in the text it must
    # not consume one of the frequency-ranked slots, otherwise its rank is
    # overwritten below and the indices end up with a gap.
    counter.pop('<unk>', None)

    # Insert '<unk>' first so that dict insertion order matches index order.
    vocab = {'<unk>': 0}
    ranked = counter.most_common(max(max_vocab_size - 1, 0))
    for idx, (word, _) in enumerate(ranked, start=1):
        vocab[word] = idx
    return vocab


In [38]:
# Build the vocabulary from the concatenated corpus. The corpus is wrapped in
# a list so it is treated as one document and split into words, rather than
# being iterated character by character.

vocab = build_vocab([data], max_vocab_size=10000)

### Create functions that convert a word to its token index and vice versa

In [39]:

# decode the token index i to a word
def itos(i):
    """Return the word whose vocabulary index equals ``i``.

    The lookup goes by the index stored in ``vocab``, not by the position of
    the key in the dict, so it stays correct whatever order the entries were
    inserted in.

    :param i: token index
    :return: the word mapped to ``i`` in ``vocab``
    :raises IndexError: if no word in ``vocab`` has index ``i``
    """
    for word, idx in vocab.items():
        if idx == i:
            return word
    raise IndexError(f'token index {i} is not in the vocabulary')

# encode the word s to a token index
def stoi(s):
    """Return the vocabulary index of ``s``, or the '<unk>' index if unknown.

    :param s: word to encode
    :return: integer index taken from ``vocab``
    """
    try:
        return vocab[s]
    except KeyError:
        # Out-of-vocabulary words share the '<unk>' index.
        return vocab['<unk>']

In [40]:
# Replace each raw text split by its list of token indices.
train_data = list(map(stoi, train_data.split()))
valid_data = list(map(stoi, valid_data.split()))
test_data = list(map(stoi, test_data.split()))

## Build a dataset and dataloader

In [42]:
# Number of sequences per mini-batch (passed to batchify and DataLoader below).
batch_size = 32
# Number of tokens in each input/target sequence served by PTBDataset.
seq_length = 32

In [45]:
class PTBDataset(Dataset):
    """Map-style dataset of (input, target) windows for next-token prediction.

    Item ``idx`` is a pair of ``torch.long`` tensors of length ``seq_length``:
    the input covers ``data[idx * seq_length : (idx + 1) * seq_length]`` and
    the target is the same window shifted one token to the right.

    :param data: 1-D sequence of token indices (list or array) supporting
        ``len`` and slicing
    :param seq_length: number of tokens per window
    """

    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        # The target window reaches one token past the input window, so only
        # len(data) - 1 tokens can start full pairs. Counting
        # len(data) // seq_length would, for an exact multiple, include a
        # last item whose target is one token short and cannot be stacked
        # into a batch with the others.
        return max(0, (len(self.data) - 1) // self.seq_length)

    def __getitem__(self, idx):
        """Return the ``(x, y)`` tensor pair for window ``idx``.

        :raises IndexError: if ``idx`` is outside ``[-len(self), len(self))``
        """
        n_items = len(self)
        if idx < 0:
            idx += n_items
        if not 0 <= idx < n_items:
            # Also lets plain ``for x, y in dataset`` iteration terminate.
            raise IndexError(f'index {idx} out of range for {n_items} items')

        start = idx * self.seq_length
        stop = start + self.seq_length
        x = self.data[start:stop]
        y = self.data[start + 1:stop + 1]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)


def batchify(data, batch_size, seq_length):
    """Trim ``data`` to whole batches and reshape it to ``batch_size`` rows.

    :param data: 1-D sequence of token indices
    :param batch_size: number of rows in the returned array
    :param seq_length: tokens per sequence; together with ``batch_size`` it
        sets the chunk size the data is trimmed to
    :return: numpy array of shape ``(batch_size, -1)`` holding the trimmed data
    """
    tokens_per_batch = batch_size * seq_length
    # Drop the tail that does not fill a complete batch.
    usable = (len(data) // tokens_per_batch) * tokens_per_batch
    trimmed = np.asarray(data[:usable])
    return trimmed.reshape(batch_size, -1)

# For each split: trim to whole batches with batchify, flatten back to one
# token stream for PTBDataset, then wrap the dataset in a DataLoader.

# Training split (shuffled).
train_data_batched = batchify(train_data, batch_size, seq_length)
train_dataset = PTBDataset(train_data_batched.flatten(), seq_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Validation split (fixed order).
valid_data_batched = batchify(valid_data, batch_size, seq_length)
valid_dataset = PTBDataset(valid_data_batched.flatten(), seq_length)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Test split (fixed order).
test_data_batched = batchify(test_data, batch_size, seq_length)
test_dataset = PTBDataset(test_data_batched.flatten(), seq_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# LSTM model architecture

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class LSTM(nn.Module):
    """A single LSTM cell built from four explicit gate networks.

    Each gate is a ``Linear`` layer over the concatenation of the input and
    the previous hidden state, followed by its activation.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        """
        :param input_size: number of features in the input ``x``
        :param hidden_size: number of features in the hidden and cell states
        :param num_layers: accepted for interface compatibility; not used by
            this cell
        :param num_classes: accepted for interface compatibility; not used by
            this cell
        """
        super().__init__()

        # size of the hidden state
        self.hidden_size = hidden_size

        # LSTM gates
        # Forget gate
        self.f_gate = nn.Sequential(
            nn.Linear(input_size + hidden_size, hidden_size),
            nn.Sigmoid())

        # Candidate gate (input modulation in the original paper)
        self.g_gate = nn.Sequential(
            nn.Linear(input_size + hidden_size, hidden_size),
            nn.Tanh())

        # Input gate
        self.i_gate = nn.Sequential(
            nn.Linear(input_size + hidden_size, hidden_size),
            nn.Sigmoid())

        # Output gate
        self.o_gate = nn.Sequential(
            nn.Linear(input_size + hidden_size, hidden_size),
            nn.Sigmoid())

    def forward(self, x, h, c):
        """Run one time step of the cell.

        :param x: input tensor; concatenated with ``h`` along dim 1, so it is
            expected to be ``(batch, input_size)``
        :param h: previous hidden state, expected ``(batch, hidden_size)``
        :param c: previous cell state, same shape as ``h``
        :return: ``(h, c)`` tuple of new hidden state and new cell state
        """
        # Concatenate input and hidden state
        x_h = torch.cat((x, h), 1)

        f = self.f_gate(x_h)  # forget
        g = self.g_gate(x_h)  # candidate values
        i = self.i_gate(x_h)  # input
        o = self.o_gate(x_h)  # output

        # update c: keep part of the old state, add the gated candidate
        c = c * f + (g * i)
        # THEN, update h from the new cell state. torch.tanh is the functional
        # form; nn.Tanh is a module class and cannot be called on a tensor.
        h = torch.tanh(c) * o

        return h, c


