# LSTM to Seq2Seq
- I will try to use medical examples and apply lstm or seq2seq

# LSTM
- lstm cell

![lstm](https://miro.medium.com/max/900/1*s7_EO0rjXAw99RnH1x4s_g.png)
- an lstm cell takes 3 inputs
    - cell state from $t-1$
    - hidden state from $t-1$
    - current input $x_t$

- an lstm cell produces 3 outputs
    - cell state at $t$
    - hidden state at $t$
    - output at $t$ (for an LSTM this is the same tensor as the hidden state at $t$)



In [58]:
import torch
import torch.nn as nn
import torch.nn.functional as F

Use sample data

input : sequence length 1, 10 dimensions\
output : sequence length 1, 20 dimensions

In [59]:
rnn = nn.LSTM(10, 20, 1) # (input_size, hidden_size, num_layers)
input = torch.randn(1, 1, 10) # (Series Length, Batch size, input_dim)
# NOTE: h0 is not passed to `rnn(input)` in the cells below
h0 = torch.randn(1, 1, 20) # (num_layers * num_directions, Batch size, hidden_size)

In [60]:
rnn.input_size, rnn.hidden_size, rnn.num_layers
# input size , hidden size,  number of layers

(10, 20, 1)

In [61]:
output = rnn(input)

len(output) # length of 2

2

In [62]:
output[0], output[1]

# output[0] : (1, 1, 20) tensor - hidden state of the last layer at every time step
# output[1] : tuple (h_n, c_n) - the final hidden state and the final cell state

(tensor([[[-0.0511,  0.0860, -0.1217,  0.0877,  0.0426,  0.0022,  0.0229,
           -0.0829, -0.0583, -0.0324, -0.1867,  0.1178,  0.0094,  0.0187,
            0.0352,  0.0582, -0.0778,  0.0369,  0.0825, -0.0036]]],
        grad_fn=<StackBackward0>),
 (tensor([[[-0.0511,  0.0860, -0.1217,  0.0877,  0.0426,  0.0022,  0.0229,
            -0.0829, -0.0583, -0.0324, -0.1867,  0.1178,  0.0094,  0.0187,
             0.0352,  0.0582, -0.0778,  0.0369,  0.0825, -0.0036]]],
         grad_fn=<StackBackward0>),
  tensor([[[-0.1416,  0.1849, -0.2309,  0.1433,  0.0767,  0.0040,  0.0547,
            -0.1416, -0.1010, -0.0884, -0.4111,  0.2050,  0.0194,  0.0476,
             0.0682,  0.0931, -0.1866,  0.0725,  0.1666, -0.0062]]],
         grad_fn=<StackBackward0>)))

What if we change the sequence length to 2?

In [63]:
input = torch.rand(2, 1, 10) # sequence length 2, batch size 1, input size 10
rnn = nn.LSTM(10, 20, 1) # same architecture as before (a new instance, so the weights are re-initialized)

In [64]:
output = rnn(input) 

output[0], output[1] 

(tensor([[[-0.0569,  0.1192, -0.0259,  0.0212, -0.0842, -0.0415, -0.1121,
            0.0661, -0.0783,  0.0848, -0.0500, -0.0435,  0.0087,  0.0822,
           -0.0447,  0.1175, -0.0532, -0.1104,  0.0928, -0.0492]],
 
         [[-0.0831,  0.1595, -0.0117,  0.0178, -0.1352, -0.0932, -0.1217,
            0.1179, -0.1499,  0.1582, -0.0862, -0.0483, -0.0161,  0.1349,
            0.0220,  0.1528, -0.0120, -0.1601,  0.1565, -0.0897]]],
        grad_fn=<StackBackward0>),
 (tensor([[[-0.0831,  0.1595, -0.0117,  0.0178, -0.1352, -0.0932, -0.1217,
             0.1179, -0.1499,  0.1582, -0.0862, -0.0483, -0.0161,  0.1349,
             0.0220,  0.1528, -0.0120, -0.1601,  0.1565, -0.0897]]],
         grad_fn=<StackBackward0>),
  tensor([[[-0.1749,  0.3374, -0.0240,  0.0464, -0.3265, -0.1827, -0.2405,
             0.2857, -0.3497,  0.3893, -0.1783, -0.0808, -0.0323,  0.2939,
             0.0380,  0.2307, -0.0278, -0.4370,  0.2883, -0.2168]]],
         grad_fn=<StackBackward0>)))

In [65]:
output[0].shape 
# Hidden state of the last layer at every time step.

# (sequence length 2, batch size, hidden_size)

torch.Size([2, 1, 20])

In [66]:
output[1]

# (h_n, c_n): final hidden state and final cell state,
# each of shape (num_layers * num_directions, batch_size, hidden_size)

(tensor([[[-0.0831,  0.1595, -0.0117,  0.0178, -0.1352, -0.0932, -0.1217,
            0.1179, -0.1499,  0.1582, -0.0862, -0.0483, -0.0161,  0.1349,
            0.0220,  0.1528, -0.0120, -0.1601,  0.1565, -0.0897]]],
        grad_fn=<StackBackward0>),
 tensor([[[-0.1749,  0.3374, -0.0240,  0.0464, -0.3265, -0.1827, -0.2405,
            0.2857, -0.3497,  0.3893, -0.1783, -0.0808, -0.0323,  0.2939,
            0.0380,  0.2307, -0.0278, -0.4370,  0.2883, -0.2168]]],
        grad_fn=<StackBackward0>))

Try out bi-lstm model

In [86]:
bi_lstm = nn.LSTM(10, 20, 1, bidirectional=True)

In [87]:
output = bi_lstm(input) # input : length 2, 1 batch, size 10

In [88]:
output[0].shape 

# length 2, batch 1, 2 of 20

torch.Size([2, 1, 40])

The only difference is that the last dimension of the output becomes twice the hidden size, because the forward and backward hidden states are concatenated.

In [93]:
# two bidirectional LSTMs meant to be chained: layer1 emits 2 * 20 = 40 features
# per time step, which is why layer2 is built with input_size=40
layer1 = nn.LSTM(10,20,2,bidirectional=True)
layer2 = nn.LSTM(40,5,2,bidirectional=True)

class extract_tensor(nn.Module):
    """Keep the first element of a two-element input and drop the second.

    Intended to sit right after an ``nn.LSTM`` inside ``nn.Sequential``:
    the LSTM returns ``(output, (h_n, c_n))`` and only ``output`` should be
    handed to the next layer.
    """

    def forward(self, x):
        # x must unpack into exactly two items; the second one is discarded
        sequence_output, _states = x
        return sequence_output

# nn.LSTM returns a tuple, so extract_tensor() is placed between the two LSTMs
# to pass only the first element of that tuple on to layer2
model1 = nn.Sequential(
    layer1,
    extract_tensor(),
    layer2
)

In [106]:
model1(input)[1][0]

tensor([[[ 0.0613, -0.1756, -0.0847, -0.0718, -0.1153]],

        [[ 0.1223, -0.0876,  0.0668, -0.0772,  0.0308]],

        [[-0.1749, -0.0881, -0.2084, -0.1275, -0.1554]],

        [[ 0.0212, -0.0551, -0.0744,  0.0346, -0.0522]]],
       grad_fn=<StackBackward0>)

# Bi-LSTM models with some dataset
- dataset : Tabular data - titanic

In [1]:
from pathlib import Path
import pandas as pd

# The CSV is expected at <parent of the working directory>/data/train.csv
project_dir = Path.cwd().parent
data_dir = project_dir / 'data'

# load the table into a DataFrame
data = pd.read_csv(data_dir / 'train.csv')


In [2]:
data.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
# first tokenize all the values in the dataset
data.values

from torchtext.vocab import build_vocab_from_iterator
from collections import Counter, OrderedDict

# cast every cell to str so that each row becomes a list of string tokens
row_wise_data = data.values.astype('str').tolist()
# build a single vocabulary over the tokens of all rows, with "<unk>" as a special token
vocab = build_vocab_from_iterator(row_wise_data, specials = ["<unk>"])

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from torchtext.transforms import VocabTransform, ToTensor, Sequential
import torch.nn as nn

# make vocab transform layer (token -> vocabulary index)
vocab_transform = VocabTransform(vocab)

# get the length of the vocab and do word embedding
# (one 128-dimensional vector per vocabulary entry)
word_embedding = nn.Embedding(vocab.__len__(), 128)

# To tensor
transform2tensor = ToTensor()

In [53]:
# change tokens to indices

# NOTE: despite its name this pipeline contains no embedding; it chains the
# vocab transform and the to-tensor transform only
my_embedding_layer = Sequential(
    vocab_transform,
    transform2tensor
)

# change titanic data (rows of string tokens -> rows of vocabulary indices)
titanic_data = my_embedding_layer(row_wise_data)

In [87]:
import torch

torch.randn(4,6, 8).mean(2).shape

torch.Size([4, 6])

In [88]:
from copy import deepcopy

class extract_tensor(nn.Module):
    """Keep the first element of a two-element input and drop the second.

    Used after each ``nn.LSTM`` in a sequential stack: the LSTM returns
    ``(output, (h_n, c_n))`` and only ``output`` is forwarded.
    """

    def forward(self, x):
        # x must unpack into exactly two items; the second one is discarded
        sequence_output, _states = x
        return sequence_output

class resizer(nn.Module):
    """Convert a batch-first tensor to a sequence-first, 3-D tensor.

    Input is read as ``(batch, seq_len, ...)`` and the result has shape
    ``(seq_len, batch, -1)``, the layout ``nn.LSTM`` uses when
    ``batch_first`` is left at its default.
    """

    def forward(self, x):
        batch_size = x.shape[0]
        seq_len = x.shape[1]
        # Swap the first two axes BEFORE reshaping. A bare
        # reshape(seq_len, batch_size, -1) only reinterprets memory and would
        # mix elements of different samples / time steps together.
        return x.transpose(0, 1).reshape(seq_len, batch_size, -1)
 
 
class TableModel(nn.Module):
    """Embed token indices and run them through three stacked bidirectional LSTMs.

    Pipeline (see ``forward``): embedding -> ``resizer`` -> bi-LSTM x3 (each
    followed by ``extract_tensor``, dropout and ReLU) -> mean over the feature
    axis -> linear layer over the sequence axis.

    Args:
        input_size: embedding dimension, also the input size of the first LSTM.
        hidden_size: hidden size of the first bidirectional LSTM.
        n_layer: number of stacked layers in the first and second LSTM.
        seq_len: number of tokens per row; the final linear layer maps
            ``seq_len`` values to one.
        vocab: object supporting ``len()``; its length is the number of
            embedding rows.
    """

    def __init__(self,
                 input_size,
                 hidden_size,
                 n_layer,
                 seq_len,
                 vocab):
        super().__init__()

        # embed vocab
        self.vocab_size = len(vocab)
        self.word_embedding = nn.Embedding(self.vocab_size, input_size)
        self.resizer = resizer()
        self.seq_len = seq_len

        # nn.Sequential keeps this class independent of the torchtext
        # `Sequential` imported in another cell
        self.embedding_layer = nn.Sequential(
            self.word_embedding,
            self.resizer
        )

        # bilstm layers
        self.bilstm_layer1 = nn.LSTM(input_size=input_size,
                                     hidden_size=hidden_size,
                                     num_layers=n_layer,
                                     bidirectional=True)

        # a bidirectional LSTM emits 2 * hidden_size features per step
        layer2_hidden = input_size // 2
        self.bilstm_layer2 = nn.LSTM(input_size=hidden_size * 2,
                                     hidden_size=layer2_hidden,
                                     num_layers=n_layer,
                                     bidirectional=True)

        # 2 * (input_size // 2) equals input_size only when input_size is even;
        # deriving it from layer2 keeps the stack consistent for odd sizes too
        self.bilstm_layer3 = nn.LSTM(input_size=layer2_hidden * 2,
                                     hidden_size=1,
                                     num_layers=3,
                                     bidirectional=True)

        self.final_layer = nn.Linear(self.seq_len, 1)

        self.model = nn.Sequential(
            self.bilstm_layer1,
            extract_tensor(),
            nn.Dropout(p=0.5),
            nn.ReLU(),
            self.bilstm_layer2,
            extract_tensor(),
            nn.Dropout(p=0.5),
            nn.ReLU(),
            self.bilstm_layer3,
            extract_tensor(),
            nn.ReLU(),
            nn.Dropout(p=0.5),
        )

    def forward(self, x):
        """Return one value per row of ``x``.

        ``x`` is presumably a ``(batch, seq_len)`` integer tensor of
        vocabulary indices (that is what the DataLoader in this notebook
        yields) - confirm for other callers.
        """
        x = self.embedding_layer(x)   # -> (seq_len, batch, input_size)
        x = self.model(x)             # -> (seq_len, batch, 2)
        x = torch.mean(x, 2)          # average the two directions -> (seq_len, batch)
        x = x.transpose(1, 0)         # -> (batch, seq_len)
        x = self.final_layer(x)       # -> (batch, 1)
        return x

In [90]:
# input_size=128, hidden_size=256, n_layer=2, seq_len=12
my_model = TableModel(128, 256, 2, 12, vocab)

- What metric should I use to calculate loss??

In [103]:
from torch.utils.data import DataLoader, Dataset, TensorDataset

class MyDataset(Dataset):
    """Minimal ``Dataset`` wrapper around an indexable collection of samples."""

    def __init__(self, x):
        # the samples are stored as given, without copying or conversion
        self.x = x

    def __getitem__(self, idx):
        # plain indexing into the wrapped collection
        sample = self.x[idx]
        return sample

    def __len__(self):
        # number of samples is whatever len() reports for the collection
        n_samples = len(self.x)
        return n_samples

# wrap the index tensor so that DataLoader can draw rows from it
my_dataset = MyDataset(titanic_data)

# shuffled mini-batches of 32 rows
dataloader = DataLoader(my_dataset, batch_size=32, shuffle=True)

In [104]:
for data in dataloader:
    print(data)
    break

tensor([[1264,    1,    6, 1968,    5,   26,    2,    1,  959, 1529,    3,    4],
        [1773,    1,    8, 1922,    5,   22,    1,    1, 2002,   23,    3,    4],
        [1218,    2,    8, 2662,    7,   16,    8,    6,  168,  158,    3,    4],
        [1220,    2,    8, 2405,    7,   83,    2,    8,  201,  175,    3,    9],
        [1558,    2,    2, 2827,    5,   61,    2,    1,  243,   46,  361,    4],
        [ 897,    2,    6, 2244,    7,   29,    2,    2,  173,  159,    3,    4],
        [ 465,    1,    6, 2619,    5,   18,    1,    1, 2731,   36,    3,    4],
        [1204,    1,    6, 2675,    5,   19,    2,    2,  300,  244,    3,    4],
        [1566,    1,    6, 2359,    5,   34,    1,    1,  963,   37,    3,    4],
        [ 565,    2,    6, 2357,    7,   68,    2,    2,  171,  149,    3,    4],
        [1464,    2,    6, 2201,    5,    3,    1,    1,   71,   76,    3,    4],
        [ 592,    2,    6, 1860,    7,   21,    2,    1, 1047,   48,    3,    4],
        [1134,  

In [107]:
import torch.nn.functional as f
import torch.optim as optim

def train_model(model, epoch, loader=None):
    """Train ``model`` in place with Adam (lr=0.01) and an MSE loss.

    Args:
        model: module called as ``model(batch)`` for every batch.
        epoch: number of full passes over the batches.
        loader: iterable yielding batch tensors. When omitted, the
            module-level ``dataloader`` is used, which keeps existing
            ``train_model(model, epoch)`` calls working.

    Returns:
        None. The loss of every batch and an end-of-epoch message are printed.
    """
    if loader is None:
        # backward-compatible fallback to the notebook-level DataLoader
        loader = dataloader

    loss_fn = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.01)

    for e in range(epoch):

        for batch in loader:
            optimizer.zero_grad()
            output = model(batch)
            # NOTE(review): output[0] is only the first row of the model
            # output, and the target is the batch of input values itself, so
            # the two shapes generally differ and MSELoss broadcasts (PyTorch
            # warns about this). Which column/tensor is the intended target is
            # not visible here - confirm and replace before trusting the loss.
            loss = loss_fn(output[0], batch.to(torch.float32))
            loss.backward()
            optimizer.step()
            # .item() yields a plain float, so no no_grad() block is needed
            print(f'the loss is {loss.item()}')
        print(f'{e} epoch finished!')

In [108]:
train_model(my_model, 30)

  return F.mse_loss(input, target, reduction=self.reduction)


tensor(748929.9375, grad_fn=<MseLossBackward0>)
the loss is 748929.9375
tensor(798379.8125, grad_fn=<MseLossBackward0>)
the loss is 798379.8125
tensor(811125.3125, grad_fn=<MseLossBackward0>)
the loss is 811125.3125
tensor(757158.3125, grad_fn=<MseLossBackward0>)
the loss is 757158.3125
tensor(754493.6875, grad_fn=<MseLossBackward0>)
the loss is 754493.6875
tensor(813563.5625, grad_fn=<MseLossBackward0>)
the loss is 813563.5625
tensor(804478.8125, grad_fn=<MseLossBackward0>)
the loss is 804478.8125
tensor(799151.3125, grad_fn=<MseLossBackward0>)
the loss is 799151.3125
tensor(849818., grad_fn=<MseLossBackward0>)
the loss is 849818.0
tensor(830642.7500, grad_fn=<MseLossBackward0>)
the loss is 830642.75
tensor(817020.7500, grad_fn=<MseLossBackward0>)
the loss is 817020.75
tensor(890934.8125, grad_fn=<MseLossBackward0>)
the loss is 890934.8125
tensor(808585.2500, grad_fn=<MseLossBackward0>)
the loss is 808585.25
tensor(755185.3125, grad_fn=<MseLossBackward0>)
the loss is 755185.3125
tenso

  return F.mse_loss(input, target, reduction=self.reduction)


the loss is 775237.0625
tensor(733205.3125, grad_fn=<MseLossBackward0>)
the loss is 733205.3125
tensor(761710.1875, grad_fn=<MseLossBackward0>)
the loss is 761710.1875
tensor(772098., grad_fn=<MseLossBackward0>)
the loss is 772098.0
tensor(772243.7500, grad_fn=<MseLossBackward0>)
the loss is 772243.75
tensor(783972.2500, grad_fn=<MseLossBackward0>)
the loss is 783972.25
tensor(749448.2500, grad_fn=<MseLossBackward0>)
the loss is 749448.25
tensor(884209.4375, grad_fn=<MseLossBackward0>)
the loss is 884209.4375
tensor(807358.5000, grad_fn=<MseLossBackward0>)
the loss is 807358.5
tensor(764711.8125, grad_fn=<MseLossBackward0>)
the loss is 764711.8125
tensor(818314.2500, grad_fn=<MseLossBackward0>)
the loss is 818314.25
tensor(767019., grad_fn=<MseLossBackward0>)
the loss is 767019.0
tensor(778660., grad_fn=<MseLossBackward0>)
the loss is 778660.0
tensor(801897.3125, grad_fn=<MseLossBackward0>)
the loss is 801897.3125
tensor(772680.8125, grad_fn=<MseLossBackward0>)
the loss is 772680.8125


# Sequence to Sequence
- 