In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F #for activation functions
from torch.optim import Adam  
import lightning as L

import matplotlib.pyplot as plt
import seaborn as sns

from torch.utils.data import DataLoader, TensorDataset

In [2]:
class LSTMByHand(L.LightningModule):
    #create and initalize weights and bias
    def __init__(self):
        super().__init__()
        #will use normal distribution to select weight
        mean = torch.tensor(0.0)
        std = torch.tensor(1.0)
        self.w1r1 = nn.Parameter(torch.normal(mean = mean , std = std), requires_grad = True)
        self.w1r2 = nn.Parameter(torch.normal(mean = mean , std = std), requires_grad = True)
        self.b1r1 = nn.Parameter(torch.tensor(0.), requires_grad = True)

        self.wpr1 = nn.Parameter(torch.normal(mean = mean , std = std), requires_grad = True)
        self.wpr2 = nn.Parameter(torch.normal(mean = mean , std = std), requires_grad = True)
        self.bpr1 = nn.Parameter(torch.tensor(0.), requires_grad = True)

        self.wp1 = nn.Parameter(torch.normal(mean = mean , std = std), requires_grad = True)
        self.wp2 = nn.Parameter(torch.normal(mean = mean , std = std), requires_grad = True)
        self.bp1 = nn.Parameter(torch.tensor(0.), requires_grad = True)

        self.wo1 = nn.Parameter(torch.normal(mean = mean , std = std), requires_grad = True)
        self.wo2 = nn.Parameter(torch.normal(mean = mean , std = std), requires_grad = True)
        self.bo1 = nn.Parameter(torch.tensor(0.), requires_grad = True)

    #do the lstm math
    def lstm_unit(self, input_value, long_memory, short_memory):
        #stage 1
        long_remember_percent = torch.sigmoid((short_memory * self.w1r1) + (input_value*self.w1r2) + self.b1r1)

        #compute potential long term memory
        potential_remember_percent = torch.sigmoid((short_memory * self.wpr1) + (input_value * self.wpr2) + self.bpr1)
        potential_memory = torch.tanh((short_memory * self.wp1) + (input_value * self.wp2) + self.bp1)

        new_long_memory = long_memory * long_remember_percent + (potential_remember_percent * potential_memory)

        output_percent = torch.sigmoid((short_memory * self.wo1) + (input_value * self.wo2) + self.bo1)

        new_short_memory = torch.tanh(new_long_memory * output_percent)

        return([new_long_memory, new_short_memory])
    
    #makes forward pass through lstm
    def forward(self, input):
        long_memory = 0
        short_memory = 0
        day1 = input[0]
        day2 = input[1]
        day3 = input[2]
        day4 = input[3]

        long_memory, short_memory = self.lstm_unit(day1, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day2, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day3, long_memory, short_memory)
        long_memory, short_memory = self.lstm_unit(day4, long_memory, short_memory)

        return short_memory
    
    #configure optimizer we wanna use
    def configure_optimizers(self):
        return Adam(self.parameters())

    #also tells us how well we are doing
    def training_step(self, batch, batch_idx):
        input_i , label_i = batch
        #make prediction with training data
        output_i = self.forward(input_i[0])
        loss = (output_i - label_i)**2
        #log is part of lightning that is able to make files to store whatever it wants
        self.log("train loss", loss)

        if (label_i == 0):
            self.log("out_0", output_i)
        else:
            self.log("out_1", output_i)

        return loss

In [3]:
model = LSTMByHand()
print("\nNow lets compare the observed and predicted values")
#remember model prints the predicted output
#model returns tensor (the value we want( prediction) and gradient) <- detach strips gradient off
print("Company A: Observed = 0, Predicted = ", model(torch.tensor([0., 0.5, 0.25, 1.]).detach()))



Now lets compare the observed and predicted values
Company A: Observed = 0, Predicted =  tensor(-0.0518, grad_fn=<TanhBackward0>)


- Close to 0 

In [4]:
print("\nNow lets compare the observed and predicted values")
#remember model prints the predicted output
#model returns tensor (the value we want( prediction) and gradient) <- detach strips gradient off
print("Company B: Observed = 1, Predicted = ", model(torch.tensor([1., 0.5, 0.25, 1.]).detach()))



Now lets compare the observed and predicted values
Company B: Observed = 1, Predicted =  tensor(-0.0527, grad_fn=<TanhBackward0>)


Not close to 1 -- we need to train

In [5]:
inputs = torch.tensor([[0., 0.5, 0.25, 1.], [1., 0.5, 0.25, 1.]])
labels = torch.tensor([0., 1.])

dataset = TensorDataset(inputs, labels)
dataloader = DataLoader(dataset) #useful when we have a lot of data
#datalaoder make it easy to access the data in batches
#make it easy to shuffle the data each epoch
#make it easy to use small fraction of data

We will do backpropogation for every weight  and bias using the data from both companies for 2000 times at most


In [6]:
trainer = L.Trainer(max_epochs = 2000)
trainer.fit(model, train_dataloaders= dataloader)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total params
0.000     Total estimated model params size (MB)
0         Modules in train mode
0         Modules in eval mode
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/data_connector.py:424: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=9` in the `DataLoader` to improve performance.
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (2) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2000` reached.


In [7]:
print("\n Now let's compare the observed and predicted values...")
print("Company A: Observed = 0, Predicted =", model(torch.tensor([0., 0.5, 0.25, 1.])))


 Now let's compare the observed and predicted values...
Company A: Observed = 0, Predicted = tensor(0.4994, grad_fn=<TanhBackward0>)


The Prediction is worse than what we started with

In [8]:
print("\n Now let's compare the observed and predicted values...")
print("Company A: Observed = 1, Predicted =", model(torch.tensor([1., 0.5, 0.25, 1.])))


 Now let's compare the observed and predicted values...
Company A: Observed = 1, Predicted = tensor(0.4887, grad_fn=<TanhBackward0>)


The prediction is a bit better

We can now use tensorboard to see what has happened in the log files. So we can know if we want to continue triaing

In [9]:
#add additonal epochs
path_to_best_checkpoint = trainer.checkpoint_callbacks[0].best_model_path
trainer = L.Trainer(max_epochs = 3000)
trainer.fit(model, train_dataloaders= dataloader, ckpt_path= path_to_best_checkpoint)

GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
Restoring states from the checkpoint path at /Users/naomigong/Coding/ML Projects/Pytorch_Quest/lightning_logs/version_15/checkpoints/epoch=1999-step=4000.ckpt
/opt/anaconda3/lib/python3.12/site-packages/lightning/pytorch/callbacks/model_checkpoint.py:362: The dirpath has changed from '/Users/naomigong/Coding/ML Projects/Pytorch_Quest/lightning_logs/version_15/checkpoints' to '/Users/naomigong/Coding/ML Projects/Pytorch_Quest/lightning_logs/version_18/checkpoints', therefore `best_model_score`, `kth_best_model_path`, `kth_value`, `last_model_path` and `best_k_models` won't be reloaded. Only `best_model_path` will be reloaded.

  | Name         | Type | Params | Mode
---------------------------------------------
  | other params | n/a  | 12     | n/a 
---------------------------------------------
12        Trainable params
0         Non-trainable params
12        Total param

Training: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=3000` reached.


In [10]:
print("\n Now let's compare the observed and predicted values...")
print("Company A: Observed = 0, Predicted =", model(torch.tensor([0., 0.5, 0.25, 1.])))


 Now let's compare the observed and predicted values...
Company A: Observed = 0, Predicted = tensor(0.4980, grad_fn=<TanhBackward0>)
