<a href="https://colab.research.google.com/github/look4pritam/PyTorchLightning/blob/master/Notebooks/mnist.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install lightning

## Import Library

In [1]:
import os

In [2]:
import torch
from torch import nn

In [3]:
from torch.nn import functional as F

In [4]:
from torch.utils.data import TensorDataset, DataLoader, random_split

In [5]:
from torchvision.datasets import MNIST
from torchvision import datasets, transforms

In [6]:
import lightning as pl
pl.seed_everything(7, workers=True)
from lightning.pytorch.callbacks import EarlyStopping, LearningRateMonitor, ModelCheckpoint

INFO: Seed set to 7
INFO:lightning.fabric.utilities.seed:Seed set to 7


In [7]:
model_path = 'pytorch-lightning/'

In [8]:
!rm -rf pytorch-lightning
!ls -al

total 20
drwxr-xr-x 1 root root 4096 Mar 29 06:23 .
drwxr-xr-x 1 root root 4096 Mar 29 05:40 ..
drwxr-xr-x 4 root root 4096 Mar 26 13:27 .config
drwxr-xr-x 3 root root 4096 Mar 29 05:47 MNIST
drwxr-xr-x 1 root root 4096 Mar 26 13:28 sample_data


## Create Model

In [9]:
import torchmetrics

In [63]:
class LightningMNISTClassifier(pl.LightningModule):

  def __init__(self, lr_rate):
    super(LightningMNISTClassifier, self).__init__()

    # mnist images are (1, 28, 28) (channels, width, height)
    self.layer_1 = torch.nn.Linear(28 * 28, 128)
    self.layer_2 = torch.nn.Linear(128, 256)
    self.layer_3 = torch.nn.Linear(256, 10)

    self.lr_rate = lr_rate

    self.val_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=10)
    self.val_acc.reset()

    self.test_acc = torchmetrics.classification.Accuracy(task="multiclass", num_classes=10)
    self.test_acc.reset()

    self.val_loss = []
    self.test_loss = []

  def forward(self, x):
      batch_size, channels, width, height = x.size()

      # (b, 1, 28, 28) -> (b, 1*28*28)
      x = x.view(batch_size, -1)

      # layer 1 (b, 1*28*28) -> (b, 128)
      x = self.layer_1(x)
      x = torch.relu(x)

      # layer 2 (b, 128) -> (b, 256)
      x = self.layer_2(x)
      x = torch.relu(x)

      # layer 3 (b, 256) -> (b, 10)
      x = self.layer_3(x)

      # probability distribution over labels
      x = torch.softmax(x, dim=1)

      return(x)

  def cross_entropy_loss(self, logits, labels):
    return(F.nll_loss(logits, labels))

  def training_step(self, train_batch, batch_idx):
      x, y = train_batch
      logits = self.forward(x)

      train_loss = self.cross_entropy_loss(logits, y)
      self.log("train_loss", train_loss, on_step=True)

      return(train_loss)

  def validation_step(self, val_batch, batch_idx):
      x, y = val_batch
      logits = self.forward(x)

      val_loss = self.cross_entropy_loss(logits, y)

      self.val_acc(logits, y)
      self.val_loss.append(val_loss)

      return(val_loss)

  def on_validation_epoch_end(self):
      val_loss = torch.mean(torch.tensor(self.val_loss))

      val_acc = self.val_acc.compute()
      print('val_loss', val_loss.item(), val_acc.item())

      self.log("val_loss", val_loss, on_epoch=True, prog_bar=True, sync_dist=True)
      self.log("val_acc", val_acc, on_epoch=True, prog_bar=True, sync_dist=True)

      self.val_loss = []
      self.val_acc.reset()

  def test_step(self, val_batch, batch_idx):
      x, y = val_batch
      logits = self.forward(x)

      test_loss = self.cross_entropy_loss(logits, y)

      self.test_acc(logits, y)
      self.test_loss.append(test_loss)

      return(test_loss)

  def on_test_epoch_end(self):
      test_loss = torch.mean(torch.tensor(self.test_loss))

      test_acc = self.test_acc.compute()

      print('test_loss', test_loss.item(), test_acc.item())
      self.log("test_loss", test_loss, on_epoch=True, prog_bar=True, sync_dist=True)
      self.log("test_acc", test_acc, on_epoch=True, prog_bar=True, sync_dist=True)

      self.test_loss = []
      self.test_acc.reset()

  def lr_scheduler_step(self, scheduler, metric):
    print('lr_scheduler_step', metric.item())
    scheduler.step(metric)

  def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=self.lr_rate)
    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, threshold=0.01, threshold_mode='rel', verbose=True)
    lr_scheduler = {'scheduler': scheduler,
                    'monitor': 'val_loss'}
    return [optimizer], [lr_scheduler]

## Callbacks

In [38]:
# Custom Callbacks
class MyPrintingCallback(pl.pytorch.callbacks.Callback):

    def on_init_start(self, trainer):
        print('Starting to init trainer!')

    def on_init_end(self, trainer):
        print('trainer is init now')

    def on_train_end(self, trainer, pl_module):
        print('do something when training ends')

## Prepare Data

In [12]:
def prepare_data():
  # transforms for images
  transform=transforms.Compose([transforms.ToTensor(),
                                transforms.Normalize((0.1307,), (0.3081,))])

  # prepare transforms standard to MNIST
  mnist_train = MNIST(os.getcwd(), train=True, download=True, transform=transform)
  mnist_train = [mnist_train[i] for i in range(2200)]

  mnist_train, mnist_val = random_split(mnist_train, [2000, 200])

  mnist_test = MNIST(os.getcwd(), train=False, download=True, transform=transform)
  mnist_test = [mnist_test[i] for i in range(3000,4000)]

  return mnist_train, mnist_val, mnist_test

### Get train, validation, test data

In [55]:
train, val, test = prepare_data()

### Prepare Data Loader

In [56]:
train_loader, val_loader, test_loader = DataLoader(train, batch_size=64), DataLoader(val, batch_size=64), DataLoader(test, batch_size=64)

## Train Model

In [64]:
model = LightningMNISTClassifier(lr_rate=1e-3)

In [65]:
# Learning Rate Logger
lr_logger = LearningRateMonitor()

In [66]:
# Set Early Stopping
early_stopping = EarlyStopping('val_loss', mode='min', patience=10)

In [67]:
# saves checkpoints to 'model_path' whenever 'val_loss' has a new min
checkpoint_callback = ModelCheckpoint(filename=model_path+'mnist_{epoch}-{val_loss:.5f}',
                                      monitor='val_loss', mode='min', save_top_k=10)

In [68]:
trainer = pl.Trainer(max_epochs=150, callbacks=[lr_logger, early_stopping, checkpoint_callback],
                     default_root_dir=model_path) #gpus=1

INFO: GPU available: True (cuda), used: True
INFO:lightning.pytorch.utilities.rank_zero:GPU available: True (cuda), used: True
INFO: TPU available: False, using: 0 TPU cores
INFO:lightning.pytorch.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO: IPU available: False, using: 0 IPUs
INFO:lightning.pytorch.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO: HPU available: False, using: 0 HPUs
INFO:lightning.pytorch.utilities.rank_zero:HPU available: False, using: 0 HPUs


In [69]:
trainer.fit(model, train_loader, val_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO: 
  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 100 K 
1 | layer_2  | Linear             | 33.0 K
2 | layer_3  | Linear             | 2.6 K 
3 | val_acc  | MulticlassAccuracy | 0     
4 | test_acc | MulticlassAccuracy | 0     
------------------------------------------------
136 K     Trainable params
0         Non-trainable params
136 K     Total params
0.544     Total estimated model params size (MB)
INFO:lightning.pytorch.callbacks.model_summary:
  | Name     | Type               | Params
------------------------------------------------
0 | layer_1  | Linear             | 100 K 
1 | layer_2  | Linear             | 33.0 K
2 | layer_3  | Linear             | 2.6 K 
3 | val_acc  | MulticlassAccuracy | 0     
4 | test_acc | MulticlassAccuracy | 0     
---------------------

Sanity Checking: |          | 0/? [00:00<?, ?it/s]

val_loss -0.09820368885993958 0.1015625


/usr/local/lib/python3.10/dist-packages/lightning/pytorch/loops/fit_loop.py:298: The number of training batches (32) is smaller than the logging interval Trainer(log_every_n_steps=50). Set a lower value for log_every_n_steps if you want to see logs for the training epoch.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.6179718375205994 0.5950000286102295
lr_scheduler_step -0.6179718375205994


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.8586819171905518 0.8700000047683716
lr_scheduler_step -0.8586819171905518


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.893921434879303 0.8899999856948853
lr_scheduler_step -0.893921434879303


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9151581525802612 0.8999999761581421
lr_scheduler_step -0.9151581525802612


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9176052808761597 0.9150000214576721
lr_scheduler_step -0.9176052808761597


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9262775778770447 0.9200000166893005
lr_scheduler_step -0.9262775778770447


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9294140338897705 0.9150000214576721
lr_scheduler_step -0.9294140338897705


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9268438816070557 0.9150000214576721
lr_scheduler_step -0.9268438816070557


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9225237965583801 0.9049999713897705
lr_scheduler_step -0.9225237965583801


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9105885624885559 0.8799999952316284
lr_scheduler_step -0.9105885624885559


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9140033721923828 0.8999999761581421
lr_scheduler_step -0.9140033721923828


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9154452085494995 0.8849999904632568
lr_scheduler_step -0.9154452085494995


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9180771112442017 0.8999999761581421
lr_scheduler_step -0.9180771112442017


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9227342009544373 0.9150000214576721
lr_scheduler_step -0.9227342009544373


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9180418848991394 0.8949999809265137
lr_scheduler_step -0.9180418848991394


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9184967875480652 0.9049999713897705
lr_scheduler_step -0.9184967875480652


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9293467998504639 0.9150000214576721
lr_scheduler_step -0.9293467998504639


In [21]:
# Print model's state_dict
print("Model's state_dict:")
for param_tensor in model.state_dict():
    print(param_tensor, "\t", model.state_dict()[param_tensor].size())

Model's state_dict:
layer_1.weight 	 torch.Size([128, 784])
layer_1.bias 	 torch.Size([128])
layer_2.weight 	 torch.Size([256, 128])
layer_2.bias 	 torch.Size([256])
layer_3.weight 	 torch.Size([10, 256])
layer_3.bias 	 torch.Size([10])


## TensorBoard

In [None]:
# copy lightning logs from google drive to local machine
os.environ['lightning_logs'] = model_path+'lightning_logs'
!cp -r "$lightning_logs" .

In [None]:
# Start tensorboard.

%load_ext tensorboard
%tensorboard --logdir lightning_logs/

## Test Model

In [45]:
trainer.validate(model, val_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Validation: |          | 0/? [00:00<?, ?it/s]

val_loss -0.9350934028625488 0.9350000023841858


[{'val_loss': -0.9350934028625488, 'val_acc': 0.9350000023841858}]

In [23]:
trainer.test(model, test_loader)

INFO: LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
INFO:lightning.pytorch.accelerators.cuda:LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: |          | 0/? [00:00<?, ?it/s]

test_loss -0.8732351064682007 0.8809999823570251


[{'test_loss': -0.8732351064682007, 'test_acc': 0.8809999823570251}]

## Inference

In [46]:
PATH = checkpoint_callback.best_model_path
print(PATH)

pytorch-lightning/lightning_logs/version_2/checkpoints/pytorch-lightning/mnist_epoch=12-val_loss=-0.93858.ckpt


In [25]:
inference = LightningMNISTClassifier.load_from_checkpoint(PATH, lr_rate=1e-3)

In [26]:
inference

LightningMNISTClassifier(
  (layer_1): Linear(in_features=784, out_features=128, bias=True)
  (layer_2): Linear(in_features=128, out_features=256, bias=True)
  (layer_3): Linear(in_features=256, out_features=10, bias=True)
  (val_acc): MulticlassAccuracy()
  (test_acc): MulticlassAccuracy()
)

In [27]:
# Prepare data untuk predict
x = torch.cat((test[0][0],test[1][0],test[2][0]), 0) # 3 image
x = x.unsqueeze(1)

y = [test[0][1],test[1][1],test[2][1]]

In [28]:
import numpy as np

# Do Prediction
logits = inference(torch.tensor(x).to('cuda'))
print('Prediction :',np.argmax(logits.to('cpu').detach().numpy(), axis=1))
print('Real :', y)

Prediction : [6 9 8]
Real : [6, 9, 8]


  logits = inference(torch.tensor(x).to('cuda'))
