In [1]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.plugins.environments import SLURMEnvironment

In [2]:
import torch, datetime, os

# Essential packages for training an image classifier in PyTorch
import torch
from torch.utils.data import DataLoader
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
from torch.cuda import amp

import torchvision
from torchvision.datasets import ImageFolder
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Seed the Python, NumPy and PyTorch RNGs. `workers=True` additionally lets
# Lightning derive seeds for DataLoader worker processes, which matters here
# because the training transforms are random and run inside those workers.
pl.seed_everything(42, workers=True)

[rank: 0] Global seed set to 42


42

# Lightning Data module

In [4]:
class MYDataModule(pl.LightningDataModule):
    """ImageFolder-backed data module exposing train and validation loaders.

    ``data_dir`` must contain ``train`` and ``val`` sub-directories readable by
    ``torchvision.datasets.ImageFolder``.

    Args:
        batch_size: Samples per batch, used by both loaders.
        num_workers: Worker processes used by each ``DataLoader``.
        data_dir: Root directory that holds the ``train`` and ``val`` folders.
    """

    def __init__(self, batch_size: int = 64, num_workers: int = 10, data_dir: str = './'):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        # Honour the caller's value; it used to be overwritten by a constant 4.
        self.num_workers = num_workers

        # Per-channel normalisation constants (these match the widely used
        # ImageNet statistics).
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

        # Training: random crop + flip augmentation, then tensor conversion.
        self.train_transform = transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])

        # Validation: deterministic resize + centre crop.
        self.val_transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])

        print('Initing class')

    def prepare_data(self):
        """Hook for one-off data preparation; currently only prints a message."""
        print('Preparing data')

    def setup(self, stage=None):
        """Create the datasets needed for ``stage``.

        Args:
            stage: Lightning passes ``'fit'``, ``'validate'``, ``'test'`` or
                ``'predict'``. The legacy values ``'train'`` and ``'val'`` are
                still accepted, and ``None`` builds both datasets.
        """
        # 'fit' needs both splits because validation runs inside the fit loop.
        if stage in (None, 'fit', 'train'):
            print('setup stage fit')
            self.trainset = ImageFolder(root=os.path.join(self.data_dir, 'train'),
                                        transform=self.train_transform)

        if stage in (None, 'fit', 'validate', 'val'):
            print('setup stage test')
            self.valset = ImageFolder(root=os.path.join(self.data_dir, 'val'),
                                      transform=self.val_transform)

    def train_dataloader(self):
        """Return the shuffled loader over the training dataset."""
        print('Train loader')
        return DataLoader(self.trainset,
                          batch_size=self.batch_size,
                          shuffle=True,
                          num_workers=self.num_workers,
                          pin_memory=True,
                          drop_last=False)

    def val_dataloader(self):
        """Return the non-shuffled loader over the validation dataset."""
        print('Validation loader')
        return DataLoader(self.valset,
                          batch_size=self.batch_size,
                          shuffle=False,
                          num_workers=self.num_workers,
                          pin_memory=True,
                          drop_last=False)

# Lightning module for training

In PyTorch Lightning, we define subclasses of `pl.LightningModule` (which itself inherits from `torch.nn.Module`) that organize our code into 5 main sections:

- Initialization (__init__), where we create all necessary parameters/models
- Optimizers (configure_optimizers) where we create the optimizers, learning rate scheduler, etc.
- Training loop (training_step) where we only have to define the loss calculation for a single batch (the loop of optimizer.zero_grad(), loss.backward() and optimizer.step(), as well as any logging/saving operation, is done in the background)
- Validation loop (validation_step) where similarly to the training, we only have to define what should happen per step
- Test loop (test_step) which is the same as validation, only on a test set.

In [5]:
class CLASSIFY_lit_module(pl.LightningModule):
    """ResNet-50 image classifier trained with SGD and a step LR schedule.

    Args:
        num_classes: Number of output classes of the ResNet-50 head.
        lr: Initial learning rate passed to SGD.
        lr_step_size: Number of scheduler steps between learning-rate decays.
        lr_gamma: Factor the learning rate is multiplied by at each decay.
    """

    def __init__(self, num_classes: int = 200, lr: float = 1e-3,
                 lr_step_size: int = 30, lr_gamma: float = 0.1):
        super().__init__()

        # Stores the constructor arguments under ``self.hparams`` so they are
        # exported with the logs/checkpoints.
        self.save_hyperparameters()
        # Randomly initialised backbone (weights=None), no pretrained weights.
        self.model = torchvision.models.resnet50(weights=None,
                                                 num_classes=self.hparams.num_classes)
        self.loss_module = nn.CrossEntropyLoss()

    def forward(self, inputs):
        """Return the class logits for a batch of images."""
        return self.model(inputs)

    def configure_optimizers(self):
        """Build the SGD optimizer and its StepLR scheduler."""
        optimizer = optim.SGD(self.parameters(), lr=self.hparams.lr)

        # Multiply the learning rate by ``lr_gamma`` every ``lr_step_size``
        # scheduler steps (defaults: x0.1 every 30 steps).
        scheduler = optim.lr_scheduler.StepLR(optimizer,
                                              step_size=self.hparams.lr_step_size,
                                              gamma=self.hparams.lr_gamma)
        return [optimizer], [scheduler]

    def training_step(self, batch, batch_idx):
        """Compute the loss for one training batch and log epoch-level metrics.

        Returns:
            The cross-entropy loss tensor for this batch.
        """
        inputs, labels = batch
        outputs = self.model(inputs)
        loss = self.loss_module(outputs, labels)
        acc = (outputs.argmax(dim=-1) == labels).float().mean()

        # Aggregated over the epoch rather than logged per step.
        self.log("train_acc", acc, on_step=False, on_epoch=True)
        self.log("train_loss", loss, on_step=False, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log epoch-level accuracy and loss for one validation batch."""
        inputs, labels = batch
        outputs = self.model(inputs)
        loss = self.loss_module(outputs, labels)
        acc = (outputs.argmax(dim=-1) == labels).float().mean()

        self.log("val_acc", acc, on_step=False, on_epoch=True)
        # Logged so that callbacks/filenames referring to ``val_loss`` resolve.
        self.log("val_loss", loss, on_step=False, on_epoch=True)

# Trainer definition

Now that the data pipeline and training scheme are defined, we pass them to Lightning's execution framework to automate the execution of the training workflow:
- Epoch and batch iteration
- Calling forward, loss eval, and backward passes
- Running the validation loop
- Saving and loading weights
- MultiGPU support
- Mixed precision training

And more

### Initialize data module

In [6]:
# Data module over the Tiny ImageNet reference copy on the cluster filesystem.
data = MYDataModule(
    batch_size=256,
    num_workers=10,
    data_dir="/ibex/ai/reference/CV/tinyimagenet",
)

Initing class


In [7]:
# Manual call for illustration: this module's prepare_data() only prints a
# message, and the hook runs again during trainer.fit().
data.prepare_data()

Preparing data


In [8]:
# No stage argument, so stage is None and both the train and val ImageFolder
# datasets are created.
data.setup()

setup stage fit
setup stage test


### Initialize model

In [9]:
# Instantiate the classifier with its default constructor arguments.
net = CLASSIFY_lit_module()

In [10]:
# Checkpoint directory under the launch directory; fall back to the kernel's
# working directory when the PWD environment variable is not set.
CHPKT_PATH = os.path.join(os.environ.get('PWD', os.getcwd()), 'lightning/chkpt')

# The filename template may only reference metrics the module actually logs
# (train_acc, val_acc); the previous template named metrics that were never logged.
# NOTE(review): the best checkpoint is selected on training accuracy; consider
# monitoring 'val_acc' instead if generalisation is what should be tracked.
checkpoint_callback = pl.callbacks.ModelCheckpoint(
    dirpath=CHPKT_PATH,
    filename='{epoch}-{train_acc:.2f}-{val_acc:.2f}',
    save_weights_only=True,
    mode="max",
    monitor='train_acc',
)

# TensorBoard event files are written under ./logs.
logger = TensorBoardLogger(
    save_dir="logs",
    sub_dir=None,
    name=None,
    version=None,
    default_hp_metric=False,
)

In [16]:
# Initialize a single-node, single-device trainer for a short 2-epoch run.
# `strategy` is deliberately not passed: the explicit `strategy=None` only
# restated the default of older Lightning releases, and newer releases are
# believed to reject None (TODO confirm against the installed version).
trainer = pl.Trainer(
    max_epochs=2,
    logger=logger,
    callbacks=[checkpoint_callback],
    accelerator="auto",
    devices=1,
    num_nodes=1,
    # Use the SLURM cluster environment but do not requeue the job on signals.
    plugins=[SLURMEnvironment(auto_requeue=False)],
    # Reproducibility over speed: no cuDNN autotuning, deterministic algorithms.
    benchmark=False,
    deterministic=True,
    precision=32,
)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [None]:
%%time 
# Train the model 
trainer.fit(net, data)

You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name        | Type             | Params
-------------------------------------------------
0 | model       | ResNet           | 23.9 M
1 | loss_module | CrossEntropyLoss | 0     
-------------------------------------------------
23.9 M    Trainable params
0         Non-trainable params
23.9 M    Total params
191.343   Total estimated model params size (MB)


Preparing data


Sanity Checking: 0it [00:00, ?it/s]

Validation loader
Train loader


Training: 0it [00:00, ?it/s]