## **GNR 638:** Machine Learning for Remote Sensing-II
### **Mini Project-1:** Fine grained classification on CUB-200-2011 dataset
> The task is to train a CNN model with an upper limit of 10M parameters to do fine grained classification on CUB-200-2011 dataset. 

### Collaborators: 
[![Munish](https://img.shields.io/badge/22M2153-Munish_Monga-blue)](https://github.com/munish30monga)
[![Sachin](https://img.shields.io/badge/22M2162-Sachin_Giroh-darkgreen)](https://github.com/22M2159)

### Table of Contents:
1. [Introduction](#introduction)
2. [Imporing Libraries](#imporing-libraries)
3. [Hyperparameters](#hyperparameters)
4. [Downloading and Processing CUB Dataset](#downloading-and-processing-cub-dataset)
5. [Preparing the Model](#preparing-the-model)
6. [Training Loop](#training-loop)
7. [Plotting Loss and Accuracy](#plotting-loss-and-accuracy)
8.  [References:](#references)

### Introduction

### Imporing Libraries

In [1]:
import wandb
import argparse
import yaml
import munch
import lightning as L
import torch.nn as nn
import torch.nn.functional as F
import timm
import torch
from lightning.pytorch.loggers import WandbLogger
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import lightning as L
from pathlib import Path
import numpy as np
from prettytable import PrettyTable
import albumentations as A
from albumentations.pytorch import ToTensorV2
from focal_loss.focal_loss import FocalLoss
from lightning.pytorch.trainer import Trainer
from lightning.pytorch.callbacks import ModelCheckpoint, LearningRateMonitor, RichProgressBar
from collections import OrderedDict
from torchvision.ops import FeaturePyramidNetwork

### Hyperparameters

In [2]:
hyperparameters = {
    "backbone": 'efficientnet_b0',  # 'efficientnet_b0', 'resnet18', 'dpn48b', 'mobilenetv2_140', 'efficientnet_b2', 'fastvit_s12', 'densenet121', 'mixnet_l'
    "pretrained": True,
    "unfreeze_last_n": -1,
    "dataset_dir": './datasets/cub',
    "batch_size": 32,
    "num_workers": 8,
    "img_size": 512,
    "optimizer": 'AdamW',  # 'Adam', 'SGD', 'AdamW'
    "scheduler": 'CosineAnnealing',  # 'CosineAnnealing', 'ReduceLROnPlateau'
    "epochs": 100,
    "learning_rate": 1e-3,
    "temperature": 0.5,
    "weight_decay": 1e-4,
    "patience": 5,
    "decay_factor": 0.5,
    "loss_function": 'CrossEntropy',  # 'CrossEntropy', 'FocalLoss'
    "label_smoothing": 0.2,
    "gamma": 1,
    "use_augm": False,
    "use_fpn": False,
}

### Configurations

In [3]:
class Config:
    def __init__(self, **entries):
        self.__dict__.update(entries)
        
cfg = Config(**hyperparameters)

### Downloading and Processing CUB-200-2011 Dataset <a id="downloading-and-processing-cub-dataset"></a>

In [4]:
# Uncomment & run only once for downloading data
# !bash down_process_CUB.sh

### Data Augmentations & Preprocessing

In [5]:
def get_transforms(cfg):
    transforms = {
        'train': A.Compose([
            A.Resize(cfg.img_size, cfg.img_size),
            A.CenterCrop(cfg.img_size, cfg.img_size),
            A.HorizontalFlip(),
            A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1, rotate_limit=15, p=0.5),
            A.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1, p=0.5),
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.2, p=0.5),
            A.Sharpen(alpha=(0.2, 0.5), lightness=(0.5, 1.0), p=0.5),
            A.CoarseDropout(max_holes=4, max_height=15, max_width=15, fill_value=0, p=0.3),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2(),
        ]),
        'val': A.Compose([
            A.Resize(cfg.img_size, cfg.img_size),
            A.CenterCrop(cfg.img_size, cfg.img_size),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2(),
        ]),
        'test': A.Compose([
            A.Resize(cfg.img_size, cfg.img_size),
            A.CenterCrop(cfg.img_size, cfg.img_size),
            A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
            ToTensorV2(),
        ])
    }
    return transforms

### CUB-200-2011 Dataloader

In [6]:
class CUB_Dataset(Dataset):
    def __init__(self, cfg, split='train', transform=None, split_ratio=0.2):
        self.cfg = cfg
        self.dataset_dir = Path(self.cfg.dataset_dir)
        self.transform = transform
        self.split = split
        self.split_ratio = split_ratio
        self.target2class_dict = {}
        self._load_metadata()

    def _load_metadata(self):
        images = pd.read_csv(self.dataset_dir / 'CUB_200_2011' / 'images.txt', sep=' ', names=['img_id', 'filepath'])
        image_class_labels = pd.read_csv(self.dataset_dir / 'CUB_200_2011' / 'image_class_labels.txt', sep=' ', names=['img_id', 'target'])
        train_test_split = pd.read_csv(self.dataset_dir / 'CUB_200_2011' / 'train_test_split.txt', sep=' ', names=['img_id', 'is_training_img'])
        classes = pd.read_csv(self.dataset_dir / 'CUB_200_2011' / 'classes.txt', sep=' ', names=['class_id', 'class_name'], index_col=False)
        self.target2class_dict = pd.Series(classes.class_name.values, index=classes.class_id).to_dict()

        data = images.merge(image_class_labels, on='img_id')
        data = data.merge(train_test_split, on='img_id')

        if self.split == 'train':
            self.data = data[data.is_training_img == 1]
        else:  # 'test'
            self.data = data[data.is_training_img == 0]

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data.iloc[idx]
        path = self.dataset_dir / 'CUB_200_2011' / 'images' / sample.filepath
        target = sample.target - 1  # Targets start at 1 by default, so shift to 0
        img = Image.open(path).convert('RGB')
        img = np.array(img)

        if self.transform:
            augmented = self.transform(image=img)
            img = augmented['image']

        return img, target

### CUB Dataset Pytorch Lightning Module

In [7]:
class CUB_DataModule(L.LightningDataModule):
    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.dataset_dir = Path(self.cfg.dataset_dir)
        self.batch_size = self.cfg.batch_size
        self.num_workers = self.cfg.num_workers
        self.transforms = get_transforms(self.cfg)

    def setup(self, stage=None):
        if stage in ('fit', None):
            self.train_dataset = CUB_Dataset(self.cfg, split='train', transform=self.transforms['train'] if self.cfg.use_augm else self.transforms['val'])
        if stage in ('validate', None):
            self.val_dataset = CUB_Dataset(self.cfg, split='test', transform=self.transforms['val'])
        if stage in ('test', None):
            self.test_dataset = CUB_Dataset(self.cfg, split='test', transform=self.transforms['test'])

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)
    
    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size, shuffle=False, num_workers=self.num_workers)

### Dataset Summary

In [8]:
def dataset_summary(cfg):
    print('=> Dataset Summary:')
    # Initialize datasets to load their metadata  
    train_dataset = CUB_Dataset(cfg, split='train')
    test_dataset = CUB_Dataset(cfg, split='test')

    # Calculate number of samples for each split
    num_samples_train = len(train_dataset)
    num_samples_test = len(test_dataset)
    total_samples = num_samples_train + num_samples_test
    
    # Create and fill the table
    table = PrettyTable()
    table.field_names = ["Split", "Number of Samples", "Percentage"]
    
    # Calculate and add the percentage for each split
    percentage_train = (num_samples_train / total_samples) * 100
    percentage_test = (num_samples_test / total_samples) * 100
    
    table.add_row(["Train", num_samples_train, f"{percentage_train:.2f}%"])
    table.add_row(["Test", num_samples_test, f"{percentage_test:.2f}%"])
    
    print(table)
    
    num_classes = len(set(train_dataset.data['target']))
    print(f"Number of classes: {num_classes}")
    
    dataset_summary_dict = {
        'train_dataset': train_dataset,
        'test_dataset':test_dataset,
        'num_classes':num_classes
    }
    return dataset_summary_dict

In [9]:
dataset_summary_dict = dataset_summary(cfg)
data_module = CUB_DataModule(cfg)
data_module.setup()
num_classes = dataset_summary_dict['num_classes']

=> Dataset Summary:
+-------+-------------------+------------+
| Split | Number of Samples | Percentage |
+-------+-------------------+------------+
| Train |        5994       |   50.85%   |
|  Test |        5794       |   49.15%   |
+-------+-------------------+------------+
Number of classes: 200


### Loss Functions

In [10]:
def choose_loss_function(cfg):
    if cfg.label_smoothing and cfg.loss_function == 'CrossEntropy':
        return nn.CrossEntropyLoss(label_smoothing=cfg.label_smoothing)
    
    if cfg.loss_function == 'CrossEntropy':
        return nn.CrossEntropyLoss()
    
    if cfg.loss_function == 'FocalLoss':
        return FocalLoss(gamma=cfg.gamma)

### Optimizers & Learning Rate Schedulers

In [11]:
def choose_optimizer_scheduler(cfg, parameters, learning_rate):
    optimizer = {
        'Adam': torch.optim.Adam(parameters, lr=float(learning_rate)),
        'SGD': torch.optim.SGD(parameters, lr=float(learning_rate)),
        'AdamW': torch.optim.AdamW(parameters, lr=float(learning_rate), weight_decay=float(cfg.weight_decay)),
    }[cfg.optimizer]
    print(f"=> Using '{cfg.optimizer}' optimizer.")
    
    scheduler = {
        'CosineAnnealing': {
            'scheduler': torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=40, eta_min=0),
            'interval': 'epoch',
            'frequency': 1
        },
        'ReduceLROnPlateau': {
            'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=cfg.patience, min_lr=0, factor=cfg.decay_factor),
            'monitor': 'val_loss',  
            'interval': 'epoch',
            'frequency': 1
        }
    }[cfg.scheduler]
    print(f"=> Using '{cfg.scheduler}' scheduler.")
    return optimizer, scheduler

### Fine-grained Classification Model

In [12]:
class FGCM_Model(L.LightningModule):
    def __init__(self, cfg, num_classes):
        super().__init__()
        self.cfg = cfg
        self.learning_rate = cfg.learning_rate
        self.save_hyperparameters()  
        if cfg.use_fpn:
            self.base_model = timm.create_model(self.cfg.backbone, pretrained=cfg.pretrained, features_only=True, num_classes=num_classes)
            feature_channels = self.base_model.feature_info.channels()
            self.fpn = FeaturePyramidNetwork(
                in_channels_list=feature_channels,
                out_channels=256,
            )
            self.projection = nn.Linear(256 * len(feature_channels), num_classes)
            # self.projection.apply(self.init_weights)
        else:
            self.base_model = timm.create_model(self.cfg.backbone, pretrained=cfg.pretrained, num_classes=num_classes)
            
        self.criterion = choose_loss_function(self.cfg)
        
        # If unfreeze_last_n is -1, make all layers trainable
        if self.cfg.unfreeze_last_n == -1:
            print("=> All layers are trainable.")
            for param in self.base_model.parameters():
                param.requires_grad = True
        else:
            # Freeze all layers initially
            if self.cfg.unfreeze_last_n == 0:
                print("=> All layers are frozen.")
            else:
                print(f"=> Unfreezing the last {self.cfg.unfreeze_last_n} layers.")
            for param in self.base_model.parameters():
                param.requires_grad = False

            # Unfreeze the last n layers
            num_layers = len(list(self.base_model.children()))
            for i, child in enumerate(self.base_model.children()):
                if i >= num_layers - self.cfg.unfreeze_last_n:
                    for param in child.parameters():
                        param.requires_grad = True
    
    def init_weights(self, layer):
        if isinstance(layer, nn.Linear):
            torch.nn.init.kaiming_normal_(layer.weight)   
                        
    def forward(self, x):
        if self.cfg.use_fpn:
            features = self.base_model(x)
            
            # Create an OrderedDict for FPN input
            fpn_input = OrderedDict([
                (f'feat{i}', feature) for i, feature in enumerate(features)
            ])
            
            # FPN Output
            fpn_output = self.fpn(fpn_input)
            
            combined_features = torch.cat([torch.nn.functional.adaptive_avg_pool2d(output, (1, 1)) for output in fpn_output.values()], dim=1)
            combined_features = combined_features.view(combined_features.size(0), -1)
            
            # Final classification
            logits = self.projection(combined_features)
        else:
            x = self.base_model(x)
            logits = x / self.cfg.temperature
        return logits
        
    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        train_loss = self.criterion(F.softmax(logits, dim=1), y) if self.cfg.loss_function == 'FocalLoss' else self.criterion(logits, y)
        self.log('train_loss', train_loss, on_step=False, on_epoch=True, prog_bar=True, logger=True)
        return train_loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(F.softmax(logits, dim=1), y) if self.cfg.loss_function == 'FocalLoss' else self.criterion(logits, y)
        preds = torch.argmax(logits, dim=1)
        acc = torch.tensor(torch.sum(preds == y).item() / len(preds), device=self.device)*100
        self.log('val_loss', loss, on_epoch=True, prog_bar=True, logger=True)
        self.log('val_acc', acc, on_epoch=True, prog_bar=True, logger=True)
        return {'val_loss': loss, 'test_acc': acc}

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = self.criterion(F.softmax(logits, dim=1), y) if self.cfg.loss_function == 'FocalLoss' else self.criterion(logits, y) 
        preds = torch.argmax(logits, dim=1)
        acc = torch.tensor(torch.sum(preds == y).item() / len(preds), device=self.device)*100
        self.log('test_loss', loss, on_epoch=True, prog_bar=True, logger=True)
        self.log('test_acc', acc, on_epoch=True, prog_bar=True, logger=True)
        return {'test_loss': loss, 'test_acc': acc}

    def configure_optimizers(self):        
        optimizer = {
            'Adam': torch.optim.Adam(self.parameters(), lr=float(self.learning_rate)),
            'SGD': torch.optim.SGD(self.parameters(), lr=float(self.learning_rate)),
            'AdamW': torch.optim.AdamW(self.parameters(), lr=float(self.learning_rate), weight_decay=float(self.cfg.weight_decay)),
        }[self.cfg.optimizer]
        print(f"=> Using '{self.cfg.optimizer}' optimizer.")
        
        scheduler = {
            'CosineAnnealing': {
                'scheduler': torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=40, eta_min=0),
                'interval': 'epoch',
                'frequency': 1
            },
            'ReduceLROnPlateau': {
                'scheduler': torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=self.cfg.patience, min_lr=0, factor=self.cfg.decay_factor),
                'monitor': 'val_loss',  
                'interval': 'epoch',
                'frequency': 1
            }
        }[self.cfg.scheduler]
        print(f"=> Using '{self.cfg.scheduler}' scheduler.")
        
        return [optimizer], [scheduler]

In [13]:
print(f"=> Fine-Grained Classification Model is build using '{cfg.backbone}' as base model.")
model = FGCM_Model(cfg, num_classes)

=> Fine-Grained Classification Model is build using 'efficientnet_b0' as base model.
=> All layers are trainable.


### Training Loop

In [14]:
def train_model(cfg, model, data_module, logger):  
    # Callbacks      
    checkpoint_callback = ModelCheckpoint(
        dirpath='./checkpoints',
        monitor='val_acc',
        filename='{cfg.backbone}_{epoch:02d}_{acc:.2f}',
        save_top_k=1,
        mode='max',
        verbose=True,
    )
    LR_monitor_callback = LearningRateMonitor(
        logging_interval='epoch', 
    )
    Rich_pbar_callback = RichProgressBar()
     
    # Initialize trainer
    trainer = Trainer(
        max_epochs=cfg.epochs,
        log_every_n_steps=1,
        callbacks = [
            LR_monitor_callback, 
            checkpoint_callback,
            Rich_pbar_callback
        ],
        logger=logger,
        accelerator='gpu',
        devices=1,
    )
    
    # Train the model
    trainer.fit(model, datamodule=data_module)
    
    best_model_path = checkpoint_callback.best_model_path 
       
    return trainer, best_model_path

In [22]:
trainer, best_model_path = train_model(cfg, model, data_module, logger=None)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
/raid/biplab/munish/miniconda3/envs/GNR_638/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torc

=> Using 'AdamW' optimizer.
=> Using 'CosineAnnealing' scheduler.


Output()

Epoch 0, global step 188: 'val_acc' reached 60.18295 (best 60.18295), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=00_acc=0.00-v7.ckpt' as top 1


Epoch 1, global step 376: 'val_acc' reached 70.52123 (best 70.52123), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=01_acc=0.00-v9.ckpt' as top 1


Epoch 2, global step 564: 'val_acc' reached 75.85433 (best 75.85433), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=02_acc=0.00-v6.ckpt' as top 1


Epoch 3, global step 752: 'val_acc' reached 77.87366 (best 77.87366), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=03_acc=0.00-v1.ckpt' as top 1


Epoch 4, global step 940: 'val_acc' reached 78.96099 (best 78.96099), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=04_acc=0.00-v2.ckpt' as top 1


Epoch 5, global step 1128: 'val_acc' reached 81.20470 (best 81.20470), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=05_acc=0.00-v2.ckpt' as top 1


Epoch 6, global step 1316: 'val_acc' was not in top 1


Epoch 7, global step 1504: 'val_acc' was not in top 1


Epoch 8, global step 1692: 'val_acc' was not in top 1


Epoch 9, global step 1880: 'val_acc' reached 81.86054 (best 81.86054), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=09_acc=0.00.ckpt' as top 1


Epoch 10, global step 2068: 'val_acc' reached 82.13670 (best 82.13670), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=10_acc=0.00-v1.ckpt' as top 1


Epoch 11, global step 2256: 'val_acc' reached 83.60373 (best 83.60373), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=11_acc=0.00.ckpt' as top 1


Epoch 12, global step 2444: 'val_acc' was not in top 1


Epoch 13, global step 2632: 'val_acc' was not in top 1


Epoch 14, global step 2820: 'val_acc' was not in top 1


Epoch 15, global step 3008: 'val_acc' reached 83.81084 (best 83.81084), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=15_acc=0.00.ckpt' as top 1


Epoch 16, global step 3196: 'val_acc' reached 83.91439 (best 83.91439), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=16_acc=0.00.ckpt' as top 1


Epoch 17, global step 3384: 'val_acc' reached 84.48395 (best 84.48395), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=17_acc=0.00.ckpt' as top 1


Epoch 18, global step 3572: 'val_acc' reached 84.94995 (best 84.94995), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=18_acc=0.00-v2.ckpt' as top 1


Epoch 19, global step 3760: 'val_acc' reached 85.51950 (best 85.51950), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=19_acc=0.00-v3.ckpt' as top 1


Epoch 20, global step 3948: 'val_acc' was not in top 1


Epoch 21, global step 4136: 'val_acc' was not in top 1


Epoch 22, global step 4324: 'val_acc' reached 85.79565 (best 85.79565), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=22_acc=0.00.ckpt' as top 1


Epoch 23, global step 4512: 'val_acc' was not in top 1


Epoch 24, global step 4700: 'val_acc' was not in top 1


Epoch 25, global step 4888: 'val_acc' was not in top 1


Epoch 26, global step 5076: 'val_acc' reached 85.91647 (best 85.91647), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=26_acc=0.00.ckpt' as top 1


Epoch 27, global step 5264: 'val_acc' reached 86.00276 (best 86.00276), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=27_acc=0.00-v2.ckpt' as top 1


Epoch 28, global step 5452: 'val_acc' was not in top 1


Epoch 29, global step 5640: 'val_acc' was not in top 1


Epoch 30, global step 5828: 'val_acc' reached 86.14084 (best 86.14084), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=30_acc=0.00-v1.ckpt' as top 1


Epoch 31, global step 6016: 'val_acc' reached 86.27891 (best 86.27891), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=31_acc=0.00.ckpt' as top 1


Epoch 32, global step 6204: 'val_acc' was not in top 1


Epoch 33, global step 6392: 'val_acc' was not in top 1


Epoch 34, global step 6580: 'val_acc' was not in top 1


Epoch 35, global step 6768: 'val_acc' was not in top 1


Epoch 36, global step 6956: 'val_acc' was not in top 1


Epoch 37, global step 7144: 'val_acc' reached 86.39973 (best 86.39973), saving model to '/raid/biplab/munish/GitHub/GNR_638/checkpoints/cfg.backbone=0_epoch=37_acc=0.00-v4.ckpt' as top 1


Epoch 38, global step 7332: 'val_acc' was not in top 1


Epoch 39, global step 7520: 'val_acc' was not in top 1


Epoch 40, global step 7708: 'val_acc' was not in top 1


Epoch 41, global step 7896: 'val_acc' was not in top 1


Epoch 42, global step 8084: 'val_acc' was not in top 1


Epoch 43, global step 8272: 'val_acc' was not in top 1


Epoch 44, global step 8460: 'val_acc' was not in top 1


Epoch 45, global step 8648: 'val_acc' was not in top 1


Epoch 46, global step 8836: 'val_acc' was not in top 1


Epoch 47, global step 9024: 'val_acc' was not in top 1


Epoch 48, global step 9212: 'val_acc' was not in top 1


Epoch 49, global step 9400: 'val_acc' was not in top 1


Epoch 50, global step 9588: 'val_acc' was not in top 1


Epoch 51, global step 9776: 'val_acc' was not in top 1


Epoch 52, global step 9964: 'val_acc' was not in top 1


Epoch 53, global step 10152: 'val_acc' was not in top 1


Epoch 54, global step 10340: 'val_acc' was not in top 1


Epoch 55, global step 10528: 'val_acc' was not in top 1


Epoch 56, global step 10716: 'val_acc' was not in top 1


Epoch 57, global step 10904: 'val_acc' was not in top 1


Epoch 58, global step 11092: 'val_acc' was not in top 1


Epoch 59, global step 11280: 'val_acc' was not in top 1


Epoch 60, global step 11468: 'val_acc' was not in top 1


Epoch 61, global step 11656: 'val_acc' was not in top 1


Epoch 62, global step 11844: 'val_acc' was not in top 1


Epoch 63, global step 12032: 'val_acc' was not in top 1


Epoch 64, global step 12220: 'val_acc' was not in top 1


Epoch 65, global step 12408: 'val_acc' was not in top 1


Epoch 66, global step 12596: 'val_acc' was not in top 1


Epoch 67, global step 12784: 'val_acc' was not in top 1


Epoch 68, global step 12972: 'val_acc' was not in top 1


Epoch 69, global step 13160: 'val_acc' was not in top 1


Epoch 70, global step 13348: 'val_acc' was not in top 1


Epoch 71, global step 13536: 'val_acc' was not in top 1


Epoch 72, global step 13724: 'val_acc' was not in top 1


Epoch 73, global step 13912: 'val_acc' was not in top 1


Epoch 74, global step 14100: 'val_acc' was not in top 1


Epoch 75, global step 14288: 'val_acc' was not in top 1


Epoch 76, global step 14476: 'val_acc' was not in top 1


Epoch 77, global step 14664: 'val_acc' was not in top 1


Epoch 78, global step 14852: 'val_acc' was not in top 1


Epoch 79, global step 15040: 'val_acc' was not in top 1


Epoch 80, global step 15228: 'val_acc' was not in top 1


Epoch 81, global step 15416: 'val_acc' was not in top 1


Epoch 82, global step 15604: 'val_acc' was not in top 1


Epoch 83, global step 15792: 'val_acc' was not in top 1


Epoch 84, global step 15980: 'val_acc' was not in top 1


Epoch 85, global step 16168: 'val_acc' was not in top 1


Epoch 86, global step 16356: 'val_acc' was not in top 1


Epoch 87, global step 16544: 'val_acc' was not in top 1


Epoch 88, global step 16732: 'val_acc' was not in top 1


Epoch 89, global step 16920: 'val_acc' was not in top 1


### Testing Model

In [15]:
def test_model(best_model_path, data_module, logger):
    print(f"Loading best model from {best_model_path}")

    # Initialize trainer
    trainer = Trainer(
        accelerator='gpu',
        devices=1,
        logger=logger,
    )
    
    # Load the best model
    best_model = FGCM_Model.load_from_checkpoint(best_model_path)
    
    # Run the test using the best model
    trainer.test(best_model, datamodule=data_module)

In [17]:
test_model(best_model_path, data_module, logger=None)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


Loading best model from ./checkpoints/cfg.backbone=0_epoch=37_acc=0.00-v4.ckpt


/raid/biplab/munish/miniconda3/envs/GNR_638/lib/python3.11/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:75: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


=> All layers are trainable.


You are using a CUDA device ('NVIDIA A100-SXM4-80GB') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]


Testing: |          | 0/? [00:00<?, ?it/s]