In [None]:
// Keep-alive helper for Google Colab.
// Usage: right-click the page -> "Inspect" -> "Console", clear the console and paste the code below.
// NOTE: this is browser JavaScript, not Python - it is meant for the browser console, not for a notebook cell.
function ConnectButton(){
    console.log("Connect pushed");
    // The connect button lives inside the shadow DOM of the <colab-connect-button> element
    document.querySelector("#top-toolbar > colab-connect-button").shadowRoot.querySelector("#connect").click();
}

// Click the connect button every 600000 ms (= 10 minutes); keep the timer id so it can be cancelled
var colab = setInterval(ConnectButton, 600000);

// clearInterval(colab);     // run this in the console to stop the keep-alive timer

### Mounting Drive and Installing Libraries

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive') 

#install libraries 
!pip install -q -r '/content/drive/MyDrive/Colab Notebooks/requirements.txt' 


### Import Libraries 

In [3]:
#Basic Libraries / Data Analytics Libraries 
import random 
import sys 
import os 
import numpy as np
import pandas as pd
from PIL import Image
from pathlib import Path 

#Machine Learning Libraries 
import matplotlib.pyplot as plt 
from sklearn.datasets import load_iris 
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, train_test_split 

#deep learning libraries 
import cv2 
import torch.nn.init as init 
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms.functional as TF 
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR 
from torch.utils.data import DataLoader, Dataset, TensorDataset, random_split, Subset, WeightedRandomSampler 
from PIL import Image 
import torchvision
from torchvision.utils import save_image  
from torchvision import datasets 
from torchvision.datasets import DatasetFolder, ImageFolder
import torchvision.transforms as transforms 
import torchmetrics 
from torchsummary import summary 
import torchinfo
import timm 
import albumentations as A 


: 

### Get the Data Ready

In [176]:

# Device-agnostic setup: prefer an NVIDIA GPU (CUDA), then an Apple GPU (MPS),
# and fall back to the CPU when neither accelerator is available.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "mps:0" if torch.backends.mps.is_available() else "cpu"

>> Visualize the images from the directory

In [None]:

# Seed Python's RNG so the same images are picked on every run
random.seed(42)  # <- try changing this and see what happens

# Root directory of the images to browse.  NB: Use TRAIN dataset
directory_path = "path_to_directory"  # Replace with the path to your image directory

# Recursively collect every image file below directory_path.
# Sorting makes the seeded selection independent of the order in which the file system lists files.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png', '.gif')
image_path_list = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(directory_path)
    for file in files
    if file.lower().endswith(IMAGE_EXTENSIONS)
)

# Number of random images to visualize (the same image may be drawn more than once)
num_images_to_visualize = 5

if not image_path_list:
    # random.choice() raises IndexError on an empty list, so report the real problem instead
    print(f"No images found under {directory_path!r} - check directory_path.")
else:
    for _ in range(num_images_to_visualize):
        # Get random image path
        random_image_path = random.choice(image_path_list)

        # The image class is the name of the directory in which the image is stored
        image_class = os.path.basename(os.path.dirname(random_image_path))

        # The context manager closes the underlying file once the image has been drawn
        with Image.open(random_image_path) as img:
            # Print metadata
            print(f"Random image path: {random_image_path}")
            print(f"Image class: {image_class}")
            print(f"Image height: {img.height}")
            print(f"Image width: {img.width}")

            # Render inline with matplotlib (as the other cells do); Image.show() relies on
            # whichever viewer Pillow can find, which may be an external program.
            fig, ax = plt.subplots()
            ax.imshow(img.convert("RGB"))
            ax.set_title(image_class)
            ax.axis("off")
            plt.show()


>> Visualize the transforms

In [None]:

def plot_transformed_images(image_paths, transform, n=1, seed=42):
    """Plot randomly chosen images next to their transformed versions.

    Opens up to n image paths from image_paths, applies transform to each
    image and draws the original and the transformed image side by side.

    Args:
        image_paths (list): List of target image paths.
        transform (callable): Transform applied to each opened image. Its result must
            support ``.numpy()`` and is re-ordered from (C, H, W) to (H, W, C) for plotting.
        n (int, optional): Number of images to plot. Defaults to 1. Values larger than
            ``len(image_paths)`` are capped instead of raising ``ValueError``.
        seed (int, optional): Random seed for the random generator. Defaults to 42.

    Returns:
        None. The figures are created as a side effect.
    """
    random.seed(seed)

    image_paths = list(image_paths)
    # random.sample() raises ValueError when k exceeds the population size (or is negative)
    n = max(0, min(n, len(image_paths)))
    if n == 0:
        print("No images to plot.")
        return

    random_image_paths = random.sample(image_paths, k=n)
    for image_path in random_image_paths:
        with Image.open(image_path) as f:
            fig, ax = plt.subplots(1, 2)
            ax[0].imshow(f)
            ax[0].set_title(f"Original \nSize: {f.size}")
            ax[0].axis("off")

            # Transform and plot image
            transformed_image = transform(f)
            transformed_image = transformed_image.numpy()  # Convert transformed image to numpy array
            transformed_image = np.transpose(transformed_image, (1, 2, 0))  # Reorder axes to (H, W, C)
            ax[1].imshow(transformed_image)
            ax[1].set_title(f"Transformed \nSize: {transformed_image.shape}")
            ax[1].axis("off")

            # The parent directory name is used as the class name
            fig.suptitle(f"Class: {Path(image_path).parent.stem}", fontsize=16)

# Augmentation pipeline for the images, listed in the order in which the steps are applied
augmentation_steps = [
    transforms.Resize(size=(155, 155)),   # bring every image to 155 x 155
    transforms.RandomCrop(size=145),      # then cut a random 145 x 145 window
    transforms.ColorJitter(brightness=0.4, contrast=0.8, saturation=0.4, hue=0.1),
    transforms.ToTensor(),                # image -> torch.Tensor, pixel values 0..255 become 0.0..1.0
    # transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    transforms.RandomHorizontalFlip(p=0.5),   # p = probability of a horizontal flip (50%)
    transforms.RandomRotation((-270, 270)),
    transforms.RandomRotation(degrees=30),
    # Fixed affine step: shift by (10, 10), no rotation, scaling or shear
    transforms.Lambda(lambda x: TF.affine(x, angle=0, translate=(10, 10), scale=1.0, shear=0)),
]
image_transforms = transforms.Compose(augmentation_steps)

# Alternative pipeline using albumentations (kept for reference):
# image_transforms = A.Compose(
#     [
#     A.Resize(width = 1920, height = 1080),
#     A.RandomCrop(width = 1280, height = 720),
#     A.Rotate(limit = 40, p = 0.9),
#     A.HorizontalFlip(p = 0.5),
#     A.VerticalFlip(p = 0.1),
#     A.RGBShift(r_shift_limit = 25, g_shift_limit = 25, b_shift_limit = 25, p = 0.5 ),
#     A.OneOf([
#         A.Blur(blur_limit = 3, p = 0.5),
#         A.ColorJitter(p = 0.5)
#             ], p = 1.0)
#     ]
#     )


# Show one random image before / after the pipeline
plot_transformed_images(image_path_list, transform=image_transforms, n=1)



>> Load the Data (generic)

In [None]:

# Pass the path itself so that NumPy opens *and closes* the file
# (a handle created with open(...) inline is never closed).
data_mnist = np.loadtxt(r"/content/sample_data/mnist_train_small.csv", delimiter=',')
print("shape: ", data_mnist.shape)

# Column 0 holds the class label; the remaining columns are the input features
class_labels = data_mnist[:, 0].astype(int)
unique_labels = np.unique(class_labels)
class_names = [str(label) for label in unique_labels]

inputs = data_mnist[:, 1:]
labels = data_mnist[:, 0]

# train, test = torch.utils.data.random_split(train_dataset, [800, 200]) #train/test split
X_train, X_test, y_train, y_test = train_test_split(inputs, labels, random_state=23, train_size=0.9)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# NB: Use Batch or Layer normalization in place of MinMaxScaler or StandardScaler for deep learning
# Fit the scaler on the training split only and reuse it for the test split,
# so that no statistics of the test data leak into training.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert to tensors: float32 features, integer (long) class labels
X_train = torch.tensor(X_train, dtype=torch.float32)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.long)
y_test = torch.tensor(y_test, dtype=torch.long)

# Pair features with labels
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)



>> Load the Data (torchvision)

In [None]:
# Define the transformation to apply to the images

# Option 1: DatasetFolder - one sub-folder per class, with an explicitly chosen file loader
dataset = DatasetFolder(
    root='Dataset/',
    loader=torchvision.datasets.folder.default_loader,  # the default image loader
    extensions=".jpg",                                  # only files with this extension are used
    transform=image_transforms,                         # transformation pipeline applied to each image
)
class_labels = dataset.classes
print(class_labels)  # the class labels found under the root folder


# Option 2: ImageFolder (a subclass of DatasetFolder)
# NB: ImageFolder simplifies loading such datasets: class labels are assigned automatically from the
#     folder names and a default image loader is used (with DatasetFolder you choose the loader yourself).
dataset = ImageFolder(root='Dataset/', transform=image_transforms)
class_labels = dataset.classes
print(class_labels)  # the class labels found under the root folder


#3. Using Custom Dataset
class CustomImageDataset(Dataset):
    """Image dataset described by a CSV annotation file.

    The first CSV column is read as the image file name (joined onto root_dir)
    and the second CSV column as the label of that image.
    """

    def __init__(self, csv_file, root_dir, transform=None):
        """
        Arguments:
            csv_file (string): Path to the csv file with annotations.
            root_dir (string): Directory with all the images.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.data = pd.read_csv(csv_file)  # Read the CSV file
        self.root_dir = root_dir
        self.transform = transform

    def __len__(self):
        """Return the number of annotated samples (rows of the CSV)."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the (image, label) pair stored in row idx of the CSV."""
        # An index may arrive as a tensor; convert it to plain Python before positional lookup
        if torch.is_tensor(idx):
            idx = idx.tolist()

        img_name = self.data.iloc[idx, 0]  # Get the image file name from the CSV
        img_path = os.path.join(self.root_dir, img_name)
        # convert() returns a new image, so the file can be closed right after it
        with Image.open(img_path) as img:
            image = img.convert('RGB')

        if self.transform:
            image = self.transform(image)  # Apply transformations if provided

        label = self.data.iloc[idx, 1]  # Get the corresponding label from the CSV

        return image, label

    def get_class_names(self):
        """
        Retrieve the class names.

        Uses the 'label' column when the CSV has one; otherwise falls back to the
        second column, i.e. the same column __getitem__ reads labels from.

        Returns:
            list: The distinct label values in order of first appearance.
        """
        if 'label' in self.data.columns:
            label_column = self.data['label']
        else:
            label_column = self.data.iloc[:, 1]
        return label_column.unique().tolist()

# Build the CSV-driven dataset and list the classes it contains
face_dataset = CustomImageDataset(
    csv_file='data/faces/face_landmarks.csv',
    root_dir='data/faces/',
    transform=image_transforms,
)

class_names = face_dataset.get_class_names()
print(class_names)


In [None]:
#Visualize images from the loaded dataset 

# Show a 3 x 3 grid of random samples from the folder dataset.
# `dataset` (ImageFolder / DatasetFolder) exposes .classes; the TensorDataset and the Subset objects
# stored in train_dataset provide neither .targets nor .classes, so they cannot be used here.
fig, axs = plt.subplots(3, 3, figsize=(5, 5))

for ax in axs.flatten():
    # select a random picture
    randidx = np.random.choice(len(dataset))

    # extract that image together with its class index
    pic, label = dataset[randidx]

    # Reorder the image tensor from (C, H, W) to (H, W, C) for matplotlib
    pic = pic.permute(1, 2, 0).cpu().detach().numpy()

    # Clip values outside [0, 1] so imshow receives a valid float image
    pic = pic.clip(0, 1)

    # translate the class index into its class name
    label = dataset.classes[label]

    # and show!
    ax.imshow(pic)
    ax.text(16, 0, label, ha='center', fontweight='bold', color='k', backgroundcolor='y')
    ax.axis('off')

plt.tight_layout()
plt.show()


# #if you want to save the transformed images
# img_num = 0
# for _ in range(10):
#     for img, label in train_dataset:
#         save_image(img, 'img' + str(img_num) + '.png')
#         img_num +=1 

>> Data Split

In [None]:
#When using ImageFolder, DatasetFolder, or a custom dataset to load your data, you can split it into training and 
# testing sets using the random_split function from PyTorch


# Split the dataset into training and testing sets
train_size = int(0.8 * len(dataset))  # 80% for training, adjust as desired
test_size = len(dataset) - train_size
# A seeded generator makes the split identical after every kernel restart.
# NOTE: both subsets wrap the same `dataset`, so they share its transform (including any random augmentation).
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42)
)

# #or
# train_dataset = Subset(test_dataset, randidx[:6000]) #subset of test (or use range(2000) instead of randidx[;6000])
# test_dataset = Subset(test_dataset, randidx[6000:]) #subset of test

>> Data Balancing

In [None]:
# How to deal with unbalanced data
    # Get more data
    # undersample
    # Oversample
        # create multiple copies of the rare data. (be careful because it increases the risk of overfitting)
    # Data Augmentation
        # Add new features as non-linear transformations of existing data
    # Create synthetic samples
        # you can use SMOTE
    # Consider whether non-deep learning would be better

In [None]:

root_dir = directory_path       #root directory of the train dataset

# Collect the label of every training sample.
if hasattr(train_dataset, "indices") and hasattr(getattr(train_dataset, "dataset", None), "targets"):
    # Subset over a dataset that stores its labels in .targets: read them without decoding any image
    train_labels = [train_dataset.dataset.targets[i] for i in train_dataset.indices]
else:
    # Generic fallback: iterate over the samples and keep only the labels
    train_labels = [label for _, label in train_dataset]
# Tensor / NumPy scalars -> plain Python ints
train_labels = [int(label.item()) if hasattr(label, "item") else int(label) for label in train_labels]

# Weight of a class = 1 / (number of training samples of that class).
# Counting the labels of the training split itself keeps the weights aligned with the dataset's
# class indices, which a directory walk (arbitrary order, whole-dataset counts) does not guarantee.
class_counts = np.bincount(np.asarray(train_labels, dtype=int))
class_weights = [1.0 / count if count > 0 else 0.0 for count in class_counts]

# Every sample inherits the weight of its class
sample_weights = [class_weights[label] for label in train_labels]

# Create a sampler with weighted sampling
sampler = WeightedRandomSampler(sample_weights, len(sample_weights), replacement=True)



# # Calculate the weights for each sample based on class frequencies
# weights = [1.0 / class_counts[label] for _, label in train_dataset]

# # Create a sampler with weighted sampling
# sampler = WeightedRandomSampler(weights, len(train_dataset), replacement=True)


>> Data Loader

In [None]:
#dataloader
batch_size = 32
shuffle = True
drop_last = True


def load_dataloader (batch_size = batch_size, shuffle = True, drop_last = True, sampler = sampler):
    """Build the train and test DataLoaders from the global train_dataset / test_dataset.

    Args:
        batch_size (int): Batch size of the training loader.
        shuffle (bool): Shuffle the training data. Only used when ``sampler`` is None,
            because DataLoader does not accept shuffle=True together with a sampler.
        drop_last (bool): Drop the last incomplete training batch.
        sampler: Sampler for the training loader. The default is the ``sampler`` object that
            existed when this function was defined; re-run this cell after rebuilding it.

    Returns:
        tuple: (train_dataloader, test_dataloader). The test loader is not shuffled and
        yields the whole test set as a single batch.
    """
    train_dataloader = DataLoader(
        train_dataset,
        batch_size=batch_size,
        drop_last=drop_last,
        sampler=sampler,
        shuffle=bool(shuffle) if sampler is None else False,   # don't shuffle when using a sampler
    )
    test_dataloader = DataLoader(test_dataset, shuffle=False, batch_size=len(test_dataset))

    return train_dataloader, test_dataloader


train_dataloader, test_dataloader = load_dataloader(batch_size=32)

### Build the Model

In [None]:
# Some common hyperparameters/metaparameters to consider when building the model include:

# Model architecture: 
    # Determines the overall structure and complexity of the neural network.
    
# Number of hidden layers:
    # Determines the depth of the neural network architecture and influences the model's capacity to learn complex
    # patterns.
    
# Number of neurons per layer:
    # Defines the width of the neural network architecture and affects the model's representational capacity and
    # computational efficiency.
    
# Activation functions:
    # Determines the non-linear transformation applied to the output of each neuron, introducing non-linearity into
    # the model.
    
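# Batch Normalization (nn.BatchNorm2d(nUnits))
    # Normalizes the activations of a layer over each mini-batch (it does not normalize the weights),
    # which stabilizes and usually speeds up training.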
    
# Dropout (nn.Dropout(p = 0.2))
    # Controls the regularization technique of randomly dropping out a fraction of neurons during training, which
    # helps prevent overfitting.
    
# Weight Initialization:
    # Defines the initial values of the weights in the neural network.

>> Build the Model from Scratch

In [None]:
#consider the following when building your model architecture:
    # Model Complexity
    # Data Augmentation
    # Input Image Size

In [None]:
# define the model 
#ANN Model

class iris_model(nn.Module):
    """Fully connected classifier: 4 input features -> 64 -> 64 -> 3 output logits.

    Args:
        weight_init (str): Weight initialisation scheme. One of
            'default' (keep PyTorch's own initialisation),
            'xavier_uniform' or 'kaiming_normal'.

    Raises:
        ValueError: If weight_init is not one of the supported names
            (a misspelled name would otherwise silently fall back to the default).
    """
    def __init__(self, weight_init='default'):
        super(iris_model, self).__init__()
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 3)

        if weight_init == 'default':
            pass  # Default weight initialization

        elif weight_init == 'xavier_uniform':
            self._init_weights_xavier_uniform()

        elif weight_init == 'kaiming_normal':
            self._init_weights_kaiming_normal()

        else:
            raise ValueError(
                f"Unknown weight_init {weight_init!r}; expected 'default', "
                "'xavier_uniform' or 'kaiming_normal'"
            )

    def forward(self, x):
        """Return the raw class scores (no softmax is applied) for input x."""
        out = F.relu(self.fc1(x))
        # out = self.fca(out)
        out = F.relu(self.fc2(out))
        # out = self.fcb(out)
        out = self.fc3(out)
        return out

    def _init_weights_xavier_uniform(self):
        """Xavier-uniform weights and zero biases for every linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def _init_weights_kaiming_normal(self):
        """Kaiming-normal (fan_in, ReLU) weights and zero biases for every linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    init.constant_(m.bias, 0)

model = iris_model(weight_init='default')    #initializing the model  #model = iris_model(weight_init='xavier_uniform')


#2

# def create_model(nUnits, nLayers, weight_init):
#     class iris_model(nn.Module):
#         def __init__(self):
#             super().__init__()

#             # Create dictionary to store the layers
#             self.layers = nn.ModuleDict()
#             self.nLayers = nLayers 
#             self.weight_init = weight_init

#             ### Input layer
#             self.layers['input'] = nn.Linear(4, nUnits)
#             self.layers['input_bn'] = nn.BatchNorm1d(nUnits)
#             self.layers['input_dropout'] = nn.Dropout(0.2)
            
#             ### Hidden layers
#             for i in range(nLayers):
#                 self.layers[f'hidden{i}'] = nn.Linear(nUnits, nUnits)
#                 self.layers[f'hidden{i}_bn'] = nn.BatchNorm1d(nUnits)
#                 self.layers[f'hidden{i}_dropout'] = nn.Dropout(0.2)

#             ### Output layer
#             self.layers['output'] = nn.Linear(nUnits, 3)
        
#             # Initialize weights
#             self._initialize_weights()
            
        
#         # Forward pass
#         def forward(self, x):
#             # Input layer
#             x = self.layers['input'](x)
#             x = self.layers['input_bn'](x)
#             x = F.relu(x)
#             x = self.layers['input_dropout'](x)

#             # Hidden layers
#             for i in range(self.nLayers):
#                 x = self.layers[f'hidden{i}'](x)
#                 x = self.layers[f'hidden{i}_bn'](x)
#                 x = F.relu(x)
#                 x = self.layers[f'hidden{i}_dropout'](x)
                
#             # Output layer
#             x = self.layers['output'](x)    #or x = F.sigmoid(self.layers['output](x)) for Binary classification 
            
#             return x 
        
#         def _initialize_weights(self):
#             for name, module in self.layers.items():
#                 if isinstance(module, nn.Linear):
#                     weight_init = self.weight_init.get(name, 'default') #works well with sigmoid (uniform distribution)
#                     if weight_init == 'xavier_uniform':     #works well with sigmoid
#                         init.xavier_uniform_(module.weight)
#                     elif weight_init == 'kaiming_normal':   #works well with ReLU activation 
#                         init.kaiming_normal_(module.weight)
#                     if module.bias is not None:
#                         init.constant_(module.bias, 0)
    
#     return iris_model()

# nUnits = 64
# nLayers = 5
# weight_init = {
#     'input': 'default',
#     'hidden0': 'kaiming_normal',
#     'hidden1': 'kaiming_normal',
#     'hidden2': 'kaiming_normal',
#     # 'hidden3': 'kaiming_normal',
#     'output': 'default'
# }

# model = create_model(nUnits, nLayers, weight_init)    #initializing the model


In [169]:
# CNN Model

class CNNModel(nn.Module):
    """CNN classifier: four conv -> ReLU -> max-pool stages followed by a three-layer dense head.

    Args:
        in_channels (int): Number of channels of the input images.
        num_classes (int): Number of output scores.
        input_shape (tuple): (channels, height, width) of one input image, e.g. (3, 224, 224);
            used to size the first linear layer.
    """

    def __init__(self, in_channels, num_classes, input_shape:tuple):
        super().__init__()

        # Every stage: 3x3 convolution (stride 1, padding 1) -> ReLU -> 2x2 max-pool (stride 2)
        channel_plan = [in_channels, 64, 128, 256, 512]
        stage_layers = []
        for c_in, c_out in zip(channel_plan[:-1], channel_plan[1:]):
            stage_layers.append(nn.Conv2d(in_channels=c_in, out_channels=c_out, kernel_size=3, stride=1, padding=1))
            stage_layers.append(nn.ReLU())
            stage_layers.append(nn.MaxPool2d(kernel_size=2, stride=2))
        self.features = nn.Sequential(*stage_layers)

        # The width of the first dense layer follows from the flattened output of the last stage
        flat_features = self._get_last_conv_output_shape(*input_shape)
        self.classifier = nn.Sequential(
            nn.Linear(flat_features, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(4096, 4096),
            nn.ReLU(),
            nn.Dropout(0.5),

            nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        """Return the raw class scores for a batch x."""
        feature_maps = self.features(x)
        # One row per sample; the number of columns is inferred automatically
        flat = feature_maps.view(feature_maps.size(0), -1)
        # No log_softmax here: not needed when training with CrossEntropyLoss
        return self.classifier(flat)

    def _get_last_conv_output_shape(self, in_channels, height, width):
        """Return channels * height * width of the feature maps for one input of the given size."""
        probe = torch.zeros(1, in_channels, height, width)
        with torch.no_grad():
            probe_out = self.features(probe)
        _, out_channels, out_height, out_width = probe_out.shape
        return out_channels * out_height * out_width


# x.view(x.size(0), -1) reshapes the tensor x into a 2D tensor with x.size(0) rows and automatically infers the number of columns.

# Settings of the CNN instance
in_channels = 3               # number of input channels (RGB)
num_classes = 10              # adjust according to your classification task
input_shape = (3, 155, 155)   # (channels, height, width) of one input image

# Build the model and move it to the selected device (GPU if available) in one step
model = CNNModel(in_channels, num_classes, input_shape).to(device)

# summary(model, (3, 224, 224)) #torchsummary


In [174]:
#model summary

# Summarise with the same (C, H, W) the model was built for: its first linear layer is sized from
# input_shape, so a different spatial size (e.g. 224 x 224) fails with a shape mismatch.
print(torchinfo.summary(model, input_shape, batch_dim = 0,
                col_names = ('input_size', 'output_size', 'num_params', 'kernel_size',
                            'mult_adds', 'trainable'), verbose = 0))


In [None]:
# Print the model's text representation (its module tree as reported by print)
print(model)

>> Use a Pretrained Model

In [None]:
# Here are the situations where transfer learning may not be advisable or viable, presented in bullet points:

# Insufficient similarity between the source and target tasks
    # For example, if you are trying to translate text from English to French, and the pre-trained model was trained on 
    # a natural language processing task such as sentiment analysis, then transfer learning may not be helpful
# Limited availability of labeled data for the source task, or the source model is not deep enough to be transferred.
# Significant differences in data distribution between the source and target domains:
    # For example, if the source data is collected from a different time period or from a different population, 
    # it may not capture the characteristics of the target data accurately
# Target task requires learning task-specific features not relevant to the source task
    # For instance, if the source task is image classification, but the target task involves object detection, 
    # the features learned by the source model may not be directly applicable, and training from scratch or using 
    # domain-specific pretraining may be more appropriate.
# You have a large dataset of labeled data. It may be better to train your model from scratch
# Privacy or security concerns regarding the source data


In [132]:
# Show every name exposed by torchvision.models (model classes, builder functions, helpers)
from torchvision import models

dir(models)  # last expression of the cell, so the resulting list is displayed as the cell output

In [None]:
# cuDNN (CUDA Deep Neural Network library) auto-tuning:
# when torch.backends.cudnn.benchmark is set to True, cuDNN automatically finds the best algorithm
# configuration for the specific input size and hardware during the first forward or backward pass.
torch.backends.cudnn.benchmark = True

import torchvision.models as models 
import torch.hub as hub 

# NOTE: the options below are alternatives; each assignment overwrites `model`,
# so when the whole cell is run only the last assignment survives.

#1: from torchvision models
model = models.resnet18(pretrained=True) # Load the pre-trained ResNet-18 through torchvision.models


#2: from torchhub models
model = hub.load('pytorch/vision', 'resnet18', pretrained=True)


#3. from timm (it comes packaged with >700 pretrained models, and is designed to be flexible and easy to use)
# (https://huggingface.co/docs/timm/feature_extraction) #learn to use timm
# https://huggingface.co/docs/timm/models   #Timm SOTA models
model = timm.create_model('resnet34', pretrained=True) 
model = timm.create_model('resnet34', pretrained=True, num_classes=10)  # same architecture, requesting 10 output classes
avail_pretrained_models = timm.list_models(pretrained=True) # List models with pretrained weights
all_densenet_models = timm.list_models('*densenet*')    # Search for model architectures using a wildcard pattern


#4: Loading a custom model
class CustomModel(nn.Module):
    """Wrap a saved, pre-trained network and train only a new final layer.

    All parameters of the loaded network are frozen; its ``fc`` layer is then replaced
    by a fresh linear layer with ``num_classes`` outputs, which stays trainable.

    Args:
        num_classes (int): Number of outputs of the new final layer.
        model_path (str, optional): File the pre-trained model is loaded from.
            Defaults to 'paul_model.pt'.
    """
    def __init__(self, num_classes, model_path='paul_model.pt'):
        super(CustomModel, self).__init__()
        # Load the pre-trained model.
        # SECURITY: torch.load unpickles the file, which can execute arbitrary code -
        # only load checkpoints from a source you trust.
        # map_location='cpu' lets a checkpoint saved on a GPU be loaded on a CPU-only machine;
        # move the wrapper to the target device afterwards with .to(device).
        self.model = torch.load(model_path, map_location='cpu')

        # Freeze the parameters of the pre-trained model
        for param in self.model.parameters():
            param.requires_grad = False

        # Replace the last layer for the custom dataset (a new layer is trainable by default)
        num_features = self.model.fc.in_features
        self.model.fc = nn.Linear(num_features, num_classes)

    def forward(self, x):
        """Delegate to the wrapped network."""
        return self.model(x)
# Build the wrapper around the saved network and move it to the selected device (GPU if available)
model = CustomModel(num_classes).to(device)

# Layer-by-layer summary for a (3, 224, 224) input; the call is the last expression so it is displayed
torchinfo.summary(
    model,
    (3, 224, 224),
    batch_dim=0,
    col_names=('input_size', 'output_size', 'num_params', 'kernel_size', 'mult_adds', 'trainable'),
    verbose=0,
)

In [None]:
# first visualize all the modules and submodules of the model to know which layer(s) will or will not be frozen

# Print the dotted name of every module and sub-module of the model.
# (Iterate over model.named_children() instead to list only the direct children.)
for module_name, _module in model.named_modules():
    print(module_name)
    

In [None]:
#  performing transfer learning

# Recipe A: freeze every parameter, then attach a new trainable output layer.
# NOTE(review): `model.fc` below assumes the model exposes its output layer as `.fc`;
# the CustomModel wrapper stores the network under `.model`, so confirm which model is in `model` here.
for name, param in model.named_parameters():  # to freeze only a few parameters, iterate over e.g. list(model.parameters())[:5]
    # if "classifier" not in name:  # Unfreeze parameters of MobileNetV2
    # if not name.startswith('fc'): # Exclude the fully connected layer (output layer)
    param.requires_grad = False
    # NB: the more similar your data is to the data of the source model, the more layers you can freeze
num_ftrs = model.fc.in_features
num_classes = 10
model.fc = nn.Linear(num_ftrs, num_classes)  # replaces the output layer with a new linear layer
# Train only the classifier
# ...

# or

# Recipe B: freeze only parameters whose name contains "layer" and whose second dotted
# name component, read as an integer, is below 6.
# NOTE(review): this assumes every such name looks like 'layer<k>.<int>....'; int() raises
# ValueError if the second component is not numeric - confirm against the printed module names.
# NOTE(review): if Recipe A ran before this in the same kernel, every parameter is already
# frozen and this loop changes nothing - the two recipes look like alternatives, run only one.
for name, param in model.named_parameters():
    if "layer" in name and int(name.split(".")[1]) < 6:
    # if 'layer4' not in name and 'fc' not in name : 
        param.requires_grad = False
num_ftrs = model.fc.in_features
num_classes = 10
model.fc = nn.Linear(num_ftrs, num_classes)
# Train the classifier and a few other layers

# Count the elements of all parameters that still require gradients
num_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {num_trainable_params}")

In [None]:
# Various methods of transfer learning.
# The recipes below are independent alternatives - in practice pick one of them.
# (All statements start at column 0: indented top-level statements raise IndentationError.)

# 1. Fine-tuning the entire pre-trained model
#    Load a pre-trained model and replace the final fully connected layer (classifier) with a
#    new one suitable for your specific task. Then train the entire model, including the
#    pre-trained layers and the newly added classifier, with your custom dataset.
model_ft = models.resnet50(pretrained=True)
num_ftrs = model_ft.fc.in_features
num_classes = 10
model_ft.fc = nn.Linear(num_ftrs, num_classes)
# Train the entire model
# ...

# 2. Fine-tuning only the classifier
#    Load a pre-trained model, freeze the parameters of the pre-trained layers and train only
#    the newly added classifier. This is useful when you have limited training data or when
#    the pre-trained model is very different from your target task.
model_conv = models.resnet50(pretrained=True)
# Iterate over parameters(): named_parameters() yields (name, parameter) tuples,
# and a tuple has no requires_grad attribute to set.
for param in model_conv.parameters():
    # NB: the more similar your data is to the data of the source model, the more layers you can freeze
    param.requires_grad = False
num_ftrs = model_conv.fc.in_features
num_classes = 10
model_conv.fc = nn.Linear(num_ftrs, num_classes)  # the new layer is trainable by default
# Train only the classifier
# ...

# or: freeze only part of the backbone.
# For parameter names of the form 'layer<k>.<block>....' this freezes every block whose index is below 6.
resnet = models.resnet50(pretrained=True)
for name, param in resnet.named_parameters():
    name_parts = name.split(".")
    if "layer" in name and len(name_parts) > 1 and name_parts[1].isdigit() and int(name_parts[1]) < 6:
        param.requires_grad = False

# Print the number of trainable parameters
num_trainable_params = sum(p.numel() for p in resnet.parameters() if p.requires_grad)
print(f"Number of trainable parameters: {num_trainable_params}")

# 3. Feature extraction
#    Start with a pre-trained model and use its outputs as the input of a new model,
#    then train the new model on your own dataset.
# Load the pre-trained ResNet model
model = models.resnet18(pretrained=True)

# Freeze the weights of the model
for param in model.parameters():
    param.requires_grad = False
model.eval()  # inference behaviour (e.g. batch-norm statistics) while extracting features

# Extract the features for one batch of training images
images, _ = next(iter(train_dataloader))
with torch.no_grad():
    # These are the outputs of the model's final layer; set model.fc = nn.Identity() beforehand
    # to use the activations in front of the final layer instead.
    features = model(images)
num_classes = 10
# Create a new model to classify the features
new_model = torch.nn.Linear(features.size(1), num_classes)
new_model.train()



>> Visualize weight initialization

In [None]:
#visualize weight initialization 
# Plot one histogram per module that owns a weight tensor.
# Walking named_modules() works for any model; indexing model.layers[...] only works for a
# model that stores its layers in an attribute called `layers`.
for layer_name, module in model.named_modules():
    weight_i = getattr(module, "weight", None)
    if not isinstance(weight_i, torch.Tensor) or 'dropout' in layer_name:
        continue
    # Flatten first: plt.hist treats every column of a 2-D input as a separate data set
    weight_i = weight_i.detach().cpu().flatten().numpy()
    # print(weight_i)
    plt.hist(weight_i, bins=30, edgecolor='black')
    plt.title(f'Weight Initialization - {layer_name}')
    plt.xlabel('Weight Values')
    plt.ylabel('Frequency')
    plt.show()

### Training the Model

In [None]:
# Some common hyperparameters/metaparameters to consider when training the model include:

# Learning rate:
    # Determines the step size during gradient descent optimization and affects the convergence speed 
    # and accuracy of the model.
    
# Dropout rate:
    # Controls the regularization technique of randomly dropping out a fraction of neurons during training, which
    # helps prevent overfitting. 
    
# Batch size:
    # Specifies the number of training samples propagated through the network before updating the model's weights.
    
# Number of epochs:
    # Specifies the number of times the entire training dataset is passed through the model during training.
    
# Regularization techniques:
    # Include methods like L1 and L2 regularization, which help prevent overfitting by adding penalties to the 
    # loss function. 
    
# Optimizer:
    # Specifies the optimization algorithm used to update the model's weights during training, such as 
    # Stochastic Gradient Descent (SGD), Adam, or RMSprop.
    
# Loss function:
    # Defines the objective function used to measure the discrepancy between the predicted output and the 
    # true output during training.

In [None]:
num_classes = 9   # NOTE(review): other cells build the model with 10 output classes - confirm which value is intended
learning_rate = 0.003

# Loss function: cross-entropy, placed on the selected device
criterion = nn.CrossEntropyLoss().to(device)

# Optimizer: Adam over the model's parameters (weight_decay=0.0008018107002058151 is an optional extra)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Learning-rate scheduler: multiply the learning rate by gamma=0.1 every step_size=5 scheduler steps
scheduler = StepLR(optimizer, step_size=5, gamma=0.1)


In [None]:
# Sanity check: overfit a single batch (adjust the epochs and batch size as needed).
# A working model/optimizer pair should push the loss on one fixed batch towards zero.

num_epochs = 20
batch_size = 15
batchLoss = []  # loss of every optimisation step, in order

train_dataloader, test_dataloader = load_dataloader (batch_size = batch_size, shuffle = True, drop_last = True)

# Draw one batch and move it to the device once; the same batch is reused for every step.
inputs, labels = next(iter(train_dataloader))
inputs, labels = inputs.to(device), labels.to(device)

model.train()
for epoch in range(num_epochs):
    # Zero the parameter gradients
    optimizer.zero_grad()
    # Forward pass
    outputs = model(inputs)
    loss = criterion(outputs, labels)
    # Backward pass and optimization
    loss.backward()
    optimizer.step()

    batchLoss.append(loss.item())

    # Accuracy on the (single) training batch, in percent
    predictions = torch.argmax(outputs, dim=1)
    accuracy = torchmetrics.functional.classification.accuracy(predictions, labels, task='multiclass',
                                                                num_classes=num_classes) * 100

    if epoch % 2 == 0:
        # Show the loss of the current step: a running mean over all steps would lag
        # behind and hide whether the model is actually memorising the batch.
        msg = f"Epoch {epoch}/{num_epochs}: Loss = {loss.item():.4f}, train accuracy = {accuracy:.2f}%"
        sys.stdout.write('\r' + msg)

In [None]:
# Model training: optimise on the training loader, evaluate on the test loader after
# every epoch, remember the best-scoring weights, then plot loss and accuracy curves.

## metric = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)    (.Precision(), .Recall(), .F1Score(), .ConfusionMatrix())
                #see doc. https://torchmetrics.readthedocs.io/en/stable/classification/accuracy.html#functional-interface 

num_epochs = 50
learning_rate = 0.0032734813343726263
losses = torch.zeros(num_epochs)   # mean training loss per epoch
ongoing_accuracy = []              # mean training accuracy (%) per epoch
ongoing_accuracy_test = []         # mean test accuracy (%) per epoch
num_classes = 10

# Define the loss function and optimizer
# NOTE(review): NLLLoss expects log-probabilities as model output — confirm the model's
# final layer; the earlier setup cell uses CrossEntropyLoss and num_classes = 9 instead.
criterion = nn.NLLLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.0008018107002058151)

# Define the learning rate scheduler
scheduler = StepLR(optimizer, step_size=10, gamma=0.1)

# Move the model to the appropriate device (e.g., GPU if available)
model.to(device)

# Variables to track the best model and accuracy
best_accuracy = 0.0
best_model_state = None

# Training loop
for epoch in range(num_epochs):
    model.train()
    batchAcc = []
    batchLoss = []

    # Iterate over the training dataloader
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        batchLoss.append(loss.item())

        # Compute accuracy on the training batch (percent)
        predictions = torch.argmax(outputs, dim=1)
        accuracy = torchmetrics.functional.classification.accuracy(predictions, labels, task='multiclass',
                                                                    num_classes=num_classes) * 100
        # Alternatives:
        #   torchmetrics.functional.classification.binary_accuracy(predicted, labels, threshold=0.5)  # binary classification
        #   torchmetrics.functional.r2_score(preds, target)                                           # regression
        batchAcc.append(accuracy.item())

    # Update the learning rate
    scheduler.step()

    # batchAcc is a plain list of floats, so it is averaged directly.
    ongoing_accuracy.append(np.mean(batchAcc))
    losses[epoch] = np.mean(batchLoss)

    # Evaluate on the test loader without tracking gradients
    model.eval()
    batchAcc_test = []
    with torch.no_grad():
        for inputs, labels in test_dataloader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # Calculate predictions
            predicted = torch.argmax(outputs, dim=1)

            accuracy = torchmetrics.functional.classification.accuracy(predicted, labels,
                                                                       task='multiclass',
                                                                       num_classes=num_classes) * 100
            batchAcc_test.append(accuracy.item())

    test_accuracy = np.mean(batchAcc_test)
    ongoing_accuracy_test.append(test_accuracy)

    if test_accuracy > best_accuracy:
        best_accuracy = test_accuracy
        # state_dict() hands out references to the live parameters, and dict.copy() is
        # shallow; clone every tensor so later epochs cannot overwrite the snapshot.
        best_model_state = {key: value.detach().clone() for key, value in model.state_dict().items()}

    # Print loss and accuracy every 10th epoch (overwrites the previous status line)
    if epoch % 10 == 0:
        msg = f"Epoch {epoch}/{num_epochs}: Loss = {np.mean(batchLoss):.4f}, train accuracy = {np.mean(batchAcc):.2f}%, test accuracy = {test_accuracy:.2f}%"
        sys.stdout.write('\r' + msg)

print(' ')
print('Finished Training')
print(' ')

# Load the best model state (absent only if no epoch scored above 0% test accuracy)
if best_model_state is not None:
    model.load_state_dict(best_model_state)

# Report accuracy
print('Final accuracy (eval): {:.2f}%'.format(ongoing_accuracy_test[-1]))
print('Best accuracy (eval): {:.2f}%'.format(best_accuracy)) 

fig, ax = plt.subplots(1, 2, figsize=(13, 4))

ax[0].plot(losses.detach())
ax[0].set_ylabel('Loss')
ax[0].set_xlabel('Epoch')
ax[0].set_title('Losses')

ax[1].plot(ongoing_accuracy, label='Training Accuracy')
ax[1].plot(ongoing_accuracy_test, label='Evaluation Accuracy')
ax[1].set_ylabel('Accuracy')
ax[1].set_xlabel('Epoch')
ax[1].set_title('Accuracy')
ax[1].legend()

plt.show()

# run training again to see whether this performance is consistent 

### Model Evaluation

>> Compare predicted values with the actual test values

In [None]:
def plot_predictions(y_pred, y_test): 
    """
    Plots the predicted and actual values on separate scatter plots.

    Parameters
    ----------
    y_pred : array-like or torch.Tensor
        Predicted values; drawn against their position index in the right panel.
    y_test : array-like or torch.Tensor
        Actual values; drawn against their position index in the left panel.

    Returns
    -------
    None

    Notes
    -----
    Tensors are detached and copied to host memory before plotting, because
    matplotlib cannot read tensors that live on a GPU. Other inputs are passed
    to matplotlib unchanged.
    """
    y_pred, y_test = (
        values.detach().cpu().numpy() if torch.is_tensor(values) else values
        for values in (y_pred, y_test)
    )

    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
    
    # Plot the actual values
    ax1.scatter(range(len(y_test)), y_test, label='Actual Values')
    ax1.set_xlabel('Index')
    ax1.set_ylabel('Actual Values')
    ax1.set_title('Scatter plot of Actual Values')
    ax1.legend()
    
    # Plot the predicted values
    ax2.scatter(range(len(y_pred)), y_pred, label='Predicted Values')
    ax2.set_xlabel('Index')
    ax2.set_ylabel('Predicted Values')
    ax2.set_title('Scatter plot of Predicted Values')
    ax2.legend()
    
    # Show the plots
    plt.show()

plot_predictions(predicted, labels)

>> Confusion Matrix, Precision, Recall, Accuracy, F1-Score

In [None]:
from sklearn.metrics import (roc_auc_score,roc_curve,precision_recall_curve, auc,
                            classification_report, confusion_matrix, average_precision_score,
                            accuracy_score,silhouette_score,mean_squared_error)
from inspect import signature


# Confusion matrix for the current `predicted` / `labels` tensors.
# Copy both to host NumPy arrays first: scikit-learn and matplotlib cannot read GPU tensors.
y_true_np = labels.detach().cpu().numpy()
y_pred_np = predicted.detach().cpu().numpy()
accuracy = accuracy_score(y_true_np, y_pred_np)
class_names = train_dataset.classes 

def plot_confusion_matrix(y_true, y_pred, classes,
                        normalize=False,
                        title='Confusion matrix',
                        cmap=plt.cm.Blues,
                        labels=None):
    """
    Plot a confusion matrix as an annotated heat map.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    classes : sequence
        Tick labels for both axes, one per matrix row/column.
    normalize : bool, default False
        If True, divide every row by its total so each cell shows the fraction
        of that true class; rows with no samples are shown as zeros.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap used for the heat map.
    labels : array-like, optional
        Label values that index the matrix, forwarded to scikit-learn. Pass this
        when `y_true`/`y_pred` may not contain every class; otherwise the matrix
        only covers the classes that occur and the ticks no longer line up with
        `classes`. The default (None) keeps scikit-learn's own behaviour.

    Returns
    -------
    None
    """
    # Imported locally under an alias: a later cell rebinds the global name
    # `confusion_matrix` to a tensor, which would break re-running this function.
    from sklearn.metrics import confusion_matrix as sk_confusion_matrix

    cm = sk_confusion_matrix(y_true, y_pred, labels=labels)
    
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # Only divide rows that have samples, so empty rows stay 0 instead of NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
    
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    
    # Write each cell's value in a colour that contrasts with its background.
    for i, j in np.ndindex(cm.shape):
        plt.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")
    
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()

# assumes labels are integer-encoded 0..len(class_names)-1 in `class_names` order — TODO confirm
plot_confusion_matrix(y_true_np, y_pred_np, classes=class_names,
                    labels=np.arange(len(class_names)),
                    title='Confusion matrix, Accuracy = {:.2f}'.format(accuracy))

In [None]:
# Confusion matrix as a (num_classes x num_classes) float tensor:
# rows are true classes, columns are predicted classes.
# NOTE: this rebinds the global name `confusion_matrix` (also the name of the
# scikit-learn function imported earlier); the TP/TN/FP/FN cell further down
# reads this tensor under that name.
confusion_matrix = torch.zeros(num_classes, num_classes)

# Count every (true, predicted) pair in one vectorised call. The indices are moved
# to the CPU to match the matrix; accumulate=True adds up repeated pairs, and an
# index outside [0, num_classes) raises instead of being miscounted.
true_idx = labels.detach().reshape(-1).long().cpu()
pred_idx = predicted.detach().reshape(-1).long().cpu()
confusion_matrix.index_put_((true_idx, pred_idx), torch.ones(true_idx.numel()), accumulate=True)

# Calculate evaluation metrics (per class).
# A class with no true samples (recall) or no predictions (precision) yields NaN,
# i.e. the metric is undefined for it.
recall = confusion_matrix.diag() / confusion_matrix.sum(1)
precision = confusion_matrix.diag() / confusion_matrix.sum(0)
f1_score = 2 * (precision * recall) / (precision + recall)

# Print the metrics
print(f"Class Names: {class_names}")
print(f"Recall: {recall}")
print(f"Precision: {precision}")
print(f"F1 Score: {f1_score}")
print('Test Accuracy: {:.2f}%'.format(ongoing_accuracy_test[-1])) 

# Precision: Precision tells us how many of the things the robot says are dogs (or cats) are actually dogs (or cats). 
    # For example, if the robot says, "That's a dog!" to 10 pictures, but only 7 of those pictures are actually dogs, 
    # then its precision is 7 out of 10. A high precision indicates that the model has a low rate of false positives.
    # Precision = True Positives (TP) / (True Positives (TP) + False Positives (FP))
    
# Recall: Recall tells us how many of the actual dogs (or cats) the robot correctly identifies. For example, 
    # if there are 20 dogs in the pictures, but the robot only detects 15 of them as dogs, then its recall is 15 out of 20. 
    # It tells us how well the model is able to detect positive 
    # samples of a specific class. A high recall indicates that the model has a low rate of false negatives.
    # Recall = True Positives (TP) / (True Positives (TP) + False Negatives (FN))
    
# F1-score: F1-score is the harmonic mean of precision and recall and provides a single score that balances both metrics. 
    # It's useful when you want to strike a balance between precision and recall. F1-score tends to be lower when either 
    # precision or recall is low.
    # F1 Score = 2 * (Precision * Recall) / (Precision + Recall) 

# A high recall indicates that the model is good at capturing positive instances, minimizing false negatives.
# A high precision indicates that when the model predicts positive, it is likely to be correct, minimizing false positives.
# The F1 score is a balanced metric that takes both precision and recall into account. It is useful when there is an uneven 
#     class distribution.



>> Analyze Error Distribution

In [None]:
# if the errors are normally distributed around zero, it may indicate that the model is making unbiased predictions. 
# If there is a pattern or trend in the errors, it may suggest that the model has systematic biases or is making 
# consistent errors in certain regions of the input space



def analyze_error_distribution(y_true, y_pred):
    """
    Function to analyze the error distribution by plotting histograms and scatter plots.

    Three figures are shown: a histogram of the errors, the errors against the
    true labels, and the errors against the predicted values.

    Parameters:
    -----------
    y_true : array-like or torch.Tensor
        Array of true labels or ground truth.
    y_pred : array-like or torch.Tensor
        Array of predicted values.

    Returns:
    --------
    None

    Notes:
    ------
    Tensors are detached and copied to host memory before plotting, because
    matplotlib cannot read tensors that live on a GPU. Other inputs are used
    unchanged.
    """
    y_true, y_pred = (
        values.detach().cpu().numpy() if torch.is_tensor(values) else values
        for values in (y_true, y_pred)
    )

    # Calculate errors
    errors = y_true - y_pred

    # Plot histogram of errors
    plt.figure(figsize=(8, 6))
    plt.hist(errors, bins=20, alpha=0.75)
    plt.xlabel('Error')
    plt.ylabel('Frequency')
    plt.title('Error Distribution (Histogram)')
    plt.grid(True)
    plt.show()

    # Scatter plots of the errors against the true labels, then the predicted values
    for x_values, x_label in ((y_true, 'True Labels'), (y_pred, 'Predicted Values')):
        plt.figure(figsize=(8, 6))
        plt.scatter(x_values, errors, alpha=0.75)
        plt.xlabel(x_label)
        plt.ylabel('Error')
        plt.title('Error Distribution (Scatter Plot)')
        plt.grid(True)
        plt.show()


analyze_error_distribution(labels, predicted)

>> Error Analysis - Diagnostics

In [None]:
#Error analysis is the process of analyzing the errors made by a machine learning model and identifying the patterns 
# or trends that may be causing the errors. The goal of error analysis is to gain insight into the behavior of the 
# model and identify areas for improvement.

# The steps involved in error analysis:
    # Collect error data (Collect and visualize Misclassified Samples)
    # Categorize errors
    # Identify patterns:    
        # Look for recurring patterns among the misclassified samples. Are certain classes consistently misclassified? 
        # Are there specific types of images that the model struggles with?
    # Analyze False Positives and False Negatives: 
        # For each misclassified sample, determine whether it is a false positive or false negative. A false positive 
        # occurs when the model predicts a positive class when the actual class is negative, and a false negative occurs 
        # when the model predicts a negative class when the actual class is positive.
    # Analyze causes:
        # Examine misclassified samples that are particularly challenging, and try to understand the reasons behind the 
        # errors. Are there any ambiguous or low-quality images? Are there classes that are inherently difficult to 
        # distinguish from others?
    # Prioritize fixes:
        # Based on the insights gained from error analysis, you can consider adjusting the training strategy. 
        # This might involve collecting more data for specific classes, using data augmentation techniques, or 
        # fine-tuning hyperparameters.
    
    
    
    
# Based on the insights gained from the error analysis, you can perform the following.
# False negatives:
    # False negatives occur when the model predicts that a customer will not churn when they actually do churn. 
    # To fix this issue, you may consider the following:
        #  Increase the weight of the features that are more indicative of churn for low-usage customers, 
        #     such as frequency of usage or specific product usage. (adjust the model parameters)
        #  Add new features that may be predictive of churn, such as customer sentiment or customer service interactions.
        #  Use a different model architecture that is better suited for handling imbalanced data, such as a decision tree 
        #     or ensemble model.
# False positives:
    # False positives occur when the model predicts that a customer will churn when they actually do not churn. 
    # To fix this issue, you may consider the following:
        # Decrease the weight of features that are causing false positives, such as age or income, if they are not as 
        #    indicative of churn for low-usage customers. (adjust the model parameters)
        # Remove features that are causing false positives altogether, if they are not providing significant value to the 
        #    model.
        # Increase the size of the training dataset to capture a more representative sample of customers who do not churn, 
        #    which may help the model learn more accurately which customers are likely to churn.


In [None]:
def calculate_tp_tn_fp_fn(confusion_matrix):
    """
    Derive per-class TP, TN, FP and FN counts from a square confusion matrix.

    Parameters
    ----------
    confusion_matrix : torch.Tensor
        Square matrix of counts with true classes on the rows and predicted
        classes on the columns.

    Returns
    -------
    tuple of torch.Tensor
        (tp, tn, fp, fn), each a float32 CPU tensor with one entry per class:
        tp - diagonal entry of the class;
        tn - sum of all entries outside the class's row and column;
        fp - column sum minus the diagonal entry;
        fn - row sum minus the diagonal entry.
    """
    # Work on a float32 CPU copy so the outputs have a fixed dtype and device.
    cm = confusion_matrix.detach().to(device='cpu', dtype=torch.float32)

    tp = cm.diagonal().clone()
    fp = cm.sum(dim=0) - tp
    fn = cm.sum(dim=1) - tp
    # Every sample is exactly one of TP/FP/FN/TN for a given class, so TN is the remainder.
    tn = cm.sum() - tp - fp - fn

    return tp, tn, fp, fn

# Per-class counts derived from the tensor confusion matrix built in an earlier cell.
(true_positives,
 true_negatives,
 false_positives,
 false_negatives) = calculate_tp_tn_fp_fn(confusion_matrix)

# Print each count vector next to its heading.
for heading, counts in (
    ("True Positives (TP):", true_positives),
    ("True Negatives (TN):", true_negatives),
    ("False Positives (FP):", false_positives),
    ("False Negatives (FN):", false_negatives),
):
    print(heading, counts)

### Model Optimization

>> Auto Tune using Optuna

In [None]:
import optuna
from sklearn.metrics import accuracy_score


# Define your objective function
def objective(trial):
    """
    Optuna objective: train one model with the trial's hyperparameters and return
    its accuracy over the whole test loader.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the hyperparameters.

    Returns
    -------
    float
        Test-set accuracy in percent (higher is better); 0.0 if the test loader
        yields no samples.
    """
    # Hyperparameters to be tuned
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    # Integer range with a step (suggest_categorical only accepts a list of choices).
    nUnits = trial.suggest_int('nUnits', 4, 128, step=8)
    nLayers = trial.suggest_int('nLayers', 1, 6, step=1)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    # Further candidates, currently not tuned:
    # weight_inits = trial.suggest_categorical('weight_init', ['default', 'kaiming_normal', 'xavier_uniform_'])
    # num_epochs = trial.suggest_int('num_epochs', 30, 300)
    # optimizer = trial.suggest_categorical('optimizer', ['adam', 'sgd'])
    # activation = trial.suggest_categorical('activation', ['relu', 'sigmoid', 'tanh'])
    # patience = trial.suggest_int('patience', 5, 20)

    # Define the model architecture with the hyperparameters
    # NOTE(review): `weight_inits` is not defined in this function; it must exist as a
    # global (defined elsewhere in the notebook) or this line raises NameError — confirm.
    model = mnist_model(nUnits, nLayers, weight_inits, dropout_rate)

    num_epochs = 50

    # Build the loaders with the sampled batch size so the tuned value actually takes effect.
    train_loader, test_loader = load_dataloader(batch_size=batch_size, shuffle=True, drop_last=True)

    # Define the loss function and optimizer
    criterion = nn.NLLLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Define the learning rate scheduler
    scheduler = StepLR(optimizer, step_size=30, gamma=0.1)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    # Move the model to the appropriate device (e.g., GPU if available)
    model.to(device)

    # Loop over the dataset for multiple epochs
    for epoch in range(num_epochs):
        model.train()

        # Iterate over the training dataloader
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # Forward pass
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        # Update the learning rate
        scheduler.step()

    # Evaluation after the final epoch
    model.eval()  # Set the model to evaluation mode

    # Count correct predictions over every test batch, not just the last one.
    total_correct = 0
    total_samples = 0
    with torch.inference_mode():        #or torch.no_grad()
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            # Forward pass and predicted class per sample
            predicted = torch.argmax(model(inputs), dim=1)

            total_correct += (predicted == labels).sum().item()
            total_samples += labels.numel()

    # Return a plain Python float for the study.
    return 100.0 * total_correct / total_samples if total_samples else 0.0

# Create a study that maximises the objective's return value,
# then evaluate 100 trials of it.
study = optuna.create_study(direction="maximize")
study.optimize(func=objective, n_trials=100)


In [None]:
# Report the best trial: its objective value, then each sampled hyperparameter.
best_trial = study.best_trial
print(" Value: ", best_trial.value)
print(" Params: ")
best_params = best_trial.params
for key in best_params:
    value = best_params[key]
    print(f"    {key}: {value}")

>> Model Performance

In [None]:
#How to prevent overfitting
# Get more data	
    # Having more data gives the model more opportunities to learn patterns, patterns which may be more generalizable 
    # to new examples.
# Simplify your model	
    # If the current model is already overfitting the training data, it may be too complicated of a model. 
    # This means it's learning the patterns of the data too well and isn't able to generalize well to unseen data. 
    # One way to simplify a model is to reduce the number of layers it uses or to reduce the number of hidden units in 
    # each layer.
# Use data augmentation	
    # Data augmentation manipulates the training data in a way that makes it harder for the model to learn, as it artificially 
    # adds more variety to the data. If a model is able to learn patterns in augmented data, the model may be able to 
    # generalize better to unseen data.
# Use transfer learning	
    # Transfer learning involves leveraging the patterns (also called pretrained weights) one model has learned to use as 
    # the foundation for your own task. In our case, we could use one computer vision model pretrained on a large variety 
    # of images and then tweak it slightly to be more specialized for food images.
# Use dropout layers	
    # Dropout layers randomly remove connections between hidden layers in neural networks, effectively simplifying a model 
    # but also making the remaining connections better. See torch.nn.Dropout() for more.
# Use learning rate decay	
    # The idea here is to slowly decrease the learning rate as a model trains. This is akin to reaching for a coin at the 
    # back of a couch. The closer you get, the smaller your steps. The same with the learning rate, the closer you get to 
    # convergence, the smaller you'll want your weight updates to be.
# Use early stopping	
    # Early stopping stops model training before it begins to overfit. As in, say the model's loss has stopped decreasing 
    # for the past 10 epochs (this number is arbitrary), you may want to stop the model training here and go with the model 
    # weights that had the lowest loss (10 epochs prior).





#How to prevent underfitting
# Add more layers/units to your model	
    # If your model is underfitting, it may not have enough capability to learn the required patterns/weights/representations 
    # of the data to be predictive. One way to add more predictive power to your model is to increase the number of hidden 
    # layers/units within those layers.
# Tweak the learning rate	
    # Perhaps your model's learning rate is too high to begin with. And it's trying to update its weights each epoch too 
    # much, in turn not learning anything. In this case, you might lower the learning rate and see what happens.
# Use transfer learning	
    # Transfer learning is capable of preventing overfitting and underfitting. It involves using the patterns from a 
    # previously working model and adjusting them to your own problem.
# Train for longer	
    # Sometimes a model just needs more time to learn representations of data. If you find in your smaller experiments 
    # your model isn't learning anything, perhaps letting it train for more epochs may result in better performance.
# Use less regularization	
    # Perhaps your model is underfitting because you're trying to prevent overfitting too much. Holding back on 
    # regularization techniques can help your model fit the data better.



    

### Test the Model

In [None]:
model.eval()  # Set the model to evaluation mode

# Collect predictions and labels from every batch so the accuracy covers the
# whole test set rather than only the last batch of the loop.
all_predicted = []
all_labels = []

# Disable gradient computation for evaluation
with torch.inference_mode():        #or torch.no_grad()
    for inputs, labels in test_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)

        # Calculate predictions
        predicted = torch.argmax(outputs, dim=1)

        all_predicted.append(predicted)
        all_labels.append(labels)

# Calculate accuracy over all test samples
accuracy = torchmetrics.functional.classification.accuracy(torch.cat(all_predicted), torch.cat(all_labels),
                                                           task='multiclass', num_classes=num_classes)

print(f"Accuracy on test set: {100 * accuracy:.2f}%")

### Save the Model

In [None]:
import torch

# Save the model checkpoint: weights, optimizer state and the settings used in this run.
checkpoint = {
    # Record the values this notebook actually used instead of hard-coded numbers.
    'epoch': num_epochs,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    # Store the last loss as a plain float so no autograd graph or device is attached to it.
    'loss': loss.item(),
    'learning_rate': optimizer.param_groups[0]['lr'],
    'hyperparameters': {
                # NOTE(review): hidden_units is hard-coded; confirm it matches the model that was trained.
                'hidden_units': 64,
                'batch_size': batch_size
                        },
    # 'other_info': 'Additional information about the checkpoint'
}

torch.save(checkpoint, 'model_checkpoint.pth')
