### Tensors

In [None]:
#Introduction
    # Tensors are the primary data structure in PyTorch and are similar to multi-dimensional arrays.
    # They are the fundamental building blocks for deep learning models and computations.
    # Tensors can be scalars (0-dimensional), vectors (1-dimensional), matrices (2-dimensional), or higher-dimensional arrays.

%whos #check all the variables in the workspace (%who; %who_ls)

#Creating Tensors:
# You can create tensors using various methods, such as:
torch.Tensor(2, 3)  # Creates an uninitialized float tensor of the given shape.
torch.tensor([1, 2, 3])  # Creates a tensor from existing data, i.e. torch.tensor(array)
torch.zeros(2, 3), torch.ones(2, 3)  # Creates tensors of zeros or ones (a shape is required).
torch.rand(3, 4), torch.randn(2, 2)  # Creates tensors with random values.
# You can specify the data type and device (CPU or GPU) while creating tensors.

#Creating random tensors
tensor = torch.randn(3, 4, 3)
    # The first dimension represents the number of "blocks" or "chunks." In this case, we have 3 blocks.
    # The second dimension represents the number of rows within each block. Here, we have 4 rows.
    # The third dimension represents the number of columns within each row. We have 3 columns.
    
tensor = torch.randn(3, 4, 3, 2)
    # The first dimension represents the number of blocks or chunks. We have 3 blocks.
    # The second dimension represents the number of rows within each block. Here, we have 4 rows.
    # The third dimension represents the number of columns within each row. We have 3 columns.
    # The fourth dimension represents the depth or the number of elements in each cell. Each cell has 2 elements.

tensor = torch.randn([3, 4, 3, 2, 5])
    # The first dimension represents the number of "blocks" or "chunks." In this case, we have 3 blocks.
    # The second dimension represents the number of rows within each block. Here, we have 4 rows.
    # The third dimension represents the number of columns within each row. We have 3 columns.
    # The fourth dimension represents the depth or the number of elements in each cell. Each cell has 2 elements.
    # The fifth dimension represents the number of values within each element. Each element has 5 values.

# Create a 2D tensor with random values from a uniform distribution between 0 and 1
random_tensor = torch.rand(3, 4)
# Create a 3D tensor with random values from a normal distribution
random_tensor = torch.randn(2, 3, 4)
# Create a random tensor with dtype=torch.float32; allocate it on the GPU when one is
# available and fall back to the CPU otherwise, so the cell also runs on CPU-only machines
random_tensor = torch.randn(3, 4, dtype=torch.float32,
                           device='cuda' if torch.cuda.is_available() else 'cpu')
# Create a random tensor with integer values between 0 and 9
random_tensor = torch.randint(10, size=(3, 4))

# Set the random seed
torch.manual_seed(42)

#convert numpy to tensor
arr = np.array([10,23,23])
arr = torch.tensor(arr)

def set_seeds(seed=1234):
    """Set seeds for reproducibility.

    Seeds NumPy's global RNG, Python's `random` module and PyTorch (CPU and CUDA)
    with the same value.

    Args:
        seed: Integer seed applied to every generator. Defaults to 1234.
    """
    # Imported locally: the notebook's import cell does not import `random`.
    import random

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    # The CUDA calls are silently ignored when no GPU is present.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) # multi-GPU

# convert tensor to numpy
tensor = torch.randint(10, size=(3, 3, 4))
numpy_array = tensor.numpy()

torch.zeros(3,4)    #create tensors with zeros
torch.zeros_like(tensor)    #create tensors with zeros in the shape of tensor 'tensor'. 

In [33]:
# Tensor Operations:

# PyTorch provides a wide range of operations to manipulate tensors efficiently.
# Element-wise operations: Addition, subtraction, multiplication, division, etc.
# Reduction operations: Sum (torch.sum(a)), mean (torch.mean(a)), min, max, etc.
# Matrix operations: Matrix multiplication (a @ b), dot product (torch.dot(a,b)), transpose (torch.transpose(a, 0, 1)), etc.
# Indexing and slicing: Accessing specific elements or subsets of a tensor.
# Broadcasting: Performing operations on tensors with different shapes.
# Concatenation and stacking: Combining tensors along specified dimensions (torch.stack((a, b), dim=0 or 1); 
                                                                                # d = torch.cat((a, b.T), dim=1)).

In [None]:
# GPU Acceleration

# PyTorch supports GPU acceleration, which enables faster computations on compatible hardware.
if torch.cuda.is_available():
    print("GPU is available!")
else:
    print("GPU is not available.")

# You can move tensors to the GPU using .to(device) or .cuda() methods.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # Checking GPU availability
x = torch.tensor([1, 2, 3]) # Creating a tensor
x = x.to(device)    # Moving the tensor to the selected device (GPU if available, else CPU)

# Performing computations on the GPU can significantly speed up training deep learning models.
# Creating tensors directly on the selected device
a = torch.tensor([1, 2, 3], device=device)
b = torch.tensor([4, 5, 6], device=device)

In [None]:
# Automatic Differentiation:

# PyTorch provides automatic differentiation, a powerful feature for training neural networks.
# You can track the operations on tensors and compute gradients using the torch.autograd module.
# Creating a tensor with requires_grad=True
x = torch.tensor([2.0], requires_grad=True)
# Gradients represent the derivative of a tensor with respect to another tensor.
# Autograd allows you to compute gradients efficiently for backpropagation during training.



#Torch provides a module, `autograd`, for automatically calculating the gradients of tensors. We can use it to 
# calculate the gradients of all our parameters with respect to the loss. Autograd works by keeping track of 
# operations performed on tensors, then going backwards through those operations, calculating gradients along the 
# way. To make sure PyTorch keeps track of operations on a tensor and calculates the gradients, you need to set 
# `requires_grad = True` on a tensor. You can do this at creation with the `requires_grad` keyword, or at any time 
# with `x.requires_grad_(True)`.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
y = torch.tensor([4.0, 5.0, 6.0], requires_grad=True)
z = x + y   # Perform operations
output = torch.sum(z)
# or enable gradient tracking on an existing tensor (a no-op here: x already tracks gradients)
x.requires_grad_(True)
# Once you have computed the output tensor, you can call the backward() method on the output tensor to compute gradients
output.backward()   # fills x.grad and y.grad with d(output)/dx and d(output)/dy


# You can turn off gradients for a block of code with the torch.no_grad()
with torch.no_grad():
    y = x * 2   # note: rebinds y to a new tensor that is not tracked by autograd
y.requires_grad  # -> False


In [None]:
# Advanced Tensor Manipulation:

# PyTorch offers advanced tensor manipulation techniques for complex operations.
# Reshaping tensors: Changing the shape or size of a tensor using view(), reshape(), or unsqueeze().
# Tensor concatenation and splitting: Combining or splitting tensors along specified dimensions.
    # torch.cat(), torch.stack(), and torch.split()
# Element-wise functions: Applying mathematical functions to each element of a tensor.
    # torch.sin(), torch.cos(), torch.exp(), torch.log()
# Advanced indexing: Using advanced indexing techniques to access or modify specific elements of a tensor.
# Broadcasting with advanced shapes: Handling tensors with different shapes during operations.

### Model Definition

In [None]:
# Define your neural network architecture by creating a custom model class that inherits from torch.nn.Module.

# In the __init__ method, define the layers of your model using PyTorch's nn module. 
# This includes defining linear layers, activation functions, pooling layers, etc.

# Implement the forward method to define the forward pass of your model. This method describes how the input flows 
# through the layers to produce an output.

# NB:
# Trainable Parameters in deep learning: weights, and biases
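# Non-trainable values in deep learning: frozen parameters (e.g. pretrained weights with requires_grad=False)
# and buffers such as BatchNorm running statistics. Hyperparameters are chosen by hand rather than learned,
# so they are settings of the training process, not model parameters.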

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive') 

!pip install -q -r '/content/drive/MyDrive/Colab Notebooks/requirements.txt' 

>> Imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.datasets import load_iris 
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import KFold, train_test_split 

#deep learning libraries 
import torch.nn.init as init 
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR 
from torch.utils.data import DataLoader, Dataset, TensorDataset 
import torchmetrics 
from torchsummary import summary 

>>Sequential

In [1]:
#using Sequential
#Sequential API allows you to create a model by stacking layers on top of each other in a sequential manner

import torch
import torch.nn as nn

#A simple feed-forward neural network with two hidden layers

# Define the model architecture 
model = nn.Sequential(
    nn.Linear(in_features=784, out_features=64),  # First hidden layer
    nn.ReLU(),                                    # Activation function
    nn.Linear(in_features=64, out_features=32),   # Second hidden layer
    nn.ReLU(),                                    # Activation function
    nn.Linear(in_features=32, out_features=10)    # Output layer
)


>> Custom function (Module)

In [None]:
class paul_model(nn.Module):
    """Fully connected network: Linear -> ReLU -> Linear -> Softmax(dim=1) -> Linear.

    Args:
        input_dim: Number of input features.
        hidden_dim: Width of both hidden layers.
        output_dim: Number of output units.

    NOTE(review): softmax is used as the second hidden activation, not on the
    output; confirm this is intentional.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # Keep the layer sizes available on the instance.
        self.input_dim, self.hidden_dim, self.output_dim = input_dim, hidden_dim, output_dim

        # Layers are registered in the order the forward pass applies them.
        self.fct1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fct2 = nn.Linear(hidden_dim, hidden_dim)
        self.soft1 = nn.Softmax(dim=1)
        self.fct3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Run `x` through each layer in turn and return the final linear output."""
        for stage in (self.fct1, self.relu1, self.fct2, self.soft1, self.fct3):
            x = stage(x)
        return x

# Create an instance of the custom model
input_dim = ...  # specify the input dimension
hidden_dim = ...  # specify the hidden dimension
output_dim = ...  # specify the output dimension
model = paul_model(input_dim, hidden_dim, output_dim)

# get output: calling the model runs forward() on X_train
# (X_train is assumed to be a tensor defined in another cell)
y_pred = model(X_train)

model.state_dict()  # ordered dict of the model's learnable parameters (weights, biases) and buffers

In [None]:
#a custom function to define the model

import torch
import torch.nn as nn

# Define a custom model class
# Define your neural network architecture by creating a custom model class that inherits from torch.nn.Module.
class MyModel(nn.Module):
    """Feed-forward network with two ReLU hidden layers and a linear output layer.

    Custom architectures inherit from torch.nn.Module: layers are declared in
    __init__ and wired together in forward().

    Args:
        input_dim: Number of input features.
        hidden_dim: Number of neurons in each hidden layer.
        output_dim: Number of neurons in the output layer.
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        # first hidden layer + activation
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        # second hidden layer + activation
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        # output layer (returns raw scores, no activation)
        self.fc3 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Describe how the input flows through the layers to produce the output."""
        hidden = self.relu1(self.fc1(x))
        hidden = self.relu2(self.fc2(hidden))
        return self.fc3(hidden)

# Create an instance of the custom model
input_dim = ...  # specify the input dimension
hidden_dim = ...  # specify the hidden dimension
output_dim = ...  # specify the output dimension
model = MyModel(input_dim, hidden_dim, output_dim)

# get output
y_pred = model(X_train)


#Input features represents the number of features or variables in your input data
#hidden features/dimensions represents the number of neurons in the hidden layers of your neural network
#output dimension represents the number of neurons in the output layer of your neural network. 
    # If you have a multi-class classification problem with 10 classes, the output dimension would be 10
    # In a regression dataset, the output dimension would typically be 1. This is because regression tasks 
        # involve predicting a continuous numerical value as the output

In [None]:
# Besides Linear and ReLU, there are other subclasses of nn.Module that can be used to define different layers
# and operations in your neural network

# Convolutional Layers:
# nn.Conv1d: 1D convolutional layer for processing sequential data.
# nn.Conv2d: 2D convolutional layer for processing images or spatial data.
# nn.Conv3d: 3D convolutional layer for processing volumetric data.
    
# Pooling Layers:
# nn.MaxPool1d: 1D max pooling layer
# nn.MaxPool2d: 2D max pooling layer
# nn.MaxPool3d: 3D max pooling layer
# nn.AvgPool1d: 1D average pooling layer
# nn.AvgPool2d: 2D average pooling layer
# nn.AvgPool3d: 3D average pooling layer.
    
# Recurrent Layers:
# nn.RNN: Basic RNN layer.
# nn.LSTM: LSTM layer.
# nn.GRU: GRU layer.

# Normalization Layers:
# nn.BatchNorm1d: Batch normalization layer for 1D inputs.
# nn.BatchNorm2d: Batch normalization layer for 2D inputs.
# nn.BatchNorm3d: Batch normalization layer for 3D inputs.

# Dropout and Regularization:
# nn.Dropout: Dropout layer for regularization.
# nn.Dropout2d: 2D dropout layer.
# nn.Dropout3d: 3D dropout layer.

# Activation Functions:
# nn.Sigmoid: Sigmoid activation function.
# nn.Tanh: Hyperbolic tangent activation function.
# nn.Softmax: Softmax activation function.
# nn.LeakyReLU: Leaky ReLU activation function.



>> Example

In [2]:
#an example 

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Iris dataset
iris = load_iris()
X = iris.data
y = iris.target

# Preprocess the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the data to PyTorch tensors
X_train = torch.Tensor(X_train)
y_train = torch.LongTensor(y_train)

# Define the custom model
class CustomModel(nn.Module):
    """Single-hidden-layer classifier: Linear -> ReLU -> Linear.

    Args:
        input_dim: Number of input features.
        hidden_dim: Number of neurons in the hidden layer.
        output_dim: Number of output units (one per class).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the raw output scores (no softmax is applied) for input `x`."""
        return self.fc2(self.relu(self.fc1(x)))

# Set the dimensions for the input, hidden, and output layers
input_dim = X_train.shape[1]
hidden_dim = 16
output_dim = 3

# Create an instance of the custom model
model = CustomModel(input_dim, hidden_dim, output_dim)
model.train()
outputs = model(X_train) #


>> Activation Functions

In [None]:
# NOTE: `x` is assumed to be a float tensor defined in an earlier cell; the softmax examples
# (dim=1) presumably expect shape (batch, classes) — confirm against the data in use.

# Sigmoid Function (nn.Sigmoid() or F.sigmoid())
# Range: (0, 1)
# Smooth, continuously differentiable function
# Used for 'binary classification' problems or when the output needs to be in the range of probabilities.
sigmoid_act = nn.Sigmoid()  #output = sigmoid_act(x)
output = F.sigmoid(x)

# ReLU (Rectified Linear Unit) Function (nn.ReLU() or F.relu())
# Range: [0, +∞)
# Simple and computationally efficient activation function
# Commonly used as a default choice for most deep learning models. often used for most hidden layers
relu = nn.ReLU()    #output = relu(x)
output = F.relu(x)

# Leaky ReLU Function (nn.LeakyReLU() or F.leaky_relu())
# Range: (-∞, +∞)
# Similar to ReLU but allows small negative values for negative inputs
# Helps prevent "dying ReLU" problem by allowing a small gradient for negative inputs.
leaky_relu = nn.LeakyReLU(0.2)  # Set the negative slope.    output = leaky_relu(x)
output = F.leaky_relu(x, negative_slope=0.2)  # Set the negative slope

# Tanh Function (nn.Tanh() or F.tanh())     - Hyperbolic tangent (tanh)
# Range: (-1, 1)
# S-shaped activation function that maps values between -1 and 1
# Used in some models as an alternative to sigmoid function.
tanh = nn.Tanh()    #output = tanh(x)
output = F.tanh(x)

# Softmax Function (nn.Softmax(dim=) or F.softmax(dim=))
# Converts a vector of arbitrary real values into a probability distribution
# Typically used in the output layer for multi-class classification problems.
softmax = nn.Softmax(dim=1)  # Set the appropriate dimension
output = F.softmax(x, dim=1)  # Set the appropriate dimension

# Plain softmax works fine on problems with a 'small number' of categories,
# or when categories are easily differentiable. When there are many categories, use log_softmax(),
# which is more numerically stable (pair it with nn.NLLLoss).
    

>> Hyperparameters/Metaparameters

In [None]:
# Some common hyperparameters/metaparameters in deep learning include:

#Model architecture:
    # x 

# Learning rate: 
    # Determines the step size during gradient descent optimization and affects the convergence speed 
    # and accuracy of the model.

# Number of hidden layers:  
    # Determines the depth of the neural network architecture and influences the model's capacity to learn complex 
    # patterns.

# Number of neurons per layer: 
    # Defines the width of the neural network architecture and affects the model's representational capacity and 
    # computational efficiency.

# Activation functions: 
    # Determines the non-linear transformation applied to the output of each neuron, introducing non-linearity into 
    # the model.

# Dropout rate: 
    # Controls the regularization technique of randomly dropping out a fraction of neurons during training, which
    # helps prevent overfitting. 

# Batch size: 
    # Specifies the number of training samples propagated through the network before updating the model's weights.

# Number of epochs: 
    # Specifies the number of times the entire training dataset is passed through the model during training.

# Regularization techniques: 
    # Include methods like L1 and L2 regularization, which help prevent overfitting by adding penalties to the 
    # loss function. 

# Optimizer: 
    # Specifies the optimization algorithm used to update the model's weights during training, such as 
    # Stochastic Gradient Descent (SGD), Adam, or RMSprop.

# Loss function: 
    # Defines the objective function used to measure the discrepancy between the predicted output and the 
    # true output during training.

# Cross-validation sizes:
    # x

# Weight and Data Normalization:
    # minmax normalization; z-score scaling
    
# Weight Initialization:
    # xxxxx

>> Model Info/summary

In [None]:
from torchsummary import summary 

model = ConvNet() #define your model
summary(model, ( 28, 28))   #input tensor of size (28, 28) 


# Get the trainable parameters (those with requires_grad=True, i.e. updated by the optimizer).
# model.parameters() alone yields every parameter, frozen ones included, so filter explicitly.
trainable_params = [p for p in model.parameters() if p.requires_grad]

# Get the non-trainable (frozen) parameters
non_trainable_params = [p for p in model.parameters() if not p.requires_grad]

>> Regularization

In [None]:
# Regularization is a technique used in deep learning to prevent overfitting and improve the generalization ability 
# of the model.

# Node Regularization: Modify the model (dropout)
# Loss Regularization: Add a cost to the loss function (L1/2)
# Data Regularization: Modify or add data (batch training, data augmentation, normalization) 


# L2 Regularization (Weight Decay):
# L2 regularization, also known as weight decay, adds a penalty term to the loss function that discourages large
# weights in the model.
# It helps prevent overfitting by encouraging the model to use smaller weights, effectively reducing the
# complexity of the model.
# `model` is expected to be defined in an earlier cell.
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, weight_decay=0.001)  # Define the optimizer with weight decay

# Dropout:
# Dropout is a regularization technique that randomly sets a fraction of the input units (nodes) to 0 during training.
# It helps prevent overfitting by introducing noise and reducing the interdependence of neurons.
# Dropout is not a good idea for simple models or small data.
class MyModel(nn.Module):
    """Two-layer network (64 -> 128 -> 10) with dropout (p=0.5) on the hidden layer.

    Dropout is only active in training mode; after model.eval() the layer
    passes its input through unchanged.
    """

    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(64, 128)
        self.dropout = nn.Dropout(0.5)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Apply fc1, dropout, ReLU and fc2 to `x` and return the 10 output scores."""
        x = self.fc1(x)
        x = self.dropout(x)  # Apply Dropout
        x = torch.relu(x)
        x = self.fc2(x)
        return x


>> Model Functions

In [None]:
# model.parameters(): This function returns an iterator over all the learnable parameters of the model. 
    # It is typically used when defining the optimizer to specify which parameters should be updated during training
for param in model.parameters(): #for name, param in model.named_parameters(): #for both parameter name and tensor
    print(param)

# model.children(): This function returns an iterator over the immediate child modules of the model. It can be used, 
        # for example, to access and modify specific layers or modules within the model.
for child in model.children():  #for name, child in model.named_children(): for both the child name and the module
    print(child)

# model.state_dict(): This function returns a dictionary containing the model's state, including the learnable 
        # parameters and buffers. It is commonly used for saving and loading model checkpoints.
state_dict = model.state_dict()
torch.save(state_dict, 'model_checkpoint.pth')

# model.load_state_dict(): This method loads a state dictionary into the model, restoring the model's parameters 
        # and buffers from a saved checkpoint
state_dict = torch.load('model_checkpoint.pth')
model.load_state_dict(state_dict)

model.eval()    #sets the model to evaluation mode
model.train()   #sets the model to training mode
# nn.Module has no freeze()/unfreeze() methods; toggle requires_grad on every parameter instead.
model.requires_grad_(False)  #freezes all the parameters in the model, making them not trainable
model.requires_grad_(True)   #unfreezes all the parameters in the model, making them trainable

#counting the number of parameters in the model:
total_params = sum(p.numel() for p in model.parameters()) #.numel() is number of elements.
print(f"Total parameters: {total_params}")

#Visualizing model architectures:
from torchsummary import summary 
summary(model, input_size = ( 28, 28))   # Print a summary of the model architecture







### Data Preprocessing

In [None]:
import torch
import torchvision
from torchvision import transforms

# Define transformations
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load datasets and apply transformations
train_dataset = torchvision.datasets.ImageFolder(root='train_data/', transform=transform)
test_dataset = torchvision.datasets.ImageFolder(root='test_data/', transform=transform)

# Create data loaders: batches of 32; only the training loader shuffles its samples
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True, drop_last = False)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=False, drop_last = False)
                                        # drop_last=True would drop the last batch if it is smaller than
                                        # batch_size; False (used here) keeps that final, smaller batch

>> ANN Data (Use this)

In [None]:

inputs = data[:,:-1]
labels = data[:,-1]

X_train, X_test, y_train, y_test = train_test_split(inputs, labels, random_state=23, train_size=0.9)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# inputs 
X_train = torch.tensor(np.array(X_train), dtype=torch.float32)
X_test = torch.tensor(np.array(X_test), dtype=torch.float32)
y_train = torch.tensor(np.array(y_train), dtype=torch.long)
y_test = torch.tensor(np.array(y_test), dtype=torch.long)

#dataset 
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

#dataloader
batch_size = 32
shuffle = True
drop_last = True
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last)
test_dataloader = DataLoader(test_dataset, batch_size=test_dataset.tensors[0].shape[0]) 

>> Loading Data

In [24]:
from torch.utils.data import DataLoader, Dataset, TensorDataset

# From DataFrame  
df = pd.read_csv('your_dataset.csv') #inputs, targets = load_iris(return_X_y = True, as_frame = True)

# Extract the input features and target labels from the DataFrame
inputs = df[['feature1', 'feature2', ...]].values
targets = df['target'].values

# Convert the data to PyTorch tensors
inputs = torch.tensor(inputs, dtype=torch.float32, # use float for input data, and long for labels
                    device=None,    # None uses the default device; can also be 'cpu' or 'cuda'
                    requires_grad=True) # NOTE(review): input features rarely need gradients; False is the usual choice, confirm intent
targets = torch.tensor(targets, dtype=torch.long)   # class-index labels are stored as int64 (torch.long)

# Create a TensorDataset
dataset = TensorDataset(inputs, targets) 

In [None]:
# Data loaders in deep learning are utility classes or functions that help in efficiently loading and 
# processing training, validation, and testing data. They are an essential component of training deep learning models 
# and provide several benefits such as Data Batching, Data Shuffling, Data Augmentation, Data Transformation, 
# Efficient Memory Management, Parallel Data Loading.
# NOTE(review): `datasets` and `transform` are not defined in this cell; they presumably come from
# another cell (e.g. `import torchvision.datasets as datasets`), so confirm the execution order.
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True) # Load the training data
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True) #data loader 
                        # yields shuffled mini-batches of 64 (inputs, labels) pairs; it does not move data to a
                        # device, so call .to(device) on each batch inside the training loop





In [None]:
#Image Folder Dataset
import torchvision.datasets as datasets
from torchvision.datasets import DatasetFolder
import torchvision.transforms as transforms

# Define the transformation to apply to the images
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # Resize images to a fixed size
    transforms.ToTensor(),          # Convert images to tensors
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))  # Normalize image tensors
])

# Load the dataset from the image folders
dataset = datasets.ImageFolder(root='Dataset/', transform=transform)


# Create an instance of the DatasetFolder
# (alternative to ImageFolder: the file loader and accepted extensions are given explicitly)
dataset = DatasetFolder(
    root='Dataset/',
    # Reached through the `datasets` alias: this cell never binds the bare name `torchvision`.
    loader=datasets.folder.default_loader,  # Use the default image loader
    extensions=(".jpg",),  # Tuple of file extensions of the images to include
    transform=transform  # Apply the defined transformation pipeline
)

#CSV Dataset
# torchvision.datasets has no CSVDataset class, so load the CSV file with pandas and wrap the
# resulting tensors in a TensorDataset (pd, torch and TensorDataset come from the imports cell).
csv_frame = pd.read_csv('Dataset/data.csv',  # specify the CSV file
                        header=0,            # the CSV file has a header row
                        delimiter=',')       # specify the delimiter used in the CSV file
# NOTE(review): the original sketch marked columns 3-5 as categorical and 0-2 as continuous.
# Categorical columns must be encoded as numbers before this conversion or torch.tensor will fail.
csv_inputs = torch.tensor(csv_frame.drop(columns=['label']).values, dtype=torch.float32)
csv_targets = torch.tensor(csv_frame['label'].values, dtype=torch.long)  # 'label' is the target column
dataset = TensorDataset(csv_inputs, csv_targets)



In [None]:
#Custom Dataset 

#Custom Dataset (from a root directory)
from torch.utils.data import Dataset

class CustomDataset(Dataset):
    """Template for a dataset that reads its samples from a root directory.

    Fill in __len__ and __getitem__ before use. Until then both raise
    NotImplementedError rather than returning None.

    Args:
        root_dir: Directory that holds the data.
        transform: Optional callable to apply to each sample.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        # Additional initialization logic goes here

    def __len__(self):
        # Return the total number of samples in the dataset
        # e.g., return len(self.data)
        raise NotImplementedError("CustomDataset.__len__ must return the number of samples")

    def __getitem__(self, index):
        # Retrieve and preprocess a single sample from the dataset
        # e.g., return self.transform(self.data[index]), self.labels[index]
        raise NotImplementedError("CustomDataset.__getitem__ must return the sample at `index`")

dataset = CustomDataset(root_dir='Dataset/', transform=None)    # Create an instance of your custom dataset



#Custom Dataset (from a dataframe with known input and output)
class CustomDataset(Dataset):
    """Dataset that pairs each input with the label at the same position.

    Args:
        inputs: Indexable container of input samples.
        labels: Indexable container of labels, indexed in parallel with `inputs`.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The dataset size is taken from the inputs container alone.
        return len(self.inputs)

    def __getitem__(self, index):
        # Return the (input, label) pair stored at `index`.
        return self.inputs[index], self.labels[index]

# Assuming you have 'inputs' and 'labels' as your data
dataset = CustomDataset(inputs, labels)

# Create a data loader
batch_size = 32
shuffle = True
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)


>> Dataloader

In [None]:
# Create a data loader
batch_size = 32
shuffle = True
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last = False)

#visualize the data in dataloader
for inputs, labels in dataloader:
    print("Inputs:", inputs) 
    print("Labels:", labels)
    print()
#or
data_list = list(dataloader)
print(data_list)



>> Feature Scaling (Normalization)

In [None]:
#Normalization techniques are used to preprocess the input data in order to improve the training process and the 
# overall performance of deep learning models.

#It is always a good idea to perform both data normalization (i.e z-score, minmax etc.) and batch normalization. 

# Batch Normalization:
# Batch Normalization is a technique that normalizes the inputs within each mini-batch during training.
# i.e. it normalizes the activations within a mini-batch during training by adjusting the mean and standard deviation
# It helps stabilize and accelerate the training process by reducing internal covariate shift.
# Batch statistics are only used in training mode; call model.eval() before inference so the layer
# switches to its stored running mean and variance.
class MyModel(nn.Module):
    """Two-layer network (64 -> 128 -> 10) with batch normalization on the hidden layer."""

    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(64, 128)
        self.bn1 = nn.BatchNorm1d(128)  # Batch Normalization over the 128 hidden features
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Apply fc1, batch norm, ReLU and fc2 to `x` and return the 10 output scores."""
        x = self.fc1(x)
        x = self.bn1(x)  # Apply Batch Normalization
        x = torch.relu(x)
        x = self.fc2(x)
        return x
# Batch Normalization layers need the number of features (num_features) they normalize; pick the
# variant (nn.BatchNorm1d for 1D inputs, nn.BatchNorm2d for 2D inputs, ...) by the dimensionality of the data


#Layer Normalization:
# Layer Normalization is a technique that normalizes the inputs within each layer across the feature dimension.
# It helps improve the generalization ability of models and performs well on tasks with recurrent neural
# networks (RNNs).
class MyModel(nn.Module):
    """Two-layer network (64 -> 128 -> 10) with layer normalization on the hidden layer."""

    def __init__(self):
        super(MyModel, self).__init__()
        self.fc1 = nn.Linear(64, 128)
        self.ln1 = nn.LayerNorm(128)  # Layer Normalization over the 128 hidden features
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        """Apply fc1, layer norm, ReLU and fc2 to `x` and return the 10 output scores."""
        x = self.fc1(x)
        x = self.ln1(x)  # Apply Layer Normalization
        x = torch.relu(x)
        x = self.fc2(x)
        return x
# Layer Normalization normalizes each sample across the feature dimension, so it does not depend on
# the batch size; nn.LayerNorm still needs the size of the normalized dimension(s) (normalized_shape)
    

### Model Training

In [None]:
# Loss Function: 
    # Choose an appropriate loss function based on the problem you are solving. Common loss functions include 
        # mean squared error (MSE), binary cross-entropy, or categorical cross-entropy, depending on the task.
    # Create an instance of the chosen loss function from torch.nn module.

# Optimizer:
    # Select an optimizer that will update the model's parameters during training. Popular choices include 
        # Stochastic Gradient Descent (SGD), Adam, or RMSprop.
    # Initialize the optimizer by passing the model parameters and setting the learning rate and other hyperparameters.
    
# Training Loop:
    # Iterate over the training dataset in batches.
    # Zero the gradients of the model parameters to avoid accumulation.
    # Pass the input batch through the model to obtain predictions.
    # Calculate the loss between the predictions and the target values.
    # Backpropagate the gradients by calling backward() on the loss tensor.
    # Update the model parameters using the optimizer's step() function.
    # Optionally, track and record metrics like accuracy or loss during training.



>> Loss function

In [None]:
# Define the loss function
# NOTE: this cell is a catalogue. Each assignment overwrites the previous one, so keep only the line that
# matches your task before running the loss computation at the bottom.

# Regression
loss_function = nn.MSELoss()       # Mean Squared Error: the standard choice for regression problems
loss_function = nn.L1Loss()        # Mean Absolute Error (MAE): regression, less sensitive to outliers than MSE
loss_function = nn.SmoothL1Loss()  # Huber-style loss: robust regression loss combining properties of MSE and MAE

# Classification
loss_function = nn.CrossEntropyLoss()   # Cross-Entropy: multi-class classification; expects raw logits
loss_function = nn.NLLLoss()            # Negative log likelihood: multi-class classification; expects
                                        # log-probabilities, so pair it with nn.LogSoftmax()
loss_function = nn.BCELoss()            # Binary Cross-Entropy: binary classification where the model outputs
                                        # probabilities (i.e. after a sigmoid) instead of logits
loss_function = nn.BCEWithLogitsLoss()  # Binary Cross-Entropy on logits (unbounded real numbers); applies the
                                        # sigmoid internally

# Generative models
# Kullback-Leibler divergence: measures the difference between two probability distributions.
# The input must be log-probabilities. reduction='batchmean' is used because the default 'mean' averages over
# every element and therefore does not return the mathematical KL divergence.
loss_function = nn.KLDivLoss(reduction='batchmean')


# Calculate the loss
loss = loss_function(outputs, target_data)

# Print the loss
print("Loss:", loss.item())


# Logits are the raw, unnormalized values produced by the model before applying any activation function like sigmoid or 
# softmax. They represent the model's predictions or scores for each class without being converted into probabilities 
# yet.


>> Optimizers

In [None]:
# Optimizers update the model's parameters from the gradients computed by loss.backward() in order to minimize
# the loss function.
# NOTE: this cell is a catalogue. Each assignment overwrites the previous one, so keep only the optimizer you need.
# `model` and `learning_rate` are expected to be defined in earlier cells.

# Stochastic Gradient Descent (SGD):
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
# SGD applies the plain gradient-descent update with a fixed learning rate. The optimizer itself does not pick
# samples: whether an update is based on one sample, a mini-batch or the full dataset depends on how the data
# is fed to the training loop (e.g. the DataLoader batch size).

# Adam:
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
# Adam (Adaptive Moment Estimation) adapts the learning rate of each parameter using running estimates of the
# first and second moments of its gradients.

# Adagrad:
optimizer = torch.optim.Adagrad(model.parameters(), lr=learning_rate)
# Adagrad (Adaptive Gradient) adapts the learning rate of each parameter based on the accumulated history of its
# squared gradients. It is often used with sparse features, e.g. in natural language processing tasks.

#RMSprop:
optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
# RMSprop (Root Mean Square Propagation) adapts the learning rate of each parameter using a moving average of
# squared gradients, which mitigates Adagrad's continually shrinking learning rate.

# Adadelta:
optimizer = torch.optim.Adadelta(model.parameters(), lr=learning_rate)
# Adadelta adapts the step size dynamically and accumulates only a limited (windowed) history of past gradients.
# NOTE(review): Adadelta's `lr` is a scaling factor whose library default is 1.0 -- confirm that reusing the
# same `learning_rate` as the other optimizers is intended.


#the most popular are: Stochastic Gradient Descent (SGD), Adam, RMSprop, and Adagrad.


In [None]:
# Commonly used optimizers in deep learning are based on gradient descent, which comes in three variants: batch, 
# stochastic, and mini-batch. Mini-batch is the usual choice because it balances the other two. In practice, the 
# classic "SGD" optimizer is applied as mini-batch gradient descent with a fixed step size (learning rate).

# Gradient descent has three types: batch, stochastic, and mini-batch.
    # •	Batch gradient descent uses the entire dataset to compute gradients and update parameters.
    # •	Stochastic gradient descent randomly picks a single sample to compute gradients and update parameters.
    # •	Mini-batch gradient descent randomly picks a subset of the dataset to compute gradients and update parameters.

# NB:	# Batch gradient descent uses the entire dataset for every update, which slows down training.
        # Stochastic gradient descent uses one sample per update, which makes the loss fluctuate and makes it 
        # harder to settle into a minimum. 
        # Mini-batch gradient descent balances both.

# Adaptive learning rates address issues with sparse data
    # •	Learning rate applied throughout training can be problematic
    # •	Adaptive learning rates scale based on inverse sum of squared gradient
    
#Adaptive learning rates are used in optimizers to dynamically adjust the step size (learning rate) during the training
# process. The main reason for using adaptive learning rates is to improve the efficiency and effectiveness of the 
# optimization algorithm. They are beneficial because of their convergence speed, robustness to different scales, 
# handling sparse data, robustness to initial learning rate. AdaGrad, RMSprop and Adam use adaptive learning rate.

# RMSprop(Root Mean Square Propagation) and Adam are popular optimization methods in deep learning
    # •	RMSprop scales the gradient and uses a moving average of the squared gradient
    # •	Adam combines AdaGrad(Adaptive Gradient Algorithm), RMSprop, and momentum methods into one

# Comparison between Adam and RMSprop
    # •	Adam corrects its moment estimates for their bias towards zero, making it easier to achieve good performance 
        # without tuning hyperparameters.
    # •	Adaptive learning-rate methods like Adam are preferred for better convergence and results, but finding a good 
        # learning rate for a non-adaptive method can also be effective.

# Final Note: Adam and RMSprop are good starting points for adaptive learning-rate methods.
    # - Adam (Adaptive Moment Estimation) and RMSprop are extensions of SGD with momentum.
    # - RMSprop is preferred for sparse data, while Adam is better for faster gradients.
    # - Towards the end of optimization, Adam may be the best overall choice among adaptive learning-rate methods. 


>> Training Loop

In [None]:
# Training loop involves forward and backward pass to update weights
    # Forward pass computes prediction
    # Backward pass computes gradients, which are used to update weights

# ----------------------------------------------------------------------------


# The training loop consists of two nested loops. The outer loop iterates over the specified number of epochs. 
# Inside the epoch loop, the model is set to train mode (model.train()) and the running loss is initialized. 
# Then, we iterate over the training data in batches using the train_loader.

# For each batch, we perform the following steps:

    # Zero the gradients using optimizer.zero_grad().
    # Forward pass: Pass the input data through the model to obtain the predicted outputs.
    # Compute the loss between the predicted outputs and the true labels.
    # Backward pass: Compute the gradients of the loss with respect to the model parameters.
    # Update the weights using the optimizer's step() method.
    # Update the running loss by adding the current batch loss.

In [None]:
# Define your model, optimizer, and loss function
# `device` and `train_loader` are expected to be defined in earlier cells.
model = MyModel().to(device)   # parameters must live on the same device as the batches
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
train_epoch = 100
losses = torch.zeros(train_epoch)   # mean training loss of each epoch
ongoing_accuracy = []               # training accuracy (%) of each epoch

# Training loop
for epoch in range(train_epoch):
    # Set the model to train mode
    model.train()

    # Per-epoch accumulators
    running_loss = 0.0
    total_correct = 0
    total_samples = 0

    # Iterate over the training dataset (training batches)
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the gradients so they do not accumulate across batches
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Compute the loss
        loss = criterion(outputs, labels)

        # Backward pass
        loss.backward()

        # Update the weights
        optimizer.step()

        # Update the running loss (.item() detaches the value from the autograd graph)
        running_loss += loss.item()

        # Accumulate accuracy over every batch rather than only the last one.
        # Binary classification rule: an output above 0.5 counts as class 1.
        # NOTE(review): thresholding at 0.5 assumes one probability output per sample; with nn.CrossEntropyLoss
        # on multi-class logits use `outputs.argmax(dim=1) == labels` instead -- confirm which task is intended.
        with torch.no_grad():
            matches = (outputs > .5) == labels
        total_correct += matches.sum().item()
        total_samples += matches.numel()

    # Compute the average loss for the epoch and record it as a plain number
    epoch_loss = running_loss / len(train_loader)
    losses[epoch] = epoch_loss

    accuracy = 100 * total_correct / total_samples
    ongoing_accuracy.append(accuracy)
    print('Accuracy:', accuracy)
    # Print the loss for each epoch
    print(f"Epoch [{epoch+1}/{train_epoch}], Loss: {epoch_loss:.4f}")


>> Train classification

In [None]:
#Binary classification
    #see code above



#multi-class classification

# train the model 
# `model`, `dataloader` and `device` are expected to be defined in earlier cells.
loss_function = nn.CrossEntropyLoss()        # loss function 
optimizer = torch.optim.SGD(model.parameters(), lr=0.05)  # optimizer
train_epoch = 100
losses = torch.zeros(train_epoch)   # mean training loss of each epoch
ongoing_accuracy = []               # training accuracy (%) of each epoch

# loop over the dataset multiple times
for epoch in range(train_epoch):
    running_loss = 0.0
    total_correct = 0
    total_samples = 0
    num_batches = 0
    for inputs, labels in dataloader:
        # inputs = inputs.float()  # Convert inputs to float type
        # labels = labels.long()  # Convert labels to long type or LongTensor for cuda
        inputs, labels = inputs.to(device), labels.to(device)

        # zero the parameter gradients 
        optimizer.zero_grad()

        # forward + backward + optimize 
        y_pred = model(inputs)
        loss = loss_function(y_pred, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

        # Calculate accuracy: the predicted class is the index of the largest logit
        predicted_labels = torch.argmax(y_pred, dim=1)
        total_correct += torch.sum(predicted_labels == labels).item()
        total_samples += labels.size(0)

    # Record the mean batch loss as a plain number, so `losses` is not tied to the autograd graph
    # and does not hold only the last batch's loss.
    epoch_loss = running_loss / max(num_batches, 1)
    losses[epoch] = epoch_loss

    accuracy = 100 * total_correct / total_samples
    ongoing_accuracy.append(accuracy)
    print('Epoch: {}, Loss: {:.4f}, Accuracy: {:.2f}%'.format(epoch + 1, epoch_loss, accuracy))

print('Finished Training')

#y_pred_probabilities = F.softmax(y_pred, dim=1) #to view the probabilities by transforming into the softmax transfmtn.



>> Plots to visualize results

In [None]:
# Report the most recent accuracy value, then plot the loss and accuracy curves side by side.
print('Model accuracy: %g%%' % accuracy)

fig, ax = plt.subplots(1, 2, figsize=(13, 4))

# (data, y-axis label, panel title) for the left and right panels
panels = (
    (losses.detach(), 'Loss', 'Losses'),
    (ongoing_accuracy, 'accuracy', 'Accuracy'),
)
for axis, (series, y_label, panel_title) in zip(ax, panels):
    axis.plot(series)
    axis.set_ylabel(y_label)
    axis.set_xlabel('epoch')
    axis.set_title(panel_title)
plt.show()
# run training again to see whether this performance is consistent

>> Use this code for training and evaluation

In [None]:
# #define the model 

class iris_model(nn.Module):
    """Fully connected classifier: 4 input features -> 64 -> 64 -> 3 output logits.

    Args:
        weight_init: initialisation scheme for the linear layers. One of
            'default' (keep PyTorch's initialisation), 'xavier_uniform' or 'kaiming_normal'.

    Raises:
        ValueError: if `weight_init` is not one of the supported schemes.
    """

    def __init__(self, weight_init='default'):
        super().__init__()
        self.fc1 = nn.Linear(4, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 3)

        if weight_init == 'default':
            pass  # Default weight initialization
        elif weight_init == 'xavier_uniform':
            self._init_weights_xavier_uniform()
        elif weight_init == 'kaiming_normal':
            self._init_weights_kaiming_normal()
        else:
            # Fail loudly instead of silently falling back to the default initialisation
            raise ValueError(
                f"Unknown weight_init {weight_init!r}; expected 'default', 'xavier_uniform' or 'kaiming_normal'"
            )

    def forward(self, x):
        """Return the 3 raw class logits for a batch of 4-feature inputs."""
        out = F.relu(self.fc1(x))
        out = F.relu(self.fc2(out))
        out = self.fc3(out)
        return out

    def _init_weights_xavier_uniform(self):
        """Xavier-uniform weights and zero biases for every linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.xavier_uniform_(m.weight)
                if m.bias is not None:
                    init.constant_(m.bias, 0)

    def _init_weights_kaiming_normal(self):
        """Kaiming-normal (fan-in, ReLU gain) weights and zero biases for every linear layer."""
        for m in self.modules():
            if isinstance(m, nn.Linear):
                init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    init.constant_(m.bias, 0)


model = iris_model()    #initializing the model  #model = iris_model(weight_init='xavier_uniform') 



# # A. create a class for the model 
# def create_model(nUnits, nLayers):
#     class iris_model(nn.Module):
#         def __init__(self):
#             super().__init__()

#             # create dictionary to store the layers
#             self.layers = nn.ModuleDict()
#             self.nLayers = nLayers 

#             ### input layer
#             self.layers['input'] = nn.Linear(4, nUnits)
            
#             ### hidden layers
#             for i in range(nLayers):
#                 self.layers[f'hidden{i}'] = nn.Linear(nUnits, nUnits)

#             ### output layer
#             self.layers['output'] = nn.Linear(nUnits, 3)
        
#         # forward pass
#         def forward(self, x):
#             # input layer (note: the code in the video omits the relu after this layer)
#             x = F.relu(self.layers['input'](x))

#             # hidden layers
#             for i in range(self.nLayers):
#                 x = F.relu(self.layers[f'hidden{i}'](x))
                
#             # return output layer
#             x = self.layers['output'](x)    #or x = F.sigmoid(self.layers['output](x)) for Binary classification 
#             return x 
        
#     return iris_model() 


def create_model(nUnits, nLayers, weight_init=None):
    """Build an MLP classifier (4 inputs, 3 outputs) with batch norm and dropout after every hidden layer.

    Args:
        nUnits: width of the input layer and of every hidden layer.
        nLayers: number of hidden layers placed after the input layer.
        weight_init: optional mapping from layer name ('input', 'hidden0', ..., 'output') to an
            initialisation scheme: 'default', 'xavier_uniform' or 'kaiming_normal'. Layers that are
            not listed, and a missing mapping, use 'default'.

    Returns:
        A freshly initialised model instance.

    Raises:
        ValueError: if a layer is mapped to an unknown initialisation scheme.
    """
    # Copy the mapping so later edits to the caller's dict do not affect the model; None means "all default".
    init_schemes = dict(weight_init or {})

    class iris_model(nn.Module):
        def __init__(self):
            super().__init__()

            # Create dictionary to store the layers
            self.layers = nn.ModuleDict()
            self.nLayers = nLayers
            self.weight_init = init_schemes

            ### Input layer
            self.layers['input'] = nn.Linear(4, nUnits)
            self.layers['input_bn'] = nn.BatchNorm1d(nUnits)
            self.layers['input_dropout'] = nn.Dropout(0.2)

            ### Hidden layers
            for i in range(nLayers):
                self.layers[f'hidden{i}'] = nn.Linear(nUnits, nUnits)
                self.layers[f'hidden{i}_bn'] = nn.BatchNorm1d(nUnits)
                self.layers[f'hidden{i}_dropout'] = nn.Dropout(0.2)

            ### Output layer
            self.layers['output'] = nn.Linear(nUnits, 3)

            # Initialize weights
            self._initialize_weights()

        # Forward pass
        def forward(self, x):
            """Return the 3 raw class logits; each stage is linear -> batch norm -> ReLU -> dropout."""
            # Input layer
            x = self.layers['input'](x)
            x = self.layers['input_bn'](x)
            x = F.relu(x)
            x = self.layers['input_dropout'](x)

            # Hidden layers
            for i in range(self.nLayers):
                x = self.layers[f'hidden{i}'](x)
                x = self.layers[f'hidden{i}_bn'](x)
                x = F.relu(x)
                x = self.layers[f'hidden{i}_dropout'](x)

            # Output layer (or wrap it in a sigmoid for binary classification)
            x = self.layers['output'](x)

            return x

        def _initialize_weights(self):
            """Apply the requested scheme to each linear layer; 'default' layers are left untouched."""
            for name, module in self.layers.items():
                if not isinstance(module, nn.Linear):
                    continue

                scheme = self.weight_init.get(name, 'default')
                if scheme == 'default':
                    # Keep PyTorch's own initialisation for both the weights and the bias
                    continue

                if scheme == 'xavier_uniform':      # works well with sigmoid
                    init.xavier_uniform_(module.weight)
                elif scheme == 'kaiming_normal':    # works well with ReLU activation
                    init.kaiming_normal_(module.weight, mode='fan_in', nonlinearity='relu')
                else:
                    raise ValueError(f"Unknown weight initialisation {scheme!r} for layer {name!r}")

                if module.bias is not None:
                    init.constant_(module.bias, 0)

    return iris_model()

# Architecture: 64 units per layer, 5 hidden layers
nUnits, nLayers = 64, 5

# Per-layer initialisation scheme. Only the first three hidden layers are listed explicitly
# ('hidden3' was deliberately left out); unlisted layers fall back to the model's default.
weight_init = {'input': 'default'}
weight_init.update({f'hidden{idx}': 'kaiming_normal' for idx in range(3)})
weight_init['output'] = 'default'

model = create_model(nUnits, nLayers, weight_init)    #initializing the model


In [None]:
#model training 
from torch.optim.lr_scheduler import StepLR

## metric = torchmetrics.Accuracy(task='multiclass', num_classes=num_classes)    (.Precision(), .Recall(), .F1Score(), .ConfusionMatrix())
                #see doc. https://torchmetrics.readthedocs.io/en/stable/classification/accuracy.html#functional-interface 

# `model` and `train_dataloader` are expected to be defined in earlier cells.
num_epochs = 300
learning_rate = 0.01
losses = torch.zeros(num_epochs)   # mean training loss of each epoch
ongoing_accuracy = []              # mean training accuracy (%) of each epoch
num_classes = 3

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Define the learning rate scheduler.
# StepLR multiplies the learning rate by `gamma` every `step_size` epochs. Tying the step to the run
# length gives three stages (0.01 -> 0.001 -> 0.0001); a step of 10 over 300 epochs would shrink the
# rate by 0.1**30 and freeze training after the first few dozen epochs.
scheduler = StepLR(optimizer, step_size=max(1, num_epochs // 3), gamma=0.1)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Move the model to the appropriate device (e.g., GPU if available); batches are moved inside the loop
model.to(device)

# Loop over the dataset for multiple epochs 
for epoch in range(num_epochs):
    model.train()
    batchAcc = []
    batchLoss = []

    # Iterate over the training dataloader (training batches)
    for inputs, labels in train_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        batchLoss.append(loss.item())

        # Compute accuracy on the training batch: predicted class = index of the largest logit
        predictions = torch.argmax(outputs, dim=1)

        accuracy = torchmetrics.functional.classification.accuracy(predictions, labels, task='multiclass',
                                                                   num_classes=num_classes) * 100
        # Alternatives:
        # accuracy = metric(predictions, labels)
        # accuracy = torchmetrics.functional.classification.binary_accuracy(predicted, labels, threshold=0.5)   #for binary classification
        # r2score = torchmetrics.functional.r2_score(preds, target)

        batchAcc.append(accuracy.item())

    # Update the learning rate once per epoch, after the optimizer steps
    scheduler.step()

    ongoing_accuracy.append(np.mean(batchAcc))
    losses[epoch] = np.mean(batchLoss)

    # Print loss and accuracy for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}: Loss = {np.mean(batchLoss):.4f}, Accuracy = {np.mean(batchAcc):.2f}%")

print('Finished Training')


# Report the accuracy of the final epoch, then draw the loss and accuracy curves side by side.
final_accuracy_message = 'Model accuracy: {:.2f}%'.format(ongoing_accuracy[-1])
print(final_accuracy_message)

fig, ax = plt.subplots(1, 2, figsize=(13, 4))
loss_axis, accuracy_axis = ax

# Left panel: training loss per epoch
loss_axis.plot(losses.detach())
loss_axis.set_title('Losses')
loss_axis.set_xlabel('epoch')
loss_axis.set_ylabel('Loss')

# Right panel: training accuracy per epoch
accuracy_axis.plot(ongoing_accuracy)
accuracy_axis.set_title('Accuracy')
accuracy_axis.set_xlabel('epoch')
accuracy_axis.set_ylabel('accuracy')

plt.show()
# run training again to see whether this performance is consistent
# run training again to see whether this performance is consistent

In [None]:
#model evaluation
# `model`, `test_dataloader`, `device` and `num_classes` are expected to be defined in earlier cells.

model.eval()  # Set the model to evaluation mode (disables dropout, uses running batch-norm statistics)

# Collect the predictions and labels of every batch so the metric covers the whole test set,
# not just the last batch.
all_predicted = []
all_labels = []

# Disable gradient computation for evaluation
with torch.inference_mode():        #or torch.no_grad()
    for inputs, labels in test_dataloader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)

        # Calculate predictions: index of the largest logit per sample
        all_predicted.append(torch.argmax(outputs, dim=1))
        all_labels.append(labels)

predicted = torch.cat(all_predicted)
labels = torch.cat(all_labels)

# Calculate accuracy over the full test set
accuracy = torchmetrics.functional.classification.accuracy(predicted, labels, task='multiclass', num_classes=num_classes)

print(f"Accuracy on test set: {100 * accuracy:.2f}%")


In [None]:
# import requests
# from pathlib import Path 

# # Download helper functions from Learn PyTorch repo (if not already downloaded)
# if Path("helper_functions.py").is_file():
#   print("helper_functions.py already exists, skipping download")
# else:
#   print("Downloading helper_functions.py")
#   request = requests.get("https://raw.githubusercontent.com/mrdbourke/pytorch-deep-learning/main/helper_functions.py")
#   with open("helper_functions.py", "wb") as f:
#     f.write(request.content)

# from helper_functions import plot_predictions, plot_decision_boundary 

>> Evaluating and Validation

In [None]:
# Validation Loop:
    # Evaluate the model on the validation dataset to monitor its performance and make any necessary adjustments.
    # Pass the input batch through the model to obtain predictions.
    # Calculate the validation loss and any desired evaluation metrics.
    
# Hyperparameter Tuning:
    # Experiment with different learning rates, batch sizes, architectures, activation functions, 
        # regularization techniques, and optimizer settings.
    # Use the validation set to evaluate different combinations of hyperparameters and choose the best-performing ones.

In [None]:
#Model Evaluation
# `model`, `criterion` and `test_loader` are expected to be defined in earlier cells.

# Set the model to evaluation mode
model.eval()

# Run the batches on whichever device the model's parameters live on
eval_device = next(model.parameters()).device

total_loss = 0.0
total_correct = 0
total_compared = 0
num_batches = 0

# Evaluate the model
with torch.no_grad():
    for data, labels in test_loader:
        data, labels = data.to(eval_device), labels.to(eval_device)

        # Forward pass
        outputs = model(data)

        # Calculate loss
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        num_batches += 1

        # Accumulate accuracy over every batch rather than only the last one.
        # Multi-class alternative: (outputs.argmax(1) == labels)
        matches = (outputs > .5) == labels   #Binary classification
        total_correct += matches.sum().item()
        total_compared += matches.numel()

accuracy = 100 * total_correct / total_compared

print('Loss:', total_loss / num_batches)
print('Accuracy:', accuracy)

In [None]:
# Train with per-epoch validation, checkpointing of the best model, and early stopping.
# `model`, `optimizer`, `criterion`, `num_epochs`, the train/val loaders and datasets are expected
# to be defined in earlier cells.

best_loss = float('inf')  # Best (lowest) validation loss seen so far
early_stopping_counter = 0  # Consecutive epochs without improvement
early_stopping_patience = 3  # Epochs without improvement tolerated before stopping

for epoch in range(num_epochs):
    # Training
    model.train()
    train_loss = 0.0
    train_correct = 0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so the epoch value is a per-sample mean
        train_loss += loss.item() * inputs.size(0)
        predicted = outputs.argmax(dim=1)
        train_correct += (predicted == targets).sum().item()

    train_loss /= len(train_dataset)
    train_accuracy = train_correct / len(train_dataset)

    # Validation
    model.eval()
    val_loss = 0.0
    val_correct = 0

    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            loss = criterion(outputs, targets)

            val_loss += loss.item() * inputs.size(0)
            predicted = outputs.argmax(dim=1)
            val_correct += (predicted == targets).sum().item()

    val_loss /= len(val_dataset)
    val_accuracy = val_correct / len(val_dataset)

    # Print epoch metrics
    print(f"Epoch: {epoch+1}/{num_epochs}")
    print(f"Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.4f}")
    print(f"Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.4f}")

    if val_loss < best_loss:
        # Improvement: save the checkpoint and reset the patience counter
        best_loss = val_loss
        early_stopping_counter = 0
        torch.save(model.state_dict(), 'best_model.pt')
    else:
        # No improvement: the comparison must happen in this branch, because once best_loss has been
        # updated `val_loss >= best_loss` holds on every epoch.
        early_stopping_counter += 1
        if early_stopping_counter >= early_stopping_patience:
            print("Early stopping triggered!")
            break

# Load the best model checkpoint
model.load_state_dict(torch.load('best_model.pt'))

# Set the model to evaluation mode
model.eval()


### Model Optimization (Hyperparameter Tuning)

>> Auto Tuning using Optuna

In [None]:
import optuna
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score

# Define your model
class MyModel(nn.Module):
    """Configurable MLP classifier; replace with your own architecture if needed.

    Args:
        input_size: number of input features.
        hidden_size: width of every hidden layer.
        num_layers: number of hidden layers.
        output_size: number of output logits (classes).
        dropout_rate: dropout probability applied after each hidden activation.
        activation: 'relu', 'sigmoid' or 'tanh'.

    Raises:
        ValueError: if `activation` is not one of the supported names.
    """

    _ACTIVATIONS = {'relu': nn.ReLU, 'sigmoid': nn.Sigmoid, 'tanh': nn.Tanh}

    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_rate=0.0, activation='relu'):
        super().__init__()
        if activation not in self._ACTIVATIONS:
            raise ValueError(f"Unknown activation {activation!r}; expected one of {sorted(self._ACTIVATIONS)}")

        blocks = []
        in_features = input_size
        for _ in range(num_layers):
            # Each hidden stage is linear -> activation -> dropout
            blocks.append(nn.Linear(in_features, hidden_size))
            blocks.append(self._ACTIVATIONS[activation]())
            blocks.append(nn.Dropout(dropout_rate))
            in_features = hidden_size
        blocks.append(nn.Linear(in_features, output_size))  # output layer returns raw logits
        self.net = nn.Sequential(*blocks)

    def forward(self, x):
        """Return raw logits of shape (batch, output_size)."""
        return self.net(x)

# Define your objective function
def objective(trial):
    """Optuna objective: train one model with the trial's hyperparameters and return its best accuracy.

    The model is evaluated after every epoch; the accuracy is reported to Optuna so the study's pruner
    can stop unpromising trials, and training stops early after `patience` epochs without improvement.

    `input_size`, `output_size`, `MyDataset`, `train_data`, `train_labels`, `test_loader` and
    `test_labels` are expected to be defined in earlier cells.

    Args:
        trial: the `optuna.trial.Trial` supplying hyperparameter suggestions.

    Returns:
        The highest accuracy reached in any epoch of this trial.

    Raises:
        optuna.TrialPruned: when the pruner decides to stop the trial.
    """
    # Hyperparameters to be tuned (suggest_float replaces the deprecated suggest_loguniform / suggest_uniform)
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])
    num_epochs = trial.suggest_int('num_epochs', 10, 30)
    patience = trial.suggest_int('patience', 5, 20)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    hidden_size = trial.suggest_categorical('hidden_size', [64, 128, 256])
    num_layers = trial.suggest_int('num_layers', 1, 4)
    optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'sgd'])
    activation = trial.suggest_categorical('activation', ['relu', 'sigmoid', 'tanh'])

    # Define your model architecture with the hyperparameters
    model = MyModel(input_size, hidden_size, num_layers, output_size, dropout_rate, activation)

    # Build the optimizer the trial actually selected
    if optimizer_name == 'adam':
        optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    else:
        optimizer = optim.SGD(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

    # Define your loss function
    loss_function = nn.CrossEntropyLoss()

    # Define your dataset and dataloader
    train_dataset = MyDataset(train_data, train_labels)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

    best_accuracy = 0.0
    epochs_without_improvement = 0

    for epoch in range(num_epochs):
        # Training
        model.train()   # Set the model to train mode
        for inputs, targets in train_loader:
            optimizer.zero_grad()   # Zero the gradients
            # Forward pass
            outputs = model(inputs)
            loss = loss_function(outputs, targets)
            # Backward pass and optimization
            loss.backward()
            optimizer.step()

        # Evaluation
        # NOTE(review): tuning against the test set leaks it into model selection -- consider a separate
        # validation split here.
        model.eval()
        predictions = []
        with torch.no_grad():
            for inputs, targets in test_loader:
                outputs = model(inputs)
                predicted = outputs.argmax(dim=1)
                predictions.extend(predicted.tolist())

        accuracy = accuracy_score(test_labels, predictions)

        # Report intermediate results to Optuna and let the pruner stop unpromising trials
        trial.report(accuracy, epoch)
        if trial.should_prune():
            raise optuna.TrialPruned()

        # Early stopping based on the patience parameter
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            epochs_without_improvement = 0
        else:
            epochs_without_improvement += 1
            if epochs_without_improvement >= patience:
                break

    return best_accuracy

# Define the study (accuracy is maximised)
study = optuna.create_study(direction='maximize')

# Run the optimization
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params = study.best_params     #or study.best_trial

# Rebuild the model with the best architecture hyperparameters.
# best_params also holds training settings (learning_rate, batch_size, ...), so it cannot be unpacked
# straight into the constructor; pass only the arguments the model takes.
# `input_size` and `output_size` are expected to be defined in earlier cells.
best_model = MyModel(
    input_size,
    best_params['hidden_size'],
    best_params['num_layers'],
    output_size,
    best_params['dropout_rate'],
    best_params['activation'],
)
# Train your model using the best hyperparameters
# (learning_rate, weight_decay, batch_size, optimizer, num_epochs from best_params)

# Evaluate your best model on the test set

# Print the best hyperparameters and the best score
print("Best Hyperparameters:", best_params)
print("Best Score:", study.best_value)


>> Neurons vs Layers

In [None]:
# A. create a class for the model 
def create_model(nUnits, nLayers):
    """Return a new MLP with a 4-feature input layer, `nLayers` hidden layers of `nUnits` units, and 3 outputs."""
    hidden_keys = [f'hidden{k}' for k in range(nLayers)]

    class iris_model(nn.Module):
        def __init__(self):
            super().__init__()

            self.nLayers = nLayers

            # layers are registered by name: 'input', 'hidden0', ..., 'output'
            self.layers = nn.ModuleDict()
            self.layers['input'] = nn.Linear(4, nUnits)
            for key in hidden_keys:
                self.layers[key] = nn.Linear(nUnits, nUnits)
            self.layers['output'] = nn.Linear(nUnits, 3)

        def forward(self, x):
            """ReLU after the input layer and after every hidden layer; the output layer returns raw scores."""
            hidden = F.relu(self.layers['input'](x))

            for k in range(self.nLayers):
                hidden_layer = self.layers[f'hidden{k}']
                hidden = F.relu(hidden_layer(hidden))

            return self.layers['output'](hidden)

    return iris_model()

#B. Train the model (return 'final accuracy' and 'trainable parameters')
def train_model(theModel, n_epochs=None):
    """Train `theModel` with SGD and report its final training accuracy and size.

    `dataloader` and `device` are expected to be defined in earlier cells.

    Args:
        theModel: the network to train (e.g. the instance returned by `create_model`).
        n_epochs: number of passes over `dataloader`; defaults to the global `numepochs`.

    Returns:
        (accuracy, nParams): the training accuracy in percent measured during the last epoch, and the
        number of trainable parameters in the model.
    """
    if n_epochs is None:
        n_epochs = numepochs

    theModel.to(device)   # parameters must live on the same device as the batches

    loss_function = nn.CrossEntropyLoss()        # loss function 
    optimizer = torch.optim.SGD(theModel.parameters(), lr=0.05)  # optimizer

    accuracy = 0.0

    # loop over the dataset multiple times
    for epoch in range(n_epochs):
        theModel.train()
        total_correct = 0
        total_samples = 0
        for inputs, labels in dataloader:
            # inputs = inputs.float()  # Convert inputs to float type
            # labels = labels.long()  # Convert labels to long type
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients 
            optimizer.zero_grad()

            # forward + backward + optimize (call the model instance, not the factory)
            y_pred = theModel(inputs)
            loss = loss_function(y_pred, labels)
            loss.backward()
            optimizer.step()

            # Calculate accuracy: predicted class = index of the largest logit
            predicted_labels = torch.argmax(y_pred, dim=1)
            total_correct += torch.sum(predicted_labels == labels).item()
            total_samples += labels.size(0)

        if total_samples:
            accuracy = 100 * total_correct / total_samples

    # total number of trainable parameters in the model
    nParams = sum(p.numel() for p in theModel.parameters() if p.requires_grad)

    print('Finished Training')
    return accuracy, nParams



#C. ranges of hyperparameters to be tested
numlayers = range(1, 6)            # number of hidden layers
numunits = np.arange(4, 101, 3)    # number of nodes per layer

# result grids: one row per unit count, one column per layer count
grid_shape = (len(numunits), len(numlayers))
accuracies = np.zeros(grid_shape)
totalparams = np.zeros(grid_shape)

# number of training epochs
numepochs = 500


#D. run the experiment: train a fresh model for every (units, layers) combination
for unitidx, n_units in enumerate(numunits):
    for layeridx, n_layers in enumerate(numlayers):

        # create a fresh model instance
        net = create_model(n_units, n_layers)

        # train it, then record its accuracy and its parameter count
        accuracy, nParams = train_model(net)
        accuracies[unitidx, layeridx] = accuracy
        totalparams[unitidx, layeridx] = nParams


#E. visualize 
# accuracy as a function of the number of hidden units, one line per number of hidden layers
fig, ax = plt.subplots(1, figsize=(12, 6))

ax.plot(numunits, accuracies, 'o-', markerfacecolor='w', markersize=9)

# dashed grey reference lines at 33% and 67%
x_span = numunits[[0, -1]]
for reference_level in (33, 67):
    ax.plot(x_span, [reference_level, reference_level], '--', color=[.8, .8, .8])

ax.legend(numlayers)
ax.set_title('Accuracy')
ax.set_xlabel('Number of hidden units')
ax.set_ylabel('accuracy')
plt.show()

>> Learning Rate

### Model Saving and Loading

>> Saving the model

In [None]:
import torch

# Save a full checkpoint: weights, optimizer state and training metadata.
# `model`, `optimizer` and `loss` are expected to be defined in earlier cells.
checkpoint = dict(
    epoch=300,
    model_state_dict=model.state_dict(),
    optimizer_state_dict=optimizer.state_dict(),
    loss=loss,
    learning_rate=optimizer.param_groups[0]['lr'],
    hyperparameters=dict(hidden_units=64, batch_size=32),
    # other_info='Additional information about the checkpoint'
)
torch.save(checkpoint, 'model_checkpoint.pth')


# Save only the model weights, without any checkpoint information
weights_only_state = model.state_dict()
torch.save(weights_only_state, 'model.pth')



>> Load the model

In [None]:
# Option 1 -- restore from a full checkpoint.
# `model` and `optimizer` must already exist with the same structure they had when the checkpoint was saved.
checkpoint = torch.load('model_checkpoint.pth')

model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

epoch, loss = checkpoint['epoch'], checkpoint['loss']
hidden_units, batch_size = (
    checkpoint['hyperparameters'][key] for key in ('hidden_units', 'batch_size')
)


# Option 2 -- restore weights that were saved on their own (no checkpoint information):
# build the model first, then load the saved state dict into it.
model = MyModel()
saved_weights = torch.load('model.pth')
model.load_state_dict(saved_weights)
