# Learning Rate Scheduler for Optimizers

https://pytorch.org/docs/stable/optim.html#module-torch.optim.lr_scheduler

### What is lr_scheduler?

    torch.optim.lr_scheduler
    
> The lr_scheduler module in PyTorch provides a variety of strategies to adjust the learning rate dynamically during training.


### Common Learning Rate Schedulers in PyTorch
>- LambdaLR
>- StepLR
>- MultiStepLR
>- ExponentialLR
>- ReduceLROnPlateau
>- CosineAnnealingLR

## How to Use lr_scheduler
>  Learning rate scheduling should be applied after the optimizer’s update, i.e. call scheduler.step() after optimizer.step().
>
    1- Define Model and Optimizer
    2- Initialize the Scheduler
    3- Train :
    
        for epoch in range(num_epochs):
             for x, label in train_loader:
                 optimizer.zero_grad()
                 predictions = model(x)
                 loss = criterion(predictions, label)
                 loss.backward()
                 optimizer.step()
             scheduler.step()



In [None]:
import torch
import torch.nn as nn 
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torch.utils.data as data
import torch.nn.functional as F
import torchvision.models as models
from torch.optim import lr_scheduler


In [None]:

# Compute device: CUDA when a GPU is visible to PyTorch, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Mean / standard deviation handed to transforms.Normalize (single channel).
mean = 0.13066048920154572
std = 0.30810779333114624

# Mini-batch size shared by the training and testing loaders.
batch_size = 128

# Training pipeline: small random rotation and padded random crop as
# augmentation, followed by tensor conversion and normalisation.
train_transforms = transforms.Compose(
    [
        transforms.RandomRotation(5),
        transforms.RandomCrop(28, padding=2),
        transforms.ToTensor(),
        transforms.Normalize(mean=[mean], std=[std]),
    ]
)

# Testing pipeline: no augmentation, only tensor conversion and normalisation.
test_transforms = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=[mean], std=[std]),
    ]
)

# Training split; download=True requests the files if they are missing.
train_data = datasets.MNIST(
    root='.data',
    train=True,
    download=True,
    transform=train_transforms,
)

# Testing split; download=False expects the files to already be under root
# (the training-split call above is the one that requests the download).
test_data = datasets.MNIST(
    root='.data',
    train=False,
    download=False,
    transform=test_transforms,
)

# Loaders: the training data is reshuffled, the test data keeps dataset order.
train_loader = data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_loader = data.DataLoader(test_data, batch_size=batch_size)

In [None]:
class LeNet5(nn.Module):
    """LeNet-5 style classifier with 1 input channel and 10 output scores.

    Two 5x5 convolutions, each followed by ReLU and 2x2 max-pooling, feed
    three fully connected layers. ``fc1`` expects 16*5*5 features, which is
    what a 28x28 input yields: 28 -> conv1 (pad 2) 28 -> pool 14 ->
    conv2 (no pad) 10 -> pool 5.
    """

    def __init__(self):
        super().__init__()
        # Feature extractor.
        self.conv1 = nn.Conv2d(1, 6, kernel_size=5, stride=1, padding=2)
        self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1, padding=0)
        # Classifier head.
        self.fc1 = nn.Linear(16 * 5 * 5, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for a batch ``x``."""
        # conv -> ReLU -> 2x2 max-pool, once per convolution.
        for conv in (self.conv1, self.conv2):
            x = F.max_pool2d(F.relu(conv(x)), kernel_size=2, stride=2)

        # Flatten to rows of fc1.in_features (= 16*5*5) values.
        x = x.view(-1, self.fc1.in_features)

        # Hidden linear layers use ReLU; the output layer stays linear.
        for hidden in (self.fc1, self.fc2):
            x = F.relu(hidden(x))
        return self.fc3(x)

In [None]:
import tqdm
def train_epoch(iterator, model, optimizer, criterion, device):
    """Run one training pass over ``iterator``.

    Args:
        iterator: Iterable yielding ``(images, labels)`` batches.
        model: Module to train; it is switched to training mode.
        optimizer: Optimizer stepped once per batch.
        criterion: Callable computing the loss from ``(predictions, labels)``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        tuple: ``(losses, lrs)`` — two lists with one entry per batch: the
        batch loss as a Python float and the learning rate of the first
        parameter group read right after the optimizer step.
    """
    model.train()
    step_losses, step_lrs = [], []

    progress = tqdm.tqdm(iterator, desc="Training")
    for batch_images, batch_labels in progress:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        # Standard update: clear old gradients, forward, backward, step.
        optimizer.zero_grad()
        batch_loss = criterion(model(batch_images), batch_labels)
        batch_loss.backward()
        optimizer.step()

        step_lrs.append(optimizer.param_groups[0]["lr"])
        step_losses.append(batch_loss.item())

    return step_losses, step_lrs

def train(train_iterator, model, optimizer, scheduler, criterion, device, n_epochs=5):
    """Train ``model`` for ``n_epochs`` epochs, stepping ``scheduler`` per epoch.

    Args:
        train_iterator: Iterable of training batches, passed to ``train_epoch``.
        model: Module being trained.
        optimizer: Optimizer updated inside ``train_epoch``.
        scheduler: LR scheduler; ``step()`` is called once after every epoch.
        criterion: Loss function forwarded to ``train_epoch``.
        device: Device forwarded to ``train_epoch``.
        n_epochs: Number of passes over ``train_iterator``.

    Returns:
        tuple: ``(train_losses, lrs)`` — the per-batch losses and learning
        rates of all epochs, concatenated in order.
    """
    all_losses, all_lrs = [], []

    for epoch_number in range(1, n_epochs + 1):
        losses, rates = train_epoch(train_iterator, model, optimizer, criterion, device)
        all_losses += losses
        all_lrs += rates

        # Report the rate used during this epoch, then advance the schedule.
        print(f"Epoch {epoch_number}/{n_epochs}", 'lr: ', scheduler.get_last_lr())
        scheduler.step()

    return all_losses, all_lrs

def plot_graphs(value_list, labels, title=None, ymin=0, ymax=None, figsize=(15,5),
                xlabel='Update Steps', ylabel='Loss'):
    """Plot several series (losses, learning rates, ...) on one axis.

    Args:
        value_list: Sequence of series; each one is drawn as a separate line.
        labels: Legend label for each series, paired with ``value_list`` in
            order (pairing stops at the shorter of the two).
        title: Optional axis title.
        ymin: Lower y-axis limit.
        ymax: Upper y-axis limit; ``None`` leaves it to matplotlib.
        figsize: Figure size passed to ``plt.subplots``.
        xlabel: X-axis label. Defaults to the previously hard-coded
            ``'Update Steps'``.
        ylabel: Y-axis label. Defaults to the previously hard-coded
            ``'Loss'``; pass e.g. ``'Learning Rate'`` when plotting rates.
    """
    fig, ax = plt.subplots(figsize=figsize)
    for value, label in zip(value_list, labels):
        ax.plot(value, label=label)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel(xlabel)
    # bottom/top are the canonical names of the ymin/ymax limits.
    ax.set_ylim(bottom=ymin, top=ymax)
    ax.grid()
    ax.legend(loc='upper right')

def test_model(model, test_loader, device):
    # Set the model to evaluation mode to turn off dropout, batch normalization, etc.
    model.eval()
 
    # Initialize counters for overall accuracy
    n_correct = 0
    n_samples = 0
    
    # Initialize counters for class-wise accuracy
    n_class_correct = [0 for _ in range(10)]
    n_class_samples = [0 for _ in range(10)]
    
    # Define the class names for easier readability
    classes = ('0' ,'1', '2', '3', '4', '5', '6', '7', '8', '9' )
    
    # Disable gradient calculation since we are in inference mode
    with torch.no_grad():
        # Iterate over the test data loader
        for images, labels in test_loader:
            # Move the images and labels to the specified device (CPU or GPU)
            images = images.to(device)
            labels = labels.to(device)
            
            # Perform the forward pass to get model predictions
            outputs = model(images)
            
            # Get the predicted class by taking the index with the highest score
            _, predicted = torch.max(outputs, 1)
            
            # Update the overall sample and correct prediction counters
            n_samples += labels.size(0)
            n_correct += (predicted == labels).sum().item()
            
            # Update the class-wise correct prediction counters
            for i in range(labels.size(0)):
                label = labels[i]
                pred = predicted[i]
                if label == pred:
                    n_class_correct[label] += 1
                n_class_samples[label] += 1
    
    # Calculate and print the overall accuracy
    overall_acc = 100.0 * n_correct / n_samples
    print(f'Overall accuracy: {overall_acc:.2f} %')
    print('-'*25)
    # Calculate and print the accuracy for each class
    for i in range(10):
        if n_class_samples[i] > 0:
            class_acc = 100.0 * n_class_correct[i] / n_class_samples[i]
            print(f'Accuracy of {classes[i]}: {class_acc:.2f} %')
        else:
            print(f'Accuracy of {classes[i]}: No samples')



## **torch.optim.lr_scheduler**

    provides several methods to adjust the learning rate based on the number of epochs.



In [None]:
import torch.optim.lr_scheduler as lr_scheduler

In [None]:
# Create the model on the training device before building the optimizer:
# train_epoch/test_model move every batch to `device`, so a CPU-resident model
# would fail with a device mismatch when CUDA is available.
model = LeNet5().to(device)
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()
# SGD with momentum; the schedulers below rescale this base learning rate.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

    torch.optim.SGD(params, lr=0.001, momentum=0, dampening=0, weight_decay=0, nesterov=False, *, maximize=False, foreach=None, differentiable=False)

In [None]:
# Display the optimizer's configuration (notebook cell output).
optimizer

In [None]:
# Current learning rate of the first parameter group.
optimizer.param_groups[0]['lr']

In [None]:
# Momentum setting of the first parameter group.
optimizer.param_groups[0]['momentum']

# 1. LambdaLR

    Uses a user-defined lambda function to adjust the learning rate.


#### Parameters

>- optimizer (Optimizer) – Wrapped optimizer.
>- lr_lambda (function or list) – A function which computes a multiplicative factor given an integer parameter epoch, or a list of such functions, one for each group in optimizer.param_groups.
>- last_epoch (int) – The index of last epoch. Default: -1.


In [None]:
def lambda_fn(epoch):
    """Return the LR multiplier for ``epoch``: 0.65 raised to the epoch index."""
    return 0.65 ** epoch


# The base learning rate is multiplied by 0.65 ** epoch, i.e. it shrinks by a
# factor of 0.65 every epoch. Starting from a learning rate of 0.1:
# Epoch 1: Learning Rate: 0.1
# Epoch 2: Learning Rate: 0.065
# Epoch 3: Learning Rate: 0.04225
# Epoch 4: Learning Rate: 0.0274625
# Epoch 5: Learning Rate: 0.017850625

LambdaLR_scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda_fn)
LambdaLR_loss, LambdaLR_lrs = train(
    train_loader,
    model=model,
    optimizer=optimizer,
    scheduler=LambdaLR_scheduler,
    criterion=criterion,
    device=device,
    n_epochs=5,
)

In [None]:
# Per-batch training loss recorded during the LambdaLR run.
plt.plot( LambdaLR_loss)

In [None]:
# Learning rate recorded at every update step of the LambdaLR run.
plt.plot( LambdaLR_lrs)

In [None]:
# Overlay learning rate and loss on one axis; the y-axis is capped at 0.4.
plot_graphs( [LambdaLR_lrs,LambdaLR_loss],['LambdaLR_lrs','LambdaLR_loss'],ymax=0.4)

In [None]:
# Print overall and per-class test accuracy of the LambdaLR-trained model.
test_model(model, test_loader, device=device)

In [None]:
# Last learning rate computed by the scheduler; train() steps it after every
# epoch, so this is the rate a further epoch would use.
LambdaLR_scheduler.get_last_lr()

# 2. StepLR

Decreases the learning rate by a factor every few epochs.

    torch.optim.lr_scheduler.StepLR(optimizer, step_size, gamma=0.1, last_epoch=-1, verbose=False)

#### Parameters

>- optimizer (Optimizer) – Wrapped optimizer.
>- step_size (int) – Period of learning rate decay.
>- gamma (float) – Multiplicative factor of learning rate decay. Default: 0.1.
>- last_epoch (int) – The index of last epoch. Default: -1.


#### step_size:
>It defines the number of epochs after which the learning rate will be reduced. In the example below, it's set to 2 epochs.

#### gamma: 
>This is the factor by which the learning rate will be multiplied. In the example below, the learning rate is multiplied by 0.1 every 2 epochs.


In [None]:
# Fresh model for this experiment, created on the training device before the
# optimizer is built: train_epoch/test_model move every batch to `device`, so
# a CPU-resident model would fail with a device mismatch when CUDA is available.
model = LeNet5().to(device)
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

# Multiply the learning rate by 0.1 every 2 epochs.
steplr_scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
steplr_loss, steplr_lrs = train(
    train_loader,
    model=model,
    optimizer=optimizer,
    scheduler=steplr_scheduler,
    criterion=criterion,
    device=device,
    n_epochs=5,
)

In [None]:
# Per-batch training loss recorded during the StepLR run.
plt.plot(steplr_loss)

In [None]:
# Learning rate recorded at every update step of the StepLR run.
plt.plot(steplr_lrs)

In [None]:
# Overlay learning rate and loss of the StepLR run on one axis.
plot_graphs( [steplr_lrs,steplr_loss],['steplr_lrs','steplr_loss'])

In [None]:
# Print overall and per-class test accuracy of the StepLR-trained model.
test_model(model, test_loader, device=device)


In [None]:
# Last learning rate computed by the StepLR scheduler (after the final step()).
steplr_scheduler.get_last_lr()

# 3. MultiStepLR

Decreases the learning rate at specific epochs.


    torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones, gamma=0.1, last_epoch=-1)

#### Parameters
> - optimizer (Optimizer) – Wrapped optimizer.
> - milestones (list) – List of epoch indices. Must be increasing.
> - gamma (float) – Multiplicative factor of learning rate decay. Default: 0.1.
> - last_epoch (int) – The index of last epoch. Default: -1.

In [None]:
# Fresh model for this experiment, created on the training device before the
# optimizer is built: train_epoch/test_model move every batch to `device`, so
# a CPU-resident model would fail with a device mismatch when CUDA is available.
model = LeNet5().to(device)
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

# Multiply the learning rate by 0.1 when the epoch index reaches 2, 3 and 4.
MultiStepLR_scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=[2, 3, 4], gamma=0.1)
MultiStepLR_scheduler_loss, MultiStepLR_scheduler_lrs = train(
    train_loader,
    model=model,
    optimizer=optimizer,
    scheduler=MultiStepLR_scheduler,
    criterion=criterion,
    device=device,
    n_epochs=5,
)

In [None]:
# Per-batch training loss recorded during the MultiStepLR run.
plt.plot(MultiStepLR_scheduler_loss)

In [None]:
# Learning rate recorded at every update step of the MultiStepLR run.
plt.plot(MultiStepLR_scheduler_lrs)

In [None]:
# Overlay learning rate and loss of the MultiStepLR run on one axis.
plot_graphs( [MultiStepLR_scheduler_lrs,MultiStepLR_scheduler_loss],
            ['MultiStepLR_scheduler_lrs','MultiStepLR_scheduler_loss'])

In [None]:
# Print overall and per-class test accuracy of the MultiStepLR-trained model.
test_model(model, test_loader, device=device)


In [None]:
# Last learning rate computed by the MultiStepLR scheduler (after the final step()).
MultiStepLR_scheduler.get_last_lr()

# 4. ExponentialLR

Decays the learning rate exponentially.

    torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma, last_epoch=-1)
    
#### Parameters
>- optimizer (Optimizer) – Wrapped optimizer.
>- gamma (float) – Multiplicative factor of learning rate decay.
>- last_epoch (int) – The index of last epoch. Default: -1.


In [None]:
# Fresh model for this experiment, created on the training device before the
# optimizer is built: train_epoch/test_model move every batch to `device`, so
# a CPU-resident model would fail with a device mismatch when CUDA is available.
model = LeNet5().to(device)
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)

# Multiply the learning rate by gamma = 0.1 after every epoch.
ExponentialLR_scheduler = lr_scheduler.ExponentialLR(optimizer, gamma=0.1)
ExponentialLR_loss, ExponentialLR_lrs = train(
    train_loader,
    model=model,
    optimizer=optimizer,
    scheduler=ExponentialLR_scheduler,
    criterion=criterion,
    device=device,
    n_epochs=5,
)

In [None]:
# Per-batch training loss recorded during the ExponentialLR run.
plt.plot(ExponentialLR_loss)

In [None]:
# Learning rate recorded at every update step of the ExponentialLR run.
plt.plot(ExponentialLR_lrs)

In [None]:
# Overlay learning rate and loss of the ExponentialLR run on one axis.
plot_graphs( [ExponentialLR_lrs,ExponentialLR_loss],['ExponentialLR_lrs','ExponentialLR_loss'])

In [None]:
# Print overall and per-class test accuracy of the ExponentialLR-trained model.
test_model(model, test_loader, device=device)


In [None]:
# Last learning rate computed by the ExponentialLR scheduler (after the final step()).
ExponentialLR_scheduler.get_last_lr()

In [None]:
# Compare the per-batch training loss of the four scheduler experiments.
plot_graphs([ LambdaLR_loss , steplr_loss , MultiStepLR_scheduler_loss , ExponentialLR_loss,], 
            ['LambdaLR_loss','steplr_loss','MultiStepLR_scheduler_loss','ExponentialLR_loss'] )

In [None]:
# Compare the learning-rate schedules of the four experiments.
# NOTE: plot_graphs labels the y-axis 'Loss' here, although these series are
# learning rates.
plot_graphs([LambdaLR_lrs , steplr_lrs , MultiStepLR_scheduler_lrs , ExponentialLR_lrs],
           ['LambdaLR_lrs','steplr_lrs','MultiStepLR_scheduler_lrs','ExponentialLR_lrs'])

In [None]:
# Learning rate each scheduler last computed, i.e. the rate a further (next)
# epoch would use, since train() calls scheduler.step() after every epoch.
print(LambdaLR_scheduler.get_last_lr() , steplr_scheduler.get_last_lr() , MultiStepLR_scheduler.get_last_lr(), ExponentialLR_scheduler.get_last_lr())      

 # Using ConvNet for CIFAR-10

In [None]:
import torchvision

# Compute device: CUDA when a GPU is visible to PyTorch, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters
batch_size = 128

# Training pipeline: random flip and padded random crop as augmentation,
# then tensor conversion and per-channel normalisation with mean/std 0.5.
transform_train2 = transforms.Compose(
    [
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32, padding=4),
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# Testing pipeline: tensor conversion and the same normalisation only.
transform_test2 = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
)

# CIFAR-10 splits; only the training-split call requests a download.
train_dataset2 = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
    transform=transform_train2,
)
test_dataset2 = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=False,
    transform=transform_test2,
)

# Loaders: the training data is reshuffled, the test data keeps dataset order.
train_loader2 = torch.utils.data.DataLoader(
    train_dataset2,
    batch_size=batch_size,
    shuffle=True,
)
test_loader2 = torch.utils.data.DataLoader(
    test_dataset2,
    batch_size=batch_size,
    shuffle=False,
)


In [None]:
# Define the  CNN with Batch Norm
class CNN_BN(nn.Module):
    """Three conv + batch-norm stages followed by two linear layers (10 outputs).

    Spatial size per stage for a 32 x 32 input, using
    out = (in - filter + 2 * padding) / stride + 1:

        conv1: (32 - 3 + 2) / 1 + 1 = 32    pool: (32 - 2 + 0) / 2 + 1 = 16
        conv2: (16 - 3 + 2) / 1 + 1 = 16    pool: (16 - 2 + 0) / 2 + 1 = 8
        conv3: ( 8 - 3 + 2) / 1 + 1 = 8     pool: ( 8 - 2 + 0) / 2 + 1 = 4

    so the last stage produces 128 feature maps of size 4 x 4.
    """

    def __init__(self):
        super().__init__()
        # Stage 1: 3 -> 32 channels.
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        # Stage 2: 32 -> 64 channels.
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        # Stage 3: 64 -> 128 channels.
        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        # Classifier head on the flattened 128 x 4 x 4 features.
        self.fc1 = nn.Linear(128 * 4 * 4, 128)
        self.fc2 = nn.Linear(128, 10)
        # 2x2 max-pool shared by all three stages.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Return the raw (un-normalised) class scores for a batch ``x``."""
        stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
        )
        # conv -> batch norm -> ReLU -> pool, once per stage.
        for conv, norm in stages:
            x = self.pool(F.relu(norm(conv(x))))

        # Flatten to rows of fc1.in_features (= 128*4*4) values.
        x = x.view(-1, self.fc1.in_features)
        return self.fc2(F.relu(self.fc1(x)))
 

In [None]:
# Hyperparameters for the CIFAR-10 experiment.
num_epochs = 25
learning_rate = 0.1

# Model on the training device, loss, and SGD-with-momentum optimizer.
cnn_model = CNN_BN().to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(
    cnn_model.parameters(),
    lr=learning_rate,
    momentum=0.9,
)

In [None]:
# StepLR with step_size=5 and gamma=0.5: the rate is halved every 5 epochs.
# With the learning rate of 0.1 and the 25 epochs configured for this run:
# epochs  1-5       6-10      11-15      16-20        21-25
# lr      0.1  -->  0.05 -->  0.025 -->  0.0125  -->  0.00625
# (after the final scheduler.step() the next rate would be 0.003125)

steplr_scheduler = lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)
steplr_scheduler_loss, steplr_scheduler_lrs = train(
    train_loader2,
    model=cnn_model,
    optimizer=optimizer,
    scheduler=steplr_scheduler,
    criterion=criterion,
    device=device,
    n_epochs=num_epochs,
)

In [None]:
# Per-batch training loss recorded during the CIFAR-10 StepLR run.
plt.plot(steplr_scheduler_loss)

In [None]:
# Learning rate recorded at every update step of the CIFAR-10 StepLR run.
plt.plot(steplr_scheduler_lrs)

In [None]:
# Overlay learning rate and loss of the CIFAR-10 StepLR run on one axis.
plot_graphs([steplr_scheduler_lrs , steplr_scheduler_loss ], 
           ['steplr_scheduler_lrs','steplr_scheduler_loss'])

In [None]:
# Print overall and per-class accuracy of the CNN on the CIFAR-10 test loader.
# NOTE: test_model prints the class names as the indices '0'..'9'.
test_model(cnn_model, test_loader2, device=device)


In [None]:
# optimizer.param_groups

In [None]:
# Display the optimizer's configuration after training (notebook cell output).
optimizer

In [None]:
# Learning rate of the first parameter group after the scheduled training run.
optimizer.param_groups[0]["lr"]