# HW2
## Madusanka Madiligama

In [1]:
%matplotlib inline

import torch
import torchvision
from torch import nn

import numpy 
import matplotlib.pyplot as plt
import time

In [2]:
training_data = torchvision.datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor()
)

test_data = torchvision.datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor()
)

In [3]:
#functions

def train_one_epoch(dataloader, model, loss_fn, optimizer):
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # forward pass
        pred = model(X)
        loss = loss_fn(pred, y) # X is input (images), y is label (0-9)
        
        # backward pass calculates gradients
        loss.backward()
        
        # take one step with these gradients
        optimizer.step()
        
        # resets the gradients 
        optimizer.zero_grad()

def evaluate(dataloader, model, loss_fn):
    # Set the model to evaluation mode - some NN pieces behave differently during training
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    loss, correct = 0, 0

    # We can save computation and memory by not calculating gradients here - we aren't optimizing 
    with torch.no_grad():
        # loop over all of the batches
        for X, y in dataloader:
            pred = model(X)
            loss += loss_fn(pred, y).item()
            # how many are correct in this batch? Tracking for accuracy 
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    loss /= num_batches
    correct /= size
    
    accuracy = 100*correct
    return accuracy, loss

def show_failures(model, dataloader, maxtoshow=10):
    model.eval()
    batch = next(iter(dataloader))
    predictions = model(batch[0])
    
    rounded = predictions.argmax(1) #dimensions=1
    errors = rounded!=batch[1] #X, y so y = label
    print('Showing max', maxtoshow, 'first failures. '
          'The predicted class is shown first and the correct class in parentheses.')
    ii = 0
    plt.figure(figsize=(maxtoshow, 1))
    for i in range(batch[0].shape[0]):
        if ii>=maxtoshow:
            break
        if errors[i]:
            plt.subplot(1, maxtoshow, ii+1)
            plt.axis('off')
            plt.imshow(batch[0][i,0,:,:], cmap="gray")
            plt.title("%d (%d)" % (rounded[i], batch[1][i]))
            ii = ii + 1

class NonlinearClassifier(nn.Module):

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.layers_stack = nn.Sequential(
            nn.Linear(28*28, 50),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(50, 50),
            nn.ReLU(),
           # nn.Dropout(0.2),
            nn.Linear(50, 50),
            nn.ReLU(),
           # nn.Dropout(0.2),
            nn.Linear(50, 10)
        )


        
    def forward(self, x):
        x = self.flatten(x)
        x = self.layers_stack(x)

        return x

class LinearClassifier(nn.Module):

    def __init__(self):
        super().__init__()
        
        # First, we need to convert the input image to a vector by using 
        # nn.Flatten(). For MNIST, it means the second dimension 28*28 becomes 784.
        self.flatten = nn.Flatten()
        
        # Here, we add a fully connected ("dense") layer that has 28 x 28 = 784 input nodes 
        #(one for each pixel in the input image) and 10 output nodes (for probabilities of each class).
        self.layer_1 = nn.Linear(28*28, 10)
        
    def forward(self, x):

        x = self.flatten(x)
        x = self.layer_1(x)

        return x

##  effect of the batch size

In [4]:
%%time
batch_sizes = [32, 64, 128, 256, 512]
    # batch_size = 128
for batch_size in batch_sizes: 

    train_size = int(0.8 * len(training_data))  # 80% for training
    val_size = len(training_data) - train_size  # Remaining 20% for validation
    training_data, validation_data = torch.utils.data.random_split(training_data, [train_size, val_size], generator=torch.Generator().manual_seed(55))
    

    print(f'*************** Batch size = {batch_size} ********************') 
    nonlinear_model = NonlinearClassifier()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=0.05)
    
    # The dataloader makes our dataset iterable 
    train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
    val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)
    
    
    
    epochs = 5
    train_acc_all = []
    val_acc_all = []
    for j in range(epochs):
        train_one_epoch(train_dataloader, nonlinear_model, loss_fn, optimizer)
        
        # checking on the training loss and accuracy once per epoch
        acc, loss = evaluate(train_dataloader, nonlinear_model, loss_fn)
        train_acc_all.append(acc)
        print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")
        
        # checking on the validation loss and accuracy once per epoch
        val_acc, val_loss = evaluate(val_dataloader, nonlinear_model, loss_fn)
        val_acc_all.append(val_acc)
        print(f"Epoch {j}: val. loss: {val_loss}, val. accuracy: {val_acc}")

    #finally, evaluate how it performs against the test data: 
    batch_size_test = 256
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size_test)
    acc_test, loss_test = evaluate(test_dataloader, nonlinear_model, loss_fn)
    print("Test loss: %.4f, test accuracy: %.2f%%" % (loss_test, acc_test))

*************** Batch size = 32 ********************
Epoch 0: training loss: 0.28353117652485765, accuracy: 91.75
Epoch 0: val. loss: 0.2770179298122724, val. accuracy: 91.7
Epoch 1: training loss: 0.17416470660548658, accuracy: 94.87916666666666
Epoch 1: val. loss: 0.17754967229564986, val. accuracy: 94.69166666666666
Epoch 2: training loss: 0.13862590005248784, accuracy: 95.91041666666666
Epoch 2: val. loss: 0.1511280350536108, val. accuracy: 95.5
Epoch 3: training loss: 0.1214609627610383, accuracy: 96.44375
Epoch 3: val. loss: 0.13653348149359226, val. accuracy: 95.85833333333333
Epoch 4: training loss: 0.10653850860536719, accuracy: 96.82916666666667
Epoch 4: val. loss: 0.12564062214270233, val. accuracy: 96.31666666666666
Test loss: 0.1240, test accuracy: 95.94%
*************** Batch size = 64 ********************
Epoch 0: training loss: 0.4805332188308239, accuracy: 87.14322916666667
Epoch 0: val. loss: 0.48688223520914714, val. accuracy: 86.77083333333333
Epoch 1: training loss

The relationship between batch size and test accuracy is indeed significant. Smaller batch sizes tend to yield higher final test accuracies, as seen in various trials where smaller batches like 32 achieved test accuracies above 95%, while larger batches like 512 often resulted in lower accuracies around 58%. However, training with smaller batch sizes increases the computational burden, leading to longer training times due to the higher number of updates required during the training process. To balance between accuracy and training efficiency, a moderate batch size in the range of 64 to 128 is often recommended. This range allows for effective learning dynamics while ensuring that the computational resource demands remain manageable. Adopting this strategy facilitates achieving high test accuracy without disproportionately extending training durations or complicating resource management. Ultimately, selecting an optimal batch size is not just about accuracy alone but also about finding a practical approach that suits the dataset and model being trained. This nuanced approach can lead to more robust and efficient training outcomes.

## effect of the learning rates

In [5]:
%%time
learning_rates = [0.01, 0.05, 0.1, 0.2]
    # batch_size = 128
for lr_ in learning_rates: 
    
    train_size = int(0.8 * len(training_data))  # 80% for training
    val_size = len(training_data) - train_size  # Remaining 20% for validation
    # print(f'train_size: {train_size}')
    # print(f'val_size: {val_size}')
    training_data, validation_data = torch.utils.data.random_split(training_data, [train_size, val_size], generator=torch.Generator().manual_seed(55))
    
    batch_size = 128 #keep constant
    print(f'*************** Learning Rate = {lr_} ********************') 
    nonlinear_model = NonlinearClassifier()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=lr_)
    
    # The dataloader makes our dataset iterable 
    train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
    val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)
    
    
    
    epochs = 5
    train_acc_all = []
    val_acc_all = []
    for j in range(epochs):
        train_one_epoch(train_dataloader, nonlinear_model, loss_fn, optimizer)
        
        # checking on the training loss and accuracy once per epoch
        acc, loss = evaluate(train_dataloader, nonlinear_model, loss_fn)
        train_acc_all.append(acc)
        print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")
        
        # checking on the validation loss and accuracy once per epoch
        val_acc, val_loss = evaluate(val_dataloader, nonlinear_model, loss_fn)
        val_acc_all.append(val_acc)
        print(f"Epoch {j}: val. loss: {val_loss}, val. accuracy: {val_acc}")

    #finally, evaluate how it performs against the test data: 
    batch_size_test = 256
    test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size_test)
    acc_test, loss_test = evaluate(test_dataloader, nonlinear_model, loss_fn)
    print("Test loss: %.4f, test accuracy: %.2f%%" % (loss_test, acc_test))

*************** Learning Rate = 0.01 ********************
Epoch 0: training loss: 2.2981832356956917, accuracy: 9.893184130213632
Epoch 0: val. loss: 2.2991538355427403, val. accuracy: 9.638860630722279
Epoch 1: training loss: 2.286097858010269, accuracy: 10.147507629704986
Epoch 1: val. loss: 2.287155036003359, val. accuracy: 9.893184130213632
Epoch 2: training loss: 2.2684968525801246, accuracy: 17.427517802644964
Epoch 2: val. loss: 2.2698019012328117, val. accuracy: 17.16683621566633
Epoch 3: training loss: 2.237386178194992, accuracy: 20.879959308240082
Epoch 3: val. loss: 2.23907962922127, val. accuracy: 21.337741607324517
Epoch 4: training loss: 2.179336290049359, accuracy: 29.65412004069176
Epoch 4: val. loss: 2.181519023833736, val. accuracy: 29.120040691759918
Test loss: 2.1772, test accuracy: 30.72%
*************** Learning Rate = 0.05 ********************
Epoch 0: training loss: 2.2770631313323975, accuracy: 19.106660308377048
Epoch 0: val. loss: 2.2781411266326903, val. ac

​The learning rate is a critical hyperparameter influencing the performance and quality of a machine learning model.​ As observed, a learning rate of 0.05 resulted in a notable improvement in model accuracy, achieving a final test accuracy of 82.81%, showcasing significant learning progress throughout the epochs. In contrast, a lower learning rate of 0.01 led to very modest accuracies, peaking at just 30.72%, indicating insufficient weight updates for effective learning. In comparison, an excessively high learning rate of 0.2 offered a substantial lift to 90.36% accuracy but posed risks of divergence during training.

It appears that moderate learning rates foster stable convergence better than the extremes. A learning rate of 0.1 also produced a solid performance, reaching an accuracy of 87.09%, suggesting that it effectively balances the need for rapid training updates with maintaining stable convergence. Ideally, a low learning rate hinders learning, while too high risks overshooting optimal weights, resulting in convergence challenges. Thus, selecting an optimal learning rate is paramount for achieving high model accuracy efficiently and effectively.

##  effect of the activation function


In [6]:
%%time

########### Nonlinear Model

train_size = int(0.8 * len(training_data))  # 80% for training
val_size = len(training_data) - train_size  # Remaining 20% for validation
# print(f'train_size: {train_size}')
# print(f'val_size: {val_size}')
training_data, validation_data = torch.utils.data.random_split(training_data, [train_size, val_size], generator=torch.Generator().manual_seed(55))

batch_size = 128 #keep constant
print(f'*************** Nonlinear Model ********************') 
nonlinear_model = NonlinearClassifier()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=0.05)

# The dataloader makes our dataset iterable 
train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)



epochs = 5
train_acc_all = []
val_acc_all = []
for j in range(epochs):
    train_one_epoch(train_dataloader, nonlinear_model, loss_fn, optimizer)
    
    # checking on the training loss and accuracy once per epoch
    acc, loss = evaluate(train_dataloader, nonlinear_model, loss_fn)
    train_acc_all.append(acc)
    print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")
    
    # checking on the validation loss and accuracy once per epoch
    val_acc, val_loss = evaluate(val_dataloader, nonlinear_model, loss_fn)
    val_acc_all.append(val_acc)
    print(f"Epoch {j}: val. loss: {val_loss}, val. accuracy: {val_acc}")

#finally, evaluate how it performs against the test data: 
batch_size_test = 256
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size_test)
acc_test, loss_test = evaluate(test_dataloader, nonlinear_model, loss_fn)
print("Test loss: %.4f, test accuracy: %.2f%%" % (loss_test, acc_test))

############ Linear Model


train_size = int(0.8 * len(training_data))  # 80% for training
val_size = len(training_data) - train_size  # Remaining 20% for validation
# print(f'train_size: {train_size}')
# print(f'val_size: {val_size}')
training_data, validation_data = torch.utils.data.random_split(training_data, [train_size, val_size], generator=torch.Generator().manual_seed(55))

batch_size = 128 #keep constant
print(f'*************** Linear Model ********************') 
linear_model = LinearClassifier()
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(nonlinear_model.parameters(), lr=0.05)

# The dataloader makes our dataset iterable 
train_dataloader = torch.utils.data.DataLoader(training_data, batch_size=batch_size)
val_dataloader = torch.utils.data.DataLoader(validation_data, batch_size=batch_size)

epochs = 5
train_acc_all = []
val_acc_all = []
for j in range(epochs):
    train_one_epoch(train_dataloader, linear_model, loss_fn, optimizer)
    
    # checking on the training loss and accuracy once per epoch
    acc, loss = evaluate(train_dataloader, linear_model, loss_fn)
    train_acc_all.append(acc)
    print(f"Epoch {j}: training loss: {loss}, accuracy: {acc}")
    
    # checking on the validation loss and accuracy once per epoch
    val_acc, val_loss = evaluate(val_dataloader, linear_model, loss_fn)
    val_acc_all.append(val_acc)
    print(f"Epoch {j}: val. loss: {val_loss}, val. accuracy: {val_acc}")
    
#finally, evaluate how it performs against the test data: 
batch_size_test = 256
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size_test)
acc_test, loss_test = evaluate(test_dataloader, linear_model, loss_fn)
print("Test loss: %.4f, test accuracy: %.2f%%" % (loss_test, acc_test))

*************** Nonlinear Model ********************
Epoch 0: training loss: 2.294995850207759, accuracy: 13.538270454898308
Epoch 0: val. loss: 2.294008365044227, val. accuracy: 14.400993171942892
Epoch 1: training loss: 2.279382822560329, accuracy: 21.471821145784816
Epoch 1: val. loss: 2.2795231892512393, val. accuracy: 20.235878336436997
Epoch 2: training loss: 2.240607504751168, accuracy: 28.27200745225897
Epoch 2: val. loss: 2.241924817745502, val. accuracy: 27.063935443823713
Epoch 3: training loss: 2.107277337242575, accuracy: 43.067846607669615
Epoch 3: val. loss: 2.1121602058410645, val. accuracy: 40.285536933581625
Epoch 4: training loss: 1.7365573364145614, accuracy: 44.65145163794442
Epoch 4: val. loss: 1.7549063517497137, val. accuracy: 42.70639354438237
Test loss: 1.7300, test accuracy: 44.28%
*************** Linear Model ********************
Epoch 0: training loss: 2.3350457621783747, accuracy: 7.686335403726709
Epoch 0: val. loss: 2.3404565074227075, val. accuracy: 8.0

​The choice of activation function significantly impacts a model's performance and quality, as demonstrated by the contrasting results between the nonlinear and linear models.​ The nonlinear model exhibited a gradual improvement in training accuracy, reaching 44.65% over five epochs, which indicates its ability to learn complex patterns due to the introduction of non-linearity. In contrast, the linear model struggled to move beyond a mere 7.69% accuracy, demonstrating stagnation across all epochs, a clear indication that the linear activation function failed to capture the underlying complexities in the data.

Activation functions like ReLU or sigmoid in nonlinear models allow for better adaptability during training by facilitating the model to approximate a wider variety of functions. This flexibility enables the network to learn from more intricate relationships within the data and enhances overall performance metrics such as accuracy and loss. On the other hand, employing a linear activation function restricts the model's capacity to learn beyond linear relationships, often resulting in poor convergence and limited accuracy

Bonus: What is a learning rate scheduler?
​A learning rate scheduler is a machine learning technique to adjust an optimizer's learning rate during the training process.​ Instead of using a constant learning rate throughout training, a scheduler dynamically changes the learning rate based on specific criteria, such as the epoch number or training performance. This can help improve model convergence by allowing for larger updates at the beginning of training (to speed up learning) and smaller updates later (to fine-tune the model). Common types of learning rate schedules include constant schedules, exponential decay, step decay, and cyclical learning rates.
