---
---
### Deep Learning Practice (series 1)
##### Mostafa Shahbazi Dill - id: 40252521602
##### 2024-March-01
---
---

#### Importing Required Libraries

In [30]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import madgrad

---
---
### 1. Dataset Details

In [31]:
# Define transform to convert images to tensors
transform = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize((0.5,), (0.5,))]
)

# Load MNIST dataset
train_dataset = torchvision.datasets.MNIST(
    root="./data", train=True, transform=transform, download=True
)
test_dataset = torchvision.datasets.MNIST(
    root="./data", train=False, transform=transform, download=True
)

# Define data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Compute the number of input nodes
input_nodes = 28 * 28  # MNIST images are 28x28 pixels

# Compute the number of output nodes
output_nodes = 10  # Since there are 10 classes (digits 0-9)

---
### 1.1
##### MLP model with two hidden layers

In [48]:
# class MLP(nn.Module):
#     def __init__(
#         self, input_size, hidden_size1, hidden_size2, output_size, dropout_rate
#     ):
#         super(MLP, self).__init__()
#         self.fc1 = nn.Linear(input_size, hidden_size1)
#         self.fc2 = nn.Linear(hidden_size1, hidden_size2)
#         self.fc3 = nn.Linear(hidden_size2, output_size)
#         self.dropout = nn.Dropout(dropout_rate)

#     def forward(self, x):
#         x = x.view(x.size(0), -1)  # Flatten the input images
#         x = torch.relu(self.fc1(x))
#         x = self.dropout(x)
#         x = torch.relu(self.fc2(x))
#         x = self.dropout(x)
#         x = self.fc3(x)
#         return x


class MLP(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size1,
        hidden_size2,
        output_size,
        dropout_rate,
        activation,
    ):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)
        self.dropout = nn.Dropout(dropout_rate)
        self.activation = activation

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input images

        # x = torch.relu(self.fc1(x)) # default activation function
        x = self.activation(self.fc1(x))
        x = self.dropout(x)

        x = self.activation(self.fc2(x))
        # x = torch.relu(self.fc2(x)) # default activation function
        x = self.dropout(x)
        x = self.fc3(x)

        return x

---
### 1.2
##### training loop and evaluation function

In [49]:
def train(model, criterion, optimizer, train_loader, epochs):
    model.train()
    for epoch in range(epochs):
        running_loss = 0.0
        for inputs, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        # print(f"Epoch {epoch+1}, Loss: {running_loss/len(train_loader)}")


def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            outputs = model(inputs)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    accuracy = correct / total
    # print(f"Accuracy on test set: {accuracy}")
    return accuracy

---
---
#### 2.1
##### optimization methods: SGD, Adam, RMSprop, MADGRAD (paper), mirrorMADGRAD (paper)

In [50]:
# Define the model, loss function, and optimization methods
model = MLP(
    input_size=input_nodes,
    hidden_size1=128,
    hidden_size2=64,
    output_size=output_nodes,
    dropout_rate=0.5,
    activation=torch.relu,
)
criterion = nn.CrossEntropyLoss()

# Train with SGD optimizer
optimizer_sgd = optim.SGD(model.parameters(), lr=0.01)
train(model, criterion, optimizer_sgd, train_loader, epochs=5)
accuracy_sgd = evaluate(model, test_loader)

# Train with Adam optimizer
model = MLP(
    input_size=input_nodes,
    hidden_size1=128,
    hidden_size2=64,
    output_size=output_nodes,
    dropout_rate=0.5,
    activation=torch.relu,
)
optimizer_adam = optim.Adam(model.parameters(), lr=0.001)
train(model, criterion, optimizer_adam, train_loader, epochs=5)
accuracy_adam = evaluate(model, test_loader)

# Train with RMSprop optimizer
model = MLP(
    input_size=input_nodes,
    hidden_size1=128,
    hidden_size2=64,
    output_size=output_nodes,
    dropout_rate=0.5,
    activation=torch.relu,
)
optimizer_rmsprop = optim.RMSprop(model.parameters(), lr=0.001)
train(model, criterion, optimizer_rmsprop, train_loader, epochs=5)
accuracy_rmsprop = evaluate(model, test_loader)

# Train with MADGRAD optimizer (our new optimizer according to the paper)
model = MLP(
    input_size=input_nodes,
    hidden_size1=128,
    hidden_size2=64,
    output_size=output_nodes,
    dropout_rate=0.5,
    activation=torch.relu,
)
optimizer_madgrad = madgrad.MADGRAD(
    params=model.parameters(), decouple_decay=True, lr=0.001
)
train(model, criterion, optimizer_madgrad, train_loader, epochs=5)
accuracy_madgrad = evaluate(model, test_loader)

# Train with MirrorMADGRAD optimizer
model = MLP(
    input_size=input_nodes,
    hidden_size1=128,
    hidden_size2=64,
    output_size=output_nodes,
    dropout_rate=0.5,
    activation=torch.relu,
)
optimizer_mirrormadgrad = madgrad.MirrorMADGRAD(
    params=model.parameters(), decouple_decay=True, lr=0.001
)
train(model, criterion, optimizer_mirrormadgrad, train_loader, epochs=5)
accuracy_mirrormadgrad = evaluate(model, test_loader)


print(f">>(SGD)<<Accuracy: {accuracy_sgd}")
print(f">>(Adam)<<Accuracy: {accuracy_adam}")
print(f">>(RMSprop)<<Accuracy: {accuracy_rmsprop}")
print(f">>(MADGRAD - paper op)<<Accuracy: {accuracy_madgrad}")
print(f">>(MirrorMADGRAD - paper op)<<Accuracy: {accuracy_mirrormadgrad}")

>>(SGD)<<Accuracy: 0.9186
>>(Adam)<<Accuracy: 0.9397
>>(RMSprop)<<Accuracy: 0.9349
==(MADGRAD - paper op)==Accuracy: 0.9247
--(MirrorMADGRAD - paper op)--Accuracy: 0.9399


---
---
### 3.1
##### evaluate different activation functions

In [22]:
class MLP(nn.Module):
    def __init__(
        self,
        input_size,
        hidden_size1,
        hidden_size2,
        output_size,
        dropout_rate=None,
        activation=None,
    ):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.fc3 = nn.Linear(hidden_size2, output_size)
        if dropout_rate: self.dropout = nn.Dropout(dropout_rate)
        self.activation = activation

    def forward(self, x):
        x = x.view(x.size(0), -1)  # Flatten the input images
        if self.activation: x = self.activation(self.fc1(x))
        if self.dropout: x = self.dropout(x)
        if self.activation: x = self.activation(self.fc2(x))
        if self.dropout: x = self.dropout(x)
        x = self.fc3(x)
        return x

---
### 3.2
##### activation functions: ReLU, Sigmoid, Softmax

In [23]:
# Define activation functions
activations = [torch.relu, torch.sigmoid, torch.softmax]

# Evaluate the model with different activation functions
for activation in activations:
    model = MLP(
        input_nodes, 128, 64, output_nodes, dropout_rate=0.5, activation=activation
    )

    optimizer = optim.Adam(model.parameters(), lr=0.001)
    train(model, criterion, optimizer, train_loader, epochs=5)
    accuracy = evaluate(model, test_loader)
    print(f"Accuracy with {activation.__name__}: {accuracy}")

Accuracy on test set: 0.9332
Accuracy with relu: 0.9332
Accuracy on test set: 0.9434
Accuracy with sigmoid: 0.9434
Accuracy on test set: 0.9403
Accuracy with tanh: 0.9403


---
---
### data augmentation

In [None]:
# Augmentation transforms
augmentation_transform = transforms.Compose(
    [
        transforms.RandomRotation(10),
        transforms.RandomHorizontalFlip(),
        transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1),
        transforms.ToTensor(),
        transforms.Normalize((0.5,), (0.5,)),
    ]
)

# Augmented dataset
augmented_train_dataset = torchvision.datasets.MNIST(
    root="./data", train=True, transform=augmentation_transform, download=True
)
augmented_train_loader = DataLoader(
    augmented_train_dataset, batch_size=64, shuffle=True
)

# Evaluate the model with augmented data
model = MLP(input_nodes, 128, 64, output_nodes, dropout_rate=0.5)
optimizer = optim.Adam(model.parameters(), lr=0.001)
train(model, criterion, optimizer, augmented_train_loader, epochs=5)
accuracy_augmented = evaluate(model, test_loader)
print(f"Accuracy with data augmentation: {accuracy_augmented}")

Epoch 1, Loss: 1.0567214161729508
Epoch 2, Loss: 0.7946562373688989
Epoch 3, Loss: 0.7316161442095283
Epoch 4, Loss: 0.7116706320154134
Epoch 5, Loss: 0.6789353833650984
Accuracy on test set: 0.8722
Accuracy with data augmentation: 0.8722
