In [26]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
import seaborn as sns
import matplotlib.pyplot as plt

In [27]:
# Download the MNIST dataset
transform = transforms.ToTensor()
train_dataset = datasets.MNIST(root='./data', train=True, download=True,
transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, download=True,
transform=transform)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
batch_size=64, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,
batch_size=64, shuffle=False)

# Code from Exercise 1

In [28]:
class MyNetwork(nn.Module):
    def __init__(self):
        super(MyNetwork, self).__init__()
        self.fc1 = nn.Linear(in_features=28*28, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=10)
        self.relu = nn.ReLU()
    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x) 
        return x

In [29]:
model = MyNetwork()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

num_epochs = 5

In [30]:
losses = []

for epoch in range(num_epochs):
    model.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: 0.9376
Epoch [2/5], Loss: 0.8663
Epoch [3/5], Loss: 0.2909
Epoch [4/5], Loss: 0.4442
Epoch [5/5], Loss: 0.4665


# 1. Regularization:

a. Add L1 regularization to the 2 nd layer (the layer after the input layer)

In [31]:
# hyperparameter that controls the strength of the L1 regularization
def l1_regularization(model, lambda_l1=0.001):
    l1_loss = 0
    # Access the fc2 layer's parameters
    for param in model.fc2.parameters():
        # computes the L1 norm of the fc2 layer’s parameters
        l1_loss += torch.sum(torch.abs(param))
    return lambda_l1 * l1_loss

In [32]:
losses = []

for epoch in range(num_epochs):
    model.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        
        # Add L1 regularization loss
        l1_loss = l1_regularization(model)
        total_loss = loss + l1_loss
        
        total_loss.backward()
        optimizer.step()
        
        losses.append(total_loss.item())
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss.item():.4f}')

Epoch [1/5], Loss: 0.6510
Epoch [2/5], Loss: 0.6608
Epoch [3/5], Loss: 0.6541
Epoch [4/5], Loss: 0.5606
Epoch [5/5], Loss: 0.5737


b. Add L2 regularization instead on the 2 nd layer.

In [33]:
# hyperparameter that controls the strength of the L2 regularization
def l2_regularization(model, lambda_l2=0.001):
    l2_loss = 0
    # Access the fc2 layer's parameters
    for param in model.fc2.parameters():
        # computes the L2 norm (squared) of the fc2 layer’s parameters
        l2_loss += torch.sum(param ** 2)
    return lambda_l2 * l2_loss

In [34]:
num_epochs = 5
losses = []

for epoch in range(num_epochs):
    model.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        
        # Add L2 regularization loss
        l2_loss = l2_regularization(model)
        total_loss = loss + l2_loss
        
        total_loss.backward()
        optimizer.step()
        
        losses.append(total_loss.item())
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {total_loss.item():.4f}')


Epoch [1/5], Loss: 0.1078
Epoch [2/5], Loss: 0.2450
Epoch [3/5], Loss: 0.3691
Epoch [4/5], Loss: 0.2110
Epoch [5/5], Loss: 0.1722


c. What do you observe? (Hint: The lambda value used has a big impact on
performance.)

L2 regularization generally results in lower and more stable training losses compared to L1 regularization, which can cause higher initial losses and less consistent improvement. L2 is more effective at reducing overfitting and improving performance on the training data, while L1 tends to induce sparsity but may not perform as well in this context

Sparsity refers to the presence of many zero or near-zero values in the weight matrix. L1 regularization encourages sparsity by penalizing the absolute values of weights, often leading to some weights being exactly zero. This can result in a more compact model where only a subset of features is used. L2 regularization, on the other hand, penalizes the squared values of weights and typically results in smaller weights overall but does not force them to be zero.

d. What is the purpose of adding regularization?

The purpose of adding regularization is to prevent overfitting by penalizing large weights or complex models, thereby improving the model's ability to generalize to new, unseen data. Regularization helps to balance the trade-off between fitting the training data and maintaining model simplicity.

# 2. Dropout:

a. Add a dropout layer between the first and second layer. What do you
observe?

In [35]:
class MyNetworkWithDropout(nn.Module):
    def __init__(self):
        super(MyNetworkWithDropout, self).__init__()
        self.fc1 = nn.Linear(in_features=28*28, out_features=128)
        self.fc2 = nn.Linear(in_features=128, out_features=64)
        self.fc3 = nn.Linear(in_features=64, out_features=10)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)  # Dropout layer with a 50% dropout rate

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)  # Apply dropout after the first layer
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [36]:
modelWithDropout = MyNetworkWithDropout()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(modelWithDropout.parameters(), lr=0.01)

num_epochs = 5

In [37]:
losses = []

for epoch in range(num_epochs):
    modelWithDropout.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = modelWithDropout(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: 0.9063
Epoch [2/5], Loss: 0.6110
Epoch [3/5], Loss: 0.5800
Epoch [4/5], Loss: 0.3818
Epoch [5/5], Loss: 0.2768


The loss decreases more smoothly and consistently over epochs with dropout compared to previous runs without it, indicating improved training stability and potentially better generalization. Dropout helps to prevent overfitting by regularizing the model, leading to more gradual and reliable reductions in loss.

b. What is the purpose of adding dropout?

The purpose of adding dropout is to prevent overfitting by randomly deactivating a fraction of neurons during training, which forces the network to learn more robust and generalized features. This regularization technique helps improve the model's ability to generalize to unseen data.

# 3. Layers:

a. Experiment with different amount of layers. What do you observe?

Adding More Layers

In [38]:
class DeepNetwork(nn.Module):
    def __init__(self):
        super(DeepNetwork, self).__init__()
        self.fc1 = nn.Linear(in_features=28*28, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=128)
        self.fc3 = nn.Linear(in_features=128, out_features=64)
        self.fc4 = nn.Linear(in_features=64, out_features=32)
        self.fc5 = nn.Linear(in_features=32, out_features=10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.relu(self.fc3(x))
        x = self.relu(self.fc4(x))
        x = self.fc5(x)
        return x

In [39]:
modelDeep = DeepNetwork()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(modelDeep.parameters(), lr=0.01)

num_epochs = 5

In [40]:
losses = []

for epoch in range(num_epochs):
    modelDeep.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = modelDeep(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: 2.2772
Epoch [2/5], Loss: 1.9878
Epoch [3/5], Loss: 0.7202
Epoch [4/5], Loss: 0.2473
Epoch [5/5], Loss: 0.2541


The additional layers in the deep model result in a more variable loss pattern compared to the regular model, with larger fluctuations in loss between epochs, indicating potential overfitting or instability in learning. However, the final loss is lower, suggesting that the increased model capacity may capture more complex features, improving performance on the training data.

Reducing the Number of Layers

In [41]:
class ShallowNetwork(nn.Module):
    def __init__(self):
        super(ShallowNetwork, self).__init__()
        self.fc1 = nn.Linear(in_features=28*28, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x


In [42]:
modelShallow = ShallowNetwork()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(modelShallow.parameters(), lr=0.01)

num_epochs = 5

In [43]:
losses = []

for epoch in range(num_epochs):
    modelShallow.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = modelShallow(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: 0.7293
Epoch [2/5], Loss: 0.6829
Epoch [3/5], Loss: 0.2837
Epoch [4/5], Loss: 0.1445
Epoch [5/5], Loss: 0.3405


The shallow model shows a more stable loss pattern with a final loss that is generally higher compared to both the regular and deeper models. This suggests that the reduced capacity of the shallow model may lead to underfitting, as it struggles to capture complex patterns in the data, resulting in higher training losses.

b. Experiment with different amount of neurons in each layer. What do you
observe?

Increasing the Number of Neurons

In [44]:
class WideNetwork(nn.Module):
    def __init__(self):
        super(WideNetwork, self).__init__()
        self.fc1 = nn.Linear(in_features=28*28, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=256)
        self.fc3 = nn.Linear(in_features=256, out_features=10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [45]:
modelWide = WideNetwork()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(modelWide.parameters(), lr=0.01)

num_epochs = 5

In [46]:
losses = []

for epoch in range(num_epochs):
    modelWide.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = modelWide(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: 0.6400
Epoch [2/5], Loss: 0.4006
Epoch [3/5], Loss: 0.4633
Epoch [4/5], Loss: 0.1802
Epoch [5/5], Loss: 0.2223


The wide model shows more variability in the loss values across epochs compared to the normal model, with higher final loss, indicating potential overfitting or instability in learning. While the wide model has increased capacity, it may not necessarily improve performance on the given task and could require additional regularization.

Decreasing the Number of Neurons

In [47]:
class NarrowNetwork(nn.Module):
    def __init__(self):
        super(NarrowNetwork, self).__init__()
        self.fc1 = nn.Linear(in_features=28*28, out_features=64)
        self.fc2 = nn.Linear(in_features=64, out_features=32)
        self.fc3 = nn.Linear(in_features=32, out_features=10)
        self.relu = nn.ReLU()

    def forward(self, x):
        x = x.view(x.size(0), -1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

In [48]:
modelNarrow = NarrowNetwork()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(modelNarrow.parameters(), lr=0.01)

num_epochs = 5

In [49]:
losses = []

for epoch in range(num_epochs):
    modelNarrow.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = modelNarrow(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: 1.3166
Epoch [2/5], Loss: 0.3643
Epoch [3/5], Loss: 0.3100
Epoch [4/5], Loss: 0.4129
Epoch [5/5], Loss: 0.4244


The narrow model has higher initial and overall loss compared to the normal model, indicating that it may struggle with learning complex patterns due to its limited capacity. Despite a significant drop in loss during training, the final loss values suggest it might still be underfitting and unable to fully capture the data's underlying structure.

# 4. Momentum:

a. Try to add momentum to the SGD optimizer.

In [50]:
model = MyNetwork()


criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

num_epochs = 5

In [51]:
losses = []

for epoch in range(num_epochs):
    model.train()
    for data, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(data)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        losses.append(loss.item())
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/5], Loss: 0.1242
Epoch [2/5], Loss: 0.1112
Epoch [3/5], Loss: 0.3567
Epoch [4/5], Loss: 0.0833
Epoch [5/5], Loss: 0.0528


The model with momentum demonstrates significantly lower loss values and greater stability across epochs compared to the regular model, indicating faster convergence and potentially better training performance. Momentum helps the optimizer to navigate the loss landscape more effectively, reducing fluctuations and achieving a better final loss.

b. Test different values of momentum. What value do you get the highest
accuracy?

In [54]:
# Define a function to train and evaluate the model
def train_and_evaluate(momentum_value):
    model = MyNetwork()
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=momentum_value)
    
    # Training loop
    num_epochs = 5
    for epoch in range(num_epochs):
        model.train()
        for data, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(data)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
    
    # Evaluation loop
    model.eval()
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            outputs = model(images)
            _, predicted = torch.max(outputs, dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    accuracy = 100 * correct / total
    return accuracy

In [55]:
# Test different momentum values
momentum_values = [0.0, 0.2, 0.4, 0.6, 0.8, 0.9]
results = {}

for momentum in momentum_values:
    accuracy = train_and_evaluate(momentum)
    results[momentum] = accuracy
    print(f"Momentum: {momentum}, Accuracy: {accuracy:.2f}%")

# Find the best momentum value
best_momentum = max(results, key=results.get)
print(f"Best momentum value: {best_momentum} with accuracy: {results[best_momentum]:.2f}%")

Momentum: 0.0, Accuracy: 91.44%
Momentum: 0.2, Accuracy: 91.58%
Momentum: 0.4, Accuracy: 93.01%
Momentum: 0.6, Accuracy: 94.04%
Momentum: 0.8, Accuracy: 96.14%
Momentum: 0.9, Accuracy: 97.45%
Best momentum value: 0.9 with accuracy: 97.45%


c. What happens if momentum is too high?

If momentum is too high, the optimizer might move too quickly and overshoot the best weights, causing the loss to fluctuate and making the training unstable. This can prevent the model from converging properly.