# Homework 1. Part 3.3

In [1]:
import matplotlib.pyplot as plt
import torch
from torch import nn
from torchvision import datasets
from torchvision.transforms import ToTensor
import pandas as pd
from torch.utils.data import DataLoader

In [2]:
# Select the compute device: prefer CUDA, then Apple MPS, otherwise the CPU.
device = (
    'cuda' if torch.cuda.is_available()
    else 'mps' if torch.backends.mps.is_available()
    else 'cpu'
)

display(f'{device} is available')

# Floating-point dtype kept as a notebook-level name.
dtype = torch.float
# Tensors and modules created from here on are allocated on `device` by default.
torch.set_default_device(device)

'cuda is available'

### Modifiable model

In [3]:
class ChangeableNetwork(nn.Module):
    """Fully connected classifier with two hidden layers of configurable width.

    The network maps a flattened 28*28 input to 10 logits through
    Linear -> ReLU -> Linear -> ReLU -> Linear.

    Args:
        modification: Number of units in each of the two hidden layers.
    """

    def __init__(self, modification=512):
        super().__init__()
        hidden = modification
        # Layers are created in input-to-output order so weight initialisation
        # consumes the random generator in the same sequence every time.
        stack = [
            nn.Linear(28 * 28, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 10),
        ]
        self.linear_relu = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten every sample (all dims after the batch dim) and return the logits."""
        flattened = x.flatten(start_dim=1)
        return self.linear_relu(flattened)

### Utility Functions

In [4]:
def cnt_model_params(model):
    """Return the total number of scalar elements over all parameters of `model`."""
    return sum(tensor.numel() for tensor in model.parameters())

def display_model_info(model_name, model):
    """Display the model's architecture followed by its parameter count.

    Args:
        model_name: Label printed in front of the parameter count.
        model: The ``nn.Module`` to describe.
    """
    display(model)
    display(f"{model_name}. parameters: {cnt_model_params(model)}")

In [5]:
# Load the MNIST train and test splits as tensors; labels are used exactly as provided.
training_ds = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)
test_ds = datasets.MNIST(
    root='data',
    train=False,
    download=True,
    transform=ToTensor(),
)


### Training & eval loops

In [6]:
def nmist_train_loop(dataloader, model, loss_fn, optimize):
    """Train `model` for one pass over `dataloader`.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that supports ``len()`` and
            exposes the underlying data through ``.dataset``.
        model: Network to train; it is switched to training mode.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.
        optimize: Optimizer that updates the parameters of `model`.

    Returns:
        Tuple ``(train_loss, accuracy)``: the loss averaged over batches and the
        fraction of dataset samples classified correctly. Predictions are the
        ones produced before each optimizer step. Both are 0.0 for an empty loader.
    """
    model.train()
    num_batches = len(dataloader)
    size = len(dataloader.dataset)

    # Run on the device the model lives on; a parameter-free model falls back
    # to the notebook-wide `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    train_loss, correct = 0.0, 0

    for X, y in dataloader:
        # Move inputs AND labels explicitly so the loss and the accuracy
        # comparison never mix devices, whatever the default device is.
        X, y = X.to(target_device), y.to(target_device)

        optimize.zero_grad()
        y_pred = model(X)
        loss = loss_fn(y_pred, y)
        loss.backward()
        optimize.step()

        train_loss += loss.item()
        correct += (y_pred.argmax(dim=1) == y).sum().item()

    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    train_loss /= max(num_batches, 1)
    correct /= max(size, 1)
    return train_loss, correct
        
def nmist_val_loop(dataloader, model, loss_fn):
    """Evaluate `model` on every batch of `dataloader` without updating it.

    Args:
        dataloader: Iterable of ``(X, y)`` batches that supports ``len()`` and
            exposes the underlying data through ``.dataset``.
        model: Network to evaluate; it is switched to evaluation mode.
        loss_fn: Callable ``loss_fn(predictions, targets)`` returning a scalar tensor.

    Returns:
        Tuple ``(test_loss, accuracy)``: the loss averaged over batches and the
        fraction of dataset samples classified correctly. Both are 0.0 for an
        empty loader.
    """
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)

    # Run on the device the model lives on; a parameter-free model falls back
    # to the notebook-wide `device`.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else device

    test_loss, correct = 0.0, 0

    with torch.no_grad():
        for X, y in dataloader:
            # Move inputs AND labels explicitly so the loss and the accuracy
            # comparison never mix devices, whatever the default device is.
            X, y = X.to(target_device), y.to(target_device)
            y_pred = model(X)
            test_loss += loss_fn(y_pred, y).item()
            correct += (y_pred.argmax(dim=1) == y).sum().item()

    # max(..., 1) avoids a ZeroDivisionError when the loader is empty.
    test_loss /= max(num_batches, 1)
    correct /= max(size, 1)
    return test_loss, correct

## Flatness vs. Generalization, part 2

In [7]:
# Build two networks with the same architecture (hidden width 512) and show
# each one's layout and parameter count.
hidden_width = 512
m1 = ChangeableNetwork(hidden_width)
m2 = ChangeableNetwork(hidden_width)
for label, network in enumerate((m1, m2), start=1):
    display_model_info(label, network)

ChangeableNetwork(
  (linear_relu): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

'1. parameters: 669706'

ChangeableNetwork(
  (linear_relu): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)

'2. parameters: 669706'

In [None]:
# hyperparameters
loss_fn = torch.nn.CrossEntropyLoss()
epochs = 15
lr = 1e-3

# The two models are paired with the two batch sizes; all other
# hyperparameters are shared.
models = (m1, m2)
batch_sizes = (64, 2048)

# One record per trained model, turned into a DataFrame once at the end
# instead of growing a frame inside the loop.
fit_columns = ['epoch', 'train_loss', 'val_loss', 'train_acc', 'v_acc', 'parameters', 'batch_size']
fit_records = []

for batch_size, model in zip(batch_sizes, models):
    optimizer1 = torch.optim.SGD(model.parameters(), lr=lr)
    display(f"Training & eval: Model with batch size {batch_size}")

    # Dataloader
    training_dl = DataLoader(training_ds, batch_size=batch_size)
    test_dl = DataLoader(test_ds, batch_size=batch_size)

    for epoch in range(epochs):
        train_loss, t_correct = nmist_train_loop(training_dl, model, loss_fn, optimizer1)
        val_loss, v_correct = nmist_val_loop(test_dl, model, loss_fn)
        display(f'Epoch {epoch}. Training_loss: {train_loss}. Val_loss: {val_loss}. Training_accuracy: {t_correct}. Val_accuracy: {v_correct}')
    display("Done")

    # Keep the metrics of the last epoch for this model.
    fit_records.append([epoch, train_loss, val_loss, t_correct, v_correct, cnt_model_params(model), batch_size])

fit_df = pd.DataFrame(fit_records, columns=fit_columns)

'Training & eval: Model with random labels'

'Epoch 0. Training_loss: 2.2774649550919848. Val_loss: 2.252709628670079. Training_accuracy: 0.3039. Val_accuracy: 0.4813'

'Epoch 1. Training_loss: 2.2230263071527867. Val_loss: 2.1843814910597104. Training_accuracy: 0.5777833333333333. Val_accuracy: 0.6401'

'Epoch 2. Training_loss: 2.135086968509373. Val_loss: 2.0675799254399196. Training_accuracy: 0.6563666666666667. Val_accuracy: 0.68'

'Epoch 3. Training_loss: 1.9832452155633775. Val_loss: 1.8686661173583596. Training_accuracy: 0.6887666666666666. Val_accuracy: 0.7123'

'Epoch 4. Training_loss: 1.74193851487723. Val_loss: 1.5774701097208983. Training_accuracy: 0.7195333333333334. Val_accuracy: 0.7412'

In [None]:
# Saving weights for later
# Persist the weights of both models so they can be reloaded later.
for checkpoint_path, network in (('m1.pth', m1), ('m2.pth', m2)):
    torch.save(network.state_dict(), checkpoint_path)

In [None]:
# Flatten each model into a single parameter vector. detach() drops the
# autograd link to the models so the interpolated vectors are plain data and
# do not keep a computation graph alive.
m1_params = torch.nn.utils.parameters_to_vector(m1.parameters()).detach()
m2_params = torch.nn.utils.parameters_to_vector(m2.parameters()).detach()
alphas = torch.linspace(-2, 2, 20)

# theta(alpha) = (1 - alpha) * theta_1 + alpha * theta_2
theta_params_list = [(alpha, (1 - alpha) * m1_params + alpha * m2_params) for alpha in alphas]

# Show how many interpolation points were built rather than dumping every
# full-size parameter vector into the cell output.
len(theta_params_list)

In [None]:
#theta_model = ChangeableNetwork(512)
#torch.nn.utils.vector_to_parameters(theta_params, theta_model.parameters())

In [None]:
# Training on theta_models
# NOTE(review): each interpolated model is trained for `epochs` epochs before
# its metrics are recorded. If the goal is to probe the loss surface between
# m1 and m2, evaluating without training may be what is wanted — confirm.

loss_fn = torch.nn.CrossEntropyLoss()
epochs = 5
lr = 1e-3
batch_size = 64

theta_columns = ['alpha', 'train_loss', 'val_loss', 't_acc', 'v_acc']
# One record per alpha; the DataFrame is built once after the loop instead of
# being re-concatenated on every iteration.
theta_records = []

# Dataloader (independent of alpha, so built once)
training_dl = DataLoader(training_ds, batch_size=batch_size)
test_dl = DataLoader(test_ds, batch_size=batch_size)

for cur_alpha, theta_params in theta_params_list:

    model = ChangeableNetwork(512)
    # Load the interpolated weights into the freshly built model. The vector is
    # cloned because vector_to_parameters makes the parameters share storage
    # with it; without the clone, the in-place SGD updates would overwrite the
    # entries of theta_params_list and re-running the cell would start from
    # already-trained weights.
    torch.nn.utils.vector_to_parameters(theta_params.detach().clone(), model.parameters())

    optimizer1 = torch.optim.SGD(model.parameters(), lr=lr)
    display(f"Training & eval: Model with alpha: {cur_alpha}")

    for epoch in range(epochs):
        train_loss, t_correct = nmist_train_loop(training_dl, model, loss_fn, optimizer1)
        val_loss, v_correct = nmist_val_loop(test_dl, model, loss_fn)
        display(f'Epoch {epoch}. Training_loss: {train_loss}. Val_loss: {val_loss}. Training_accuracy: {t_correct}. Val_accuracy: {v_correct}')
    display("Done")

    # Keep the metrics of the last epoch for this alpha.
    theta_records.append([cur_alpha.item(), train_loss, val_loss, t_correct, v_correct])

theta_df = pd.DataFrame(theta_records, columns=theta_columns)


In [None]:
# Display the per-alpha metrics table.
theta_df

In [None]:
# Loss (left axis, red) and accuracy (right axis, blue) of every interpolated
# model as a function of alpha. Solid lines are training metrics, dashed lines
# are validation metrics.
fig, ax = plt.subplots()
train_loss_line, = ax.plot(theta_df.alpha, theta_df.train_loss, color='tab:red', marker='v', label='train loss')
val_loss_line, = ax.plot(theta_df.alpha, theta_df.val_loss, color='tab:red', marker='x', linestyle='dashed', label='val loss')
ax.set_xlabel('alpha')
ax.set_ylabel('loss', color='tab:red')
ax.tick_params(axis='y', labelcolor='tab:red')
ax.set_title('Loss and accuracy vs. interpolation ratio alpha')

ax2 = ax.twinx()
train_acc_line, = ax2.plot(theta_df.alpha, theta_df.t_acc, color='tab:blue', marker='v', label='train accuracy')
val_acc_line, = ax2.plot(theta_df.alpha, theta_df.v_acc, color='tab:blue', marker='x', linestyle='dashed', label='val accuracy')
ax2.set_ylabel('accuracy', color='tab:blue')
ax2.tick_params(axis='y', labelcolor='tab:blue')

# A single legend covering both axes; it is attached to the twin axis, which
# is drawn on top.
ax2.legend(handles=[train_loss_line, val_loss_line, train_acc_line, val_acc_line], loc='best')
plt.show()






In [None]:
# Loss and accuracy of the trained models against their parameter count.
# The plots are skipped when fit_df holds no recorded metrics, so the cell
# does not fail on missing columns.
fit_plot_columns = {'parameters', 'train_loss', 'val_loss', 'train_acc', 'v_acc'}

if fit_df.empty or not fit_plot_columns.issubset(fit_df.columns):
    display('fit_df has no recorded metrics; skipping the parameter-count plots')
else:
    # Each figure gets its own axes; titles are set on the axes directly so
    # they cannot land on a different figure.
    fig1, ax1 = plt.subplots()
    ax1.scatter(fit_df['parameters'], fit_df['train_loss'], label='train_loss')
    ax1.scatter(fit_df['parameters'], fit_df['val_loss'], label='val_loss')
    ax1.legend()
    ax1.set_xlabel('number of parameters')
    ax1.set_ylabel('loss')
    ax1.grid(True)
    ax1.set_title('loss')

    fig2, ax2 = plt.subplots()
    ax2.scatter(fit_df['parameters'], fit_df['train_acc'], label='training_acc')
    ax2.scatter(fit_df['parameters'], fit_df['v_acc'], label='testing_acc')
    ax2.legend()
    ax2.set_xlabel('number of parameters')
    ax2.set_ylabel('accuracy')
    ax2.grid(True)
    ax2.set_title('accuracy')
    plt.show()