In [None]:
'''
GOAL: 99% **TEST** ACCURACY
'''

# explore different architectures, data augmentation and regularization methods to determine a suitable range of parameters
# TODO: use optuna to explore the hyperparameter spacej
# TODO: report final hyperparameter values and test accuracy

# SUBMIT:
# TODO: "informed discussion" of approach to hyperparameter exploration / observations
# TODO: submit use of optuna code
# TODO: final performance of model
# TODO: discussion of limitations of not using a separate validation set

In [1]:
import os

import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms

In [2]:
DEVICE = torch.device("mps")
BATCHSIZE = 512
CLASSES = 10
DIR = os.getcwd() #./datafiles/
EPOCHS = 10
N_TRAIN_EXAMPLES = BATCHSIZE * 30
N_VALID_EXAMPLES = BATCHSIZE * 10

LOSS_FN = nn.CrossEntropyLoss()

torch.manual_seed(0)

In [3]:
def get_mnist():

    # Any data augmentation should be added to training
    train_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=0.1307, std=0.3081),
    ])

    # Test data should have normalization applied, but no augmentation
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=0.1307, std=0.3081)
    ])

    # Load FashionMNIST dataset.
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(DIR, train=True, download=True, transform=transforms.ToTensor()),
        batch_size=BATCHSIZE,
        shuffle=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        datasets.MNIST(DIR, train=False, download=True, transform=transforms.ToTensor()),
        batch_size=BATCHSIZE,
        shuffle=True,
    )

    return train_loader, valid_loader

In [4]:
import math
def get_output_size(input_size, padding, stride, kernel):   
    return math.floor((input_size + 2*padding - kernel)/stride) + 1

In [5]:
class MNIST_Model(nn.Module):
    def __init__(self, trial):
        super().__init__()

        img_size = 28

        # NOTE: optuna params 
        layer1_channels_exp = trial.suggest_int("layer1_channels_exp", 3, 5) # 1st layer output could be 8 16 32
        layer2_channels_exp = trial.suggest_int("layer2_channels_exp", 6, 8) # 2nd layer output could be 64 128 256

        layer1_channels = 2 ** layer1_channels_exp
        layer2_channels = 2 ** layer2_channels_exp

        self.conv1 = nn.Conv2d(1, layer1_channels, kernel_size=(5, 5), padding='same')
        self.conv2 = nn.Conv2d(layer1_channels, layer2_channels, kernel_size=(3, 3), padding='same')
        self.mp = nn.MaxPool2d(kernel_size=(2, 2), stride=2, padding=1)
        
        # Batch nrom
        self.bn1 = nn.BatchNorm2d(layer1_channels)
        self.bn2 = nn.BatchNorm2d(layer2_channels)

        out1 = get_output_size(input_size=img_size, padding=1, stride=2, kernel=2)
        out2 = get_output_size(input_size=out1, padding=1, stride=2, kernel=2)

        # Activation
        self.relu = nn.ReLU()

        p = trial.suggest_float("dropout_p:", 0, 0.1) #NOTE: optuna param
        self.dropout = nn.Dropout(p)
        self.output_layer = nn.Linear(layer2_channels*out2*out2, CLASSES)
    
    def forward(self, x):
        x = self.conv1(x)
        x = self.relu(x)
        x = self.mp(x)
        x = self.bn1(x)

        x = self.conv2(x)
        x = self.relu(x)
        x = self.mp(x)
        x = self.bn2(x)

        x = self.dropout(x)
        x = x.view(x.size(0), -1) 
        x = self.output_layer(x)
        return x

def define_model(trial):
    model = MNIST_Model(trial)
    return model


In [None]:
def define_model_old(trial):
    # We optimize the number of layers, hidden units and dropout ratio in each layer.
    n_layers = trial.suggest_int("n_layers", 2, 4)
    layers = []

    in_channels = 1
    for i in range(n_layers):
        out_channels = trial.suggest_int("n_units_l{}".format(i), in_channels, 128)
        kernel_size = 7 - (2*i)
        print(f"kernel_size: {kernel_size}")

        # NOTE: Maria -> could vary kernel and padding more
        layers.append(nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding='same'
        ))
        layers.append(nn.ReLU())
        layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=2, padding=1))
        # Batch Norm
        layers.append(nn.BatchNorm2d(out_channels)) #NOTE: could play around with the placement

        print(f"in_channels: {in_channels} out_channels: {out_channels}")
        in_channels = out_channels
    
    # TODO: Maria: Figure this out
    # use_average_pool = trial.suggest_int("use_average_pool", 0, 1) #NOTE: Maria: ask trevor about this?
    # if use_average_pool:
    #     layers.append(nn.AdaptiveAvgPool2d(output_size=(1,1)).squeeze()) #NOTE: Maria: this does not work

    # n_linear_layers = trial.suggest_int("n_linear_layers", 1, 2)
    p = trial.suggest_float("dropout_l{}".format(i), 0.05, 0.25) #NOTE: Zach: Verify p.

    # TODO: delete this... there is still an error with this version but seems like a more important one to solve
    layers.append(nn.Dropout(p))
    layers.append(nn.Linear(in_channels, CLASSES))

    # if n_linear_layers == 1:
    #     layers.append(nn.Dropout(p))
    #     layers.append(nn.Linear(in_channels, CLASSES))
        
    # if n_linear_layers == 2:
    #     intermediate_channels = in_channels/2
    #     layers.append(nn.Linear(in_channels, intermediate_channels))
    #     layers.append(nn.Dropout(p))
    #     layers.append(nn.Linear(intermediate_channels, CLASSES))

    return nn.Sequential(*layers)

In [7]:
def objective(trial):
    # Generate the model.
    model = define_model(trial).to(DEVICE)

    # TODO Zach: Add optimizer, and weight decay.
    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])
    weight_decay = trial.suggest_float("weight_decay", 1e-4, 1e-2, log=True) #NOTE: Optuna weight dexay param.
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Get the FashionMNIST dataset.
    train_loader, valid_loader = get_mnist()
    
    # Initialize variables for tracking the best accuracy and the number of epochs since improvement
    best_patience = 0
    epochs_since_improvement = 0
    best_accuracy = 0  # Initialize to 0 for accuracy
    best_epoch = -1

    # Training of the model.
    for epoch in range(EPOCHS):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            data, target = data.to(DEVICE), target.to(DEVICE)
            optimizer.zero_grad()
            output = model(data)
            loss = LOSS_FN(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Limiting validation data.
                if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES: #TODO: discuss getting rid of this 
                    break
                data, target = data.to(DEVICE), target.to(DEVICE)
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy 
            best_epoch = epoch
            if epochs_since_improvement > 0:
                if best_patience < epochs_since_improvement:
                    best_patience = epochs_since_improvement
                epochs_since_improvement = 0
        else:
            epochs_since_improvement += 1

        trial.report(accuracy, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    print(f"Best patience value: {best_patience}")
    return accuracy


In [8]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100, timeout=600)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-11-02 17:32:43,250] A new study created in memory with name: no-name-b226c67d-51e1-41f2-ae1e-6f94ce5f5428
[I 2023-11-02 17:32:52,025] Trial 0 finished with value: 0.9896484375 and parameters: {'layer1_channels_exp': 5, 'layer2_channels_exp': 6, 'dropout_p:': 0.04954806781113186, 'optimizer': 'Adam', 'weight_decay': 0.00012301254698047313, 'lr': 0.0012628735461133397}. Best is trial 0 with value: 0.9896484375.


Best patience value: 3


[I 2023-11-02 17:33:03,139] Trial 1 finished with value: 0.9875 and parameters: {'layer1_channels_exp': 4, 'layer2_channels_exp': 8, 'dropout_p:': 0.006427139479331512, 'optimizer': 'Adam', 'weight_decay': 0.0002795061721743844, 'lr': 0.0032932878931397585}. Best is trial 0 with value: 0.9896484375.


Best patience value: 1


[I 2023-11-02 17:33:11,228] Trial 2 finished with value: 0.900390625 and parameters: {'layer1_channels_exp': 3, 'layer2_channels_exp': 6, 'dropout_p:': 0.07197006756324066, 'optimizer': 'RMSprop', 'weight_decay': 0.005903084532832784, 'lr': 1.3596397305491095e-05}. Best is trial 0 with value: 0.9896484375.


Best patience value: 0


[I 2023-11-02 17:33:22,355] Trial 3 finished with value: 0.97578125 and parameters: {'layer1_channels_exp': 4, 'layer2_channels_exp': 8, 'dropout_p:': 0.004202348924556831, 'optimizer': 'RMSprop', 'weight_decay': 0.0010881457396317786, 'lr': 3.686437074860214e-05}. Best is trial 0 with value: 0.9896484375.


Best patience value: 1


[I 2023-11-02 17:33:31,082] Trial 4 finished with value: 0.9837890625 and parameters: {'layer1_channels_exp': 3, 'layer2_channels_exp': 7, 'dropout_p:': 0.04639268706575398, 'optimizer': 'Adam', 'weight_decay': 0.00013794676534991158, 'lr': 0.0004815240235843506}. Best is trial 0 with value: 0.9896484375.


Best patience value: 1


[I 2023-11-02 17:33:32,559] Trial 5 pruned. 
[I 2023-11-02 17:33:35,766] Trial 6 pruned. 
[I 2023-11-02 17:33:36,669] Trial 7 pruned. 
[I 2023-11-02 17:33:37,577] Trial 8 pruned. 
[I 2023-11-02 17:33:40,744] Trial 9 pruned. 
[I 2023-11-02 17:33:43,418] Trial 10 pruned. 
