In [43]:
'''
GOAL: 99% **TEST** ACCURACY
'''

# explore different architectures, data augmentation and regularization methods to determine a suitable range of parameters
# TODO: use optuna to explore the hyperparameter spacej
# TODO: report final hyperparameter values and test accuracy

# SUBMIT:
# TODO: "informed discussion" of approach to hyperparameter exploration / observations
# TODO: submit use of optuna code
# TODO: final performance of model
# TODO: discussion of limitations of not using a separate validation set

'\nGOAL: 99% **TEST** ACCURACY\n'

In [44]:
import os

import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms

In [45]:
DEVICE = torch.device("mps")
BATCHSIZE = 512
CLASSES = 10
DIR = os.getcwd() #./datafiles/
EPOCHS = 10
N_TRAIN_EXAMPLES = BATCHSIZE * 30
N_VALID_EXAMPLES = BATCHSIZE * 10

LOSS_FN = nn.CrossEntropyLoss()

In [52]:
def get_mnist(trial):

    aff_alph = trial.suggest_float("affine_alpha", 50.0, 70.0)

    # Any data augmentation should be added to training
    train_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.RandomRotation(15), #already randomly samples in range
        transforms.RandomAffine(25), #already randomly samples in range
        transforms.ElasticTransform(alpha=aff_alph),
        transforms.Normalize(mean=0.1307, std=0.3081),
    ])

    # Test data should have normalization applied, but no augmentation
    test_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean=0.1307, std=0.3081)
    ])

    # Load FashionMNIST dataset.
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(DIR, train=True, download=True, transform=transforms.ToTensor()),
        batch_size=BATCHSIZE,
        shuffle=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        datasets.MNIST(DIR, train=False, download=True, transform=transforms.ToTensor()),
        batch_size=BATCHSIZE,
        shuffle=True,
    )

    return train_loader, valid_loader

In [47]:
import math
def get_output_size(input_size, padding, stride, kernel):   
    return math.floor((input_size + 2*padding - kernel)/stride) + 1

In [48]:
class MNIST_Model(nn.Module):
    def __init__(self, trial):
        super().__init__()
        n_layers = trial.suggest_int("n_layers", 2, 3)
        layers = []

        in_exp = 0
        img_size = 28
        for i in range(n_layers):
            out_exp = trial.suggest_int("n_units_l{}".format(i), in_exp, 8)
            in_channels = 2 ** in_exp
            out_channels = 2 ** out_exp

            kernel_size = 7 - (2*i)

            # NOTE: Maria -> could vary kernel and padding more
            layers.append(nn.Conv2d(
                in_channels=in_channels,
                out_channels=out_channels,
                kernel_size=kernel_size,
                padding='same'
            ))
            layers.append(nn.ReLU())
            layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=2, padding=1))
            img_size = get_output_size(input_size=img_size, stride=2, kernel=2, padding=1)
            # Batch Norm
            layers.append(nn.BatchNorm2d(out_channels)) #NOTE: could play around with the placement

            in_channels = out_channels
            in_exp = out_exp
        
        self.conv_layers = nn.Sequential(*layers)

        p = trial.suggest_float("dropout_l{}".format(i), 0.05, 0.25) #NOTE: Zach: Verify p.
        # TODO: Maria: make this a variable amount
        self.dropout = nn.Dropout(p)
        self.output_layer = nn.Linear(in_channels*img_size*img_size, CLASSES)
    
    def forward(self, x):
        # print(x.shape)
        x = self.conv_layers(x)
        # print(x.shape)

        x = self.dropout(x)
        # print(f"after dropout: {x.shape}")

        x = x.view(x.size(0), -1) 
        # print(f"after reshape: {x.shape}")
        x = self.output_layer(x)
        # print(x.shape)
        return x

def define_model(trial):
    model = MNIST_Model(trial)
    return model


In [49]:
def define_model_old(trial):
    # We optimize the number of layers, hidden units and dropout ratio in each layer.
    n_layers = trial.suggest_int("n_layers", 2, 4)
    layers = []

    in_channels = 1
    for i in range(n_layers):
        out_channels = trial.suggest_int("n_units_l{}".format(i), in_channels, 128)
        kernel_size = 7 - (2*i)
        print(f"kernel_size: {kernel_size}")

        # NOTE: Maria -> could vary kernel and padding more
        layers.append(nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            padding='same'
        ))
        layers.append(nn.ReLU())
        layers.append(nn.MaxPool2d(kernel_size=(2, 2), stride=2, padding=1))
        # Batch Norm
        layers.append(nn.BatchNorm2d(out_channels)) #NOTE: could play around with the placement

        print(f"in_channels: {in_channels} out_channels: {out_channels}")
        in_channels = out_channels
    
    # TODO: Maria: Figure this out
    # use_average_pool = trial.suggest_int("use_average_pool", 0, 1) #NOTE: Maria: ask trevor about this?
    # if use_average_pool:
    #     layers.append(nn.AdaptiveAvgPool2d(output_size=(1,1)).squeeze()) #NOTE: Maria: this does not work

    # n_linear_layers = trial.suggest_int("n_linear_layers", 1, 2)
    p = trial.suggest_float("dropout_l{}".format(i), 0.05, 0.25) #NOTE: Zach: Verify p.

    # TODO: delete this... there is still an error with this version but seems like a more important one to solve
    layers.append(nn.Dropout(p))
    layers.append(nn.Linear(in_channels, CLASSES))

    # if n_linear_layers == 1:
    #     layers.append(nn.Dropout(p))
    #     layers.append(nn.Linear(in_channels, CLASSES))
        
    # if n_linear_layers == 2:
    #     intermediate_channels = in_channels/2
    #     layers.append(nn.Linear(in_channels, intermediate_channels))
    #     layers.append(nn.Dropout(p))
    #     layers.append(nn.Linear(intermediate_channels, CLASSES))

    return nn.Sequential(*layers)

In [50]:
def objective(trial):
    # Generate the model.
    model = define_model(trial).to(DEVICE)

    # TODO Zach: Add optimizer, and weight decay.
    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop"])
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-1, log=True)
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Get the FashionMNIST dataset.
    train_loader, valid_loader = get_mnist(trial)
    
    # Initialize variables for tracking the best accuracy and the number of epochs since improvement
    best_patience = 0
    epochs_since_improvement = 0
    best_accuracy = 0  # Initialize to 0 for accuracy

    # Training of the model.
    for epoch in range(EPOCHS):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            data, target = data.to(DEVICE), target.to(DEVICE)
            optimizer.zero_grad()
            output = model(data)
            loss = LOSS_FN(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Limiting validation data.
                if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES: #TODO: discuss getting rid of this 
                    break
                data, target = data.to(DEVICE), target.to(DEVICE)
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)
        
        if accuracy > best_accuracy:
            best_accuracy = accuracy 
            if epochs_since_improvement > 0:
                if best_patience < epochs_since_improvement:
                    best_patience = epochs_since_improvement
                epochs_since_improvement = 0
        else:
            epochs_since_improvement += 1

        trial.report(accuracy, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    print(f"Best patience value: {best_patience}")
    return accuracy


In [53]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=2, timeout=600)

pruned_trials = study.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2023-10-31 21:42:41,811] A new study created in memory with name: no-name-74bc0c37-a64a-4c84-b67d-8c7b141e18c5
[I 2023-10-31 21:43:02,700] Trial 0 finished with value: 0.976171875 and parameters: {'n_layers': 2, 'n_units_l0': 4, 'n_units_l1': 8, 'dropout_l1': 0.22249162975337988, 'optimizer': 'RMSprop', 'weight_decay': 0.005210952085501617, 'lr': 0.0023514662160008452, 'affine_alpha': 56.42539390244004}. Best is trial 0 with value: 0.976171875.


Best patience value: 1


[W 2023-10-31 21:43:26,298] Trial 1 failed with parameters: {'n_layers': 3, 'n_units_l0': 8, 'n_units_l1': 8, 'n_units_l2': 8, 'dropout_l2': 0.21609557771115817, 'optimizer': 'RMSprop', 'weight_decay': 0.0007491812529969676, 'lr': 4.2356536168481974e-05, 'affine_alpha': 57.49916045559333} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/opt/homebrew/anaconda3/envs/torch-nightly/lib/python3.8/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/var/folders/0v/1rrf835166g6l4b55txmksjm0000gn/T/ipykernel_88425/2699304349.py", line 47, in objective
    correct += pred.eq(target.view_as(pred)).sum().item()
KeyboardInterrupt
[W 2023-10-31 21:43:26,299] Trial 1 failed with value None.


KeyboardInterrupt: 