# Experimenting with pytorch and optuna for Hyperparameter tuning

Optuna is an automatic hyperparameter optimization software framework, particularly designed for machine learning. It features an imperative, define-by-run style user API. Thanks to our define-by-run API, the code written with Optuna enjoys high modularity, and the user of Optuna can dynamically construct the search spaces for the hyperparameters.

Source : [Optuna website](https://optuna.readthedocs.io/en/latest/index.html)

In [20]:
!pip install optuna
import os

import optuna
from optuna.trial import TrialState
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from torchvision import datasets
from torchvision import transforms



Get device function 

In [21]:
def get_device():
    if torch.cuda.is_available():
        device = 'cuda:0'
    else:
        device = 'cpu'
    return device

Parameters definition

In [22]:
DEVICE = get_device()
BATCHSIZE = 128
CLASSES = 10
DIR = os.getcwd()
EPOCHS = 10
LOG_INTERVAL = 10
N_TRAIN_EXAMPLES = BATCHSIZE * 30
N_VALID_EXAMPLES = BATCHSIZE * 10

Function that dynamically create the model based on the hyperparameters of the optuna's trials

In [23]:
def define_model(trial):
    # We optimize the number of layers, hidden units and dropout ratio in each layer.
    n_layers = trial.suggest_int("n_layers", 1, 3)
    layers = []

    in_features = 28 * 28
    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i), 4, 128)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        p = trial.suggest_float("dropout_l{}".format(i), 0.2, 0.5)
        layers.append(nn.Dropout(p))

        in_features = out_features
    layers.append(nn.Linear(in_features, CLASSES))
    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers)

Download MNIST dataset

In [24]:
def get_mnist():
    # Load FashionMNIST dataset.
    train_loader = torch.utils.data.DataLoader(
        datasets.FashionMNIST(DIR, train=True, download=True, transform=transforms.ToTensor()),
        batch_size=BATCHSIZE,
        shuffle=True,
    )
    valid_loader = torch.utils.data.DataLoader(
        datasets.FashionMNIST(DIR, train=False, transform=transforms.ToTensor()),
        batch_size=BATCHSIZE,
        shuffle=True,
    )

    return train_loader, valid_loader

Define the objective function, in this case the goal is to Maximize the accuracy

In [25]:
def objective_max_accuracy(trial):

    # Generate the model.
    model = define_model(trial).to(DEVICE)

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Get the FashionMNIST dataset.
    train_loader, valid_loader = get_mnist()

    # Training of the model.
    for epoch in range(EPOCHS):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Limiting validation data.
                if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                    break
                data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)

        trial.report(accuracy, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return accuracy

Define the objective function, in this case the goal is to Minimize the loss

In [26]:
def objective_minimize_loss(trial):

    # Generate the model.
    model = define_model(trial).to(DEVICE)

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Get the FashionMNIST dataset.
    train_loader, valid_loader = get_mnist()
    l = 0 
    # Training of the model.
    for epoch in range(EPOCHS):
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            l = loss
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Limiting validation data.
                if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                    break
                data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)

        trial.report(l, epoch)

        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return l

Main function, create the optuna study for the accuracy maximization, run the trials and find the best one 

In [27]:
#if __name__ == "__main__":
study_max_acc = optuna.create_study(direction="maximize")
study_max_acc.optimize(objective_max_accuracy, n_trials=100, timeout=600)

pruned_trials = study_max_acc.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study_max_acc.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study_max_acc.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study_max_acc.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2021-11-17 09:42:01,546][0m A new study created in memory with name: no-name-0ab189cd-feaf-4ec4-add8-02e18d46f64e[0m
[32m[I 2021-11-17 09:42:06,491][0m Trial 0 finished with value: 0.746875 and parameters: {'n_layers': 1, 'n_units_l0': 41, 'dropout_l0': 0.4739948685289035, 'optimizer': 'RMSprop', 'lr': 0.0002705851759150408}. Best is trial 0 with value: 0.746875.[0m
[32m[I 2021-11-17 09:42:11,603][0m Trial 1 finished with value: 0.115625 and parameters: {'n_layers': 1, 'n_units_l0': 105, 'dropout_l0': 0.26344786412431814, 'optimizer': 'SGD', 'lr': 1.6139793988112148e-05}. Best is trial 0 with value: 0.746875.[0m
[32m[I 2021-11-17 09:42:16,740][0m Trial 2 finished with value: 0.7484375 and parameters: {'n_layers': 3, 'n_units_l0': 46, 'dropout_l0': 0.2021101988679854, 'n_units_l1': 69, 'dropout_l1': 0.4438666831820254, 'n_units_l2': 77, 'dropout_l2': 0.2836361614362407, 'optimizer': 'RMSprop', 'lr': 0.003325107967248953}. Best is trial 2 with value: 0.7484375.[0m
[32

Study statistics: 
  Number of finished trials:  100
  Number of pruned trials:  50
  Number of complete trials:  50
Best trial:
  Value:  0.85703125
  Params: 
    n_layers: 1
    n_units_l0: 81
    dropout_l0: 0.31662232817639624
    optimizer: Adam
    lr: 0.003785935433466244


In [28]:
from optuna.visualization import plot_contour
from optuna.visualization import plot_edf
from optuna.visualization import plot_intermediate_values
from optuna.visualization import plot_optimization_history
from optuna.visualization import plot_parallel_coordinate
from optuna.visualization import plot_param_importances
from optuna.visualization import plot_slice

Following code: optuna visualization methods

In [29]:
plot_optimization_history(study_max_acc)

In [30]:
plot_intermediate_values(study_max_acc)

In [31]:
plot_parallel_coordinate(study_max_acc)

In [32]:
plot_contour(study_max_acc)

In [33]:
plot_slice(study_max_acc)

In [34]:
plot_param_importances(study_max_acc)

In [35]:
plot_edf(study_max_acc)

Main function, create the optuna study for the loss minimization, run the trials and find the best one 

In [36]:
#if __name__ == "__main__":
study_min_loss = optuna.create_study(direction="minimize")
study_min_loss.optimize(objective_minimize_loss, n_trials=100, timeout=600)

pruned_trials = study_min_loss.get_trials(deepcopy=False, states=[TrialState.PRUNED])
complete_trials = study_min_loss.get_trials(deepcopy=False, states=[TrialState.COMPLETE])

print("Study statistics: ")
print("  Number of finished trials: ", len(study_min_loss.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study_min_loss.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[32m[I 2021-11-17 09:47:18,047][0m A new study created in memory with name: no-name-97c5cd89-ceff-42ed-a200-59a14dfa0f0b[0m
[32m[I 2021-11-17 09:47:23,190][0m Trial 0 finished with value: 2.222968816757202 and parameters: {'n_layers': 2, 'n_units_l0': 34, 'dropout_l0': 0.23111495194453618, 'n_units_l1': 120, 'dropout_l1': 0.4245927330003595, 'optimizer': 'RMSprop', 'lr': 1.0326915795454764e-05}. Best is trial 0 with value: 2.222968816757202.[0m
[32m[I 2021-11-17 09:47:28,265][0m Trial 1 finished with value: 1.8775393962860107 and parameters: {'n_layers': 3, 'n_units_l0': 22, 'dropout_l0': 0.44869355148156587, 'n_units_l1': 8, 'dropout_l1': 0.37312977431481514, 'n_units_l2': 61, 'dropout_l2': 0.45667150984183785, 'optimizer': 'RMSprop', 'lr': 0.017034916689202668}. Best is trial 1 with value: 1.8775393962860107.[0m
[32m[I 2021-11-17 09:47:33,255][0m Trial 2 finished with value: 1.5803935527801514 and parameters: {'n_layers': 1, 'n_units_l0': 27, 'dropout_l0': 0.22891322541878

Study statistics: 
  Number of finished trials:  100
  Number of pruned trials:  60
  Number of complete trials:  40
Best trial:
  Value:  0.4049718976020813
  Params: 
    n_layers: 2
    n_units_l0: 111
    dropout_l0: 0.2816063121206171
    n_units_l1: 113
    dropout_l1: 0.3430404953869003
    optimizer: RMSprop
    lr: 0.0013736266222965233


In [37]:
plot_param_importances(study_min_loss)

In [38]:
plot_slice(study_min_loss)

COMPARISON BETWEEN LOSS MINIMIZATION AND ACCURACY MAXIMIZATION

In [44]:
print("Best trial minimizing loss:")
trial_min_loss = study_min_loss.best_trial

print("  Value: ", trial_min_loss.value)

print("  Params: ")
for key, value in trial_min_loss.params.items():
    print("    {}: {}".format(key, value))
print("Best trial maximizing accuracy:")
trial_max_acc = study_max_acc.best_trial

print("  Value: ", trial_max_acc.value)

print("  Params: ")
for key, value in trial_max_acc.params.items():
    print("    {}: {}".format(key, value))

Best trial minimizing loss:
  Value:  0.4049718976020813
  Params: 
    n_layers: 2
    n_units_l0: 111
    dropout_l0: 0.2816063121206171
    n_units_l1: 113
    dropout_l1: 0.3430404953869003
    optimizer: RMSprop
    lr: 0.0013736266222965233
Best trial maximizing accuracy:
  Value:  0.85703125
  Params: 
    n_layers: 1
    n_units_l0: 81
    dropout_l0: 0.31662232817639624
    optimizer: Adam
    lr: 0.003785935433466244


function to create a model based on input parameters

In [41]:
def define_model_with_params(units, dropouts):
    n_layers = len(units)
    layers = []

    in_features = 28 * 28
    for i in range(n_layers):
        out_features = units[i]
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        p = dropouts[i]
        layers.append(nn.Dropout(p))
        in_features = out_features
    layers.append(nn.Linear(in_features, CLASSES))
    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers)

In [49]:
print(trial_min_loss.params)

{'n_layers': 2, 'n_units_l0': 111, 'dropout_l0': 0.2816063121206171, 'n_units_l1': 113, 'dropout_l1': 0.3430404953869003, 'optimizer': 'RMSprop', 'lr': 0.0013736266222965233}


create the model based on the 2 studies

In [50]:
units_max_acc=[]
dropouts_max_acc=[]
units_min_loss=[]
dropouts_min_loss=[]
for i in range(trial_min_loss.params['n_layers']):
  units_min_loss.append(trial_min_loss.params['n_units_l'+str(i)])
  dropouts_min_loss.append(trial_min_loss.params['dropout_l'+str(i)])
for i in range(trial_max_acc.params['n_layers']):
  units_max_acc.append(trial_max_acc.params['n_units_l'+str(i)])
  dropouts_max_acc.append(trial_max_acc.params['dropout_l'+str(i)])

  
model_max_acc = define_model_with_params(units=units_max_acc,dropouts=dropouts_max_acc)
model_min_loss = define_model_with_params(units=units_min_loss,dropouts=dropouts_min_loss)

Model optimizers

In [51]:
train_loader, valid_loader = get_mnist()
optimizer_max_acc = getattr(optim, trial_max_acc.params['optimizer'])(model_max_acc.parameters(), lr=trial_max_acc.params['lr'])
optimizer_min_loss = getattr(optim, trial_min_loss.params['optimizer'])(model_min_loss.parameters(), lr=trial_min_loss.params['lr'])

Train max acc model

In [54]:
# Training of the model.
EPOCHS = 50
train_loss = []
for epoch in range(EPOCHS):
    model_max_acc.train()
    running_loss=0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        # Limiting training data for faster epochs.
        if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
            break
        data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
        optimizer_max_acc.zero_grad()
        output = model_max_acc(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        running_loss += loss.item()
        optimizer_max_acc.step()
    loss = running_loss / len(train_loader)
    train_loss.append(loss)
    
    

    # Validation of the model.
    model_max_acc.eval()
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(valid_loader):
            # Limiting validation data.
            if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                break
            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
            output = model_max_acc(data)
            # Get the index of the max log-probability.
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)
    print('Epoch {} of {}, Accuracy: {:.3f}, Loss: {:.3f}'.format(epoch+1, EPOCHS, accuracy, loss))


Epoch 1 of 50, Accuracy: 0.837, Loss: 0.031
Epoch 2 of 50, Accuracy: 0.823, Loss: 0.031
Epoch 3 of 50, Accuracy: 0.838, Loss: 0.033
Epoch 4 of 50, Accuracy: 0.822, Loss: 0.031
Epoch 5 of 50, Accuracy: 0.839, Loss: 0.029
Epoch 6 of 50, Accuracy: 0.836, Loss: 0.029
Epoch 7 of 50, Accuracy: 0.836, Loss: 0.030
Epoch 8 of 50, Accuracy: 0.844, Loss: 0.031
Epoch 9 of 50, Accuracy: 0.845, Loss: 0.028
Epoch 10 of 50, Accuracy: 0.848, Loss: 0.029
Epoch 11 of 50, Accuracy: 0.835, Loss: 0.029
Epoch 12 of 50, Accuracy: 0.835, Loss: 0.029
Epoch 13 of 50, Accuracy: 0.838, Loss: 0.029
Epoch 14 of 50, Accuracy: 0.855, Loss: 0.029
Epoch 15 of 50, Accuracy: 0.838, Loss: 0.028
Epoch 16 of 50, Accuracy: 0.841, Loss: 0.027
Epoch 17 of 50, Accuracy: 0.848, Loss: 0.028
Epoch 18 of 50, Accuracy: 0.858, Loss: 0.027
Epoch 19 of 50, Accuracy: 0.835, Loss: 0.027
Epoch 20 of 50, Accuracy: 0.841, Loss: 0.027
Epoch 21 of 50, Accuracy: 0.848, Loss: 0.027
Epoch 22 of 50, Accuracy: 0.834, Loss: 0.028
Epoch 23 of 50, Acc

Train model min loss

In [55]:
# Training of the model.
EPOCHS = 50
train_loss = []
for epoch in range(EPOCHS):
    model_min_loss.train()
    running_loss=0.0
    for batch_idx, (data, target) in enumerate(train_loader):
        # Limiting training data for faster epochs.
        if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
            break
        data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
        optimizer_min_loss.zero_grad()
        output = model_min_loss(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        running_loss += loss.item()
        optimizer_min_loss.step()
    loss = running_loss / len(train_loader)
    train_loss.append(loss)
    
    

    # Validation of the model.
    model_min_loss.eval()
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(valid_loader):
            # Limiting validation data.
            if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                break
            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
            output = model_min_loss(data)
            # Get the index of the max log-probability.
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)
    print('Epoch {} of {}, Accuracy: {:.3f}, Loss: {:.3f}'.format(epoch+1, EPOCHS, accuracy, loss))


Epoch 1 of 50, Accuracy: 0.623, Loss: 0.087
Epoch 2 of 50, Accuracy: 0.729, Loss: 0.055
Epoch 3 of 50, Accuracy: 0.748, Loss: 0.049
Epoch 4 of 50, Accuracy: 0.784, Loss: 0.045
Epoch 5 of 50, Accuracy: 0.780, Loss: 0.042
Epoch 6 of 50, Accuracy: 0.795, Loss: 0.040
Epoch 7 of 50, Accuracy: 0.814, Loss: 0.038
Epoch 8 of 50, Accuracy: 0.798, Loss: 0.037
Epoch 9 of 50, Accuracy: 0.812, Loss: 0.036
Epoch 10 of 50, Accuracy: 0.813, Loss: 0.036
Epoch 11 of 50, Accuracy: 0.841, Loss: 0.033
Epoch 12 of 50, Accuracy: 0.823, Loss: 0.033
Epoch 13 of 50, Accuracy: 0.812, Loss: 0.035
Epoch 14 of 50, Accuracy: 0.835, Loss: 0.031
Epoch 15 of 50, Accuracy: 0.822, Loss: 0.031
Epoch 16 of 50, Accuracy: 0.827, Loss: 0.031
Epoch 17 of 50, Accuracy: 0.833, Loss: 0.032
Epoch 18 of 50, Accuracy: 0.830, Loss: 0.030
Epoch 19 of 50, Accuracy: 0.828, Loss: 0.029
Epoch 20 of 50, Accuracy: 0.845, Loss: 0.032
Epoch 21 of 50, Accuracy: 0.852, Loss: 0.030
Epoch 22 of 50, Accuracy: 0.843, Loss: 0.030
Epoch 23 of 50, Acc