# **Optuna Workshop**


In this workshop you will learn about **Optuna**, an open-source hyperparameter optimization framework that automates hyperparameter search.


**You will learn**

  -Basic Functions of Optuna

  -How to set up a model to be optimized

  -How to run a study
  
  -How to visualize results with Optuna

## **Step1: Install and Import Required Libraries**

To use optuna all you need to do is

**download:** !pip install optuna

**import:** import optuna

In [None]:
!pip install optuna


In [None]:
import optuna
from optuna.importance import get_param_importances
from optuna.visualization import plot_param_importances


import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split, Subset
import torchvision


#CIFAR-10 Dataset libraries
import torchvision.datasets as datasets
import torchvision.transforms as transforms

import torch.nn.functional as F

#Seed both the NumPy and PyTorch global RNGs with the same value so runs are repeatable
SEED = 41
np.random.seed(SEED)
torch.manual_seed(SEED)

## **Step2: Import Dataset (CIFAR)**

For easy testing we will use the CIFAR dataset

In [None]:
#Per-channel mean / std used to normalize the images (standard CIFAR values)
CIFAR_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR_STD = (0.247, 0.243, 0.261)

#Data augmentation transform: random crop + horizontal flip, for training data only
aug_transform = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
])

#Evaluation transform: same tensor conversion and normalization but no random
#augmentation, so test images are not randomly perturbed at evaluation time
test_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR_MEAN, CIFAR_STD),
])

#Load in Dataset into train and test sets (downloaded to ./data if not already present)

cifar_trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=aug_transform)

cifar_testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

**Split up the Train Set**

In [None]:

#Alias the loaded CIFAR sets under the names used by the rest of the notebook
train_dataset, test_dataset = cifar_trainset, cifar_testset

#80% of the training data is used for training; validation takes the remainder
#so the two lengths always add up to the full dataset size
data_size = len(train_dataset)
train_size = int(data_size * 0.8)
val_size = data_size - train_size

#Randomly partition the training data into the two subsets
train_set, val_set = random_split(train_dataset, [train_size, val_size])


**Create DataLoaders**

In [None]:
#Determine Batch Size
batch_size = 64

#Define Loaders
#drop_last=True keeps every batch at exactly batch_size, so the batch size can be
#changed during testing without running into size errors on a short final batch
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, drop_last=True)

#Validation is NOT shuffled: the loops below only read the first few batches, so a
#fixed order means every epoch and every trial is scored on the same samples
val_loader = DataLoader(val_set, batch_size=batch_size, shuffle=False, drop_last=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

## **Step3: Define the CNN Model**

Simple CNN model for image classification

-We will define the model class with a dropout rate parameter for optimization

In [None]:
class ImageCNN(nn.Module):
  """Small convolutional classifier: three conv stages followed by one linear layer.

  Each stage is a 3x3 convolution (stride 1, padding 1), a ReLU and a 2x2 max-pool.
  The pooled feature map is flattened, passed through dropout and mapped to 10
  output scores by a single fully connected layer.

  Args:
    dropout_rate: probability handed to ``nn.Dropout`` (default 0.5).
  """

  def __init__(self, dropout_rate=0.5):
    super().__init__()

    # Convolutional stages: 3 -> 16 -> 16 -> 32 channels, spatial size preserved
    conv_kwargs = dict(kernel_size=3, stride=1, padding=1)
    self.conv1 = nn.Conv2d(3, 16, **conv_kwargs)
    self.conv2 = nn.Conv2d(16, 16, **conv_kwargs)
    self.conv3 = nn.Conv2d(16, 32, **conv_kwargs)

    # One pooling layer shared by all stages; halves height and width each time
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    # Classifier head: expects 32 channels on a 4x4 map (i.e. 32x32 inputs
    # after three poolings) and produces 10 scores
    self.fc1 = nn.Linear(4 * 4 * 32, 10)

    # Dropout applied to the flattened features before the classifier
    self.dropout = nn.Dropout(dropout_rate)

  def forward(self, x):
    """Return the raw (un-normalized) class scores for a batch of images."""
    for conv in (self.conv1, self.conv2, self.conv3):
      x = self.pool(F.relu(conv(x)))

    # Collapse everything except the batch dimension into one feature vector
    features = torch.flatten(x, 1)
    return self.fc1(self.dropout(features))

## **Step4: Training the Model**

Do an initial round of training as a benchmark

**Initialize the Model**

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(device)

# Benchmark model with the default dropout rate, placed on the chosen device
model = ImageCNN()
model = model.to(device)

# Loss function and optimizer for the benchmark run
criterion = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training budget: each epoch only sees a fixed number of batches for quicker training
epochs = 10
N_TRAIN_EXAMPLES = 30 * batch_size  # 30 training batches per epoch
N_VAL_EXAMPLES = 10 * batch_size    # 10 validation batches per epoch

**Perform Initial Training**

In [None]:

# Per-epoch history, kept for visualization later
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []

for epoch in range(epochs):
    #--Training--

    # Training mode only needs to be set once per epoch (validation below
    # switches the model to eval mode, so it must be reset here)
    model.train()

    running_loss = 0.0
    running_corrects = 0
    total_samples = 0

    for batch_idx, (data, labels) in enumerate(train_loader):
        # Cap the epoch at N_TRAIN_EXAMPLES samples. Checked before the device
        # copy so the batch that triggers the stop is never transferred.
        if batch_idx * batch_size >= N_TRAIN_EXAMPLES:
            break
        data, labels = data.to(device), labels.to(device)

        optimizer.zero_grad()

        # Forward pass
        y_pred = model(data)
        loss = criterion(y_pred, labels)

        # Backward pass
        loss.backward()
        optimizer.step()

        # Accumulate the loss, weighted by batch size so the epoch figure is a
        # per-sample average
        running_loss += loss.item() * data.size(0)

        # Calculate accuracy for this batch and accumulate correct predictions
        _, predicted = torch.max(y_pred, 1)
        running_corrects += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    #--Validation--

    model.eval()

    val_running_loss = 0.0
    val_running_corrects = 0
    val_total_samples = 0

    with torch.no_grad():
        for batch_idx, (data, labels) in enumerate(val_loader):
            # Cap validation at N_VAL_EXAMPLES samples (the validation budget,
            # not the training one)
            if batch_idx * batch_size >= N_VAL_EXAMPLES:
                break
            data, labels = data.to(device), labels.to(device)

            y_val = model(data)
            val_loss = criterion(y_val, labels)

            # Accumulate validation loss
            val_running_loss += val_loss.item() * data.size(0)

            # Calculate validation accuracy
            _, val_predicted = torch.max(y_val, 1)
            val_running_corrects += (val_predicted == labels).sum().item()
            val_total_samples += labels.size(0)

    # Calculate the average loss and overall accuracy for this epoch
    epoch_loss = running_loss / total_samples
    epoch_accuracy = running_corrects / total_samples
    print(f"Epoch {epoch+1}/{epochs}, Training Loss: {epoch_loss:.4f}, Training Accuracy: {epoch_accuracy:.4f}")

    # Calculate the average validation loss and accuracy for this epoch
    epoch_val_loss = val_running_loss / val_total_samples
    epoch_val_accuracy = val_running_corrects / val_total_samples
    print(f'Epoch {epoch+1}/{epochs}, Validation Loss: {epoch_val_loss:.4f}, Validation Accuracy: {epoch_val_accuracy:.4f}')

    # Train and validation loss for plotting
    train_losses.append(epoch_loss)
    val_losses.append(epoch_val_loss)

    # Train and validation accuracies for plotting
    train_accuracies.append(epoch_accuracy)
    val_accuracies.append(epoch_val_accuracy)

print("Training complete!")




## **Step 5 Define Tunable Model Function**

Optuna can be used to optimize hyperparameters as well as model structure and optimizers.  

To optimize a component of the model structure such as dropout rate or number of layers, we need to create a define_model function.

In this workshop, we will be optimizing the dropout rate, the learning rate, and the optimizer.


In order for Optuna to try out different hyperparameter values, you need to suggest a range for it to test using one of the **trial.suggest_** methods

In [None]:
#Define trial hyperparameters like this

#hyperparameter = trial.suggest_int("hyperparameter", 1, 5, log=False)

#Pass in trial.suggest_"data type"(hyperparameter name, lower bound, upper bound, log=True or False)

In [None]:

def define_model(trial):
  """Build an ImageCNN whose dropout rate is chosen by the given Optuna trial.

  The trial is asked for a float named "dropout_rate" between 0.1 and 0.5, and
  that value is passed to the ImageCNN constructor.

  Args:
    trial: object providing ``suggest_float`` (an Optuna trial).

  Returns:
    A freshly constructed ImageCNN (not yet moved to any device).
  """
  sampled_dropout = trial.suggest_float("dropout_rate", 0.1, 0.5)
  return ImageCNN(dropout_rate=sampled_dropout)

## **Step 6: Define Objective Function**

In Optuna, your objective function defines the evaluation metric you want to optimize your hyperparameters for.  In our case we will be optimizing the validation accuracy.

The **Objective Function** is just a regular training loop function wrapped with the Optuna optimization so it can run through all **trials**.

A **trial** is a single evaluation of the objective function.  During each trial, Optuna generates a unique combination of the hyperparameters you specified.




In [None]:

# Per-epoch sample budgets: 30 training batches and 10 validation batches
N_TRAIN_EXAMPLES = 30 * batch_size
N_VAL_EXAMPLES = 10 * batch_size


def objective(trial):
  """Train one model for a trial and return its final validation accuracy.

  Hyperparameters sampled from the trial, in this order: the dropout rate
  (inside define_model), the optimizer class name ("Adam", "RMSprop" or "SGD")
  and a log-uniform learning rate between 1e-5 and 1e-1.

  After every epoch the validation accuracy is reported to the trial; if the
  trial says it should be pruned, optuna.exceptions.TrialPruned is raised.

  Args:
    trial: the Optuna trial driving this evaluation.

  Returns:
    Validation accuracy (fraction correct) measured after the last epoch.
  """
  # A new model per trial, moved to the shared device
  net = define_model(trial).to(device)

  # Pick the optimizer class by name from torch.optim and the learning rate to use with it
  optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
  lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
  optimizer_cls = getattr(optim, optimizer_name)
  optimizer = optimizer_cls(net.parameters(), lr=lr)

  # Number of epochs for each trial
  num_epochs = 20
  for epoch in range(num_epochs):
    # Training pass over at most N_TRAIN_EXAMPLES samples
    net.train()
    for step, (inputs, targets) in enumerate(train_loader):
      if step * batch_size >= N_TRAIN_EXAMPLES:
        break
      inputs, targets = inputs.to(device), targets.to(device)

      optimizer.zero_grad()
      batch_loss = criterion(net(inputs), targets)
      batch_loss.backward()
      optimizer.step()

    # Validation pass over at most N_VAL_EXAMPLES samples
    net.eval()
    hits = 0
    seen = 0
    with torch.no_grad():
      for step, (inputs, targets) in enumerate(val_loader):
        if step * batch_size >= N_VAL_EXAMPLES:
          break
        inputs, targets = inputs.to(device), targets.to(device)

        guesses = torch.max(net(inputs), 1).indices
        hits += (guesses == targets).sum().item()
        seen += targets.size(0)

    accuracy = hits / seen

    # Hand the intermediate score to Optuna so it can decide whether to stop early
    trial.report(accuracy, epoch)
    if trial.should_prune():
      raise optuna.exceptions.TrialPruned()

  return accuracy





## **Step 7 Run the Study**

The **study** is the entire hyperparameter optimization process.  It does all of the trials, collects the results, and determines the best hyperparameters for the defined optimization metrics.

One of the main advantages of Optuna is **pruning** underperforming trials, drastically reducing the run time.

In [None]:
#Define the Study (direction="maximize" because the objective returns an accuracy)
study = optuna.create_study(direction="maximize")

#Specify the number of trials (n_trials)
#XXXX is a workshop placeholder: replace it with an integer before running.
#timeout=600 additionally caps the whole search at 600 seconds.
study.optimize(objective, n_trials=XXXX, timeout=600)

#Keep track of pruned and complete trials
pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

#Print the Statistics
print("Study statistsics: ")
print("Number of finished trials: ", len(study.trials))
print("Number of pruned trials: ", len(pruned_trials))
print("Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

#Print only the objective value here; the trial's parameters are listed below
print(" Value: ", trial.value)

print(". Params:  ")
for key, value in trial.params.items():
  print(f"      {key}:  {value}")


## **Challenge Add a Learning Rate Scheduler to be Optimized**

Now it's time to try to add a new hyperparameter to be tuned!

I have defined a StepLR scheduler which takes two hyper parameters **step_size** and **gamma**

Define the **step_size** and **gamma** so that they can be tuned by Optuna.

**Step_size:** Number of epochs before updating learning rate.

**Gamma:** This is the value the learning rate will be multiplied by at each interval



In [None]:

# Per-epoch sample budgets: 30 training batches and 10 validation batches
N_TRAIN_EXAMPLES = batch_size * 30
N_VAL_EXAMPLES = batch_size * 10



def objective(trial):
  """Train one model with a StepLR schedule and return its final validation accuracy.

  Hyperparameters sampled from the trial: the dropout rate (inside
  define_model), the optimizer class name and a log-uniform learning rate.
  The StepLR ``step_size`` and ``gamma`` are the challenge exercise and are
  left as XXXX placeholders to be filled in.

  After every epoch the validation accuracy is reported to the trial; if the
  trial says it should be pruned, optuna.exceptions.TrialPruned is raised.

  Args:
    trial: the Optuna trial driving this evaluation.

  Returns:
    Validation accuracy (fraction correct) measured after the last epoch.
  """

  #Instantiate a new model for each trial by passing trial to the define_model function
  model = define_model(trial).to(device)

  #Suggest different optimizers using the trial.suggest_ method
  optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
  lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
  optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

  #Define a learning rate scheduler (StepLR)
  #CHALLENGE: replace each XXXX with a trial.suggest_ call so Optuna can tune it
  #(step_size is a whole number of epochs, gamma is a multiplicative factor)

  step_size = XXXX
  gamma = XXXX

  scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

  #Define the number of epochs for each trial
  num_epochs = 20
  for epoch in range(num_epochs):
    model.train()
    for batch_idx, (data, labels) in enumerate(train_loader):
      #Reduce the number of batches for each epoch; checked before the device
      #copy so the batch that triggers the stop is never transferred
      if batch_idx * batch_size >= N_TRAIN_EXAMPLES:
        break
      data, labels = data.to(device), labels.to(device)

      optimizer.zero_grad()
      output = model(data)
      loss = criterion(output, labels)
      loss.backward()
      optimizer.step()

    #Step the scheduler once per epoch, after that epoch's optimizer updates
    scheduler.step()

    #Validation
    model.eval()
    running_corrects = 0
    total_samples = 0
    with torch.no_grad():
      for batch_idx, (data, labels) in enumerate(val_loader):
        if batch_idx * batch_size >= N_VAL_EXAMPLES:
          break
        data, labels = data.to(device), labels.to(device)

        output = model(data)

        _, predicted = torch.max(output, 1)
        running_corrects += (predicted == labels).sum().item()
        total_samples += labels.size(0)

    accuracy = running_corrects / total_samples

    #Report the intermediate score so Optuna can decide whether to stop this trial early
    trial.report(accuracy, epoch)

    if trial.should_prune():
      raise optuna.exceptions.TrialPruned()

  return accuracy



**Run the Study Again**

In [None]:
#Define the Study (direction="maximize" because the objective returns an accuracy)
study = optuna.create_study(direction="maximize")

#Specify the number of trials (n_trials)
#XXXX is a workshop placeholder: replace it with an integer before running.
#timeout=600 additionally caps the whole search at 600 seconds.
study.optimize(objective, n_trials=XXXX, timeout=600)

#Keep track of pruned and complete trials
pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

#Print the Statistics
print("Study statistsics: ")
print("Number of finished trials: ", len(study.trials))
print("Number of pruned trials: ", len(pruned_trials))
print("Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

#Print only the objective value here; the trial's parameters are listed below
print(" Value: ", trial.value)

print(". Params:  ")
for key, value in trial.params.items():
  print(f"      {key}:  {value}")

## **Check out Optuna's Visualizations**

In [None]:
from optuna.visualization import (
    plot_optimization_history,
    plot_param_importances,
    plot_parallel_coordinate,
    plot_slice,
    plot_contour,
    plot_edf,
    plot_intermediate_values,
)

# After running your study: build every figure first, in a fixed order
fig1, fig2, fig3, fig4, fig5, fig6, fig7 = [
    make_figure(study)
    for make_figure in (
        plot_optimization_history,
        plot_param_importances,
        plot_parallel_coordinate,
        plot_slice,
        plot_contour,
        plot_edf,
        plot_intermediate_values,
    )
]

# Then display the figures in the order they were created
for figure in (fig1, fig2, fig3, fig4, fig5, fig6, fig7):
    figure.show()