# Hyperparameter tuning with Ray Tune

### Setup / Imports

In [1]:
import numpy as np
import pandas as pd

import os

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data.dataset import Dataset

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

from data import cicids
from models import DBN

In [2]:
# Notebook display settings: show every column, never truncate cell contents,
# and keep wide DataFrames on a single line instead of wrapping them.
pd.set_option('display.max_columns', None)
pd.set_option('max_colwidth', None)
pd.set_option('display.expand_frame_repr', False)

### The train function
We wrap the training script in a function `train(config, checkpoint_dir=None, data_dir=None)`. As you can guess, the `config` parameter will receive the hyperparameters we would like to train with. The `checkpoint_dir` parameter is used to restore checkpoints. The `data_dir` parameter specifies the directory where we load and store the data, so multiple runs can share the same data source.

In [6]:
def _run_epoch(net, loader, criterion, device, optimizers=None):
    """Run one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    Args:
        net: The model to run; called as ``net(inputs.float())``.
        loader: Iterable yielding ``(inputs, labels)`` batches.
        criterion: Loss function called as ``criterion(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.
        optimizers: Optional list of optimizers. When given, the model is put
            in training mode and every optimizer is zeroed and stepped on each
            batch. When ``None``, the model is put in evaluation mode and
            gradient tracking is disabled.

    Returns:
        A ``(mean_loss, accuracy)`` tuple: the loss averaged over batches and
        the fraction of samples whose arg-max prediction equals the label.
        Both are ``0.0`` when ``loader`` yields no batches.
    """
    training = optimizers is not None
    net.train(training)

    loss_sum = 0.0
    steps = 0
    total = 0
    correct = 0

    with torch.set_grad_enabled(training):
        for inputs, labels in loader:
            inputs, labels = inputs.to(device), labels.to(device)

            if training:
                for opt in optimizers:
                    opt.zero_grad()

            outputs = net(inputs.float())
            loss = criterion(outputs, labels)

            if training:
                loss.backward()
                # Every optimizer owns a disjoint set of parameters, so all of
                # them must step to update the whole model.
                for opt in optimizers:
                    opt.step()

            loss_sum += loss.item()
            steps += 1
            total += labels.size(0)
            correct += (outputs.argmax(1) == labels).sum().item()

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    return loss_sum / max(steps, 1), correct / max(total, 1)


def train(config, checkpoint_dir=None, data_dir=None):
    """Train a DBN with the given hyperparameters and report to Ray Tune.

    After every epoch a checkpoint is written through ``tune.checkpoint_dir``
    and the train/validation loss and accuracy are sent with ``tune.report``.

    Args:
        config: Hyperparameter dict. Reads ``n_hidden``, ``learning_rate_rbm``,
            ``batch_size_rbm``, ``learning_rate_dbn`` and ``batch_size_dbn``;
            an optional ``num_epochs`` overrides the default of 10 epochs.
        checkpoint_dir: If set, the directory holding a ``"checkpoint"`` file
            to restore the model and optimizer states from.
        data_dir: Accepted for interface compatibility; currently unused
            because ``cicids.get_dataset()`` is called without arguments.
    """
    base_net = DBN(n_hidden=config["n_hidden"],
                   learning_rate=config["learning_rate_rbm"],
                   batch_size=config["batch_size_rbm"])
    net = base_net

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    # Build the optimizers from the unwrapped model: nn.DataParallel does not
    # expose the ``models`` / ``fc`` attributes of the module it wraps.
    lr = config["learning_rate_dbn"]
    optimizers = [optim.Adam(m.parameters(), lr=lr) for m in base_net.models]
    optimizers.append(optim.Adam(base_net.fc.parameters(), lr=lr))

    if checkpoint_dir:
        model_state, optimizer_states = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        # The checkpoint stores one state dict per optimizer, in list order.
        for opt, state in zip(optimizers, optimizer_states):
            opt.load_state_dict(state)

    train_data, val_data, _ = cicids.get_dataset()

    batch_size = int(config["batch_size_dbn"])
    train_loader = torch.utils.data.DataLoader(
        dataset=train_data,
        batch_size=batch_size,
        shuffle=True,
    )
    # Validation metrics do not depend on sample order, so no shuffling.
    valid_loader = torch.utils.data.DataLoader(
        dataset=val_data,
        batch_size=batch_size,
        shuffle=False,
    )

    num_epochs = int(config.get("num_epochs", 10))
    for epoch in range(1, num_epochs + 1):
        print(f"Epoch {epoch}/{num_epochs}:")

        train_loss, train_accuracy = _run_epoch(
            net, train_loader, criterion, device, optimizers)
        val_loss, val_accuracy = _run_epoch(
            net, valid_loader, criterion, device)

        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save(
                (net.state_dict(), [opt.state_dict() for opt in optimizers]),
                path,
            )

        tune.report(train_loss=train_loss,
                    train_accuracy=train_accuracy,
                    val_loss=val_loss,
                    val_accuracy=val_accuracy)

    print("Finished Training")

### Test set accuracy
Commonly the performance of a machine learning model is tested on a hold-out test
set with data that has not been used for training the model. We also wrap this in a
function:

In [None]:
def test_accuracy(net, batch_size, device="cpu"):
    """Return the classification accuracy of ``net`` on the hold-out test set.

    Args:
        net: Trained model; called as ``net(inputs.float())`` and expected to
            already live on ``device``.
        batch_size: Batch size used for the test ``DataLoader``.
        device: Device the test batches are moved to before the forward pass.

    Returns:
        The fraction of test samples whose arg-max prediction equals the
        label, or ``0.0`` if the test set yields no samples.
    """
    _, _, test_data = cicids.get_dataset()

    test_loader = torch.utils.data.DataLoader(
        dataset=test_data,
        batch_size=batch_size,
        shuffle=False
    )

    net.eval()

    test_total = 0
    test_correct = 0

    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = net(inputs.float())

            # Arg-max over dim 1, the same axis the training loop uses for
            # its predictions.
            predicted = outputs.argmax(1)
            test_total += labels.size(0)
            test_correct += (predicted == labels).sum().item()

    if test_total == 0:
        return 0.0
    return test_correct / test_total

### Configuring the search space

Lastly, we need to define Ray Tune's search space.

At each trial, Ray Tune will now randomly sample a combination of parameters from these search spaces. It will then train a number of models in parallel and find the best performing one among these. We also use the ASHAScheduler which will terminate bad performing trials early.

You can specify the number of CPUs, which are then available e.g. to increase the num_workers of the PyTorch DataLoader instances. The selected number of GPUs are made visible to PyTorch in each trial. Trials do not have access to GPUs that haven't been requested for them - so you don't have to care about two trials using the same set of resources.

Here we can also specify fractional GPUs, so something like gpus_per_trial=0.5 is completely valid. The trials will then share GPUs among each other. You just have to make sure that the models still fit in the GPU memory.
After training the models, we will find the best performing one and load the trained network from the checkpoint file. We then obtain the test set accuracy and report everything by printing.

In [None]:
num_samples = 10
max_num_epochs = 10
# GPUs requested for each trial; fractional values (e.g. 0.5) are allowed.
gpus_per_trial = 0

# Number of stacked layers (1 or 2), drawn once for the whole experiment so
# that every per-layer hyperparameter list below has the same length.
n_layers = np.random.randint(1, 3)

config = {
    "n_hidden": [tune.sample_from(lambda _: 2**np.random.randint(2, 8)) for _ in range(n_layers)],
    "learning_rate_rbm": [tune.loguniform(1e-3, 1e-1) for _ in range(n_layers)],
    "learning_rate_dbn": tune.loguniform(1e-3, 1e-1),
    "batch_size_rbm": [tune.choice([64, 128, 256, 512, 1024]) for _ in range(n_layers)],
    "batch_size_dbn": tune.choice([64, 128, 256, 512, 1024]),
}
# ASHA stops badly performing trials early, judged on the validation loss.
scheduler = ASHAScheduler(
    metric="val_loss",
    mode="min",
    max_t=max_num_epochs,
    grace_period=1,
    reduction_factor=2
)
reporter = CLIReporter(metric_columns=["train_loss", "train_accuracy", "val_loss", "val_accuracy", "training_iteration"])
result = tune.run(
    train,
    resources_per_trial={"cpu": 1, "gpu": gpus_per_trial},
    config=config,
    num_samples=num_samples,
    scheduler=scheduler,
    progress_reporter=reporter
)

best_trial = result.get_best_trial("val_loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final training loss: {}".format(
    best_trial.last_result["train_loss"]))
print("Best trial final training accuracy: {}".format(
    best_trial.last_result["train_accuracy"]))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["val_loss"]))
print("Best trial final validation accuracy: {}".format(
    best_trial.last_result["val_accuracy"]))

# Rebuild the model exactly as `train` does, from the keys that are actually
# part of the search space above.
best_trained_model = DBN(n_hidden=best_trial.config["n_hidden"],
                         learning_rate=best_trial.config["learning_rate_rbm"],
                         batch_size=best_trial.config["batch_size_rbm"])
device = "cpu"
if torch.cuda.is_available():
    device = "cuda:0"
    # Trials that saw more than one GPU saved a DataParallel state dict, so
    # wrap the model the same way before loading it.
    if gpus_per_trial > 1:
        best_trained_model = nn.DataParallel(best_trained_model)
best_trained_model.to(device)

best_checkpoint_dir = best_trial.checkpoint.value
# map_location lets a checkpoint written on a GPU be loaded on this device.
model_state, optimizer_state = torch.load(
    os.path.join(best_checkpoint_dir, "checkpoint"), map_location=device)
best_trained_model.load_state_dict(model_state)

test_acc = test_accuracy(best_trained_model, int(best_trial.config["batch_size_dbn"]), device)
print("Best trial test set accuracy: {}".format(test_acc))

| Trial name        | status     | loc   |   batch_size |   hidden1_size |   hidden2_size |   hidden3_size |         lr |   train_loss |   train_accuracy |   val_loss |   val_accuracy |   training_iteration |
|-------------------|------------|-------|--------------|----------------|----------------|----------------|------------|--------------|------------------|------------|----------------|----------------------|
| 00000 | TERMINATED |       |          128 |              4 |             16 |             32 | 0.0499089  |    0.0965664 |         0.969098 |  0.216287  |       0.915488 |                    2 |
| 00001 | TERMINATED |       |          256 |              4 |              4 |              8 | 0.0146023  |    0.0738336 |         0.977657 |  0.205247  |       0.930971 |                   10 |
| 00002 | TERMINATED |       |          512 |              4 |            128 |             32 | 0.00616173 |    0.308407  |         0.894283 |  0.289828  |       0.904676 |                    1 |
| 00003 | TERMINATED |       |          256 |             16 |            128 |             64 | 0.0171306  |    0.0174327 |         0.994842 |  0.0425004 |       0.986367 |                   10 |
| 00004 | TERMINATED |       |          256 |              8 |             16 |            128 | 0.0815692  |    0.0346922 |         0.988216 |  0.0969654 |       0.966947 |                   10 |
| 00005 | TERMINATED |       |           64 |              8 |             16 |             16 | 0.0365723  |    0.0593174 |         0.980345 |  0.233777  |       0.925306 |                    2 |
| 00006 | TERMINATED |       |          256 |             32 |            128 |             32 | 0.00677515 |    0.0332826 |         0.989082 |  0.0936671 |       0.960486 |                    8 |
| 00007 | TERMINATED |       |           64 |              4 |             64 |             64 | 0.00243203 |    0.174005  |         0.944215 |  0.254956  |       0.906671 |                    1 |
| 00008 | TERMINATED |       |          512 |             32 |             64 |             16 | 0.0652585  |    0.0168181 |         0.994624 |  0.0560469 |       0.982534 |                   10 |
| 00009 | TERMINATED |       |          128 |            128 |              8 |              8 | 0.0820741  |    0.0271389 |         0.991193 |  0.0768982 |       0.97436  |                   10 |