# Hyperparameters Optimization with ASHA

- Dataset used synthetic trinomial options


In [1]:
!pip install -U ray[tune]

Collecting ray[tune]
  Downloading ray-1.11.0-cp37-cp37m-manylinux2014_x86_64.whl (52.7 MB)
[K     |████████████████████████████████| 52.7 MB 79 kB/s 
Collecting grpcio<=1.43.0,>=1.28.1
  Downloading grpcio-1.43.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.1 MB)
[K     |████████████████████████████████| 4.1 MB 31.6 MB/s 
[?25hCollecting redis>=3.5.0
  Downloading redis-4.2.2-py3-none-any.whl (226 kB)
[K     |████████████████████████████████| 226 kB 45.3 MB/s 
Collecting tensorboardX>=1.9
  Downloading tensorboardX-2.5-py2.py3-none-any.whl (125 kB)
[K     |████████████████████████████████| 125 kB 33.3 MB/s 
Collecting async-timeout>=4.0.2
  Downloading async_timeout-4.0.2-py3-none-any.whl (5.8 kB)
Collecting deprecated>=1.2.3
  Downloading Deprecated-1.2.13-py2.py3-none-any.whl (9.6 kB)
Installing collected packages: deprecated, async-timeout, redis, grpcio, tensorboardX, ray
  Attempting uninstall: grpcio
    Found existing installation: grpcio 1.44.0
    Uninsta

In [3]:
#@title **Imports**
import os
from functools import partial

import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [4]:
synthetic_calls_path = '../data/trinomial_synthetic_calls.csv'
synthetic_puts_path = '../data/trinomial_synthetic_puts.csv'
checkpoint_dir = '../models/opt_checkpoints'

In [5]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    
    return df

In [6]:
class OptDataset(Dataset):

  def __init__(self, X, y):
    self.X = X
    self.y = y

  def __getitem__(self, idx):
    return self.X[idx], self.y[idx]

  def __len__(self):
    return len(self.X)

In [7]:
def preprocessing(df:pd.DataFrame):
  df = pd.get_dummies(df, prefix='', prefix_sep='')
  input_sc = StandardScaler()
  output_sc = StandardScaler()
  input_data = input_sc.fit_transform(df.drop('Option Price', axis=1))
  output_data = output_sc.fit_transform(df['Option Price'].values.reshape(-1, 1))

  return input_data, output_data

In [8]:
def load_data():
  synthetic_calls = pd.read_csv(synthetic_calls_path, index_col=0)
  synthetic_puts = pd.read_csv(synthetic_puts_path, index_col=0)

  synthetic_calls = reduce_mem_usage(synthetic_calls)
  synthetic_puts = reduce_mem_usage(synthetic_puts)
  
  synthetic_options = pd.concat([synthetic_calls, synthetic_puts], ignore_index=True)
  synthetic_options = shuffle(synthetic_options, random_state=0)
  
  input_data, output_data = preprocessing(synthetic_options)

  train_size = 0.8
  val_size = 0.1

  last_train_idx = int(np.round(len(input_data) * train_size))
  last_val_idx = last_train_idx + int(np.round(len(input_data) * val_size))

  X_train = Variable(torch.Tensor(input_data[0:last_train_idx]))
  X_val = Variable(torch.Tensor(input_data[last_train_idx:last_val_idx]))
  X_test = Variable(torch.Tensor(input_data[last_val_idx:]))

  y_train = Variable(torch.Tensor(output_data[0:last_train_idx]))
  y_val = Variable(torch.Tensor(output_data[last_train_idx:last_val_idx]))
  y_test = Variable(torch.Tensor(output_data[last_val_idx:]))

  return OptDataset(X_train, y_train), OptDataset(X_val, y_val), OptDataset(X_test, y_test)

# Configurable Model

FCN model with residual blocks, with the following configurable parameters:

- Number of layers
- Activation function: $[ReLU, LeakyReLU, ELU]$
- Hidden layer size

In [9]:
CUDA = torch.cuda.is_available()
device = 'cuda:0' if CUDA else 'cpu'

In [10]:
class ResBlock(nn.Module):

  def __init__(self, module):
    super(ResBlock, self).__init__()
    self.module = module

  def forward(self, x):
    return self.module(x) + x

In [11]:
class HiddenLayer(nn.Module):

  def __init__(self, layer_size, act_fn):
      super(HiddenLayer, self).__init__()
      
      if act_fn == 'ReLU':
        self.layer = nn.Sequential(
          nn.Linear(layer_size, layer_size),
          nn.ReLU())
      elif act_fn == 'LeakyReLU':
        self.layer = nn.Sequential(
          nn.Linear(layer_size, layer_size),
          nn.LeakyReLU())
      elif act_fn == 'ELU':
        self.layer = nn.Sequential(
          nn.Linear(layer_size, layer_size),
          nn.ELU())
    
  def forward(self, x):
    return self.layer(x)

In [12]:
class Net(nn.Module):

  def __init__(self, input_size, output_size, hidden_size, num_layers, act_fn):
    super(Net, self).__init__()
    self.input_size = input_size
    self.output_size = output_size
    self.hidden_size = hidden_size

    if act_fn == 'ReLU':
      self.initial_layer = nn.Sequential(
          nn.Linear(self.input_size, self.hidden_size),
          nn.ReLU())
    elif act_fn == 'LeakyReLU':
      self.initial_layer = nn.Sequential(
          nn.Linear(self.input_size, self.hidden_size),
          nn.LeakyReLU())
    elif act_fn == 'ELU':
      self.initial_layer = nn.Sequential(
          nn.Linear(self.input_size, self.hidden_size),
          nn.ELU())

    self.hidden_layers_list = []

    for i in range(num_layers // 2):
      self.hidden_layers_list.append(
          ResBlock(
            nn.Sequential(
                HiddenLayer(self.hidden_size, act_fn),
                HiddenLayer(self.hidden_size, act_fn)
            )
        )
      )

    self.hidden_layers = nn.Sequential(*self.hidden_layers_list)

    self.net = nn.Sequential(
        self.initial_layer,
        self.hidden_layers,
        nn.Linear(self.hidden_size, self.output_size)
    )
  
  def forward(self, x):
    return self.net(x)

In [13]:
@torch.no_grad()
def init_xuniform(m):
  if isinstance(m, nn.Linear):
    torch.nn.init.xavier_uniform_(m.weight)
    m.bias.data.fill_(0.01)

In [14]:
input_size = 7
output_size = 1
number_of_samples = 20

In [15]:
def train_cifar(config, checkpoint_dir=None):
    net = Net(input_size, output_size, config['hidden_size'], config['num_layers'], config['act_fn'])
    net = net.to(device)
    net.apply(init_xuniform)

    criterion = nn.MSELoss()
    optimizer = optim.Adam(net.parameters(), lr=config["lr"])

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainset, valset, testset = load_data()

    trainloader = torch.utils.data.DataLoader(
        trainset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        valset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)

    for epoch in range(25):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 50 == 49:  # print every 50 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as checkpoint_dir:
            path = os.path.join(checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps))
    print("Finished Training")

In [16]:
def test_accuracy(net, loss_fn):
    trainset, valset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    losses = []
    with torch.no_grad():
        for data in testloader:
            X, y = data
            X, y = X.to(device), y.to(device)
            out = net(X)
            loss = loss_fn(out, y.to(device))
            losses.append(loss.cpu().item())

    return np.array(losses).mean()

Apart from the networks actitectural parameters, we would like to find the optimal **learning rate** and **batch size**. 

In [17]:
config = {
    "hidden_size": tune.choice([400, 600, 800]),
    "num_layers": tune.choice([4, 6, 8]),
    "act_fn": tune.choice(["ReLU", "LeakyReLU", "ELU"]),
    "lr": tune.loguniform(5e-5, 1e-1),
    "batch_size": tune.choice([512, 1024, 2048])
}

In [18]:
scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=30,
        grace_period=1,
        reduction_factor=2)

In [19]:
reporter = CLIReporter(
        metric_columns=["loss", "training_iteration"])

In [20]:
result = tune.run(
        partial(train_cifar, checkpoint_dir=checkpoint_dir),
        resources_per_trial={"cpu": 2, "gpu": 1},
        config=config,
        num_samples=number_of_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

2022-04-08 08:53:44,456	INFO registry.py:70 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Current time: 2022-04-08 08:53:45 (running for 00:00:00.55)
Memory usage on this node: 1.6/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 16/20 (15 PENDING, 1 RUNNING)
+---------------------+----------+----------------+-----------+--------------+---------------+-------------+--------------+
| Trial name          | status   | loc            | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |
|---------------------+----------+----------------+-----------+--------------+---------------+-------------+--------------|
| DEFAULT_6578f_00000 | RUNNING  | 172.28.0.2:396 | LeakyReLU |         1024 |           600 | 0.0182931   |            4 |
| DEFAULT_6578f_00001 | PEN

[2m[36m(func pid=396)[0m   cpuset_checked))


[2m[36m(func pid=396)[0m [1,    50] loss: 2115.425
[2m[36m(func pid=396)[0m [1,   100] loss: 0.243
== Status ==
Current time: 2022-04-08 08:54:05 (running for 00:00:20.76)
Memory usage on this node: 3.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 17/20 (16 PENDING, 1 RUNNING)
+---------------------+----------+----------------+-----------+--------------+---------------+-------------+--------------+
| Trial name          | status   | loc            | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |
|---------------------+----------+----------------+-----------+--------------+---------------+-------------+--------------|
| DEFAULT_6578f_00000 | RUNNING  | 172.28.0.2:

[2m[36m(func pid=395)[0m   cpuset_checked))


[2m[36m(func pid=395)[0m [1,    50] loss: 0.744
[2m[36m(func pid=395)[0m [1,   100] loss: 0.004
== Status ==
Current time: 2022-04-08 08:57:05 (running for 00:03:20.82)
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: -6203.759912109375 | Iter 8.000: -0.0041494907950982455 | Iter 4.000: -0.004680000701919198 | Iter 2.000: -0.01526547733694315 | Iter 1.000: -0.013430363927036525
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 18/20 (16 PENDING, 1 RUNNING, 1 TERMINATED)
+---------------------+------------+----------------+-----------+--------------+---------------+-------------+--------------+---------+----------------------+
| Trial name          | status     | loc            | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |    loss |   training_iteration |
|------

[2m[36m(func pid=4998)[0m   cpuset_checked))


[2m[36m(func pid=4998)[0m [1,    50] loss: 0.199
== Status ==
Current time: 2022-04-08 09:01:48 (running for 00:08:03.81)
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: -3101.8799671239185 | Iter 8.000: -0.002087194006489881 | Iter 4.000: -0.0024061518876987973 | Iter 2.000: -0.007880495426652489 | Iter 1.000: -0.006788695454015396
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 19/20 (16 PENDING, 1 RUNNING, 2 TERMINATED)
+---------------------+------------+-----------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc             | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|---------------------+------------+---

[2m[36m(func pid=7258)[0m   cpuset_checked))


[2m[36m(func pid=7258)[0m [1,    50] loss: 0.327
[2m[36m(func pid=7258)[0m [1,   100] loss: 0.009
[2m[36m(func pid=7258)[0m [1,   150] loss: 0.003
[2m[36m(func pid=7258)[0m [1,   200] loss: 0.002
== Status ==
Current time: 2022-04-08 09:04:50 (running for 00:11:05.60)
Memory usage on this node: 3.2/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: -0.0001138351445843 | Iter 8.000: -0.00042640748026315125 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.0004955135163618252 | Iter 1.000: -0.000700614774832502
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (16 PENDING, 1 RUNNING, 3 TERMINATED)
+---------------------+------------+-----------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc      

[2m[36m(func pid=7385)[0m   cpuset_checked))


[2m[36m(func pid=7385)[0m [1,    50] loss: 2.135
[2m[36m(func pid=7385)[0m [1,   100] loss: 0.005
[2m[36m(func pid=7385)[0m [1,   150] loss: 0.001
== Status ==
Current time: 2022-04-08 09:05:06 (running for 00:11:21.54)
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 16.000: -0.0001138351445843 | Iter 8.000: -0.00042640748026315125 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.0004955135163618252 | Iter 1.000: -0.0021798154833959416
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (15 PENDING, 1 RUNNING, 4 TERMINATED)
+---------------------+------------+-----------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc             | act_fn    |   batch_size |   hidden_size |

[2m[36m(func pid=7817)[0m   cpuset_checked))


[2m[36m(func pid=7817)[0m [1,    50] loss: 2.617
[2m[36m(func pid=7817)[0m [1,   100] loss: 0.005
[2m[36m(func pid=7817)[0m [1,   150] loss: 0.001
[2m[36m(func pid=7817)[0m [1,   200] loss: 0.000
[2m[36m(func pid=7817)[0m [1,   250] loss: 0.000
== Status ==
Current time: 2022-04-08 09:06:03 (running for 00:12:18.52)
Memory usage on this node: 3.4/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 16.000: -0.0001138351445843 | Iter 8.000: -0.00042640748026315125 | Iter 4.000: -0.000222228152124444 | Iter 2.000: -0.00037842800549697136 | Iter 1.000: -0.000700614774832502
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (14 PENDING, 1 RUNNING, 5 TERMINATED)
+---------------------+------------+-----------------+-----------+--------------+---------------+-------------+--------------+----------------+-------------------

[2m[36m(func pid=10093)[0m   cpuset_checked))


[2m[36m(func pid=10093)[0m [1,    50] loss: 2206840376563962.000
[2m[36m(func pid=10093)[0m [1,   100] loss: 1360856457159.680
[2m[36m(func pid=10093)[0m [1,   150] loss: 2911889876.907
== Status ==
Current time: 2022-04-08 09:09:03 (running for 00:15:18.71)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 16.000: -8.20843495966983e-05 | Iter 8.000: -0.0002501159540406661 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.00026134249463211746 | Iter 1.000: -0.0005631910511874594
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (13 PENDING, 1 RUNNING, 6 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              |

[2m[36m(func pid=10218)[0m   cpuset_checked))


[2m[36m(func pid=10218)[0m [1,    50] loss: 57.069
== Status ==
Current time: 2022-04-08 09:09:20 (running for 00:15:35.25)
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=3
Bracket: Iter 16.000: -8.20843495966983e-05 | Iter 8.000: -0.0002501159540406661 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.00026134249463211746 | Iter 1.000: -0.000700614774832502
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (12 PENDING, 1 RUNNING, 7 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|---------------------+------

[2m[36m(func pid=10656)[0m   cpuset_checked))


[2m[36m(func pid=10656)[0m [1,    50] loss: 1.365
== Status ==
Current time: 2022-04-08 09:10:37 (running for 00:16:52.59)
Memory usage on this node: 3.0/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 16.000: -8.20843495966983e-05 | Iter 8.000: -0.0002501159540406661 | Iter 4.000: -0.000222228152124444 | Iter 2.000: -0.0002121642946440261 | Iter 1.000: -0.0006592976854881272
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (11 PENDING, 1 RUNNING, 8 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|---------------------+---------

[2m[36m(func pid=12946)[0m   cpuset_checked))


[2m[36m(func pid=12946)[0m [1,    50] loss: 0.219
== Status ==
Current time: 2022-04-08 09:14:18 (running for 00:20:33.69)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=4
Bracket: Iter 16.000: -5.033355460909661e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.00016298609465593473 | Iter 1.000: -0.0006179805961437524
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (10 PENDING, 1 RUNNING, 9 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|---------------------+-----

[2m[36m(func pid=13068)[0m   cpuset_checked))


[2m[36m(func pid=13068)[0m [1,    50] loss: 11536.638
[2m[36m(func pid=13068)[0m [1,   100] loss: 2.723
[2m[36m(func pid=13068)[0m [1,   150] loss: 1.258
== Status ==
Current time: 2022-04-08 09:14:34 (running for 00:20:49.74)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=5
Bracket: Iter 16.000: -5.033355460909661e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.00016298609465593473 | Iter 1.000: -0.0006592976854881272
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (9 PENDING, 1 RUNNING, 10 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   h

[2m[36m(func pid=13196)[0m   cpuset_checked))


[2m[36m(func pid=13196)[0m [1,    50] loss: 0.120
[2m[36m(func pid=13196)[0m [1,   100] loss: 0.003
== Status ==
Current time: 2022-04-08 09:14:50 (running for 00:21:05.69)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=6
Bracket: Iter 16.000: -5.033355460909661e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.00016298609465593473 | Iter 1.000: -0.000700614774832502
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (8 PENDING, 1 RUNNING, 11 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |

[2m[36m(func pid=13413)[0m   cpuset_checked))


[2m[36m(func pid=13413)[0m [1,    50] loss: 69.480
[2m[36m(func pid=13413)[0m [1,   100] loss: 0.045
[2m[36m(func pid=13413)[0m [1,   150] loss: 0.008
[2m[36m(func pid=13413)[0m [1,   200] loss: 0.004
[2m[36m(func pid=13413)[0m [1,   250] loss: 0.003
[2m[36m(func pid=13413)[0m [1,   300] loss: 0.002
[2m[36m(func pid=13413)[0m [1,   350] loss: 0.001
== Status ==
Current time: 2022-04-08 09:15:16 (running for 00:21:31.76)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=7
Bracket: Iter 16.000: -5.033355460909661e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.0001858417886251118 | Iter 1.000: -0.0006592976854881272
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (7 PENDING, 1 RUNNING, 12 TERMINATED)
+---------------------+------------+----------

[2m[36m(func pid=13550)[0m   cpuset_checked))


[2m[36m(func pid=13550)[0m [1,    50] loss: 0.202
== Status ==
Current time: 2022-04-08 09:15:33 (running for 00:21:48.41)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=8
Bracket: Iter 16.000: -5.033355460909661e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.0001858417886251118 | Iter 1.000: -0.000700614774832502
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (6 PENDING, 1 RUNNING, 13 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|---------------------+-------

[2m[36m(func pid=13693)[0m   cpuset_checked))


== Status ==
Current time: 2022-04-08 09:15:54 (running for 00:22:09.51)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=9
Bracket: Iter 16.000: -5.033355460909661e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.0001858417886251118 | Iter 1.000: -0.0010896633676020428
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (5 PENDING, 1 RUNNING, 14 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|---------------------+------------+------------------+-----------+--------------+

[2m[36m(func pid=13820)[0m   cpuset_checked))


== Status ==
Current time: 2022-04-08 09:16:16 (running for 00:22:31.44)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=10
Bracket: Iter 16.000: -5.033355460909661e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.0001858417886251118 | Iter 1.000: -0.0014787119603715836
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (4 PENDING, 1 RUNNING, 15 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|---------------------+------------+------------------+-----------+--------------

[2m[36m(func pid=13947)[0m   cpuset_checked))


[2m[36m(func pid=13947)[0m [1,    50] loss: 0.066
[2m[36m(func pid=13947)[0m [1,   100] loss: 0.001
[2m[36m(func pid=13947)[0m [1,   150] loss: 0.000
== Status ==
Current time: 2022-04-08 09:16:36 (running for 00:22:51.81)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=11
Bracket: Iter 16.000: -5.033355460909661e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.00019915150594897567 | Iter 2.000: -0.0001858417886251118 | Iter 1.000: -0.0025688640761654825
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (3 PENDING, 1 RUNNING, 16 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidde

[2m[36m(func pid=16276)[0m   cpuset_checked))


[2m[36m(func pid=16276)[0m [1,    50] loss: 5634350.236
== Status ==
Current time: 2022-04-08 09:19:39 (running for 00:25:54.97)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=11
Bracket: Iter 16.000: -3.642434121502447e-05 | Iter 8.000: -5.7675792049849406e-05 | Iter 4.000: -0.00016572728971368632 | Iter 2.000: -0.00016298609465593473 | Iter 1.000: -0.0014787119603715836
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (2 PENDING, 1 RUNNING, 17 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|-------------------

[2m[36m(func pid=16411)[0m   cpuset_checked))


[2m[36m(func pid=16411)[0m [1,    50] loss: 0.163
[2m[36m(func pid=16411)[0m [1,   100] loss: 0.002
[2m[36m(func pid=16411)[0m [1,   150] loss: 0.000
[2m[36m(func pid=16411)[0m [1,   200] loss: 0.000
== Status ==
Current time: 2022-04-08 09:20:00 (running for 00:26:15.83)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=12
Bracket: Iter 16.000: -3.642434121502447e-05 | Iter 8.000: -5.7675792049849406e-05 | Iter 4.000: -0.00016572728971368632 | Iter 2.000: -0.00016298609465593473 | Iter 1.000: -0.0025688640761654825
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (1 PENDING, 1 RUNNING, 18 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     

[2m[36m(func pid=17188)[0m   cpuset_checked))


== Status ==
Current time: 2022-04-08 09:21:23 (running for 00:27:38.78)
Memory usage on this node: 3.1/12.7 GiB
Using AsyncHyperBand: num_stopped=13
Bracket: Iter 16.000: -3.642434121502447e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.000132303073478397 | Iter 2.000: -0.00014097547202254645 | Iter 1.000: -0.0014787119603715836
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44
Number of trials: 20/20 (1 RUNNING, 19 TERMINATED)
+---------------------+------------+------------------+-----------+--------------+---------------+-------------+--------------+----------------+----------------------+
| Trial name          | status     | loc              | act_fn    |   batch_size |   hidden_size |          lr |   num_layers |           loss |   training_iteration |
|---------------------+------------+------------------+-----------+--------------+-----------

2022-04-08 09:21:33,185	INFO tune.py:639 -- Total run time: 1668.76 seconds (1668.07 seconds for the tuning loop).


Result for DEFAULT_6578f_00019:
  date: 2022-04-08_09-21-33
  done: true
  experiment_id: a8449a69be4448d28bfb43d19ba2373d
  hostname: 04103710846f
  iterations_since_restore: 1
  loss: 19.87030433654785
  node_ip: 172.28.0.2
  pid: 17188
  should_checkpoint: true
  time_since_restore: 15.653190612792969
  time_this_iter_s: 15.653190612792969
  time_total_s: 15.653190612792969
  timestamp: 1649409693
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: 6578f_00019
  
== Status ==
Current time: 2022-04-08 09:21:33 (running for 00:27:48.08)
Memory usage on this node: 2.9/12.7 GiB
Using AsyncHyperBand: num_stopped=14
Bracket: Iter 16.000: -3.642434121502447e-05 | Iter 8.000: -7.382442781818099e-05 | Iter 4.000: -0.000132303073478397 | Iter 2.000: -0.00014097547202254645 | Iter 1.000: -0.0025688640761654825
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/6.99 GiB heap, 0.0/3.49 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/DEFAULT_2022-04-08_08-53-44


In [21]:
res_df = result.results_df
res_df

  "Dataframes will use '/' instead of '.' to delimit "


Unnamed: 0_level_0,loss,time_this_iter_s,should_checkpoint,done,timesteps_total,episodes_total,training_iteration,experiment_id,date,timestamp,...,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,experiment_tag,config.hidden_size,config.num_layers,config.act_fn,config.lr,config.batch_size
trial_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6578f_00000,3129.306,7.088722,True,True,,,25,c2403b94f9fc4e1aa8c166c0da757b6b,2022-04-08_08-56-53,1649408213,...,172.28.0.2,185.159463,0,25,"0_act_fn=LeakyReLU,batch_size=1024,hidden_size...",600,4,LeakyReLU,0.018293,1024
6578f_00001,0.0001755282,11.586078,True,True,,,25,4102f55201eb447f886cbcfd49cf6dba,2022-04-08_09-01-38,1649408498,...,172.28.0.2,279.885255,0,25,"1_act_fn=LeakyReLU,batch_size=1024,hidden_size...",800,6,LeakyReLU,0.001481,1024
6578f_00002,2.309114e-05,6.689006,True,True,,,25,4f6e49b8c0f940e2be60682de53a34f6,2022-04-08_09-04-37,1649408677,...,172.28.0.2,173.042274,0,25,"2_act_fn=LeakyReLU,batch_size=1024,hidden_size...",400,6,LeakyReLU,8.6e-05,1024
6578f_00003,0.003659016,11.050263,True,True,,,1,9664d2f1cff347989894c560d084c62c,2022-04-08_09-04-54,1649408694,...,172.28.0.2,11.050263,0,1,"3_act_fn=ELU,batch_size=1024,hidden_size=600,l...",600,4,ELU,0.000446,1024
6578f_00004,0.0002453048,11.589147,True,True,,,4,4a8bc69df3934b22b667a620122ca910,2022-04-08_09-05-49,1649408749,...,172.28.0.2,49.214201,0,4,"4_act_fn=LeakyReLU,batch_size=512,hidden_size=...",600,6,LeakyReLU,0.002905,512
6578f_00005,0.000236525,6.844717,True,True,,,25,fafd69c349b94fb4b249a59cb188220a,2022-04-08_09-08-51,1649408931,...,172.28.0.2,176.528298,0,25,"5_act_fn=ReLU,batch_size=1024,hidden_size=400,...",400,6,ReLU,0.003986,1024
6578f_00006,418033500.0,11.726375,True,True,,,1,abfab65206f94f0485d89c5a197a0b2c,2022-04-08_09-09-08,1649408948,...,172.28.0.2,11.726375,0,1,"6_act_fn=ELU,batch_size=1024,hidden_size=400,l...",400,8,ELU,0.079688,1024
6578f_00007,0.0007365799,16.88197,True,True,,,4,9591b3c3c2d3462f9da9ba7404648e08,2022-04-08_09-10-25,1649409025,...,172.28.0.2,70.733349,0,4,"7_act_fn=ReLU,batch_size=512,hidden_size=800,l...",800,8,ReLU,0.004422,512
6578f_00008,0.0003147351,8.354793,True,True,,,25,9189b068770241ab90fa067a7ee233a5,2022-04-08_09-14-07,1649409247,...,172.28.0.2,215.672403,0,25,"8_act_fn=ReLU,batch_size=1024,hidden_size=800,...",800,4,ReLU,0.002434,1024
6578f_00009,0.004204668,9.33744,True,True,,,1,fdf01a6fa1434194b417a1a1a6fc934e,2022-04-08_09-14-22,1649409262,...,172.28.0.2,9.33744,0,1,"9_act_fn=ELU,batch_size=2048,hidden_size=400,l...",400,4,ELU,0.000757,2048


In [22]:
res_df.to_csv('../results/asha_trinomial_results.csv')

In [23]:
best_trial = result.get_best_trial("loss", "min", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))

Best trial config: {'hidden_size': 600, 'num_layers': 4, 'act_fn': 'LeakyReLU', 'lr': 0.00042745369696213476, 'batch_size': 1024}
Best trial final validation loss: 1.1033747832698281e-05


In [24]:
best_trained_model = Net(
    input_size, 
    output_size, 
    best_trial.config['hidden_size'], 
    best_trial.config['num_layers'], 
    best_trial.config['act_fn'])

In [25]:
best_trained_model = best_trained_model.to(device)
best_checkpoint_dir = best_trial.checkpoint.value
model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
best_trained_model.load_state_dict(model_state)

loss_fn = nn.MSELoss()
test_acc = test_accuracy(best_trained_model, loss_fn)
print("Best trial test set accuracy: {}".format(test_acc))

Best trial test set accuracy: 1.0951483459672485e-05


- [Pytorch Tutorial](https://pytorch.org/tutorials/beginner/hyperparameter_tuning_tutorial.html)
- [ASHA Paper](https://openreview.net/pdf?id=S1MAriC5F7)