In [None]:
!pip install -U ray[tune]

In [1]:
from functools import partial
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler
from sklearn.model_selection import train_test_split
from copy import deepcopy

from data import get_data

changes:
- data.py: changed the len of the custom CIFAR10 and CIFAR100 datasets to have the correct len after train-val split for tuning

In [2]:
print(torch.cuda.get_device_name(0))
torch.cuda.device_count()

Tesla V100-SXM2-16GB


2

In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [37]:
datapath = os.path.join("data")
dataset_name = "cifar10"
noise_mode = "sym"
p = 0.4
custom_noise = False
make_new_custom_noise = False
seed = 123
batch_size = 128
n_epochs = 300
lr = 0.001
mo = False
lc_n_epoch = -1
val_size = 0.1 #10% of ciafr10 or cifar100 is 5k images (same as in paper)
use_n_cpus = 4
use_n_gpus_per_trial = 2
val_size = 0.1

# hp
config = {"sigma": tune.grid_search([0.1, 0.2, 0.5, 1.0])}

reporter = CLIReporter(
    # parameter_columns=["sigma"],
    metric_columns=["loss", "accuracy", "training_iteration"])

result = tune.run(
    tune.with_parameters(train_hp, 
                         checkpoint_dir=None, datapath=datapath, dataset_name=dataset_name, 
                         noise_mode=noise_mode, p=p, custom_noise=custom_noise,
                         make_new_custom_noise=make_new_custom_noise,
                         seed=seed, batch_size=batch_size, n_epochs=n_epochs, 
                         lr=lr, mo=mo, lc_n_epoch=lc_n_epoch, val_size=val_size),
    resources_per_trial={"cpu": use_n_cpus, "gpu": use_n_gpus_per_trial},
    config=config,
    num_samples=1,
    progress_reporter=reporter
)

best_trial = result.get_best_trial("accuracy", "max", "last")
print("Best trial config: {}".format(best_trial.config))
print("Best trial final validation accurac: {}".format(
    best_trial.last_result["accuracy"]))
print("Best trial final validation loss: {}".format(
    best_trial.last_result["loss"]))

== Status ==
Current time: 2021-12-02 21:09:00 (running for 00:00:00.12)
Memory usage on this node: 4.2/14.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 256.000: None | Iter 128.000: None | Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/4 CPUs, 0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/train_hp_2021-12-02_21-09-00
Number of trials: 4/4 (4 PENDING)
+----------------------+----------+-------+---------+
| Trial name           | status   | loc   |   sigma |
|----------------------+----------+-------+---------|
| train_hp_1229d_00000 | PENDING  |       |     0.1 |
| train_hp_1229d_00001 | PENDING  |       |     0.2 |
| train_hp_1229d_00002 | PENDING  |       |     0.5 |
| train_hp_1229d_00003 | PENDING  |       |     1   |
+----------------------+----------+-------+---------+


[2m[36m(Impli

  0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 33792/170498071 [00:00<09:21, 303600.83it/s]
  0%|          | 197632/170498071 [00:00<02:49, 1006379.91it/s]
  1%|          | 852992/170498071 [00:00<00:52, 3239120.49it/s]
  2%|▏         | 3146752/170498071 [00:00<00:16, 10121030.33it/s]
  4%|▎         | 6210560/170498071 [00:00<00:10, 16375043.46it/s]
  6%|▌         | 9552896/170498071 [00:00<00:07, 21255371.30it/s]
  8%|▊         | 12928000/170498071 [00:00<00:06, 25016666.33it/s]
 10%|▉         | 16434176/170498071 [00:00<00:05, 28066985.80it/s]
 12%|█▏        | 20448256/170498071 [00:00<00:04, 31657130.40it/s]
 14%|█▍        | 24380416/170498071 [00:01<00:04, 33949069.08it/s]
 17%|█▋        | 28460032/170498071 [00:01<00:03, 35959926.43it/s]
 19%|█▉        | 32867328/170498071 [00:01<00:03, 38381045.59it/s]
 22%|██▏       | 37468160/170498071 [00:01<00:03, 40670183.66it/s]
 25%|██▍       | 42107904/170498071 [00:01<00:03, 42245775.25it/s]
 28%|██▊       | 47093760/1704

== Status ==
Current time: 2021-12-02 21:09:05 (running for 00:00:05.17)
Memory usage on this node: 4.7/14.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 256.000: None | Iter 128.000: None | Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 4.0/4 CPUs, 2.0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/train_hp_2021-12-02_21-09-00
Number of trials: 4/4 (3 PENDING, 1 RUNNING)
+----------------------+----------+-------------------+---------+
| Trial name           | status   | loc               |   sigma |
|----------------------+----------+-------------------+---------|
| train_hp_1229d_00000 | RUNNING  | 10.138.0.16:24340 |     0.1 |
| train_hp_1229d_00001 | PENDING  |                   |     0.2 |
| train_hp_1229d_00002 | PENDING  |                   |     0.5 |
| train_hp_1229d_00003 | PENDING  |   

 58%|█████▊    | 99166208/170498071 [00:02<00:01, 51372892.60it/s]
 61%|██████    | 104305664/170498071 [00:02<00:01, 51283813.64it/s]
 64%|██████▍   | 109434880/170498071 [00:02<00:01, 50916467.66it/s]
 67%|██████▋   | 114632704/170498071 [00:02<00:01, 51231496.95it/s]
 70%|███████   | 119756800/170498071 [00:02<00:00, 51136910.42it/s]
 74%|███████▎  | 125322240/170498071 [00:03<00:00, 52367826.81it/s]
 77%|███████▋  | 130560000/170498071 [00:03<00:00, 51346964.08it/s]
 80%|███████▉  | 136233984/170498071 [00:03<00:00, 52041111.31it/s]
 84%|████████▎ | 142443520/170498071 [00:03<00:00, 54978504.15it/s]
 87%|████████▋ | 147964928/170498071 [00:03<00:00, 55047504.87it/s]
 90%|█████████ | 153476096/170498071 [00:03<00:00, 54300569.78it/s]
 93%|█████████▎| 158912512/170498071 [00:03<00:00, 54249945.99it/s]
 96%|█████████▋| 164529152/170498071 [00:03<00:00, 54655749.52it/s]
170499072it [00:03, 43693413.30it/s]                               


[2m[36m(ImplicitFunc pid=24340)[0m Extracting data/cifar10/cifar-10-python.tar.gz to data/cifar10
[2m[36m(ImplicitFunc pid=24340)[0m Files already downloaded and verified
[2m[36m(ImplicitFunc pid=24340)[0m Files already downloaded and verified
== Status ==
Current time: 2021-12-02 21:09:11 (running for 00:00:11.15)
Memory usage on this node: 5.0/14.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 256.000: None | Iter 128.000: None | Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 4.0/4 CPUs, 2.0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/train_hp_2021-12-02_21-09-00
Number of trials: 4/4 (3 PENDING, 1 RUNNING)
+----------------------+----------+-------------------+---------+
| Trial name           | status   | loc               |   sigma |
|----------------------+----------+--------------

[2m[36m(pid=24340)[0m 2021-12-02 21:09:12,449	ERROR function_runner.py:268 -- Runner Thread raised error.
[2m[36m(pid=24340)[0m Traceback (most recent call last):
[2m[36m(pid=24340)[0m   File "/opt/conda/lib/python3.7/site-packages/ray/tune/function_runner.py", line 262, in run
[2m[36m(pid=24340)[0m     self._entrypoint()
[2m[36m(pid=24340)[0m   File "/opt/conda/lib/python3.7/site-packages/ray/tune/function_runner.py", line 331, in entrypoint
[2m[36m(pid=24340)[0m     self._status_reporter.get_checkpoint())
[2m[36m(pid=24340)[0m   File "/opt/conda/lib/python3.7/site-packages/ray/util/tracing/tracing_helper.py", line 451, in _resume_span
[2m[36m(pid=24340)[0m     return method(self, *_args, **_kwargs)
[2m[36m(pid=24340)[0m   File "/opt/conda/lib/python3.7/site-packages/ray/tune/function_runner.py", line 599, in _trainable_func
[2m[36m(pid=24340)[0m     output = fn()
[2m[36m(pid=24340)[0m   File "/opt/conda/lib/python3.7/site-packages/ray/tune/utils/train

Result for train_hp_1229d_00000:
  date: 2021-12-02_21-09-02
  experiment_id: 2835ba092e044345a6afefc0404b2017
  hostname: nb-test
  node_ip: 10.138.0.16
  pid: 24340
  timestamp: 1638479342
  trial_id: 1229d_00000
  
== Status ==
Current time: 2021-12-02 21:09:12 (running for 00:00:11.99)
Memory usage on this node: 4.8/14.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 256.000: None | Iter 128.000: None | Iter 64.000: None | Iter 32.000: None | Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/4 CPUs, 0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/train_hp_2021-12-02_21-09-00
Number of trials: 4/4 (1 ERROR, 3 PENDING)
+----------------------+----------+-------------------+---------+
| Trial name           | status   | loc               |   sigma |
|----------------------+----------+-------------------+---------|
| train_hp_1229d_00001 |

2021-12-02 21:09:12,758	ERROR tune.py:626 -- Trials did not complete: [train_hp_1229d_00000, train_hp_1229d_00001, train_hp_1229d_00002, train_hp_1229d_00003]
2021-12-02 21:09:12,758	INFO tune.py:630 -- Total run time: 12.20 seconds (11.98 seconds for the tuning loop).


AttributeError: 'NoneType' object has no attribute 'config'

In [34]:
len(val_dataloader.dataset)

5000

In [35]:
def train_hp(config, checkpoint_dir, datapath, dataset_name, noise_mode, p, custom_noise, make_new_custom_noise, seed, batch_size, n_epochs, lr, 
             mo, lc_n_epoch, val_size):
    print("config")
    print(config)
    print("config")
    
    train_dataset, _, indices_noisy, noise_rules, test_dataset = get_data(
        dataset_name=dataset_name,
        datapath=datapath,
        noise_mode=noise_mode, 
        p=p,
        custom_noise=custom_noise,
        make_new_custom_noise=make_new_custom_noise,
        seed=seed
    )

    print(np.unique(train_dataset.targets, return_counts=True))
    val_dataset = deepcopy(train_dataset)
    X_train, X_val, y_train, y_val = train_test_split(train_dataset.data, train_dataset.targets, test_size=val_size, stratify=train_dataset.targets, random_state=seed)
    train_dataset.data, train_dataset.targets = X_train, y_train
    val_dataset.data, val_dataset.targets = X_val, y_val

    print(np.unique(y_train, return_counts=True))
    print(np.unique(y_val, return_counts=True))

    # get number of classes
    n_classes = len(list(train_dataset.class_to_idx.keys()))
    # make targets one-hot (easier to handle in lc and sln), targets_one_hot used in lc
    targets = train_dataset.targets
    targets_one_hot, train_dataset.targets = np.eye(n_classes)[targets], np.eye(n_classes)[targets]
    targets_val = val_dataset.targets
    val_dataset.targets = np.eye(n_classes)[targets_val]
    # train_dataloader is modified if lc is used
    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    # train_eval_dataloader is never modified, and is used to compute the loss weights for lc
    train_eval_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False, num_workers=2)
    # val_dataloader
    val_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True, num_workers=2)

    # get models for naive and ema (depends on dataset)
    model_name = "wrn-28-2" if dataset_name in ["cifar10", "cifar100"] else "MODEL_NAME_FOR_CLOTHING1M"
    model = get_model(model_name=model_name, n_classes=n_classes, device=device)
    # if multi gpu
    if device == "cuda":
        if 1 < torch.cuda.device_count():
            model = torch.nn.DataParallel(model)
    model.to(device)
    # optimizer for model
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    # ema model (MO)
    model_ema = get_model(model_name=model_name, n_classes=n_classes, device=device) if mo else None
    if model_ema:
        # no grads for model_ema
        for param in model_ema.parameters():
            param.detach_()
        # if multi gpu
        if device == "cuda":
            if 1 < torch.cuda.device_count():
                model_ema = torch.nn.DataParallel(model_ema)
        model_ema.to(device)
        # ema model optimizer
        optimizer_ema = WeightEMA(model, model_ema, alpha=0.999)
    else:
        optimizer_ema = None
    
    sigma = deepcopy(config["sigma"])
    
    # start experiment
    for n_epoch in range(1, n_epochs+1):
        # label-correction
        # if SLN-MO-LC model
        if model_ema and 0 < lc_n_epoch and lc_n_epoch <= n_epoch:
            # set sigma to 0, no more stochastic label noise as lc starts
            sigma = 0
            # keep targets one hot through lc
            losses, softmaxes = \
                get_lc_params(model_ema=model_ema, train_eval_dataloader=train_eval_dataloader, device=device, n_epoch=n_epoch, n_epochs=n_epochs)
            # normalize to [0.0, 1.0]
            weights = torch.reshape((losses - torch.min(losses)) / (torch.max(losses) - torch.min(losses)), (len(train_dataloader.dataset), 1))
            weights = weights.numpy()
            preds = np.argmax(softmaxes.numpy(), axis=1).tolist()
            preds_one_hot = np.eye(n_classes)[preds]
            # do lc and reload training data (targets_one_hot fixed variable from above)
            targets_one_hot_lc = weights*targets_one_hot + (1-weights)*preds_one_hot
            train_dataset.targets = targets_one_hot_lc
            train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
        # train
        loss_epoch, accuracy_epoch, loss_noisy_epoch, loss_clean_epoch = train(
            model=model, 
            device=device,
            train_dataloader=train_dataloader,
            optimizer=optimizer,
            optimizer_ema=optimizer_ema,
            sigma=sigma,
            n_classes=n_classes,
            n_epoch=n_epoch,
            n_epochs=n_epochs, 
            indices_noisy=indices_noisy
        )
        
        # if SLN-MO or SLN-MO-LC model, test with EMA model
        if optimizer_ema:
            loss_val, accuracy_val = test(
                model=model_ema, 
                device=device,
                test_dataloader=val_dataloader,
                n_epoch=n_epoch,
                n_epochs=n_epochs)

        # if CE or SLN model, test with model
        else:
            loss_val, accuracy_val = test(
                model=model, 
                device=device,
                test_dataloader=val_dataloader,
                n_epoch=n_epoch,
                n_epochs=n_epochs)
        
        tune.report(loss=loss_val, accuracy=accuracy_val)


In [38]:
os.path.dirname(os.path.realpath(__file__))

NameError: name '__file__' is not defined

In [9]:
class Net(nn.Module):
    def __init__(self, l1=120, l2=84):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 6, 5)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(6, 16, 5)
        self.fc1 = nn.Linear(16 * 5 * 5, l1)
        self.fc2 = nn.Linear(l1, l2)
        self.fc3 = nn.Linear(l2, 10)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 5 * 5)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

def load_data(data_dir="./data"):
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform)

    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform)

    return trainset, testset

def main(num_samples=10, max_num_epochs=10, gpus_per_trial=2):
    data_dir = os.path.abspath("./data")
    load_data(data_dir)
    config = {
        "l1": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "l2": tune.sample_from(lambda _: 2 ** np.random.randint(2, 9)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice([2, 4, 8, 16])
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        # parameter_columns=["l1", "l2", "lr", "batch_size"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    result = tune.run(
        partial(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = Net(best_trial.config["l1"], best_trial.config["l2"])
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    model_state, optimizer_state = torch.load(os.path.join(
        best_checkpoint_dir, "checkpoint"))
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))

    
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    net = Net(config["l1"], config["l2"])

    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=config["lr"], momentum=0.9)

    trainset, testset = load_data(data_dir)

    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)

    for epoch in range(10):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader, 0):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                running_loss = 0.0

        # Validation loss
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        for i, data in enumerate(valloader, 0):
            with torch.no_grad():
                inputs, labels = data
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs.data, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                loss = criterion(outputs, labels)
                val_loss += loss.cpu().numpy()
                val_steps += 1


        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")
    
def test_accuracy(net, device="cpu"):
    trainset, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return correct / total

In [10]:
main(num_samples=2, max_num_epochs=2, gpus_per_trial=2)

Files already downloaded and verified
Files already downloaded and verified


2021-12-02 18:03:12,117	INFO registry.py:70 -- Detected unknown callable for trainable. Converting to class.


== Status ==
Current time: 2021-12-02 18:03:12 (running for 00:00:00.23)
Memory usage on this node: 2.5/14.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 0/4 CPUs, 0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/DEFAULT_2021-12-02_18-03-12
Number of trials: 2/2 (2 PENDING)
+---------------------+----------+-------+--------------+------+------+------------+
| Trial name          | status   | loc   |   batch_size |   l1 |   l2 |         lr |
|---------------------+----------+-------+--------------+------+------+------------|
| DEFAULT_1d324_00000 | PENDING  |       |            2 |   32 |   64 | 0.0391349  |
| DEFAULT_1d324_00001 | PENDING  |       |            8 |   64 |  128 | 0.00469632 |
+---------------------+----------+-------+--------------+------+------+------------+


== Status ==
Current time: 2021-12-02 18:03:17 (running for 00:00:05.27)
Memor

[2m[36m(pid=27651)[0m   cpuset_checked))


== Status ==
Current time: 2021-12-02 18:03:23 (running for 00:00:11.26)
Memory usage on this node: 5.3/14.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 2.0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/DEFAULT_2021-12-02_18-03-12
Number of trials: 2/2 (1 PENDING, 1 RUNNING)
+---------------------+----------+-------------------+--------------+------+------+------------+
| Trial name          | status   | loc               |   batch_size |   l1 |   l2 |         lr |
|---------------------+----------+-------------------+--------------+------+------+------------|
| DEFAULT_1d324_00000 | RUNNING  | 10.138.0.16:27651 |            2 |   32 |   64 | 0.0391349  |
| DEFAULT_1d324_00001 | PENDING  |                   |            8 |   64 |  128 | 0.00469632 |
+---------------------+----------+-------------------+--------------+------+------+-------

[2m[36m(pid=27651)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


== Status ==
Current time: 2021-12-02 18:03:28 (running for 00:00:16.28)
Memory usage on this node: 6.5/14.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/4 CPUs, 2.0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/DEFAULT_2021-12-02_18-03-12
Number of trials: 2/2 (1 PENDING, 1 RUNNING)
+---------------------+----------+-------------------+--------------+------+------+------------+
| Trial name          | status   | loc               |   batch_size |   l1 |   l2 |         lr |
|---------------------+----------+-------------------+--------------+------+------+------------|
| DEFAULT_1d324_00000 | RUNNING  | 10.138.0.16:27651 |            2 |   32 |   64 | 0.0391349  |
| DEFAULT_1d324_00001 | PENDING  |                   |            8 |   64 |  128 | 0.00469632 |
+---------------------+----------+-------------------+--------------+------+------+-------

[2m[36m(pid=27652)[0m   cpuset_checked))


== Status ==
Current time: 2021-12-02 18:07:49 (running for 00:04:37.55)
Memory usage on this node: 5.9/14.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: None | Iter 1.000: -2.359835350322723
Resources requested: 2.0/4 CPUs, 2.0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/DEFAULT_2021-12-02_18-03-12
Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)
+---------------------+------------+-------------------+--------------+------+------+------------+--------+------------+----------------------+
| Trial name          | status     | loc               |   batch_size |   l1 |   l2 |         lr |   loss |   accuracy |   training_iteration |
|---------------------+------------+-------------------+--------------+------+------+------------+--------+------------+----------------------|
| DEFAULT_1d324_00001 | RUNNING    | 10.138.0.16:27652 |            8 |   64 |  128 | 0.00469632 |        |            |      

[2m[36m(pid=27652)[0m   return torch.max_pool2d(input, kernel_size, stride, padding, dilation, ceil_mode)


== Status ==
Current time: 2021-12-02 18:07:54 (running for 00:04:42.56)
Memory usage on this node: 6.5/14.7 GiB
Using AsyncHyperBand: num_stopped=1
Bracket: Iter 2.000: None | Iter 1.000: -2.359835350322723
Resources requested: 2.0/4 CPUs, 2.0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/DEFAULT_2021-12-02_18-03-12
Number of trials: 2/2 (1 RUNNING, 1 TERMINATED)
+---------------------+------------+-------------------+--------------+------+------+------------+--------+------------+----------------------+
| Trial name          | status     | loc               |   batch_size |   l1 |   l2 |         lr |   loss |   accuracy |   training_iteration |
|---------------------+------------+-------------------+--------------+------+------+------------+--------+------------+----------------------|
| DEFAULT_1d324_00001 | RUNNING    | 10.138.0.16:27652 |            8 |   64 |  128 | 0.00469632 |        |            |      

2021-12-02 18:08:59,925	INFO tune.py:630 -- Total run time: 347.81 seconds (347.56 seconds for the tuning loop).


Result for DEFAULT_1d324_00001:
  accuracy: 0.5108
  date: 2021-12-02_18-08-59
  done: true
  experiment_id: 60c450ce341e46569d7d0d8f0381d834
  hostname: nb-test
  iterations_since_restore: 2
  loss: 1.376428749513626
  node_ip: 10.138.0.16
  pid: 27652
  time_since_restore: 75.84299778938293
  time_this_iter_s: 35.00030064582825
  time_total_s: 75.84299778938293
  timestamp: 1638468539
  timesteps_since_restore: 0
  training_iteration: 2
  trial_id: 1d324_00001
  
== Status ==
Current time: 2021-12-02 18:08:59 (running for 00:05:47.57)
Memory usage on this node: 5.8/14.7 GiB
Using AsyncHyperBand: num_stopped=2
Bracket: Iter 2.000: None | Iter 1.000: -1.9005813336610793
Resources requested: 0/4 CPUs, 0/2 GPUs, 0.0/7.76 GiB heap, 0.0/3.88 GiB objects (0.0/1.0 accelerator_type:V100)
Result logdir: /home/jupyter/ray_results/DEFAULT_2021-12-02_18-03-12
Number of trials: 2/2 (2 TERMINATED)
+---------------------+------------+-------------------+--------------+------+------+------------+----

TypeError: expected str, bytes or os.PathLike object, not NoneType