In [1]:
import torch
import pytorch_lightning as pl
from torch.utils.data import DataLoader, random_split
from torch.nn import functional as F
from torchvision.datasets import MNIST
from torchvision import transforms
import os

In [129]:
import shutil
from tempfile import mkdtemp
from pytorch_lightning.callbacks import Callback
from pytorch_lightning.utilities.cloud_io import load as pl_load
import ray
from ray import tune
from ray.tune import CLIReporter, JupyterNotebookReporter
from ray.tune.schedulers import ASHAScheduler, PopulationBasedTraining
import numpy as np

In [130]:
ray.__version__,pl.__version__,torch.__version__

('0.9.0.dev0', '0.8.4', '1.5.1')

In [109]:
class LightningMNISTClassifier(pl.LightningModule):
    """
    This has been adapted from
    https://towardsdatascience.com/from-pytorch-to-pytorch-lightning-a-gentle-introduction-b371b7caaf09
    """

    def __init__(self, config, data_dir=None):
        super(LightningMNISTClassifier, self).__init__()

        self.data_dir = data_dir or os.getcwd()
        self.num_workers = config['num_workers']

        self.layer_1_size = config["layer_1_size"]
        self.layer_2_size = config["layer_2_size"]
        self.lr = config["lr"]
        self.batch_size = config["batch_size"]

        # mnist images are (1, 28, 28) (channels, width, height)
        self.layer_1 = torch.nn.Linear(28 * 28, self.layer_1_size)
        self.layer_2 = torch.nn.Linear(self.layer_1_size, self.layer_2_size)
        self.layer_3 = torch.nn.Linear(self.layer_2_size, 10)

    def forward(self, x):
        batch_size, channels, width, height = x.size()
        x = x.view(batch_size, -1)

        x = self.layer_1(x)
        x = torch.relu(x)

        x = self.layer_2(x)
        x = torch.relu(x)

        x = self.layer_3(x)
        x = torch.log_softmax(x, dim=1)

        return x

    def cross_entropy_loss(self, logits, labels):
        return F.nll_loss(logits, labels)

    def accuracy(self, logits, labels):
        _, predicted = torch.max(logits.data, 1)
        correct = (predicted == labels).sum().item()
        accuracy = correct / len(labels)
        return torch.tensor(accuracy)

    def training_step(self, train_batch, batch_idx):
        x, y = train_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        logs = {"train_loss": loss, "train_accuracy": accuracy}
        return {"loss": loss, "log": logs}

    def validation_step(self, val_batch, batch_idx):
        x, y = val_batch
        logits = self.forward(x)
        loss = self.cross_entropy_loss(logits, y)
        accuracy = self.accuracy(logits, y)

        return {"val_loss": loss, "val_accuracy": accuracy}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        avg_acc = torch.stack([x["val_accuracy"] for x in outputs]).mean()
        tensorboard_logs = {"val_loss": avg_loss, "val_accuracy": avg_acc}

        return {
            "avg_val_loss": avg_loss,
            "avg_val_accuracy": avg_acc,
            "log": tensorboard_logs
        }

    @staticmethod
    def download_data(data_dir):
        transform = transforms.Compose([
            transforms.ToTensor(),
            transforms.Normalize((0.1307, ), (0.3081, ))
        ])
        return MNIST(data_dir, train=True, download=True, transform=transform)

    def prepare_data(self):
        mnist_train = self.download_data(self.data_dir)

        self.mnist_train, self.mnist_val = random_split(
            mnist_train, [55000, 5000])

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=int(self.batch_size), num_workers=self.num_workers)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=int(self.batch_size), num_workers=self.num_workers)

    def configure_optimizers(self):
        opt = torch.optim.SGD(params=self.parameters(), lr=self.lr)
        sched = torch.optim.lr_scheduler.ExponentialLR(opt, gamma=.8)
        return [opt], [sched]

def train_mnist(config):
    model = LightningMNISTClassifier(config)
    trainer = pl.Trainer(max_epochs=5, show_progress_bar=True, gpus=1)
    trainer.fit(model)

In [110]:
config = {
    "layer_1_size": 128,
    "layer_2_size": 256,
    "lr": 1e-3,
    "batch_size": 64,
    "num_workers":4
}
train_mnist(config)

GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type   | Params
-----------------------------------
0 | layer_1 | Linear | 100 K 
1 | layer_2 | Linear | 33 K  
2 | layer_3 | Linear | 2 K   


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




In [128]:
# note type difference. a bunch of stuff (eg tensorboard,pytorch dataloader) 
# dont like when np.int64 passed instead of int.
ch1 = tune.choice([10,20,30])
ch2 = tune.choice(np.array([10,20,30], dtype=np.object))
type(ch1.func(_)), type(ch2.func(_))

(numpy.int64, int)

In [117]:
class TuneReportCallback(Callback):
    def on_validation_end(self, trainer, pl_module):
        tune.report(
            loss=trainer.callback_metrics["avg_val_loss"].item(),
            mean_accuracy=trainer.callback_metrics["avg_val_accuracy"].item()
        )

def train_mnist_tune(config):
    model = LightningMNISTClassifier(config, config["data_dir"])
    trainer = pl.Trainer(
        max_epochs=10,
        gpus=1,
        progress_bar_refresh_rate=0,
        callbacks=[TuneReportCallback()])

    trainer.fit(model)

def tune_mnist_asha(local_dir):
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)
    config = {
        "layer_1_size": tune.choice(np.array([32, 64, 128], dtype=np.object)), # to keep dtype as python int
        "layer_2_size": tune.choice(np.array([64, 128, 256], dtype=np.object)),
        "lr": tune.loguniform(1e-4, 1e-1),
        "batch_size": tune.choice(np.array([32, 64, 128], dtype=np.object)),
        "data_dir": data_dir,
        "num_workers":4
    }
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=10,
        grace_period=1,
        reduction_factor=2)
    reporter = JupyterNotebookReporter(overwrite=True,
        metric_columns=["loss", "mean_accuracy", "training_iteration"],
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"], # not yet in pip install ray[tune]
    )
    res = tune.run(
        train_mnist_tune,
        resources_per_trial={"cpu": 1, 'gpu':.2},
        config=config,
        local_dir=local_dir,
        num_samples=10,
        scheduler=scheduler,
        progress_reporter=reporter,
    )
    shutil.rmtree(data_dir)
    return res

In [38]:
ray.shutdown()
res = tune_mnist_asha(local_dir='./ray_mnist_asha/')

Trial name,status,loc,batch_size,layer_1_size,layer_2_size,lr,loss,mean_accuracy,training_iteration
train_mnist_tune_1ec51_00000,TERMINATED,,64,64,128,0.00188364,0.15708,0.959652,4
train_mnist_tune_1ec51_00001,TERMINATED,,64,128,64,0.000377129,0.205501,0.941258,1
train_mnist_tune_1ec51_00002,TERMINATED,,128,64,64,0.0118723,0.184885,0.957812,10
train_mnist_tune_1ec51_00003,TERMINATED,,64,128,256,0.00110605,0.122183,0.973497,10
train_mnist_tune_1ec51_00004,TERMINATED,,64,64,256,0.0459378,1.44557,0.432358,1
train_mnist_tune_1ec51_00005,TERMINATED,,128,32,256,0.000818334,0.210547,0.937109,1
train_mnist_tune_1ec51_00006,TERMINATED,,128,64,128,0.000203156,0.308827,0.90957,1
train_mnist_tune_1ec51_00007,TERMINATED,,32,128,256,0.000949311,0.140603,0.965764,4
train_mnist_tune_1ec51_00008,TERMINATED,,128,64,128,0.000148696,0.345566,0.902539,1
train_mnist_tune_1ec51_00009,TERMINATED,,64,128,64,0.00204752,0.120947,0.97231,10


In [123]:
%load_ext tensorboard
%tensorboard --logdir=./ray_mnist_asha/

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [40]:
class CheckpointCallback(Callback):
    def on_validation_end(self, trainer, pl_module):
        path = tune.make_checkpoint_dir(trainer.global_step)
        trainer.save_checkpoint(os.path.join(path, "checkpoint"))
        tune.save_checkpoint(path)

In [116]:
def train_mnist_tune_checkpoint(config, checkpoint=None):
    if checkpoint is None or len(checkpoint) == 0: # sometimes gets {} as input.
        checkpoint_path = None
    else:
        checkpoint_path = os.path.join(checkpoint, "checkpoint")

    trainer = pl.Trainer(
        max_epochs=10,
        gpus=1,
        progress_bar_refresh_rate=0,
        callbacks=[CheckpointCallback(),
                   TuneReportCallback(), 
                   pl.callbacks.LearningRateLogger()
                  ],
        resume_from_checkpoint=checkpoint_path)
    
    model = LightningMNISTClassifier(config)
    trainer.fit(model)

In [118]:
def tune_mnist_pbt(local_dir):
    data_dir = mkdtemp(prefix="mnist_data_")
    LightningMNISTClassifier.download_data(data_dir)
    config = {
        "layer_1_size": tune.choice(np.array([32, 64, 128], dtype=np.object)), # to keep dtype as python int
        "layer_2_size": tune.choice(np.array([64, 128, 256], dtype=np.object)),
        "lr": 1e-3,
        "batch_size": 64,
        "data_dir": data_dir,
        "num_workers": 3
    }
    scheduler = PopulationBasedTraining(
        time_attr="training_iteration",
        metric="loss",
        mode="min",
        perturbation_interval=3,
        hyperparam_mutations={
            "lr": lambda: tune.loguniform(1e-4, 1e-1).func(None),
            "batch_size": [32, 64, 128]
        })
    reporter = JupyterNotebookReporter(overwrite=False,
        metric_columns=["loss", "mean_accuracy", "training_iteration"],
        parameter_columns=["layer_1_size", "layer_2_size", "lr", "batch_size"],
        )
    res = tune.run(
        train_mnist_tune_checkpoint,
        resources_per_trial={"cpu": 1, 'gpu':.3},
        config=config,
        local_dir=local_dir,
        num_samples=3,
        scheduler=scheduler,
        progress_reporter=reporter)
                                       
    shutil.rmtree(data_dir)
    return res

In [127]:
res = tune_mnist_pbt(local_dir='./pbt_logs4/')

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /tmp/mnist_data_ctfm5foh/MNIST/raw/train-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /tmp/mnist_data_ctfm5foh/MNIST/raw/train-images-idx3-ubyte.gz to /tmp/mnist_data_ctfm5foh/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to /tmp/mnist_data_ctfm5foh/MNIST/raw/train-labels-idx1-ubyte.gz



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /tmp/mnist_data_ctfm5foh/MNIST/raw/train-labels-idx1-ubyte.gz to /tmp/mnist_data_ctfm5foh/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to /tmp/mnist_data_ctfm5foh/MNIST/raw/t10k-images-idx3-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /tmp/mnist_data_ctfm5foh/MNIST/raw/t10k-images-idx3-ubyte.gz to /tmp/mnist_data_ctfm5foh/MNIST/raw
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /tmp/mnist_data_ctfm5foh/MNIST/raw/t10k-labels-idx1-ubyte.gz


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Extracting /tmp/mnist_data_ctfm5foh/MNIST/raw/t10k-labels-idx1-ubyte.gz to /tmp/mnist_data_ctfm5foh/MNIST/raw
Processing...
Done!




Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size
train_mnist_tune_checkpoint_48500_00000,RUNNING,,64,256,0.001,64
train_mnist_tune_checkpoint_48500_00001,PENDING,,128,256,0.001,64
train_mnist_tune_checkpoint_48500_00002,PENDING,,32,64,0.001,64


[2m[36m(pid=23927)[0m Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /nvme_drive/orm/car_stuff/vroom_nbs/pbt_logs4/train_mnist_tune_checkpoint/train_mnist_tune_checkpoint_0_layer_1_size=64,layer_2_size=256_2020-07-01_20-16-32dbjkw02e/MNIST/raw/train-images-idx3-ubyte.gz
[2m[36m(pid=23927)[0m GPU available: True, used: True
[2m[36m(pid=23927)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=23927)[0m CUDA_VISIBLE_DEVICES: [0]
0it [00:00, ?it/s]7)[0m 
[2m[36m(pid=23929)[0m GPU available: True, used: True
[2m[36m(pid=23929)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=23929)[0m CUDA_VISIBLE_DEVICES: [0]
0it [00:00, ?it/s]9)[0m 
[2m[36m(pid=23931)[0m Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to /nvme_drive/orm/car_stuff/vroom_nbs/pbt_logs4/train_mnist_tune_checkpoint/train_mnist_tune_checkpoint_2_layer_1_size=32,layer_2_size=64_2020-07-01_20-16-32auazt2qt/MNIST/raw/train-images-idx3-ubyte.

0it [00:00, ?it/s]9)[0m 
[2m[36m(pid=23931)[0m Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /nvme_drive/orm/car_stuff/vroom_nbs/pbt_logs4/train_mnist_tune_checkpoint/train_mnist_tune_checkpoint_2_layer_1_size=32,layer_2_size=64_2020-07-01_20-16-32auazt2qt/MNIST/raw/t10k-labels-idx1-ubyte.gz
0it [00:00, ?it/s]1)[0m 
[2m[36m(pid=23929)[0m Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to /nvme_drive/orm/car_stuff/vroom_nbs/pbt_logs4/train_mnist_tune_checkpoint/train_mnist_tune_checkpoint_1_layer_1_size=128,layer_2_size=256_2020-07-01_20-16-32zmhj0sem/MNIST/raw/t10k-labels-idx1-ubyte.gz
8192it [00:00, 129102.06it/s]
[2m[36m(pid=23931)[0m Extracting /nvme_drive/orm/car_stuff/vroom_nbs/pbt_logs4/train_mnist_tune_checkpoint/train_mnist_tune_checkpoint_2_layer_1_size=32,layer_2_size=64_2020-07-01_20-16-32auazt2qt/MNIST/raw/t10k-labels-idx1-ubyte.gz to /nvme_drive/orm/car_stuff/vroom_nbs/pbt_logs4/train_mnist_tune_checkpoint/train

Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,RUNNING,10.0.0.207:23927,64,256,0.001,64,2.00295,0.586234,1.0
train_mnist_tune_checkpoint_48500_00001,RUNNING,,128,256,0.001,64,,,
train_mnist_tune_checkpoint_48500_00002,RUNNING,,32,64,0.001,64,,,


Result for train_mnist_tune_checkpoint_48500_00002:
  date: 2020-07-01_20-16-40
  done: false
  experiment_id: 1965d993bc2a4f139a6f13706511d9db
  experiment_tag: 2_layer_1_size=32,layer_2_size=64
  hostname: claustrophobia
  iterations_since_restore: 1
  loss: 2.076528549194336
  mean_accuracy: 0.4521360695362091
  node_ip: 10.0.0.207
  pid: 23931
  should_checkpoint: true
  time_since_restore: 7.542717218399048
  time_this_iter_s: 7.542717218399048
  time_total_s: 7.542717218399048
  timestamp: 1593649000
  timesteps_since_restore: 0
  training_iteration: 1
  trial_id: '48500_00002'
  
Result for train_mnist_tune_checkpoint_48500_00001:
  date: 2020-07-01_20-16-40
  done: false
  experiment_id: cc77c29d241a40029689ac6299af3267
  experiment_tag: 1_layer_1_size=128,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 1
  loss: 1.964975357055664
  mean_accuracy: 0.606210470199585
  node_ip: 10.0.0.207
  pid: 23929
  should_checkpoint: true
  time_since_restore: 7.57439

Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,RUNNING,10.0.0.207:23927,64,256,0.001,64,1.3656,0.740111,2
train_mnist_tune_checkpoint_48500_00001,RUNNING,10.0.0.207:23929,128,256,0.001,64,1.36715,0.738726,2
train_mnist_tune_checkpoint_48500_00002,RUNNING,10.0.0.207:23931,32,64,0.001,64,1.67813,0.54015,2


Result for train_mnist_tune_checkpoint_48500_00000:
  date: 2020-07-01_20-16-50
  done: false
  experiment_id: 38f740b14d3540afbd647ac72d6610f3
  experiment_tag: 0_layer_1_size=64,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 3
  loss: 0.9397650957107544
  mean_accuracy: 0.8039952516555786
  node_ip: 10.0.0.207
  pid: 23927
  should_checkpoint: true
  time_since_restore: 16.699106454849243
  time_this_iter_s: 4.637747526168823
  time_total_s: 16.699106454849243
  timestamp: 1593649010
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '48500_00000'
  


2020-07-01 20:16:50,471	INFO pbt.py:308 -- [pbt]: no checkpoint for trial. Skip exploit for Trial train_mnist_tune_checkpoint_48500_00002


Result for train_mnist_tune_checkpoint_48500_00002:
  date: 2020-07-01_20-16-50
  done: false
  experiment_id: 1965d993bc2a4f139a6f13706511d9db
  experiment_tag: 2_layer_1_size=32,layer_2_size=64
  hostname: claustrophobia
  iterations_since_restore: 3
  loss: 1.2985531091690063
  mean_accuracy: 0.701344907283783
  node_ip: 10.0.0.207
  pid: 23931
  should_checkpoint: true
  time_since_restore: 17.077626943588257
  time_this_iter_s: 4.731122970581055
  time_total_s: 17.077626943588257
  timestamp: 1593649010
  timesteps_since_restore: 0
  training_iteration: 3
  trial_id: '48500_00002'
  
Result for train_mnist_tune_checkpoint_48500_00001:
  date: 2020-07-01_20-16-50
  done: false
  experiment_id: cc77c29d241a40029689ac6299af3267
  experiment_tag: 1_layer_1_size=128,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 3
  loss: 0.9618237018585205
  mean_accuracy: 0.7972705960273743
  node_ip: 10.0.0.207
  pid: 23929
  should_checkpoint: true
  time_since_restore: 17.

Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,RUNNING,10.0.0.207:23927,64,256,0.001,64,0.75071,0.831685,4
train_mnist_tune_checkpoint_48500_00001,RUNNING,10.0.0.207:23929,128,256,0.001,64,0.961824,0.797271,3
train_mnist_tune_checkpoint_48500_00002,RUNNING,10.0.0.207:23931,32,64,0.001,64,1.29855,0.701345,3


Result for train_mnist_tune_checkpoint_48500_00000:
  date: 2020-07-01_20-16-59
  done: false
  experiment_id: 38f740b14d3540afbd647ac72d6610f3
  experiment_tag: 0_layer_1_size=64,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 5
  loss: 0.6553573608398438
  mean_accuracy: 0.8473101258277893
  node_ip: 10.0.0.207
  pid: 23927
  should_checkpoint: true
  time_since_restore: 26.240272760391235
  time_this_iter_s: 4.824969530105591
  time_total_s: 26.240272760391235
  timestamp: 1593649019
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '48500_00000'
  
Result for train_mnist_tune_checkpoint_48500_00002:
  date: 2020-07-01_20-16-59
  done: false
  experiment_id: 1965d993bc2a4f139a6f13706511d9db
  experiment_tag: 2_layer_1_size=32,layer_2_size=64
  hostname: claustrophobia
  iterations_since_restore: 5
  loss: 0.9121752381324768
  mean_accuracy: 0.768789529800415
  node_ip: 10.0.0.207
  pid: 23931
  should_checkpoint: true
  time_since_restore: 26.5

Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,RUNNING,10.0.0.207:23927,64,256,0.001,64,0.655357,0.84731,5
train_mnist_tune_checkpoint_48500_00001,RUNNING,10.0.0.207:23929,128,256,0.001,64,0.778442,0.822191,4
train_mnist_tune_checkpoint_48500_00002,RUNNING,10.0.0.207:23931,32,64,0.001,64,0.912175,0.76879,5


Result for train_mnist_tune_checkpoint_48500_00001:
  date: 2020-07-01_20-17-00
  done: false
  experiment_id: cc77c29d241a40029689ac6299af3267
  experiment_tag: 1_layer_1_size=128,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 5
  loss: 0.6852383017539978
  mean_accuracy: 0.8364319801330566
  node_ip: 10.0.0.207
  pid: 23929
  should_checkpoint: true
  time_since_restore: 26.69636106491089
  time_this_iter_s: 4.731658220291138
  time_total_s: 26.69636106491089
  timestamp: 1593649020
  timesteps_since_restore: 0
  training_iteration: 5
  trial_id: '48500_00001'
  
Result for train_mnist_tune_checkpoint_48500_00000:
  date: 2020-07-01_20-17-09
  done: false
  experiment_id: 38f740b14d3540afbd647ac72d6610f3
  experiment_tag: 0_layer_1_size=64,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 7
  loss: 0.5653228759765625
  mean_accuracy: 0.8633307218551636
  node_ip: 10.0.0.207
  pid: 23927
  should_checkpoint: true
  time_since_restore: 35.

Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,RUNNING,10.0.0.207:23927,64,256,0.001,64,0.565323,0.863331,7
train_mnist_tune_checkpoint_48500_00001,RUNNING,10.0.0.207:23929,128,256,0.001,64,0.630687,0.846321,6
train_mnist_tune_checkpoint_48500_00002,RUNNING,10.0.0.207:23931,32,64,0.001,64,0.82628,0.785997,6


Result for train_mnist_tune_checkpoint_48500_00002:
  date: 2020-07-01_20-17-09
  done: false
  experiment_id: 1965d993bc2a4f139a6f13706511d9db
  experiment_tag: 2_layer_1_size=32,layer_2_size=64
  hostname: claustrophobia
  iterations_since_restore: 7
  loss: 0.7707626819610596
  mean_accuracy: 0.795094907283783
  node_ip: 10.0.0.207
  pid: 23931
  should_checkpoint: true
  time_since_restore: 36.14347767829895
  time_this_iter_s: 4.820389270782471
  time_total_s: 36.14347767829895
  timestamp: 1593649029
  timesteps_since_restore: 0
  training_iteration: 7
  trial_id: '48500_00002'
  
Result for train_mnist_tune_checkpoint_48500_00001:
  date: 2020-07-01_20-17-09
  done: false
  experiment_id: cc77c29d241a40029689ac6299af3267
  experiment_tag: 1_layer_1_size=128,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 7
  loss: 0.5956544876098633
  mean_accuracy: 0.8544303774833679
  node_ip: 10.0.0.207
  pid: 23929
  should_checkpoint: true
  time_since_restore: 36.39

Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,RUNNING,10.0.0.207:23927,64,256,0.001,64,0.541735,0.866693,8
train_mnist_tune_checkpoint_48500_00001,RUNNING,10.0.0.207:23929,128,256,0.001,64,0.595654,0.85443,7
train_mnist_tune_checkpoint_48500_00002,RUNNING,10.0.0.207:23931,32,64,0.001,64,0.732932,0.804193,8


Result for train_mnist_tune_checkpoint_48500_00000:
  date: 2020-07-01_20-17-18
  done: false
  experiment_id: 38f740b14d3540afbd647ac72d6610f3
  experiment_tag: 0_layer_1_size=64,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 9
  loss: 0.5251153111457825
  mean_accuracy: 0.8684731125831604
  node_ip: 10.0.0.207
  pid: 23927
  should_checkpoint: true
  time_since_restore: 45.297504901885986
  time_this_iter_s: 4.816346645355225
  time_total_s: 45.297504901885986
  timestamp: 1593649038
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '48500_00000'
  


2020-07-01 20:17:19,137	INFO pbt.py:78 -- [explore] perturbed config from {'layer_1_size': 64, 'layer_2_size': 256, 'lr': 0.001, 'batch_size': 64, 'data_dir': '/tmp/mnist_data_ctfm5foh', 'num_workers': 3} -> {'layer_1_size': 64, 'layer_2_size': 256, 'lr': 0.0008, 'batch_size': 32, 'data_dir': '/tmp/mnist_data_ctfm5foh', 'num_workers': 3}
2020-07-01 20:17:19,138	INFO pbt.py:316 -- [exploit] transferring weights from trial train_mnist_tune_checkpoint_48500_00000 (score -0.5251153111457825) -> train_mnist_tune_checkpoint_48500_00002 (score -0.706142246723175)


Result for train_mnist_tune_checkpoint_48500_00002:
  date: 2020-07-01_20-17-19
  done: false
  experiment_id: 1965d993bc2a4f139a6f13706511d9db
  experiment_tag: 2_layer_1_size=32,layer_2_size=64@perturbed[batch_size=32,lr=0.0008]
  hostname: claustrophobia
  iterations_since_restore: 9
  loss: 0.706142246723175
  mean_accuracy: 0.8085442781448364
  node_ip: 10.0.0.207
  pid: 23931
  should_checkpoint: true
  time_since_restore: 45.74351263046265
  time_this_iter_s: 4.86440372467041
  time_total_s: 45.74351263046265
  timestamp: 1593649039
  timesteps_since_restore: 0
  training_iteration: 9
  trial_id: '48500_00002'
  
Result for train_mnist_tune_checkpoint_48500_00001:
  date: 2020-07-01_20-17-19
  done: false
  experiment_id: cc77c29d241a40029689ac6299af3267
  experiment_tag: 1_layer_1_size=128,layer_2_size=256
  hostname: claustrophobia
  iterations_since_restore: 9
  loss: 0.5547235608100891
  mean_accuracy: 0.8619462251663208
  node_ip: 10.0.0.207
  pid: 23929
  should_checkpoint

Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,RUNNING,10.0.0.207:23927,64,256,0.001,64,0.525115,0.868473,9
train_mnist_tune_checkpoint_48500_00001,RUNNING,10.0.0.207:23929,128,256,0.001,64,0.554724,0.861946,9
train_mnist_tune_checkpoint_48500_00002,RUNNING,10.0.0.207:23931,64,256,0.0008,32,0.706142,0.808544,9


[2m[36m(pid=23955)[0m 2020-07-01 20:17:20,140	INFO trainable.py:464 -- Restored on 10.0.0.207 from checkpoint: /nvme_drive/orm/car_stuff/vroom_nbs/pbt_logs4/train_mnist_tune_checkpoint/train_mnist_tune_checkpoint_2_layer_1_size=32,layer_2_size=64_2020-07-01_20-16-32auazt2qt/checkpoint_default/./
[2m[36m(pid=23955)[0m 2020-07-01 20:17:20,140	INFO trainable.py:471 -- Current state after restoring: {'_iteration': 9, '_timesteps_total': None, '_time_total': 45.297504901885986, '_episodes_total': None}
[2m[36m(pid=23955)[0m GPU available: True, used: True
[2m[36m(pid=23955)[0m TPU available: False, using: 0 TPU cores
[2m[36m(pid=23955)[0m CUDA_VISIBLE_DEVICES: [0]
[2m[36m(pid=23955)[0m 
[2m[36m(pid=23955)[0m   | Name    | Type   | Params
[2m[36m(pid=23955)[0m -----------------------------------
[2m[36m(pid=23955)[0m 0 | layer_1 | Linear | 50 K  
[2m[36m(pid=23955)[0m 1 | layer_2 | Linear | 16 K  
[2m[36m(pid=23955)[0m 2 | layer_3 | Linear | 2 K   
Result for

Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,TERMINATED,,64,256,0.001,64,0.513038,0.870451,10
train_mnist_tune_checkpoint_48500_00001,TERMINATED,,128,256,0.001,64,0.542287,0.863924,10
train_mnist_tune_checkpoint_48500_00002,RUNNING,10.0.0.207:23955,64,256,0.0008,32,0.513218,0.86664,10


Trial name,status,loc,layer_1_size,layer_2_size,lr,batch_size,loss,mean_accuracy,training_iteration
train_mnist_tune_checkpoint_48500_00000,TERMINATED,,64,256,0.001,64,0.513038,0.870451,10
train_mnist_tune_checkpoint_48500_00001,TERMINATED,,128,256,0.001,64,0.542287,0.863924,10
train_mnist_tune_checkpoint_48500_00002,TERMINATED,,64,256,0.0008,32,0.513218,0.86664,10


In [126]:
%reload_ext tensorboard
%tensorboard --logdir ./pbt_logs4/

In [125]:
!pkill -9 tensorboard