In [2]:
import torch
import torch.nn as nn
import torchvision
import pytorch_lightning as pl
from torch.nn import functional as F
import torchmetrics
import wandb
from torchvision.datasets import CIFAR10
from torch.utils.data import DataLoader, random_split

In [3]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning import Trainer

wandb_logger = WandbLogger(project="f5611")

[34m[1mwandb[0m: Currently logged in as: [33mpkantek[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
class CIFAR10DataModule(pl.LightningDataModule):
    def __init__(self, data_dir='../data', batch_size=64):
        super().__init__()
        self.data_dir = data_dir
        self.batch_size = batch_size
        self.transform = transform = torchvision.transforms.Compose([
            torchvision.transforms.ToTensor(),
            torchvision.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
        ])

    def prepare_data(self):
        CIFAR10(self.data_dir, train=True, download=True)
        CIFAR10(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        if stage == 'fit' or stage is None:
            cifar10 = CIFAR10(self.data_dir, train=True, transform=self.transform)
            self.cifar_train, self.cifar_val = random_split(cifar10, [45000, 5000])
        if stage == 'test' or stage is None:
            self.cifar_test = CIFAR10(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        return DataLoader(self.cifar_train, batch_size=self.batch_size, num_workers=2)

    def val_dataloader(self):
        return DataLoader(self.cifar_val, batch_size=10 * self.batch_size, num_workers=2)

    def test_dataloader(self):
        return DataLoader(self.cifar_test, batch_size=10 * self.batch_size, num_workers=2)

In [5]:
def get_acc(n_classes):
    if n_classes > 2:
        return torchmetrics.Accuracy(task="multiclass", num_classes=n_classes)
    return torchmetrics.Accuracy(task="bianry")

# Define the PyTorch Lightning model
class CIFAR10Model(pl.LightningModule):
    def __init__(self, model, in_dims, n_classes=10, lr=1e-4):
        super().__init__()
        self.model = model
        
        self.save_hyperparameters(ignore=['model'])
        
        self.train_acc = get_acc(n_classes)
        self.valid_acc = get_acc(n_classes)
        self.test_acc = get_acc(n_classes)

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits, loss = self.loss(x, y)
        preds = torch.argmax(logits, 1)
    
        self.log('train/loss', loss, on_epoch=True)
        self.train_acc(preds, y)
        self.log('train/acc', self.train_acc, on_epoch=True)

        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits, loss = self.loss(x, y)
        preds = torch.argmax(logits, 1)

        self.valid_acc(preds, y)
        self.log("valid/loss_epoch", loss)  # default on val/test is on_epoch only
        self.log('valid/acc_epoch', self.valid_acc)

        return logits

    def validation_epoch_end(self, outputs):
        dummy_input = torch.zeros(self.hparams["in_dims"], device=self.device)
        model_filename = f"model_{str(self.global_step).zfill(5)}.onnx"
        torch.onnx.export(self, dummy_input, model_filename, opset_version=11)
        artifact = wandb.Artifact(name="model.ckpt", type="model")
        artifact.add_file(model_filename)
        self.logger.experiment.log_artifact(artifact)

        flattened_logits = torch.flatten(torch.cat(validation_step_outputs))
        self.logger.experiment.log(
            {"valid/logits": wandb.Histogram(flattened_logits.to("cpu")),
             "global_step": self.global_step})
    
    def test_step(self, batch, batch_idx):
        x, y = batch
        logits, loss = self.loss(x, y)
        preds = torch.argmax(logits, 1)

        self.test_acc(preds, y)
        self.log("test/loss_epoch", loss, on_step=False, on_epoch=True)
        self.log("test/acc_epoch", self.test_acc, on_step=False, on_epoch=True)
    
    def test_epoch_end(self, test_step_outputs):  # args are defined as part of pl API
        dummy_input = torch.zeros(self.hparams["in_dims"], device=self.device)
        model_filename = "model_final.onnx"
        self.to_onnx(model_filename, dummy_input, export_params=True)
        artifact = wandb.Artifact(name="model.ckpt", type="model")
        artifact.add_file(model_filename)
        wandb.log_artifact(artifact)
    
    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits, loss = self.loss(x, y)
        preds = torch.argmax(logits, 1)

        self.valid_acc(preds, y)
        self.log("valid/loss_epoch", loss)
        self.log('valid/acc_epoch', self.valid_acc)

        return logits

    def validation_epoch_end(self, validation_step_outputs):
        dummy_input = torch.zeros(self.hparams["in_dims"], device=self.device)
        model_filename = f"model_{str(self.global_step).zfill(5)}.onnx"
        torch.onnx.export(self, dummy_input, model_filename, opset_version=11)
        artifact = wandb.Artifact(name="model.ckpt", type="model")
        artifact.add_file(model_filename)
        self.logger.experiment.log_artifact(artifact)

        flattened_logits = torch.flatten(torch.cat(validation_step_outputs))
        self.logger.experiment.log(
            {"valid/logits": wandb.Histogram(flattened_logits.to("cpu")),
             "global_step": self.global_step})

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=self.hparams["lr"])
    
    def loss(self, x, y):
        logits = self(x)
        cel = torch.nn.CrossEntropyLoss()
        loss = cel(logits, y)
        return logits, loss

In [6]:
class ImagePredictionLogger(pl.Callback):
    def __init__(self, val_samples, num_samples=32):
        super().__init__()
        self.val_imgs, self.val_labels = val_samples
        self.val_imgs = self.val_imgs[:num_samples]
        self.val_labels = self.val_labels[:num_samples]
          
    def on_validation_epoch_end(self, trainer, pl_module):
        val_imgs = self.val_imgs.to(device=pl_module.device)

        logits = pl_module(val_imgs)
        preds = torch.argmax(logits, 1)

        trainer.logger.experiment.log({
            "examples": [wandb.Image(x, caption=self.labels_to_caption(pred, y)) 
                            for x, pred, y in zip(val_imgs, preds, self.val_labels)],
            "global_step": trainer.global_step
            })
        
    def labels_to_caption(self, pred, y):
        mapping = {0: "airplane", 1:"car", 2:"bird", 3:"cat", 4:"deer", 5:"dog", 6:"frog", 7:"horse", 8:"ship", 9: "truck"}
        pred_name = mapping[pred.item()]
        y_name = mapping[y.item()]
        return f"Pred:{pred_name}, Label:{y_name}"

In [7]:
mobilenet = torchvision.models.mobilenet_v3_large(weights=torchvision.models.MobileNet_V3_Large_Weights.IMAGENET1K_V2, progress=True)
mobilenet.classifier[3] = nn.Linear(in_features=1280, out_features=10, bias=True)
mobilenet.train()

MobileNetV3(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
      (2): Hardswish()
    )
    (1): InvertedResidual(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=16, bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
        )
        (1): Conv2dNormActivation(
          (0): Conv2d(16, 16, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (1): BatchNorm2d(16, eps=0.001, momentum=0.01, affine=True, track_running_stats=True)
        )
      )
    )
    (2): InvertedResidual(
      (block): Sequential(
        (0): Conv2dNormActivation(
          (0): Conv2d(16, 64, kernel_size=(1, 1), stride=(1, 1), bi

In [8]:
# setup data
cifar = CIFAR10DataModule()
cifar.prepare_data()
cifar.setup()

# grab samples to log predictions on
samples = next(iter(cifar.val_dataloader()))

Files already downloaded and verified
Files already downloaded and verified


In [12]:
trainer = pl.Trainer(
    logger=wandb_logger,    # W&B integration
    log_every_n_steps=50,   # set the logging frequency
    max_epochs=5,           # number of epochs
    deterministic=True,     # keep it deterministic
    callbacks=[ImagePredictionLogger(samples)] # see Callbacks section
    )

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [13]:
# setup model
model = CIFAR10Model(model=mobilenet, in_dims=(1, 3, 32, 32))

# fit the model
trainer.fit(model, cifar)

# evaluate the model on a test set
trainer.test(datamodule=cifar,
             ckpt_path=None)  # uses last-saved model

wandb.finish()

  rank_zero_warn(


Files already downloaded and verified
Files already downloaded and verified



  | Name      | Type               | Params
-------------------------------------------------
0 | model     | MobileNetV3        | 4.2 M 
1 | train_acc | MulticlassAccuracy | 0     
2 | valid_acc | MulticlassAccuracy | 0     
3 | test_acc  | MulticlassAccuracy | 0     
-------------------------------------------------
4.2 M     Trainable params
0         Non-trainable params
4.2 M     Total params
16.859    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

(1, 3, 32, 32)


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

(1, 3, 32, 32)


Validation: 0it [00:00, ?it/s]

(1, 3, 32, 32)


Validation: 0it [00:00, ?it/s]

(1, 3, 32, 32)


Validation: 0it [00:00, ?it/s]

(1, 3, 32, 32)


Validation: 0it [00:00, ?it/s]

(1, 3, 32, 32)


`Trainer.fit` stopped: `max_epochs=5` reached.
  rank_zero_warn(


Files already downloaded and verified
Files already downloaded and verified


Restoring states from the checkpoint path at .\f5611\20ew4paf\checkpoints\epoch=4-step=3520.ckpt
Loaded model weights from checkpoint at .\f5611\20ew4paf\checkpoints\epoch=4-step=3520.ckpt


Testing: 0it [00:00, ?it/s]

0,1
epoch,▁▁▁▁▁▁▁▁▃▃▃▃▃▃▃▃▅▅▅▅▅▅▅▅▆▆▆▆▆▆▆▆████████
global_step,▁▁▂▂▄▄▅▅▇▇██
test/acc_epoch,▁
test/loss_epoch,▁
train/acc_epoch,▁▄▆▇█
train/acc_step,▁▁▂▄▄▅▄▄▄▅▅▆▆▆▅▅▆▆▆▆▆▆▆▆▇▅▇▇▇██▆▇▇██▇███
train/loss_epoch,█▅▃▂▁
train/loss_step,█▇▇▅▅▄▅▄▄▄▄▃▄▃▄▄▃▂▃▂▃▃▂▃▂▃▂▂▂▁▁▂▂▂▁▁▂▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇████
valid/acc_epoch,▁▆▇██

0,1
epoch,4.0
global_step,3520.0
test/acc_epoch,0.6718
test/loss_epoch,1.10433
train/acc_epoch,0.83258
train/acc_step,0.82812
train/loss_epoch,0.48509
train/loss_step,0.43491
trainer/global_step,3520.0
valid/acc_epoch,0.6588


In [15]:
wandb.finish()

# Hyperparameter tuning

In [10]:
import ray


cifar_ref = ray.put(cifar)

In [10]:
from ray import air, tune
from ray.tune.integration.pytorch_lightning import TuneReportCallback

from ray.air import session
from ray.air.integrations.wandb import WandbLoggerCallback


def train_model(config):
    
    # setup model
    model = CIFAR10Model(model=mobilenet, in_dims=(1, 3, 32, 32), lr=config["lr"])
    
    # Create the Tune Reporting Callback
    metrics = {"loss": "valid/loss_epoch", "acc": "valid/acc_epoch"}
    # callbacks = [TuneReportCheckpointCallback(metrics, filename="checkpoint", on="validation_end")]

    trainer = pl.Trainer(
        # logger=wandb_logger,    # W&B integration
        log_every_n_steps=50,   # set the logging frequency
        max_epochs=5,           # number of epochs
        deterministic=True,     # keep it deterministic
        # callbacks=callbacks # see Callbacks section
    )
    
    # fit the model
    trainer.fit(model, ray.get(cifar_ref))


config = {
    "lr": tune.loguniform(1e-4, 1e-1)
}

# Make sure to specify how many actors each training run will create via the "extra_cpu" field.
tuner = tune.Tuner(
    train_model,
    tune_config=tune.TuneConfig(
        metric="loss",
        mode="min",
        num_samples=4
    ),
    run_config=air.RunConfig(
        name="tune_cifar",
            # callbacks=[
            #     WandbLoggerCallback(project="f5611")
            # ]
        ),
    param_space=config
)

results = tuner.fit()

print("Best hyperparameters found were: ", results.get_best_result().config)

 19%|█▉        | 33226752/170498071 [00:33<02:07, 1077757.86it/s]
 20%|█▉        | 33357824/170498071 [00:33<02:05, 1092973.58it/s]
 20%|█▉        | 33488896/170498071 [00:33<02:09, 1054107.13it/s]
 20%|█▉        | 33619968/170498071 [00:33<02:18, 991462.23it/s] 


0,1
Current time:,2023-01-15 10:46:58
Running for:,00:10:47.25
Memory:,9.7/15.7 GiB

Trial name,# failures,error file
train_model_0c049_00000,1,C:\Users\pkantek\ray_results\tune_cifar\train_model_0c049_00000_0_lr=0.0103_2023-01-15_10-36-12\error.txt
train_model_0c049_00001,1,C:\Users\pkantek\ray_results\tune_cifar\train_model_0c049_00001_1_lr=0.0003_2023-01-15_10-36-18\error.txt
train_model_0c049_00002,1,C:\Users\pkantek\ray_results\tune_cifar\train_model_0c049_00002_2_lr=0.0005_2023-01-15_10-36-24\error.txt
train_model_0c049_00003,1,C:\Users\pkantek\ray_results\tune_cifar\train_model_0c049_00003_3_lr=0.0715_2023-01-15_10-36-30\error.txt

Trial name,status,loc,lr
train_model_0c049_00000,ERROR,127.0.0.1:17920,0.0102839
train_model_0c049_00001,ERROR,127.0.0.1:17832,0.000277742
train_model_0c049_00002,ERROR,127.0.0.1:3208,0.000519884
train_model_0c049_00003,ERROR,127.0.0.1:18168,0.0715363


 20%|█▉        | 33751040/170498071 [00:33<02:08, 1063991.22it/s]
 20%|█▉        | 33882112/170498071 [00:33<02:16, 1000774.40it/s]
 20%|█▉        | 34013184/170498071 [00:34<02:33, 887255.39it/s] 
 20%|██        | 34144256/170498071 [00:34<02:33, 886163.84it/s]
 20%|██        | 34275328/170498071 [00:34<02:28, 917093.51it/s]
 20%|██        | 34406400/170498071 [00:34<02:25, 934589.77it/s]
 20%|██        | 34504704/170498071 [00:34<02:35, 873280.61it/s]
 20%|██        | 34635776/170498071 [00:34<02:24, 937970.52it/s]
 20%|██        | 34766848/170498071 [00:34<02:16, 991128.62it/s]
[2m[36m(train_model pid=17920)[0m GPU available: False, used: False
[2m[36m(train_model pid=17920)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model pid=17920)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model pid=17920)[0m HPU available: False, using: 0 HPUs


[2m[36m(train_model pid=17920)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data\cifar-10-python.tar.gz


  0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 32768/170498071 [00:00<10:29, 270831.15it/s]
  0%|          | 65536/170498071 [00:00<09:25, 301166.29it/s]
  0%|          | 98304/170498071 [00:00<10:15, 277030.32it/s]
  0%|          | 229376/170498071 [00:00<04:44, 597729.47it/s]
  0%|          | 360448/170498071 [00:00<03:39, 773972.67it/s]
  0%|          | 458752/170498071 [00:00<04:05, 692096.76it/s]
  0%|          | 720896/170498071 [00:00<02:22, 1193467.00it/s]
  1%|          | 1015808/170498071 [00:01<02:01, 1395823.40it/s]
  1%|          | 1376256/170498071 [00:01<01:32, 1824772.68it/s]
  1%|          | 1572864/170498071 [00:01<01:38, 1719211.27it/s]
  1%|          | 1769472/170498071 [00:01<01:41, 1668951.63it/s]
  1%|          | 1966080/170498071 [00:01<01:50, 1520833.32it/s]
  1%|          | 2129920/170498071 [00:01<02:09, 1301711.23it/s]
  1%|▏         | 2293760/170498071 [00:01<02:11, 1279937.69it/s]
  1%|▏         | 2457600/170498071 [00:02<02:25, 1154229.75i

[2m[36m(train_model pid=17832)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data\cifar-10-python.tar.gz


  4%|▎         | 6062080/170498071 [00:05<02:59, 914983.64it/s]
  4%|▎         | 6160384/170498071 [00:06<03:03, 897843.26it/s]
  0%|          | 0/170498071 [00:00<?, ?it/s]
  4%|▎         | 6291456/170498071 [00:06<02:53, 947655.16it/s]
  0%|          | 32768/170498071 [00:00<10:11, 278571.77it/s]
  4%|▎         | 6389760/170498071 [00:06<02:55, 937759.68it/s]
  0%|          | 65536/170498071 [00:00<10:21, 274065.38it/s]
  4%|▍         | 6488064/170498071 [00:06<02:53, 946346.83it/s]
  0%|          | 131072/170498071 [00:00<06:50, 414555.98it/s]
  4%|▍         | 6619136/170498071 [00:06<02:53, 947212.00it/s]
  0%|          | 196608/170498071 [00:00<06:02, 469518.60it/s]
  4%|▍         | 6717440/170498071 [00:06<03:02, 895845.65it/s]
  0%|          | 262144/170498071 [00:00<06:24, 442833.70it/s]
  0%|          | 491520/170498071 [00:00<03:15, 869885.68it/s]
  4%|▍         | 6848512/170498071 [00:06<03:50, 708741.34it/s]
  0%|          | 589824/170498071 [00:00<03:49, 739363.43it/s]
  4

[2m[36m(train_model pid=3208)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data\cifar-10-python.tar.gz


  6%|▌         | 10649600/170498071 [00:12<04:30, 590641.04it/s]
  3%|▎         | 5046272/170498071 [00:06<03:59, 690158.20it/s]
  6%|▋         | 10715136/170498071 [00:12<04:30, 590344.62it/s]
  3%|▎         | 5144576/170498071 [00:06<03:52, 711923.63it/s]
  6%|▋         | 10813440/170498071 [00:12<04:14, 626636.33it/s]
  0%|          | 0/170498071 [00:00<?, ?it/s]
  6%|▋         | 10911744/170498071 [00:12<03:51, 689645.35it/s]
  3%|▎         | 5242880/170498071 [00:06<03:58, 691864.75it/s]
  0%|          | 32768/170498071 [00:00<10:59, 258321.08it/s]
  6%|▋         | 11010048/170498071 [00:13<04:00, 663283.02it/s]
  3%|▎         | 5341184/170498071 [00:06<04:17, 642520.51it/s]
  0%|          | 65536/170498071 [00:00<10:55, 259954.12it/s]
  3%|▎         | 5439488/170498071 [00:07<04:02, 680460.08it/s]
  0%|          | 98304/170498071 [00:00<09:58, 284476.72it/s]
  7%|▋         | 11108352/170498071 [00:13<04:21, 609671.90it/s]
  0%|          | 163840/170498071 [00:00<07:10, 395986.34i

[2m[36m(train_model pid=18168)[0m Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ../data\cifar-10-python.tar.gz


  5%|▍         | 7798784/170498071 [00:12<06:17, 431182.86it/s]
  2%|▏         | 3997696/170498071 [00:06<04:17, 646318.00it/s]
  8%|▊         | 13729792/170498071 [00:19<07:49, 334070.31it/s]
  5%|▍         | 7864320/170498071 [00:13<06:04, 446396.48it/s]
  2%|▏         | 4096000/170498071 [00:06<04:22, 632830.33it/s]
  0%|          | 0/170498071 [00:00<?, ?it/s]
  8%|▊         | 13828096/170498071 [00:19<06:30, 400819.05it/s]
  0%|          | 32768/170498071 [00:00<10:11, 278902.47it/s]
  5%|▍         | 7929856/170498071 [00:13<07:55, 342064.59it/s]
  2%|▏         | 4161536/170498071 [00:06<05:49, 476053.32it/s]
  0%|          | 65536/170498071 [00:00<11:39, 243656.99it/s]
  8%|▊         | 13893632/170498071 [00:19<07:07, 365983.23it/s]
  3%|▎         | 4292608/170498071 [00:06<04:26, 624248.79it/s]
  5%|▍         | 8028160/170498071 [00:13<06:14, 433977.31it/s]
  8%|▊         | 13959168/170498071 [00:19<06:37, 394268.88it/s]
  3%|▎         | 4390912/170498071 [00:06<04:35, 603698.62

[2m[36m(train_model pid=3208)[0m Extracting ../data\cifar-10-python.tar.gz to ../data


 84%|████████▎ | 142508032/170498071 [07:20<01:42, 273764.67it/s]
 37%|███▋      | 62816256/170498071 [07:07<12:01, 149281.64it/s]
 83%|████████▎ | 141557760/170498071 [07:26<01:20, 358735.99it/s]
 84%|████████▎ | 142540800/170498071 [07:20<01:42, 272221.47it/s]
 37%|███▋      | 62849024/170498071 [07:07<10:25, 171977.36it/s]
 84%|████████▎ | 142573568/170498071 [07:21<01:41, 275505.87it/s]
 83%|████████▎ | 141623296/170498071 [07:27<01:16, 377146.99it/s]
 84%|████████▎ | 142606336/170498071 [07:21<01:40, 276853.63it/s]
 83%|████████▎ | 141688832/170498071 [07:27<01:19, 362713.57it/s]
 84%|████████▎ | 142639104/170498071 [07:21<01:39, 280415.95it/s]
 37%|███▋      | 62881792/170498071 [07:08<11:01, 162682.59it/s]
 37%|███▋      | 62914560/170498071 [07:08<09:56, 180369.10it/s]
 83%|████████▎ | 141754368/170498071 [07:27<01:15, 378481.52it/s]
 84%|████████▎ | 142704640/170498071 [07:21<01:34, 293251.74it/s]
 37%|███▋      | 62947328/170498071 [07:08<09:03, 197844.45it/s]
 83%|████████▎ 

[2m[36m(train_model pid=3208)[0m Files already downloaded and verified


 84%|████████▍ | 142868480/170498071 [07:30<01:08, 403631.31it/s]
 84%|████████▍ | 143589376/170498071 [07:24<01:20, 333494.12it/s]
 37%|███▋      | 63569920/170498071 [07:11<07:19, 243375.23it/s]
 84%|████████▍ | 142934016/170498071 [07:30<01:11, 386805.64it/s]
[2m[36m(train_model pid=3208)[0m Missing logger folder: C:\Users\pkantek\ray_results\tune_cifar\train_model_0c049_00002_2_lr=0.0005_2023-01-15_10-36-24\lightning_logs
 37%|███▋      | 63602688/170498071 [07:11<08:20, 213485.65it/s]
 84%|████████▍ | 142999552/170498071 [07:30<01:07, 408464.62it/s]
 84%|████████▍ | 143065088/170498071 [07:30<01:01, 445258.73it/s]
 84%|████████▍ | 143654912/170498071 [07:24<01:39, 269777.61it/s]
 37%|███▋      | 63635456/170498071 [07:11<08:26, 210884.52it/s]
 84%|████████▍ | 143720448/170498071 [07:24<01:30, 294318.09it/s]
 37%|███▋      | 63668224/170498071 [07:11<08:19, 213895.65it/s]
 84%|████████▍ | 143130624/170498071 [07:30<01:08, 401845.23it/s]
 84%|████████▍ | 143753216/170498071 [07:2

Sanity Checking: 0it [00:00, ?it/s] 


 84%|████████▍ | 143785984/170498071 [07:25<01:46, 250550.58it/s]
 84%|████████▍ | 143196160/170498071 [07:31<01:26, 316171.57it/s]
 37%|███▋      | 63733760/170498071 [07:12<10:26, 170285.14it/s]
 84%|████████▍ | 143261696/170498071 [07:31<01:14, 364633.77it/s]
 84%|████████▍ | 143851520/170498071 [07:25<01:38, 270181.54it/s]
 37%|███▋      | 63766528/170498071 [07:12<09:16, 191787.90it/s]
 84%|████████▍ | 143327232/170498071 [07:31<01:10, 385044.01it/s]
 84%|████████▍ | 143884288/170498071 [07:25<01:36, 274570.44it/s]
 84%|████████▍ | 143392768/170498071 [07:31<01:10, 382372.92it/s]
 84%|████████▍ | 143917056/170498071 [07:25<01:36, 276362.83it/s]
 37%|███▋      | 63799296/170498071 [07:12<09:23, 189459.98it/s]
 84%|████████▍ | 143949824/170498071 [07:25<01:37, 273354.35it/s]
 37%|███▋      | 63832064/170498071 [07:12<08:44, 203218.31it/s]
 84%|████████▍ | 143458304/170498071 [07:31<01:11, 380151.09it/s]
 37%|███▋      | 63864832/170498071 [07:12<08:36, 206379.15it/s]
 84%|████████▍ 

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


 87%|████████▋ | 148340736/170498071 [07:44<00:53, 417230.82it/s]
 88%|████████▊ | 149553152/170498071 [07:38<00:36, 571845.91it/s]
 87%|████████▋ | 148406272/170498071 [07:44<00:54, 407179.59it/s]
 88%|████████▊ | 149618688/170498071 [07:38<00:39, 522821.29it/s]
 39%|███▉      | 66420736/170498071 [07:25<06:37, 261843.22it/s]
 87%|████████▋ | 148471808/170498071 [07:44<00:57, 384448.75it/s]
 88%|████████▊ | 149684224/170498071 [07:38<00:39, 524896.88it/s]
 39%|███▉      | 66453504/170498071 [07:25<06:36, 262524.59it/s]
 87%|████████▋ | 148537344/170498071 [07:44<00:54, 399368.36it/s]
 88%|████████▊ | 149749760/170498071 [07:38<00:38, 542322.67it/s]
 39%|███▉      | 66486272/170498071 [07:25<06:37, 261428.86it/s]
 87%|████████▋ | 148602880/170498071 [07:44<00:50, 434376.66it/s]
 88%|████████▊ | 149848064/170498071 [07:38<00:35, 585656.20it/s]
 39%|███▉      | 66551808/170498071 [07:25<05:31, 313555.32it/s]
 88%|████████▊ | 149913600/170498071 [07:38<00:35, 573202.77it/s]
 39%|███▉     

Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  1.97s/it]


 88%|████████▊ | 150765568/170498071 [07:40<00:37, 528664.43it/s]
 87%|████████▋ | 149159936/170498071 [07:46<01:05, 324391.90it/s]
 88%|████████▊ | 150863872/170498071 [07:40<00:34, 573705.01it/s]
 39%|███▉      | 67010560/170498071 [07:27<06:59, 246955.30it/s]
 88%|████████▊ | 149225472/170498071 [07:46<01:00, 349157.19it/s]
 89%|████████▊ | 150962176/170498071 [07:40<00:34, 572074.15it/s]
 88%|████████▊ | 149291008/170498071 [07:46<00:55, 384354.37it/s]
 88%|████████▊ | 149356544/170498071 [07:46<00:52, 400258.91it/s]
 89%|████████▊ | 151060480/170498071 [07:40<00:31, 623299.45it/s]
 39%|███▉      | 67108864/170498071 [07:27<06:46, 254520.70it/s]
 88%|████████▊ | 149422080/170498071 [07:46<00:49, 425909.28it/s]
 89%|████████▊ | 151158784/170498071 [07:40<00:29, 650338.67it/s]
 39%|███▉      | 67174400/170498071 [07:27<05:57, 289328.45it/s]
 89%|████████▊ | 151257088/170498071 [07:41<00:28, 685294.43it/s]
 88%|████████▊ | 149487616/170498071 [07:47<00:52, 397619.04it/s]
 89%|████████

Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:03<00:00,  1.73s/it]


 88%|████████▊ | 149815296/170498071 [07:47<00:44, 469888.10it/s]
 89%|████████▉ | 151846912/170498071 [07:41<00:24, 754949.71it/s]
 88%|████████▊ | 149880832/170498071 [07:47<00:43, 469248.66it/s]
 89%|████████▉ | 151945216/170498071 [07:41<00:25, 739945.35it/s]
 40%|███▉      | 67371008/170498071 [07:28<08:22, 205219.39it/s]
 89%|████████▉ | 152043520/170498071 [07:42<00:24, 768026.51it/s]
 40%|███▉      | 67403776/170498071 [07:29<07:56, 216141.41it/s]
 88%|████████▊ | 149946368/170498071 [07:48<00:46, 440358.56it/s]
 89%|████████▉ | 152141824/170498071 [07:42<00:23, 790707.51it/s]
 40%|███▉      | 67436544/170498071 [07:29<07:40, 223705.02it/s]
 88%|████████▊ | 150011904/170498071 [07:48<00:44, 457864.38it/s]
 89%|████████▉ | 152240128/170498071 [07:42<00:22, 803222.24it/s]
 40%|███▉      | 67469312/170498071 [07:29<07:19, 234609.00it/s]
 88%|████████▊ | 150077440/170498071 [07:48<00:44, 457926.36it/s]
 89%|████████▉ | 152338432/170498071 [07:42<00:22, 814786.27it/s]
 88%|████████▊

Trial name,date,experiment_id,hostname,node_ip,pid,timestamp,trial_id
train_model_0c049_00000,2023-01-15_10-36-18,11ed610918ae45d1bd18865876d84b70,CZW21I26Q1204,127.0.0.1,17920,1673775378,0c049_00000
train_model_0c049_00001,2023-01-15_10-36-24,5d73303a699a471ab200e926717c4da9,CZW21I26Q1204,127.0.0.1,17832,1673775384,0c049_00001
train_model_0c049_00002,2023-01-15_10-36-30,e1b086115adb46ac8332fc71aefb9c51,CZW21I26Q1204,127.0.0.1,3208,1673775390,0c049_00002
train_model_0c049_00003,2023-01-15_10-36-37,345924c51bc74fa1a601de6395d7edcf,CZW21I26Q1204,127.0.0.1,18168,1673775397,0c049_00003


 88%|████████▊ | 150667264/170498071 [07:49<00:54, 360800.21it/s]
 90%|████████▉ | 153288704/170498071 [07:43<00:29, 582596.97it/s]
 40%|███▉      | 67829760/170498071 [07:30<05:51, 292020.01it/s]
 88%|████████▊ | 150732800/170498071 [07:50<00:49, 399348.45it/s]
 90%|████████▉ | 153387008/170498071 [07:43<00:28, 604102.04it/s]
 40%|███▉      | 67862528/170498071 [07:30<06:04, 281942.02it/s]
 90%|█████████ | 153452544/170498071 [07:44<00:28, 598453.93it/s]
 40%|███▉      | 67895296/170498071 [07:30<06:07, 279450.17it/s]
 88%|████████▊ | 150831104/170498071 [07:50<00:42, 462560.52it/s]
 40%|███▉      | 67928064/170498071 [07:31<06:07, 278971.21it/s]
 90%|█████████ | 153550848/170498071 [07:44<00:27, 608513.96it/s]
 40%|███▉      | 67960832/170498071 [07:31<06:11, 275934.26it/s]
 89%|████████▊ | 150896640/170498071 [07:50<00:46, 421083.60it/s]
 90%|█████████ | 153616384/170498071 [07:44<00:28, 601142.48it/s]
 89%|████████▊ | 150962176/170498071 [07:50<00:46, 420632.88it/s]
 90%|█████████ 

[2m[36m(train_model pid=17832)[0m Extracting ../data\cifar-10-python.tar.gz to ../data


 98%|█████████▊| 167641088/170498071 [08:20<00:04, 700892.60it/s]
 98%|█████████▊| 167739392/170498071 [08:21<00:04, 683830.01it/s]
 43%|████▎     | 74055680/170498071 [08:01<13:28, 119279.15it/s]
 98%|█████████▊| 167837696/170498071 [08:21<00:03, 717655.09it/s]
 98%|█████████▊| 167936000/170498071 [08:21<00:03, 729137.70it/s]
 43%|████▎     | 74088448/170498071 [08:02<13:57, 115061.41it/s]
 99%|█████████▊| 168034304/170498071 [08:21<00:03, 729871.72it/s]
 99%|█████████▊| 168132608/170498071 [08:21<00:03, 737930.40it/s]
 43%|████▎     | 74121216/170498071 [08:02<13:29, 119016.15it/s]
 99%|█████████▊| 168230912/170498071 [08:21<00:03, 737314.89it/s]
 43%|████▎     | 74153984/170498071 [08:02<12:23, 129576.53it/s]
 99%|█████████▊| 168329216/170498071 [08:21<00:03, 719903.99it/s]
 99%|█████████▉| 168427520/170498071 [08:21<00:02, 742852.84it/s]
 44%|████▎     | 74186752/170498071 [08:02<12:18, 130394.27it/s]
 99%|█████████▉| 168525824/170498071 [08:22<00:02, 771425.96it/s]
 99%|█████████▉

[2m[36m(train_model pid=17832)[0m Files already downloaded and verified


 44%|████▎     | 74481664/170498071 [08:04<07:54, 202299.42it/s]
100%|█████████▉| 169705472/170498071 [08:23<00:01, 730416.60it/s]
 44%|████▎     | 74514432/170498071 [08:04<07:51, 203686.32it/s]
100%|█████████▉| 169803776/170498071 [08:23<00:00, 747802.17it/s]
[2m[36m(train_model pid=17832)[0m Missing logger folder: C:\Users\pkantek\ray_results\tune_cifar\train_model_0c049_00001_1_lr=0.0003_2023-01-15_10-36-18\lightning_logs
 44%|████▎     | 74547200/170498071 [08:04<07:14, 220969.21it/s]
100%|█████████▉| 169934848/170498071 [08:23<00:00, 782388.77it/s]
 44%|████▎     | 74579968/170498071 [08:04<06:43, 237830.22it/s]
100%|█████████▉| 170033152/170498071 [08:24<00:00, 800660.40it/s]
 44%|████▍     | 74612736/170498071 [08:04<06:23, 250001.86it/s]
100%|█████████▉| 170131456/170498071 [08:24<00:00, 818394.63it/s]
 44%|████▍     | 74645504/170498071 [08:05<06:33, 243641.51it/s]
100%|█████████▉| 170229760/170498071 [08:24<00:00, 650505.94it/s]
[2m[36m(train_model pid=17832)[0m 
[2m

Sanity Checking: 0it [00:00, ?it/s]m 


100%|█████████▉| 170426368/170498071 [08:24<00:00, 682627.60it/s]
 44%|████▍     | 74711040/170498071 [08:05<07:38, 208698.47it/s]
100%|██████████| 170498071/170498071 [08:24<00:00, 337829.98it/s]
 44%|████▍     | 74743808/170498071 [08:05<07:19, 217704.02it/s]
 44%|████▍     | 74776576/170498071 [08:05<06:49, 233505.00it/s]
 44%|████▍     | 74809344/170498071 [08:05<06:28, 246292.05it/s]


[2m[36m(train_model pid=17920)[0m Extracting ../data\cifar-10-python.tar.gz to ../data


 44%|████▍     | 74874880/170498071 [08:05<05:44, 277532.48it/s]
 44%|████▍     | 74940416/170498071 [08:06<05:49, 273679.03it/s]
 44%|████▍     | 74973184/170498071 [08:06<05:47, 274940.43it/s]
 44%|████▍     | 75005952/170498071 [08:06<05:59, 265872.32it/s]
 44%|████▍     | 75038720/170498071 [08:06<06:02, 262988.59it/s]
 44%|████▍     | 75071488/170498071 [08:06<05:57, 267085.02it/s]
 44%|████▍     | 75104256/170498071 [08:06<05:54, 269123.54it/s]
 44%|████▍     | 75137024/170498071 [08:06<05:49, 272962.12it/s]
 44%|████▍     | 75169792/170498071 [08:07<05:59, 265431.34it/s]
 44%|████▍     | 75202560/170498071 [08:07<05:57, 266302.76it/s]
 44%|████▍     | 75268096/170498071 [08:07<05:12, 304878.86it/s]
 44%|████▍     | 75300864/170498071 [08:07<05:17, 300299.56it/s]
 44%|████▍     | 75366400/170498071 [08:07<05:02, 314704.29it/s]
 44%|████▍     | 75399168/170498071 [08:07<05:08, 307912.80it/s]
 44%|████▍     | 75431936/170498071 [08:07<05:19, 297363.09it/s]
 44%|████▍     | 75464704

[2m[36m(train_model pid=17920)[0m Files already downloaded and verified


 44%|████▍     | 75792384/170498071 [08:09<05:15, 300173.89it/s]
 44%|████▍     | 75857920/170498071 [08:09<04:35, 343210.63it/s]
 45%|████▍     | 75923456/170498071 [08:09<04:34, 345149.94it/s]
[2m[36m(train_model pid=17920)[0m Missing logger folder: C:\Users\pkantek\ray_results\tune_cifar\train_model_0c049_00000_0_lr=0.0103_2023-01-15_10-36-12\lightning_logs
 45%|████▍     | 75988992/170498071 [08:09<04:22, 360639.37it/s]
 45%|████▍     | 76054528/170498071 [08:09<04:14, 371132.70it/s]
 45%|████▍     | 76120064/170498071 [08:09<04:19, 363891.77it/s]
 45%|████▍     | 76185600/170498071 [08:10<04:06, 382944.69it/s]
 45%|████▍     | 76251136/170498071 [08:10<04:12, 373353.41it/s]
[2m[36m(train_model pid=17920)[0m 
[2m[36m(train_model pid=17920)[0m   | Name      | Type               | Params
[2m[36m(train_model pid=17920)[0m -------------------------------------------------
[2m[36m(train_model pid=17920)[0m 0 | model     | MobileNetV3        | 4.2 M 
[2m[36m(train_model 

Sanity Checking: 0it [00:00, ?it/s]m 


 45%|████▍     | 76382208/170498071 [08:10<03:45, 418082.22it/s]
 45%|████▍     | 76447744/170498071 [08:10<03:41, 425255.53it/s]
 45%|████▍     | 76513280/170498071 [08:10<03:46, 414768.20it/s]
 45%|████▍     | 76578816/170498071 [08:11<03:40, 426862.08it/s]
 45%|████▍     | 76644352/170498071 [08:11<03:25, 456802.42it/s]
 45%|████▍     | 76709888/170498071 [08:11<03:19, 471048.96it/s]
 45%|████▌     | 76775424/170498071 [08:11<03:12, 486562.79it/s]
 45%|████▌     | 76840960/170498071 [08:11<03:21, 464085.17it/s]
 45%|████▌     | 76906496/170498071 [08:11<03:23, 458782.44it/s]
 45%|████▌     | 76972032/170498071 [08:11<03:13, 483606.12it/s]
 45%|████▌     | 77037568/170498071 [08:11<03:05, 504324.24it/s]
 45%|████▌     | 77103104/170498071 [08:12<03:32, 440032.76it/s]
 45%|████▌     | 77168640/170498071 [08:12<03:27, 449077.84it/s]
 45%|████▌     | 77234176/170498071 [08:12<03:15, 477369.43it/s]
 45%|████▌     | 77299712/170498071 [08:12<03:06, 499094.00it/s]
 45%|████▌     | 77365248

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


 50%|████▉     | 84901888/170498071 [08:20<01:39, 861635.21it/s]
 50%|████▉     | 85032960/170498071 [08:20<01:35, 891800.11it/s]
 50%|████▉     | 85131264/170498071 [08:20<01:36, 884861.30it/s]
 50%|█████     | 85262336/170498071 [08:20<01:33, 915991.66it/s]
 50%|█████     | 85393408/170498071 [08:20<01:28, 958410.51it/s]
 50%|█████     | 85491712/170498071 [08:20<01:28, 963985.87it/s]
 50%|█████     | 85622784/170498071 [08:21<01:27, 975559.21it/s]
 50%|█████     | 85753856/170498071 [08:21<01:25, 990808.43it/s]
 50%|█████     | 85884928/170498071 [08:21<01:28, 953020.36it/s]
 50%|█████     | 86016000/170498071 [08:21<01:31, 920769.59it/s]
 51%|█████     | 86114304/170498071 [08:21<01:58, 712848.09it/s]
 51%|█████     | 86278144/170498071 [08:21<01:37, 861352.76it/s]
 51%|█████     | 86376448/170498071 [08:22<01:42, 819587.67it/s]
 51%|█████     | 86474752/170498071 [08:22<01:41, 827491.65it/s]
 51%|█████     | 86573056/170498071 [08:22<01:40, 832761.37it/s]


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  1.95s/it]


 51%|█████     | 86671360/170498071 [08:22<01:44, 798896.47it/s]
 51%|█████     | 86769664/170498071 [08:22<01:43, 809453.47it/s]
 51%|█████     | 86867968/170498071 [08:22<01:45, 790324.13it/s]
 51%|█████     | 86966272/170498071 [08:22<01:39, 837526.41it/s]
 51%|█████     | 87097344/170498071 [08:22<01:33, 888827.44it/s]
 51%|█████     | 87195648/170498071 [08:23<01:36, 863653.65it/s]
 51%|█████     | 87293952/170498071 [08:23<01:36, 859652.25it/s]
 51%|█████▏    | 87392256/170498071 [08:23<01:37, 856266.79it/s]
 51%|█████▏    | 87490560/170498071 [08:23<01:33, 889065.31it/s]
 51%|█████▏    | 87719936/170498071 [08:23<01:32, 899208.34it/s]
 52%|█████▏    | 87818240/170498071 [08:23<01:30, 908825.59it/s]
 52%|█████▏    | 87949312/170498071 [08:23<01:28, 932179.61it/s]
 52%|█████▏    | 88047616/170498071 [08:23<01:28, 934553.22it/s]
 52%|█████▏    | 88145920/170498071 [08:24<01:31, 902354.00it/s]
 52%|█████▏    | 88276992/170498071 [08:24<01:27, 940602.07it/s]


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:03<00:00,  1.90s/it]


 52%|█████▏    | 88375296/170498071 [08:24<01:31, 901931.56it/s]
 52%|█████▏    | 88473600/170498071 [08:24<01:32, 888164.45it/s]
 52%|█████▏    | 88604672/170498071 [08:24<01:27, 937328.05it/s]
 52%|█████▏    | 88702976/170498071 [08:24<01:30, 903060.87it/s]
 52%|█████▏    | 88834048/170498071 [08:24<01:27, 936880.17it/s]
 52%|█████▏    | 88932352/170498071 [08:24<01:30, 905741.17it/s]
 52%|█████▏    | 89030656/170498071 [08:25<01:29, 914196.06it/s]
 52%|█████▏    | 89161728/170498071 [08:25<01:31, 889320.70it/s]
 52%|█████▏    | 89292800/170498071 [08:25<01:28, 919155.47it/s]
 52%|█████▏    | 89391104/170498071 [08:25<01:30, 900799.41it/s]
 53%|█████▎    | 89522176/170498071 [08:25<01:31, 880516.54it/s]
 53%|█████▎    | 89620480/170498071 [08:25<01:35, 846687.56it/s]
 53%|█████▎    | 89751552/170498071 [08:25<01:28, 910432.37it/s]
 53%|█████▎    | 89849856/170498071 [08:25<01:31, 884101.83it/s]
 53%|█████▎    | 89980928/170498071 [08:26<01:28, 910294.07it/s]
2023-01-15 10:45:04,596	E

Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]


 53%|█████▎    | 90177536/170498071 [08:26<01:25, 934413.73it/s]
 53%|█████▎    | 90275840/170498071 [08:26<01:30, 881857.63it/s]
 53%|█████▎    | 90406912/170498071 [08:26<01:30, 883158.61it/s]
 53%|█████▎    | 90636288/170498071 [08:26<01:24, 940675.30it/s]
 53%|█████▎    | 90767360/170498071 [08:26<01:22, 970018.76it/s]
 53%|█████▎    | 90898432/170498071 [08:27<01:21, 975050.56it/s]
 53%|█████▎    | 90996736/170498071 [08:27<01:21, 972081.41it/s]
 53%|█████▎    | 91095040/170498071 [08:27<01:22, 966156.91it/s]
 54%|█████▎    | 91226112/170498071 [08:27<01:19, 1001244.31it/s]
 54%|█████▎    | 91357184/170498071 [08:27<01:16, 1035888.55it/s]
 54%|█████▎    | 91488256/170498071 [08:27<01:20, 983307.95it/s] 
 54%|█████▎    | 91619328/170498071 [08:27<01:18, 1011061.11it/s]


Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  1.83s/it]


 54%|█████▍    | 91750400/170498071 [08:27<01:15, 1036560.43it/s]
 54%|█████▍    | 91881472/170498071 [08:28<01:15, 1034573.52it/s]
 54%|█████▍    | 92012544/170498071 [08:28<01:18, 997277.39it/s] 
 54%|█████▍    | 92143616/170498071 [08:28<01:17, 1016158.83it/s]
 54%|█████▍    | 92274688/170498071 [08:28<01:15, 1030425.07it/s]
 54%|█████▍    | 92405760/170498071 [08:28<01:14, 1050845.27it/s]
 54%|█████▍    | 92536832/170498071 [08:28<01:10, 1100832.14it/s]
 54%|█████▍    | 92667904/170498071 [08:28<01:12, 1077686.13it/s]
 54%|█████▍    | 92798976/170498071 [08:28<01:10, 1097256.78it/s]
 55%|█████▍    | 92930048/170498071 [08:29<01:09, 1109500.30it/s]


Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:03<00:00,  1.63s/it]


 55%|█████▍    | 93061120/170498071 [08:29<01:07, 1139393.34it/s]
 55%|█████▍    | 93192192/170498071 [08:29<01:06, 1157201.84it/s]
 55%|█████▍    | 93323264/170498071 [08:29<01:04, 1198162.68it/s]
 55%|█████▍    | 93487104/170498071 [08:29<01:03, 1214908.28it/s]
 55%|█████▍    | 93618176/170498071 [08:29<01:04, 1186399.25it/s]
 55%|█████▌    | 93782016/170498071 [08:29<01:01, 1257362.49it/s]
 55%|█████▌    | 93945856/170498071 [08:29<00:59, 1278939.53it/s]
 55%|█████▌    | 94109696/170498071 [08:29<00:58, 1307901.22it/s]
 55%|█████▌    | 94273536/170498071 [08:30<00:56, 1354113.60it/s]
 55%|█████▌    | 94437376/170498071 [08:30<00:55, 1371478.25it/s]
 55%|█████▌    | 94601216/170498071 [08:30<00:54, 1383976.43it/s]
 56%|█████▌    | 94765056/170498071 [08:30<00:59, 1265588.72it/s]
2023-01-15 10:45:09,041	ERROR trial_runner.py:1088 -- Trial train_model_0c049_00000: Error processing event.
ray.exceptions.RayTaskError(AttributeError): [36mray::ImplicitFunc.train()[39m (pid=17920, ip=127

[2m[36m(train_model pid=18168)[0m Extracting ../data\cifar-10-python.tar.gz to ../data
[2m[36m(train_model pid=18168)[0m Files already downloaded and verified


[2m[36m(train_model pid=18168)[0m Missing logger folder: C:\Users\pkantek\ray_results\tune_cifar\train_model_0c049_00003_3_lr=0.0715_2023-01-15_10-36-30\lightning_logs
[2m[36m(train_model pid=18168)[0m 
[2m[36m(train_model pid=18168)[0m   | Name      | Type               | Params
[2m[36m(train_model pid=18168)[0m -------------------------------------------------
[2m[36m(train_model pid=18168)[0m 0 | model     | MobileNetV3        | 4.2 M 
[2m[36m(train_model pid=18168)[0m 1 | train_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=18168)[0m 2 | valid_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=18168)[0m 3 | test_acc  | MulticlassAccuracy | 0     
[2m[36m(train_model pid=18168)[0m -------------------------------------------------
[2m[36m(train_model pid=18168)[0m 4.2 M     Trainable params
[2m[36m(train_model pid=18168)[0m 0         Non-trainable params
[2m[36m(train_model pid=18168)[0m 4.2 M     Total params
[2m[36m(train_mode

Sanity Checking: 0it [00:00, ?it/s]m 
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:01<00:01,  1.49s/it]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:02<00:00,  1.43s/it]


2023-01-15 10:46:58,735	ERROR trial_runner.py:1088 -- Trial train_model_0c049_00003: Error processing event.
ray.exceptions.RayTaskError(AttributeError): [36mray::ImplicitFunc.train()[39m (pid=18168, ip=127.0.0.1, repr=train_model)
  File "python\ray\_raylet.pyx", line 830, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 834, in ray._raylet.execute_task
  File "python\ray\_raylet.pyx", line 780, in ray._raylet.execute_task.function_executor
  File "C:\Users\pkantek\AppData\Local\pypoetry\Cache\virtualenvs\ml-astr-wIJ34D6o-py3.10\lib\site-packages\ray\_private\function_manager.py", line 674, in actor_method_executor
    return method(__ray_actor, *args, **kwargs)
  File "C:\Users\pkantek\AppData\Local\pypoetry\Cache\virtualenvs\ml-astr-wIJ34D6o-py3.10\lib\site-packages\ray\util\tracing\tracing_helper.py", line 466, in _resume_span
    return method(self, *_args, **_kwargs)
  File "C:\Users\pkantek\AppData\Local\pypoetry\Cache\virtualenvs\ml-astr-wIJ34D6o-py3.10\lib\s

RuntimeError: No best trial found for the given metric: loss. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.

In [None]:

wandb.finish()

In [12]:
results.get_best_result().config



RuntimeError: No best trial found for the given metric: loss. This means that no trial has reported this metric, or all values reported for this metric are NaN. To not ignore NaN values, you can set the `filter_nan_and_inf` arg to False.

In [None]:
from ray import air, tune
from ray.tune.integration.pytorch_lightning import TuneReportCallback

from ray.air import session
from ray.air.integrations.wandb import WandbLoggerCallback


def train_model(config):
    
    # setup model
    model = CIFAR10Model(model=mobilenet, in_dims=(1, 3, 32, 32), lr=config["lr"])
    
    # Create the Tune Reporting Callback
    metrics = {"loss": "valid/loss_epoch", "acc": "valid/acc_epoch"}
    callbacks = [TuneReportCallback(metrics, on="validation_end")]

    trainer = pl.Trainer(
        logger=wandb_logger,    # W&B integration
        log_every_n_steps=50,   # set the logging frequency
        max_epochs=5,           # number of epochs
        deterministic=True,     # keep it deterministic
        callbacks=callbacks # see Callbacks section
    )
    
    # fit the model
    trainer.fit(model, ray.get(cifar_ref))


config = {
    "lr": tune.loguniform(1e-4, 1e-1)
}

# Make sure to specify how many actors each training run will create via the "extra_cpu" field.
tuner = tune.Tuner(
    train_model,
    tune_config=tune.TuneConfig(
        metric="loss",
        mode="min",
        num_samples=4
    ),
    run_config=air.RunConfig(
        name="tune_cifar",
            # callbacks=[
            #     WandbLoggerCallback(project="f5611")
            # ]
        ),
    param_space=config
)

results = tuner.fit()

print("Best hyperparameters found were: ", results.get_best_result().config)

0,1
Current time:,2023-01-15 11:29:20
Running for:,00:37:04.15
Memory:,12.1/15.7 GiB

Trial name,status,loc,lr,iter,total time (s),loss,acc
train_model_4ad8d_00000,RUNNING,127.0.0.1:17996,0.000324606,2,1502.53,0.781569,0.731
train_model_4ad8d_00001,RUNNING,127.0.0.1:11576,0.0515827,2,1650.59,2.3081,0.1044
train_model_4ad8d_00002,RUNNING,127.0.0.1:9880,0.0085377,2,1595.47,1.28863,0.5954
train_model_4ad8d_00003,RUNNING,127.0.0.1:3208,0.000170226,2,1500.58,0.929202,0.681


[2m[36m(train_model pid=17996)[0m GPU available: False, used: False
[2m[36m(train_model pid=17996)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model pid=17996)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model pid=17996)[0m HPU available: False, using: 0 HPUs


[2m[36m(train_model pid=17996)[0m Files already downloaded and verified
[2m[36m(train_model pid=17996)[0m Files already downloaded and verified


[2m[36m(train_model pid=17996)[0m 
[2m[36m(train_model pid=17996)[0m   | Name      | Type               | Params
[2m[36m(train_model pid=17996)[0m -------------------------------------------------
[2m[36m(train_model pid=17996)[0m 0 | model     | MobileNetV3        | 4.2 M 
[2m[36m(train_model pid=17996)[0m 1 | train_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=17996)[0m 2 | valid_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=17996)[0m 3 | test_acc  | MulticlassAccuracy | 0     
[2m[36m(train_model pid=17996)[0m -------------------------------------------------
[2m[36m(train_model pid=17996)[0m 4.2 M     Trainable params
[2m[36m(train_model pid=17996)[0m 0         Non-trainable params
[2m[36m(train_model pid=17996)[0m 4.2 M     Total params
[2m[36m(train_model pid=17996)[0m 16.859    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]m 


[2m[36m(train_model pid=11576)[0m GPU available: False, used: False
[2m[36m(train_model pid=11576)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model pid=11576)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model pid=11576)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_model pid=9880)[0m GPU available: False, used: False
[2m[36m(train_model pid=9880)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model pid=9880)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model pid=9880)[0m HPU available: False, using: 0 HPUs
[2m[36m(train_model pid=3208)[0m GPU available: False, used: False
[2m[36m(train_model pid=3208)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model pid=3208)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model pid=3208)[0m HPU available: False, using: 0 HPUs


[2m[36m(train_model pid=11576)[0m Files already downloaded and verified
[2m[36m(train_model pid=9880)[0m Files already downloaded and verified
[2m[36m(train_model pid=3208)[0m Files already downloaded and verified
[2m[36m(train_model pid=11576)[0m Files already downloaded and verified
[2m[36m(train_model pid=9880)[0m Files already downloaded and verified
[2m[36m(train_model pid=3208)[0m Files already downloaded and verified


[2m[36m(train_model pid=11576)[0m 
[2m[36m(train_model pid=11576)[0m   | Name      | Type               | Params
[2m[36m(train_model pid=11576)[0m -------------------------------------------------
[2m[36m(train_model pid=11576)[0m 0 | model     | MobileNetV3        | 4.2 M 
[2m[36m(train_model pid=11576)[0m 1 | train_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=11576)[0m 2 | valid_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=11576)[0m 3 | test_acc  | MulticlassAccuracy | 0     
[2m[36m(train_model pid=11576)[0m -------------------------------------------------
[2m[36m(train_model pid=11576)[0m 4.2 M     Trainable params
[2m[36m(train_model pid=11576)[0m 0         Non-trainable params
[2m[36m(train_model pid=11576)[0m 4.2 M     Total params
[2m[36m(train_model pid=11576)[0m 16.859    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]m 


[2m[36m(train_model pid=9880)[0m 
[2m[36m(train_model pid=9880)[0m   | Name      | Type               | Params
[2m[36m(train_model pid=9880)[0m -------------------------------------------------
[2m[36m(train_model pid=9880)[0m 0 | model     | MobileNetV3        | 4.2 M 
[2m[36m(train_model pid=9880)[0m 1 | train_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=9880)[0m 2 | valid_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=9880)[0m 3 | test_acc  | MulticlassAccuracy | 0     
[2m[36m(train_model pid=9880)[0m -------------------------------------------------
[2m[36m(train_model pid=9880)[0m 4.2 M     Trainable params
[2m[36m(train_model pid=9880)[0m 0         Non-trainable params
[2m[36m(train_model pid=9880)[0m 4.2 M     Total params
[2m[36m(train_model pid=9880)[0m 16.859    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s] 


[2m[36m(train_model pid=3208)[0m 
[2m[36m(train_model pid=3208)[0m   | Name      | Type               | Params
[2m[36m(train_model pid=3208)[0m -------------------------------------------------
[2m[36m(train_model pid=3208)[0m 0 | model     | MobileNetV3        | 4.2 M 
[2m[36m(train_model pid=3208)[0m 1 | train_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=3208)[0m 2 | valid_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=3208)[0m 3 | test_acc  | MulticlassAccuracy | 0     
[2m[36m(train_model pid=3208)[0m -------------------------------------------------
[2m[36m(train_model pid=3208)[0m 4.2 M     Trainable params
[2m[36m(train_model pid=3208)[0m 0         Non-trainable params
[2m[36m(train_model pid=3208)[0m 4.2 M     Total params
[2m[36m(train_model pid=3208)[0m 16.859    Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s] 
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:03<00:03,  3.00s/it]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:05<00:00,  2.86s/it]
                                                                           
Epoch 0:   0%|          | 0/712 [00:00<?, ?it/s] 
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:   0%|          | 0/2 [00:00<?, ?it/s]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:03<00:03,  3.68s/it]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:03<00:03,  3.53s/it]
Sanity Checking DataLoader 0:  50%|█████     | 1/2 [00:03<00:03,  3.22s/it]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:06<00:00,  3.18s/it]
Sanity Checking DataLoader 0: 100%|██████████| 2/2 [00:06<00:00,  3.17s/it]
Sanity Checking DataLoader 0: 100

Exception ignored in: <function WandbLoggerCallback.__del__ at 0x000001F037C7DE10>
Traceback (most recent call last):
  File "C:\Users\pkantek\AppData\Local\pypoetry\Cache\virtualenvs\ml-astr-wIJ34D6o-py3.10\lib\site-packages\ray\air\integrations\wandb.py", line 556, in __del__
    self._trial_processes[trial].join(timeout=2)
  File "C:\Users\pkantek\AppData\Local\Programs\Python\Python310\lib\multiprocessing\process.py", line 148, in join
    assert self._popen is not None, 'can only join a started process'
AssertionError: can only join a started process


Epoch 0:  15%|█▍        | 105/712 [02:12<12:44,  1.26s/it, loss=2.3, v_num=3zln]


Traceback (most recent call last):
  File "C:\Users\pkantek\AppData\Local\Programs\Python\Python310\lib\multiprocessing\queues.py", line 239, in _feed
    reader_close()
  File "C:\Users\pkantek\AppData\Local\Programs\Python\Python310\lib\multiprocessing\connection.py", line 182, in close
    self._close()
  File "C:\Users\pkantek\AppData\Local\Programs\Python\Python310\lib\multiprocessing\connection.py", line 282, in _close
    _CloseHandle(self._handle)
OSError: [WinError 6] The handle is invalid


Epoch 0:  16%|█▋        | 117/712 [02:25<12:18,  1.24s/it, loss=1.61, v_num=3zln]
Epoch 0:  15%|█▍        | 104/712 [02:12<12:53,  1.27s/it, loss=2.03, v_num=3zln]
Epoch 0:  15%|█▍        | 106/712 [02:13<12:40,  1.26s/it, loss=2.3, v_num=3zln]
Epoch 0:  15%|█▍        | 104/712 [02:12<12:52,  1.27s/it, loss=1.97, v_num=3zln]
Epoch 0:  17%|█▋        | 118/712 [02:26<12:15,  1.24s/it, loss=1.6, v_num=3zln] 
Epoch 0:  15%|█▍        | 105/712 [02:13<12:49,  1.27s/it, loss=2.03, v_num=3zln]
Epoch 0:  15%|█▌        | 107/712 [02:13<12:37,  1.25s/it, loss=2.3, v_num=3zln]
Epoch 0:  15%|█▍        | 105/712 [02:12<12:48,  1.27s/it, loss=1.97, v_num=3zln]
Epoch 0:  17%|█▋        | 119/712 [02:27<12:13,  1.24s/it, loss=1.6, v_num=3zln]
Epoch 0:  15%|█▍        | 106/712 [02:14<12:47,  1.27s/it, loss=2.04, v_num=3zln]
Epoch 0:  15%|█▌        | 108/712 [02:14<12:34,  1.25s/it, loss=2.3, v_num=3zln]
Epoch 0:  15%|█▍        | 106/712 [02:13<12:45,  1.26s/it, loss=1.95, v_num=3zln]
Epoch 0:  17%|█▋    

Trial name,acc,date,done,episodes_total,experiment_id,hostname,iterations_since_restore,loss,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_model_4ad8d_00000,0.731,2023-01-15_11-17-24,False,,d05fa9b2ae3b4d7c9faa7a3c4bb629d2,CZW21I26Q1204,2,0.781569,127.0.0.1,17996,1502.53,730.487,1502.53,1673777844,0,,2,4ad8d_00000,0.0
train_model_4ad8d_00001,0.1044,2023-01-15_11-20-01,False,,21d4a33ad26943d290a97de7b3b6739a,CZW21I26Q1204,2,2.3081,127.0.0.1,11576,1650.59,882.606,1650.59,1673778001,0,,2,4ad8d_00001,0.00600004
train_model_4ad8d_00002,0.5954,2023-01-15_11-19-06,False,,095d905fca6c4f6db43419d76e077d37,CZW21I26Q1204,2,1.28863,127.0.0.1,9880,1595.47,817.467,1595.47,1673777946,0,,2,4ad8d_00002,0.0059979
train_model_4ad8d_00003,0.681,2023-01-15_11-17-32,False,,a0b336232d4d41ea857f88295e95a23c,CZW21I26Q1204,2,0.929202,127.0.0.1,3208,1500.58,727.697,1500.58,1673777852,0,,2,4ad8d_00003,0.00799966


Epoch 0: 100%|██████████| 712/712 [12:18<00:00,  1.04s/it, loss=1.05, v_num=3zln]
Epoch 0: 100%|██████████| 712/712 [12:18<00:00,  1.04s/it, loss=1.05, v_num=3zln]
Epoch 1:   0%|          | 0/712 [00:00<?, ?it/s, loss=1.05, v_num=3zln]          
[2m[36m(train_model pid=9880)[0m 
Epoch 0:  99%|█████████▉| 708/712 [12:07<00:04,  1.03s/it, loss=1.27, v_num=3zln]
[2m[36m(train_model pid=11576)[0m 
Epoch 0: 100%|██████████| 712/712 [12:08<00:00,  1.02s/it, loss=2.16, v_num=3zln]
[2m[36m(train_model pid=3208)[0m 
Epoch 0: 100%|█████████▉| 710/712 [12:08<00:02,  1.03s/it, loss=1.27, v_num=3zln]
[2m[36m(train_model pid=9880)[0m 
Epoch 0: 100%|█████████▉| 709/712 [12:10<00:03,  1.03s/it, loss=1.27, v_num=3zln]
Epoch 0: 100%|██████████| 712/712 [12:11<00:00,  1.03s/it, loss=2.16, v_num=3zln]
Epoch 0: 100%|██████████| 712/712 [12:11<00:00,  1.03s/it, loss=2.16, v_num=3zln]
Epoch 1:   0%|          | 0/712 [00:00<?, ?it/s, loss=2.16, v_num=3zln]          
[2m[36m(train_model pid=3208)

In [10]:
import ray

In [11]:
from ray import air, tune
from ray.tune.integration.pytorch_lightning import TuneReportCallback

from ray.air import session
from ray.air.integrations.wandb import WandbLoggerCallback


def train_model(config):
    
    # setup model
    model = CIFAR10Model(model=mobilenet, in_dims=(1, 3, 32, 32), lr=config["lr"])
    
    # Create the Tune Reporting Callback
    metrics = {"loss": "valid/loss_epoch", "acc": "valid/acc_epoch"}
    callbacks = [TuneReportCallback(metrics, on="validation_end")]

    trainer = pl.Trainer(
        logger=wandb_logger,    # W&B integration
        log_every_n_steps=100,   # set the logging frequency
        max_epochs=5,           # number of epochs
        deterministic=True,     # keep it deterministic
        callbacks=callbacks, # see Callbacks section
        enable_progress_bar=False
    )
    
    trainer.fit(model, ray.get(cifar_ref))


config = {
    "lr": tune.loguniform(1e-4, 1e-1)
}

tuner = tune.Tuner(
    train_model,
    tune_config=tune.TuneConfig(
        metric="loss",
        mode="min",
        num_samples=4
    ),
    run_config=air.RunConfig(
        name="tune_cifar",
            # callbacks=[
            #     WandbLoggerCallback(project="f5611")
            # ]
        ),
    param_space=config
)

results = tuner.fit()

print("Best hyperparameters found were: ", results.get_best_result().config)

0,1
Current time:,2023-01-15 13:01:07
Running for:,01:04:25.09
Memory:,7.3/15.7 GiB

Trial name,status,loc,lr,iter,total time (s),loss,acc
train_model_4bb8d_00000,TERMINATED,127.0.0.1:16996,0.00516099,5,3619.9,1.08385,0.623
train_model_4bb8d_00001,TERMINATED,127.0.0.1:6772,0.0277259,5,3847.4,2.35855,0.0982
train_model_4bb8d_00002,TERMINATED,127.0.0.1:12768,0.0688029,5,3850.99,2.31379,0.09
train_model_4bb8d_00003,TERMINATED,127.0.0.1:3248,0.000521136,5,3376.87,0.840888,0.7638


[2m[36m(train_model pid=16996)[0m GPU available: False, used: False
[2m[36m(train_model pid=16996)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model pid=16996)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model pid=16996)[0m HPU available: False, using: 0 HPUs


[2m[36m(train_model pid=16996)[0m Files already downloaded and verified
[2m[36m(train_model pid=16996)[0m Files already downloaded and verified


[2m[36m(train_model pid=16996)[0m 
[2m[36m(train_model pid=16996)[0m   | Name      | Type               | Params
[2m[36m(train_model pid=16996)[0m -------------------------------------------------
[2m[36m(train_model pid=16996)[0m 0 | model     | MobileNetV3        | 4.2 M 
[2m[36m(train_model pid=16996)[0m 1 | train_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=16996)[0m 2 | valid_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=16996)[0m 3 | test_acc  | MulticlassAccuracy | 0     
[2m[36m(train_model pid=16996)[0m -------------------------------------------------
[2m[36m(train_model pid=16996)[0m 4.2 M     Trainable params
[2m[36m(train_model pid=16996)[0m 0         Non-trainable params
[2m[36m(train_model pid=16996)[0m 4.2 M     Total params
[2m[36m(train_model pid=16996)[0m 16.859    Total estimated model params size (MB)
[2m[36m(train_model pid=16996)[0m   rank_zero_warn(
[2m[36m(train_model pid=6772)[0m GPU available:

[2m[36m(train_model pid=6772)[0m Files already downloaded and verified


[2m[36m(train_model pid=3248)[0m GPU available: False, used: False
[2m[36m(train_model pid=3248)[0m TPU available: False, using: 0 TPU cores
[2m[36m(train_model pid=3248)[0m IPU available: False, using: 0 IPUs
[2m[36m(train_model pid=3248)[0m HPU available: False, using: 0 HPUs


[2m[36m(train_model pid=12768)[0m Files already downloaded and verified
[2m[36m(train_model pid=3248)[0m Files already downloaded and verified
[2m[36m(train_model pid=6772)[0m Files already downloaded and verified
[2m[36m(train_model pid=12768)[0m Files already downloaded and verified
[2m[36m(train_model pid=3248)[0m Files already downloaded and verified


[2m[36m(train_model pid=6772)[0m 
[2m[36m(train_model pid=6772)[0m   | Name      | Type               | Params
[2m[36m(train_model pid=6772)[0m -------------------------------------------------
[2m[36m(train_model pid=6772)[0m 0 | model     | MobileNetV3        | 4.2 M 
[2m[36m(train_model pid=6772)[0m 1 | train_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=6772)[0m 2 | valid_acc | MulticlassAccuracy | 0     
[2m[36m(train_model pid=6772)[0m 3 | test_acc  | MulticlassAccuracy | 0     
[2m[36m(train_model pid=6772)[0m -------------------------------------------------
[2m[36m(train_model pid=6772)[0m 4.2 M     Trainable params
[2m[36m(train_model pid=6772)[0m 0         Non-trainable params
[2m[36m(train_model pid=6772)[0m 4.2 M     Total params
[2m[36m(train_model pid=6772)[0m 16.859    Total estimated model params size (MB)
[2m[36m(train_model pid=6772)[0m   rank_zero_warn(
[2m[36m(train_model pid=12768)[0m 
[2m[36m(train_model pid

Trial name,acc,date,done,episodes_total,experiment_id,experiment_tag,hostname,iterations_since_restore,loss,node_ip,pid,time_since_restore,time_this_iter_s,time_total_s,timestamp,timesteps_since_restore,timesteps_total,training_iteration,trial_id,warmup_time
train_model_4bb8d_00000,0.623,2023-01-15_12-57-09,True,,28551eaaf4914acd84627ca244c087da,0_lr=0.0052,CZW21I26Q1204,5,1.08385,127.0.0.1,16996,3619.9,699.474,3619.9,1673783829,0,,5,4bb8d_00000,0.0
train_model_4bb8d_00001,0.0982,2023-01-15_13-01-03,True,,c6571711bfae495f9dde9956facec392,1_lr=0.0277,CZW21I26Q1204,5,2.35855,127.0.0.1,6772,3847.4,711.435,3847.4,1673784063,0,,5,4bb8d_00001,0.00299931
train_model_4bb8d_00002,0.09,2023-01-15_13-01-06,True,,b489e2df2f8d4f06872eb3690bb37e01,2_lr=0.0688,CZW21I26Q1204,5,2.31379,127.0.0.1,12768,3850.99,711.809,3850.99,1673784066,0,,5,4bb8d_00002,0.00499988
train_model_4bb8d_00003,0.7638,2023-01-15_12-53-13,True,,e6c561fcefc74cd197f847112da816b8,3_lr=0.0005,CZW21I26Q1204,5,0.840888,127.0.0.1,3248,3376.87,632.425,3376.87,1673783593,0,,5,4bb8d_00003,0.00399852


[2m[36m(train_model pid=3248)[0m `Trainer.fit` stopped: `max_epochs=5` reached.
[2m[36m(train_model pid=16996)[0m `Trainer.fit` stopped: `max_epochs=5` reached.
[2m[36m(train_model pid=6772)[0m `Trainer.fit` stopped: `max_epochs=5` reached.
[2m[36m(train_model pid=12768)[0m `Trainer.fit` stopped: `max_epochs=5` reached.
2023-01-15 13:01:08,035	INFO tune.py:762 -- Total run time: 3865.49 seconds (3865.08 seconds for the tuning loop).


Best hyperparameters found were:  {'lr': 0.0005211356069429316}


In [16]:
results.get_dataframe()

Unnamed: 0,loss,acc,time_this_iter_s,done,timesteps_total,episodes_total,training_iteration,trial_id,experiment_id,date,...,time_total_s,pid,hostname,node_ip,time_since_restore,timesteps_since_restore,iterations_since_restore,warmup_time,config/lr,logdir
0,1.083846,0.623,699.474198,False,,,5,4bb8d_00000,28551eaaf4914acd84627ca244c087da,2023-01-15_12-57-09,...,3619.897151,16996,CZW21I26Q1204,127.0.0.1,3619.897151,0,5,0.0,0.005161,C:\Users\pkantek\ray_results\tune_cifar\train_...
1,2.358554,0.0982,711.434555,False,,,5,4bb8d_00001,c6571711bfae495f9dde9956facec392,2023-01-15_13-01-03,...,3847.400492,6772,CZW21I26Q1204,127.0.0.1,3847.400492,0,5,0.002999,0.027726,C:\Users\pkantek\ray_results\tune_cifar\train_...
2,2.313794,0.09,711.80944,False,,,5,4bb8d_00002,b489e2df2f8d4f06872eb3690bb37e01,2023-01-15_13-01-06,...,3850.991401,12768,CZW21I26Q1204,127.0.0.1,3850.991401,0,5,0.005,0.068803,C:\Users\pkantek\ray_results\tune_cifar\train_...
3,0.840888,0.7638,632.424868,False,,,5,4bb8d_00003,e6c561fcefc74cd197f847112da816b8,2023-01-15_12-53-13,...,3376.872896,3248,CZW21I26Q1204,127.0.0.1,3376.872896,0,5,0.003999,0.000521,C:\Users\pkantek\ray_results\tune_cifar\train_...
