In [1]:
import os
import torch
import torchvision

from pytorch_lightning.loggers.neptune import NeptuneLogger

from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from pytorch_lightning.callbacks import ModelCheckpoint

from model import PlaceholderModel
from pathlib import Path
from argparse import Namespace


from ax.plot.contour import plot_contour
from ax.plot.trace import optimization_trace_single_method
from ax.service.managed_loop import optimize
from ax.utils.notebook.plotting import render, init_notebook_plotting

from ax.service.ax_client import AxClient

from env import NEPTUNE_API_TOKEN

In [2]:
# --- Experiment identity ------------------------------------------------
algo = "conv_expaning_ae"
architecture = "arch_1_l1_loss"
experiment_name = 'expanding_AE_batch1'

# --- Locations (local alternatives kept for reference) ------------------
# data_path = '../../data/Proca_fiducial_scaled_cropped.hdf5'
data_path = '/scratch/ns4486/numerical-relativity-interpolation/Proca_fiducial_scaled_cropped.hdf5'

# checkpoint_path = '/Users/nikhilvs/checkpoints'
checkpoint_path = f'/scratch/prs392/capstone/checkpoints/{algo}/{architecture}'

# Make sure the experiment directory exists, then collect the names of the
# sub-directories already inside it. The names are sorted as plain strings.
d = os.path.join(checkpoint_path, experiment_name)
Path(d).mkdir(parents=True, exist_ok=True)
versions = sorted(
    entry for entry in os.listdir(d)
    if os.path.isdir(os.path.join(d, entry))
)

In [3]:
# Display the checkpoint root so the value built from `algo` and
# `architecture` can be checked by eye before any run writes to it.
checkpoint_path

'/scratch/prs392/capstone/checkpoints/conv_expaning_ae/arch_1_l1_loss'

In [4]:
from tensorboard.backend.event_processing import event_accumulator
import numpy as np

def _load_run(path):
    """Read every scalar series recorded in a TensorBoard event log.

    Args:
        path: Location handed to ``event_accumulator.EventAccumulator``.

    Returns:
        dict: Maps each scalar tag (inserted in sorted order) to a
        ``(steps, values)`` pair of NumPy arrays built from the events'
        ``step`` and ``value`` fields.
    """
    accumulator = event_accumulator.EventAccumulator(path)
    accumulator.Reload()

    series_by_tag = {}
    for tag in sorted(accumulator.Tags()["scalars"]):
        events = accumulator.Scalars(tag)
        steps = np.asarray([event.step for event in events])
        values = np.asarray([event.value for event in events])
        series_by_tag[tag] = (steps, values)

    return series_by_tag

In [5]:
from pytorch_lightning.core.saving import load_hparams_from_yaml

# Accumulators for earlier runs. Nothing is appended to them at the moment
# because the collection code inside the loop below is commented out.
list_existing_hparams, list_of_val_loss = [], []

# Print the hyper-parameters stored next to each run found in `versions`.
for version in versions:
    hparam_path = os.path.join(
        os.path.join(checkpoint_path, experiment_name), version, 'hparams.yaml'
    )
    hparams_new = load_hparams_from_yaml(hparam_path)
    print(hparams_new)

    # Disabled: collect the hparams and the minimum logged 'val_loss' per run.
#     print(min(_load_run(os.path.join(checkpoint_path, experiment_name, version))['val_loss'][1]))
#     list_existing_hparams.append(hparams_new)
#     list_of_val_loss.append(min(_load_run(os.path.join(checkpoint_path, experiment_name, version))['val_loss'][1]))

{}
{}
{}
{}
{}



Missing Tags: /scratch/prs392/capstone/checkpoints/conv_expaning_ae/arch_1_l1_loss/expanding_AE_batch1/NUM-42/hparams.yaml.


Missing Tags: /scratch/prs392/capstone/checkpoints/conv_expaning_ae/arch_1_l1_loss/expanding_AE_batch1/NUM-56/hparams.yaml.


Missing Tags: /scratch/prs392/capstone/checkpoints/conv_expaning_ae/arch_1_l1_loss/expanding_AE_batch1/NUM-64/hparams.yaml.


Missing Tags: /scratch/prs392/capstone/checkpoints/conv_expaning_ae/arch_1_l1_loss/expanding_AE_batch1/NUM-65/hparams.yaml.


Missing Tags: /scratch/prs392/capstone/checkpoints/conv_expaning_ae/arch_1_l1_loss/expanding_AE_batch1/NUM-66/hparams.yaml.



In [6]:
def train_evaluate(parameterization):
    """Train one model for a hyper-parameter configuration and report its loss.

    Args:
        parameterization: Mapping of hyper-parameter names to values. It is
            expanded into an ``argparse.Namespace`` for ``PlaceholderModel``
            and is also passed to the Neptune logger as ``params``.

    Returns:
        dict: ``{'loss': (best_loss, 0.0)}``, where ``best_loss`` is
        ``model.best_loss`` read after fitting and converted to a plain
        Python float. The fixed ``0.0`` is the second tuple element (Ax
        interprets it as the standard error of the mean).
    """
    # Re-seed on every call so each trial starts from the same RNG state.
    seed_everything(123)

    print(parameterization)

    hparams = Namespace(**parameterization)
    model = PlaceholderModel(hparams, data_path)
#     logger = TensorBoardLogger(checkpoint_path, name=experiment_name)
    neptune_logger = NeptuneLogger(
        api_key=NEPTUNE_API_TOKEN,
        project_name="nyuds2019/numerical-relativity-interpolation",
        experiment_name=experiment_name,  # Optional,
        params=parameterization,  # Optional,
        tags=["Param" ,"expanding_AE", "scaled_data", "3D", "l1"]  # Optional,
    )
    # Keep the five best checkpoints by 'val_loss' plus the last one.
    checkpoint_callback = ModelCheckpoint(
        filepath=None,
        save_top_k=5,
        save_last = True,
        verbose=False,
        monitor='val_loss',
        mode='min',
        prefix=''
    )
    trainer = Trainer(
#          logger = logger,
         logger = neptune_logger,
         default_root_dir=checkpoint_path,
         checkpoint_callback = checkpoint_callback,
         max_epochs = 100,
         gpus = -1,
         distributed_backend = 'dp',
#          val_check_interval=0.10,
         log_save_interval=1,
         row_log_interval=1,
         fast_dev_run=False
    )
    trainer.fit(model)

    # Convert to a plain float before cleanup: Ax then receives a numeric
    # mean, and no reference to a (possibly GPU-resident) tensor survives
    # the `del` / `empty_cache()` below.
    model_best_loss = float(model.best_loss)

    # Drop every reference to the training objects first so that
    # empty_cache() can actually release their cached GPU memory.
    del trainer, model, checkpoint_callback, neptune_logger, hparams

    torch.cuda.empty_cache()

    return {
        'loss': (model_best_loss, 0.0)
    }

In [7]:
# Hyper-parameter search space: one dict per tunable parameter.
search_space = [
    {"name": "batch_size", "type": "choice", "values": [8, 16]},
    {"name": "num_layers", "type": "choice", "values": [3, 4, 5, 6]},
    # Learning rate is searched on a log scale.
    {"name": "lr", "type": "range", "bounds": [1e-4, 0.1], "log_scale": True},
    {"name": "lr_type", "type": "choice", "values": ['adam', 'sgd']},
    {"name": "scheduler_epoch", "type": "choice", "values": [5, 7, 10]},
    {"name": "scheduler_step_size", "type": "range", "bounds": [0.1, 1.0]},
]

# Single-objective experiment that minimises the metric named "loss".
ax_client = AxClient()
ax_client.create_experiment(
    name=experiment_name,
    parameters=search_space,
    objective_name="loss",
    minimize=True,
)

[INFO 10-11 16:06:52] ax.service.ax_client: Starting optimization with verbose logging. To disable logging, set the `verbose_logging` argument to `False`. Note that float values in the logs are rounded to 2 decimal points.
[INFO 10-11 16:06:52] ax.modelbridge.dispatch_utils: Using Sobol generation strategy.


In [None]:
# Overall trial budget. The hand-picked configurations below count
# against it, and the remainder is filled with Ax-generated trials.
total_number_of_trials = 50

# Hand-picked configurations evaluated before Ax starts suggesting its own.
fixed_params = [
    {
        'batch_size': 8,
        'num_layers': 4,
        'lr': 0.0001,
        'lr_type': 'adam',
        'scheduler_epoch': 10,
        'scheduler_step_size': 0.6
    },
    {
        'batch_size': 16,
        'num_layers': 4,
        'lr': 0.001,
        'lr_type': 'adam',
        'scheduler_epoch': 7,
        'scheduler_step_size': 0.8
    },
#     {
#         'batch_size': 16,
#         'num_layers': 5,
#         'lr': 0.0001,
#         'lr_type': 'adam',
#         'scheduler_epoch': 10,
#         'scheduler_step_size': 0.9
#     },
    {
        'batch_size': 16,
        'num_layers': 5,
        'lr': 0.001,
        'lr_type': 'adam',
        'scheduler_epoch': 7,
        'scheduler_step_size': 0.8
    },
    {
        'batch_size': 16,
        'num_layers': 6,
        'lr': 0.0001,
        'lr_type': 'adam',
        'scheduler_epoch': 10,
        'scheduler_step_size': 0.9
    },
    {
        'batch_size': 16,
        'num_layers': 6,
        'lr': 0.001,
        'lr_type': 'adam',
        'scheduler_epoch': 7,
        'scheduler_step_size': 0.8
    }
]


def _run_trial(parameters, trial_index):
    """Train one configuration and record the outcome on ``ax_client``.

    If ``train_evaluate`` raises, the trial is logged as failed in Ax and
    the exception is re-raised, so the error still surfaces in the notebook
    but the trial is not left open in the experiment.
    """
    try:
        raw_data = train_evaluate(parameters)
    except Exception:
        ax_client.log_trial_failure(trial_index=trial_index)
        raise
    ax_client.complete_trial(trial_index=trial_index, raw_data=raw_data)


# Phase 1: evaluate the hand-picked configurations, each consuming budget.
for params in fixed_params:
    parameters, trial_index = ax_client.attach_trial(params)
    _run_trial(parameters, trial_index)
    total_number_of_trials -= 1

# Phase 2: spend the remaining budget on configurations suggested by Ax.
for _ in range(total_number_of_trials):
    parameters, trial_index = ax_client.get_next_trial()
    print(trial_index)
    _run_trial(parameters, trial_index)

[INFO 10-11 16:06:54] ax.service.ax_client: Attached custom parameterization {'batch_size': 8, 'num_layers': 4, 'lr': 0.0, 'lr_type': 'adam', 'scheduler_epoch': 10, 'scheduler_step_size': 0.6} as trial 0.


{'batch_size': 8, 'num_layers': 4, 'lr': 0.0001, 'lr_type': 'adam', 'scheduler_epoch': 10, 'scheduler_step_size': 0.6}
https://ui.neptune.ai/nyuds2019/numerical-relativity-interpolation/e/NUM-67


NeptuneLogger will work in online mode
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
CUDA_VISIBLE_DEVICES: [0,1]
Set SLURM handle signals.

  | Name      | Type    | Params
--------------------------------------
0 | encoder   | Encoder | 226 K 
1 | decoder   | Decoder | 226 K 
2 | criterion | L1Loss  | 0     


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…