In [1]:
from pathlib import Path

import azureml
from IPython.display import display, Markdown
from azureml.core import Datastore, Experiment, ScriptRunConfig, Workspace, RunConfiguration
from azureml.core.dataset import Dataset
from azureml.core.environment import Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.exceptions import UserErrorException
import shutil


from model_drift import settings, helpers

# check core SDK version number
print("Azure ML SDK Version: ", azureml.core.VERSION)

Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (cloudpickle 2.0.0 (d:\code\mlopsday2\medimaging-modeldriftmonitoring\.venv\lib\site-packages), Requirement.parse('cloudpickle<2.0.0,>=1.1.0'), {'azureml-dataprep'}).


Azure ML SDK Version:  1.35.0


In [2]:
# Connect to workspace
ws = Workspace.from_config(settings.AZUREML_CONFIG)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [5]:
dbg = True

log_refresh_rate = 25
if dbg:
    log_refresh_rate = 1

env_name = "finetune-padchest"

# Name experiement
experiment_name = 'finetune-padchest' if not dbg else 'finetune-padchest-dbg'

# Input Dataset
dataset = Dataset.get_by_name(ws, name='padchest')

#Experiment
exp = Experiment(workspace=ws, name=experiment_name)

#Environment
environment_file = settings.CONDA_ENVIRONMENT_FILE
project_dir = settings.SRC_DIR
pytorch_env = Environment.from_conda_specification(env_name, file_path =str(environment_file))
pytorch_env.register(workspace=ws)
build = pytorch_env.build(workspace=ws)
pytorch_env.environment_variables["RSLEX_DIRECT_VOLUME_MOUNT"] = "True"

# Run Configuration
run_config = RunConfiguration()
run_config.environment_variables["RSLEX_DIRECT_VOLUME_MOUNT"] = "True"
run_config.environment = pytorch_env
run_config.docker = DockerConfiguration(use_docker=True, shm_size="100G")


args = {
 'data_folder': dataset.as_named_input('dataset').as_mount(),
 'run_azure': 1,
 'batch_size': 6,
 'output_dir': './outputs',
 'pretrained': 'pretrained-chexpert/iter_662400.pth.tar',
 'num_workers': -1,
 'max_epochs':  24 if not dbg else 5,
 'progress_bar_refresh_rate': 25 if not dbg else 1,
 'log_every_n_steps':  25 if not dbg else 1,
 'flush_logs_every_n_steps':  25 if not dbg else 1,
 'accelerator': 'ddp',
 'freeze_backbone': 0,
 'frontal_only': 1,
 'num_sanity_val_steps': 0
 }

if dbg:
    args.update({
        'limit_train_batches': 5,
        'limit_val_batches': 40,
        "num_sanity_val_steps": 40
    })

args.setdefault("num_sanity_val_steps", 0)


for param, value in args.items():
    print(f"{param}: {value}")

config = ScriptRunConfig(
    source_directory = str(project_dir), 
    script = "scripts/finetune/train.py",
    arguments=helpers.argsdict2list(args),
)

config.run_config = run_config

for param, value in args.items():
    print(f"{param}: {value}")




data_folder <azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x000001B68A17ABA8>
run_azure 1
batch_size 6
output_dir ./outputs
pretrained pretrained-chexpert/iter_662400.pth.tar
num_workers -1
max_epochs 5
progress_bar_refresh_rate 1
log_every_n_steps 1
flush_logs_every_n_steps 1
accelerator ddp
freeze_backbone 0
frontal_only 1
num_sanity_val_steps 40
limit_train_batches 5
limit_val_batches 40


NameError: name 'run' is not defined

In [None]:
config.run_config.target = "nc24-uswest2"
# config.run_config.target = "NC24rs-v3-usw2-d"
run = exp.submit(config)
display(Markdown(f"""
- Environment: {pytorch_env.name}
- Experiment: [{run.experiment.name}]({run.experiment.get_portal_url()})
- Run: [{run.display_name}]({run.get_portal_url()})
- Target: {config.run_config.target}
"""))

In [4]:
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling, BanditPolicy, HyperDriveConfig, uniform, PrimaryMetricGoal, choice, loguniform
run_config = RunConfiguration()

cluster_name = "NC24rs-v3-usw2-d"
# cluster_name = "NC24rs-v3-usw2-l"

run_config.environment = pytorch_env
run_config.docker = DockerConfiguration(use_docker=True, shm_size="100G")
run_config.target = cluster_name

if "limit_train_batches" in args:
    del args["limit_train_batches"]

if "limit_val_batches" in args:
    del args["limit_val_batches"]

args.update({
 'max_epochs':  24,
 'progress_bar_refresh_rate': 25,
 'log_every_n_steps':  25,
 'flush_logs_every_n_steps':  25,
 "num_sanity_val_steps": 0
})


param_sampling = RandomParameterSampling(
    {   "freeze_backbone": choice([0, 1]),
        "batch_size": choice([8, 12, 16]),
        "learning_rate": choice(1e-2, 1e-4, 1e-4),
        "step_size": choice([4, 8, 12]),
        "accumulate_grad_batches": choice([1, 2, 4]),
        "frontal_only": choice([0, 1])
    }
)

experiment_name = 'finetune-padchest-hyper'
exp = Experiment(workspace=ws, name=experiment_name)
config.run_config = run_config
hyperdrive_config = HyperDriveConfig(run_config=config,
                                     hyperparameter_sampling=param_sampling, 
                                     policy=None,
                                     primary_metric_name='val/AUROC.mean',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=6*18,
                                     max_concurrent_runs=6)

for param, value in args.items():
    print(f"{param}: {value}")

data_folder <azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x000001B6FFF51550>
run_azure 1
batch_size 6
output_dir ./outputs
pretrained pretrained-chexpert/iter_662400.pth.tar
num_workers -1
max_epochs 24
progress_bar_refresh_rate 25
log_every_n_steps 25
flush_logs_every_n_steps 25
accelerator ddp
freeze_backbone 0
frontal_only 1
num_sanity_val_steps 0


In [None]:
# start the HyperDrive run
hyperdrive_run = exp.submit(hyperdrive_config)
display(Markdown(f"""
- Experiment: [{hyperdrive_run.experiment.name}]({hyperdrive_run.experiment.get_portal_url()})
- Run: [{hyperdrive_run.display_name}]({hyperdrive_run.get_portal_url()})
- Target: {config.run_config.target}
"""))