In [1]:
from pathlib import Path

import azureml
from IPython.display import display, Markdown
from azureml.core import Datastore, Experiment, ScriptRunConfig, Workspace, RunConfiguration
from azureml.core.dataset import Dataset
from azureml.core.environment import Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.exceptions import UserErrorException
import shutil


from model_drift import settings, helpers

# Report which Azure ML core SDK version this kernel has loaded
print(f"Azure ML SDK Version:  {azureml.core.VERSION}")

Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (cloudpickle 2.0.0 (d:\code\mlopsday2\medimaging-modeldriftmonitoring\.venv\lib\site-packages), Requirement.parse('cloudpickle<2.0.0,>=1.1.0'), {'azureml-dataprep'}).


Azure ML SDK Version:  1.34.0


In [2]:
# Coordinates of the Azure ML workspace this notebook targets
subscription_id = '9ca8df1a-bf40-49c6-a13f-66b72a85f43c'
resource_group = 'MLOps-Prototype'
workspace_name = 'MLOps_shared'

# Open a handle to the workspace identified by the three values above
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [3]:
# Look up the dataset registered in the workspace under the name 'padchest'
pc_dataset = Dataset.get_by_name(workspace=ws, name='padchest')


In [4]:
env_name = "finetune-padchest"

environment_file = settings.CONDA_ENVIRONMENT_FILE
project_dir = Path("./experiment/")

# Copy the model_drift package into the project directory so it sits next to
# the training script (project_dir is what gets handed to ScriptRunConfig as
# the source directory).
helpers.copytree(settings.TOP_DIR.joinpath('model_drift'), project_dir.joinpath('model_drift'))

# Build the Azure ML environment from the project's conda specification file.
pytorch_env = Environment.from_conda_specification(env_name, file_path=str(environment_file))

# Set environment variables BEFORE registering: anything changed on the local
# object after register() is not part of the registered definition, so the
# registered environment would otherwise differ from the one the runs use.
# NOTE(review): RSLEX_DIRECT_VOLUME_MOUNT presumably affects how the dataset
# mount is performed -- confirm against the Azure ML data-prep documentation.
pytorch_env.environment_variables["RSLEX_DIRECT_VOLUME_MOUNT"] = "True"

# Register the definition in the workspace and kick off an image build for it.
pytorch_env.register(workspace=ws)
build = pytorch_env.build(workspace=ws)

In [71]:
dbg = False

# Re-copy the model_drift package into the project directory before configuring the run
helpers.copytree(settings.TOP_DIR.joinpath('model_drift'), project_dir.joinpath('model_drift'))

# Refresh rate shared by the progress-bar and logging options below
log_refresh_rate = 1 if dbg else 25

# Name the experiment; debug runs are kept in their own experiment
experiment_name = 'finetune-padchest-dbg' if dbg else 'finetune-padchest'
exp = Experiment(workspace=ws, name=experiment_name)

print("Experiment:", exp.name)
print("Environment:", pytorch_env.name)

# Run configuration: registered environment, executed inside Docker
run_config = RunConfiguration()
run_config.environment = pytorch_env
run_config.docker = DockerConfiguration(use_docker=True, shm_size="100G")

# Command-line options for train.py, kept as an ordered option -> value mapping
script_options = {
    '--data_folder': pc_dataset.as_named_input('padchestv1').as_mount(),
    '--csv_root': ".",
    '--run_azure': 1,
    "--batch_size": 6,
    '--output_dir': './outputs',
    '--train_csv': "padchest_10labels_train.csv",
    "--val_csv": "padchest_10labels_val.csv",
    "--checkpoint": "pretrained-chexpert-iter_662400.pth.tar",

    '--max_epochs': 5 if dbg else 24,
    '--num_workers': -1,

    '--progress_bar_refresh_rate': log_refresh_rate,
    "--log_every_n_steps": log_refresh_rate,
    "--flush_logs_every_n_steps": log_refresh_rate,
    "--accelerator": "ddp",
    "--freeze_backbone": 0,
}

if dbg:
    # Debug-only overrides; every entry is currently disabled
    script_options.update({
        # '--limit_train_batches': 5,
        # '--limit_val_batches': 40,
        # "--num_sanity_val_steps": 40
    })

# Default to 0 sanity-validation steps unless an override was supplied above
script_options.setdefault("--num_sanity_val_steps", 0)

# Flatten the mapping into the [flag, value, flag, value, ...] list passed to the script
args = [token for option in script_options.items() for token in option]

config = ScriptRunConfig(
    source_directory=str(project_dir),
    script="train.py",
    arguments=args,
)

config.run_config = run_config


Experiment: finetune-padchest
Environment: finetune-padchest


In [72]:
# Compute target the run is submitted to (alternative cluster kept for reference)
config.run_config.target = "nc24-uswest2"
# config.run_config.target = "NC24rs-v3-usw2-d"

# Submit the configured script run to the experiment
run = exp.submit(config)

# Render portal links for the experiment and the run, plus the target used
display(Markdown(f"""
- Experiment: [{run.experiment.name}]({run.experiment.get_portal_url()})
- Run: [{run.display_name}]({run.get_portal_url()})
- Target: {config.run_config.target}
"""))



- Experiement: [finetune-padchest](https://ml.azure.com/experiments/finetune-padchest?wsid=/subscriptions/9ca8df1a-bf40-49c6-a13f-66b72a85f43c/resourcegroups/MLOps-Prototype/workspaces/MLOps_shared&tid=72f988bf-86f1-41af-91ab-2d7cd011db47)
- Run: [bright_loquat_47w5k3hp](https://ml.azure.com/runs/finetune-padchest_1635021703_36eaf995?wsid=/subscriptions/9ca8df1a-bf40-49c6-a13f-66b72a85f43c/resourcegroups/MLOps-Prototype/workspaces/MLOps_shared&tid=72f988bf-86f1-41af-91ab-2d7cd011db47)
- Target: nc24-uswest2


In [80]:
from azureml.train.hyperdrive import GridParameterSampling, RandomParameterSampling, BanditPolicy, HyperDriveConfig, uniform, PrimaryMetricGoal, choice, loguniform
# Fresh run configuration for the hyperparameter sweep
run_config = RunConfiguration()

cluster_name = "NC24rs-v3-usw2-d"
# cluster_name = "nc24-uswest2"

run_config.environment = pytorch_env
run_config.docker = DockerConfiguration(use_docker=True, shm_size="100G")
run_config.target = cluster_name


# Search space: each trial draws one value per hyperparameter at random.
param_sampling = RandomParameterSampling(
    {
        "freeze_backbone": choice([0, 1]),
        "batch_size": choice([8, 12, 16]),
        # NOTE(review): the original listed 1e-4 twice (1e-2, 1e-4, 1e-4), which
        # double-weights 1e-4 and never samples 1e-3; assumed 1e-3 was the
        # intended middle value -- confirm.
        "learning_rate": choice([1e-2, 1e-3, 1e-4]),
        "step_size": choice([4, 8, 12]),
        "accumulate_grad_batches": choice([1, 2, 4]),
    }
)

experiment_name = 'finetune-padchest-hyper'
exp = Experiment(workspace=ws, name=experiment_name)

# Reuse the existing ScriptRunConfig (script, source directory, base arguments)
# but point it at the run configuration defined in this cell.
config.run_config = run_config

# No early-termination policy; the sweep minimizes the 'val/loss' metric.
hyperdrive_config = HyperDriveConfig(run_config=config,
                                     hyperparameter_sampling=param_sampling,
                                     policy=None,
                                     primary_metric_name='val/loss',
                                     primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                                     max_total_runs=6*12,
                                     max_concurrent_runs=6)

In [81]:
# Launch the HyperDrive sweep in the current experiment
hyperdrive_run = exp.submit(hyperdrive_config)

# Assemble the markdown summary line by line; the leading and trailing empty
# entries reproduce the blank first and last lines of the rendered text
summary_lines = [
    "",
    f"- Experiment: [{hyperdrive_run.experiment.name}]({hyperdrive_run.experiment.get_portal_url()})",
    f"- Run: [{hyperdrive_run.display_name}]({hyperdrive_run.get_portal_url()})",
    f"- Target: {config.run_config.target}",
    "",
]
display(Markdown("\n".join(summary_lines)))


- Experiment: [finetune-padchest-hyper](https://ml.azure.com/experiments/finetune-padchest-hyper?wsid=/subscriptions/9ca8df1a-bf40-49c6-a13f-66b72a85f43c/resourcegroups/MLOps-Prototype/workspaces/MLOps_shared&tid=72f988bf-86f1-41af-91ab-2d7cd011db47)
- Run: [zen_net_t30h0y0w](https://ml.azure.com/runs/HD_e9329a40-b7e2-4309-b35a-3ed45c4de352?wsid=/subscriptions/9ca8df1a-bf40-49c6-a13f-66b72a85f43c/resourcegroups/MLOps-Prototype/workspaces/MLOps_shared&tid=72f988bf-86f1-41af-91ab-2d7cd011db47)
- Target: NC24rs-v3-usw2-d


In [79]:
# Run the training script with --help in a subprocess to list the command-line
# options it accepts
!python experiment/train.py --help


=====
 Pytorch Lightning Version: 1.4.7

Failure while loading azureml_run_type_providers. Failed to load entrypoint automl = azureml.train.automl.run:AutoMLRun._from_run_dto with exception (cloudpickle 2.0.0 (d:\code\mlopsday2\medimaging-modeldriftmonitoring\.venv\lib\site-packages), Requirement.parse('cloudpickle<2.0.0,>=1.1.0'), {'azureml-dataprep'}).



 Pytorch Version: 1.8.0
 Num GPUs: 1
 Num CPUs: 16
 Node Rank: 0
 Local Rank: 0
 World Size: 1
 Global Rank: 0
 Rank ID: 0-0
=====

usage: train.py [-h] [--run_azure RUN_AZURE] [--output_dir OUTPUT_DIR]
                [--data_folder DATA_FOLDER] [--batch_size BATCH_SIZE]
                [--train_csv TRAIN_CSV] [--val_csv VAL_CSV]
                [--checkpoint CHECKPOINT] [--csv_root CSV_ROOT]
                [--num_classes NUM_CLASSES] [--num_workers NUM_WORKERS]
                [--learning_rate LEARNING_RATE]
                [--freeze_backbone FREEZE_BACKBONE]
                [--weight_decay WEIGHT_DECAY] [--gamma GAMMA]
                [--step_size STEP_SIZE] [--logger [LOGGER]]
                [--checkpoint_callback [CHECKPOINT_CALLBACK]]
                [--default_root_dir DEFAULT_ROOT_DIR]
                [--gradient_clip_val GRADIENT_CLIP_VAL]
                [--gradient_clip_algorithm GRADIENT_CLIP_ALGORITHM]
                [--process_position PROCESS_POSITION] [--num_nodes N