In [1]:
from pathlib import Path

import azureml
import pandas as pd
from IPython.display import display, Markdown
from azureml.core import Run, Model
from azureml.core import Datastore, Experiment, ScriptRunConfig, Workspace, RunConfiguration
from azureml.core.dataset import Dataset
from azureml.core.environment import Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.data import OutputFileDatasetConfig
from azureml.exceptions import UserErrorException

from model_drift import settings, helpers
# Print the version of the Azure ML core SDK this kernel is using.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)


Azure ML SDK Version:  1.35.0


In [2]:
# Open the Azure ML workspace described by the project's config file.
ws = Workspace.from_config(path=settings.AZUREML_CONFIG)


If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


In [3]:
# --- Names used by this submission ------------------------------------------
input_dataset_name = "results"
env_name = "vae"

# When `dbg` is truthy the runs are filed under a separate "-dbg" experiment.
dbg = 0
experiment_name = 'generate-drift-csv-peds2' + ("-dbg" if dbg else "")
exp = Experiment(workspace=ws, name=experiment_name)

# --- Environment -------------------------------------------------------------
environment_file = settings.CONDA_ENVIRONMENT_FILE
project_dir = settings.SRC_DIR

pytorch_env = Environment.from_conda_specification(env_name, file_path=str(environment_file))
pytorch_env.register(workspace=ws)
build = pytorch_env.build(workspace=ws)

# The same two variables are set on both the environment and the run configuration.
# NOTE(review): they are added to `pytorch_env` only after register()/build() have
# been called -- confirm that the registered definition is not expected to carry them.
runtime_env_vars = {
    "RSLEX_DIRECT_VOLUME_MOUNT": "True",
    "AZUREML_COMPUTE_USE_COMMON_RUNTIME": "false",
}
for var_name, var_value in runtime_env_vars.items():
    pytorch_env.environment_variables[var_name] = var_value

# --- Run configuration -------------------------------------------------------
run_config = RunConfiguration()
for var_name, var_value in runtime_env_vars.items():
    run_config.environment_variables[var_name] = var_value
run_config.environment = pytorch_env
run_config.docker = DockerConfiguration(use_docker=True, shm_size="100G")

# --- Input dataset -----------------------------------------------------------
dataset = Dataset.get_by_name(ws, name=input_dataset_name)

# datastore = ws.datastores[datastore_name]







In [16]:
# Base arguments for scripts/drift/generate-drift-csv.py. Later statements in this
# cell add to or override these entries before they are turned into a CLI list.
# Key order is kept as-is because it determines the order of the generated arguments.
mounted_input = dataset.as_named_input('input').as_mount()

args = {
    # execution / I/O
    "run_azure": 1,
    "input_dir": mounted_input,
    "output_dir": "./outputs/",
    # filters and rolling-window options (interpreted by the script)
    "vae_filter": "all-data",
    "classifier_filter": "frontal_only",
    "stride": "D",
    "min_periods": 150,
    "ref_frontal_only": 1,
    "include_metadata": 1,
    # sampling
    "replacement": 1,
    "sample_size": 2500,
    "n_samples": 20,
    # misc
    "generate_name": 0,
    "num_workers": 10,
    "dbg": int(dbg),
    # date range
    "start_date": "2012-11-01",
    "end_date": "2015-02-01",
}
compute_target = "cpu-cluster"
window = "30D"

# Scenario selection: `args_update` holds the overrides merged into `args` at the
# end of this block. Exactly one assignment is active; every commented entry is an
# alternative scenario that can be swapped in.

# args_update = {"window": window}
# args_update = {"window": window, "nonfrontal_add_date": "2014-06-01",}
# args_update = {"window": window, "nonfrontal_add_date": "2014-04-01",}
# args_update = {"window": window, "nonfrontal_add_date": "2014-06-01", "frontal_remove_date": "2014-09-01"}
# args_update = {"window": window, "nonfrontal_add_date": "2014-04-01", "frontal_remove_date": "2014-09-01"}
# args_update = {"window": window, "nonfrontal_add_date": "2014-06-01", "peds_weight": .5, "frontal_remove_date": "2014-08-01"}
# args_update = {"window": window, "nonfrontal_add_date": "2014-06-01", "peds_weight": .25, "frontal_remove_date": "2014-08-01"}

# args_update = {"window": window, "peds_weight": .5}
# args_update = {"window": window, "peds_weight": .75}
args_update = dict(window=window, peds_weight=.75, frontal_remove_date="2014-09-01")
# args_update = {"window": window, "peds_weight": 1, "frontal_remove_date": "2014-06-01"}

# args_update = {"window": window, "midrc_weight": .5}
# args_update = {"window": window, "midrc_weight": .75}
# args_update = {"window": window, "midrc_weight": .25}


# all_bad = True
# compute_target = "cpu-cluster"
# args_update = {
#     "window": window,
#     "bad_q": 0.1,
#     "bad_start_date": "2014-06-01" if not all_bad else args['start_date'],
#     "bad_sample_start_date": "2014-06-01" if not all_bad else args['start_date'],
#     "bad_sample_end_date":  "2014-12-31" if not all_bad else args['end_date']
# }

# all_good = True
# compute_target = "cpu-cluster2"
# args_update = {
#     "window": window,
#     "good_q": 0.1,
#     "good_start_date": "2014-06-01" if not all_good else args['start_date'],
#     "good_sample_start_date": "2014-06-01" if not all_good else args['start_date'],
#     "good_sample_end_date":  "2014-12-31" if not all_good else args['end_date']
# }

# compute_target = "cpu-cluster2"
# args_update = {
#     "window": window,
#     "bad_q": 1.0,
#     "bad_start_date": "2014-06-01",
#     "bad_sample_start_date":  "2014-01-01",
#     "bad_sample_end_date":  "2014-05-31"
# }

# compute_target = "cpu-cluster"
# args_update = {
#     "window": window,
#     "midrc_weight": .75,
#     "midrc_include": "Typical",
#     "midrc_exclude": "Negative"
# }

# The classifier dataset is added first, then the scenario overrides.
args["classifier_dataset"] = "padchest-finetuned-chx-frontalonly"
args.update(args_update)

# Derived arguments: entries that follow from the overrides merged into `args`.

# A peds scenario turns metadata off and fixes the peds start date.
if 'peds_weight' in args:
    args['include_metadata'] = 0
    args['peds_start_date'] = "2014-06-01"

# A midrc scenario fixes the midrc start date.
if 'midrc_weight' in args:
    args['midrc_start_date'] = "2014-06-01"

# With a non-zero bad_q / good_q, frontal_remove_date is set to that scenario's start date.
if args.get("bad_q", 0):
    args["frontal_remove_date"] = args['bad_start_date']

if args.get("good_q", 0):
    args["frontal_remove_date"] = args['good_start_date']

# The 14-day window uses a smaller minimum period count than the base value.
# The key must be "min_periods" (the one defined in the base `args`); the previous
# "min_period" spelling added a second, differently named argument instead of
# overriding the existing entry.
if args['window'] == '14D':
    args['min_periods'] = 75


# Echo the final argument set, one "name value" pair per line, before submitting.
for arg_name, arg_value in args.items():
    print(f"{arg_name} {arg_value}")



# Point the shared run configuration at the compute target chosen above.
run_config.target = compute_target

# Describe the script run: source directory, entry script and CLI arguments.
config = ScriptRunConfig(
    source_directory=str(project_dir),
    script="scripts/drift/generate-drift-csv.py",
    arguments=helpers.argsdict2list(args),
)
config.run_config = run_config

# Submit the run and render links to the experiment and the new run.
run = exp.submit(config)
run_summary = f"""
- Experiment: [{run.experiment.name}]({run.experiment.get_portal_url()})
- Run: [{run.display_name}]({run.get_portal_url()})
- Target: {config.run_config.target}
"""
display(Markdown(run_summary))

run_azure 1
input_dir <azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x00000156CBB4F198>
output_dir ./outputs/
vae_filter all-data
classifier_filter frontal_only
stride D
min_periods 150
ref_frontal_only 1
include_metadata 0
replacement 1
sample_size 2500
n_samples 20
generate_name 0
num_workers 10
dbg 0
start_date 2012-11-01
end_date 2015-02-01
classifier_dataset padchest-finetuned-chx-frontalonly
window 30D
peds_weight 1
frontal_remove_date 2014-06-01
peds_start_date 2014-06-01



- Experiment: [generate-drift-csv-peds2](https://ml.azure.com/experiments/generate-drift-csv-peds2?wsid=/subscriptions/9ca8df1a-bf40-49c6-a13f-66b72a85f43c/resourcegroups/MLOps-Prototype/workspaces/MLOps_shared&tid=72f988bf-86f1-41af-91ab-2d7cd011db47)
- Run: [hungry_ticket_7rjkc9dv](https://ml.azure.com/runs/generate-drift-csv-peds2_1643398683_1cc492e3?wsid=/subscriptions/9ca8df1a-bf40-49c6-a13f-66b72a85f43c/resourcegroups/MLOps-Prototype/workspaces/MLOps_shared&tid=72f988bf-86f1-41af-91ab-2d7cd011db47)
- Target: cpu-cluster


In [10]:
# Build a table with one row per run of the experiment (failed and canceled runs
# are skipped), indexed by the run's display name.
ws = Workspace.from_config(settings.AZUREML_CONFIG)

# experiment_name = 'generate-drift-csv-5'
exp = Experiment(workspace=ws, name=experiment_name)


def _run_to_record(aml_run):
    """Flatten one run into a dict of its arguments, tags and identifying fields.

    The run's argument list is read as alternating ``flag, value`` items; the first
    two characters of each flag are dropped to form the key. Tags are merged on top
    of the arguments, followed by ``id``, ``display_name``, ``url``, ``run`` (the
    run object itself) and ``Status``.
    """
    cli_args = aml_run.get_details()['runDefinition']['arguments']
    flags, values = cli_args[::2], cli_args[1::2]

    record = dict(zip((flag[2:] for flag in flags), values))
    record.update(aml_run.tags)
    record['id'] = aml_run.id
    record['display_name'] = aml_run.display_name
    record['url'] = aml_run.get_portal_url()
    record['run'] = aml_run
    record['Status'] = aml_run.status
    return record


# Distinct names are used here so the submission cell's `run` handle and `args`
# dict are not overwritten by this listing.
run_records = [
    _run_to_record(aml_run)
    for aml_run in exp.get_runs()
    if aml_run.status not in ('Failed', "Canceled")
]

df = pd.DataFrame(run_records)
# With no qualifying runs there is no 'display_name' column to index on.
if not df.empty:
    df = df.set_index(['display_name'])
df

Unnamed: 0_level_0,run_azure,input_dir,output_dir,vae_filter,classifier_filter,stride,min_periods,ref_frontal_only,include_metadata,replacement,...,classifier_dataset,window,peds_weight,peds_start_date,_aml_system_ComputeTargetStatus,id,url,run,Status,frontal_remove_date
display_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
gifted_onion_vmg06cy4,1,DatasetConsumptionConfig:input,./outputs/,all-data,frontal_only,D,150,1,0,1,...,padchest-finetuned-chx-frontalonly,30D,0.75,2014-06-01,"{""AllocationState"":""steady"",""PreparingNodeCoun...",generate-drift-csv-peds2_1643397022_a0b7e1a0,https://ml.azure.com/runs/generate-drift-csv-p...,"Run(Experiment: generate-drift-csv-peds2,\nId:...",Running,
mango_gyro_nc94tsrs,1,DatasetConsumptionConfig:input,./outputs/,all-data,frontal_only,D,150,1,0,1,...,padchest-finetuned-chx-frontalonly,30D,1.0,2014-06-01,"{""AllocationState"":""steady"",""PreparingNodeCoun...",generate-drift-csv-peds2_1643397012_4e1a749b,https://ml.azure.com/runs/generate-drift-csv-p...,"Run(Experiment: generate-drift-csv-peds2,\nId:...",Running,2014-06-01
loyal_scooter_l8n6cmhj,1,DatasetConsumptionConfig:input,./outputs/,all-data,frontal_only,D,150,1,0,1,...,padchest-finetuned-chx-frontalonly,30D,0.5,2014-06-01,"{""AllocationState"":""steady"",""PreparingNodeCoun...",generate-drift-csv-peds2_1643397030_3dd88d53,https://ml.azure.com/runs/generate-drift-csv-p...,"Run(Experiment: generate-drift-csv-peds2,\nId:...",Queued,


In [13]:
def is_arg_col(c):
    """Return True when column name ``c`` should be treated as a run-argument column.

    A name is rejected when it contains any of the substrings "mlflow", "_aml",
    "run_" or "url", or when it is exactly one of the names in ``excluded_names``.
    """
    excluded_fragments = ("mlflow", "_aml", 'run_', 'url')
    excluded_names = ("output_dir", "input_dir", 'run', 'display_name', 'id')

    if any(fragment in c for fragment in excluded_fragments):
        return False
    return c not in excluded_names
# Show only the argument columns whose value differs between runs; a missing value
# counts as its own category ('NA').
arg_df = df  # [pd.to_numeric(df["good_q"], errors="coerce").fillna(0)>0]

arg_cols = [
    col
    for col in df.columns
    if is_arg_col(col) and arg_df[col].fillna('NA').nunique() > 1
]

# Optional extra filters, currently disabled:
# arg_df = df[arg_cols].query("good_q >).copy()
# arg_df = arg_df[~arg_df['frontal_remove_date'].str.contains('2013').astype(bool)]
# arg_df = arg_df[~arg_df.duplicated(keep='first')]
arg_df[arg_cols]  # .sort_values(['good_start_date','good_q'])

Unnamed: 0_level_0,peds_weight,Status,frontal_remove_date
display_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
gifted_onion_vmg06cy4,0.75,Running,
mango_gyro_nc94tsrs,1.0,Running,2014-06-01
loyal_scooter_l8n6cmhj,0.5,Queued,
