In [None]:
import json
from pathlib import Path

import azureml
import pandas as pd
from IPython.display import display, Markdown
from azureml.core import Run, Model
from azureml.core import Datastore, Experiment, ScriptRunConfig, Workspace, RunConfiguration
from azureml.core.dataset import Dataset
from azureml.data import OutputFileDatasetConfig
from azureml.core.environment import Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.exceptions import UserErrorException

from model_drift import settings, helpers
# Report which Azure ML core SDK version this kernel is running.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)


In [None]:
# Load the Azure ML workspace from the project's config file and show it as the cell output.
ws = Workspace.from_config(path=settings.AZUREML_CONFIG)
ws

In [None]:
# Name of the registered dataset that is looked up at the end of this cell.
input_dataset_name = "results"

# Debug switch: a truthy value appends "-dbg" to the experiment name.
dbg = 1
experiment_name = 'generate-drift-csv-label-mod' + ("-dbg" if dbg else "")
env_name = "vae"

exp = Experiment(workspace=ws, name=experiment_name)

environment_file = settings.CONDA_ENVIRONMENT_FILE
project_dir = settings.SRC_DIR

# Create the environment from the conda specification, then register it and start a build.
pytorch_env = Environment.from_conda_specification(env_name, file_path=str(environment_file))
pytorch_env.register(workspace=ws)
build = pytorch_env.build(workspace=ws)

# The same two variables are written to the environment object and to the run configuration.
runtime_env_vars = {
    "RSLEX_DIRECT_VOLUME_MOUNT": "True",
    "AZUREML_COMPUTE_USE_COMMON_RUNTIME": "false",
}
for var_name, var_value in runtime_env_vars.items():
    pytorch_env.environment_variables[var_name] = var_value

# Run Configuration
run_config = RunConfiguration()
for var_name, var_value in runtime_env_vars.items():
    run_config.environment_variables[var_name] = var_value
run_config.environment = pytorch_env
run_config.docker = DockerConfiguration(use_docker=True, shm_size="100G")

# Input Dataset
dataset = Dataset.get_by_name(ws, name=input_dataset_name)

# datastore = ws.datastores[datastore_name]







In [None]:
# Base arguments for the job; they are converted to a command-line list when the
# script run is configured further down in this cell.
args = {
    'run_azure': 1,
    # The dataset is exposed to the job as the named input 'input', in mount mode.
    "input_dir": dataset.as_named_input('input').as_mount(),
    "output_dir": "./outputs/",
    "vae_filter": 'all-data',
    "classifier_filter": 'frontal_only',
    "stride": "D",
    "min_periods": 150,
    "ref_frontal_only": 1,
    "include_metadata": 1,
    "replacement": 1,
    "sample_size": 2500,
    "n_samples": 20,
    "generate_name": 0,
    "num_workers": 10,
    # Forward the notebook's debug switch as an integer.
    "dbg": int(dbg),
}


# Scenario: every listed label gets a share, all with the same date '2014-04-01' and no second date.
# Each value is [pct, date, date-or-None]; presumably start/end dates -- confirm against the script.
# NOTE: `label_mods` is reassigned further down in this cell before it is serialized.
label_mods = {
    'No Finding': [0.1, '2014-04-01', None],
    'Pneumonia': [0.045, '2014-04-01', None],
    'Opacity': [0.191, '2014-04-01', None],
    'Pleural Abnormalities': [0.128, '2014-04-01', None],
    'Cardiomegaly': [0.105, '2014-04-01', None],
    'Pleural Effusion': [0.075, '2014-04-01', None],
    'Lesion': [0.031, '2014-04-01', None],
    'Atelectasis': [0.084, '2014-04-01', None],
    'Consolidation': [0.018, '2014-04-01', None],
    # 'Edema': [0.001, '2014-04-01', None],
}

# Accumulate the raw shares in dictionary order.
p = 0
for spec in label_mods.values():
    p += spec[0]

# Rescale each share in place so that the shares add up to 1 (up to float rounding).
for spec in label_mods.values():
    spec[0] = spec[0] / p



# Groupings of labels into triples. `lcx` is assigned twice; the second assignment is the one used.
lcx = [
    ['Pneumonia', 'Opacity', 'Pleural Abnormalities'],
    ['Cardiomegaly', 'Pleural Effusion', 'Lesion'],
    ['Atelectasis', 'Consolidation', 'Edema'],
]
# lcx = [['Consolidation', 'Pleural Abnormalities', 'Opacity'],
#  ['Pneumonia', 'Atelectasis', 'Pleural Effusion'],
#  ['Edema', 'Lesion', 'Cardiomegaly']]

lcx = [
    ['Opacity', 'Pleural Abnormalities', 'Cardiomegaly'],
    ['Atelectasis', 'Pleural Effusion', 'Pneumonia'],
    ['Lesion', 'Consolidation', 'Edema'],
]
i = 0

labels = lcx[i]
pct = .50

# One consecutive date range per label of the selected triple; the last one has no second date.
periods = [
    ("2014-04-01", "2014-06-30"),
    ("2014-07-01", "2014-09-30"),
    ("2014-10-01", None),
]
label_mods = {label: [pct, start, end] for label, (start, end) in zip(labels, periods)}
# 'No Finding' receives the complementary share from the first date onward.
label_mods['No Finding'] = [1 - pct, "2014-04-01", None]


# Groupings of labels into pairs; index `i` picks the pair used for this scenario.
lcx_pairs = [
    ['Opacity', 'Pleural Effusion'],
    ['Pleural Abnormalities', 'Pneumonia'],
    ['Cardiomegaly', 'Consolidation'],
    ['Atelectasis', 'Lesion'],
]
i = 3
labels = lcx_pairs[i]
pct = .75

# Date boundaries: the first label uses A..B0, the second uses B1 with no second date.
A = "2014-04-01"
B0 = "2014-08-15"
B1 = "2014-08-16"

# 'No Finding' receives the complementary share starting at A.
label_mods = {
    labels[0]: [pct, A, B0],
    labels[1]: [pct, B1, None],
    'No Finding': [1 - pct, A, None],
}


# The cluster is chosen from the scenario index `i` set earlier in this cell.
compute_target = "cpu-cluster" if i < 1 else "cpu-cluster2"
window = "30D"
args_update = {"window": window, "mod_end_date": "2014-12-31"}
# args_update["randomize_start_date"] = "2014-06-01"
# The label-modifier specification is passed along as a JSON string.
args_update['label_modifiers'] = json.dumps(label_mods)

args.update({"classifier_dataset": "padchest-finetuned-chx-frontalonly"})
args.update(args_update)

# Only applies when a 'peds_weight' argument has been added to `args`.
if 'peds_weight' in args:
    args['include_metadata'] = 0
    args['peds_start_date'] = "2014-06-01"

# A 14-day window uses a lower threshold. The key has to be "min_periods", matching the
# base arguments; writing to "min_period" would add a separate key and leave the
# 150 default in place.
if args['window'] == '14D':
    args['min_periods'] = 75

# Echo the final argument set before submission.
for name, value in args.items():
    print(name, value)

# Turn the argument dictionary into the list form handed to the script run.
script_arguments = helpers.argsdict2list(args)
config = ScriptRunConfig(
    source_directory=str(project_dir),
    script="scripts/drift/generate-drift-csv-label.py",
    arguments=script_arguments,
)

# Attach the run configuration prepared earlier, pointed at the selected compute target.
run_config.target = compute_target
config.run_config = run_config

# Submit the job and render links to the experiment and the run.
run = exp.submit(config)
run_summary = f"""
- Experiment: [{run.experiment.name}]({run.experiment.get_portal_url()})
- Run: [{run.display_name}]({run.get_portal_url()})
- Target: {config.run_config.target}
"""
display(Markdown(run_summary))

In [None]:
# Number of daily timestamps (both endpoints included) in each of the two date ranges.
# pandas is imported once in the notebook's import cell rather than here.
first_period_days = len(pd.date_range("2014-04-01", "2014-08-15", freq="D"))
second_period_days = len(pd.date_range("2014-08-16", "2014-12-31", freq="D"))
first_period_days, second_period_days

In [None]:
# A bare `raise` with no active exception raises RuntimeError.
# NOTE(review): this looks like a deliberate stop so that "Run All" does not continue
# into the cells below -- confirm before removing.
raise

In [None]:
# Proportional scenario: each value is [pct, date, date-or-None].
label_mods = {
    'No Finding': [0.1, '2014-04-01', None],
    'Pneumonia': [0.045, '2014-04-01', None],
    'Opacity': [0.191, '2014-04-01', None],
    'Pleural Abnormalities': [0.128, '2014-04-01', None],
    'Cardiomegaly': [0.105, '2014-04-01', None],
    'Pleural Effusion': [0.075, '2014-04-01', None],
    'Lesion': [0.031, '2014-04-01', None],
    'Atelectasis': [0.084, '2014-04-01', None],
    'Consolidation': [0.018, '2014-04-01', None],
    # 'Edema': [0.001, '2014-04-01', None],
}

# Sum the raw shares in dictionary order.
p = 0
for spec in label_mods.values():
    p += spec[0]

# Divide every share by the total, in place, so the shares add up to 1 (up to float rounding).
for spec in label_mods.values():
    spec[0] = spec[0] / p

# Show the normalized mapping as the cell output.
label_mods

In [None]:
ws = Workspace.from_config(settings.AZUREML_CONFIG)

# NOTE: this rebinds `experiment_name` and `exp` to a different experiment than the one
# used by the submission cell above.
experiment_name = 'generate-drift-csv-3'
exp = Experiment(workspace=ws, name=experiment_name)

# Build one record per run that neither failed nor was canceled. The records are gathered
# in a list and converted once, so `df` is only ever bound to the final DataFrame and the
# job's `args` dictionary from the submission cell is left untouched.
run_records = []
for run in exp.get_runs():
    if run.status in ['Failed', "Canceled"]:
        continue

    # The run's arguments come back as a flat list. This pairs even-position entries with
    # the following odd-position entries and strips the first two characters of each key
    # (assumes strictly alternating "--key", "value" entries -- TODO confirm).
    run_args = run.get_details()['runDefinition']['arguments']
    flags, values = run_args[::2], run_args[1::2]
    record = {flag[2:]: value for flag, value in zip(flags, values)}

    # Tags are merged on top of the arguments; the fixed fields below override both.
    record.update(run.tags)
    record['id'] = run.id
    record['display_name'] = run.display_name
    record['url'] = run.get_portal_url()
    record['run'] = run
    record['Status'] = run.status
    run_records.append(record)

df = pd.DataFrame(run_records).set_index(['display_name'])

In [None]:
def is_arg_col(c):
    """Return True when column name ``c`` should be kept as an argument column.

    A name is rejected when it contains any of the substrings "mlflow", "_aml",
    "run_" or "url", or when it is exactly one of "output_dir", "input_dir",
    "run", "display_name" or "id". Everything else is accepted.
    """
    excluded_fragments = ("mlflow", "_aml", 'run_', 'url')
    if any(fragment in c for fragment in excluded_fragments):
        return False

    excluded_names = ("output_dir", "input_dir", 'run', 'display_name', 'id')
    return c not in excluded_names
# Columns of `df` accepted by is_arg_col.
arg_cols = [col for col in df.columns if is_arg_col(col)]
# arg_cols = [c for c in arg_cols if df[c].fillna('NA').nunique() > 1]

# Keep only the runs whose classifier_dataset is the fine-tuned frontal-only one.
arg_df = df[arg_cols].query("classifier_dataset == 'padchest-finetuned-chx-frontalonly'").copy()

# Drop runs whose frontal_remove_date contains '2013'.
# NOTE(review): on an object-dtype column, missing values come back from str.contains as NaN,
# and NaN casts to True under astype(bool), so rows without a frontal_remove_date are dropped
# as well -- confirm this is intended.
mentions_2013 = arg_df['frontal_remove_date'].str.contains('2013').astype(bool)
arg_df = arg_df[~mentions_2013]
# arg_df = arg_df[~arg_df.duplicated(keep='first')]
arg_df

In [None]:
# Columns used both for sorting and for the final display.
s = ["window", "nonfrontal_add_date", "frontal_remove_date", 'Status']

# Columns with more than one distinct value (missing values counted as 'NA'), plus those in `s`.
shown_cols = [c for c in arg_df if (arg_df[c].fillna('NA').nunique() > 1) or c in s]

# Sort descending on `s`; the trailing selection means only the `s` columns are displayed.
arg_df[shown_cols].sort_values(by=s, ascending=False)[s]