### Imports

In [2]:
import azureml.core
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveRunConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, uniform
from azureml.widgets import RunDetails

### Prepare AML Workspace

In [None]:
# Load the workspace configuration from the config.json file in the current folder.
# `ws` is the handle every later cell uses to reach the experiment, compute and datastore.
ws = Workspace.from_config()

In [30]:
# Show which workspace was loaded. The original printed ws.location twice;
# the duplicate column has been dropped.
print(ws.name, ws.location, ws.resource_group, sep='\t')

# Experiment that groups every run submitted from this notebook.
experiment_name = 'mxk-train'
# Folder uploaded as the training source; relative to the current working directory.
script_folder = './'
exp = Experiment(workspace=ws, name=experiment_name)

# Compute cluster settings. `compute_name` was previously undefined in the notebook
# (it only worked through leftover kernel state); it can be overridden through the
# environment and defaults to the cluster name seen in earlier runs.
compute_name = os.environ.get('AML_COMPUTE_CLUSTER_NAME', 'gpucluster')
vm_size = "STANDARD_NC6"
max_nodes = 4

# `ws.compute_targets` is a dict keyed by name; use .get() so that a missing cluster
# yields None instead of raising KeyError before the check below can run.
compute_target = ws.compute_targets.get(compute_name)
if compute_target is None:
    # No cluster with that name yet: provision one that scales down to zero nodes when idle.
    print('creating a new compute target... ' + compute_name)
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=0,
                                                                max_nodes=max_nodes)
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    compute_target.wait_for_completion(show_output=True)
elif isinstance(compute_target, AmlCompute):
    print('found compute target. just use it. ' + compute_name)

# Default datastore of the workspace; the training data is read from it later.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

mak-ml	westeurope	makshared	westeurope
found compute target. just use it. gpucluster
AzureBlob makml9496683038 azureml-blobstore-43aa3424-3674-489b-808b-1e49daacf13c


In [4]:
# ds.upload(src_dir='./bin', target_path='mxk-train', overwrite=True, show_progress=True)

### Prepare Training

In [4]:
# Switch the kernel's working directory to the training code folder.
# The path is relative, so running this cell a second time moves one level further;
# run it exactly once per kernel session.
code_dir = '../code'
os.chdir(code_dir)

In [5]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before executing each cell, so local code edits are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [31]:
# Command-line arguments forwarded to the training entry script.
# Built from ordered pairs so the argument order on the command line stays the same.
script_params = dict([
    # Training data: the 'mxk' path of the default datastore, mounted on the compute node.
    ('--data-dir', ds.path('mxk').as_mount()),
    # Training length.
    ('--epochs', 2),
    ('--steps', 20),
    # Focal-loss settings (also sampled by the hyperparameter search further below).
    ('--fl-gamma', 1.5),
    ('--fl-alpha', 0.25),
    # Initial weights and backbone.
    ('--weights', 'model/resnet50_coco_best_v2.1.0.h5'),
    ('--backbone', 'resnet50'),
    # Annotation and class files.
    ('--annotations', 'train_set_v2_retina.csv'),
    ('--classes', 'classes.csv'),
    ('--val-annotations', 'test_set_v2_retina.csv'),
    # Value-less switch: the empty string makes it appear as a bare flag.
    ('--no-snapshots', ''),
])

In [32]:
# Display the resolved arguments (the data-dir entry shows up as a data reference placeholder).
script_params

{'--data-dir': $AZUREML_DATAREFERENCE_30d5512fa6224ca3a6b9cebf102051bd,
 '--epochs': 2,
 '--steps': 20,
 '--fl-gamma': 1.5,
 '--fl-alpha': 0.25,
 '--weights': 'model/resnet50_coco_best_v2.1.0.h5',
 '--backbone': 'resnet50',
 '--annotations': 'train_set_v2_retina.csv',
 '--classes': 'classes.csv',
 '--val-annotations': 'test_set_v2_retina.csv',
 '--no-snapshots': ''}

In [70]:
# Python packages installed with pip inside the remote training environment.
pip_deps = [
    "tensorflow-gpu==1.12",
    "keras",
    "h5py",
    "progressbar2",
    "opencv-python-headless",
    "Cython",
]

# Packages installed with conda inside the remote training environment.
conda_deps = [
    'opencv',
    'h5py',
    'mesa-libgl-cos6-x86_64',
    'pillow',
    'gcc',
    'libgcc',
]

# Estimator describing a single training job: which folder to upload, which script to
# run, with which arguments, on which compute target, and in which environment.
est = Estimator(
    source_directory=script_folder,
    compute_target=compute_target,
    script_params=script_params,
    entry_script="train.py",
    pip_packages=pip_deps,
    conda_packages=conda_deps,
    use_gpu=True,
)


### Hyperparameters

In [10]:
# Budget for the hyperparameter sweep.
max_concurrent_runs = 4   # child runs allowed to execute at the same time
max_total_runs = 20       # upper bound on child runs for the whole sweep

In [11]:
# Search space: each key is a command-line argument of the training script and each
# value describes how candidate values are drawn for it.
search_space = {
    "--lr": uniform(1e-6, 1e-04),
    "--batch-size": choice(1, 2, 4, 8),
    "--fl-gamma": choice(0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5),
    "--fl-alpha": choice(0.25, 0.5, 0.75, 1),
}
param_sampling = RandomParameterSampling(search_space)

# Early termination: slack_factor is the slack allowed with respect to the best
# performing training run; the other two arguments control how often and how soon
# the policy is evaluated.
early_termination_policy = BanditPolicy(
    slack_factor=0.25,
    evaluation_interval=2,
    delay_evaluation=4,
)

In [12]:
# Settings for the HyperDrive sweep: the estimator to run for every sample, how samples
# are drawn, when poor runs are stopped, which logged metric is optimised (and in which
# direction), and the run budget.
hyperdrive_settings = {
    "estimator": est,
    "hyperparameter_sampling": param_sampling,
    "policy": early_termination_policy,
    "primary_metric_name": "EAD_Score",
    "primary_metric_goal": PrimaryMetricGoal.MAXIMIZE,
    "max_total_runs": max_total_runs,
    "max_concurrent_runs": max_concurrent_runs,
}
hdc = HyperDriveRunConfig(**hyperdrive_settings)

### Submit Training Run

In [71]:
# Submit the plain estimator as a single training run of the experiment.
# NOTE(review): the next cell binds `run` as well (HyperDrive submission). Executing both
# cells top to bottom submits two jobs and keeps only the second handle — run just one of them.
run = exp.submit(est)

In [19]:
# Submit the HyperDrive configuration (hyperparameter sweep) to the experiment.
# NOTE(review): this rebinds `run`, replacing the handle from the single-estimator cell above.
run = exp.submit(config=hdc)

In [20]:
# Render the notebook widget that tracks the progress of `run`.
RunDetails(run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [50]:
# Display the run summary (experiment, id, type, status and portal link).
run

Experiment,Id,Type,Status,Details Page,Docs Page
mxk-train,mxk-train_1552317899_54e16f31,azureml.scriptrun,Queued,Link to Azure Portal,Link to Documentation


In [69]:
# Block until the run finishes, streaming its log files into the cell output.
# The returned run details are displayed as the cell result.
run.wait_for_completion(show_output=True)

RunId: mxk-train_1552322678_254415bd

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt
Streaming log file azureml-logs/80_driver_log.txt

Streaming azureml-logs/80_driver_log.txt

Using TensorFlow backend.
In file included from /azureml-envs/azureml_08129cdcc775e572b20ae3bd95421af0/lib/python3.6/site-packages/numpy/core/include/numpy/ndarraytypes.h:1824:0,
                 from /azureml-envs/azureml_08129cdcc775e572b20ae3bd95421af0/lib/python3.6/site-packages/numpy/core/include/numpy/ndarrayobject.h:12,
                 from /azureml-envs/azureml_08129cdcc775e572b20ae3bd95421af0/lib/python3.6/site-packages/numpy/core/include/numpy/arrayobject.h:4,
                 from /root/.pyxbld/temp.linux-x86_64-3.6/pyrex/compute_overlap.c:593:
  ^


The experiment failed. Finalizing run...
Logging experiment finalizing status in history service
Cleaning up all outstanding Run operations, waiting 300.0 seconds
2 items cleaning up...
Cleanup took 0.20233

{'runId': 'mxk-train_1552322678_254415bd',
 'target': 'gpucluster',
 'status': 'Failed',
 'startTimeUtc': '2019-03-11T16:44:58.309049Z',
 'endTimeUtc': '2019-03-11T16:45:38.346401Z',
 'properties': {'azureml.runsource': 'experiment',
  'ContentSnapshotId': '434bbed1-563f-47a5-afe3-67d2b51db649'},
 'runDefinition': {'Script': 'train.py',
  'Arguments': ['--data-dir',
   '$AZUREML_DATAREFERENCE_30d5512fa6224ca3a6b9cebf102051bd',
   '--epochs',
   '2',
   '--steps',
   '20',
   '--fl-gamma',
   '1.5',
   '--fl-alpha',
   '0.25',
   '--weights',
   'model/resnet50_coco_best_v2.1.0.h5',
   '--backbone',
   'resnet50',
   '--annotations',
   'train_set_v2_retina.csv',
   '--classes',
   'classes.csv',
   '--val-annotations',
   'test_set_v2_retina.csv',
   '--no-snapshots'],
  'SourceDirectoryDataStore': None,
  'Framework': 0,
  'Communicator': 0,
  'Target': 'gpucluster',
  'DataReferences': {'30d5512fa6224ca3a6b9cebf102051bd': {'DataStoreName': 'workspaceblobstore',
    'Mode': 'Mount',
 

### Cancel Training

In [29]:
# from azureml.core import Run
# Run(exp, run_id='mxk_1552252494771_12').cancel()
# run.cancel()