### Import Dependencies

In [86]:
import azureml.core
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveRunConfig, PrimaryMetricGoal, GridParameterSampling
from azureml.train.hyperdrive import choice, uniform
from azureml.widgets import RunDetails

### Prepare AML Workspace

In [None]:
# Connect to the Azure ML workspace described by config.json in the current folder.
# `ws` is the handle every later cell uses (experiment, compute target, datastore).
# NOTE(review): this may trigger an interactive login if no cached credentials exist -- confirm.
ws = Workspace.from_config()

In [61]:
# Summarise the workspace we are connected to.
# NOTE(review): ws.location is printed twice; the last field was probably meant
# to be a different attribute. Left unchanged so the recorded output stays valid.
print(ws.name, ws.location, ws.resource_group, ws.location, sep = '\t')

# Experiment under which all runs of this notebook are grouped.
experiment_name = 'mxk-24'
# Source folder later handed to the Estimator. './' is relative, so what it
# points at depends on the kernel's working directory when it is used
# (a later cell changes into '../code').
script_folder = './'
exp = Experiment(workspace=ws, name=experiment_name)

# Name of the existing compute cluster to train on.
compute_name = "gpucluster24"

# Look the cluster up with .get(): subscripting the dict would raise a bare
# KeyError before any check could run, with no hint about what is available.
available_targets = ws.compute_targets
compute_target = available_targets.get(compute_name)
if compute_target is None:
    raise KeyError(
        "compute target '{}' not found in workspace '{}'; available: {}".format(
            compute_name, ws.name, sorted(available_targets)))
if isinstance(compute_target, AmlCompute):
    print('found compute target. just use it. ' + compute_name)
else:
    # Keep going (as the original did) but make the mismatch visible.
    print('WARNING: compute target ' + compute_name + ' is a '
          + type(compute_target).__name__ + ', not an AmlCompute cluster.')

# Default datastore of the workspace; training data is referenced through it.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

mak-ml	westeurope	makshared	westeurope
found compute target. just use it. gpucluster24
AzureBlob makml9496683038 azureml-blobstore-43aa3424-3674-489b-808b-1e49daacf13c


In [4]:
# Optional one-off step, currently disabled: upload the local './bin' folder to
# the 'mxk-train' path on the datastore `ds`. Uncomment to (re-)upload.
# NOTE(review): the training arguments read from ds.path('mxk'), not 'mxk-train'
# -- confirm which datastore path is the intended one.
# ds.upload(src_dir='./bin', target_path='mxk-train', overwrite=True, show_progress=True)

### Deploy Training

In [5]:
# Switch the kernel into the training-code directory so that the relative
# script_folder ('./') used by later cells refers to it.
# A bare os.chdir('../code') is not idempotent: running the cell a second time
# resolves '../code' from inside the code directory. Remember where the kernel
# started the first time this cell runs and always resolve from there.
if '_notebook_start_dir' not in globals():
    _notebook_start_dir = os.getcwd()
os.chdir(os.path.join(_notebook_start_dir, '..', 'code'))

In [6]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# local modules are picked up without restarting the kernel.
# (Comments stay on their own lines; text after a line magic would be read as its argument.)
%load_ext autoreload
%autoreload 2

In [80]:
# Arguments for the training entry script, keyed by command-line flag name.
script_params = {
    # Input data: the 'mxk' path on the datastore `ds`, requested as a mount.
    '--data-dir': ds.path('mxk').as_mount(),

    # Training length. Disabled alternative kept for reference: '--steps': 1800
    '--epochs': 6,

    # presumably focal-loss gamma/alpha -- confirm against train.py
    '--fl-gamma': 1.5,
    '--fl-alpha': 0.25,

    # Initial weights and backbone architecture.
    '--weights': 'model/resnet50_coco_best_v2.1.0.h5',
    '--backbone': 'resnet50',

    # Annotation / class files (plain file names; presumably resolved by the
    # entry script relative to --data-dir -- verify).
    '--annotations': 'train_set_v2_retina.csv',
    '--classes': 'classes.csv',
    '--val-annotations': 'test_set_v2_retina.csv',

    # Flags given an empty-string value (presumably value-less switches).
    '--no-snapshots': '',
    '--multi-gpu': '4',
    '--multi-gpu-force': '',
}

In [81]:
# Echo the argument dict as a sanity check of what will be passed to the entry script.
script_params

{'--data-dir': $AZUREML_DATAREFERENCE_e02db99f35854229bc3709011823cb90,
 '--epochs': 6,
 '--fl-gamma': 1.5,
 '--fl-alpha': 0.25,
 '--weights': 'model/resnet50_coco_best_v2.1.0.h5',
 '--backbone': 'resnet50',
 '--annotations': 'train_set_v2_retina.csv',
 '--classes': 'classes.csv',
 '--val-annotations': 'test_set_v2_retina.csv',
 '--no-snapshots': '',
 '--multi-gpu': '4',
 '--multi-gpu-force': ''}

In [82]:
# Estimator describing one training run: which folder and script to execute,
# with which arguments, on which compute target, and which packages to install.
# NOTE(review): 'h5py' is requested through both pip and conda, OpenCV is
# requested as 'opencv' (conda) and 'opencv-python-headless' (pip), and 'keras'
# is unpinned while tensorflow-gpu is pinned -- confirm this is intentional.
est = Estimator(
    source_directory=script_folder,
    entry_script="train.py",
    script_params=script_params,
    compute_target=compute_target,
    use_gpu=True,
    pip_packages=[
        "tensorflow-gpu==1.12",
        "keras",
        "h5py",
        "progressbar2",
        "opencv-python-headless",
    ],
    conda_packages=[
        'opencv',
        'h5py',
        'mesa-libgl-cos6-x86_64',
        'pillow',
    ],
)

In [84]:
# Sweep budget: total number of child runs and how many may run at once.
# Both are 3, which equals the number of batch-size choices in the search space.
max_total_runs = max_concurrent_runs = 3

In [87]:
# Search space for the sweep: only the batch size is varied.
param_sampling = GridParameterSampling({
    "--batch-size": choice(4, 8, 16),
})
# Disabled dimensions/values kept for reference:
#   "--batch-size": additionally 32
#   "--lr": uniform(1e-6, 1e-04)
#   "--fl-gamma": choice(0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5)
#   "--fl-alpha": choice(0.25, 0.5, 0.75, 1)
# NOTE(review): grid sampling reportedly accepts only `choice` expressions;
# confirm before re-enabling the `uniform` learning-rate range.

# Early termination: slack_factor is the slack allowed with respect to the
# best performing training run; the policy is evaluated every 2 intervals.
# Alternatives noted earlier: evaluation_interval=1, delay_evaluation=4, slack 0.25.
early_termination_policy = BanditPolicy(
    evaluation_interval=2,
    slack_factor=0.15,
)

In [88]:
# HyperDrive configuration: wraps the estimator with the search space, the
# early-termination policy, the metric to optimise and the run budget.
# The entry script is expected to log a metric named "EAD_Score" -- verify.
hdc = HyperDriveRunConfig(
    estimator=est,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    max_total_runs=max_total_runs,
    max_concurrent_runs=max_concurrent_runs,
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    primary_metric_name="EAD_Score",
)

### Submit Training Run

In [89]:
# Submit the HyperDrive configuration to the experiment; `run` is the handle
# used by the following cells to monitor, wait for, or cancel the sweep.
run = exp.submit(config=hdc)

In [90]:
# Show the interactive run-monitoring widget for the submitted run.
RunDetails(run).show()

_HyperDriveWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO'…

In [91]:
# Wait for the submitted run to complete, showing its output in the cell.
# NOTE(review): the recorded output of this cell ends in a ClientRequestError
# caused by a failed connection, and it contains subscription and tenant
# identifiers -- clear the output before sharing the notebook.
run.wait_for_completion(show_output=True)

RunId: mxk-24_1552259521591




ClientRequestError: Error occurred in request., ConnectionError: HTTPSConnectionPool(host='westeurope.experiments.azureml.net', port=443): Max retries exceeded with url: /history/v1.0/subscriptions/50324bce-875f-4a7b-9d3c-0e33679f5d72/resourceGroups/makshared/providers/Microsoft.MachineLearningServices/workspaces/mak-ml/experiments/mxk-24/runs/mxk-24_1552259521591/details (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x0000023B2B4692E8>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed',))

Performing interactive authentication. Please follow the instructions on the terminal.


Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"
You have logged in. Now let us find all the subscriptions to which you have access...
Failed to authenticate '{'additional_properties': {}, 'id': '/tenants/3336d6b0-b132-47ee-a49b-3ab470a5336e', 'tenant_id': '3336d6b0-b132-47ee-a49b-3ab470a5336e'}' due to error 'Get Token request returned http error: 400 and server response: {"error":"invalid_grant","error_description":"AADSTS50057: User account is disabled.\r\nTrace ID: 1477bea7-d2f1-40a9-a361-9cc726256b00\r\nCorrelation ID: fa127c46-0f8e-4733-9b47-89df2c67fae8\r\nTimestamp: 2019-03-11 08:51:39Z","error_codes":[50057],"timestamp":"2019-03-11 08:51:39Z","trace_id":"1477bea7-d2f1-40a9-a361-9cc726256b00","correlation_id":"fa127c46-0f8e-4733-9b47-89df2c67fae8"}'


Interactive authentication successfully completed.


### Cancel Training

In [79]:
# Disabled on purpose so that "Run All" does not cancel the sweep; uncomment to
# request cancellation of the run held in `run`.
# run.cancel()

{'code': 200, 'result': 'Cancellation requested for mxk-24_1552258279772'}