### Imports

In [1]:
import azureml.core
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

from azureml.train.estimator import Estimator
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveRunConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, uniform
from azureml.widgets import RunDetails

### Prepare AML Workspace

In [2]:
# Load the workspace from a config.json file located by Workspace.from_config();
# the output recorded below shows it was picked up from the aml_config folder.
ws = Workspace.from_config()

Found the config file in: C:\Users\makayser\Desktop\git\mxk_retinanet\notebook\aml_config\config.json


In [3]:
# Print basic workspace facts as one tab-separated line.
# NOTE(review): ws.location is printed twice; the statement is left unchanged so
# that the output recorded with this notebook stays valid.
print(ws.name, ws.location, ws.resource_group, ws.location, sep = '\t')

# Experiment that all runs submitted from this notebook are grouped under.
experiment_name = 'mxk-hyper3'
# Relative path: it is resolved against the working directory in effect when it is used.
script_folder = './'
exp = Experiment(workspace=ws, name=experiment_name)

compute_name = "gpucluster"

# Fetch the workspace's compute targets once and fail with an actionable message
# when the expected cluster is missing (a plain dictionary lookup would only
# report the bare key).
available_targets = ws.compute_targets
if compute_name not in available_targets:
    raise KeyError(
        "Compute target '{}' was not found in workspace '{}'. Available targets: {}".format(
            compute_name, ws.name, sorted(available_targets)
        )
    )

compute_target = available_targets[compute_name]
if isinstance(compute_target, AmlCompute):
    print('found compute target. just use it. ' + compute_name)
else:
    # Do not abort: another compute type may be intentional, but make it visible.
    print('warning: compute target ' + compute_name + ' is of type '
          + type(compute_target).__name__ + ', not AmlCompute')

# Default datastore of the workspace; used later to reference the training data.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

mak-ml	westeurope	makshared	westeurope
found compute target. just use it. gpucluster
AzureBlob makml9496683038 azureml-blobstore-43aa3424-3674-489b-808b-1e49daacf13c


In [None]:
# Optional step, disabled by default: uncomment to upload the local ./bin folder
# to the 'mxk-train' path of the datastore `ds`, overwriting existing files.
# ds.upload(src_dir='./bin', target_path='mxk-train', overwrite=True, show_progress=True)

### Prepare Training

In [4]:
# Switch the working directory to the sibling 'code' folder (presumably so that
# the relative script_folder './' points at the training code -- TODO confirm).
# The directory this cell was first executed from is remembered, so re-running
# the cell always lands in the same place instead of moving relative to
# wherever the previous execution left the kernel.
if '_notebook_start_dir' not in globals():
    _notebook_start_dir = os.getcwd()
os.chdir(os.path.join(_notebook_start_dir, '..', 'code'))

In [5]:
# Enable IPython's autoreload extension in mode 2 so that edited modules are
# re-imported automatically before code is executed.
%load_ext autoreload
%autoreload 2

In [6]:
# Arguments handed to the estimator's entry script, keyed by command-line flag.
# Entries with an empty-string value carry no argument value
# (presumably they are passed as bare flags -- TODO confirm against train.py).
script_params = {
    # Data location: the 'mxk' path on the datastore, requested as a mount.
    '--data-dir': ds.path('mxk').as_mount(),
    '--epochs': 10,
    '--weights': 'model/resnet50_coco_best_v2.1.0.h5',
    '--backbone': 'resnet50',
    '--annotations': 'train_set_v2_retina.csv',
    '--classes': 'classes.csv',
    '--val-annotations': 'test_set_v2_retina.csv',
    '--no-snapshots': '',
    '--batch-size': 4,
    '--score-threshold': 0.3,
    '--random-transform': '',
}

In [None]:
# Display the argument dictionary as a sanity check before it is passed to the estimator.
script_params

In [None]:
# Packages requested through pip for the training environment.
pip_dependencies = [
    "tensorflow-gpu==1.12",
    "keras",
    "h5py",
    "progressbar2",
    "scikit-image",
    "imageio",
    "Shapely",
    "imgaug",
    "opencv-python-headless",
    "Cython",
]

# Packages requested through conda for the training environment.
conda_dependencies = [
    'opencv',
    'h5py',
    'mesa-libgl-cos6-x86_64',
    'pillow',
    'gcc',
    'libgcc',
]

# Estimator describing a single training job: which folder to ship, which script
# to start, on which compute target, and with which arguments and packages.
est = Estimator(
    source_directory=script_folder,
    compute_target=compute_target,
    script_params=script_params,
    entry_script="train.py",
    pip_packages=pip_dependencies,
    conda_packages=conda_dependencies,
    use_gpu=True,
)

### Hyperparameters

In [None]:
# Run budget of the sweep; both values are passed to HyperDriveRunConfig below.
max_total_runs = 30
max_concurrent_runs = 8

In [None]:
# Search space of the sweep, keyed by the command-line flag each value is
# supplied under: `uniform` gives a continuous range, `choice` a discrete set.
search_space = {
    "--lr": uniform(1e-6, 1e-04),
    "--fl-gamma": choice(1, 1.25, 1.5, 1.75, 2, 2.25),
    "--fl-alpha": choice(0.25, 0.5, 0.75, 1),
    "--neg-overlap": choice(0.4, 0.5, 0.6),
    "--pos-overlap": choice(0.5, 0.6, 0.7),
    "--fpn-layers": choice(4, 5),
}

# Draw hyperparameter combinations at random from the search space.
param_sampling = RandomParameterSampling(search_space)

In [None]:
# Earlier search space, kept for reference (disabled):
# param_sampling = RandomParameterSampling( {
#         "--lr": uniform(1e-6, 1e-04),
#         "--batch-size": choice(1, 2, 4, 8),
#         "--fl-gamma": choice(0.75, 1, 1.25, 1.5, 1.75, 2, 2.25, 2.5),
#         "--fl-alpha": choice(0.25, 0.5, 0.75, 1)
#     }
# )

# Early-termination policy used by the sweep.
# slack_factor: slack allowed with respect to the best performing training run.
early_termination_policy = BanditPolicy(
    slack_factor=0.2,
    evaluation_interval=1,
    delay_evaluation=2,
)

In [None]:
# Sweep configuration: combines the estimator, the sampled search space, the
# early-termination policy and the run budget, and names the metric to
# maximize ("EAD_Score").
hdc = HyperDriveRunConfig(
    estimator=est,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="EAD_Score",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=max_total_runs,
    max_concurrent_runs=max_concurrent_runs,
)

### Submit Training Run

In [None]:
# Alternative, disabled by default: submit the plain estimator `est` as a
# single run instead of the hyperparameter sweep.
# #Run Single
# run = exp.submit(est)

In [None]:
# Run Hyperparams
# Submit the sweep configuration `hdc` to the experiment; `run` is the handle
# the monitoring cells below operate on.
run = exp.submit(config=hdc)

In [None]:
# Show the Azure ML notebook widget for the submitted run.
RunDetails(run).show()

In [None]:
# Display the run object's notebook representation.
run

In [None]:
# Wait for the run to complete; show_output=True requests its output in the notebook.
run.wait_for_completion(show_output=True)

### Cancel Training

In [None]:
# Disabled by default: uncomment to cancel either a specific run looked up by
# its id, or the run currently held in `run`.
# from azureml.core import Run
# Run(exp, run_id='mxk_1552252494771_12').cancel()
# run.cancel()

In [7]:
from azureml.core import Run

# Re-attach to an already submitted run by its id and show its widget.
# Note that this replaces whatever `run` referred to before.
previous_run_id = 'mxk-hyper3_1552597605207_1'
run = Run(exp, run_id=previous_run_id)
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [None]:
%matplotlib inline
import numpy as np
import os
import matplotlib.pyplot as plt

In [None]:
# Show the details reported for the current run.
run.get_details()

In [None]:
# Show the metrics logged for the current run.
run.get_metrics()


In [None]:
# Plot EAD_Score and mAP over epochs, one figure per entry of the metrics
# dictionary that logged enough EAD_Score values.
# Assumes `metrics` maps an outer key (presumably a child-run id -- TODO confirm)
# to a dictionary of {metric name: logged values}.
metrics = run.get_metrics()

# Entries need more than this many EAD_Score values to be plotted.
min_points = 5


def _as_series(values):
    """Return logged metric values as a list.

    A missing metric (None) becomes an empty list and a single value that is
    not already a list/tuple becomes a one-element list, so callers can always
    use len() and plot the result.
    """
    if values is None:
        return []
    if isinstance(values, (list, tuple)):
        return list(values)
    return [values]


for run_key, run_metrics in metrics.items():
    ead_scores = _as_series(run_metrics.get('EAD_Score'))
    if len(ead_scores) <= min_points:
        # Too few (or no) data points for a meaningful curve.
        continue
    map_scores = _as_series(run_metrics.get('mAP'))

    # Create the figure inside the loop: plt.show() ends the current figure, so
    # a single figure created up front would only size the first plot and would
    # be left empty when no entry qualifies.
    plt.figure(figsize = (13,5))
    plt.plot(ead_scores, 'r-', lw=4, alpha=.6, label='EAD_Score')
    if map_scores:
        plt.plot(map_scores, 'b--', alpha=0.5, label='mAP')
    plt.legend()
    plt.xlabel('epochs', fontsize=14)
    plt.ylabel('accuracy', fontsize=14)
    plt.title('EAD_Score and mAP over Epochs', fontsize=16)
    # run.log_image(name='acc_over_epochs.png', plot=plt)
    plt.show()