## 0. Setup

In [1]:
import azureml.core
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

In [2]:
# Load the workspace from the config.json that Workspace.from_config() discovers
# (the recorded run found it under ./aml_config).
ws = Workspace.from_config()
# NOTE(review): ws.location is printed twice; left unchanged so the recorded output still matches.
print(ws.name, ws.location, ws.resource_group, ws.location, sep = '\t')

# Experiment under which all runs of this notebook are grouped.
experiment_name = 'mxk-train'
script_folder = './'
exp = Experiment(workspace=ws, name=experiment_name)

# Name of the training cluster, plus the scale settings used if it has to be created.
compute_name = "gpucluster"
compute_min_nodes = 0
compute_max_nodes = 4
vm_size = "STANDARD_NC6"

if compute_name in ws.compute_targets:
    # Reuse the existing cluster instead of provisioning a new one.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
else:
    # The cluster does not exist yet: provision it with the settings above rather
    # than failing with a KeyError on the lookup.
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        min_nodes=compute_min_nodes,
        max_nodes=compute_max_nodes,
    )
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    # Block until provisioning has finished so later cells can submit to the cluster.
    compute_target.wait_for_completion(show_output=True)

# Default datastore of the workspace; it is used below for the training data.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

Found the config file in: C:\Users\makayser\Desktop\git\mxk_retinanet\notebook\aml_config\config.json
mak-ml	westeurope	makshared	westeurope
found compute target. just use it. gpucluster
AzureBlob makml9496683038 azureml-blobstore-43aa3424-3674-489b-808b-1e49daacf13c


In [49]:
# One-off upload of the local ./bin folder to the default datastore under 'mxk-train'.
# NOTE(review): the recorded output shows __pycache__/*.pyc files were uploaded as well;
# presumably this is commented out to avoid re-uploading on every run — confirm before re-enabling.
# ds.upload(src_dir='./bin', target_path='mxk-train', overwrite=True, show_progress=True)

Uploading ./bin\__init__.py
Uploading ./bin\__pycache__\__init__.cpython-36.pyc
Uploading ./bin\__pycache__\train.cpython-36.pyc
Uploading ./bin\convert_model.py
Uploading ./bin\debug.py
Uploading ./bin\evaluate.py
Uploaded ./bin\evaluate.py, 1 files out of an estimated total of 6
Uploaded ./bin\debug.py, 2 files out of an estimated total of 6
Uploaded ./bin\__pycache__\__init__.cpython-36.pyc, 3 files out of an estimated total of 6
Uploaded ./bin\convert_model.py, 4 files out of an estimated total of 6
Uploaded ./bin\__init__.py, 5 files out of an estimated total of 6
Uploaded ./bin\__pycache__\train.cpython-36.pyc, 6 files out of an estimated total of 6


$AZUREML_DATAREFERENCE_912b6e41e21b484084a14416f983d2ac

## Deploy Training

In [3]:
import os

# Switch the working directory to the sibling code folder.
# Presumably this makes relative paths used afterwards (e.g. script_folder = './')
# resolve against that folder — verify against the repository layout.
code_dir = '../code'
os.chdir(code_dir)

In [5]:
# Enable IPython's autoreload extension in mode 2: imported modules are reloaded
# before each cell executes, so edits to local .py files are picked up without a kernel restart.
%load_ext autoreload
%autoreload 2

In [67]:
# Reference to the 'mxk' folder of the datastore, requested as a mount.
data_mount = ds.path('mxk').as_mount()

# Command-line arguments for the training script, grouped by purpose.
script_params = {
    # where the data lives
    '--data-dir': data_mount,

    # training schedule
    '--epochs': 1,
    '--steps': 2300,

    # loss parameters (presumably focal-loss gamma/alpha — confirm in train.py)
    '--fl-gamma': 1.5,
    '--fl-alpha': 0.25,

    # model initialisation
    '--weights': 'model/resnet50_coco_best_v2.1.0.h5',
    '--backbone': 'resnet50',

    # dataset description files
    '--annotations': 'train_set_v2_retina.csv',
    '--classes': 'classes.csv',
}

In [68]:
# Echo the assembled arguments as a visual check before they are handed to the estimator.
script_params

{'--data-dir': $AZUREML_DATAREFERENCE_8ab86004993e4bef9a669d5ce9e0959c,
 '--epochs': 1,
 '--steps': 2300,
 '--fl-gamma': 1.5,
 '--fl-alpha': 0.25,
 '--weights': 'model/resnet50_coco_best_v2.1.0.h5',
 '--backbone': 'resnet50',
 '--annotations': 'train_set_v2_retina.csv',
 '--classes': 'classes.csv'}

In [69]:
from azureml.core import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_GPU_IMAGE
from azureml.train.estimator import Estimator

# --- Conda environment for the remote run ---------------------------------
conda_channels = ['conda-forge', 'anaconda']
conda_packages = [
    'opencv=3.4.2', 'tensorflow-gpu', 'h5py', 'mesa-libgl-cos6-x86_64',
    'pillow', 'six', 'progressbar2', 'keras',
]
pip_packages = ['opencv-python-headless']

cd = CondaDependencies()
for channel in conda_channels:
    cd.add_channel(channel)
for conda_pkg in conda_packages:
    cd.add_conda_package(conda_pkg)
for pip_pkg in pip_packages:
    cd.add_pip_package(pip_pkg)

# Print the resulting environment file so the dependency set can be reviewed.
print(cd.serialize_to_string())

# --- Run configuration: Docker with GPU support on the default GPU image ---
rc = RunConfiguration()
rc.environment.python.conda_dependencies = cd
rc.environment.docker.enabled = True
rc.environment.docker.gpu_support = True
rc.environment.docker.base_image = DEFAULT_GPU_IMAGE

# --- Estimator: runs train.py from script_folder on the compute target -----
est = Estimator(
    source_directory=script_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='train.py',
    environment_definition=rc.environment,
)


# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
    # Required packages for AzureML execution, history, and data preparation.
  - azureml-defaults
  - opencv-python-headless
- opencv=3.4.2
- tensorflow-gpu
- h5py
- mesa-libgl-cos6-x86_64
- pillow
- six
- progressbar2
- keras
channels:
- conda-forge
- anaconda



In [None]:
# `uniform` and `choice` are the HyperDrive search-space expressions; they must be
# imported explicitly, otherwise this cell fails with NameError on a fresh kernel.
from azureml.train.hyperdrive import RandomParameterSampling, choice, uniform

# Random search space for the sweep: each child run draws one value per argument.
# NOTE(review): '--fl-gamma' and '--fl-alpha' are also set to fixed values in
# script_params — confirm which value the training script ends up receiving.
param_sampling = RandomParameterSampling({
    "--lr": uniform(1e-6, 1e-04),                              # continuous range
    "--fl-gamma": choice(0.75, 1, 1.25, 1.5, 1.75, 2, 2.25),   # discrete options
    "--fl-alpha": choice(0.25, 0.5, 0.75, 1),
    "--neg-overlap": choice(0.4, 0.5, 0.6),
    "--pos-overlap": choice(0.5, 0.6, 0.7),
    "--fpn-layers": choice(4, 5),
})

In [None]:
from azureml.train.hyperdrive import BanditPolicy

# Early termination for the sweep, based on the slack allowed with respect to
# the best performing training run.
early_termination_policy = BanditPolicy(
    slack_factor=0.25,
    evaluation_interval=1,
    delay_evaluation=2,
)

In [None]:
# Budget for the HyperDrive sweep: total number of child runs and how many may run at once.
# No trailing comma after the value: `max_total_runs = 30,` binds the tuple (30,), not the int 30.
max_total_runs = 30
max_concurrent_runs = 4

In [None]:
# Log a value under the name "EAD_Score" on the current run context; the same name is
# used as primary_metric_name in the HyperDrive configuration of this notebook.
# NOTE(review): EAD_Score is not defined in any visible notebook cell, so unless it is
# defined outside this view the cell raises NameError here. Presumably this snippet
# belongs in the training script where the score is computed — confirm and move it.
from azureml.core.run import Run
run_logger = Run.get_context()
run_logger.log("EAD_Score", float(EAD_Score))

In [None]:
# PrimaryMetricGoal must be imported alongside HyperDriveRunConfig; without it the
# reference below fails with NameError.
from azureml.train.hyperdrive import HyperDriveRunConfig, PrimaryMetricGoal

# Combine estimator, search space, early-termination policy and run budget into the
# sweep configuration. The sweep maximises the metric logged as "EAD_Score".
hyperdrive_run_config = HyperDriveRunConfig(
    estimator=est,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name="EAD_Score",
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=max_total_runs,
    max_concurrent_runs=max_concurrent_runs,
)

In [None]:
# Submit the HyperDrive sweep. Keep a dedicated handle for it: `run` is rebound by the
# single-run submission later in the notebook, which would otherwise drop the only
# reference to the sweep. `run` is still set so cells that monitor `run` keep working.
hyperdrive_run = exp.submit(config=hyperdrive_run_config)
run = hyperdrive_run

In [70]:
# Submit a single training run from the estimator (fixed script_params, no sweep).
run = exp.submit(config=est)

In [71]:
from azureml.widgets import RunDetails

# Show the monitoring widget for the run submitted above.
run_widget = RunDetails(run)
run_widget.show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [72]:
# Display the run's summary (experiment, id, type, status and portal links).
run

Experiment,Id,Type,Status,Details Page,Docs Page
mxk-train,mxk-train_1552173280_6ff3a97a,azureml.scriptrun,Queued,Link to Azure Portal,Link to Documentation


In [None]:
# Wait for the run to complete; show_output=True streams the run's log files into the cell output.
# NOTE(review): the streamed training progress log is very long — consider clearing this
# cell's output before sharing or committing the notebook.
run.wait_for_completion(show_output=True)

RunId: mxk-train_1552173280_6ff3a97a

Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt

Streaming azureml-logs/80_driver_log.txt

Using TensorFlow backend.
2019-03-09 23:21:24.107271: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
Creating model, this may take a second...
  weight_values[i].shape))
  weight_values[i].shape))
Epoch 1/1

   1/2300 [..............................] - ETA: 8:29:36 - loss: 3.6002 - regression_loss: 2.4120 - classification_loss: 1.1881
   2/2300 [..............................] - ETA: 6:36:31 - loss: 3.6698 - regression_loss: 2.4694 - classification_loss: 1.2004
   3/2300 [..............................] - ETA: 6:27:30 - loss: 3.5641 - regression_loss: 2.3576 - classification_loss: 1.2065
   4/2300 [..............................] - ETA: 6:05:38 - loss: 3.6175 - regression_loss: 2.4008 - classification_loss: 1.2167
  

  57/2300 [..............................] - ETA: 4:24:27 - loss: 3.4542 - regression_loss: 2.2025 - classification_loss: 1.2517
  58/2300 [..............................] - ETA: 4:24:51 - loss: 3.4378 - regression_loss: 2.1837 - classification_loss: 1.2542
  59/2300 [..............................] - ETA: 4:24:21 - loss: 3.4412 - regression_loss: 2.1875 - classification_loss: 1.2537
  60/2300 [..............................] - ETA: 4:23:49 - loss: 3.4443 - regression_loss: 2.1913 - classification_loss: 1.2530
  61/2300 [..............................] - ETA: 4:24:09 - loss: 3.4436 - regression_loss: 2.1918 - classification_loss: 1.2518
  62/2300 [..............................] - ETA: 4:24:27 - loss: 3.4422 - regression_loss: 2.1914 - classification_loss: 1.2508
  63/2300 [..............................] - ETA: 4:23:55 - loss: 3.4496 - regression_loss: 2.1997 - classification_loss: 1.2499
  64/2300 [..............................] - ETA: 4:24:17 - loss: 3.4489 - regression_loss: 2.200