## 0. Setup

In [1]:
import azureml.core
from azureml.core import Workspace
from azureml.core import Experiment
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

In [2]:
# Load the workspace configuration from the config.json file in the current folder.
ws = Workspace.from_config()
print(ws.name, ws.location, ws.resource_group, sep = '\t')

# Experiment under which the training runs are recorded. script_folder is
# relative to the kernel's current working directory.
experiment_name = 'mxk-train'
script_folder = './'
exp = Experiment(workspace=ws, name=experiment_name)

# Cluster settings. The node range and VM size are only used when the
# cluster does not exist yet and has to be provisioned below.
compute_name = "gpucluster"
compute_min_nodes = 0
compute_max_nodes = 4
vm_size = "STANDARD_NC6"

# Look the cluster up with .get() so that a missing cluster is provisioned
# instead of aborting the cell with a KeyError.
compute_target = ws.compute_targets.get(compute_name)
if compute_target is None:
    print('creating a new compute target. ' + compute_name)
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)
    # Block until provisioning has finished so later cells can use the target.
    compute_target.wait_for_completion(show_output=True)
elif type(compute_target) is AmlCompute:
    print('found compute target. just use it. ' + compute_name)
else:
    # The name is taken by a different kind of compute; it is still used as-is
    # (same as before), but make that visible to the reader.
    print('compute target ' + compute_name + ' exists but is not an AmlCompute cluster: '
          + type(compute_target).__name__)

# Default datastore of the workspace; the training inputs are referenced from it.
ds = ws.get_default_datastore()
print(ds.datastore_type, ds.account_name, ds.container_name)

Found the config file in: C:\Users\makayser\Desktop\git\mxk_retinanet\notebook\aml_config\config.json
mak-ml	westeurope	makshared	westeurope
found compute target. just use it. gpucluster
AzureBlob makml9496683038 azureml-blobstore-43aa3424-3674-489b-808b-1e49daacf13c


In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_GPU_IMAGE

# Run configuration for executing a plain Python script on the cluster
# selected earlier in the notebook.
run_amlcompute = RunConfiguration(framework="python")
run_amlcompute.target = compute_target

# Execution environment of the run: a Docker container built on the SDK's
# default GPU base image (DEFAULT_GPU_IMAGE).
aml_env = run_amlcompute.environment
aml_env.docker.enabled = True
aml_env.docker.base_image = DEFAULT_GPU_IMAGE

# Let Azure ML build the conda environment inside the image from the
# dependencies declared here, rather than relying on a user-managed one.
aml_env.python.user_managed_dependencies = False
aml_env.python.conda_dependencies = CondaDependencies.create(conda_packages=['scikit-learn'])

# Auto-prepare the Docker image when used for execution (if it is not already prepared)
run_amlcompute.auto_prepare_environment = True

In [3]:
import os

# Work from the sibling 'code' folder so that relative paths such as
# script_folder ('./') resolve there (presumably where train.py lives -- confirm).
# The path is relative, so only change directory when we are not already in
# 'code'; this keeps the cell safe to re-run without drifting to another folder.
if os.path.basename(os.getcwd()) != 'code':
    os.chdir('../code')

## 2. Run the model

In [4]:
# Disabled: would put ../code on the module search path. The notebook changes
# the working directory to ../code instead (see the os.chdir cell).
# import sys
# sys.path.append('../code')

In [5]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
# Disabled: earlier in-process training runs that call train.main(arg_list)
# directly with local / Google Drive paths. Kept for reference only; the
# notebook submits the training through an Azure ML estimator instead.
# #arg_list = ['--epochs=30', '--steps=1000', '--fl-gamma=1.5', '--fl-alpha=0.25', '--backbone=resnet50', '--snapshot-path=/content/gdrive/My Drive/BA/RetinaNet/final_split/resnet50/run_1', '--random-transform', '--tensorboard-dir=/content/gdrive/My Drive/BA/RetinaNet/final_split/resnet50/run_1', 'csv', '/content/gdrive/My Drive/BA/Data/train_set_v2_retina.csv', '/content/gdrive/My Drive/BA/Data/classes.csv', '--val-annotations=/content/gdrive/My Drive/BA/Data/test_set_v2_retina.csv']
# # arg_list = ['--epochs=10', '--steps=1000', '--fl-gamma=1.5', '--fl-alpha=0.25','--snapshot', 
# #             '../assets/resnet50_csv_09_0.30.h5',  '--backbone=resnet50', 
# #             '--snapshot-path=../assets/', 
# #             '--random-transform', '--tensorboard-dir=../assets/'
# #             , 'csv', '../assets/train_set_v2_retina.csv', 
# #             '../assets/classes.csv']#, '--val-annotations=../assets/test_set_v2_retina.csv']

# arg_list = ['--epochs=5', '--steps=2300', '--fl-gamma=1.5', '--fl-alpha=0.25', '--weights', 
#             '../assets/resnet50_coco_best_v2.1.0.h5', 
#             '--backbone=resnet50', 
#             '--snapshot-path=../assets/run_1', 
#             '--random-transform', '--tensorboard-dir=../assets/run_1', 
#             'csv', '../assets/train_set_v2_retina.csv', 
#             '../assets/classes.csv']
# import train
# train.main(arg_list)

In [38]:
# Inputs and outputs that live in the workspace datastore, each requested as
# a mount on the compute node.
weights_ref = ds.path('mxk/model/resnet50_coco_best_v2.1.0.h5').as_mount()
snapshot_ref = ds.path('mxk/').as_mount()
annotations_ref = ds.path('mxk/train_set_v2_retina.csv').as_mount()
classes_ref = ds.path('mxk/classes.csv').as_mount()

# Command-line arguments for train.py. The dict's insertion order is the
# order in which the arguments are emitted, so keep the entries in this sequence.
# Entries with an empty-string value carry no value of their own
# (presumably bare flags / the 'csv' sub-command -- confirm against train.py).
script_params = {
    '--epochs': 1,
    '--steps': 2300,
    '--fl-gamma': 1.5,
    '--fl-alpha': 0.25,
    '--weights': weights_ref,
    '--backbone': 'resnet50',
    '--snapshot-path': snapshot_ref,
    '--random-transform': '',
    'csv': '',
    '--annotations': annotations_ref,
    '--classes': classes_ref,
}

In [39]:
# Display the assembled arguments for a quick visual check.
# NOTE(review): the saved output below shows '--epochs': 5 while the cell above
# sets 1 -- the output is stale; re-run to refresh it.
script_params

{'--epochs': 5,
 '--steps': 2300,
 '--fl-gamma': 1.5,
 '--fl-alpha': 0.25,
 '--weights': $AZUREML_DATAREFERENCE_3da6649159e14daa8e57d84dfcc8a8c4,
 '--backbone': 'resnet50',
 '--snapshot-path': $AZUREML_DATAREFERENCE_62aca3f1e0ab46bbb1cb35f3b246c320,
 '--random-transform': '',
 'csv': '',
 '--annotations': $AZUREML_DATAREFERENCE_53e4b2b0884b4f7896014f6301633e6a,
 '--classes': $AZUREML_DATAREFERENCE_0c0a5f47a4fd45c49f889018e1c738fd}

In [40]:
from azureml.core import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

from azureml.train.estimator import Estimator

# Conda channels and packages that make up the training environment.
conda_channels = ['conda-forge']
conda_packages = [
    'tensorflow-gpu',
    'h5py',
    'mesa-libgl-cos6-x86_64',
    'opencv',
    'pillow',
    'six',
    'progressbar2',
    'keras',
]

cd = CondaDependencies()
for ch in conda_channels:
    cd.add_channel(ch)
for pkg in conda_packages:
    cd.add_conda_package(pkg)
# Print the resulting conda specification so it can be inspected.
print(cd.serialize_to_string())

# GPU-enabled Docker environment that carries the conda dependencies above.
rc = RunConfiguration()
gpu_env = rc.environment
gpu_env.python.conda_dependencies = cd
gpu_env.docker.enabled = True
gpu_env.docker.gpu_support = True
gpu_env.docker.base_image = azureml.core.runconfig.DEFAULT_GPU_IMAGE

# Estimator that runs train.py from script_folder on the compute target,
# with the script arguments and the environment defined above.
est = Estimator(
    source_directory=script_folder,
    script_params=script_params,
    compute_target=compute_target,
    entry_script='train.py',
    environment_definition=gpu_env,
)


# Conda environment specification. The dependencies defined in this file will
# be automatically provisioned for runs with userManagedDependencies=False.

# Details about the Conda environment file format:
# https://conda.io/docs/user-guide/tasks/manage-environments.html#create-env-file-manually

name: project_environment
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2

- pip:
    # Required packages for AzureML execution, history, and data preparation.
  - azureml-defaults
- tensorflow-gpu
- h5py
- mesa-libgl-cos6-x86_64
- opencv
- pillow
- six
- progressbar2
- keras
channels:
- conda-forge



In [23]:
# Disabled: earlier attempt that used the azureml.train.dnn.TensorFlow
# estimator, together with older drafts of script_params. Kept for reference only.
# from azureml.train.dnn import TensorFlow

# # script_params = {
# #     '--epochs':5, 
# #     '--steps':2300,
# #     '--fl-gamma':1.5,
# #     '--fl-alpha':0.25,
# #     '--weights': ds.path('mxk/model/resnet50_coco_best_v2.1.0.h5').as_mount(),
# #     '--backbone' : 'resnet50',
# #     '--snapshot-path':ds.path('mxk/').as_mount(),
# #     '--random-transform':True,
# #     'csv':True, 
# #     'annotations':ds.path('mxk/train_set_v2_retina.csv').as_mount(),
# #     'classes':ds.path('mxk/classes.csv').as_mount()
# # }


# # '--epochs':5, 
# # '--steps':2300,
# # '--fl-gamma':1.5,
# # '--fl-alpha':0.25,
# # '--weights':'../assets/resnet50_coco_best_v2.1.0.h5', #ds.path('mxk/weights/').as_mount()
# # '--backbone' : 'resnet50',
# # '--snapshot-path':'../assets/run_1',#ds.path('mxk/weights/').as_mount()
# # '--random-transform',
# # 'csv', 
# # '--annotations':'../assets/train_set_v2_retina.csv', #ds.path('mxk/').as_mount()
# # '--classes':'../assets/classes.csv'#ds.path('mxk/').as_mount() 

    
# est = TensorFlow(source_directory=script_folder,
#                  script_params=script_params,
#                  compute_target=compute_target, 
#                  conda_packages=['keras'],#,'h5py','pillow','opencv','progressbar2'],
#                  entry_script='train.py', 
#                  vm_priority='lowpriority',
#                  use_gpu=True)

In [30]:
# Submit the estimator to the experiment and keep the returned run handle.
# NOTE(review): the execution counts show this cell (In [30]) last ran before
# the estimator cell (In [40]), so the saved run below was built from an
# earlier `est` -- restart and run all cells in order to reproduce.
run = exp.submit(est)

In [32]:
# Show the Azure ML notebook widget for the submitted run.
from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [41]:
# Display the run's summary table (experiment, id, type, status, links).
run

Experiment,Id,Type,Status,Details Page,Docs Page
mxk-train,mxk-train_1552153065_fa20c9ec,azureml.scriptrun,Failed,Link to Azure Portal,Link to Documentation


In [34]:
# Block until the run has finished, streaming its log files into the cell output.
run.wait_for_completion(show_output=True)

RunId: mxk-train_1552153065_fa20c9ec

Streaming azureml-logs/20_image_build_log.txt

2019/03/09 17:38:06 Using acb_vol_078b9917-ccdc-46f7-9e3c-feb443f9727a as the home volume
2019/03/09 17:38:06 Creating Docker network: acb_default_network, driver: 'bridge'
2019/03/09 17:38:06 Successfully set up Docker network: acb_default_network
2019/03/09 17:38:06 Setting up Docker configuration...
2019/03/09 17:38:07 Successfully set up Docker configuration
2019/03/09 17:38:07 Logging in to registry: makml4077955604.azurecr.io
2019/03/09 17:38:08 Successfully logged into makml4077955604.azurecr.io
2019/03/09 17:38:08 Executing step ID: acb_step_0. Working directory: '', Network: 'acb_default_network'
2019/03/09 17:38:08 Obtaining source code and scanning for dependencies...
2019/03/09 17:38:09 Successfully obtained source code and scanned for dependencies
2019/03/09 17:38:09 Launching container with name: acb_step_0
Sending build context to Docker daemon  40.96kB

Step 1/13 : FROM mcr.microsoft.co


Streaming azureml-logs/60_control_log.txt

Streaming log file azureml-logs/60_control_log.txt
Streaming log file azureml-logs/80_driver_log.txt

Streaming azureml-logs/80_driver_log.txt

Using TensorFlow backend.


The experiment failed. Finalizing run...
Logging experiment finalizing status in history service
Cleaning up all outstanding Run operations, waiting 300.0 seconds
1 items cleaning up...
Cleanup took 0.10144829750061035 seconds
Traceback (most recent call last):
  File "azureml-setup/context_manager_injector.py", line 152, in <module>
    execute_with_context(cm_objects, options.invocation)
  File "azureml-setup/context_manager_injector.py", line 88, in execute_with_context
    runpy.run_path(sys.argv[0], globals(), run_name="__main__")
  File "/azureml-envs/azureml_8950a5aaadc8a5c69f0df5849b31de2f/lib/python3.6/runpy.py", line 263, in run_path
    pkg_name=pkg_name, script_name=fname)
  File "/azureml-envs/azureml_8950a5aaadc8a5c69f0df5849b31de2f/lib/python3.6/runpy.py", li

{'runId': 'mxk-train_1552153065_fa20c9ec',
 'target': 'gpucluster',
 'status': 'Failed',
 'startTimeUtc': '2019-03-09T18:00:35.064377Z',
 'endTimeUtc': '2019-03-09T18:02:21.763978Z',
 'properties': {'azureml.runsource': 'experiment',
  'ContentSnapshotId': 'ee31ca30-715b-4c42-a62c-4ca2a57a4347'},
 'runDefinition': {'Script': 'train.py',
  'Arguments': ['--epochs',
   '5',
   '--steps',
   '2300',
   '--fl-gamma',
   '1.5',
   '--fl-alpha',
   '0.25',
   '--weights',
   '$AZUREML_DATAREFERENCE_adeba778cb7949e9ac70c690958f2302',
   '--backbone',
   'resnet50',
   '--snapshot-path',
   '$AZUREML_DATAREFERENCE_13b62a02a4ff4f65af8cf09f906e03fc',
   '--random-transform',
   'true',
   'csv',
   'true',
   'annotations',
   '$AZUREML_DATAREFERENCE_eb55e0f8570540578c90b9fdba05dd1f',
   'classes',
   '$AZUREML_DATAREFERENCE_de27857bb2f543c696d464e5959bf9a6'],
  'SourceDirectoryDataStore': None,
  'Framework': 0,
  'Communicator': 0,
  'Target': 'gpucluster',
  'DataReferences': {'adeba778cb7949