# Launch Training Jobs on Azure Cluster

In [1]:
from azureml.core import Workspace
from azureml.exceptions import WorkspaceException
from azureml.exceptions import ComputeTargetException
from azureml.core.compute import ComputeTarget
from azureml.core.compute import AmlCompute
from azureml.core import Experiment
from azureml.core import Datastore
from azureml.train.estimator import Estimator
from azureml.tensorboard import Tensorboard
from azureml.train.hyperdrive import HyperDriveConfig
from azureml.train.hyperdrive import GridParameterSampling
from azureml.train.hyperdrive import PrimaryMetricGoal
from azureml.train.hyperdrive import MedianStoppingPolicy
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.train.hyperdrive.parameter_expressions import uniform

from pathlib import Path

## Options

In [2]:
LIST_VMS = False
USE_GPU = True

## Create Workspace

In [3]:
try:
    ws = Workspace.create(name='replearn',
                          location='eastus',
                          resource_group='rg1',
                          subscription_id='5fb52191-233d-4b0f-9713-de0e41784e6e')
    ws.write_config()
except WorkspaceException:
    print('Found existing Workspace, using it.')
    ws = Workspace.from_config(Path.cwd() / '.azureml' / 'config.json')

Found existing Workspace, using it.


In [4]:
if LIST_VMS:
    print(AmlCompute.supported_vmsizes(workspace=ws))

## Create Compute Target (Cluster)
A persistent Azure Machine Learning Compute can be reused across jobs. The compute can be shared with other users in the workspace and is kept between jobs.  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets  

In [5]:
# Set cluster name
cluster_name = "NC6-cluster8"
vm_priority = 'dedicated'  # dedicated or lowpriority
max_nodes = 8

# Verify that cluster does not exist already
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, using it.')
except ComputeTargetException:
    if USE_GPU:
        vm_size = 'Standard_NC6s_v3'
    else:
        vm_size = 'Standard_DS4_v2'
    compute_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                           max_nodes=max_nodes,
                                                           vm_priority=vm_priority,
                                                           idle_seconds_before_scaledown=3600)
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)

cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


## Register Datastore  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data

In [6]:
datastores = ws.datastores
if 'training_data' not in datastores.keys():
    acct_key = 'fzVL5O3ybeVQ/eBeMzp5YqnmDUkFwhVJTWPaNezIuxAZWoduY79W7o3l3Zop3FN22txHCXl3UBkdaaM/9C+12Q=='
    Datastore.register_azure_blob_container(workspace=ws, 
                                            datastore_name='training_data', 
                                            container_name='asgdata',
                                            account_name='asgdata', 
                                            account_key=acct_key,
                                            create_if_not_exists=False)
else:
    print('Found existing training_data Datastore, using it.')
ds = ws.datastores['training_data']

Found existing training_data Datastore, using it.


## Create an Experiment and Run  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-ml-models  

In [11]:
# `train_dir` and `val_dir` are path on container to training and val `*.tfr` files
script_params = {
    '--azure_ml': "",
    '--train_dir': ds.path('Data/LibriSpeech/tfrecords/train-clean-100').as_download(),
    '--val_dir': ds.path('Data/LibriSpeech/tfrecords/dev-clean').as_download(),
}

tf_est = Estimator(source_directory=Path.cwd() / '..' / 'replearn',
                   script_params=script_params,
                   compute_target=cluster,
                   use_gpu=USE_GPU,
                   entry_script='train.py',
                   pip_packages=['tensorflow-gpu'])

In [12]:
experiment_name = 'replearn'
exp = Experiment(workspace=ws, name=experiment_name)

In [13]:
run = exp.submit(tf_est)
print(run.get_portal_url())
run.wait_for_completion(show_output=True)  # view stream of stdout

https://ml.azure.com/experiments/replearn/runs/replearn_1573748586_72698b27?wsid=/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourcegroups/rg1/workspaces/replearn
RunId: replearn_1573748586_72698b27
Web View: https://ml.azure.com/experiments/replearn/runs/replearn_1573748586_72698b27?wsid=/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourcegroups/rg1/workspaces/replearn

Streaming azureml-logs/55_azureml-execution-tvmps_d6421dce1d7d649d669e6c4157d926022a0c7de982c9dc77df2f2c3c355e8778_d.txt

2019-11-14T16:23:26Z Starting output-watcher...
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_6b880195e982ace8c060c59daeee4849
Digest: sha256:118d16bb324bbbda549c648bcbba47f265f13b99b24a7a0957b0454e141e5d15
Status: Image is up to date for replearnee29d080.azurecr.io/azureml/azureml_6b880195e982ace8c060c59daeee4849:latest
44173cf736caa961ce2409528b615569a2d6f89688284c4461d16ed89a9fa889
2019/11/14 16:23:29 Version: 3.0.01032.0003 Branch: master Commit:



Step accuracies:
[0.09388269 0.09304325 0.07967433 0.10266265 0.0829584  0.09643158
 0.09755856 0.09827687 0.09222487 0.09889195]
Epoch 1/25
2019-11-14 16:27:26.066102: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_cudnn_gru_with_fallback_61164_61303_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_62451' and '__inference___backward_standard_gru_61698_62255' both implement 'gru_e4ade8a1-b3c1-409e-b44e-1c89c35002ec' but their signatures do not match.
2019-11-14 16:27:26.350845: I tensorflow/core/profiler/lib/profiler_session.cc:184] Profiler session started.
2019-11-14 16:27:26.351956: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcupti.so.10.0
2019-11-14 16:27:26.706364: I tensorflow/core/platform/default/device_tracer.cc:588] Collecting 621 kernel records, 38 




Execution Summary
RunId: replearn_1573748586_72698b27
Web View: https://ml.azure.com/experiments/replearn/runs/replearn_1573748586_72698b27?wsid=/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourcegroups/rg1/workspaces/replearn



{'runId': 'replearn_1573748586_72698b27',
 'target': 'NC6-cluster8',
 'status': 'Completed',
 'startTimeUtc': '2019-11-14T16:23:26.526308Z',
 'endTimeUtc': '2019-11-14T16:51:39.133199Z',
 'properties': {'_azureml.ComputeTargetType': 'batchai',
  'ContentSnapshotId': '64e0b164-0d93-484c-af0c-d24d38cc05ca',
  'azureml.git.repository_uri': 'https://gitlab.analog.com/PCoady/replearn.git',
  'mlflow.source.git.repoURL': 'https://gitlab.analog.com/PCoady/replearn.git',
  'azureml.git.branch': 'master',
  'mlflow.source.git.branch': 'master',
  'azureml.git.commit': 'bc2c93460e967dc3f5d52d8b46da4e181e22f91d',
  'mlflow.source.git.commit': 'bc2c93460e967dc3f5d52d8b46da4e181e22f91d',
  'azureml.git.dirty': 'True',
  'AzureML.DerivedImageName': 'azureml/azureml_6b880195e982ace8c060c59daeee4849',
  'ProcessInfoFile': 'azureml-logs/process_info.json',
  'ProcessStatusFile': 'azureml-logs/process_status.json'},
 'inputDatasets': [],
 'runDefinition': {'script': 'train.py',
  'arguments': ['--azure_

## Launch TensorBoard Server  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-monitor-tensorboard

In [None]:
tb = Tensorboard([run])
tb.start() # click on link and tensboard up-and-running with run training data

In [None]:
# close server when done
tb.stop()

## Hyperparameter Tuning: Grid Search  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters

In [9]:
param_sampling = GridParameterSampling(
    {"dim_z": choice(5, 10, 20),
     "dim_c": choice(5, 10, 20),
})

hyperdrive_run_config = HyperDriveConfig(estimator=tf_est,
                          hyperparameter_sampling=param_sampling, 
                          primary_metric_name="val_loss",
                          policy=MedianStoppingPolicy(evaluation_interval=5, delay_evaluation=5),
                          primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
                          max_total_runs=100,
                          max_concurrent_runs=max_nodes)

In [10]:
experiment = Experiment(ws, 'hyperparam')
hyperdrive_run = experiment.submit(hyperdrive_run_config)
print(hyperdrive_run.get_portal_url())

https://ml.azure.com/experiments/hyperparam/runs/hyperparam_1573744387365987?wsid=/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourcegroups/rg1/workspaces/replearn
