# Launch Training Jobs on Azure Cluster

In [1]:
import os
from pathlib import Path

from azureml.core import Workspace
from azureml.exceptions import WorkspaceException
from azureml.exceptions import ComputeTargetException
from azureml.core.compute import ComputeTarget
from azureml.core.compute import AmlCompute
from azureml.core import Experiment
from azureml.core import Datastore
from azureml.train.estimator import Estimator
from azureml.tensorboard import Tensorboard
from azureml.train.hyperdrive import HyperDriveConfig
from azureml.train.hyperdrive import GridParameterSampling
from azureml.train.hyperdrive import PrimaryMetricGoal
from azureml.train.hyperdrive import MedianStoppingPolicy
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.train.hyperdrive.parameter_expressions import uniform

## Options

In [2]:
# Notebook-wide switches read by later cells.
USE_GPU = True     # picks the VM size of a newly created cluster and the Estimator's `use_gpu` flag
LIST_VMS = False   # when True, print the VM sizes supported by the workspace

## Create Workspace

In [3]:
# Create the workspace, or attach to it when it already exists.
# `exist_ok=True` makes `Workspace.create` return the existing workspace
# instead of raising, so this cell also works on a fresh checkout that has
# no saved `.azureml/config.json` yet.
try:
    ws = Workspace.create(name='replearn',
                          location='eastus',
                          resource_group='rg1',
                          subscription_id='5fb52191-233d-4b0f-9713-de0e41784e6e',
                          exist_ok=True)
    # Persist the workspace details so the fallback below can reload them.
    ws.write_config()
except WorkspaceException as err:
    # Creation can fail for reasons other than "already exists"
    # (authentication, permissions, quota, ...). Report the real cause
    # instead of assuming, then fall back to the saved configuration.
    print('Workspace.create failed ({}); loading Workspace from saved config.'.format(err))
    ws = Workspace.from_config(Path.cwd() / '.azureml' / 'config.json')

Deploying AppInsights with name replearninsights3b3bc630.
Deployed AppInsights with name replearninsights3b3bc630. Took 16.04 seconds.
Deploying KeyVault with name replearnkeyvault009ea547.
Deploying StorageAccount with name replearnstorage27908898b.
Deployed KeyVault with name replearnkeyvault009ea547. Took 32.69 seconds.
Deployed StorageAccount with name replearnstorage27908898b. Took 118.74 seconds.
Deploying Workspace with name replearn.
Deployed Workspace with name replearn. Took 52.89 seconds.


In [4]:
# Optional: print the VM sizes this workspace supports (toggle with LIST_VMS).
if LIST_VMS:
    supported_sizes = AmlCompute.supported_vmsizes(workspace=ws)
    print(supported_sizes)

## Create Compute Target (Cluster)
A persistent Azure Machine Learning Compute cluster is kept between jobs, so it can be reused across jobs and shared with other users in the workspace.  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets  

In [5]:
# Cluster settings
cluster_name = "NC6-cluster8"
vm_priority = 'dedicated'  # dedicated or lowpriority
max_nodes = 8

# Reuse the cluster when it already exists; otherwise provision a new one.
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # No cluster with this name yet: pick the VM size from the GPU switch.
    vm_size = 'Standard_NC6s_v3' if USE_GPU else 'Standard_DS4_v2'
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        max_nodes=max_nodes,
        vm_priority=vm_priority,
        idle_seconds_before_scaledown=3600,  # 3600 s = one hour
    )
    cluster = ComputeTarget.create(ws, cluster_name, provisioning_config)
else:
    print('Found existing cluster, using it.')

# Block until the cluster is ready, streaming provisioning status.
cluster.wait_for_completion(show_output=True)

Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


## Register Datastore  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data

In [6]:
# Register the 'asgdata' blob container as the 'training_data' Datastore,
# unless a Datastore with that name is already registered in the workspace.
datastores = ws.datastores
if 'training_data' in datastores:
    print('Found existing training_data Datastore, using it.')
else:
    # The storage account key is a secret and must not live in the notebook.
    # Supply it through the AZURE_STORAGE_KEY environment variable instead.
    # NOTE: any key that was ever committed to version control should be
    # rotated in the storage account.
    acct_key = os.environ.get('AZURE_STORAGE_KEY')
    if not acct_key:
        raise RuntimeError(
            "Set the AZURE_STORAGE_KEY environment variable to the access key "
            "of the 'asgdata' storage account before registering the Datastore.")
    Datastore.register_azure_blob_container(workspace=ws,
                                            datastore_name='training_data',
                                            container_name='asgdata',
                                            account_name='asgdata',
                                            account_key=acct_key,
                                            create_if_not_exists=False)
# Look the Datastore up again so `ds` is set on both paths.
ds = ws.datastores['training_data']

## Create an Experiment and Run  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-ml-models  

In [7]:
# Locations on the blob container of the training and validation `*.tfr` files.
train_records = ds.path('Data/LibriSpeech/tfrecords/train-clean-100')
val_records = ds.path('Data/LibriSpeech/tfrecords/dev-clean')

# Command-line arguments handed to the entry script.
# `--azure_ml` is passed with an empty value
# (presumably a boolean switch in train.py; confirm against the script).
script_params = {
    '--azure_ml': "",
    '--train_dir': train_records.as_download(),
    '--val_dir': val_records.as_download(),
}

# The training code lives in the sibling `replearn` directory.
source_dir = Path.cwd() / '..' / 'replearn'

tf_est = Estimator(
    source_directory=source_dir,
    entry_script='train.py',
    script_params=script_params,
    compute_target=cluster,
    use_gpu=USE_GPU,
    pip_packages=['tensorflow-gpu'],
)

In [8]:
# Experiment under which the single training run is recorded.
experiment_name = 'replearn'
exp = Experiment(name=experiment_name, workspace=ws)

In [9]:
# Submit the estimator to the experiment and show where to follow the run.
run = exp.submit(tf_est)
portal_url = run.get_portal_url()
print(portal_url)
# Uncomment to block and view the stream of stdout:
# run.wait_for_completion(show_output=True)

https://ml.azure.com/experiments/replearn/runs/replearn_1573856774_2a1ccb7b?wsid=/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourcegroups/rg1/workspaces/replearn
RunId: replearn_1573856774_2a1ccb7b
Web View: https://ml.azure.com/experiments/replearn/runs/replearn_1573856774_2a1ccb7b?wsid=/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourcegroups/rg1/workspaces/replearn

Streaming azureml-logs/20_image_build_log.txt

2019/11/15 22:26:28 Downloading source code...
2019/11/15 22:26:30 Finished downloading source code
2019/11/15 22:26:30 Creating Docker network: acb_default_network, driver: 'bridge'
2019/11/15 22:26:31 Successfully set up Docker network: acb_default_network
2019/11/15 22:26:31 Setting up Docker configuration...
2019/11/15 22:26:31 Successfully set up Docker configuration
2019/11/15 22:26:31 Logging in to registry: replearn00b9615a.azurecr.io
2019/11/15 22:26:33 Successfully logged into replearn00b9615a.azurecr.io
2019/11/15 22:26:33 Executing step ID: acb



2019-11-15 22:51:33.616553: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_4]]
2019-11-15 22:51:33.616656: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
	 [[{{node IteratorGetNext}}]]
Step accuracies:
[0.89257157 0.64039654 0.4945167  0.4055432  0.343077   0.3022729
 0.27097878 0.25064987 0.23527977 0.2246062 ]
4200/4200 - 56s - loss: 1.6499 - val_loss: 1.6974
Epoch 7/25
2019-11-15 22:52:29.069029: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_4]]
2019-11-15 22:52:29.069118: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
	 [[{{node IteratorGetNext}}]]




2019-11-15 23:03:36.112502: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_4]]
2019-11-15 23:03:36.112642: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
	 [[{{node IteratorGetNext}}]]
Step accuracies:
[0.8952477  0.63000524 0.49870777 0.40667132 0.34293273 0.3027837
 0.2721078  0.25037178 0.23582429 0.22535913]
4200/4200 - 56s - loss: 1.6487 - val_loss: 1.6919
Epoch 20/25
2019-11-15 23:04:31.770391: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
	 [[{{node IteratorGetNext}}]]
	 [[IteratorGetNext/_4]]
2019-11-15 23:04:31.770493: W tensorflow/core/common_runtime/base_collective_executor.cc:216] BaseCollectiveExecutor::StartAbort Out of range: End of sequence
	 [[{{node IteratorGetNext}}]]



ClientRequestError: Error occurred in request., ConnectionError: HTTPSConnectionPool(host='eastus.experiments.azureml.net', port=443): Max retries exceeded with url: /history/v1.0/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourceGroups/rg1/providers/Microsoft.MachineLearningServices/workspaces/replearn/experiments/replearn/runs/replearn_1573856774_2a1ccb7b/details (Caused by NewConnectionError('<urllib3.connection.VerifiedHTTPSConnection object at 0x133ae3e48>: Failed to establish a new connection: [Errno 8] nodename nor servname provided, or not known',))

## Launch TensorBoard Server  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-monitor-tensorboard

In [None]:
# Create a TensorBoard instance for the training run submitted above.
tb = Tensorboard([run])
tb.start() # click on the link to open TensorBoard, up and running with the run's training data

In [None]:
# Close the TensorBoard server when done.
tb.stop()

## Hyperparameter Tuning: Grid Search  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters

In [9]:
# Grid search space over the two hyperparameters dim_z and dim_c.
search_space = {
    "dim_z": choice(5, 10, 20),
    "dim_c": choice(5, 10, 20),
}
param_sampling = GridParameterSampling(search_space)

# Early-termination policy for the child runs.
early_stopping = MedianStoppingPolicy(evaluation_interval=5, delay_evaluation=5)

# Minimise `val_loss`
# (assumes train.py logs a metric with that name to the run; confirm).
# One child run per cluster node may execute concurrently.
hyperdrive_run_config = HyperDriveConfig(
    estimator=tf_est,
    hyperparameter_sampling=param_sampling,
    policy=early_stopping,
    primary_metric_name="val_loss",
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=100,
    max_concurrent_runs=max_nodes,
)

In [10]:
# Run the hyperparameter sweep under its own experiment and show its portal link.
experiment = Experiment(workspace=ws, name='hyperparam')
hyperdrive_run = experiment.submit(hyperdrive_run_config)
hyperdrive_url = hyperdrive_run.get_portal_url()
print(hyperdrive_url)

https://ml.azure.com/experiments/hyperparam/runs/hyperparam_1573744387365987?wsid=/subscriptions/5fb52191-233d-4b0f-9713-de0e41784e6e/resourcegroups/rg1/workspaces/replearn
