# Launch Training Jobs on Azure Cluster

In [None]:
from azureml.core import Workspace
from azureml.exceptions import WorkspaceException
from azureml.exceptions import ComputeTargetException
from azureml.core.compute import ComputeTarget
from azureml.core.compute import AmlCompute
from azureml.core import Experiment
from azureml.core import Datastore
from azureml.train.estimator import Estimator
from azureml.tensorboard import Tensorboard
from azureml.train.hyperdrive import HyperDriveConfig
from azureml.train.hyperdrive import GridParameterSampling
from azureml.train.hyperdrive import PrimaryMetricGoal
from azureml.train.hyperdrive import MedianStoppingPolicy
from azureml.train.hyperdrive.parameter_expressions import choice
from azureml.train.hyperdrive.parameter_expressions import uniform

from pathlib import Path

## Options

In [None]:
# When True, the notebook prints the VM sizes returned by
# AmlCompute.supported_vmsizes() for the workspace.
LIST_VMS = False
# When True, a GPU VM size is chosen for a newly created cluster and
# use_gpu=True is passed to the Estimator; otherwise a CPU VM size is used.
USE_GPU = True

## Create Workspace

In [None]:
# Get a handle to the 'kws' workspace, creating it on first use.
# `exist_ok=True` lets Workspace.create succeed when the workspace already
# exists, so "already exists" no longer has to be inferred from a
# WorkspaceException (which is raised for unrelated failures as well).
try:
    ws = Workspace.create(name='kws',
                          location='eastus',
                          resource_group='rg1',
                          subscription_id='5fb52191-233d-4b0f-9713-de0e41784e6e',
                          exist_ok=True)
    # Save the workspace details locally so they can be loaded with from_config().
    ws.write_config()
except WorkspaceException as err:
    # Report the actual cause instead of assuming the workspace exists, then
    # fall back to the configuration saved by an earlier session.
    print('Workspace.create failed, loading workspace from local config:', err)
    ws = Workspace.from_config(Path.cwd() / '.azureml' / 'config.json')

In [None]:
# Optionally show the VM sizes reported as supported for this workspace.
if LIST_VMS:
    supported_sizes = AmlCompute.supported_vmsizes(workspace=ws)
    print(supported_sizes)

## Create Compute Target (Cluster)
A persistent Azure Machine Learning Compute can be reused across jobs. The compute can be shared with other users in the workspace and is kept between jobs.  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets  

In [None]:
# Cluster settings
cluster_name = "NC6-standard8"
vm_priority = 'dedicated'  # 'dedicated' or 'lowpriority'
max_nodes = 8

# Reuse the cluster if it is already provisioned; otherwise create it.
try:
    cluster = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    # GPU runs use 'Standard_NC6' ('Standard_NC6s_v3' is a possible alternative);
    # CPU runs use 'Standard_DS4_v2'.
    vm_size = 'Standard_NC6' if USE_GPU else 'Standard_DS4_v2'
    compute_config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        max_nodes=max_nodes,
        vm_priority=vm_priority,
        idle_seconds_before_scaledown=3600,
    )
    cluster = ComputeTarget.create(ws, cluster_name, compute_config)
else:
    print('Found existing cluster, using it.')

# Block until the cluster is ready, echoing provisioning output.
cluster.wait_for_completion(show_output=True)

## Register Datastore  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-access-data

In [None]:
# Register the 'training_data' blob datastore on first use and bind it to `ds`.
import os  # local to this cell: only needed for the credential lookup below

datastores = ws.datastores
if 'training_data' not in datastores:
    # SECURITY: the storage account key used to be hard-coded in this cell.
    # Credentials must not live in source control, so it is read from the
    # environment instead. The previously committed key should be treated as
    # leaked and rotated.
    acct_key = os.environ.get('ASGDATA_ACCOUNT_KEY')
    if not acct_key:
        raise RuntimeError(
            "Set the ASGDATA_ACCOUNT_KEY environment variable to the 'asgdata' "
            "storage account key before registering the datastore.")
    Datastore.register_azure_blob_container(workspace=ws,
                                            datastore_name='training_data',
                                            container_name='asgdata',
                                            account_name='asgdata',
                                            account_key=acct_key,
                                            create_if_not_exists=False)
else:
    print('Found existing training_data Datastore, using it.')
# Look the datastore up again via the workspace (covers the case where it was
# just registered above).
ds = ws.datastores['training_data']

## Create an Experiment and Run  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-train-ml-models  

In [None]:
# Arguments passed to the entry script.
# `--data_dir` is the path on the container to the training and val `*.tfr`
# files; `as_download()` is requested on that datastore path.
script_params = {
    '--azure_ml': "",
    '--data_dir': ds.path('Data/kws/tfrecords').as_download(),
    '--ds_type': "samples",
}

# Keep the TensorFlow wheel consistent with the USE_GPU option: the GPU build
# is only requested when the run itself is configured for GPU. Previously
# 'tensorflow-gpu' was installed even for CPU runs.
tf_package = 'tensorflow-gpu' if USE_GPU else 'tensorflow'

# Estimator that runs `train.py` from the sibling `kws` directory on the cluster.
tf_est = Estimator(source_directory=Path.cwd() / '..' / 'kws',
                   script_params=script_params,
                   compute_target=cluster,
                   use_gpu=USE_GPU,
                   entry_script='train.py',
                   pip_packages=[tf_package])

In [None]:
# Experiment under which the training run is submitted.
experiment_name = 'kws'
exp = Experiment(name=experiment_name, workspace=ws)

In [None]:
# Submit the estimator and print the portal URL of the resulting run.
run = exp.submit(tf_est)
portal_url = run.get_portal_url()
print(portal_url)
# To block here and stream stdout instead:
# run.wait_for_completion(show_output=True)

## Launch TensorBoard Server  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-monitor-tensorboard

In [None]:
# Start a TensorBoard server for the submitted run; click the link it shows
# to view TensorBoard with the run's training data.
runs_to_monitor = [run]
tb = Tensorboard(runs_to_monitor)
tb.start()

In [None]:
# Stop the TensorBoard server held in `tb` once you are done with it.
tb.stop()

## Hyperparameter Tuning: Grid Search  
https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-tune-hyperparameters

In [None]:
# Grid over both values of each flag (2 x 2 combinations).
search_space = {
    "load_genc": choice(0, 1),
    "train_genc": choice(0, 1),
}
param_sampling = GridParameterSampling(search_space)

# No early-termination policy is passed. To enable one, add e.g.
#   policy=MedianStoppingPolicy(evaluation_interval=5, delay_evaluation=11)
hyperdrive_run_config = HyperDriveConfig(
    estimator=tf_est,
    hyperparameter_sampling=param_sampling,
    primary_metric_name="val_loss",
    primary_metric_goal=PrimaryMetricGoal.MINIMIZE,
    max_total_runs=100,
    max_concurrent_runs=max_nodes,
)

In [None]:
# Submit the sweep under its own 'hyperparam' experiment and print the portal URL.
experiment = Experiment(workspace=ws, name='hyperparam')
hyperdrive_run = experiment.submit(hyperdrive_run_config)
print(hyperdrive_run.get_portal_url())