In [1]:
import azureml.core
from azureml.core.compute import ComputeTarget, BatchAiCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import EnvironmentDefinition
from azureml.train.estimator import Estimator
from azureml.core.workspace import Workspace
from azureml.core import Experiment
from azureml.train.widgets import RunDetails

import json
import sys
import os
import shutil

sys.path.append('src')

from registry import azure_container_registry_for
import batch_ai


In [2]:
# Record which Azure ML SDK release produced the outputs in this notebook.
print(f"SDK version: {azureml.core.VERSION}")

SDK version: 0.1.68


In [3]:
# Build the workspace handle from the configuration file kept under configs/.
# The path is relative, so it resolves against the notebook's working directory.
ws = Workspace.from_config(path='configs/aml_config/azml_config.json')

Found the config file in: /workspace/AMLBatchAI/configs/aml_config/azml_config.json


In [4]:
# Show which workspace the notebook is bound to, one labelled field per line.
workspace_fields = (
    ('Workspace name: ', ws.name),
    ('Azure region: ', ws.location),
    ('Subscription id: ', ws.subscription_id),
    ('Resource group: ', ws.resource_group),
)
print(*[label + value for label, value in workspace_fields], sep='\n')

Workspace name: msksalt
Azure region: eastus
Subscription id: edf507a2-6235-46c5-b560-fd463ba2e771
Resource group: msazmlksaltrg


In [6]:
# The workspace details hold the container registry as a '/'-separated
# identifier; only its last segment is kept as the registry name.
container_registry_id = ws.get_details()['containerRegistry']
registry_name = container_registry_id.rsplit('/', 1)[-1]
registry_name

'msksaltacrahtgbdgx'

In [7]:
# Look up the registry through the project's `registry` helper module; the
# result is later assigned to env.docker.base_image_registry.
azr = azure_container_registry_for(
    ws.resource_group,
    registry_name,
    subscription_id=ws.subscription_id,
)

In [8]:
# Settings for the "nc6v33" cluster, forwarded as keyword arguments to the
# project's batch_ai.cluster helper.
cluster_settings = dict(
    vm_size='STANDARD_NC6s_v3',
    autoscale_enabled=True,
    vm_priority='dedicated',
    cluster_min_nodes=0,
    cluster_max_nodes=10,
    location='eastus',
)
compute_target = batch_ai.cluster(ws, "nc6v33", **cluster_settings)

Creating
succeeded
BatchAI wait for completion finished
Minimum number of nodes requested have been provisioned


In [12]:
# Directory handed to the Estimator as its source_directory.
# NOTE(review): the %%writefile cell writes test_batch.py into the current
# working directory, whereas this points at the parent directory — confirm the
# entry script is actually found under this folder.
project_folder = '../'
# os.makedirs(project_folder, exist_ok=True)

In [None]:
%%writefile test_batch.py
"""Diagnostic entry script.

Prints the environment variables, disk usage, working directory and a listing
of ``azmlblob`` so the runtime environment of a submitted job can be inspected
from its logs.
"""
import subprocess
from pprint import pprint

# Commands executed in order; each one's combined output is pretty-printed.
DIAGNOSTIC_COMMANDS = (
    ['printenv'],
    ['df', '-h'],
    ['pwd'],
    ['ls', '-ltra', 'azmlblob'],
)


def run_command(command):
    """Run ``command`` and return its combined stdout/stderr as text.

    Args:
        command: Argument list passed to ``subprocess.run``.

    Returns:
        The captured output decoded as UTF-8. Undecodable bytes are replaced
        rather than raising, so non-ASCII output (for example in environment
        variable values) cannot abort the script. If the executable cannot be
        started, a one-line description of the error is returned instead so
        the remaining diagnostics still run.
    """
    try:
        completed = subprocess.run(command,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
    except OSError as exc:
        return 'could not run {!r}: {}'.format(command, exc)
    return completed.stdout.decode('utf-8', errors='replace')


if __name__ == '__main__':
    for command in DIAGNOSTIC_COMMANDS:
        pprint(run_command(command))

In [14]:
from azureml.core.image import ContainerImage
from azureml.core import Datastore

In [17]:
# Fetch the workspace's default datastore and print its type, storage account
# and container on a single line.
default_ds = ws.get_default_datastore()
datastore_summary = (
    default_ds.datastore_type,
    default_ds.account_name,
    default_ds.container_name,
)
print(*datastore_summary)

AzureFile msksaltstoragezxwncynt azureml-filestore-293323a0-66b9-4da4-ae16-8f81a6215b73


In [24]:
# Register the 'datablob' blob container as a datastore, reusing the storage
# account and key of the default datastore; create_if_not_exists=True is passed
# so the container need not exist beforehand.
data_ds = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name='datablob',
    container_name='datablob',
    account_name=default_ds.account_name,
    account_key=default_ds.account_key,
    create_if_not_exists=True,
)

In [34]:
# Upload the local training directory ($DATA/train) to 'data/train' on the
# datastore. os.environ[...] is used so that an unset DATA variable fails with
# a KeyError naming the variable, rather than a TypeError raised from
# os.path.join(None, ...).
local_train_dir = os.path.join(os.environ['DATA'], 'train')
data_ds.upload(src_dir=local_train_dir, target_path='data/train')

$AZUREML_DATAREFERENCE_d9b71909089e4a70b2cbd2d447cb3d9d

In [39]:
# Upload the single file $DATA/train.csv into the 'data' folder on the
# datastore. os.environ[...] is used so that an unset DATA variable fails with
# a KeyError naming the variable, rather than a TypeError raised from
# os.path.join(None, ...).
local_train_csv = os.path.join(os.environ['DATA'], 'train.csv')
data_ds.upload_files([local_train_csv], target_path='data')

$AZUREML_DATAREFERENCE_5b118d7de67c420593485ced3451067d

In [43]:
# Build a reference to the 'data' folder on the datablob datastore and print
# it; the printed form is a "$AZUREML_DATAREFERENCE_..." placeholder string.
path_on_datastore = 'data'
data_ds_path = data_ds.path(path_on_datastore)
print(data_ds_path)

$AZUREML_DATAREFERENCE_e1cc4af6ca134060be953e04f3102e2d


In [55]:
# Register the 'tboardlogs' file share as a datastore, reusing the storage
# account and key of the default datastore; create_if_not_exists=True is passed
# so the share need not exist beforehand.
tboard_ds = Datastore.register_azure_file_share(
    workspace=ws,
    datastore_name='tboardlogs',
    file_share_name='tboardlogs',
    account_name=default_ds.account_name,
    account_key=default_ds.account_key,
    create_if_not_exists=True,
)

In [62]:
# Download-mode data reference for the datablob datastore.
# NOTE(review): in the published Azure ML SDK reference the argument of
# as_download() is a path on the compute target, not a path on the datastore —
# confirm that 'data' is interpreted as intended by this SDK version
# (data_ds.path('data').as_download() may be what is wanted).
data_ref = data_ds.as_download('data')

In [48]:
# Run environment: Python dependencies are flagged as user-managed, and the job
# runs in a GPU-enabled Docker container based on the "masalvar/ksalt" image
# taken from the registry resolved earlier (azr).
env = EnvironmentDefinition()
env.python.user_managed_dependencies = True

docker_section = env.docker
docker_section.enabled = True
docker_section.gpu_support = True
docker_section.shared_volumes = True
docker_section.base_image = "masalvar/ksalt"
docker_section.base_image_registry = azr

In [51]:
# Environment variables handed to the run. The data references are converted
# with str() so the dictionary holds their "$AZUREML_DATAREFERENCE_..."
# placeholder strings instead of SDK objects.
# NOTE(review): no value was given for TBOARD_LOGS; a mount of the 'tboardlogs'
# file share is assumed here — confirm. tboard_ref probably also has to be
# listed in the Estimator's inputs for the placeholder to resolve.
tboard_ref = tboard_ds.as_mount()
env.environment_variables = {
    'DATA': str(data_ref),
    'TBOARD_LOGS': str(tboard_ref),
}

dict

In [45]:
from azureml.train.estimator import Estimator

In [46]:
Estimator??

In [33]:



# Estimator that runs test_batch.py from project_folder on the Batch AI cluster
# using the Docker environment configured in `env`.
# `inputs` carries the download reference created earlier (data_ref); the
# previous name `ds_data` is not defined anywhere in the notebook and raised a
# NameError.
estimator = Estimator(
    source_directory=project_folder,
    compute_target=compute_target,
    entry_script='test_batch.py',
    environment_definition=env,
    inputs=[data_ref],
)

In [34]:
# All runs submitted below are grouped under this experiment in the workspace.
experiment_name = 'batchai-hyperdrive'
experiment = Experiment(
    ws,
    name=experiment_name,
)

In [35]:
# Submit the single (non-sweep) estimator job and keep the returned run handle
# for the status and detail cells that follow.
run = experiment.submit(estimator)

In [37]:
# Materialise the experiment's runs into a list so the cell displays them all.
[submitted_run for submitted_run in experiment.get_runs()]

[Run(Experiment: batchai-hyperdrive,
 Id: batchai-hyperdrive_1539538774868,
 Type: azureml.scriptrun,
 Status: Running)]

In [38]:
# Pretty-print the run's details as JSON indented by four spaces.
run_details = run.get_details()
print(json.dumps(run_details, indent=4))

{
    "runId": "batchai-hyperdrive_1539538774868",
    "target": "gpucluster",
    "status": "Running",
    "startTimeUtc": "2018-10-14T17:39:37.07345Z",
    "properties": {
        "azureml.runsource": "experiment",
        "ContentSnapshotId": "e75a0c7e-d875-4a0f-abb1-f506d4eb7169"
    },
    "runDefinition": {
        "Script": "pytorch_train.py",
        "Arguments": [
            "--data_dir",
            "$AZUREML_DATAREFERENCE_70ad34a962dc436e9323b4da0c3581fb",
            "--num_epochs",
            "25",
            "--output_dir",
            "./outputs"
        ],
        "Framework": 0,
        "Target": "gpucluster",
        "DataReferences": {
            "70ad34a962dc436e9323b4da0c3581fb": {
                "DataStoreName": "workspacefilestore",
                "Mode": "Mount",
                "PathOnDataStore": "hymenoptera_data",
                "PathOnCompute": null,
                "Overwrite": false
            }
        },
        "JobName": null,
        "AutoPrep

In [39]:
# Render the notebook widget for the submitted run.
run_widget = RunDetails(run)
run_widget.show()

_UserRun()

In [44]:
from azureml.train.hyperdrive import (BanditPolicy, 
                                      HyperDriveRunConfig, 
                                      RandomParameterSampling, 
                                      uniform, 
                                      PrimaryMetricGoal)

In [45]:
# Search space: learning_rate and momentum are each declared with uniform(...)
# over the given bounds and sampled by RandomParameterSampling.
search_space = {
    'learning_rate': uniform(0.0005, 0.005),
    'momentum': uniform(0.9, 0.99),
}
param_sampling = RandomParameterSampling(search_space)

# Early-termination policy applied to the sweep's child runs.
early_termination_policy = BanditPolicy(
    slack_factor=0.15,
    evaluation_interval=1,
    delay_evaluation=10,
)

# Sweep definition: maximise 'best_val_acc' over at most 20 runs in total,
# with no more than 4 running at the same time.
hyperdrive_run_config = HyperDriveRunConfig(
    estimator=estimator,
    hyperparameter_sampling=param_sampling,
    policy=early_termination_policy,
    primary_metric_name='best_val_acc',
    primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
    max_total_runs=20,
    max_concurrent_runs=4,
)

In [46]:
# Start the HyperDrive run by submitting the sweep configuration to the same
# experiment; the returned handle is used by the monitoring cells below.
hyperdrive_run = experiment.submit(hyperdrive_run_config)

In [47]:
# Render the notebook widget for the HyperDrive sweep.
hyperdrive_widget = RunDetails(hyperdrive_run)
hyperdrive_widget.show()

_HyperDrive(widget_settings={'childWidgetDisplay': 'popup'})

_UserRun(widget_settings={'display': 'popup'})

_UserRun(widget_settings={'display': 'popup'})

_UserRun(widget_settings={'display': 'popup'})

In [49]:
# Display the raw data dictionary behind the HyperDrive widget.
hyperdrive_details = RunDetails(hyperdrive_run)
hyperdrive_details.get_widget_data()

{'status': 'Running',
 'workbench_run_details_uri': 'https://mlworkspace.azure.ai/portal/subscriptions/edf507a2-6235-46c5-b560-fd463ba2e771/resourceGroups/msazmlrg/providers/Microsoft.MachineLearningServices/workspaces/mstest/experiment/batchai-hyperdrive/run/batchai-hyperdrive_1539540150583',
 'run_id': 'batchai-hyperdrive_1539540150583',
 'run_properties': {'run_id': 'batchai-hyperdrive_1539540150583',
  'created_utc': '2018-10-14T18:02:30.929855Z',
  'properties': {'primary_metric_config': '{"name": "best_val_acc", "goal": "maximize"}',
   'runTemplate': 'HyperDrive',
   'azureml.runsource': 'hyperdrive'},
  'tags': {'max_concurrent_jobs': '4',
   'max_total_jobs': '20',
   'max_duration_minutes': '10080',
   'policy_config': '{"name": "BANDIT", "properties": {"evaluation_interval": 1, "delay_evaluation": 10, "slack_factor": 0.15}}',
   'generator_config': '{"name": "RANDOM", "parameter_space": {"learning_rate": ["uniform", [0.0005, 0.005]], "momentum": ["uniform", [0.9, 0.99]]}}',


In [65]:
%%time
# Retrieve the run that get_best_run_by_primary_metric() reports for the sweep.
# Timed with %%time because the lookup is slow (about a minute in the recorded output).
best_run = hyperdrive_run.get_best_run_by_primary_metric()

CPU times: user 11.9 s, sys: 421 ms, total: 12.4 s
Wall time: 57 s


In [66]:
%%time
# Fetch the metrics recorded for the best run, then print the run handle so
# its id and status are visible in the output.
best_run_metrics = best_run.get_metrics()
print(best_run)

Run(Experiment: batchai-hyperdrive,
Id: batchai-hyperdrive_1539540150583_16,
Type: azureml.scriptrun,
Status: Completed)
CPU times: user 56.5 ms, sys: 0 ns, total: 56.5 ms
Wall time: 338 ms
