In [20]:
# Report which release of the Azure ML core SDK this kernel has installed,
# so that the rest of the notebook can be matched against a known version.
import azureml.core

sdk_version = azureml.core.VERSION
print("SDK version:", sdk_version)

SDK version: 1.0.57


In [21]:
from azureml.telemetry import set_diagnostics_collection

# Opt in to sending SDK diagnostics; pass send_diagnostics=False to opt out.
set_diagnostics_collection(
    send_diagnostics=True,
)

Turning diagnostics collection on. 


In [3]:
from azureml.core.workspace import Workspace

# Load the workspace via Workspace.from_config().
ws = Workspace.from_config()

# Print a one-line-per-field summary of the workspace that was loaded.
# NOTE: the subscription id ends up in the saved cell output; clear the
# output before sharing the notebook if that identifier is sensitive.
print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

Workspace name: platform
Azure region: eastus2
Subscription id: e19ffd0c-ea1c-4296-bc99-7c2262768216
Resource group: cloud-shell-storage-westeurope


In [4]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the cluster to reuse, or to create when it does not exist yet.
cluster_name = "gpu-cluster"

try:
    # Look the cluster up by name; a ComputeTargetException here is handled
    # below by creating the cluster instead.
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')

    # STANDARD_NC6 nodes, scaling up to at most four of them.
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_NC6',
        max_nodes=4,
    )

    # Kick off provisioning and wait for it, echoing progress to the cell.
    compute_target = ComputeTarget.create(
        workspace=ws,
        name=cluster_name,
        provisioning_configuration=compute_config,
    )
    compute_target.wait_for_completion(show_output=True)
else:
    print('Found existing compute target.')

# Show the detailed status of the cluster, whichever path produced it.
print(compute_target.get_status().serialize())

Creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-09-01T13:45:07.710000+00:00', 'errors': None, 'creationTime': '2019-09-01T13:44:29.494774+00:00', 'modifiedTime': '2019-09-01T13:45:16.151436+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


In [5]:
import os

# Local directory used as the estimator's source_directory further down.
project_folder = './platform_bench'

# Create it (and any missing parents); an already existing directory is fine.
os.makedirs(name=project_folder, exist_ok=True)

In [6]:
!Ls

[1m[36mMachineLearningNotebooks[m[m config.json              [1m[36mplatform_bench[m[m
Makefile                 [1m[36mdata[m[m                     [1m[36mrecommenders[m[m
README.md                [1m[36mexperiments[m[m              requirements.txt
Scenario.md              [1m[36mnlp[m[m                      setup.cfg
Untitled.ipynb           [1m[36mnni[m[m


In [8]:
from azureml.core import Experiment

# Build an Experiment handle in the workspace under this name.
experiment_name = 'platform_bench'
experiment = Experiment(
    workspace=ws,
    name=experiment_name,
)

In [13]:
from azureml.train.dnn import PyTorch

# Command-line arguments handed to the entry script.
script_params = {
    '--data_root': "cifar10",
    '--log_dir': './outputs',
}

# PyTorch estimator: what to run, where to run it, and with which extras.
estimator = PyTorch(
    # what to run
    source_directory=project_folder,
    entry_script='experiments/train.py',
    script_params=script_params,
    # where to run it
    compute_target=compute_target,
    use_gpu=True,
    # extra, version-pinned dependency for the run
    pip_packages=['pillow==5.4.1'],
)



In [14]:
# Submit the estimator to the experiment and keep the returned run handle;
# printing it shows the run's summary representation.
run = experiment.submit(config=estimator)
print(run)

Run(Experiment: platform_bench,
Id: platform_bench_1567346087_7e7ceb5f,
Type: azureml.scriptrun,
Status: Starting)


In [19]:
# Fetch the details of the submitted run and print them as plain text.
run_details = run.get_details()
print(run_details)

{'runId': 'platform_bench_1567346087_7e7ceb5f', 'target': 'gpu-cluster', 'status': 'Preparing', 'properties': {'_azureml.ComputeTargetType': 'batchai', 'ContentSnapshotId': '34f42540-554f-40e6-80cc-dd5143c3377f', 'azureml.git.repository_uri': 'git@github.com:neuromation/platform_benchmark.git', 'mlflow.source.git.repoURL': 'git@github.com:neuromation/platform_benchmark.git', 'azureml.git.branch': 'azure', 'mlflow.source.git.branch': 'azure', 'azureml.git.commit': '794d76f356d085c9f182ea3d695036bf5b24a03c', 'mlflow.source.git.commit': '794d76f356d085c9f182ea3d695036bf5b24a03c', 'azureml.git.dirty': 'True'}, 'runDefinition': {'script': 'experiments/train.py', 'arguments': ['--data_root', 'cifar10', '--log_dir', './outputs'], 'sourceDirectoryDataStore': None, 'framework': 'Python', 'communicator': 'None', 'target': 'gpu-cluster', 'dataReferences': {}, 'data': {}, 'jobName': None, 'maxRunDurationSeconds': None, 'nodeCount': 1, 'environment': {'name': 'Experiment platform_bench Environment'