In [1]:
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.runconfig import RunConfiguration, MpiConfiguration
from azureml.train.estimator import Estimator
from azureml.core.compute import ComputeTarget, AmlCompute

## Get workspace

In [2]:
# Obtain the workspace handle via Workspace.from_config(); every later cell
# (dataset registration, cluster creation, job submission) goes through `ws`.
ws = Workspace.from_config()
# Bare expression: display the workspace so the reader can see which one is targeted.
ws

Workspace.create(name='benchy', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copeters_benchmarking')

## Set up dataset

In [3]:
# Reuse the dataset registered as 'weather-files' when the workspace already
# has one; otherwise build a file dataset over the ISD weather parquet files
# and register it under that name so later runs can pick it up.
if 'weather-files' in ws.datasets:
    ds = ws.datasets['weather-files']
else:
    ds = Dataset.File.from_files(
        'https://azureopendatastorage.blob.core.windows.net/isdweatherdatacontainer/ISDWeather/*/*/*.parquet',
        validate=False,
    )
    ds = ds.register(ws, 'weather-files')

# Display the dataset definition / registration info.
ds

{
  "source": [
    "https://azureopendatastorage.blob.core.windows.net/isdweatherdatacontainer/ISDWeather/*/*/*.parquet"
  ],
  "definition": [
    "GetFiles"
  ],
  "registration": {
    "id": "5a3248d1-e0ee-49b1-8e2b-d5167afa0e6c",
    "name": "weather-files",
    "version": 1,
    "workspace": "Workspace.create(name='benchy', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copeters_benchmarking')"
  }
}

## Set up parameters

In [23]:
# Limits applied to each (VM size, node count) combination when submitting
# jobs: the cluster-wide vCPU total must stay below MAX_VCPUS and the
# cluster-wide RAM total must fall strictly between MIN_RAM and MAX_RAM.
MAX_VCPUS = 1000
MIN_RAM = 100
MAX_RAM = 1000

# Per-node specs keyed by VM size: (vCPUs, RAM, disk, price in $/hr).
# Insertion order matters: it is the order in which sizes are benchmarked.
# Entries that are commented out are excluded from the benchmark.
computes = {
    # 'STANDARD_D12':     (4, 28, 200, 0.386),
    'STANDARD_D12_V2': (4, 28, 200, 0.370),
    # 'STANDARD_D13':     (8, 56, 400, 0.771),
    'STANDARD_D13_V2': (8, 56, 400, 0.741),
    'STANDARD_DS12_V2': (4, 28, 56, 0.370),
    'STANDARD_DS13_V2': (8, 56, 112, 0.741),
    'STANDARD_DS15_V2': (20, 140, 280, 1.852),
    'STANDARD_DS5_V2': (16, 56, 112, 1.170),
    # 'STANDARD_F32S_V2': (32, 64, 256, 1.360),
}

# Display the catalogue that will actually be used.
computes

{'STANDARD_D12': (4, 28, 200, 0.386),
 'STANDARD_D12_V2': (4, 28, 200, 0.37),
 'STANDARD_D13': (8, 56, 400, 0.771),
 'STANDARD_D13_V2': (8, 56, 400, 0.741),
 'STANDARD_DS12_V2': (4, 28, 56, 0.37),
 'STANDARD_DS13_V2': (8, 56, 112, 0.741),
 'STANDARD_DS15_V2': (20, 140, 280, 1.852),
 'STANDARD_DS5_V2': (16, 56, 112, 1.17)}

In [24]:
# VM sizes to benchmark, in the insertion order of `computes`.
vm_sizes = [size for size in computes]
# Node counts to try, largest first.
nodeses = sorted([1, 2, 3, 4, 5, 20], reverse=True)

## Create clusters

In [25]:
# Make sure one AmlCompute cluster exists per benchmarked VM size.
for vm_size in vm_sizes:
    # Cluster name: VM size without the 'STANDARD_' prefix, underscores -> dashes.
    ct_name = '-'.join(vm_size.replace('STANDARD_', '').split('_'))

    # Nothing to do when the workspace already has a target with this name.
    if ct_name in ws.compute_targets:
        print(f'{ct_name} already exists')
        continue

    # Provisioning config for the new cluster; change properties as needed.
    # TODO: final default values for the blog are TBD -- need to benchmark
    # and minimize cost.
    config = AmlCompute.provisioning_configuration(
        vm_size=vm_size,
        max_nodes=max(nodeses),  # large enough for the biggest node count tested
        vnet_resourcegroup_name=ws.resource_group,
        vnet_name='bench-vnet',
        subnet_name='default',
    )

    # Create the cluster and block until provisioning has finished.
    ct = ComputeTarget.create(ws, ct_name, config)
    ct.wait_for_completion(show_output=True)

# Display every compute target now present in the workspace.
ws.compute_targets

D12 already exists
D12-V2 already exists
D13 already exists
D13-V2 already exists
DS12-V2 already exists
DS13-V2 already exists
DS15-V2 already exists
DS5-V2 already exists


{'D12': AmlCompute(workspace=Workspace.create(name='benchy', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copeters_benchmarking'), name=D12, id=/subscriptions/6560575d-fa06-4e7d-95fb-f962e74efd7a/resourceGroups/copeters_benchmarking/providers/Microsoft.MachineLearningServices/workspaces/benchy/computes/D12, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None),
 'D12-V2': AmlCompute(workspace=Workspace.create(name='benchy', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copeters_benchmarking'), name=D12-V2, id=/subscriptions/6560575d-fa06-4e7d-95fb-f962e74efd7a/resourceGroups/copeters_benchmarking/providers/Microsoft.MachineLearningServices/workspaces/benchy/computes/D12-V2, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None),
 'D13': AmlCompute(workspace=Workspace.create(name='benchy', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copeters_benchmark

## Submit jobs

In [26]:
exp = Experiment(ws, 'test4')

# Submit one run per (node count, VM size) pair that fits the resource limits.
for nodes in nodeses:
    print(f'\nNodes: {nodes}')
    for vm_size in vm_sizes:
        # Same naming rule used when the clusters were created.
        ct_name = vm_size.replace('STANDARD_', '').replace('_', '-')

        # Scale the per-node specs up to totals for the whole cluster.
        vcpus, ram, disk, wage = (spec * nodes for spec in computes[vm_size])

        # Skip combinations outside the vCPU / RAM limits.
        if not (vcpus < MAX_VCPUS and MIN_RAM < ram < MAX_RAM):
            continue

        print(f'\tVM Size: {vm_size:16} | vCPUs - {vcpus} | RAM - {ram}GB | WAGE - ${round(wage, 3)}/hr')

        est = Estimator(
            'code',
            compute_target=ws.compute_targets[ct_name],
            entry_script='runDask.py',
            conda_dependencies_file='environment.yml',
            # script_params={'--datastore': ws.get_default_datastore()},  # disabled
            inputs=[ds.as_named_input('weather').as_download('/tmp/noaa')],
            node_count=nodes,
            distributed_training=MpiConfiguration(),
        )

        print('\t\tSubmitting run...')
        run = exp.submit(est)

        # Record the configuration on the run so results can be compared later.
        for metric, value in (
            ('nodes', nodes),
            ('vm_size', vm_size),
            ('vcpus', vcpus),
            ('ram', ram),
            ('disk', disk),
            ('wage', wage),
        ):
            run.log(metric, value)

        print('\t\tRun submitted...')


Nodes: 20
	VM Size: STANDARD_D12     | vCPUs - 80 | RAM - 560GB | WAGE - $7.72/hr
		Submitting run...
		Run submitted...
	VM Size: STANDARD_D12_V2  | vCPUs - 80 | RAM - 560GB | WAGE - $7.4/hr
		Submitting run...
		Run submitted...
	VM Size: STANDARD_DS12_V2 | vCPUs - 80 | RAM - 560GB | WAGE - $7.4/hr
		Submitting run...
		Run submitted...

Nodes: 5
	VM Size: STANDARD_D12     | vCPUs - 20 | RAM - 140GB | WAGE - $1.93/hr
		Submitting run...
		Run submitted...
	VM Size: STANDARD_D12_V2  | vCPUs - 20 | RAM - 140GB | WAGE - $1.85/hr
		Submitting run...
		Run submitted...
	VM Size: STANDARD_D13     | vCPUs - 40 | RAM - 280GB | WAGE - $3.855/hr
		Submitting run...
		Run submitted...
	VM Size: STANDARD_D13_V2  | vCPUs - 40 | RAM - 280GB | WAGE - $3.705/hr
		Submitting run...
		Run submitted...
	VM Size: STANDARD_DS12_V2 | vCPUs - 20 | RAM - 140GB | WAGE - $1.85/hr
		Submitting run...
		Run submitted...
	VM Size: STANDARD_DS13_V2 | vCPUs - 40 | RAM - 280GB | WAGE - $3.705/hr
		Submitting run..

## Kill clusters

In [None]:
# Safety switch: deleting clusters is destructive, so it is opt-in.
# Set DELETE_CLUSTERS to True and re-run this cell to delete EVERY compute
# target in the workspace -- not only the ones created by this notebook.
DELETE_CLUSTERS = False

if DELETE_CLUSTERS:
    # Fetch the name -> target mapping once and delete each target in turn.
    for target_name, target in ws.compute_targets.items():
        print(f'Deleting {target_name}')
        target.delete()

# Display the compute targets that remain in the workspace.
ws.compute_targets

## Visualize data

## Cancel runs

In [None]:
exp = Experiment(ws, 'test4')

# Statuses of runs that have not finished yet. Checking only 'Running' would
# leave queued / preparing runs in place, and those would still start (and
# cost money) once cluster nodes free up.
ACTIVE_STATUSES = {
    'NotStarted',
    'Starting',
    'Provisioning',
    'Preparing',
    'Queued',
    'Running',
}

# Cancel every run of the experiment that is still pending or executing.
for run in exp.get_runs():
    if run.get_status() in ACTIVE_STATUSES:
        run.cancel()