# Start Interactive Dask Cluster

In [None]:
pip install --upgrade git+https://github.com/drabastomek/dask-cloudprovider

**RESTART YOUR KERNEL** after the installation finishes so that the upgraded package is picked up.

## Imports

Import all packages used in this notebook.

In [1]:
import os

from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core import Workspace, Experiment, Datastore, Dataset, Environment

## Azure ML setup

Get the workspace.

In [2]:
# Build the workspace handle via Workspace.from_config(); every later cell
# reaches Azure ML through `ws`.
ws = Workspace.from_config()
# Echo the workspace as the cell output so the reader can confirm which one was loaded.
ws

Workspace.create(name='eus-azureml', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-rg')

### Enter your name

Enter your name and virtual network information.

In [3]:
# --- prefix shared by every resource this notebook creates ---
name = 'dask'  # REPLACE

# --- virtual network settings (replace any of these if needed) ---
vnet_rg = ws.resource_group
vnet_name = 'wifi-{}'.format(ws.get_details()['location'])
subnet_name = 'default'

# --- Azure ML resource names derived from the prefix ---
ct_name = name + '-ct'
env_name = name + '-env'

# --- trust but verify: echo the settings back before anything is provisioned ---
verify = '\n'.join([
    '',
    f'Name: {name}',
    '',
    f'vNET RG: {vnet_rg}',
    f'vNET name: {vnet_name}',
    f'vNET subnet name: {subnet_name}',
    '',
    f'Compute target: {ct_name}',
    f'Environment name: {env_name}',
    '',
])

print(verify)


Name: dask

vNET RG: cody-rg
vNET name: wifi-eastus
vNET subnet name: default

Compute target: dask-ct
Environment name: dask-env



### Create VM pool

Create an Azure ML VM pool (compute target) on which the remote Dask cluster(s) will run.

In [4]:
# Reuse the compute target when the workspace already has one under this
# name; otherwise provision a new pool attached to the configured vNET.
if ct_name in ws.compute_targets:
    ct = ws.compute_targets[ct_name]
else:
    # provisioning settings for the Azure ML cluster -- change as needed
    config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS13_V2',  # 8 core 56 GiB 112 SSD
        min_nodes=0,
        max_nodes=100,
        vnet_resourcegroup_name=vnet_rg,
        vnet_name=vnet_name,
        subnet_name=subnet_name,
        idle_seconds_before_scaledown=300,
    )
    ct = ComputeTarget.create(ws, ct_name, config)
    # wait for provisioning to complete, printing progress as it goes
    ct.wait_for_completion(show_output=True)

# show the compute target as the cell output
ct

Creating
Succeeded
AmlCompute wait for completion finished
Minimum number of nodes requested have been provisioned


AmlCompute(workspace=Workspace.create(name='eus-azureml', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-rg'), name=dask-ct, id=/subscriptions/6560575d-fa06-4e7d-95fb-f962e74efd7a/resourceGroups/cody-rg/providers/Microsoft.MachineLearningServices/workspaces/eus-azureml/computes/dask-ct, type=AmlCompute, provisioning_state=Succeeded, location=eastus, tags=None)

### Mount Compute Instance code fileshare

This registers the compute instance code fileshare as a datastore. The fileshare's default name is `code-391ff5ac-6576-460f-ba4d-7e03433c68b6`, and it has the same credentials as the default fileshare for the workspace. It will be mounted for easy access to notebooks on the cluster.

In [5]:
codefileshare = 'codefileshare'

# Register the code fileshare only when no datastore of that name exists yet.
if codefileshare not in ws.datastores:
    # reuse the storage account name and key of the workspace file store
    workspace_files = ws.datastores['workspacefilestore']
    Datastore.register_azure_file_share(
        ws,
        codefileshare,
        'code-391ff5ac-6576-460f-ba4d-7e03433c68b6',
        account_name=workspace_files.account_name,
        account_key=workspace_files.account_key,
    )

### Get data

This will get the NOAA ISD Weather data that is used in the demo. If you already have data in Blob storage or ADLS (Gen1/Gen2) that you want to use, skip this step.

In [6]:
dsetdata = 'noaa-isd-files'
data_url = 'https://azureopendatastorage.blob.core.windows.net/isdweatherdatacontainer/ISDWeather'

if dsetdata not in ws.datasets:
    os.system('sudo chmod 777 /mnt')
    for year in range(2008, 2020+1):
        ds = Dataset.File.from_files(f'{data_url}/year={year}/month=*/*.parquet', validate=False)
        print('Downloading...')
        %time ds.download(f'/mnt/data/isd/year={year}', overwrite=True)
    print('Uploading...')
    %time ws.get_default_datastore().upload('/mnt/data/isd', '/noaa-isd', show_progress=False)
    ds = Dataset.File.from_files((ws.get_default_datastore(), '/noaa-isd/**/*.parquet'))
    ds = ds.register(ws, dsetdata)

CPU times: user 236 ms, sys: 63.7 ms, total: 300 ms
Wall time: 12.5 s
CPU times: user 82.3 ms, sys: 13.8 ms, total: 96 ms
Wall time: 9.66 s
CPU times: user 77 ms, sys: 10.5 ms, total: 87.5 ms
Wall time: 9.6 s
CPU times: user 87.9 ms, sys: 14.6 ms, total: 103 ms
Wall time: 9.85 s
CPU times: user 71.7 ms, sys: 0 ns, total: 71.7 ms
Wall time: 9.65 s
CPU times: user 81.8 ms, sys: 2.58 ms, total: 84.4 ms
Wall time: 9.93 s
CPU times: user 73.1 ms, sys: 14.2 ms, total: 87.3 ms
Wall time: 10.3 s
CPU times: user 84.1 ms, sys: 754 µs, total: 84.8 ms
Wall time: 11.8 s
CPU times: user 95.7 ms, sys: 554 µs, total: 96.2 ms
Wall time: 6.58 s
CPU times: user 70.5 ms, sys: 5.5 ms, total: 76 ms
Wall time: 7.89 s
CPU times: user 78.4 ms, sys: 8.39 ms, total: 86.8 ms
Wall time: 8.19 s
CPU times: user 87.8 ms, sys: 9.73 ms, total: 97.5 ms
Wall time: 9.75 s
CPU times: user 95.5 ms, sys: 0 ns, total: 95.5 ms
Wall time: 3.3 s


### Start cluster



In [7]:
from dask_cloudprovider import AzureMLCluster

In [8]:
# pip packages added to the Azure ML environment used by the cluster
packages = [
    'mpi4py',
    'distributed',
    'dask[complete]',
    'dask-ml[complete]',
    'fastparquet',
    'pyarrow',
    'jupyterlab',
    'joblib',
    'notebook',
    'adlfs',
    'fsspec',
    'azureml-sdk',
    'lz4',
]

# create the environment, then add each package as a pip dependency in list order
env = Environment(name=env_name)

for package in packages:
    env.python.conda_dependencies.add_pip_package(package)

In [9]:
# Start the Dask cluster on compute target `ct` with environment `env`.
# - initial_node_count: initial number of nodes for the Dask cluster
#   (10 here; the class default is 1).
# - jupyter=True: presumably also starts a Jupyter server with the cluster
#   -- TODO confirm against the AzureMLCluster docs.
# - datastores: every datastore registered in the workspace. `ws.datastores`
#   is read once and its values are passed on, instead of re-evaluating the
#   property for each name as an index lookup inside a comprehension would.
cluster = AzureMLCluster(ws,
                         ct,
                         env,
                         jupyter=True,
                         initial_node_count=10,
                         datastores=list(ws.datastores.values())
                        )



############################## Setting up cluster ##############################
########################## Submitting the experiment ###########################
####################### Waiting for scheduler node's IP ########################
.................................................................................................................................................................................................................................................................


########################### Scheduler: 10.2.0.5:8786 ###########################
############################### On the same VNET ###############################
########################### Connections established ############################
############################ Scaling to 10 workers #############################
############################### Scaling is done ################################


In [10]:
# Display the cluster's `run` attribute as the cell output.
cluster.run

Experiment,Id,Type,Status,Details Page,Docs Page
dask-cloudprovider,dask-cloudprovider_1582513353_456b702c,azureml.scriptrun,Running,Link to Azure Machine Learning studio,Link to Documentation


In [11]:
# Ask the cluster to scale to 25.
cluster.scale(25) # need more than default quota for this

In [12]:
# Show the cluster object as the cell output (it renders as a widget).
cluster

VBox(children=(HTML(value='<h2>AzureMLCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped>\n  …

In [13]:
# Optional: uncomment to create a Dask client from the running cluster.
#from dask.distributed import Client
#c = Client(cluster)

In [14]:
# Optional: uncomment to close the cluster when you are finished with it.
#cluster.close()

In [15]:
# Print the AzureMLCluster class documentation (parameters and their defaults).
help(AzureMLCluster)

Help on class AzureMLCluster in module dask_cloudprovider.providers.azure.azureml:

class AzureMLCluster(distributed.deploy.cluster.Cluster)
 |  Deploy a Dask cluster using Azure ML
 |  
 |  This creates a dask scheduler and workers on an Azure ML Compute Target.
 |  
 |  Parameters
 |  ----------
 |  workspace: azureml.core.Workspace (required)
 |      Azure ML Workspace - see https://aka.ms/azureml/workspace
 |  
 |  compute_target: azureml.core.ComputeTarget (required)
 |      Azure ML Compute Target - see https://aka.ms/azureml/computetarget
 |  
 |  environment_definition: azureml.core.Environment (required)
 |      Azure ML Environment - see https://aka.ms/azureml/environments
 |  
 |  experiment_name: str (optional)
 |      The name of the Azure ML Experiment used to control the cluster.
 |  
 |      Defaults to ``dask-cloudprovider``.
 |  
 |  initial_node_count: int (optional)
 |      The initial number of nodes for the Dask Cluster.
 |  
 |      Defaults to ``1``.
 |  
 |  ju