# Start Interactive Dask Cluster

In [None]:
pip install --upgrade git+https://github.com/drabastomek/dask-cloudprovider

**RESTART YOUR KERNEL** after the install above finishes, so that the upgraded package is the one that gets imported.

## Imports

Generate the SSH key pair (`private.key` / `public.key`) used to log in to the cluster nodes, then import all packages used in this notebook.

In [1]:
import os
from os import chmod
from Crypto.PublicKey import RSA

# SSH key pair for the cluster nodes: the public half is handed to the compute
# target when it is created, the private half is used by the cluster client.
if os.path.exists("private.key"):
    # Reuse the existing private key. Regenerating it on every run would
    # invalidate the public key already installed on a compute target that
    # was created during an earlier run (creation is skipped when it exists).
    with open("private.key", 'rb') as content_file:
        key = RSA.importKey(content_file.read())
else:
    key = RSA.generate(2048)
    # Create the file with owner-only permissions from the start so the
    # private key is never readable by other users, not even briefly.
    fd = os.open("private.key", os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(fd, 'wb') as content_file:
        content_file.write(key.exportKey('PEM'))

# Also tighten a key file left behind by an earlier run that used default
# permissions; SSH clients typically reject private keys others can read.
chmod("private.key", 0o600)

# The public key is always re-derived from the private key in use, so the two
# files on disk cannot drift apart.
pubkey = key.publickey()
with open("public.key", 'wb') as content_file:
    content_file.write(pubkey.exportKey('OpenSSH'))

In [2]:
import os

from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core import Workspace, Experiment, Datastore, Dataset, Environment

## Azure ML setup

Get the workspace from the saved workspace configuration.

In [3]:
# Load the workspace with Workspace.from_config() (no arguments, so the SDK's
# default configuration lookup applies) and keep it in `ws` for later cells.
ws = Workspace.from_config()
# Bare expression: show the workspace repr as the cell output.
ws

Workspace.create(name='AzureML-UKS', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-training')

### Enter your name

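Enter your name. It is used as the admin username on the cluster VMs and as the prefix of the compute target name (`<name>-ct`). Note that the cell below does not set any virtual network options.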

In [4]:
# Base name; reused later as the VM admin username and as the prefix of the
# compute target name. Replace it with your own if desired.
name = 'cody3'

# Azure ML compute target name, derived from the base name.
ct_name = f'{name}-ct'

# Echo the resolved settings so they can be checked before anything is created.
verify = f'''
Name: {name}

Compute target: {ct_name}
'''

print(verify)


Name: cody3

Compute target: cody3-ct



In [5]:
# Read back the public key text written to `public.key`; it is passed to the
# compute target as the admin user's SSH key.
with open('public.key', mode='r') as key_file:
    pubkey = key_file.read()

# Bare expression: show the key text as the cell output.
pubkey

'ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQCDy8G38ZgjuDMwWcHxzStTGzQJanSCxrczyduGb0HobvOY4me7G7Wp4kBjpWb9VyClR2ou1vzI9GDOWuCRVav5PU8gc2Tws7lSXNCwcgvVOCHoFWpxaflnP8ESoXzpbE42joggriEgbmJeScsvVmyzqLtnSMlPbpkr4eOZk8P1Lhew7X1+SlOQurELm3cRaf8lq5mNKx54FDGI074/yQDRJL+FillmEpHdEgs2gQONaEvERr7cZr/IdCaW5v04A/hLT+Rz325mjpPRnpQwikqW+SMR5xd/6kB9KMDajIj6BoAIgsGQ+3lJvFmikOsRF4yY1sY9cihCKuwNlHp5yOsD'

### Create VM pool

Create the Azure ML compute target (VM pool) on which the remote Dask cluster(s) will run, or reuse it if a compute target with this name already exists.

In [6]:
if ct_name in ws.compute_targets:
    # A compute target with this name already exists: reuse it unchanged.
    ct = ws.compute_targets[ct_name]
else:
    # Describe the new Azure ML cluster; change the properties as needed.
    config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS13_V2',  # 8 core 56 GiB 112 SSD
        min_nodes=0,
        max_nodes=100,
        admin_username=name,
        admin_user_ssh_key=pubkey,
        remote_login_port_public_access='Enabled',
        idle_seconds_before_scaledown=300,
    )
    # Create it and block until provisioning completes, echoing progress.
    ct = ComputeTarget.create(ws, ct_name, config)
    ct.wait_for_completion(show_output=True)

# Bare expression: show the compute target repr as the cell output.
ct

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


AmlCompute(workspace=Workspace.create(name='AzureML-UKS', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-training'), name=cody3-ct, id=/subscriptions/6560575d-fa06-4e7d-95fb-f962e74efd7a/resourceGroups/cody-training/providers/Microsoft.MachineLearningServices/workspaces/AzureML-UKS/computes/cody3-ct, type=AmlCompute, provisioning_state=Succeeded, location=uksouth, tags=None)

### Mount Compute Instance code fileshare

This registers the compute instance code fileshare as a datastore. The fileshare has the default name `code-391ff5ac-6576-460f-ba4d-7e03433c68b6` and uses the same credentials as the default fileshare for the workspace. It will be mounted on the cluster for easy access to notebooks.

In [7]:
filesharename = 'codefileshare'

# Register the code fileshare only when no datastore of that name exists yet.
if filesharename not in ws.datastores:
    # The storage account name and key are taken from the workspace's
    # 'workspacefilestore' datastore.
    workspace_filestore = ws.datastores['workspacefilestore']
    Datastore.register_azure_file_share(
        ws,
        filesharename,
        'code-391ff5ac-6576-460f-ba4d-7e03433c68b6',
        account_name=workspace_filestore.account_name,
        account_key=workspace_filestore.account_key,
    )

### Get data

This will get the NOAA ISD Weather data which is used in the demo. If you already have data in Blob storage or ADLS (Gen1 or Gen2) that you want to use, skip this.

In [8]:
dsetdata = 'noaa-isd-files'
data_url = 'https://azureopendatastorage.blob.core.windows.net/isdweatherdatacontainer/ISDWeather'

if dsetdata not in ws.datasets:
    os.system('sudo chmod 777 /mnt')
    for year in range(2008, 2020+1):
        ds = Dataset.File.from_files(f'{data_url}/year={year}/month=*/*.parquet', validate=False)
        print('Downloading...')
        %time ds.download(f'/mnt/data/isd/year={year}', overwrite=True)
    print('Uploading...')
    %time ws.get_default_datastore().upload('/mnt/data/isd', '/noaa-isd', show_progress=False)
    ds = Dataset.File.from_files((ws.get_default_datastore(), '/noaa-isd/**/*.parquet'))
    ds = ds.register(ws, dsetdata)

### Start cluster

In [9]:
from dask_cloudprovider import AzureMLCluster

In [None]:
# Look up the workspace datastores once. The previous comprehension evaluated
# `ws.datastores[...]` again for every datastore name, repeating the lookup
# once per datastore; `.values()` yields the same objects in the same order.
all_datastores = list(ws.datastores.values())

# Start the Dask cluster on the compute target created above.
cluster = AzureMLCluster(
    ws,
    ct,
    ws.environments['AzureML-Dask-CPU'],
    jupyter=True,
    datastores=all_datastores,       # every datastore registered in the workspace
    scheduler_idle_timeout=7200,     # presumably seconds (2 hours) - TODO confirm
    admin_username=name,             # same admin user the compute target was created with
    admin_ssh_key='private.key',     # private half of the key pair whose public half was given to the compute target
)

############################## Setting up cluster ##############################
####################### Waiting for scheduler node's IP ########################
..........................................................


########################### Scheduler: 10.0.0.4:8786 ###########################


In [None]:
# Show the cluster's `run` attribute as the cell output (presumably the Azure ML
# run backing the cluster - confirm against the dask-cloudprovider docs).
cluster.run

In [None]:
# Scale the cluster to 30 (workers, per the usual Dask `scale` convention -
# confirm for AzureMLCluster). This needs more than the default quota.
cluster.scale(30)

In [None]:
# Bare expression: show the cluster object as the cell output.
cluster

In [None]:
# Optional: uncomment to create a dask.distributed Client for the cluster.
#from dask.distributed import Client
#c = Client(cluster)

In [None]:
# Optional: uncomment to close the cluster when you are finished with it.
#cluster.close()

In [None]:
# Optional: uncomment to print the help text for AzureMLCluster.
#help(AzureMLCluster)