# Start Interactive Dask Cluster

In [None]:
pip install --upgrade git+https://github.com/drabastomek/dask-cloudprovider

**RESTART YOUR KERNEL**

## Imports

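Import all packages used in this notebook. The first cell also generates an RSA key pair and writes it to `private.key` / `public.key`; the public key is installed on the compute target and the private key is passed to the Dask cluster for SSH access.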

In [1]:
from cryptography.hazmat.primitives import serialization as crypto_serialization
from cryptography.hazmat.primitives.asymmetric import rsa
from cryptography.hazmat.backends import default_backend as crypto_default_backend

import os  # imported here because the notebook's own `import os` lives in a later cell

# Reuse the key pair from a previous run when one exists. Regenerating on every run
# would overwrite private.key, and the new key would no longer match the public key
# that an already-provisioned compute target was created with.
# Delete private.key to force a fresh pair.
if os.path.exists('private.key'):
    with open('private.key', 'rb') as f:
        private_key = f.read()
    key = crypto_serialization.load_pem_private_key(
        private_key,
        password=None,
        backend=crypto_default_backend()
    )
else:
    key = rsa.generate_private_key(
        backend=crypto_default_backend(),
        public_exponent=65537,
        key_size=2048
    )
    # Unencrypted PKCS#8 PEM, the same on-disk format as before.
    private_key = key.private_bytes(
        crypto_serialization.Encoding.PEM,
        crypto_serialization.PrivateFormat.PKCS8,
        crypto_serialization.NoEncryption())
    # Create the file readable/writable by the owner only, so the unencrypted key is
    # never exposed with default permissions (ssh clients typically reject such files).
    private_key_fd = os.open('private.key', os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
    with os.fdopen(private_key_fd, 'wb') as f:
        f.write(private_key)

# Also tighten permissions on a key file left over from an earlier run.
os.chmod('private.key', 0o600)

# Public half in OpenSSH format, derived from whichever private key is in use.
public_key = key.public_key().public_bytes(
    crypto_serialization.Encoding.OpenSSH,
    crypto_serialization.PublicFormat.OpenSSH
)

with open('public.key', 'wb') as f:
    f.write(public_key)

# Text form of the public key, used later as `admin_user_ssh_key`; decoded directly
# instead of re-reading the file that was just written.
pubkey = public_key.decode('ascii')

In [2]:
import os

from azureml.widgets import RunDetails
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core import Workspace, Experiment, Datastore, Dataset, Environment

## Azure ML setup

Get the workspace.

In [3]:
# Obtain the workspace handle via Workspace.from_config(); `ws` is used by every later cell.
ws = Workspace.from_config()
# Show the workspace as the cell output.
# NOTE: the repr includes the subscription ID and resource group -- clear the output before sharing.
ws

If you run your code in unattended mode, i.e., where you can't give a user input, then we recommend to use ServicePrincipalAuthentication or MsiAuthentication.
Please refer to aka.ms/aml-notebook-auth for different authentication mechanisms in azureml-sdk.


Workspace.create(name='AzureML', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-training')

### Enter your name

Enter your name. It is used as the admin username on the VMs and as the prefix of the compute target name.

In [4]:
# --- user-specific prefix ---
# Keep the default or substitute your own; it is reused below as the VM admin username.
name = 'ydoc'

# --- Azure ML resource names ---
# Compute target name derived from the prefix; override if a different name is wanted.
ct_name = f'{name}-ct'

# --- sanity check ---
# Echo the resolved values so they can be eyeballed before any resources are created.
verify = f'''
Name: {name}

Compute target: {ct_name}
'''

print(verify)


Name: ydoc

Compute target: ydoc-ct



### Create VM pool

Create an Azure ML VM pool (compute target) on which the remote Dask cluster(s) will run. If a compute target with this name already exists, it is reused.

In [5]:
# Reuse the compute target when the workspace already has one under this name;
# otherwise provision a new AmlCompute pool and wait for provisioning to finish.
if ct_name in ws.compute_targets:
    ct = ws.compute_targets[ct_name]
else:
    # Provisioning settings -- change as needed.
    config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS13_V2',  # 8 core 56 GiB 112 SSD
        min_nodes=0,
        max_nodes=100,
        idle_seconds_before_scaledown=300,
        # SSH login for the admin user, authenticated with the public key held in `pubkey`.
        admin_username=name,
        admin_user_ssh_key=pubkey,
        remote_login_port_public_access='Enabled',
    )
    ct = ComputeTarget.create(ws, ct_name, config)
    ct.wait_for_completion(show_output=True)

# Show the compute target as the cell output.
ct

Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


AmlCompute(workspace=Workspace.create(name='AzureML', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-training'), name=ydoc-ct, id=/subscriptions/6560575d-fa06-4e7d-95fb-f962e74efd7a/resourceGroups/cody-training/providers/Microsoft.MachineLearningServices/workspaces/AzureML/computes/ydoc-ct, type=AmlCompute, provisioning_state=Succeeded, location=westus2, tags=None)

### Mount Compute Instance code fileshare

This will register the compute instance code fileshare as a datastore. The fileshare's default name is `code-391ff5ac-6576-460f-ba4d-7e03433c68b6`, and it has the same credentials as the default fileshare for the workspace. The datastore will be mounted on the cluster for easy access to notebooks.

In [6]:
# Datastore name under which the code file share is registered in the workspace.
filesharename = 'codefileshare'

# Register only once; later runs find the datastore already present and do nothing.
if filesharename not in ws.datastores:
    # Hard-coded name of the compute instance code file share.
    code_share_name = 'code-391ff5ac-6576-460f-ba4d-7e03433c68b6'
    # Account name and key are borrowed from the workspace's 'workspacefilestore' datastore.
    Datastore.register_azure_file_share(
        ws,
        filesharename,
        code_share_name,
        account_name=ws.datastores['workspacefilestore'].account_name,
        account_key=ws.datastores['workspacefilestore'].account_key,
    )

### Get data

This will get the NOAA ISD Weather data used in the demo. If you already have data in Blob storage or ADLS (Gen1/Gen2) that you want to use, skip this.

In [None]:
# Name under which the dataset is registered in the workspace, and the public blob URL it is sourced from.
dsetdata = 'noaa-isd-files'
data_url = 'https://azureopendatastorage.blob.core.windows.net/isdweatherdatacontainer/ISDWeather'

# Skip the whole download/upload when a dataset with this name is already registered.
if dsetdata not in ws.datasets:
    # Makes /mnt world-writable, presumably so the downloads below can write there.
    # The exit status of the command is not checked.
    os.system('sudo chmod 777 /mnt')
    # One download per year, 2008 through 2020 inclusive.
    for year in range(2008, 2020+1):
        # All parquet files of every month of this year; validate=False is passed to from_files.
        ds = Dataset.File.from_files(f'{data_url}/year={year}/month=*/*.parquet', validate=False)
        print('Downloading...')
        %time ds.download(f'/mnt/data/isd/year={year}', overwrite=True)
    print('Uploading...')
    # Push the local copy to the workspace's default datastore under /noaa-isd.
    %time ws.get_default_datastore().upload('/mnt/data/isd', '/noaa-isd', show_progress=False)
    # Re-point `ds` at the uploaded parquet files and register them under the name in `dsetdata`.
    ds = Dataset.File.from_files((ws.get_default_datastore(), '/noaa-isd/**/*.parquet'))
    ds = ds.register(ws, dsetdata)

### Start cluster

In [9]:
from dask_cloudprovider import AzureMLCluster

In [10]:
# Start a Dask cluster on the compute target `ct`, using the workspace's
# 'AzureML-Dask-CPU' environment.
cluster = AzureMLCluster(ws,
                         ct,
                         ws.environments['AzureML-Dask-CPU'],
                         jupyter=True,
                         # Every datastore registered in the workspace, taken from a single
                         # evaluation of ws.datastores instead of one extra lookup per name.
                         datastores=list(ws.datastores.values()),
                         scheduler_idle_timeout=7200,  # presumably seconds (2 h) -- TODO confirm
                         # Same admin user and key pair the compute target was provisioned with.
                         admin_username=name,
                         admin_ssh_key='private.key'
                        )

############################## Setting up cluster ##############################
####################### Waiting for scheduler node's IP ########################
.............


########################### Scheduler: 10.0.0.5:8786 ###########################
############################# Not on the same VNET #############################
###################### scheduler_public_ip: 52.143.85.106 ######################
######################### scheduler_public_port: 50000 #########################


OSError: Timed out trying to connect to 'tcp://CODYP:9002' after 10 s: Timed out trying to connect to 'tcp://CODYP:9002' after 10 s: in <distributed.comm.tcp.TCPConnector object at 0x7f90a067ea10>: ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
# Show the cluster's `run` attribute as the cell output
# (presumably the Azure ML run backing the scheduler -- TODO confirm).
cluster.run

In [None]:
# `__setup_port_forwarding` is a double-underscore method, so Python stores it on the class
# under the mangled name `_AzureMLCluster__setup_port_forwarding`. Outside the class body no
# mangling is applied to the lookup, so the unmangled spelling raises AttributeError.
help(AzureMLCluster._AzureMLCluster__setup_port_forwarding)

In [None]:
# Request 30 workers; this needs more than the default quota.
cluster.scale(30)

In [None]:
# Show the cluster object as the cell output.
cluster

In [None]:
#from dask.distributed import Client
#c = Client(cluster)

In [None]:
#cluster.close()

In [None]:
#help(AzureMLCluster)