In [2]:
import os
import dask
import time
import joblib
import fsspec
import socket
import matplotlib

import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt

from datetime import datetime
from dask.distributed import Client
from IPython.core.display import HTML
from dask_ml.xgboost import XGBRegressor

from azureml.widgets import RunDetails
from azureml.train.estimator import Estimator
from azureml.core.runconfig import MpiConfiguration
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core import Workspace, Experiment, Dataset, Environment, Model

%matplotlib inline

Failure while loading azureml_run_type_providers. Failed to load entrypoint hyperdrive = azureml.train.hyperdrive:HyperDriveRun._from_run_dto with exception cannot import name '_DistributedTraining'.


In [3]:
# Wall-clock start of the session. The last cell of the notebook reports the
# total run time as time.time() - t_start, so it must be set before any work.
t_start = time.time()

# Connect to the Azure ML workspace described by the local configuration.
ws = Workspace.from_config()
ws

Workspace.create(name='uks-azureml', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copeters-rg')

In [4]:
### base name used as the prefix of every Azure ML resource created below
name = 'cody100'  # REPLACE

### virtual network the cluster joins (replace any of these if needed)
vnet_rg = ws.resource_group
vnet_name = 'uksouth-vnet'
subnet_name = 'default'

### names of the Azure ML environment, compute target and experiment
env_name = f'{name}-dask-env'
ct_name = f'{name}-dask-ct'
exp_name = f'{name}-dask-demo'

### echo the settings so they can be checked before anything is created
verify = f'''
Name: {name}

vNET RG: {vnet_rg}
vNET name: {vnet_name}
vNET subnet name: {subnet_name}

Environment: {env_name}
Compute target: {ct_name}
Experiment name: {exp_name}
'''

print(verify)


Name: cody100

vNET RG: copeters-rg
vNET name: uksouth-vnet
vNET subnet name: default

Environment: cody100-dask-env
Compute target: cody100-dask-ct
Experiment name: cody100-dask-demo



In [5]:
# Reuse the environment already registered under env_name; otherwise build it
# from the existing conda environment named 'dask' and register it.
if env_name in ws.environments:
    env = ws.environments[env_name]
else:
    env = Environment.from_existing_conda_environment(env_name, 'dask').register(ws)

env.name, env.version

('cody100-dask-env', '1')

In [6]:
# Reuse the compute target when it already exists in the workspace;
# otherwise provision a new Azure ML cluster inside the configured vNET.
if ct_name in ws.compute_targets:
    ct = ws.compute_targets[ct_name]
else:
    # cluster settings - change properties as needed
    config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_D14_V2',
        min_nodes=100,
        max_nodes=100,
        vnet_resourcegroup_name=vnet_rg,
        vnet_name=vnet_name,
        subnet_name=subnet_name,
        idle_seconds_before_scaledown=300,
    )
    ct = ComputeTarget.create(ws, ct_name, config)
    ct.wait_for_completion(show_output=True)

ct

AmlCompute(workspace=Workspace.create(name='uks-azureml', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copeters-rg'), name=cody100-dask-ct, id=/subscriptions/6560575d-fa06-4e7d-95fb-f962e74efd7a/resourceGroups/copeters-rg/providers/Microsoft.MachineLearningServices/workspaces/uks-azureml/computes/cody100-dask-ct, type=AmlCompute, provisioning_state=Succeeded, location=uksouth, tags=None)

In [8]:
# Submit a multi-node MPI job whose entry script brings up the Dask cluster.
nodes = 99  # number of nodes requested for the job
exp = Experiment(ws, exp_name)

estimator_settings = dict(
    compute_target=ct,
    entry_script='start.py',        # sets up Dask cluster
    environment_definition=env,     # use same env as local
    node_count=nodes,
    distributed_training=MpiConfiguration(),
)
est = Estimator('setup', **estimator_settings)

#run = next(exp.get_runs()) # use this to get existing run (if kernel restarted, etc)
run = exp.submit(est)
run



Experiment,Id,Type,Status,Details Page,Docs Page
cody100-dask-demo,cody100-dask-demo_1579623805_cea9e67b,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [9]:
# Look up the datastore registered in the workspace under the name 'data4dask'.
dstore = ws.datastores['data4dask']

In [10]:
# Storage credentials taken from the datastore; passed to fsspec and to every
# Dask parquet read/write below.
STORAGE_OPTIONS = dict(
    account_name=dstore.account_name,
    account_key=dstore.account_key,
)

In [11]:
# Show which storage account the datastore points at.
dstore.account_name

'data4uksouth'

In [12]:
# Never echo the storage account key: cell outputs are saved with the notebook
# and shared with it. Only confirm that a key is available. The key that an
# earlier version of this cell displayed must be treated as leaked and rotated.
bool(dstore.account_key)

True

In [13]:
# Filesystem protocol and storage container used for the fsspec filesystem
# and for the fully-qualified file URLs built further down.
protocol  = 'abfs'      # use 'adl' for Azure Data Lake Gen 1
container = 'datasets'  # only contains ISD, GFS is wip

In [14]:
# Filesystem handle on the container, authenticated with the datastore credentials.
fs = fsspec.filesystem(
    protocol,
    container_name=container,
    **STORAGE_OPTIONS,
)

In [15]:
# List what is stored directly under the noaa/gfs prefix.
fs.ls('/noaa/gfs')

['noaa/gfs/GFSProcessed/', 'noaa/gfs/8c6ca145-43b7-4492-8222-30f0a138fe69']

In [16]:
# List the entries of every year=*/month=* directory of the processed GFS data.
month_dirs = fs.glob('noaa/gfs/GFSProcessed/year=*/month=*')
files = [entry for month_dir in month_dirs for entry in fs.ls(f'{month_dir}/')]
files[-5:]

['noaa/gfs/GFSProcessed/year=2019/month=9/day=5/',
 'noaa/gfs/GFSProcessed/year=2019/month=9/day=6/',
 'noaa/gfs/GFSProcessed/year=2019/month=9/day=7/',
 'noaa/gfs/GFSProcessed/year=2019/month=9/day=8/',
 'noaa/gfs/GFSProcessed/year=2019/month=9/day=9/']

In [None]:
# this is slow - there are ~50k files
# Expand every listed directory into the parquet files it contains.
files2 = [path for directory in files for path in fs.glob(f'{directory}/*.parquet')]
files2[-5:]

In [None]:
# From here on `files` holds the individual parquet paths found by the glob
# above instead of the directory listing it held before.
files = files2

In [None]:
# Only *.parquet paths were collected by the glob, so this counts data files only.
len(files) # number of parquet files - there are other random files in there 

In [None]:
# Show the Azure ML run-details widget for the submitted run.
RunDetails(run).show()

In [None]:
#pip install lz4==2.2.1

In [None]:
#pip install msgpack==0.6.2

In [None]:
#pip install numpy==1.18.1

In [None]:
# Port on the compute instance that the Dask dashboard is forwarded to.
# 8787 is not used because it is already in use here.
dashboard_port = 4242

# Run states after which the scheduler address will never be published.
# Waiting only for 'Canceled' would loop forever on a failed or finished run.
terminal_states = ('Canceled', 'Failed', 'Completed')

print("waiting for scheduler node's ip")
status  = run.get_status()
metrics = run.get_metrics()
while status not in terminal_states and 'scheduler' not in metrics:
    print('.', end="")
    time.sleep(5)
    status  = run.get_status()
    metrics = run.get_metrics()

if status in terminal_states:
    # Stop here with a clear message instead of falling through to Client(...)
    # and failing on the missing 'scheduler' metric.
    raise RuntimeError(f'Run ended with status {status!r} before the Dask cluster was ready')

print('\nSetting up port forwarding...')
os.system('killall socat') # kill all socat processes - cleans up previous port forward setups
os.system(f'setsid socat tcp-listen:{dashboard_port},reuseaddr,fork tcp:{metrics["dashboard"]} &')
print('Cluster is ready to use.')

# Connect this kernel to the scheduler address logged by the run.
c = Client(f'tcp://{metrics["scheduler"]}')

print(f'\n\n{c}')

# build the dashboard link
dashboard_url = f'https://{socket.gethostname()}-{dashboard_port}.{ws.get_details()["location"]}.instances.azureml.net/status'
HTML(f'<a href="{dashboard_url}">Dashboard link</a>')

In [None]:
#c.restart() # restart Client if needed

In [1]:
pip install --upgrade pyarrow

Requirement already up-to-date: pyarrow in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (0.15.1)
Note: you may need to restart the kernel to use updated packages.


In [None]:
# dd.read_parquet is wrapped in dask.delayed and then computed, so the call is
# run as a task and its result (a Dask dataframe) is returned to this kernel.
# NOTE(review): presumably done so the parquet metadata is read on the cluster
# through the Client created above rather than locally - confirm.
df = dask.delayed(dd.read_parquet)(files, engine='pyarrow', storage_options=STORAGE_OPTIONS).compute()
%time df.head()

In [None]:
df = df.set_index(dd.to_datetime(df.datetime).dt.floor('d'), sorted=False)
df = df.persist() 
%time len(df)

In [None]:
# Time the row count a second time, now that the dataframe has been persisted.
%time len(df)

In [None]:
# Timed summary statistics of the dataframe.
%time df.describe().compute()

In [None]:
# Total memory footprint in bytes (index included, deep=True), printed in GB
# as bytes / 1e9.
%time bites = df.memory_usage(index=True, deep=True).sum().compute()
print(f'Dataframe is: {round(bites/1e9, 2)}GB')

In [None]:
# Mean of every column per index value (the index was set to the day-floored
# datetime above); the result is computed and brought back to this kernel.
%time means = df.groupby(df.index).mean().compute()
means.head()

In [None]:
# One 16x8 line chart per averaged column.
for col in means.columns:
    fig, ax = plt.subplots(figsize=(16, 8))
    #plt.style.use('dark_background')
    means[col].plot(ax=ax, color='b')
    ax.set_title('Average of {}'.format(col))
    ax.set_xlim([datetime(2008, 1, 1), datetime(2018, 12, 31)])
    ax.grid()

    # optionally, log the image to the run
    run.log_image(f'mean_{col}', plot=plt)

You can see the images logged to the run in the studio.

In [None]:
# Display the run again.
run

In [None]:
## insert any Pandas-like Dask data prep code 
df['temperature'] = df['temperature']*(9/5)+32 # 'Merica

In [None]:
# Split the dataframe into ten times as many partitions, then write it as
# lz4-compressed parquet to the 'outputs' container. The write is wrapped in
# dask.delayed in the same way as the reads in this notebook.
df = df.repartition(npartitions=df.npartitions*10)
%time dask.delayed(df.to_parquet)(f'abfs://outputs/noaa/isd_out.parquet', compression='lz4', storage_options=STORAGE_OPTIONS).compute()

In [None]:
# Re-read the dataframe from `files`, discarding the index and column changes
# made to `df` in the cells above.
# NOTE(review): at this point `files` still holds the noaa/gfs paths collected
# earlier, while the cells below use ISD naming - confirm the intended input.
df = dask.delayed(dd.read_parquet)(files, engine='pyarrow', storage_options=STORAGE_OPTIONS).compute()

In [None]:
# List every year=*/month=* directory of the ISD data
# (see https://github.com/dask/adlfs/issues/34), keep the entries whose path
# contains '2019' and turn them into fully-qualified URLs.
month_dirs = fs.glob('noaa/isd/year=*/month=*')
listed = [entry for month_dir in month_dirs for entry in fs.ls(f'{month_dir}/')]
files = [f'{protocol}://{container}/{entry}' for entry in listed if '2019' in entry]

df2019 = dask.delayed(dd.read_parquet)(files, engine='pyarrow', storage_options=STORAGE_OPTIONS).compute()

In [None]:
# begin data prep
df = df.fillna(0) 
df2019 = df2019.fillna(0) 

In [None]:
# Add the month number taken from the datetime column to both dataframes.
df['month'] = df['datetime'].dt.month
df2019['month'] = df2019['datetime'].dt.month

In [None]:
# Columns used for modelling: everything that is not of dtype 'object',
# except the 'version' and 'datetime' columns.
excluded = ['version', 'datetime']
cols = [
    col
    for col in df.columns
    if df.dtypes[col] != 'object' and col not in excluded
]
cols

In [None]:
# Features are all selected columns except the target; the target is temperature.
feature_cols = [col for col in cols if col not in ['temperature']]
X = df[feature_cols].persist()
y = df.temperature.persist()
# end data prep - persist intelligently per https://docs.dask.org/en/latest/best-practices.html

In [None]:
# Fit a dask_ml XGBoost regressor with 16 estimators on the persisted
# features and target; the fit is timed.
xgb = XGBRegressor(n_estimators=16)
%time xgb.fit(X, y)

In [None]:
# Predict on the training features and bring the result back to this kernel.
%time y_pred = xgb.predict(X).compute()

In [None]:
# Root-mean-squared error on the training data.
y_true = y.to_dask_array().compute()  # runs locally, distribute (?)
squared_error = (y_true - y_pred) ** 2
rmse = squared_error.mean() ** .5
print(f'Training RMSE: {round(rmse, 3)}')

In [None]:
# Hold-out set built from the 2019 data, using the same feature columns.
test_feature_cols = [col for col in cols if col not in ['temperature']]
X_test = df2019[test_feature_cols].persist()
y_test = df2019.temperature.persist()

In [None]:
# Predict on the 2019 features; this rebinds y_pred, replacing the training
# predictions computed earlier.
%time y_pred = xgb.predict(X_test).compute()

In [None]:
# Root-mean-squared error on the 2019 hold-out data.
y_test_true = y_test.to_dask_array().compute()  # runs locally, distribute (?)
squared_error = (y_test_true - y_pred) ** 2
rmse = squared_error.mean() ** .5
print(f'Test RMSE: {round(rmse, 3)}')

In [None]:
# Serialize the fitted model to a local file with joblib, then load it back
# from that same file.
model_path = 'xgboost_noaa_isd.joblib.dat'
joblib.dump(xgb, model_path)
xgb = joblib.load(model_path)

In [None]:
# Register the serialized model file in the workspace under the name
# 'xgboost-noaa-isd'.
model = Model.register(ws, model_path, 'xgboost-noaa-isd', 
                       description='Dask XGBoost NOAA ISD temperature predictor',
                       model_framework='XGBoost')

In [None]:
# Disconnect the Dask client first, then cancel the Azure ML run whose entry
# script set up the Dask cluster.
c.close()
run.cancel()

In [None]:
# Report the wall-clock time elapsed since t_start, in minutes.
t_end = time.time()
elapsed_minutes = (t_end - t_start) / 60
print(f'Total run time: {round(elapsed_minutes, 2)} minutes')