# Scaling Python with Azure ML and Dask

![Describe gif](media/describe3.gif)

## Environment setup

This notebook assumes you are using an Azure ML Compute Instance with the default kernel `azureml_py36`. This kernel contains many unnecessary packages. If you want to avoid a long image build time, you may want to create a new conda environment with the minimal packages needed for your scenario. 

It is important that the local environment matches the remote environment to avoid mismatch issues when submitting commands to the remote cluster. To help with this, we will use Azure ML Environments. 

In [None]:
pip install --upgrade dask[complete] lz4 distributed fastparquet pyarrow azureml-sdk[notebooks] azureml-dataprep[fuse]

In [None]:
pip uninstall azureml-samples -y

In [None]:
# Restart the kernel by rendering a small script in the cell output.
# NOTE(review): relies on the `Jupyter` JavaScript object being available in the
# front end - confirm it works in the UI you are using.
from IPython.core.display import HTML

restart_snippet = '<script>Jupyter.notebook.kernel.restart()</script>'
HTML(restart_snippet)

In [None]:
import os
import subprocess

# Rewrite the pattern 'websocket/|/ws/' to 'websocket/|/ws' in the nginx config
# of this machine and restart nginx so the change takes effect.
# NOTE(review): presumably required so the Dask dashboard's websocket route is
# proxied by the Compute Instance - confirm.
nginx_conf  = '/etc/nginx/nginx.conf'
staged_copy = 'setup/temp.conf'
staged_new  = 'setup/temp2.conf'

# Copy with sudo into the working tree so the config can be read from the notebook.
# check=True makes a failing command raise instead of being silently ignored.
subprocess.run(['sudo', 'cp', nginx_conf, staged_copy], check=True)
try:
    with open(staged_copy) as f:
        original = f.read()
finally:
    # always clean up the temporary copy, even if reading failed
    os.remove(staged_copy)

patched = original.replace('websocket/|/ws/', 'websocket/|/ws')

if patched != original:
    with open(staged_new, 'w') as f:
        f.write(patched)

    subprocess.run(['sudo', 'mv', staged_new, nginx_conf], check=True)
    subprocess.run(['sudo', 'service', 'nginx', 'restart'], check=True)
else:
    # Re-running the cell is a no-op: do not restart nginx when nothing changed.
    print('nginx config needs no change; nothing to do')

## Imports

Import all packages used in this notebook.

In [1]:
import glob
import os
import socket
import sys
import time
from datetime import datetime

import dask
import dask.dataframe as dd
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
from dask.distributed import Client
from IPython.core.display import HTML

from azureml.core import Workspace, Experiment, Dataset, Environment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import MpiConfiguration
from azureml.train.estimator import Estimator
from azureml.widgets import RunDetails

%matplotlib inline 

Failure while loading azureml_run_type_providers. Failed to load entrypoint hyperdrive = azureml.train.hyperdrive:HyperDriveRun._from_run_dto with exception cannot import name '_DistributedTraining'.


## Azure ML Setup

Get the workspace.

In [2]:
# Load the Azure ML workspace from the local configuration and display it.
ws = Workspace.from_config()
ws

Workspace.create(name='ncus-azuremlol', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copetersrg')

### Create environment 

Create the environment to be used on the remote cluster. 

In [3]:
# Reuse the registered 'pandas' environment when the workspace already has one;
# otherwise build it from the local 'azureml_py36' conda environment and register it.
env_name = 'pandas'

if env_name in ws.environments:
    env = ws.environments[env_name]
else:
    env = Environment.from_existing_conda_environment(env_name, 'azureml_py36')
    # mpi4py is needed on the remote cluster
    env.python.conda_dependencies.add_pip_package('mpi4py')
    env = env.register(ws)

(env.name, env.version)

('pandas', '1')

### Create dataset

Create the dataset to be used. 

In [4]:
dataset_name = 'noaa-isd-files'
data_paths   = ['https://azureopendatastorage.blob.core.windows.net/isdweatherdatacontainer/ISDWeather/year=*/month=*/*.parquet']
local_path   = '/mnt/data/noaa/isd'
remote_path  = '/datasets/noaa/isd'

# The local copy is read by the single-machine Pandas/Dask sections further down,
# so it is fetched whenever it is missing - independently of whether the dataset
# has already been registered in the workspace.
if not glob.glob(f'{local_path}/year=*/month=*/*.parquet'):
    # Create only the data directory and hand it to the current user instead of
    # making all of /mnt world-writable.
    status = os.system(f'sudo mkdir -p {local_path} && sudo chown "$(id -u):$(id -g)" {local_path}')
    if status != 0:
        raise RuntimeError(f'could not prepare {local_path} (exit status {status})')

    Dataset.File.from_files(data_paths, validate=False).download(local_path)

if dataset_name not in ws.datasets:
    # Copy the files into the workspace's default datastore and register the
    # dataset from there rather than from the public blob container.
    # NOTE(review): presumably so the cluster can read the files from the
    # datastore - confirm against setup/start.py.
    datastore = ws.get_default_datastore()
    datastore.upload(local_path, remote_path)
    ds = Dataset.File.from_files((datastore, remote_path))
    ds = ds.register(ws, dataset_name)
else:
    ds = ws.datasets[dataset_name]

ds

{
  "source": [
    "('workspaceblobstore', 'datasets/noaa/isd')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ],
  "registration": {
    "id": "4fe0f07c-e3d8-4fab-90e4-fda48cfab8b9",
    "name": "noaa-isd-files",
    "version": 1,
    "workspace": "Workspace.create(name='ncus-azuremlol', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copetersrg')"
  }
}

### Create VM pool

Create Azure ML VM pool for creating remote dask cluster(s).

In [5]:
# Reuse the compute target named 'dask-pool' if the workspace has one,
# otherwise provision it and wait until it is ready.
pool_name = 'dask-pool'

if pool_name in ws.compute_targets:
    ct = ws.compute_targets[pool_name]
else:
    # provisioning settings for the Azure ML cluster; change properties as needed
    config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS13_V2',
        max_nodes=100,
        vnet_resourcegroup_name=ws.resource_group,
        vnet_name='dask-vnet',
        subnet_name='default',
    )

    ct = ComputeTarget.create(ws, pool_name, config)
    ct.wait_for_completion(show_output=True)

ct

AmlCompute(workspace=Workspace.create(name='ncus-azuremlol', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copetersrg'), name=dask-pool, id=/subscriptions/6560575d-fa06-4e7d-95fb-f962e74efd7a/resourceGroups/copetersrg/providers/Microsoft.MachineLearningServices/workspaces/ncus-azuremlol/computes/dask-pool, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None)

## Startup cluster

In [6]:
# Submit setup/start.py as a 50-node MPI job on the pool.
exp_name = 'dask2020'

# command-line arguments handed to start.py
script_params = {
    '--datastore': ws.get_default_datastore(),
    #'--script'   : 'run.py' # run code in run.py on cluster and teardown (batch processing)
}

est = Estimator(
    'setup',
    compute_target=ct,
    entry_script='start.py',
    environment_definition=env,
    script_params=script_params,
    node_count=50,
    distributed_training=MpiConfiguration(),
)

#run = next(ws.experiments[exp_name].get_runs()) # use this to get existing run (if kernel restarted, etc)
experiment = Experiment(ws, exp_name)
run = experiment.submit(est)
run



Experiment,Id,Type,Status,Details Page,Docs Page
dask2020,dask2020_1578139647_690d471d,azureml.scriptrun,Starting,Link to Azure Machine Learning studio,Link to Documentation


In [8]:
# Escape hatch for abandoning the run that was just submitted.
# It is disabled by default: cancelling here would leave the later cells that
# use `run` (RunDetails, connecting to the scheduler) without a live cluster
# when the notebook is executed top to bottom.
cancel_now = False

if cancel_now:
    run.cancel()

## Dataset overview

This uses an [Azure Open Dataset](https://azure.microsoft.com/services/open-datasets/catalog/) of [NOAA Integrated Surface Data (ISD)](https://azure.microsoft.com/services/open-datasets/catalog/noaa-integrated-surface-data/) containing worldwide hourly weather data such as temperature, precipitation, and wind. 

Expanded in memory, the full dataset is ~660 GB. It is stored in compressed parquet files in a blob container partitioned by year and month. The dataset is updated daily. Compressed, the files for the dataset are ~8 GB. Uncompressed, the files for the dataset are ~150-200 GB.  

Specific years and months can be specified by `year=*/month=*/part-*.snappy.parquet`. 

The data begins in 2008 and contains 1 file per month. Each file can contain ~5 GB of data when in a dataframe in memory.

In [None]:
# List every local parquet file (all year=/month= folders) and time the lookup.
%time files = glob.glob(f'{local_path}/year=*/month=*/*.parquet', recursive=True)
files

## Pandas

In [None]:
# Narrow the file list to the year=2019/month=12 folder.
files = glob.glob(f'{local_path}/year=2019/month=12/*.parquet', recursive=True) # 1 month of data
files

In [None]:
# Load the first matching file into a Pandas dataframe and preview it.
df = pd.read_parquet(files[0])
df.head()

In [None]:
# Time counting the rows of the Pandas dataframe.
%time len(df)

In [None]:
# Time computing summary statistics with Pandas.
%time df.describe()

In [None]:
# Total in-memory size in bytes (deep=True also measures object columns), reported in GB.
%time bites = df.memory_usage(index=True, deep=True).sum()
print(f'Dataframe is: {round(bites/1e9, 2)}GB')

In [None]:
# Mean of each column per calendar day (timestamps floored to the day).
%time means = df.groupby(df.datetime.dt.floor('d')).mean()
means.head()

In [None]:
# Draw one figure per column of the daily means, limited to December 2019.
# optional dark theme: plt.style.use('dark_background')
for col in means.columns:
    fig, ax = plt.subplots(figsize=(16, 8))
    means[col].plot(color='b', ax=ax)
    ax.set_title(f'Average of {col}')
    ax.set_xlim([datetime(2019, 12, 1), datetime(2019, 12, 31)])
    ax.grid()

## Scaling with Dask

The first step in scaling up with Dask is to simply get a bigger VM. For non-GPU Compute Instances, the max is the `STANDARD_DS15_V2` with 20 cores and 140 GB of RAM. This is suitable for interactive querying and data preparation on about 1 year of the weather data, but not on the full dataset.

In [None]:
# Start a Dask client on this VM with its dashboard bound to a fixed port.
dashboard_port = 9898

c = Client(dashboard_address=f':{dashboard_port}')
print(f'\n\n{c}')

# need to get the dashboard link 
# The URL is built from this machine's hostname, the dashboard port and the workspace location.
dashboard_url = f'https://{socket.gethostname()}-{dashboard_port}.{ws.get_details()["location"]}.instances.azureml.net/status'
HTML(f'<a href="{dashboard_url}">Dashboard link</a>')

### Take some data

Take some data. The below cells may fail on smaller VMs. 1 year works semi-reliably on a `STANDARD_DS15_V2`, although computing the means fails sometimes.

You can use the dashboard to understand what is going on with this VM being used as a "local" cluster. 

In [None]:
# First and last year (both inclusive) to load in the local Dask section.
start = 2019
end   = 2019

In [None]:
# Build a Dask dataframe over all month folders of the selected years, then time a preview.
df = dd.read_parquet([f'{local_path}/year={year}/month=*/*.parquet' for year in range(start, end+1)], engine='pyarrow')
%time df.head()

In [None]:
# Time counting the rows of the Dask dataframe.
%time len(df)

In [None]:
# Time computing summary statistics with Dask.
%time df.describe().compute()

In [None]:
# Total in-memory size in bytes across all partitions, reported in GB.
%time bites = df.memory_usage(index=True, deep=True).sum().compute()
print(f'Dataframe is: {round(bites/1e9, 2)}GB')

In [None]:
# Mean of each column per calendar day; the result is materialised as a Pandas dataframe.
%time means = df.groupby(df.datetime.dt.floor('d')).mean().compute() # slow, prone to error 
means.head()

In [None]:
# Draw one figure per column of the daily means, spanning the selected years.
# optional dark theme: plt.style.use('dark_background')
for col in means.columns:
    fig, ax = plt.subplots(figsize=(16, 8))
    means[col].plot(color='b', ax=ax)
    ax.set_title(f'Average of {col}')
    ax.set_xlim([datetime(start, 1, 1), datetime(end, 12, 31)])
    ax.grid()

In [None]:
# Close the local Dask client before connecting to the remote cluster.
c.close()

## Scale up with Dask and Azure ML

In [None]:
# Show the Azure ML run widget for the cluster run submitted earlier.
RunDetails(run).show()

### Connect to cluster

In [None]:
# port to forward the dask dashboard to on the compute instance
# we do not use 8787 because it is already in use 
dashboard_port = 4321

# Run states from which the scheduler address will never be published.
terminal_states = ('Canceled', 'Failed', 'Completed')
# Metrics read below; both must be present before connecting.
required_metrics = ('scheduler', 'dashboard')

print("waiting for scheduler node's ip")
status = run.get_status()
metrics = run.get_metrics()
while status not in terminal_states and not all(key in metrics for key in required_metrics):
    print('.', end="")
    time.sleep(5)
    status = run.get_status()
    metrics = run.get_metrics()

if status in terminal_states:
    # Stop here with a clear message instead of failing later with a KeyError
    # or a connection timeout.
    raise RuntimeError(f'Run is in state {status!r}; submit a new run before connecting to the cluster.')

print(f'Setting up port forwarding...')
os.system(f'killall socat') # kill all socat processes - cleans up previous port forward setups 
os.system(f'setsid socat tcp-listen:{dashboard_port},reuseaddr,fork tcp:{metrics["dashboard"]} &')
print(f'Cluster is ready to use.')

c = Client(f'tcp://{metrics["scheduler"]}')

print(f'\n\n{c}')

c.restart()

# need to get the dashboard link 
dashboard_url = f'https://{socket.gethostname()}-{dashboard_port}.{ws.get_details()["location"]}.instances.azureml.net/status'
HTML(f'<a href="{dashboard_url}">Dashboard link</a>')

### Take all the data

In [None]:
# Data directory as seen from the cluster: the 'datastore' value logged by the run plus remote_path.
rrpath = f'{run.get_metrics()["datastore"]}{remote_path}'
rrpath

In [None]:
# Run os.listdir as a Dask task so the directory is listed through the connected client.
dask.delayed(os.listdir)(rrpath).compute()

In [None]:
%time files = dask.delayed(glob.glob)(f'{rrpath}/year=*/month=*/*.parquet', recursive=True).compute()
%time files = dask.delayed(glob.glob)(f'{rrpath}*.parquet', recursive=True).compute()

files[-5:]

In [None]:
# One delayed pd.read_parquet per file, combined into a single Dask dataframe; then time a preview.
df = dd.from_delayed([dask.delayed(pd.read_parquet)(file) for file in files])
%time df.head()

In [None]:
# Index the data by day (datetime floored to 'd') and keep the result in cluster memory.
df = df.set_index(dd.to_datetime(df.datetime).dt.floor('d'), sorted=False).persist() # persist and sort data by day 
#df = df.persist()
%time len(df)

In [None]:
# Time the row count again.
# NOTE(review): presumably repeated to show the timing once the data is persisted - confirm.
%time len(df)

In [None]:
# Time computing summary statistics on the cluster.
%time df.describe().compute()

In [None]:
# Total in-memory size in bytes across the cluster, reported in GB.
%time bites = df.memory_usage(index=True, deep=True).sum().compute()
print(f'Dataframe is: {round(bites/1e9, 2)}GB')

In [None]:
# Mean of each column per calendar day; the result is materialised as a Pandas dataframe.
%time means = df.groupby(df.datetime.dt.floor('d')).mean().compute()
means.head()

In [None]:
# Draw one figure per column of the daily means, spanning 2008 through 2019.
# optional dark theme: plt.style.use('dark_background')
for col in means.columns:
    fig, ax = plt.subplots(figsize=(16, 8))
    means[col].plot(color='b', ax=ax)
    ax.set_title(f'Average of {col}')
    ax.set_xlim([datetime(2008, 1, 1), datetime(2019, 12, 31)])
    ax.grid()

    # optionally, log the image to the run
    #run.log_image(f'mean_{col}', plot=plt)

In [None]:
# Display the run object.
run

## Prepare data

In [None]:
## insert any Pandas-like Dask data prep code 
df['temperature'] = df['temperature']*(9/5)+32       # 'Merica
means['temperature'] = means['temperature']*(9/5)+32 # skip recomputing this 

In [None]:
# Plot the converted daily mean temperature for 2008 through 2019.
fig, ax = plt.subplots(figsize=(16, 8))
means.temperature.plot(color='b', ax=ax)
ax.set_title('Real average of temperature')
ax.set_xlim([datetime(2008, 1, 1), datetime(2019, 12, 31)])
ax.set_ylabel('Temperature in \u00B0F')
ax.grid()

## Write data

**Important:** this is slow and will put about 200 GB of data in your default storage. The cell below writes LZ4-compressed Parquet files, not CSVs.

In [None]:
# Run df.to_parquet as a Dask task, writing LZ4-compressed Parquet to
# datasets/isd2 below the 'datastore' path logged by the run.
# NOTE(review): a leading '/' is prepended here, unlike where rrpath is built - confirm the metric's format.
%time dask.delayed(df.to_parquet)(f'/{run.get_metrics()["datastore"]}/datasets/isd2', compression='lz4').compute()

## Create dataset

In [None]:
# Build a file dataset over the Parquet files written by the "Write data" cell
# (datasets/isd2 below the datastore path). The previous glob pointed at
# '/dask/output/noaa/**/*.csv', a location and format this notebook never writes.
# NOTE(review): assumes the run's 'datastore' path is the root of the default
# datastore - confirm against setup/start.py.
# `Dataset` is already imported in the imports cell at the top of the notebook.
dset = Dataset.File.from_files((ws.get_default_datastore(), 'datasets/isd2/**/*.parquet'))

In [None]:
# Register the dataset in the workspace under the name 'real-weather-files' and display it.
dset = dset.register(ws, 'real-weather-files')
dset

## End the run

Cluster will return to 0 nodes

In [None]:
# Cancel the cluster run; this shuts down the remote Dask cluster.
run.cancel()