## Interactively install packages as needed

The cells below show how to install or upgrade packages on the cluster interactively using pip. No kernel restart is needed.

In [None]:
import os
import dask
from dask.distributed import Client
from azureml.core import Run

In [None]:
packages = 'matplotlib'

def update_packages(packages):
    """Install or upgrade packages with pip in the environment of the calling process.

    Parameters
    ----------
    packages : str or iterable of str
        Either a whitespace-separated requirement string (shell-style quoting
        is honoured, e.g. ``'matplotlib "pandas>=1.0"'``) or an iterable of
        individual requirement strings.

    Returns
    -------
    int
        pip's exit status (0 on success). 0 is also returned, without running
        pip, when no packages were requested.
    """
    # Imported locally so the function is self-contained when it is shipped
    # to remote workers.
    import shlex
    import subprocess
    import sys

    if isinstance(packages, str):
        requested = shlex.split(packages)
    else:
        requested = list(packages)

    if not requested:
        return 0

    # `sys.executable -m pip` targets the interpreter that is running this
    # function; passing an argument list (no shell) keeps requirement strings
    # from being interpreted as shell syntax.
    command = [sys.executable, '-m', 'pip', 'install', '--upgrade', *requested]
    return subprocess.call(command)

In [None]:
# Handle to the Azure ML run this notebook is executing in.
run   = Run.get_context()
# Number of install tasks submitted below; presumably the number of workers
# in the cluster -- TODO confirm.
nodes = 25

In [None]:
# The scheduler address is read from the metrics logged on the run.
scheduler_address = str(run.get_metrics()["scheduler"])
c = Client(scheduler_address)
c

In [None]:
# Client.run executes the function once on every worker currently connected to
# the scheduler, outside of task scheduling. Submitting N delayed tasks instead
# leaves placement to the scheduler, so some workers could run the install
# several times while others never run it.
install_results = c.run(update_packages, packages)
install_results  # maps each worker address to what update_packages returned

In [None]:
# Restart the workers; presumably so that fresh worker processes pick up the
# packages installed above -- TODO confirm.
c.restart()

## Connect to cluster

In [1]:
import os
import glob
import dask
import time
import joblib
import fsspec
import socket
import matplotlib

import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt

from datetime import datetime
from dask.distributed import Client
from IPython.core.display import HTML
from dask_ml.lightgbm import LGBMRegressor

from azureml.core import Run, Model

%matplotlib inline

In [2]:
# Handle to the Azure ML run this notebook is executing in; the bare name on
# the last line displays it as the cell output.
run = Run.get_context()
run

Experiment,Id,Type,Status,Details Page,Docs Page
dask-cloudprovider,dask-cloudprovider_1581570173_e99ec0fa,azureml.scriptrun,Running,Link to Azure Machine Learning studio,Link to Documentation


In [3]:
# Connect to the Dask scheduler whose address was logged as a run metric.
scheduler_address = str(run.get_metrics()["scheduler"])
c = Client(scheduler_address)
c

0,1
Client  Scheduler: tcp://10.12.0.18:8786  Dashboard: http://10.12.0.18:8787/status,Cluster  Workers: 25  Cores: 200  Memory: 1.48 TB


## Read data

Conveniently, the datastore is mounted, so it can be viewed and operated on as if it were local. However, to read the data into the distributed cluster, we will use ADLS's HDFS driver.

In [4]:
# Credentials come from the workspace datastore registered as 'gen2'.
gen2_datastore = run.experiment.workspace.datastores['gen2']

# Holds the storage account key: do not display or log this dict.
STORAGE_OPTIONS = dict(
    account_name=gen2_datastore.account_name,
    account_key=gen2_datastore.account_key,
)

protocol  = 'abfs'      # change to 'adl' for gen 1
container = 'datasets'

In [5]:
# Filesystem object for the storage container; used below to list the
# parquet files with glob patterns.
fs = fsspec.filesystem(protocol, **STORAGE_OPTIONS, container_name=container)

In [10]:
# One glob per year/month partition, kept in chronological order, then each
# match is turned into a full URL by prefixing the protocol and container.
years  = range(2008, 2020+1)
months = range(1, 12+1)

matches = [
    path
    for year in years
    for month in months
    for path in fs.glob(f'noaa-isd/year={year}/month={month}/*.parquet')
]

files = [f'{protocol}://{container}/{path}' for path in matches]
files[-5:]

['abfs://datasets/noaa-isd/year=2020/month=2/part-00003-tid-4372725586789113206-3e6d3a39-d431-4724-b506-4e73a1f2101e-1905-2.c000.snappy.parquet',
 'abfs://datasets/noaa-isd/year=2020/month=2/part-00004-tid-4372725586789113206-3e6d3a39-d431-4724-b506-4e73a1f2101e-1906-2.c000.snappy.parquet',
 'abfs://datasets/noaa-isd/year=2020/month=2/part-00005-tid-4372725586789113206-3e6d3a39-d431-4724-b506-4e73a1f2101e-1904-2.c000.snappy.parquet',
 'abfs://datasets/noaa-isd/year=2020/month=2/part-00006-tid-4372725586789113206-3e6d3a39-d431-4724-b506-4e73a1f2101e-1901-2.c000.snappy.parquet',
 'abfs://datasets/noaa-isd/year=2020/month=2/part-00007-tid-4372725586789113206-3e6d3a39-d431-4724-b506-4e73a1f2101e-1902-2.c000.snappy.parquet']

In [14]:
df = dd.read_parquet(files, storage_options=STORAGE_OPTIONS).repartition(npartitions=200).persist()
%time df.head()

ERROR - Client-Request-ID=bb42350c-4e22-11ea-8067-000d3aae19b0 Retry policy did not allow for a retry: Server-Timestamp=Thu, 13 Feb 2020 05:37:09 GMT, Server-Request-ID=2c4a76a4-b01e-003a-152f-e2bf57000000, HTTP status code=500, Exception=Server encountered an internal error. Please try again after some time. ErrorCode: InternalError<?xml version="1.0" encoding="utf-8"?><Error><Code>InternalError</Code><Message>Server encountered an internal error. Please try again after some time.RequestId:2c4a76a4-b01e-003a-152f-e2bf57000000Time:2020-02-13T05:37:10.1785922Z</Message></Error>.


AzureHttpError: Server encountered an internal error. Please try again after some time. ErrorCode: InternalError
<?xml version="1.0" encoding="utf-8"?><Error><Code>InternalError</Code><Message>Server encountered an internal error. Please try again after some time.
RequestId:2c4a76a4-b01e-003a-152f-e2bf57000000
Time:2020-02-13T05:37:10.1785922Z</Message></Error>

In [None]:
df = df.set_index(dd.to_datetime(df.datetime).dt.floor('d'), sorted=True).persist()
%time len(df)

In [None]:
# Time the row count a second time; presumably to compare against the first
# run once the persisted frame is available -- TODO confirm.
%time len(df)

## Explore data

In [None]:
# Summary statistics for df; .compute() evaluates the lazy result so it is
# displayed in the notebook.
%time df.describe().compute()

In [None]:
# Mean longitude, latitude and year for each index value of df; the computed
# result feeds the scatter plots below.
%time places = df.groupby(df.index)[['longitude', 'latitude', 'year']].mean().compute()

In [None]:
# Scatter of the averaged positions, coloured by the averaged year.
fig, ax = plt.subplots(figsize=(16, 16))
points = ax.scatter(places.longitude, places.latitude, c=places.year)
ax.set_title('Lat/long')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid()
fig.colorbar(points, ax=ax)

In [None]:
# Same scatter as above, restricted to a longitude/latitude window.
fig, ax = plt.subplots(figsize=(16, 16))
points = ax.scatter(places.longitude, places.latitude, c=places.year)
ax.set_title('Lat/long')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_xlim([-50, -30])  # zoom in
ax.set_ylim([35, 40])    # zoom in
ax.grid()
fig.colorbar(points, ax=ax)

# Attach the current figure to the run as an image.
run.log_image('lat_long_zoomed', plot=plt)

In [None]:
# Per-index-value mean of the columns of df, computed on the cluster and
# returned to the notebook; the first rows are shown as a sanity check.
%time means = df.groupby(df.index).mean().compute()
means.head()

In [None]:
# One time-series figure per averaged column, each also logged to the run.
x_start, x_end = datetime(2008, 1, 1), datetime(2020, 2, 1)

for col in means.columns:
    fig, ax = plt.subplots(figsize=(16, 8))
    means[col].plot(ax=ax, color='b')
    ax.set_title(f'Average of {col}')
    ax.set_xlim(x_start, x_end)
    ax.grid()

    # optionally, log the image to the run
    run.log_image(f'mean_{col}', plot=plt)

You can see the images logged to the run in the studio.

In [None]:
# Display the run so its details can be opened from the cell output.
run

## Transform data

In [None]:
## insert any Pandas-like Dask transformation code
# Scale temperature by 9/5 and add 32.
# Caution: this rewrites the column in terms of itself, so running the cell
# twice applies the conversion twice.
df = df.assign(temperature=df['temperature']*(9/5)+32) # 'Merica

## Take subset of data to Pandas

In [None]:
# Pull only the rows whose station name mentions one of the selected states
# into a local pandas DataFrame (rows with a missing name are excluded).
#
# The result gets its own name on purpose: later cells call Dask-only methods
# (.persist(), .to_dask_array(), .compute() on predictions) on `df` and objects
# derived from it, so `df` must stay a Dask DataFrame.
station_pattern = 'FLORIDA|WASHINGTON|TEXAS'
is_selected_station = df['stationName'].str.contains(station_pattern, regex=True, na=False)

df_subset = df[is_selected_station].compute()
df_subset.head()

In [None]:
# Print a summary of the frame currently bound to `df`.
df.info()

## Write data

**Warning**: This will write ~150 GB of CSV files to your ADLSv2 storage account. This will be used to demonstrate Dask + Azure ML on an artificially larger set of files. This will take ~30 mins.

In [None]:
# Left commented out on purpose (see the warning above about output size and
# runtime); uncomment the line below to actually write the CSV files.
#%time df.to_csv(f'{protocol}://{container}/noaa-isd-csv/part-*-data.csv', storage_options=STORAGE_OPTIONS)

## Train LightGBM model

The below cells are for demo purposes only. The code is not good. The data science is not good. 

In [None]:
# begin data prep for ML
# Replace every missing value with 0 before building the feature matrix.
df = df.fillna(0)

In [None]:
# Candidate model columns: everything that is not object-typed, minus the
# bookkeeping columns listed in `excluded`.
excluded = ['version', 'datetime']
cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype != 'object' and name not in excluded
]
cols

In [None]:
# Features are all candidate columns except the target; the target is the
# temperature column. Both are persisted.
target = 'temperature'
feature_cols = [col for col in cols if col != target]

X = df[feature_cols].persist()
y = df[target].persist()
# end data prep for ML

In [None]:
# NOTE: despite its name, `xgb` holds an LGBMRegressor (imported from
# dask_ml.lightgbm), not an XGBoost model. The name is kept because later
# cells refer to it.
xgb = LGBMRegressor(n_estimators=16)
%time xgb.fit(X, y)

In [None]:
# Predict on the training features and bring the result back to the notebook;
# this is an in-sample prediction, not a held-out evaluation.
%time y_pred = xgb.predict(X).compute()

In [None]:
# Root-mean-squared error on the training data, computed in the notebook
# process after pulling the targets back from the cluster.
y_true = y.to_dask_array().compute()  # runs locally, distribute (?)
squared_error = (y_true - y_pred)**2
rmse = squared_error.mean()**.5

run.log('RMSE', rmse)
print(f'Training RMSE: {round(rmse, 3)}')

## Register model

In [None]:
# File the fitted model is serialised to; the same path is used when the
# model is registered below.
model_path = 'xgboost_noaa_isd.joblib.dat'
joblib.dump(xgb, model_path)
# Load it straight back, presumably to check that the file round-trips --
# TODO confirm. joblib.load unpickles, so only use it on files you trust.
xgb = joblib.load(model_path)

In [None]:
# Register the serialised model file with the workspace of the current run.
# The description and framework label now say LightGBM, because the estimator
# trained in this notebook is an LGBMRegressor rather than an XGBoost model.
# NOTE(review): model_name (and the file name in model_path) still say
# "xgboost"; they are left unchanged because anything that looks the model up
# by name would break -- rename deliberately if nothing depends on it.
model = Model.register(run.experiment.workspace,
                       model_name      = 'xgboost-noaa-isd',
                       model_path      = model_path,
                       description     = 'Dask LightGBM NOAA ISD temperature predictor',
                       model_framework = 'DaskML.LGBMRegressor')