## Connect to cluster

In [1]:
import os
import dask
import time
import joblib
import fsspec

import pandas as pd
import dask.dataframe as dd

from datetime import datetime
from dask.distributed import Client

from azureml.core import Run, Model

In [2]:
# Fetch the Azure ML run this notebook is attached to; later cells read the
# scheduler address and the default datastore from it.
try:
    run = Run.get_context()
except Exception as err:  # narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate
    run = None  # keep the name bound so the display line below cannot raise NameError
    print('Not in a run -- follow link from the cluster widget')
    print(f'Run.get_context() failed: {err!r}')

run

Experiment,Id,Type,Status,Details Page,Docs Page
dask-cloudprovider,dask-cloudprovider_1584307902_4f5fe6b5,azureml.scriptrun,Running,Link to Azure Machine Learning studio,Link to Documentation


In [3]:
# The run's metrics expose the Dask scheduler address under the "scheduler" key.
scheduler_address = str(run.get_metrics()["scheduler"])
c = Client(scheduler_address)
c

0,1
Client  Scheduler: tcp://10.0.0.11:8786  Dashboard: http://10.0.0.11:8787/status,Cluster  Workers: 39  Cores: 624  Memory: 4.61 TB


In [4]:
# Restart through the client before any data is loaded; the summary shown
# afterwards is the same cluster as above.
c.restart()

0,1
Client  Scheduler: tcp://10.0.0.11:8786  Dashboard: http://10.0.0.11:8787/status,Cluster  Workers: 39  Cores: 624  Memory: 4.61 TB


## Read data

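Conveniently, the datastore is mounted, so it can be viewed and operated on as if it were local. However, to read the data into the distributed cluster, we will use ADLS's HDFS-compatible driver (the `abfs` protocol set below), passing the storage account credentials as storage options.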

In [5]:
# Look the workspace's default datastore up once and reuse it, instead of
# repeating the lookup for every attribute.
datastore = run.experiment.workspace.get_default_datastore()

# Credentials forwarded to the filesystem layer via `storage_options`.
# NOTE: this dict contains the storage account key -- never display or log it.
STORAGE_OPTIONS = {
    'account_name': datastore.account_name,
    'account_key' : datastore.account_key,
}

protocol  = 'abfs' # change to 'adl' for gen 1
container = datastore.container_name

In [6]:
# Read options for the CSV extracts.
blocksize = '5GB'
# Read these two columns as object dtype.
dtypes    = dict(cloudCoverage='object', usaf='object')

# Glob over every `*-data.csv` file under noaa-isd-csv/ in the container.
datapath  = f'{protocol}://{container}/noaa-isd-csv/*-data.csv'

datapath

'abfs://default/noaa-isd-csv/*-data.csv'

In [7]:
df = dd.read_csv(datapath, blocksize=blocksize, dtype=dtypes, storage_options=STORAGE_OPTIONS).repartition(npartitions=1000).persist()
%time df.head()

CPU times: user 644 ms, sys: 40 ms, total: 684 ms
Wall time: 1min 1s


Unnamed: 0.1,Unnamed: 0,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,...,pastWeatherIndicator,precipTime,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version
0,2008-01-10,999999,53878,2008-01-10 18:30:00,35.419,-82.557,641.0,,,45.32,...,,,,,ASHEVILLE 13 S,US,999999-53878,2008,10,1.0
1,2008-01-06,999999,53877,2008-01-06 13:45:00,35.495,-82.614,656.0,,,34.52,...,,,,,ASHEVILLE 8 SSW,US,999999-53877,2008,6,1.0
2,2008-01-29,999999,53878,2008-01-29 14:35:00,35.419,-82.557,641.0,,,45.14,...,,,,,ASHEVILLE 13 S,US,999999-53878,2008,29,1.0
3,2008-01-04,999999,53877,2008-01-04 05:25:00,35.495,-82.614,656.0,,,11.48,...,,,,,ASHEVILLE 8 SSW,US,999999-53877,2008,4,1.0
4,2008-01-12,999999,53878,2008-01-12 00:30:00,35.419,-82.557,641.0,,,42.26,...,,,,,ASHEVILLE 13 S,US,999999-53878,2008,12,1.0


In [17]:
# Settings for the Parquet copy written in the next cell.
partition_on = ['stationName']
compression  = 'lz4'
# NOTE: `datapath` is rebound here; it no longer refers to the CSV glob.
datapath     = f'{protocol}://{container}/noaa-isd-by-station/'

## Cache 

In [None]:
# Write the frame as Parquet (no index column), one directory level per
# value of the `partition_on` columns.
parquet_kwargs = dict(
    compression=compression,
    partition_on=partition_on,
    write_index=False,
    storage_options=STORAGE_OPTIONS,
)
df.to_parquet(datapath, **parquet_kwargs)

In [None]:
df = df.set_index(df.stationName.fillna('None'), sorted=False)
%time len(df)

In [None]:
# Repeat the timed row count from the previous cell for comparison.
%time len(df)

In [None]:
# Partition count of the re-indexed frame.
df.npartitions

## Explore data

In [None]:
# Distinct station names, materialised on the client as a plain list.
unique_stations = df.stationName.unique()
stations = list(unique_stations.compute())
stations[-5:]

In [None]:
# Pull the rows of the first station in the list into a local frame.
is_first_station = df.stationName == stations[0]
df_station = df[is_first_station].compute()
df_station.info()

In [None]:
# Replace every missing value with 0 (crude imputation, applied to all columns).
# NOTE: rebinding `df_station` means the `.info()` output above describes the pre-fill frame.
df_station = df_station.fillna(0)

In [None]:
# Keep the non-object columns, minus bookkeeping fields that are not used as model inputs.
non_features = ['version', 'datetime']
cols = [
    col
    for col in df_station.columns
    if df_station.dtypes[col] != 'object' and col not in non_features
]
cols

In [None]:
# Features are every selected column except the target; the target is temperature.
feature_cols = [col for col in cols if col != 'temperature']
X = df_station[feature_cols]
y = df_station['temperature']

In [None]:
from xgboost import XGBRegressor

In [None]:
# Fit a 16-estimator regressor on the single-station frame and time the fit.
xgb = XGBRegressor(n_estimators=16)
%time xgb.fit(X, y)

In [None]:
# Predict on the same rows the model was fitted on (training-set predictions).
%time y_pred = xgb.predict(X)

In [None]:
# Root-mean-squared error on the training rows; printed only, not logged to the run.
squared_error = (y - y_pred) ** 2
rmse = squared_error.mean() ** .5
print(f'Training RMSE: {round(rmse, 3)}')

In [None]:
# Serialize the fitted model with joblib, then immediately load it back from
# the same file (presumably a check that the file round-trips -- confirm).
model_path = 'xgboost_noaa_isd.joblib.dat'
joblib.dump(xgb, model_path)
xgb = joblib.load(model_path)

In [None]:
# Register the serialized model file in the run's workspace.
workspace = run.experiment.workspace
model = Model.register(
    workspace,
    model_name='xgboost-noaa-isd',
    model_path=model_path,
    description='Dask XGBoost NOAA ISD temperature predictor',
    model_framework='DaskML.XGBoostRegressor',
)

In [None]:
# Directory for the per-station model files. `exist_ok=True` makes the cell
# re-runnable: plain os.mkdir raises FileExistsError on the second execution.
os.makedirs('models', exist_ok=True)

In [None]:
@dask.delayed
def train_on_station(station):
    """Train, score and register an XGBoost temperature model for one station.

    The function is wrapped in ``dask.delayed``: calling it only builds a
    task; the work happens when ``.compute()`` is called on the result.

    Parameters
    ----------
    station
        Value compared against ``df.stationName`` to select rows. It is
        sliced with ``[:16]`` (so presumably a string -- confirm); only that
        prefix is used in the metric name, the file name and the model name.

    Returns
    -------
    None
        All results are side effects: a ``<prefix>_RMSE`` metric logged on
        ``run``, a joblib file under ``models/`` and a model registered in
        the run's workspace.

    Notes
    -----
    Relies on the notebook globals ``df`` and ``run``. The RMSE is computed
    on the same rows the model was fitted on.
    """
    station_tag = station[:16]

    # Materialise this station's rows locally and 0-fill missing values.
    df_station = df[df.stationName == station].compute()
    df_station = df_station.fillna(0)

    # Non-object columns, minus bookkeeping fields.
    cols = [
        col
        for col in df_station.columns
        if df_station.dtypes[col] != 'object' and col not in ['version', 'datetime']
    ]

    X = df_station[[col for col in cols if col not in ['temperature']]]
    y = df_station.temperature

    xgb = XGBRegressor(n_estimators=16)
    xgb.fit(X, y)

    y_pred = xgb.predict(X)

    rmse = (((y-y_pred)**2).mean())**.5
    run.log(f'{station_tag}_RMSE', rmse)

    # The task may execute on a cluster worker whose working directory has no
    # 'models' folder (the notebook only creates it on the client), so make
    # sure it exists before writing.
    os.makedirs('models', exist_ok=True)
    model_path = f'models/{station_tag}_xgboost_noaa_isd.joblib.dat'
    joblib.dump(xgb, model_path)

    Model.register(run.experiment.workspace,
                   model_name      = f'{station_tag}-xgboost-noaa-isd',
                   model_path      = model_path,
                   description     = 'Dask XGBoost NOAA ISD temperature predictor',
                   model_framework = 'DaskML.XGBoostRegressor')

In [None]:
# Build the delayed task for the first station and execute it.
train_on_station(stations[0]).compute()

## Train XGBoost model

The cells below are for demo purposes only. The code is not good, and neither is the data science — for example, the model is scored on the same rows it was trained on.

In [None]:
# begin data prep for ML 
# Replace every missing value in the full Dask frame with 0.
# NOTE: rebinds `df`, so cells above that are re-run afterwards see the filled frame.
df = df.fillna(0)

In [None]:
# Same column selection as for the single station, applied to the full frame:
# non-object columns, minus bookkeeping fields.
non_features = ['version', 'datetime']
cols = [
    col
    for col in df.columns
    if df.dtypes[col] != 'object' and col not in non_features
]
cols

In [None]:
# Persist the feature matrix and the target on the cluster.
feature_cols = [col for col in cols if col != 'temperature']
X = df[feature_cols].persist()
y = df['temperature'].persist()
# end data prep for ML

In [None]:
xgb = XGBRegressor(n_estimators=16)
%time xgb.fit(X, y)

In [None]:
# Predict on the training features and materialise the result on the client.
%time y_pred = xgb.predict(X).compute()

In [None]:
# Training RMSE, logged on the run and printed.
y_true = y.to_dask_array().compute()  # runs locally, distribute (?)
squared_error = (y_true - y_pred) ** 2
rmse = squared_error.mean() ** .5
run.log('RMSE', rmse)
print(f'Training RMSE: {round(rmse, 3)}')

## Register model

In [None]:
# Serialize the fitted model with joblib, then load it back from the same file.
# NOTE: same file name as the single-station model saved earlier, so that file is overwritten.
model_path = 'xgboost_noaa_isd.joblib.dat'
joblib.dump(xgb, model_path)
xgb = joblib.load(model_path)

In [None]:
# Register the serialized model file in the run's workspace, under the same
# model name used for the single-station model earlier.
workspace = run.experiment.workspace
model = Model.register(
    workspace,
    model_name='xgboost-noaa-isd',
    model_path=model_path,
    description='Dask XGBoost NOAA ISD temperature predictor',
    model_framework='DaskML.XGBoostRegressor',
)