In [None]:
from azureml.core import Run

# Fetch the Azure ML run context for this process. Later cells use it to reach
# the workspace (run.experiment.workspace) and the Dask scheduler address
# (run.get_metrics()['scheduler']).
run = Run.get_context()
run  # last expression: shown as the cell output

In [None]:
# Workspace that owns this run's experiment; it is passed to Model.register
# at the end of the notebook.
ws = run.experiment.workspace
ws  # last expression: shown as the cell output

In [None]:
from dask.distributed import Client

# The scheduler address is read from the run's metrics under the 'scheduler'
# key and handed to the Dask distributed client.
scheduler_address = run.get_metrics()['scheduler']
c = Client(scheduler_address)
c

In [None]:
from azureml.core import Dataset 

# Root of the public ISD weather container; the parquet files are matched
# with a glob over the year=/month= partition folders.
data_url = 'https://azureopendatastorage.blob.core.windows.net/isdweatherdatacontainer/ISDWeather'
parquet_glob = f'{data_url}/year=*/month=*/*.parquet'

# validate=False is passed through unchanged (presumably skips the up-front
# load check on the source files — confirm against the SDK docs).
ds = Dataset.Tabular.from_parquet_files(parquet_glob, validate=False)
ds

In [None]:
# Build the Dask DataFrame that every later cell works on.
# No visible cell assigned `df`, so this cell failed with NameError on a
# fresh kernel, and the dataset `ds` created above was never used.
# NOTE(review): assumes `df` is meant to be the lazy Dask view of `ds` — confirm.
df = ds.to_dask_dataframe()
df

In [None]:
# Preview the first rows of df.
df.head()

In [None]:
# Preview the last rows of df.
df.tail()

In [None]:
# Rebalance the frame into 128 partitions, then persist the result so the
# following cells reuse it (presumably held in cluster memory — confirm).
df = df.repartition(npartitions=128)
df = df.persist()

In [None]:
# Time a full row count of df right after the repartition/persist cell.
%time len(df)

In [None]:
# Same row count timed a second time — presumably to compare against the
# first measurement once the persisted data is ready (confirm).
%time len(df)

In [None]:
%time df = df.set_index(dd.to_datetime(df.datetime).dt.floor('d'), sorted=False).persist()

In [None]:
# Time a full row count of df after it was re-indexed by day.
%time len(df)

In [None]:
# Same row count timed again — presumably to compare with the previous
# measurement once the persisted, re-indexed data is ready (confirm).
%time len(df)

In [None]:
# Summary statistics of df, computed eagerly and timed.
%time df.describe().compute()

In [None]:
# Mean longitude, latitude and year for each index value, computed eagerly.
# NOTE(review): after the earlier set_index the index is the floored
# datetime, so each row of `places` is a per-day average rather than a
# station location — confirm this is the intended grouping.
%time places = df.groupby(df.index)[['longitude', 'latitude', 'year']].mean().compute()

In [None]:
# Scatter of the averaged coordinates in `places`, coloured by the averaged year.
# NOTE(review): `plt` is not imported in any visible cell (presumably
# matplotlib.pyplot) — confirm an import exists.
fig, ax = plt.subplots(figsize=(16, 16))
points = ax.scatter(places.longitude, places.latitude, c=places.year)
ax.set_title('Lat/long')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.grid()
fig.colorbar(points, ax=ax)

In [None]:
# Same scatter as the full-extent plot, restricted to longitude [-50, -30]
# and latitude [35, 40].
fig, ax = plt.subplots(figsize=(16, 16))
points = ax.scatter(places.longitude, places.latitude, c=places.year)
ax.set_title('Lat/long')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
ax.set_xlim([-50, -30])  # zoom in
ax.set_ylim([35, 40])    # zoom in
ax.grid()
fig.colorbar(points, ax=ax)

In [None]:
# Mean of the columns for each index value (the floored datetime set earlier),
# computed eagerly and timed.
%time means = df.groupby(df.index).mean().compute()
means.head()  # preview the first rows of the averages

In [None]:
# `datetime` was used here without being imported in any visible cell.
from datetime import datetime

# One time-series figure per averaged column, each on the same fixed
# 2008-01-01 .. 2020-07-01 x-range so the plots are directly comparable.
for col in means.columns:
    fig, ax = plt.subplots(figsize=(16, 8))
    means[col].plot(ax=ax, color='b')
    ax.set_title('Average of {}'.format(col))
    ax.set_xlim([datetime(2008, 1, 1), datetime(2020, 7, 1)])
    ax.grid()

In [None]:
## insert any Pandas-like Dask transformation code
# Rescale the temperature column with the Celsius -> Fahrenheit formula
# (x * 9/5 + 32). Re-running this cell applies the conversion again.
df = df.assign(temperature=df['temperature'] * (9 / 5) + 32)

In [None]:
# Disabled: optional export of df to CSV files under /tmp/data/noaa-isd-csv.
# %time df.to_csv('/tmp/data/noaa-isd-csv')

In [None]:
# Disabled: optional reload of the CSV export (the *.part files), reading
# 'usaf' as object dtype and persisting the result.
# %time df = dd.read_csv('/tmp/data/noaa-isd-csv/*.part', dtype={'usaf': 'object'}, blocksize=None).persist()

In [None]:
# Time a full row count of df after the transformation cell.
%time len(df)

In [None]:
# begin data prep for ML 
# Replace missing values in every column with 0.
# NOTE(review): this also zero-fills missing `temperature` values, and
# `temperature` is the training target below — confirm that is intended.
df = df.fillna(0)

In [None]:
# Candidate model columns: every non-object column except the bookkeeping
# columns 'version' and 'datetime'.
cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype != 'object' and name not in ('version', 'datetime')
]
cols

In [None]:
# Features are all selected columns except the target; the target is the
# temperature column. Both are persisted before training.
X = df[[name for name in cols if name != 'temperature']].persist()
y = df['temperature'].persist()
# end data prep for ML

In [None]:
# Fit an XGBoost regressor (n_estimators=16) on the persisted features and
# target, timing the fit.
# NOTE(review): XGBRegressor is not imported in any visible cell; it is fitted
# on Dask collections here, so confirm which package provides it.
xgb = XGBRegressor(n_estimators=16)
%time xgb.fit(X, y)

In [None]:
# Predict on the training features with the XGBoost model and compute the
# result eagerly.
%time y_pred = xgb.predict(X).compute()

In [None]:
# Root-mean-squared error on the training data: the target is computed as an
# array and compared element-wise with y_pred.
%time rmse = (((y.to_dask_array().compute()-y_pred)**2).mean())**.5
print(f'Training RMSE: {round(rmse, 3)}')

In [None]:
# Fit a LightGBM regressor (n_estimators=16) on the same features and target,
# timing the fit.
# NOTE(review): LGBMRegressor is not imported in any visible cell; it is
# fitted on Dask collections here, so confirm which package provides it.
lgbm = LGBMRegressor(n_estimators=16)
%time lgbm.fit(X, y)

In [None]:
# Predict on the training features with the LightGBM model.
# NOTE(review): this overwrites the XGBoost predictions held in `y_pred`;
# re-running the XGBoost RMSE cell after this one would score the wrong model.
%time y_pred = lgbm.predict(X).compute()

In [None]:
# Root-mean-squared error on the training data for the predictions currently
# held in y_pred (the LightGBM ones when cells run in order).
%time rmse2 = (((y.to_dask_array().compute()-y_pred)**2).mean())**.5
print(f'Training RMSE: {round(rmse2, 3)}')

In [None]:
# Keep the name of the model with the strictly lower training RMSE;
# a tie (or an incomparable value) falls back to XGBoost.
best_model = 'lgbm' if rmse2 < rmse else 'xgb'

best_model

In [None]:
# File name records which estimator won the RMSE comparison.
model_path = f'{best_model}_noaa_isd.joblib.dat'

# Select the estimator by value equality. The previous `is 'xgb'` compared
# object identity against a string literal, which depends on interning and
# raises a SyntaxWarning on Python 3.8+.
best_estimator = xgb if best_model == 'xgb' else lgbm
joblib.dump(best_estimator, model_path)

# Load the file straight back as a round-trip check of the serialised model.
# NOTE(review): `joblib` is not imported in any visible cell — confirm an import exists.
model2 = joblib.load(model_path)

In [None]:
# `Model` was used here without being imported in any visible cell.
from azureml.core import Model

# Register the saved model file in the workspace under a name that records
# which estimator won the comparison.
model = Model.register(
    ws,
    model_name=f'{best_model}-noaa-isd',
    model_path=model_path,
    description='NOAA ISD temperature predictor',
)