# AML - Training

In [None]:
## Check core SDK version number
import azureml.core
import mlflow
import os

from azureml.core import (Datastore, Dataset, Environment, Experiment, ScriptRunConfig,
                          Workspace)
from azureml.core.authentication import InteractiveLoginAuthentication
from IPython.display import display



# Report which azureml-core SDK version this kernel is running.
sdk_version = azureml.core.VERSION
print("[INFO] SDK version:", sdk_version)

## When the workspace lives in a different tenant (typically the customer tenant),
## authenticate explicitly instead:
# interactive_auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")

# Obtain the workspace handle via Workspace.from_config() and print its identifying fields.
ws = Workspace.from_config()
workspace_fields = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print("[SUCCESS] LOGGED IN: ", *workspace_fields, sep=' @ ')

## Set the MLflow tracking backend to the URI reported by the workspace.
aml_tracking_uri = ws.get_mlflow_tracking_uri()
mlflow.set_tracking_uri(aml_tracking_uri)

# Simple check: does the active tracking URI contain the expected AML host name?
wired_to_aml = "experiments.azureml.net" in mlflow.get_tracking_uri()
print("[INFO] MLFlow wired to AML:", wired_to_aml)

## Config

In [None]:
# Notebook configuration: names of the Azure ML resources and the local data path
# referenced by the cells below.
aml_compute = "aml-cluster"  # name of the compute target used for remote training
aml_ds = "aml_data"  # name of the datastore; alternative: "mmaadlsgen2_test"
# Alternative registered datasets that can be selected instead:
# aml_dset = 'noa_weather'
# aml_dset = "oj_sample_data"
aml_dset = "diabetes_multiple"  # name of the registered dataset to load
aml_experiment = "mlflow-azureml"  # experiment name used for MLflow and AML runs
loc_data = "data/demo_data"  # local data path (not referenced in the cells shown here)

In [None]:
## set the datastore
# Look up the workspace datastore registered under the name held in `aml_ds`
# and print its name and type.
ds = ws.datastores[aml_ds]
print(f"[INFO] Datastore: {ds.name}, type: {ds.datastore_type}")

##  Data Prep

In [None]:
# Fetch the registered dataset named by `aml_dset` from the workspace.
wtds = Dataset.get_by_name(ws, name=aml_dset)
# Alternative dataset:
# wtds = Dataset.get_by_name(ws, name='noa_weather')
# Materialise the dataset as a pandas DataFrame and show its column/dtype summary.
pdf = wtds.to_pandas_dataframe()
pdf.info()

In [None]:
# !pip install seaborn

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this out if the data visualisations don't render on your side
%matplotlib inline

# Apply matplotlib's 'bmh' style sheet to the plots that follow.
plt.style.use('bmh')

In [None]:
# Display the column labels of the loaded frame.
pdf.columns

In [None]:
# Name of the column to predict.
# Alternative target (presumably for the weather dataset — confirm): "temperature"
target = 'Y'

# Column groups used for exploration; both lists include the target column.
categorical_features_list = ['SEX', target]
quantitative_features_list = ['AGE', 'BMI', 'BP', 'S1', 'S2', 'S3', 'S4', 'S5', 'S6', target]


In [None]:
# Columns to remove from `pdf` before modelling; leave the list empty to keep every column.
cols_to_drop = []
# Columns intended to be placed last (not referenced in the cells shown here).
cols_at_end = [target]

In [None]:
# Remove the columns listed in `cols_to_drop` from `pdf` in place (a no-op for an empty list).
pdf.drop(columns=cols_to_drop, inplace=True)

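Let's remove the columns listed in `cols_to_drop` (currently `[]`). The original note also mentions removing features based on their share of NaN values (a 30% threshold), but no such filter is applied in the cells shown here.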

In [None]:
# Optional numeric summary of the target:
# print(df[target].describe())

# Distribution of the target column: 100-bin histogram with a KDE overlay on a 12x4 figure.
fig, ax = plt.subplots(figsize=(12, 4))
sns.histplot(pdf[target], color='g', bins=100, kde=True, ax=ax);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows of `pdf` as a test frame; the fixed random_state keeps
# the split repeatable between runs.
train, test = train_test_split(pdf, test_size=0.2, random_state=223)

# Report the number of rows in each split.
print(f'train:{train.shape[0]} \ntest: {test.shape[0]}')

# Optional CSV export of the splits (disabled).
# NOTE(review): the file names mention "taxi" and `final_df` is not defined in the
# cells shown here — these lines look stale; confirm before re-enabling.
# final_df.to_csv("./data/taxi_final_df.csv", index=False)
# train.to_csv("./data/taxi_final_df_train.csv", index=False)
# test.to_csv("./data/taxi_final_df_test.csv", index=False)

## Training locally

In [None]:
# Select the MLflow experiment named by `aml_experiment` for the runs started below.
mlflow.set_experiment(aml_experiment)

In [None]:
# Display the columns named in `quantitative_features_list` (which includes the target).
pdf[quantitative_features_list]

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import joblib

import lightgbm as lgb

# Build the feature matrix and target vector for training.
#
# Alternative: train on the quantitative features only
# df = pdf[quantitative_features_list].copy()

# Work on a copy: `pop` removes the target column in place, and doing that on the
# shared `pdf` frame would make this cell (and every cell that reads `pdf[target]`)
# fail with a KeyError when re-run.
df = pdf.copy()
y_df = df.pop(target)  # target values; `df` now holds only the feature columns
x_df = df

# 80/20 split with a fixed seed so results are repeatable.
x_train, x_test, y_train, y_test = train_test_split(x_df, y_df, test_size=0.2, random_state=223)

MLFlow Tracking

```python
mlflow.start_run()
# mlflow.log_param("x","abc")
mlflow.log_metric("x",123)
mlflow.end_run()
```

In [None]:
# Train a LightGBM regressor inside an MLflow run so that the parameters, the
# evaluation metric and the fitted model are all logged through MLflow.
with mlflow.start_run():

    # Hyperparameters for this run
    num_leaves = 31
    learning_rate = 0.01
    n_estimators = 20

    # Log the algorithm parameters to the run
    mlflow.log_param("num_leaves", num_leaves)
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("n_estimators", n_estimators)

    # Set up the model, then train with the held-out split as the evaluation set.
    gbm = lgb.LGBMRegressor(num_leaves=num_leaves,
                            learning_rate=learning_rate,
                            n_estimators=n_estimators)
    # Early stopping is supplied as a callback: the `early_stopping_rounds` keyword
    # of `fit()` was deprecated and later removed from LightGBM's scikit-learn API.
    # NOTE(review): the same split drives early stopping and the reported MSE.
    model_gbm = gbm.fit(x_train, y_train,
                        eval_set=[(x_test, y_test)],
                        eval_metric='l1',
                        callbacks=[lgb.early_stopping(stopping_rounds=5)])

    preds = model_gbm.predict(x_test)

    # Compute the Mean Squared Error once, then print it and log it to the run.
    mse = mean_squared_error(y_test, preds)
    print('Mean Squared Error is', mse)
    mlflow.log_metric('mse', mse)

    # Alternative (disabled): save the model to the outputs directory for capture
    #     model_file_name = './outputs/model.pkl'
    #     joblib.dump(value = model_gbm, filename = model_file_name)

    # Log the fitted model as an MLflow artifact under the name "gbm_model".
    mlflow.sklearn.log_model(model_gbm, "gbm_model")


### Inspect Experiment

In [None]:
# Azure ML SDK handle for the experiment named by `aml_experiment`; used below to list and submit runs.
experiment = Experiment(ws, aml_experiment)

Iterate over the metrics of each completed `Run` and sort the resulting table by `mse`

In [None]:
import pandas as pd

# Collect the float-valued metrics of every completed run, keyed by run id.
metricslist = {}
for run in experiment.get_runs():
    # Skip runs that did not finish successfully.
    if run.status != "Completed":
        continue
    metrics = dict(
        (name, value)
        for name, value in run.get_metrics().items()
        if isinstance(value, float)
    )
    metricslist[run.id] = metrics

# One row per run, one column per metric, ordered by descending 'mse'.
metrics_by_run = pd.DataFrame(metricslist).T
rundata = metrics_by_run.sort_values(by=['mse'], ascending=False)
rundata

Get the **best** run by metric

In [None]:
# Find the completed run with the lowest logged 'mse'.
# NOTE: the variable names say "rmse" for backward compatibility with other cells,
# but the metric compared here is the 'mse' value logged during training.
minimum_rmse_runid = None
minimum_rmse = None
num_runs_compared = 0
best_run = None

for run in experiment.get_runs():
    # Check the status first so no metrics are fetched for unfinished/failed runs.
    if run.status != "Completed":
        continue

    # Each logged metric becomes a key in this returned dict.
    run_metrics = run.get_metrics()
    if "mse" not in run_metrics:
        continue

    num_runs_compared += 1
    run_rmse = run_metrics["mse"]
    run_id = run.id

    # Keep the first candidate, then replace it whenever a strictly lower value appears.
    if minimum_rmse is None or run_rmse < minimum_rmse:
        minimum_rmse = run_rmse
        minimum_rmse_runid = run_id
        best_run = run

if best_run is None:
    # Avoid the TypeError that concatenating `None` to a string would raise.
    print("No completed run with an 'mse' metric was found.")
else:
    print("Best run_id: " + minimum_rmse_runid)
    print("Best run_id mse: " + str(minimum_rmse))
print("Runs compared: " + str(num_runs_compared))

In [None]:
# Display the run selected as best (None when no run qualified).
best_run

## Training on AML Compute

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Get a handle to the compute target named by `aml_compute` in the workspace.
# NOTE(review): ComputeTargetException is imported but never caught here — presumably
# a missing target raises from this constructor; confirm and handle if needed.
compute_target = ComputeTarget(workspace=ws, name=aml_compute)
# Use the 'status' property to get a detailed status for the current cluster.
cts = compute_target.status.serialize()
# Print node count, VM priority and VM size; the cluster is reported as running when currentNodeCount > 0.
print(f'Found existing compute target: {aml_compute}\n({"cluster is running" if (int(cts["currentNodeCount"])>0) else "cluster is idle"}) currentNodeCount: {cts["currentNodeCount"]}, vmPriority: {cts["vmPriority"]}, vmSize: {cts["vmSize"]}')

In [None]:
import os
# Directory that is passed to ScriptRunConfig as `source_directory`.
project_folder = "code/train/diabetes"

# Create the directory (including parents) unless it is already present.
if os.path.exists(project_folder):
    print(f"folder '{project_folder}' aready there")
else:
    os.makedirs(project_folder)

In [None]:
# Fetch the registered dataset named by `aml_dset`; it is passed to the remote run as a named input below.
wtds = Dataset.get_by_name(ws, name=aml_dset)

In [None]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# pip packages installed into the environment used for the remote run.
pip_requirements = [
    'azureml-sdk',
    'azureml-dataprep[pandas,fuse]',
    'scikit-learn==0.22.2.post1',
    'azureml-mlflow',
    'lightgbm',
    'joblib',
]

# Environment named 'conda-env' whose conda dependencies consist of the pip packages above.
conda_env = Environment('conda-env')
conda_env.python.conda_dependencies = CondaDependencies.create(pip_packages=pip_requirements)

In [None]:
from azureml.core import ScriptRunConfig

# The dataset is handed to the training script as a named input called 'data'.
dataset_input = wtds.as_named_input('data')

# Run configuration: execute train.py from `project_folder` with the dataset as its argument.
src = ScriptRunConfig(
    source_directory=project_folder,
    script='train.py',
    arguments=[dataset_input],
)

# Attach the framework, the environment and the compute target (by name) to the run configuration.
run_config = src.run_config
run_config.framework = 'python'
run_config.environment = conda_env
run_config.target = compute_target.name
# run_config.data_references = {ds.name: dr}

In [None]:
# Submit the script run configuration to the experiment; `run` is the handle to the submitted run.
run = experiment.submit(config=src)

In [None]:
from azureml.widgets import RunDetails
# Show the RunDetails widget for the submitted run.
RunDetails(run).show()