# AML - Training

In [None]:
## Check core SDK version number
import azureml.core
import mlflow
import os

from azureml.core import (Datastore, Dataset, Environment, Experiment, ScriptRunConfig,
                          Workspace)
from azureml.core.authentication import InteractiveLoginAuthentication
from IPython.display import display



## Report which azureml-core release this kernel is running
print(f"[INFO] SDK version: {azureml.core.VERSION}")

## When the workspace lives in a different tenant (typically the customer's
## tenant), create an interactive login for that tenant first:
# interactive_auth = InteractiveLoginAuthentication(tenant_id="72f988bf-86f1-41af-91ab-2d7cd011db47")

## Load the workspace from the local configuration
ws = Workspace.from_config()
print(
    "[SUCCESS] LOGGED IN: ",
    ws.name,
    ws.resource_group,
    ws.location,
    ws.subscription_id,
    sep=' @ ',
)

## Point the MLflow client at the tracking URI exposed by the workspace
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())

## True when the active tracking URI contains the AML experiments host name
wired_to_aml = "experiments.azureml.net" in mlflow.get_tracking_uri()
print("[INFO] MLFlow wired to AML:", wired_to_aml)

## Config

In [None]:
# Names of the Azure ML resources used throughout the notebook
aml_compute = "aml-cluster"        # compute target for the run configuration and pipeline step
aml_ds = "aml_data"                # datastore name; alternative: "mmaadlsgen2_test"
aml_dset = "diabetes_multiple"     # dataset name; alternatives: 'noa_weather', "oj_sample_data"
aml_experiment = "mlflow-azureml"  # experiment name used for both MLflow and the AML Experiment
loc_data = "data/demo_data"        # data path; not referenced in the cells shown here

In [None]:
## Look up the datastore named by `aml_ds` among the workspace's datastores
ds = ws.datastores[aml_ds]
print(f"[INFO] Datastore: {ds.name}, type: {ds.datastore_type}")

##  Data Prep

In [None]:
# Fetch the dataset registered in the workspace under the name in `aml_dset`
wtds = Dataset.get_by_name(ws, name=aml_dset)
# wtds = Dataset.get_by_name(ws, name='noa_weather')
# Load it into a pandas DataFrame and print its summary
pdf = wtds.to_pandas_dataframe()
pdf.info()

In [None]:
# !pip install seaborn

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Comment this out if the data visualisations don't work on your side
%matplotlib inline

# Apply the 'bmh' matplotlib style to the plots that follow
plt.style.use('bmh')

In [None]:
# Display the column labels of the loaded DataFrame
pdf.columns

In [None]:
# target = "temperature"
# Target column for the diabetes dataset
target = 'Y'

# Both feature lists carry the target column as their last entry
categorical_features_list = ['SEX'] + [target]
quantitative_features_list = [
    'AGE', 'BMI', 'BP',
    'S1', 'S2', 'S3', 'S4', 'S5', 'S6',
    target,
]


In [None]:
# Columns to remove from `pdf`; an empty list means nothing is dropped
cols_to_drop = [] # or empty
# NOTE(review): presumably the columns to place last; not used in the cells shown here
cols_at_end = [target]

In [None]:
# Remove the columns listed in `cols_to_drop` from `pdf` in place
pdf.drop(columns=cols_to_drop, inplace=True)

Let's just remove the columns listed in `cols_to_drop` (currently `[]`, i.e. none) and the features with 30% or less NaN values

In [None]:
# print(pdf[target].describe())
# Histogram of the target column (100 bins) with a KDE curve on a 12x4 figure
plt.figure(figsize=(12, 4))
# The trailing semicolon stops the notebook from echoing the returned object
sns.histplot(pdf[target], color='g', bins=100, kde=True);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test set; the fixed random_state makes the
# split repeatable between runs
train, test = train_test_split(pdf, random_state=223, test_size=0.2)

print(f'train:{len(train)} \ntest: {len(test)}')

# Optional export of the frames (file names appear to be left over from a taxi example):
# final_df.to_csv("./data/taxi_final_df.csv", index=False)
# train.to_csv("./data/taxi_final_df_train.csv", index=False)
# test.to_csv("./data/taxi_final_df_test.csv", index=False)

## Training Pipeline

In [None]:
# Make `aml_experiment` the active MLflow experiment
mlflow.set_experiment(aml_experiment)

In [None]:
# Azure ML experiment, in workspace `ws`, that the pipeline run is submitted to
experiment = Experiment(workspace=ws, name=aml_experiment)

#### Define RunConfig for the compute
We will also use `pandas`, `scikit-learn`, `automl` and `pyarrow` in the pipeline steps, so we define a `runconfig` that includes them.

In [None]:
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# Run configuration handed to the pipeline step
aml_run_config = RunConfiguration()

# Run on the compute target named by `aml_compute`
aml_run_config.target = aml_compute

# Execute inside a Docker container
# NOTE(review): newer azureml-core releases deprecate `environment.docker.enabled`
# in favour of `DockerConfiguration(use_docker=True)` - confirm against the SDK
# version in use before changing it.
aml_run_config.environment.docker.enabled = True

# Let Azure ML build the conda environment from the dependencies listed below
# instead of relying on a user-managed Python installation
aml_run_config.environment.python.user_managed_dependencies = False

# Package list for the environment.
# scikit-learn is requested once, as the pinned pip requirement. Listing it
# under conda_packages as well (unpinned) made pip overwrite a conda-managed
# install, which can leave the environment in an inconsistent state.
aml_run_config.environment.python.conda_dependencies = CondaDependencies.create(
    conda_packages=['packaging'],
    pip_packages=[
        'azureml-sdk',
        'pyarrow',
        'pandas==1.1.0',
        'azureml-dataprep[pandas,fuse]',
        'scikit-learn==0.22.2.post1',
        'azureml-mlflow',
        'lightgbm',
        'joblib',
    ],
)

print("Run configuration created.")


In [None]:
from azureml.pipeline.core import PipelineData
from azureml.pipeline.steps import PythonScriptStep

# Folder used as the step's source directory
train_model_folder = "code/train/diabetes"

print(f'Train script is in {os.path.realpath(train_model_folder)}.')

# To hand data to a later step, declare a pipeline output and pass it via `outputs`:
# output_data = PipelineData("xxx", datastore=default_store).as_dataset()

# Training step: runs train.py from `train_model_folder` on the cluster,
# with the dataset passed in as the named input 'data'
trainStep = PythonScriptStep(
    name="Train Model",
    source_directory=train_model_folder,
    script_name="train.py",
    arguments=["--myarg", 111],
    # inputs=[output_split_train.parse_parquet_files(file_extension=None)],
    inputs=[wtds.as_named_input('data')],
    # outputs=[output_data],
    compute_target=aml_compute,
    runconfig=aml_run_config,
    allow_reuse=False,
)

print("trainStep created.")

#### Build and run the pipeline

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails

# Steps that make up the pipeline (only the training step here)
pipeline_steps = [trainStep]

pipeline = Pipeline(workspace = ws, steps=pipeline_steps)
print("Pipeline is built.")

# Submit the pipeline as a run of `experiment`.
# NOTE(review): regenerate_outputs=False presumably allows reuse of earlier
# step outputs, while the step itself sets allow_reuse=False - confirm the
# intended combination.
pipeline_run = experiment.submit(pipeline, regenerate_outputs=False)

# pipeline.validate()

print("Pipeline submitted for execution.")

In [None]:
# Show the notebook widget for the submitted pipeline run
RunDetails(pipeline_run).show()

In [None]:
# pipeline_run.get_all_logs()

## Publish  Pipeline

Once an AML pipeline is published, you can run it on a schedule by attaching a schedule trigger. You can also work with the pipeline outside AML, e.g. you can schedule it in Azure Data Factory.

In [None]:
# Publish the pipeline behind `pipeline_run` under a fixed name and version
published_pipeline = pipeline_run.publish_pipeline(
    name="Diabetes Training AML Pipeline", description="training pipeline", version="1.0")

# Last expression of the cell, so the notebook displays the published pipeline
published_pipeline

In [None]:
from azureml.pipeline.core import Pipeline, PublishedPipeline
# Print every published pipeline in the workspace as "name,'id'".
# The loop variable is deliberately not named `published_pipeline`: reusing
# that name would rebind the object returned by `publish_pipeline(...)` to
# whichever pipeline happens to be listed last.
published_pipelines = PublishedPipeline.list(ws)
for listed_pipeline in published_pipelines:
    print(f"{listed_pipeline.name},'{listed_pipeline.id}'")

### Work with the Pipeline in Azure Data Factory

A published pipeline like this can be accessed from Azure Data Factory.

1. Go to [https://ms-adf.azure.com/authoring](https://ms-adf.azure.com/authoring) and select your ADF
1. If you don't have one yet, create a Linked Service to your AML Workspace
1. Create a pipeline with a "Machine Learning Execute Pipeline" activity:

<img src="./media/adf-aml-1.png" alt="Machine Learning Execute Pipeline activity in an Azure Data Factory pipeline" title="Machine Learning Execute Pipeline activity" />