# Training Pipeline

We build this with a "pipeline-first" mentality, i.e. training is defined as a production pipeline from the start rather than as ad-hoc notebook code.


In [12]:
from azureml.core import Workspace, Dataset, Experiment
from azureml.core.runconfig import RunConfiguration, DEFAULT_CPU_IMAGE, CondaDependencies
from azureml.core.compute import AmlCompute
from azureml.pipeline.core import Pipeline, PipelineDataset, PipelineParameter, PipelineData, TrainingOutput, Schedule
from azureml.pipeline.steps import PythonScriptStep
from azureml.train.automl import AutoMLStep, AutoMLConfig

## Set up workspace, experiment and compute

In [2]:
# Name of the training dataset that is handed to the pipeline scripts.
ds_name = "AirliftDataset-Train"

# Load the workspace from its saved configuration.
ws = Workspace.from_config()

# Look up the workspace resources the pipeline uses, by name.
dstor = ws.datastores["datalake"]
aml_compute_target = AmlCompute(workspace=ws, name="onenode-cpu")

# Experiment under which the one-off pipeline run is submitted.
experiment = Experiment(workspace=ws, name="airlift")

## Set up Run Configuration

In [3]:
# Packages required by the pipeline scripts on the remote compute.
cd = CondaDependencies.create(
    conda_packages=["numpy"],
    pip_packages=["azureml-sdk[automl]", "scikit-learn", "azureml-explain-model"],
)

# Python run configuration bound to the compute target.
conda_run_config = RunConfiguration(framework="python")
conda_run_config.target = aml_compute_target

# Run inside Docker on the default CPU base image, with the conda
# dependencies declared above.
run_environment = conda_run_config.environment
run_environment.docker.enabled = True
run_environment.docker.base_image = DEFAULT_CPU_IMAGE
run_environment.python.conda_dependencies = cd

## Create Training Data Step

The first step splits the dataset and brings it into the common `X`, `y` format (features and target) for both the training and the test split.

In [4]:
# The target column is a pipeline parameter, so the same pipeline can be
# submitted or scheduled for different columns.
target_name = PipelineParameter(name="column_name", default_value="BOUGHT_CATEGORY_10")

# Intermediate outputs of the split step, all stored on the same datastore.
# Each PipelineData is named after the variable that holds it.
(output_split_train_x,
 output_split_train_y,
 output_split_test_x,
 output_split_test_y) = [
    PipelineData(split_name, datastore=dstor)
    for split_name in ("output_split_train_x",
                       "output_split_train_y",
                       "output_split_test_x",
                       "output_split_test_y")
]

In [5]:
# Outputs produced by the split script; the same objects are passed on the
# command line so the script knows where to write each split.
split_outputs = [
    output_split_train_x,
    output_split_train_y,
    output_split_test_x,
    output_split_test_y,
]

# Command-line arguments for train_test_split.py, as flag/value pairs.
split_arguments = [
    "--target_column", target_name,
    "--input_dataset", ds_name,
    "--output_split_train_x", output_split_train_x,
    "--output_split_train_y", output_split_train_y,
    "--output_split_test_x", output_split_test_x,
    "--output_split_test_y", output_split_test_y,
]

test_train_split_step = PythonScriptStep(
    name="test_train_split_step",
    script_name="train_test_split.py",
    arguments=split_arguments,
    outputs=split_outputs,
    compute_target=aml_compute_target,
    runconfig=conda_run_config,
    allow_reuse=True,
)

In [6]:
# Show the string form of the two training outputs, which is the
# environment-variable reference printed for each split.
for split_label, split_output in (("X", output_split_train_x),
                                  ("y", output_split_train_y)):
    print("Environment variable of {} train split: {}".format(split_label, str(split_output)))

Environment variable of X train split: $AZUREML_DATAREFERENCE_output_split_train_x
Environment variable of y train split: $AZUREML_DATAREFERENCE_output_split_train_y


## AutoML Step

In [7]:
# Kinds of artefact requested from the AutoML step.
metrics_training_output = TrainingOutput(type="Metrics")
model_training_output = TrainingOutput(type="Model")

# Metrics of the AutoML run, exposed as the pipeline output "metrics_output".
metrics_data = PipelineData(
    name="automl_metrics",
    datastore=dstor,
    pipeline_output_name="metrics_output",
    training_output=metrics_training_output,
)

# Model produced by the AutoML run, exposed as the pipeline output "best_model_output".
model_data = PipelineData(
    name="automl_model",
    datastore=dstor,
    pipeline_output_name="best_model_output",
    training_output=model_training_output,
)

In [8]:
# Search and training settings. These are deliberately small (one iteration,
# five-minute timeout), which keeps the run short.
automl_settings = {
    "iterations": 1,
    "iteration_timeout_minutes": 5,
    "max_cores_per_iteration": 4,
    "max_concurrent_iterations": 1,
    "primary_metric": "accuracy",
    "n_cross_validations": 2,
    "preprocess": True,
    "model_explainability": True,
}

# Classification task; the training data is supplied by get_data.py located in
# the current directory.
automl_config = AutoMLConfig(
    task="classification",
    path=".",
    data_script="get_data.py",
    run_configuration=conda_run_config,
    compute_target=aml_compute_target,
    **automl_settings
)

# The AutoML step takes the training split as input and emits the metrics and
# model outputs.
automl_step = AutoMLStep(
    name="build_model",
    automl_config=automl_config,
    inputs=[output_split_train_x, output_split_train_y],
    outputs=[metrics_data, model_data],
    allow_reuse=True,
)

## Register Model Step

In [9]:
# Arguments for register_model.py: the target column is passed as the model id,
# together with the AutoML model output and the dataset name.
register_arguments = [
    "--model_id", target_name,
    "--model_path", model_data,
    "--ds_name", ds_name,
]

register_model_step = PythonScriptStep(
    name="register_model",
    script_name="register_model.py",
    arguments=register_arguments,
    inputs=[model_data],
    compute_target=aml_compute_target,
    runconfig=conda_run_config,
    allow_reuse=True,
)

## Create pipeline object

In [10]:
# Assemble the three steps: split, AutoML training, model registration.
pipeline_steps = [test_train_split_step, automl_step, register_model_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)



In get_data


## Run pipeline 

In [11]:
# Submit a one-off run that overrides the default target column.
# `pipeline_parameters` is the documented keyword and the one Schedule.create
# uses in this notebook; `pipeline_params` is its deprecated spelling.
pipeline_run = experiment.submit(pipeline, pipeline_parameters={"column_name": "BOUGHT_CATEGORY_3"})

Created step test_train_split_step [b3df1883][eb7dcc08-deb1-4e8f-9607-984c436dba97], (This step will run and generate new outputs)
Created step build_model [45479c99][d407485f-493a-419a-a16f-f0a60f704b9e], (This step will run and generate new outputs)
Created step register_model [81524e61][2a92a65a-81bf-48c9-9e84-8686046d82ba], (This step will run and generate new outputs)
Submitted pipeline run: c1668258-e30e-4afd-85f7-511c36586ad4


## Schedule the pipeline on a blob change 

### Publish pipeline 

In [11]:
# The same string is used as both the name and the description of the
# published pipeline.
pipeline_name = "Prop-Training-Pipeline"
print(pipeline_name)

published_pipeline = pipeline.publish(name=pipeline_name, description=pipeline_name)

# The id is what the schedules reference later on.
published_pipeline_id = published_pipeline.id
print("Newly published pipeline id: {}".format(published_pipeline_id))

Prop-Training-Pipeline
Created step test_train_split_step [1edd7555][57403a31-d904-4ef0-8a98-483a4d6775d7], (This step will run and generate new outputs)
Created step build_model [5d6ffcd7][6fb8e31b-bb9f-4f7a-bb39-477dd086db52], (This step will run and generate new outputs)
Created step register_model [6d711caa][92ceb30d-c92c-4c1e-874b-18d855783b3b], (This step will run and generate new outputs)
Newly published pipeline id: 84458849-c3c2-4a85-94fc-2e4f0c1519ca


### Schedule all models

In [14]:
# One target column, and hence one model, per category number 1 through 10.
model_list = ["BOUGHT_CATEGORY_{}".format(category) for category in range(1, 11)]
model_list

['BOUGHT_CATEGORY_1',
 'BOUGHT_CATEGORY_2',
 'BOUGHT_CATEGORY_3',
 'BOUGHT_CATEGORY_4',
 'BOUGHT_CATEGORY_5',
 'BOUGHT_CATEGORY_6',
 'BOUGHT_CATEGORY_7',
 'BOUGHT_CATEGORY_8',
 'BOUGHT_CATEGORY_9',
 'BOUGHT_CATEGORY_10']

In [16]:
# Create one datastore-triggered schedule per target column. Each schedule
# watches the same file and passes its column as the pipeline parameter.
for column_name in model_list:
    schedule = Schedule.create(
        workspace=ws,
        name=column_name,
        description=column_name,
        pipeline_id=published_pipeline.id,
        pipeline_parameters={"column_name": column_name},
        experiment_name=column_name,  # this creates parallelism on an AML cluster
        datastore=dstor,
        path_on_datastore="airlift/data-latest.csv",
        polling_interval=1440,  # minutes between checks for changes, i.e. once a day
        wait_for_provisioning=True,
    )

    print("Created schedule with id: {}".format(schedule.id))

Provisioning status: Completed
Created schedule with id: 6c42b575-1fb0-4362-83bc-dbf0cf674f08
Provisioning status: Completed
Created schedule with id: 217f981c-c1cf-4cca-9d26-f17fc44394e4
Provisioning status: Completed
Created schedule with id: 49d98c48-3fee-4f7a-9076-b8c85cad72f6
Provisioning status: Completed
Created schedule with id: 7e8b9899-1c3a-42a4-b2e3-183ef81466ba
Provisioning status: Completed
Created schedule with id: f4692732-a492-42a5-8bc5-40f74d959d6b
Provisioning status: Completed
Created schedule with id: 807b3049-3f86-4a2d-908e-41784ac4ff87
Provisioning status: Completed
Created schedule with id: 08965120-43e1-4ed5-9232-e293d94c454c
Provisioning status: Completed
Created schedule with id: 4ac56f40-391c-4d76-948d-7fba02f791f5
Provisioning status: Completed
Created schedule with id: 0b26f2b7-59e1-47c1-835c-325df14341f3
Provisioning status: Completed
Created schedule with id: 9bb95f38-f603-4591-8342-44b7f589b45e


#### Disable schedules as this is a demo

In [21]:
# Disable only the schedules attached to the pipeline published in this
# notebook. An unfiltered Schedule.list(ws) returns every active schedule in
# the workspace, so disabling all of them would also switch off schedules that
# have nothing to do with this demo. Schedules attached to other published
# pipelines are deliberately left alone.
for pipeline_schedule in Schedule.list(ws, pipeline_id=published_pipeline.id):
    pipeline_schedule.disable()