# Training Pipeline

We approach this with a "pipeline-first" mentality, i.e. the goal is to end up with a production pipeline.


In [None]:
from azureml.core import Workspace, Dataset, Experiment
from azureml.core.runconfig import RunConfiguration, DEFAULT_CPU_IMAGE, CondaDependencies
from azureml.core.compute import AmlCompute
from azureml.pipeline.core import Pipeline, PipelineDataset, PipelineParameter, PipelineData, TrainingOutput, Schedule
from azureml.pipeline.steps import PythonScriptStep
from azureml.train.automl import AutoMLStep, AutoMLConfig

## Set up workspace, experiment and compute

In [None]:
# Load the workspace from the local Azure ML config file.
ws = Workspace.from_config()

# Datastore used below for the pipeline's intermediate data and the schedule.
dstor = ws.datastores["workspaceblobstore"]

# Name of the dataset passed to the split and register scripts.
ds_name = "AirliftDataset-Train"

# Experiment for the ad-hoc run, and the compute target shared by all steps.
experiment = Experiment(workspace=ws, name="airlift")
aml_compute_target = AmlCompute(workspace=ws, name="onenode-cpu")

## Set up Run Configuration

In [None]:
# Packages installed into the environment the pipeline steps run in.
cd = CondaDependencies.create(
    conda_packages=["numpy"],
    pip_packages=["azureml-sdk[automl]", "scikit-learn", "azureml-explain-model"],
)

# Python run configuration bound to the compute target, using the default
# CPU Docker base image with the dependencies declared above.
conda_run_config = RunConfiguration(framework="python")
conda_run_config.target = aml_compute_target
conda_run_config.environment.python.conda_dependencies = cd
conda_run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
conda_run_config.environment.docker.enabled = True

## Create Training Data Step

We need to bring the data into a common X, y format (features and target), split into train and test sets.

In [None]:
# Pipeline parameter naming the target column; it can be overridden per run.
target_name = PipelineParameter(name="column_name", default_value="BOUGHT_CATEGORY_10")

# One intermediate output per split, all stored on the same datastore.
# The objects are created in the same order as the names they are bound to.
(
    output_split_train_x,
    output_split_train_y,
    output_split_test_x,
    output_split_test_y,
) = (
    PipelineData(split_name, datastore=dstor)
    for split_name in (
        "output_split_train_x",
        "output_split_train_y",
        "output_split_test_x",
        "output_split_test_y",
    )
)

In [None]:
# Step that runs train_test_split.py on the compute target. The script is
# given the target column, the input dataset name, and one output location
# per split; the same four outputs are declared so later steps can use them.
test_train_split_step = PythonScriptStep(
    name="test_train_split_step",
    script_name="train_test_split.py",
    arguments=[
        "--target_column", target_name,
        "--input_dataset", ds_name,
        "--output_split_train_x", output_split_train_x,
        "--output_split_train_y", output_split_train_y,
        "--output_split_test_x", output_split_test_x,
        "--output_split_test_y", output_split_test_y,
    ],
    outputs=[
        output_split_train_x,
        output_split_train_y,
        output_split_test_x,
        output_split_test_y,
    ],
    compute_target=aml_compute_target,
    runconfig=conda_run_config,
    allow_reuse=True,
)

In [None]:
# Show the string form of the two training outputs.
# str() is the idiomatic way to get it; calling __str__() directly is not.
print("Environment variable of X train split: " + str(output_split_train_x))
print("Environment variable of y train split: " + str(output_split_train_y))

## AutoML Step

In [None]:
# Output that receives the AutoML metrics, exposed as "metrics_output".
metrics_data = PipelineData(
    name="automl_metrics",
    datastore=dstor,
    pipeline_output_name="metrics_output",
    training_output=TrainingOutput(type="Metrics"),
)

# Output that receives the AutoML model, exposed as "best_model_output".
model_data = PipelineData(
    name="automl_model",
    datastore=dstor,
    pipeline_output_name="best_model_output",
    training_output=TrainingOutput(type="Model"),
)

In [None]:
# Tuning knobs for the AutoML run, kept together so they are easy to adjust.
# They are unpacked into AutoMLConfig as ordinary keyword arguments.
automl_settings = {
    "iterations": 1,
    "iteration_timeout_minutes": 5,
    "max_cores_per_iteration": 4,
    "max_concurrent_iterations": 1,
    "primary_metric": "accuracy",
    "n_cross_validations": 2,
    "preprocess": True,
    "model_explainability": True,
}

# Classification task; training data is supplied by get_data.py, resolved
# relative to the current folder.
automl_config = AutoMLConfig(
    task="classification",
    path=".",
    data_script="get_data.py",
    run_configuration=conda_run_config,
    compute_target=aml_compute_target,
    **automl_settings
)

# Pipeline step wrapping the AutoML run: takes the training split as input
# and produces the metrics and model outputs.
automl_step = AutoMLStep(
    name="build_model",
    automl_config=automl_config,
    inputs=[output_split_train_x, output_split_train_y],
    outputs=[metrics_data, model_data],
    allow_reuse=True,
)

## Register Model Step

In [None]:
# Step that runs register_model.py with the model output as its input.
# NOTE(review): --model_id is given the target-column pipeline parameter;
# confirm that register_model.py expects the column name as the model id.
register_model_step = PythonScriptStep(
    name="register_model",
    script_name="register_model.py",
    arguments=[
        "--model_id", target_name,
        "--model_path", model_data,
        "--ds_name", ds_name,
    ],
    inputs=[model_data],
    compute_target=aml_compute_target,
    runconfig=conda_run_config,
    allow_reuse=True,
)

## Create pipeline object

In [None]:
# Assemble the three steps: split -> AutoML training -> model registration.
pipeline = Pipeline(
    workspace=ws,
    steps=[test_train_split_step, automl_step, register_model_step],
)

## Run pipeline 

In [None]:
# Submit the pipeline once, overriding the default target column for this run.
# `pipeline_parameters` is the supported keyword (the older `pipeline_params`
# spelling is deprecated); it is also the keyword Schedule.create uses.
pipeline_run = experiment.submit(
    pipeline,
    pipeline_parameters={"column_name": "BOUGHT_CATEGORY_3"},
)

## Schedule the pipeline on a blob change 

### Publish pipeline 

In [None]:
# The published pipeline uses the same text for its name and description.
pipeline_name = "Prop-Training-Pipeline"
print(pipeline_name)

# Publish the pipeline so it can be referenced by id when scheduling.
published_pipeline = pipeline.publish(name=pipeline_name, description=pipeline_name)

published_id = published_pipeline.id
print("Newly published pipeline id: {}".format(published_id))

### Schedule all models

In [None]:
# Target columns BOUGHT_CATEGORY_1 .. BOUGHT_CATEGORY_10, one model each.
model_list = ["BOUGHT_CATEGORY_{}".format(category) for category in range(1, 11)]
model_list

In [None]:
# Create one datastore-triggered schedule per target column. Each schedule
# reuses the column name as its name, description and experiment name, and
# passes it to the pipeline as the "column_name" parameter.
for m in model_list:
    schedule = Schedule.create(
        workspace=ws,
        name=m,
        description=m,
        pipeline_id=published_pipeline.id,
        pipeline_parameters={"column_name": m},
        experiment_name=m,  # this creates parallelism on an AML cluster
        datastore=dstor,
        path_on_datastore="airlift/data-latest.csv",
        # presumably minutes (1440 = once a day); confirm in the SDK docs
        polling_interval=1440,
        wait_for_provisioning=True,
    )

    print("Created schedule with id: {}".format(schedule.id))

#### Disable schedules as this is a demo

In [None]:
# Disable only the schedules that belong to this demo (those named after an
# entry in model_list). Disabling every schedule returned for the workspace
# would also switch off unrelated schedules in a shared workspace.
demo_schedule_names = set(model_list)
for s in Schedule.list(ws):
    if s.name in demo_schedule_names:
        s.disable()