## Import Required Packages

In [None]:
from azureml.core import Workspace, Experiment, Datastore, Environment, Dataset
from azureml.core.compute import ComputeTarget
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineParameter, PipelineData
from azureml.data.output_dataset_config import OutputTabularDatasetConfig, OutputDatasetConfig, OutputFileDatasetConfig
from azureml.data.datapath import DataPath
import logging

## Connect to Azure ML Workspace using the AML SDK
The code snippet below retrieves a reference to your AML workspace - you can interact directly with resources in your workspace via the SDK, similar to how you can use the Studio UI.

In [None]:
from azureml.core import Workspace

# Load the workspace described by the local AML config file, then show the
# identifying details one per line as a quick sanity check.
ws = Workspace.from_config()
workspace_details = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print("\n".join(str(detail) for detail in workspace_details))

## Create an Experiment
Experiments are logical containers for script runs; each run in an experiment can record its own metrics and outputs.

<b>Hint:</b> if you get stuck on the components below, run a search online for azure ml sdk CLASS_NAME to find relevant docs.

In [None]:
from azureml.core import Experiment

# TO-DO:
# Update the experiment_name variable below to 'yourinitials_home_price_model_training_custom_script'
# Uncomment and create an Experiment object using the AML SDK

# Name of the experiment that groups the runs submitted from this notebook.
experiment_name = "NWK-PIPELINE"
experiment = Experiment(workspace=ws, name=experiment_name)

## Retrieve a Reference to Compute Cluster
Get a pointer to your created AML Compute Cluster (`cpucluster-yourinitials`). You will use this as the compute engine for executing your script run.

In [None]:
from azureml.core.compute import ComputeTarget

# TO-DO:
# Update the cpu_cluster_name variable below to the name of the cluster you previously created (cpucluster-yourinitials)
# Uncomment and retrieve a pointer to your ComputeTarget for cpu_cluster

# Look up the existing compute cluster by name; the pipeline steps below use
# it as their compute target.
cpu_cluster_name = "cpucluster-nwk"
cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)

## Create and Register an Environment
AML environments are reusable software environments that contain dependencies for model training/inferencing operations. These environments can be manually created, packaged into reusable docker containers, and then leveraged time and again for different MLOps activities.

Create and register a new environment from the exported conda yaml environment definition (`automl_env.yml`). 

<i>Hint:</i> The [`Environment` class definition reference](https://learn.microsoft.com/en-us/python/api/azureml-core/azureml.core.environment(class)?view=azure-ml-py) showcases multiple ways to construct a new environment, including from a conda specification. 

In [None]:
from azureml.core import Environment
from azureml.core.runconfig import DockerConfiguration
from azureml.core.conda_dependencies import CondaDependencies

# TO-DO:
# Build the environment used in the RunConfiguration from the exported
# conda specification (automl_env.yml)
env_name = 'nwk-automl-env'
conda_spec_path = "automl_env.yml"
env = Environment.from_conda_specification(name=env_name, file_path=conda_spec_path)

# Run configuration handed to the pipeline steps so they execute inside `env`.
run_config = RunConfiguration()
run_config.environment = env

## Define Output Datasets
Below we define the configuration for datasets that will be passed between steps in our pipeline. Note, in all cases we specify the datastore that should hold the datasets and whether they should be registered following step completion or not. This can optionally be disabled by removing the `register_on_complete()` call.

In [None]:
from azureml.data import DataType
# Every listed column is parsed as a float when the delimited output files
# are read back as a tabular dataset.
column_dictionary = {
    column: DataType.to_float()
    for column in (
        'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS',
        'RAD', 'TAX', 'PTRATIO', 'LSTAT', 'CRIM', 'MEDV',
    )
}

#TO-DO:
# Update names of the OutputFileDatasetConfig and dataset objects below
default_ds = ws.get_default_datastore()

# Training split: written to the default datastore, read as delimited files
# with the column types above, and registered once the producing step completes.
training_data = (
    OutputFileDatasetConfig(
        name='NWK_Training_Data',
        destination=(default_ds, 'NWK_training_data/{run-id}'),
    )
    .read_delimited_files(set_column_types=column_dictionary)
    .register_on_complete(name='NWK_Training_Data')
)

# Testing split: same handling as the training split, under its own name and path.
testing_data = (
    OutputFileDatasetConfig(
        name='NWK_Testing_Data',
        destination=(default_ds, 'NWK_testing_data/{run-id}'),
    )
    .read_delimited_files(set_column_types=column_dictionary)
    .register_on_complete(name='NWK_Testing_Data')
)

## Define Pipeline Parameters
`PipelineParameter` objects serve as variable inputs to an Azure ML pipeline and can be specified at runtime. Update the pipeline parameters below to include parameters for the following variables:

| Variable | Description |
|----------|-------------|
| `datastore_name` | Name of the datastore you created in Challenge 1 |
| `data_path` | Path on the datastore above which includes all of the home price CSV files you uploaded |
| `model_name` | Name of the model to be registered in your AML workspace upon completion of the run |
| `training_percentage` | Percent of data that should be used for training (0.0 - 1.0) |

In [None]:
# Runtime-overridable inputs of the pipeline; each default applies when a
# submission does not supply its own value.

# Datastore that holds the raw home price CSV files.
datastore_name = PipelineParameter(
    name='datastore_name',
    default_value='nwk_datastore',
)

# Path (glob) on that datastore covering the uploaded CSV files.
data_path = PipelineParameter(
    name='data_path',
    default_value='07-18-2022_014821_UTC/**',
)

# Name under which the model is registered in the workspace.
# NOTE(review): 'NWK-ATUOML' looks like a transposition of "AUTOML"; confirm
# against already-registered models before renaming.
model_name = PipelineParameter(
    name='model_name',
    default_value='NWK-ATUOML',
)

# Fraction of the data used for training (0.0 - 1.0).
training_percentage = PipelineParameter(
    name='training_percentage',
    default_value=0.85,
)

## Define Pipeline Steps
The pipeline below consists of three distinct steps: two execute Python scripts located in the `./pipeline_step_scripts` dir, and one submits an AutoML job. First, we call the get-data script (`get_data-ANSWERS.py`) to retrieve data from your registered datastore and split it into test and train subsets, which are subsequently registered. Then, we pass the training dataset into an AutoML step that trains a custom model. Finally, the last step executes `evaluate_and_register.py`, which loads both the new model (challenger) and the current best model (champion) and evaluates them against the provided test dataset. Based on RMSE, if the challenger model performs better, or no model has been registered to date, the model is registered in the workspace.

In [None]:
# Step 1: get raw data from the AML-linked datastore.
# The train and test outputs are registered as tabular datasets after retrieval.

# Command-line arguments handed to the script; the two output dataset configs
# and the pipeline parameters are passed by reference.
get_data_arguments = [
    '--training_data', training_data,
    '--testing_data', testing_data,
    '--training_percentage', training_percentage,
    '--datastore_name', datastore_name,
    '--data_path', data_path,
]

get_data_step = PythonScriptStep(
    name='Get Data from Blob Storage',
    script_name='get_data-ANSWERS.py',
    source_directory='./pipeline_step_scripts',
    arguments=get_data_arguments,
    outputs=[training_data, testing_data],
    compute_target=cpu_cluster,
    runconfig=run_config,
    allow_reuse=False,
)

from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep

# TO-DO: UPDATE AUTOML SETTINGS PER THE CONFIGURATION BELOW
# Allowed Models: XGBoostRegressor, LightGBM
# Experiment Timeout Hours: 0.5
# Cross Validation: k-folds with 3 folds
automl_settings = dict(
    iteration_timeout_minutes=10,
    primary_metric='normalized_root_mean_squared_error',
    allowed_models=['XGBoostRegressor', 'LightGBM'],
    experiment_timeout_hours=0.5,
    n_cross_validations=3,
    iterations=1,
)

# Regression on the 'MEDV' label, trained on the dataset produced by the
# get-data step and run on the shared cluster and run configuration.
automl_config = AutoMLConfig(
    task='regression',
    path='.',
    compute_target=cpu_cluster,
    run_configuration=run_config,
    featurization='auto',
    training_data=training_data,
    label_column_name='MEDV',
    **automl_settings,
)

# Step 2: wrap the AutoML configuration as a pipeline step; the default
# model and metrics outputs are switched off.
train_model_step = AutoMLStep(
    name='Train Model (AutoML)',
    automl_config=automl_config,
    passthru_automl_config=False,
    enable_default_model_output=False,
    enable_default_metrics_output=False,
    allow_reuse=True,
)

# Step 3: run evaluate_and_register.py with the testing dataset as a named
# input ('testing_dataset') and the target model name as an argument.
evaluate_and_register_step = PythonScriptStep(
    name = 'Evaluate and Register Model',
    script_name='evaluate_and_register.py',
    inputs=[testing_data.as_input(name='testing_dataset')],
    arguments=['--model_name', model_name],
    compute_target=cpu_cluster,
    source_directory='./pipeline_step_scripts',
    allow_reuse=False,
    # Use the run configuration defined earlier in this notebook; the
    # previous `run_config2` was never defined and raised a NameError.
    runconfig=run_config
)

# This step takes no dataset produced by the AutoML step, so the ordering is
# declared explicitly: evaluate only after training has finished.
evaluate_and_register_step.run_after(train_model_step)

## Create Pipeline
Create an Azure ML Pipeline by specifying the steps to be executed. 

<b>Note:</b> based on the dataset dependencies between steps, execution occurs logically such that no step will execute unless all of the necessary input datasets have been generated.

In [None]:
#TO-DO: Update Pipeline syntax below
# Collect the three steps defined above and hand them to the pipeline.
pipeline_steps = [get_data_step, train_model_step, evaluate_and_register_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

## Create a Published PipelineEndpoint
Once we have created our pipeline we will look to retrain our model periodically as new data becomes available. By publishing our pipeline to a `PipelineEndpoint` we can iterate on our pipeline definition but maintain a consistent REST API endpoint.

In [None]:
from azureml.pipeline.core import PipelineEndpoint

def published_pipeline_to_pipeline_endpoint(
    workspace,
    published_pipeline,
    pipeline_endpoint_name,
    pipeline_endpoint_description,
):
    """Make a published pipeline the default of a named PipelineEndpoint.

    If an endpoint called ``pipeline_endpoint_name`` can be retrieved from the
    workspace, ``published_pipeline`` is added to it as the new default.
    Otherwise a new endpoint is published with that name and description.

    Args:
        workspace: Workspace in which the endpoint is looked up or created.
        published_pipeline: Published pipeline to expose through the endpoint.
        pipeline_endpoint_name: Name of the endpoint to reuse or create.
        pipeline_endpoint_description: Description used only when a new
            endpoint is created.

    Returns:
        The existing endpoint, or the result of ``PipelineEndpoint.publish``
        when a new one is created.

    Raises:
        Exception: Errors from ``add_default`` or ``PipelineEndpoint.publish``
            propagate to the caller.
    """
    # Only the lookup is guarded: any failure here is treated as "the endpoint
    # does not exist yet". The exception type raised by the SDK for a missing
    # endpoint is not visible from this file, hence the broad catch.
    try:
        pipeline_endpoint = PipelineEndpoint.get(
            workspace=workspace, name=pipeline_endpoint_name
        )
    except Exception as ex:
        print(ex)
        # create PipelineEndpoint if it doesn't exist
        print("PipelineEndpoint does not exist, creating one for you...")
        return PipelineEndpoint.publish(
            workspace=workspace,
            name=pipeline_endpoint_name,
            pipeline=published_pipeline,
            description=pipeline_endpoint_description
        )

    # Kept outside the try block so a failure while updating an existing
    # endpoint surfaces as-is instead of being reported as "does not exist"
    # and followed by a second publish under the same name.
    print("using existing PipelineEndpoint...")
    pipeline_endpoint.add_default(published_pipeline)
    return pipeline_endpoint


# The same name and description are used for the published pipeline and for
# the endpoint that exposes it.
pipeline_endpoint_name = 'NWK-PipelineEndpoint'
pipeline_endpoint_description = 'AutoML Training Pipeline for Home Prices Dataset'

# Publish the pipeline definition built above.
published_pipeline = pipeline.publish(
    name=pipeline_endpoint_name,
    description=pipeline_endpoint_description,
    continue_on_step_failure=False,
)

# Attach the freshly published pipeline to the endpoint.
published_pipeline_to_pipeline_endpoint(
    ws,
    published_pipeline,
    pipeline_endpoint_name,
    pipeline_endpoint_description,
)

## Trigger a Pipeline Execution from the Notebook
You can create an Experiment (logical collection for runs) and submit a pipeline run directly from this notebook by running the commands below

In [None]:
# Submit the pipeline under the experiment created earlier, then block until
# the run finishes while streaming its output into the notebook.
run = experiment.submit(config=pipeline)
run.wait_for_completion(show_output=True)