Copyright (c) Microsoft Corporation.
Licensed under the MIT license.

## Model training
Pre-process data and use the data to build an Azure AutoML model in this notebook using the following steps:

1. Define variables
2. Load data and set up the Azure Machine Learning (AML) connection
3. Train-test split
4. Train an Azure AutoML model
5. Register model to AML


In [None]:
import azureml.core

from pyspark.sql.functions import *
from azureml.core import Experiment, Workspace, Dataset, Datastore
from azureml.train.automl import AutoMLConfig
from azureml.data.dataset_factory import TabularDatasetFactory

### Define Variables


In [None]:
# --- Azure Machine Learning workspace (fill in before running) ---
subscription_id = ""
resource_group = ""
workspace_name = ""

# Name of the AML experiment; also reused as the prefix of the registered
# dataset name and of the model artifact path
experiment_name = "commodity-price-forecast"

# --- Azure Data Lake Storage location of the input data (fill in before running) ---
data_lake_account_name = ""
file_system_name = ""

# Table that receives the held-out test rows
table_test_name = "default.test"

# Train-test boundary: rows dated before this year are used for training,
# rows from this year onward are held out for testing
split_year = 2019

## Setup AML and Load Data


In [None]:
# Build the abfss:// URI of the aggregated commodity data folder in the data lake
adls_path = "abfss://%s@%s.dfs.core.windows.net/CommodityAggrData" % (
    file_system_name,
    data_lake_account_name,
)

# Open the AML workspace identified by the variables defined earlier
ws = Workspace(
    subscription_id=subscription_id,
    resource_group=resource_group,
    workspace_name=workspace_name,
)

# Experiment under which the training run is submitted
experiment = Experiment(workspace=ws, name=experiment_name)

# Default datastore of the workspace
datastore = Datastore.get_default(ws)

# Read the parquet files at the storage path into a Spark DataFrame
df = spark.read.parquet(adls_path)

## Train Test split


In [None]:
# Split on calendar year: rows before split_year train the model,
# rows from split_year onward are held out
df_train = df.where(f"Year(date) < {split_year}")
df_test = df.where(f"Year(date) >= {split_year}")

# Register the training rows as a dataset in the AML datastore
dataset = TabularDatasetFactory.register_spark_dataframe(
    df_train,
    datastore,
    name=experiment_name + "-dataset",
)

# Persist the held-out rows as a table (overwriting any existing one) for later prediction
df_test.write.mode("overwrite").saveAsTable(table_test_name)

## Define and train AutoML model


In [None]:
from azureml.automl.core.forecasting_parameters import ForecastingParameters

# Forecasting-specific settings: the timestamp column and the forecast horizon
forecasting_parameters = ForecastingParameters(time_column_name="Date", forecast_horizon=12)

# AutoML job configuration for the forecasting task, trained on the registered dataset
automl_config = AutoMLConfig(
    spark_context=sc,
    task="forecasting",
    training_data=dataset,
    label_column_name="average_value",
    primary_metric="normalized_mean_absolute_error",
    experiment_timeout_hours=0.25,  # 15 minutes
    max_concurrent_iterations=2,
    n_cross_validations=5,
    forecasting_parameters=forecasting_parameters,
)

In [None]:
# Submit the AutoML configuration to the experiment; `run` is the handle to the submitted run
run = experiment.submit(config=automl_config)

Now that the experiment is running, we can view the run in the AML workspace. 

In [None]:
import html

# Show a clickable link to this run in the Azure Machine Learning portal.
# The href value is quoted and both interpolated values are HTML-escaped so
# that characters such as "=" and "&" in the portal URL's query string cannot
# break the attribute or the surrounding markup.
displayHTML(
    "<a href='{}' target='_blank'>Your experiment in Azure Machine Learning portal: {}</a>".format(
        html.escape(run.get_portal_url(), quote=True),
        html.escape(run.id),
    )
)

## Register Model


In [None]:
# Block until the AutoML run has finished so its best model can be retrieved
run.wait_for_completion()

# Install required dependency.
# pip must be invoked as a subprocess: calling pip.main() in-process is not a
# supported API and was removed from pip's public namespace in pip 10.
# check_call raises CalledProcessError if the installation fails.
import subprocess
import sys

subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "azure-storage-blob==12.5.0"]
)

import mlflow

# Get best model from the AutoML run
best_run, non_onnx_model = run.get_output()

artifact_path = experiment_name + "_artifact"

# Configure MLflow to track results in the AML workspace
mlflow.set_tracking_uri(ws.get_mlflow_tracking_uri())
mlflow.set_experiment(experiment_name)

# Bind the MLflow run to its own name so the AutoML `run` handle above is not
# overwritten (keeps this cell re-runnable and `run` usable afterwards)
with mlflow.start_run() as mlflow_run:
    # Log the fitted model as an artifact of the MLflow run
    mlflow.sklearn.log_model(non_onnx_model, artifact_path)

    # Register the logged model in the AML model registry
    mlflow.register_model(
        "runs:/" + mlflow_run.info.run_id + "/" + artifact_path,
        "commodity-price-forecast-Best",
    )