Copyright (c) Microsoft Corporation. All rights reserved.

Licensed under the MIT License.

# Forecasting Pipeline
---

In this notebook we create a pipeline to forecast sales with the models we trained in the last step. The forecasting pipeline we'll set up is similar to the training pipeline in the last step. For more details on the steps and functions refer to that notebook.

We will set up the Pipeline for forecasting given the desired forecasting horizon. We utilize the ParallelRunStep to parallelize the process. For more information about the Data and Models, refer to the Data Preparation and Training Notebooks.


### Prerequisites
At this point, you should have already:

1. Created your AML Workspace
2. Run 00_Environment_Setup.ipynb to configure the environment
3. Run 01_Training_Pipeline.ipynb to train the models

## 1.0 Connect to workspace and datastore

In [None]:
from azureml.core import Workspace
from azureml.core import Datastore

# Load the workspace from the saved configuration.
ws = Workspace.from_config()

# set up datastores
dstore = ws.get_default_datastore()

# Print one workspace detail per line.
workspace_details = [
    'Workspace Name: ' + ws.name,
    'Azure Region: ' + ws.location,
    'Subscription Id: ' + ws.subscription_id,
    'Resource Group: ' + ws.resource_group,
]
print('\n'.join(workspace_details))

## 2.0 Create an experiment

In [None]:
from azureml.core import Experiment

# Experiment under which the forecasting pipeline run is submitted.
experiment = Experiment(workspace=ws, name='forecasting_pipeline')

## 3.0 Get the dataset

In [None]:
from azureml.core.dataset import Dataset

# Look up the registered dataset by name in the workspace.
small_dataset = Dataset.get_by_name(workspace=ws, name='oj_data_small')

# Named input that is handed to the ParallelRunStep.
small_dataset_input = small_dataset.as_named_input('forecast_10_models')

## 4.0 Create ParallelRunStep for the forecasting pipeline

### 4.1 Configure environment for ParallelRunStep

In [None]:
from azureml.core import Environment
from azureml.core.conda_dependencies import CondaDependencies

# Environment passed to the ParallelRunConfig for the forecasting step.
forecast_env = Environment(name="many_models_environment")

# Install scikit-learn under its real PyPI name: the 'sklearn' alias package is
# deprecated on PyPI and installing it now fails.
forecast_conda_deps = CondaDependencies.create(pip_packages=['scikit-learn'])
forecast_env.python.conda_dependencies = forecast_conda_deps

### 4.2 Choose a compute target

In [None]:
from azureml.core.compute import AmlCompute
# Name of the compute cluster in the workspace that runs both pipeline steps.
compute_name = "cpucluster"
compute = AmlCompute(ws, compute_name)

### 4.3 Set up ParallelRunConfig

In [None]:
from azureml.contrib.pipeline.steps import ParallelRunConfig 

# Sizing and timeout values passed to ParallelRunConfig below.
process_count_per_node = 8
node_count = 5
timeout = 500

# The same settings are kept as tags; they are attached to the run on submit.
tags = {
    'node_count': node_count,
    'process_count_per_node': process_count_per_node,
    'timeout': timeout,
}

parallel_run_config = ParallelRunConfig(
    # What to run
    source_directory='./scripts',
    entry_script='forecast.py',
    environment=forecast_env,
    # Where to run it
    compute_target=compute,
    node_count=node_count,
    process_count_per_node=process_count_per_node,
    # How to batch the input and treat failures/output
    mini_batch_size='1',
    run_invocation_timeout=timeout,
    error_threshold=10,
    output_action='append_row',
)

### 4.4 Set up ParallelRunStep

In [None]:
from azureml.pipeline.core import PipelineData
from azureml.contrib.pipeline.steps import ParallelRunStep 

# Intermediate pipeline output, stored on the workspace's default datastore.
output_dir = PipelineData(name='output_dir', datastore=dstore)

# Arguments forwarded to the step (flag followed by its value).
forecast_arguments = [
    '--forecast_horizon', 8,
    '--starting_date', '1992-10-01',
    '--target_column', 'Quantity',
    '--timestamp_column', 'WeekStarting',
    '--model_type', 'lr',
    '--date_freq', 'W-THU',
]

parallel_run_step = ParallelRunStep(
    name="many-models-forecasting",
    parallel_run_config=parallel_run_config,
    inputs=[small_dataset_input],
    output=output_dir,
    arguments=forecast_arguments,
    allow_reuse=False,
)

## 5.0 Create PythonScriptStep to copy predictions at the end of the pipeline

### 5.1 Create a data reference

In [None]:
from azureml.data.data_reference import DataReference

# Register a "predictions" blob container as a datastore, reusing the account
# name and key of the default datastore. The container is created if missing.
output_dstore = Datastore.register_azure_blob_container(
    workspace=ws,
    datastore_name="predictions",
    account_name=dstore.account_name,
    account_key=dstore.account_key,
    container_name="predictions",
    create_if_not_exists=True,
)

# Reference to the new datastore, used as an input of the copy step.
output_dref = DataReference(datastore=output_dstore)

### 5.2 Create PythonScriptStep to copy predictions

In [None]:
from azureml.pipeline.steps import PythonScriptStep

# Arguments for copy_predictions.py: the ParallelRunStep output to read and
# the data reference to copy it to.
copy_arguments = [
    '--parallel_run_step_output', output_dir,
    '--output_dir', output_dref,
]

upload_predictions_step = PythonScriptStep(
    name="copy_predictions",
    source_directory='./scripts',
    script_name="copy_predictions.py",
    arguments=copy_arguments,
    inputs=[output_dref, output_dir],
    compute_target=compute,
    allow_reuse=False,
)

## 6.0 Run the pipeline

In [None]:
from azureml.pipeline.core import Pipeline

# Forecasting step first, then the step that copies its predictions.
pipeline_steps = [parallel_run_step, upload_predictions_step]
pipeline = Pipeline(workspace=ws, steps=pipeline_steps)

# Submit under the experiment, attaching the run-configuration tags.
run = experiment.submit(pipeline, tags=tags)

In [None]:
# Wait for the submitted pipeline run to complete, showing its output in the notebook.
run.wait_for_completion(show_output=True)

## 7.0 Visualize the results

In [None]:
import os

def download_predictions(run, target_dir=None):
    """Download the forecasting step's output and return its local folder.

    Args:
        run: Pipeline run containing a step named "many-models-forecasting".
        target_dir: Local directory to download into. Defaults to the current
            working directory when omitted.

    Returns:
        The path ``<target_dir>/azureml/<step folder>/output_dir``.
    """
    # A None target used to crash in os.path.join below; fall back to the
    # current working directory so the default argument is actually usable.
    if target_dir is None:
        target_dir = os.getcwd()

    stitch_run = run.find_step_run("many-models-forecasting")[0]

    port_data = stitch_run.get_output_data("output_dir")
    print(port_data)
    port_data.download(target_dir, show_progress=True)

    # The returned path is built from the first entry found under 'azureml'.
    azureml_dir = os.path.join(target_dir, 'azureml')
    step_hash = os.listdir(azureml_dir)[0]
    return os.path.join(azureml_dir, step_hash, 'output_dir')

In [None]:
import pandas as pd

# Download the forecasting output first: `output_path` was previously used
# without ever being assigned. 'output' is the local download directory.
output_path = download_predictions(run, 'output')

files = os.listdir(output_path)

# NOTE(review): the first directory entry is skipped, as in the original code.
# Confirm which file this is meant to exclude; os.listdir order is arbitrary.
prediction_frames = [
    pd.read_csv(os.path.join(output_path, f)) for f in files[1:]
]

# Concatenate the per-file predictions and reduce WeekStarting to plain dates.
df = pd.concat(prediction_frames, ignore_index=True)
df.WeekStarting = pd.to_datetime(df.WeekStarting)
df.WeekStarting = df.WeekStarting.dt.date
df.head()

In [None]:
import seaborn as sns

# Violin plot of the predictions for each brand. `fig` holds the object
# returned by violinplot (a matplotlib Axes), which provides set_title.
fig = sns.violinplot(x='Brand', y='Predictions', data=df)
fig.set_title('Predictions by Brand')