# Forecasting Pipeline

In [1]:
# TODO: add an overview of the forecasting pipeline

# Prerequisites 

This example runs on an Azure Machine Learning Notebook VM. It calls models that have already been trained and registered in the Workspace. If you have already run the Environment Setup and Training Pipeline notebooks, or you have an AML Notebook VM with models registered in the Workspace, you are all set.

In [None]:
from azureml.core import Workspace 

# Load the workspace via Workspace.from_config() and echo its identifying
# details, one per line, as a quick sanity check.
ws = Workspace.from_config()

print('\n'.join([
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]))

In [None]:
# Default datastore of the workspace; passed further down as the datastore
# for the pipeline's PipelineData output.
dstore = ws.get_default_datastore()

In [None]:
from azureml.core.compute import AmlCompute, ComputeTarget

# Handle to the compute target named 'many-models'.
# NOTE(review): `compute` is reassigned to the 'train-max' target in the
# ParallelRunConfig cells below, so this handle is not the one passed to the
# config -- confirm which cluster is intended.
compute = AmlCompute(ws, 'many-models')

In [None]:
from azureml.core.dataset import Dataset

ds_name = 'oj_data_100100100' # this FDS has 2,000 files in it

stores_FDS = Dataset.get_by_name(ws, name=ds_name)

# Subset the data with .take_sample().
# The first argument is the proportion of the data you want to use; the seed
# is fixed so that re-running the notebook scores the same subset of files
# instead of a fresh random draw each time.
stores_FDS_subset = stores_FDS.take_sample(0.01, seed=42)

# Name the subset so it can be passed as an input to the ParallelRunStep.
stores_input = stores_FDS_subset.as_named_input('subset_stores')

In [None]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.conda_dependencies import CondaDependencies

# set up the batch environment settings
# The PyPI distribution is named 'scikit-learn'; 'sklearn' is only a
# deprecated placeholder name and should not be used as an install target.
batch_conda_deps = CondaDependencies.create(pip_packages=['scikit-learn', 'pmdarima'])

# Docker-based environment on the default CPU image, carrying the
# dependencies declared above.
batch_env = Environment(name="manymodels_environment")
batch_env.python.conda_dependencies = batch_conda_deps
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE

In [None]:
from azureml.contrib.pipeline.steps import ParallelRunStep, ParallelRunConfig 

# Sizing for the parallel run: worker processes per node, number of nodes,
# and the value handed to run_invocation_timeout.
workercount, nodecount, timeout = 5, 8, 3000

compute = AmlCompute(ws, "train-max")

# Keep the sizing choices together in a tag dictionary.
tags1 = {
    'nodes': nodecount,
    'workers-per-node': workercount,
    'timeout': timeout,
}

# Configuration that runs ./scripts/score.py.
# A later cell rebuilds `parallel_run_config` with 'forecasting.py' as the
# entry script.
parallel_run_config = ParallelRunConfig(
    # what to run
    source_directory='./scripts',
    entry_script='score.py',
    environment=batch_env,
    # where to run it
    compute_target=compute,
    node_count=nodecount,
    process_count_per_node=workercount,
    # batching, limits and output handling
    mini_batch_size='1',
    run_invocation_timeout=timeout,
    error_threshold=10,
    output_action='summary_only',
)

In [None]:
from azureml.core import Environment
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.core.conda_dependencies import CondaDependencies

# set up the batch environment settings
# The PyPI distribution is named 'scikit-learn'; 'sklearn' is only a
# deprecated placeholder name and should not be used as an install target.
batch_conda_deps = CondaDependencies.create(pip_packages=['scikit-learn', 'pmdarima'])

# Docker-based environment on the default CPU image, carrying the
# dependencies declared above.
batch_env = Environment(name="manymodels_environment")
batch_env.python.conda_dependencies = batch_conda_deps
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE

In [None]:
from azureml.contrib.pipeline.steps import ParallelRunStep, ParallelRunConfig 

# Sizing for the parallel run: worker processes per node, number of nodes,
# and the value handed to run_invocation_timeout.
workercount, nodecount, timeout = 5, 8, 3000

compute = AmlCompute(ws, "train-max")

# Keep the sizing choices together in a tag dictionary.
tags1 = {
    'nodes': nodecount,
    'workers-per-node': workercount,
    'timeout': timeout,
}

# Configuration that runs ./scripts/forecasting.py, the entry script written
# by the %%writefile cell in the "Forecasting Script" section.
parallel_run_config = ParallelRunConfig(
    # what to run
    source_directory='./scripts',
    entry_script='forecasting.py',
    environment=batch_env,
    # where to run it
    compute_target=compute,
    node_count=nodecount,
    process_count_per_node=workercount,
    # batching, limits and output handling
    mini_batch_size='1',
    run_invocation_timeout=timeout,
    error_threshold=10,
    output_action='summary_only',
)

In [None]:
# PipelineData is used below but is not imported by any earlier cell; import
# it here so the cell runs on a fresh kernel.
from azureml.pipeline.core import PipelineData

# Note the inputs are set up for running 3 models currently. 
datasetname = 'store'

# Intermediate pipeline output stored on the workspace's default datastore.
output_dir = PipelineData(name = 'scoringOutput', 
                         datastore = dstore, 
                         output_path_on_compute = 'scoringOutput/')

# Pipeline step that fans the sampled store files out to the entry script.
# The `arguments` list is forwarded to the script, which parses
# --forecast_horizon and --starting_date with argparse.
parallelrun_step = ParallelRunStep(
    name="many-models-scoring",
    parallel_run_config=parallel_run_config,
    inputs=[stores_input],  
    output=output_dir,
    models= [], # this is just for logging
    arguments=['--forecast_horizon', 8,
              '--starting_date', '1992-10-01'],
    allow_reuse = False
)

## Forecasting Script

In [None]:
%%writefile ./scripts/forecasting.py

import pandas as pd
import os
import uuid
import argparse
import datetime
import numpy as np
from sklearn.externals import joblib
from joblib import dump, load
import pmdarima as pm
import time
from datetime import timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error 
import pickle
import logging 

# Import the AzureML packages 
from azureml.core.model import Model
from azureml.core import Experiment, Workspace, Run
from azureml.core import ScriptRunConfig
from azureml.core.run import Run

# Import the helper script 
from entry_script_helper import EntryScriptHelper


# Get the information for the current Run
# Kept at module level: run() logs per-model values against it and reaches
# the workspace through thisrun.experiment.workspace.
thisrun = Run.get_context()

# Set the log file name
# Shared by init() and run(): passed to EntryScriptHelper().config() and to
# logging.getLogger().
LOG_NAME = "user_log"

# Read the options supplied through the PipelineStep's `arguments` list.
# parse_known_args() is used so that any extra, unrecognised arguments on the
# command line are collected in `unknown` instead of raising an error.
parser = argparse.ArgumentParser(prog="split")
parser.add_argument(
    "--forecast_horizon",
    help="input number of predictions",
    type=int,
)
parser.add_argument(
    "--starting_date",
    help="date to begin forcasting",
    type=str,
)

parsed_pair = parser.parse_known_args()
args, unknown = parsed_pair

# Echo the parsed values for the run's stdout log.
print("Argument 1(forecast_horizon): %s" % args.forecast_horizon)
print("Argument 2(starting_date): %s" % args.starting_date)


def init():
    """Configure logging for this entry script and log the output folder path.

    The folder is built from the AZ_BATCHAI_INPUT_AZUREML environment variable
    (empty string when unset) joined with "temp/output"; it is only logged
    here, not created.
    """
    EntryScriptHelper().config(LOG_NAME)
    logger = logging.getLogger(LOG_NAME)

    base_dir = os.environ.get("AZ_BATCHAI_INPUT_AZUREML", "")
    output_folder = os.path.join(base_dir, "temp/output")

    logger.info(f"{__file__}.output_folder:{output_folder}")
    logger.info("init()")

def run(input_data):
    """Forecast with the registered ARIMA model for every file in the mini-batch.

    For each input file the function derives the store and brand from the file
    name, loads the registered model named 'arima_<file stem>', predicts
    ``args.forecast_horizon`` periods starting at ``args.starting_date``,
    writes the predictions to ./outputs as CSV and uploads that CSV to the
    workspace's default datastore.

    Parameters
    ----------
    input_data : sequence
        Items of the mini-batch; each is converted with ``str()`` and treated
        as a '/'-separated file path.
        Assumes file names look like '<store>_..._<brand>' followed by a
        4-character extension such as '.csv' -- TODO confirm against the
        registered dataset.

    Returns
    -------
    list of list
        One entry per processed file: [file stem, model name, start time,
        end time, elapsed timedelta, index in batch, batch size, run status].
    """
    print("begin run ")

    # 0. Set up Logging
    logger = logging.getLogger(LOG_NAME)
    os.makedirs('./outputs', exist_ok=True)
    resultsList = []
    logger.info('making predictions...')

    # The workspace and its default datastore do not change between files,
    # so look them up once instead of once per iteration.
    ws1 = thisrun.experiment.workspace
    dstore = ws1.get_default_datastore()

    print('looping through data')
    # 1. Loop through the input data
    for idx, file in enumerate(input_data):
        file_path = str(file)
        mname = 'arima' + str(uuid.uuid4())[0:16]

        date1 = datetime.datetime.now()
        logger.info('starting (' + file_path + ') ' + str(date1))
        thisrun.log(mname, 'starttime-' + str(date1))

        # 2. Set up data to predict on
        # File name without directory and without its last four characters
        # (the extension); store is the first '_' token, brand the last.
        file_stem = file_path.split('/')[-1][:-4]
        name_parts = file_stem.split('_')
        store = name_parts[0]
        brand = name_parts[-1]
        date_list = pd.date_range(args.starting_date,
                                  periods=args.forecast_horizon,
                                  freq='W-THU')

        # Scalars are broadcast to the length of date_list.
        prediction_df = pd.DataFrame({'WeekStarting': date_list,
                                      'Store': store,
                                      'Brand': brand})

        # 3. Unpickle Model and Make Predictions
        # NOTE(review): `joblib` is bound at the top of this script by
        # `from sklearn.externals import joblib`, which newer scikit-learn
        # releases no longer provide -- confirm the environment's version.
        model_name = 'arima_' + file_stem
        model_path = Model.get_model_path(model_name)
        model = joblib.load(model_path)

        prediction_list, _conf_int = model.predict(args.forecast_horizon,
                                                   return_conf_int=True)

        # np.asarray drops any index on the returned predictions so they are
        # assigned positionally rather than aligned by label.
        prediction_df['Predictions'] = np.asarray(prediction_list)

        # 4. Save the output back to blob storage
        run_date = datetime.datetime.now().date()
        output_file = os.path.join('./outputs/', model_name + str(run_date)) + '.csv'
        prediction_df.to_csv(path_or_buf=output_file, index=False)
        dstore.upload_files([output_file],
                            target_path='oj_forecasts' + str(run_date),
                            overwrite=False,
                            show_progress=True)

        # 5. Append the predictions to return a dataframe if desired

        # 6. Log Metrics
        date2 = datetime.datetime.now()
        logger.info('ending (' + file_path + ') ' + str(date2))

        logs = [
            file_stem,
            model_name,
            str(date1),
            str(date2),
            date2 - date1,
            idx,
            len(input_data),
            thisrun.get_status(),
        ]

        thisrun.log(mname, 'endtime-' + str(date2))
        thisrun.log(mname, 'auc-1')

        # Record every file's entry (not just the last one) so the returned
        # list has one item per processed input.
        resultsList.append(logs)

    return resultsList