# Scoring Pipeline

This notebook creates a pipeline with a ParallelRunStep that scores all the models and writes the predictions to blob storage.

In [1]:
import os

import joblib
import pandas as pd

import azureml.core
from azureml.core import Workspace, Experiment, Run, Datastore, Dataset
from azureml.core import Environment
from azureml.core.compute import AmlCompute
from azureml.core.model import Model
from azureml.core.runconfig import CondaDependencies, DEFAULT_CPU_IMAGE
from azureml.contrib.pipeline.steps import ParallelRunStep, ParallelRunConfig
from azureml.pipeline.core import Pipeline, PipelineData

## Set up the Workspace, Datastore, Experiment and Compute

As we did in the Training Pipeline notebook, we need to call the Workspace and set up an Experiment. We also want to create variables for the datastore and compute cluster. 

In [2]:
# Connect to the Azure ML workspace used by this notebook.
ws = Workspace(
    subscription_id="bbd86e7d-3602-4e6d-baa4-40ae2ad9303c",
    resource_group="ManyModelsSA",
    workspace_name="ManyModelsSAv1",
)
# Fetch the workspace details; the return value is not used here.
ws.get_details()

# Handles to the workspace's default datastore and the named compute cluster.
dstore = ws.get_default_datastore()
compute = AmlCompute(workspace=ws, name='cpu-cluster')

# Experiment under which the scoring pipeline runs are recorded.
experiment = Experiment(workspace=ws, name='scoring-pipeline-AP')

## Set up the Environment

In [3]:
# Set up the environment the scoring workers run in.
# The PyPI distribution of scikit-learn is named 'scikit-learn'; the old
# 'sklearn' alias is deprecated and pip refuses to install it, which would
# break the environment build.
batch_conda_deps = CondaDependencies.create(pip_packages=['scikit-learn', 'pmdarima'])

batch_env = Environment(name="manymodels_environment")
batch_env.python.conda_dependencies = batch_conda_deps
# Run inside a Docker container based on the default CPU image.
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE

In [4]:
# Look up the registered models that the scoring step will use.
from azureml.core.model import Model

registered_model_names = [
    'arima_Store5_tropicana',
    'arima_Store2_dominicks',
    'arima_Store8_minute.maid',
]

# Resolve the names in order so model1..model3 keep their original meaning.
model_list = [Model(ws, registered_name) for registered_name in registered_model_names]
model1, model2, model3 = model_list

In [5]:
# Sanity check: show the type of the object returned by the model lookup.
type(model1)

azureml.core.model.Model

In [None]:
# One file dataset per store/brand CSV under '3modelsdata/' on the default datastore.
# (The PipelineData that used to be created in this cell was dead code: `output_dir`
# is reassigned in the ParallelRunStep cell before it is ever used.)
dataset1 = Dataset.File.from_files(path=(dstore, '3modelsdata/Store2_dominicks.csv'))
dataset2 = Dataset.File.from_files(path=(dstore, '3modelsdata/Store5_tropicana.csv'))
dataset3 = Dataset.File.from_files(path=(dstore, '3modelsdata/Store8_minute.maid.csv'))


## Define the ParallelRunConfig

In [12]:
# Scale and timeout settings for the parallel run.
workercount = 3   # worker processes per node (process_count_per_node)
nodecount = 1     # number of compute nodes (node_count)
timeout = 3000    # passed as run_invocation_timeout

# Tags attached to the submitted run so the settings above are recorded with it.
tags1 = {
    'nodes': nodecount,
    'workers-per-node': workercount,
    'timeout': timeout,
}

# Configuration for running scripts/score.py once per mini batch.
parallel_run_config = ParallelRunConfig(
    source_directory='./scripts',
    entry_script='score.py',
    compute_target=compute,
    environment=batch_env,
    node_count=nodecount,
    process_count_per_node=workercount,
    mini_batch_size='1',
    run_invocation_timeout=timeout,
    error_threshold=10,
    output_action='summary_only',
)

## Set up the ParallelRunStep

In [13]:
datasetname = 'store'
output_dir = PipelineData(name='scoringOutput',
                          datastore=dstore,
                          output_path_on_compute='scoringOutput/')

# Give every input its own name ('store1', 'store2', ...) instead of reusing
# one name for all three datasets.
scoring_inputs = [
    dataset.as_named_input(f"{datasetname}{position}")
    for position, dataset in enumerate([dataset1, dataset2, dataset3], start=1)
]

# score.py parses --n_test_set and --timestamp_column. The previous
# '--n_predictions' flag was not defined there, so both values arrived as None.
n_test_set = 6
# NOTE(review): the timestamp column name is not visible in this notebook;
# 'WeekStarting' is assumed -- confirm against the CSV header.
timestamp_column = 'WeekStarting'

parallelrun_step = ParallelRunStep(
    name="many-models-scoring",
    parallel_run_config=parallel_run_config,
    inputs=scoring_inputs,  # must have at least one element
    output=output_dir,
    models=model_list,  # this is just for logging
    arguments=['--n_test_set', n_test_set,
               '--timestamp_column', timestamp_column],
    allow_reuse=False
)

## Submit and Run the Pipeline

In [14]:
# Build a one-step pipeline around the scoring step.
scoring_steps = [parallelrun_step]
pipeline = Pipeline(workspace=ws, steps=scoring_steps)

# Submit it under the experiment, tagged with the scale/timeout settings.
run = experiment.submit(pipeline, tags=tags1)

Created step many-models-scoring [4e56035e][3ccd7e72-45fd-42dd-a81f-a099067e9b41], (This step will run and generate new outputs)
Using data reference store5_0 for StepId [512dd7d7][e79bdf2c-906d-4a00-a053-8d1dc436f342], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun 18c39a34-306b-4ab2-b756-e215c6657122




Link to Azure Machine Learning studio: https://ml.azure.com/experiments/scoring-pipeline-AP/runs/18c39a34-306b-4ab2-b756-e215c6657122?wsid=/subscriptions/bbd86e7d-3602-4e6d-baa4-40ae2ad9303c/resourcegroups/ManyModelsSA/workspaces/ManyModelsSAv1


## Review the Output from the Pipeline
Put the predictions back into blob storage

In [None]:
# Block until the pipeline has finished so its outputs exist before downloading.
run.wait_for_completion(show_output=False)

# The pipeline has a single step, so the first child run is the ParallelRunStep.
prediction_run = next(run.get_children())

# The output name must match the PipelineData passed to the step
# (name='scoringOutput'); "3models" is not an output of this pipeline.
prediction_output = prediction_run.get_output_data("scoringOutput")
prediction_output.download(local_path="training_results")

# Locate the aggregated results file inside the downloaded folder tree.
result_file = None
for root, dirs, files in os.walk("training_results"):
    for file in files:
        if file.endswith('parallel_run_step.txt'):
            result_file = os.path.join(root, file)

if result_file is None:
    raise FileNotFoundError(
        "No parallel_run_step.txt found under 'training_results'. That file is only "
        "produced when ParallelRunConfig uses output_action='append_row'; with "
        "'summary_only' the per-model CSVs are uploaded by score.py to "
        "'oj_predictions' on the default datastore."
    )

df = pd.read_csv(result_file, delimiter=" ", header=None)
df.head()

## Scoring Script 

In [11]:
%%writefile ./scripts/score.py
from azureml.core.run import Run
import pandas as pd
import os
import uuid
import argparse
import datetime
import numpy as np

from azureml.core.model import Model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle
from azureml.core import Experiment, Workspace, Run
from azureml.core import ScriptRunConfig
# import datetime
from entry_script_helper import EntryScriptHelper
import logging

# sklearn.externals.joblib was removed from scikit-learn (0.23+); importing it
# raises ImportError, so use the standalone joblib package instead.
import joblib
from joblib import dump, load
import pmdarima as pm
import time
from datetime import timedelta
from sklearn.metrics import mean_squared_error, mean_absolute_error 


# Run context of this process; run() creates one child run per input file under it.
thisrun = Run.get_context()

# Name of the logger configured in init() and used in run().
LOG_NAME = "user_log"

parser = argparse.ArgumentParser("split")
# '--n_predictions' is accepted as an alias so that callers using the older
# flag name still populate args.n_test_set.
parser.add_argument("--n_test_set", "--n_predictions", dest="n_test_set", type=int,
                    help="number of trailing periods to forecast and score")
parser.add_argument("--timestamp_column", type=str,
                    help="name of the timestamp column in the input CSV files")

# parse_known_args ignores any command-line arguments this script does not define.
args, unknown = parser.parse_known_args()

print("Argument 1(n_test_set): %s" % args.n_test_set)
print("Argument 2(timestamp_column): %s" % args.timestamp_column)


def init():
    """Configure the user logger and log the output folder path and an init marker."""
    EntryScriptHelper().config(LOG_NAME)
    log = logging.getLogger(LOG_NAME)

    # The folder is only logged here; nothing is created or written.
    input_root = os.environ.get("AZ_BATCHAI_INPUT_AZUREML", "")
    output_folder = os.path.join(input_root, "temp/output")

    log.info(f"{__file__}.output_folder:{output_folder}")
    log.info("init()")

def _model_name_for(file_path):
    """Return the registered model name for an input CSV path.

    Example: '/mnt/data/Store2_dominicks.csv' -> 'arima_Store2_dominicks'.
    """
    stem = os.path.splitext(os.path.basename(str(file_path)))[0]
    return 'arima_' + stem


def run(data):
    """Score every CSV file in the mini batch with its registered ARIMA model.

    For each file the matching model is loaded, ``args.n_test_set`` periods are
    forecast, the forecast is compared with the last ``args.n_test_set`` weeks
    of the file, and the scored rows are written to ``./outputs`` and uploaded
    to ``oj_predictions`` on the workspace's default datastore.

    Args:
        data: iterable of CSV file paths that make up the mini batch.

    Returns:
        pandas.DataFrame with the test rows of every file plus a
        ``Predictions`` column (empty when the mini batch is empty).

    Raises:
        ValueError: if --n_test_set or --timestamp_column was not supplied.
    """
    print("begin run ")
    logger = logging.getLogger(LOG_NAME)
    os.makedirs('./outputs', exist_ok=True)

    # Fail early with a clear message instead of a TypeError/KeyError further down.
    if args.n_test_set is None or args.timestamp_column is None:
        raise ValueError("--n_test_set and --timestamp_column must both be supplied")

    scored_frames = []
    scored_rows = 0

    logger.info('making predictions...')

    for file in data:
        mname = 'arima' + str(uuid.uuid4())[0:16]

        with thisrun.child_run(name=mname) as childrun:
            date1 = datetime.datetime.now()
            logger.info('starting (' + str(file) + ') ' + str(date1))
            childrun.log(mname, 'starttime-' + str(date1))

            # 0. Unpickle the model. The name is derived from the current file,
            #    not from the whole mini batch, so batches with several files work.
            model_name = _model_name_for(file)
            model_path = Model.get_model_path(model_name)
            print(model_name)
            # NOTE(review): joblib.load unpickles the file and can execute
            # arbitrary code; only load models from a trusted registry.
            model = joblib.load(model_path)
            print("UNPICKELED THE MODEL")

            # 1. Make predictions
            prediction_list, conf_int = model.predict(args.n_test_set, return_conf_int=True)
            print("MADE PREDICTIONS")
            print(prediction_list)

            # 2. Score predictions against the held-out tail of the series
            series = pd.read_csv(file, header=0)
            logger.info(series.head())

            series = series.set_index(args.timestamp_column)
            series.index = pd.to_datetime(series.index)
            # The test window is the last n_test_set weeks of the series.
            split_date = series.index.max() - timedelta(days=7 * args.n_test_set)
            # .copy() so the new column is added to an independent frame,
            # not to a slice of `series`.
            test = series[series.index > split_date].copy()
            # Assign by position; a forecast carrying its own index would
            # otherwise be aligned on labels.
            test['Predictions'] = np.asarray(prediction_list)
            print(test.head())

            actual = test['Quantity'].to_numpy()
            predicted = test['Predictions'].to_numpy()
            mse = mean_squared_error(actual, predicted)
            rmse = np.sqrt(mse)
            mae = mean_absolute_error(actual, predicted)
            # MAPE is inf/nan when an actual quantity is 0.
            mape = np.mean(np.abs((actual - predicted) / actual) * 100)

            scores = {'mse': mse, 'rmse': rmse, 'mae': mae, 'mape': mape}
            print(scores)

            # 3. Save the output back to blob storage
            output_path = os.path.join('./outputs/', model_name) + '.csv'
            test.to_csv(path_or_buf=output_path, index=False)

            dstore = childrun.experiment.workspace.get_default_datastore()
            print(dstore)
            dstore.upload_files([output_path], target_path='oj_predictions',
                                overwrite=False, show_progress=True)

            date2 = datetime.datetime.now()
            logger.info('ending (' + str(file) + ') ' + str(date2))

            # Log the end time and the accuracy metrics on the child run.
            childrun.log(mname, 'endtime-' + str(date2))
            for metric_name, metric_value in scores.items():
                childrun.log(metric_name, float(metric_value))

        scored_frames.append(test)
        scored_rows += len(test)
        print(scored_rows)

    # Concatenate once at the end; DataFrame.append was removed in pandas 2.0.
    if not scored_frames:
        return pd.DataFrame()
    return pd.concat(scored_frames)

Overwriting ./scripts/score.py
