# Scoring Pipeline

This notebook builds a pipeline with a ParallelRunStep that scores all the models and writes the predictions to blob storage.

In [1]:
import os
import azureml.core
from azureml.core import Workspace, Experiment, Run, Datastore, Dataset
from azureml.core.compute import AmlCompute
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.core import Environment
from azureml.core.runconfig import CondaDependencies, DEFAULT_CPU_IMAGE
from azureml.contrib.pipeline.steps import ParallelRunStep, ParallelRunConfig
from azureml.core.model import Model
import joblib

## Set up the Workspace, Datastore, Experiment and Compute

As in the Training Pipeline notebook, we first connect to the Workspace and set up an Experiment. We also create variables for the datastore and the compute cluster.

In [None]:
# Connect to the Azure ML workspace used by this notebook
ws = Workspace(
    subscription_id="bbd86e7d-3602-4e6d-baa4-40ae2ad9303c",
    resource_group="ManyModelsSA",
    workspace_name="ManyModelsSAv1",
)
ws.get_details()

# Default datastore of the workspace and the existing 'cpu-cluster' compute target
dstore = ws.get_default_datastore()
compute = AmlCompute(workspace=ws, name='cpu-cluster')

# Experiment under which the scoring pipeline runs are recorded
experiment = Experiment(workspace=ws, name='scoring-pipeline-AP')

## Set up the Environment

In [None]:
# Set up the environment the scoring script runs in.
# NOTE: the PyPI distribution is named 'scikit-learn'; 'sklearn' is a deprecated
# alias whose installation is no longer supported, so request the real package.
batch_conda_deps = CondaDependencies.create(pip_packages=['scikit-learn', 'pmdarima'])

batch_env = Environment(name="manymodels_environment")
batch_env.python.conda_dependencies = batch_conda_deps
# Run inside Docker, on top of the default CPU base image
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE

In [None]:
# Look up the registered models that the scoring step needs
from azureml.core.model import Model

registered_model_names = [
    'arima_Store5_tropicana',
    'arima_Store2_dominicks',
    'arima_Store8_minute.maid',
]

# One Model handle per registered name, in the order listed above
model_list = [Model(ws, model_name) for model_name in registered_model_names]
model1, model2, model3 = model_list

In [None]:
# Sanity check: display the type of the object returned for the first model
type(model1)

In [None]:
# Input data: a file dataset pointing at one CSV on the default datastore
input_csv_path = '3modelsdata/Store2_dominicks.csv'
dataset1 = Dataset.File.from_files(path=(dstore, input_csv_path))

## Define the ParallelRunConfig

In [None]:
# Sizing and timeout settings shared by the run config and the run tags
nodecount = 1
workercount = 3
timeout = 3000

# Tags attached to the submitted run so its sizing can be read back later
tags1 = {
    'nodes': nodecount,
    'workers-per-node': workercount,
    'timeout': timeout,
}

# Configuration of the parallel run: which script to execute, where, and how
parallel_run_config = ParallelRunConfig(
    source_directory='./scripts',
    entry_script='score.py',
    mini_batch_size='1',
    run_invocation_timeout=timeout,
    error_threshold=10,
    output_action='summary_only',
    environment=batch_env,
    process_count_per_node=workercount,
    compute_target=compute,
    node_count=nodecount,
)

## Set up the ParallelRunStep

In [None]:
# Location on the datastore where the step writes its output
output_dir = PipelineData(name='scoringOutput',
                          datastore=dstore,
                          output_path_on_compute='scoringOutput/')

# score.py parses both --n_predictions and --model and passes args.model to
# Model.get_model_path(); without --model that value is None and scoring fails.
# NOTE(review): 'arima_Store2_dominicks' is chosen because the input dataset is
# Store2_dominicks.csv -- confirm this is the intended model.
# NOTE(review): the input is named 'store5' although the file is for Store2.
parallelrun_step = ParallelRunStep(
    name="many-models-scoring",
    parallel_run_config=parallel_run_config,
    inputs=[dataset1.as_named_input('store5')],  # must have at least one element
    output=output_dir,
    models=model_list,
    arguments=['--n_predictions', 8,
               '--model', 'arima_Store2_dominicks'],
    allow_reuse=False
)

## Submit and Run the Pipeline

In [None]:
# Wrap the single scoring step in a pipeline
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

# Submit the pipeline to the experiment, tagging the run with its sizing settings
run = experiment.submit(config=pipeline, tags=tags1)

## Scoring Script 

In [3]:
%%writefile ./scripts/score.py
import argparse
import datetime
# import datetime
import logging
import os
import pickle
import time
import uuid
from datetime import timedelta

import numpy as np
import pandas as pd
import pmdarima as pm
from azureml.core import Experiment, Workspace, Run
from azureml.core import ScriptRunConfig
from azureml.core.model import Model
from azureml.core.run import Run
from joblib import dump, load
from sklearn import metrics
# NOTE(review): sklearn.externals.joblib was removed in newer scikit-learn
# releases; confirm the installed version still provides it.
from sklearn.externals import joblib
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error 
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

from entry_script_helper import EntryScriptHelper

# Handle to the current run; run() creates child runs and logs values on it
thisrun = Run.get_context()

# Name of the logger that init() configures and run() fetches
LOG_NAME = "user_log"

print("Make predictions")

# Command-line arguments understood by this script
parser = argparse.ArgumentParser("split")
parser.add_argument(
    "--n_predictions",
    type=int,
    help="input number of predictions",
)
parser.add_argument(
    "--model",
    type=str,
    help="model name",
)

# parse_known_args() keeps going when unrecognised arguments are present;
# they are collected in `unknown` instead of aborting the script
args, unknown = parser.parse_known_args()

print("Argument 1(n_predictions): %s" % args.n_predictions)
print("Argument 2(model): %s" % args.model)

def mape_calc(actual, predicted):
    """Return the mean absolute percentage error (MAPE), in percent.

    Args:
        actual: Sequence of observed values.
        predicted: Sequence of predicted values, same length as ``actual``.

    Returns:
        The mean of ``|actual - predicted| / actual * 100`` over all elements.
        A zero in ``actual`` makes the corresponding term inf or nan.
    """
    # Coerce to float arrays so lists, tuples and Series are handled alike
    act = np.asarray(actual, dtype=float)
    pred = np.asarray(predicted, dtype=float)
    return np.mean(np.abs((act - pred) / act) * 100)

def get_accuracy_metrics(actual, predicted, print_values = True):
    """Compute MSE, RMSE, MAE and MAPE for a set of predictions.

    Args:
        actual: Observed values.
        predicted: Predicted values, aligned with ``actual``.
        print_values: When truthy, print each metric to stdout.

    Returns:
        A list ``[mse, rmse, mae, mape]``.
    """
    mse = mean_squared_error(actual, predicted)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(actual, predicted)
    mape = mape_calc(actual, predicted)

    if print_values:
        print('Accuracy Metrics')
        print('MSE: {}'.format(mse))
        print('RMSE: {}'.format(rmse))
        print('MAE: {}'.format(mae))
        print('MAPE: {}'.format(mape))

    # Returned as a literal so no local shadows the imported `metrics` module
    return [mse, rmse, mae, mape]


def init():
    """Configure the entry-script logger and log the derived output folder.

    The output folder is "temp/output" joined onto the value of the
    AZ_BATCHAI_INPUT_AZUREML environment variable (empty string when unset).
    It is only logged here; nothing is created on disk.
    """
    EntryScriptHelper().config(LOG_NAME)
    log = logging.getLogger(LOG_NAME)

    base_dir = os.getenv("AZ_BATCHAI_INPUT_AZUREML", "")
    output_folder = os.path.join(base_dir, "temp/output")

    log.info(f"{__file__}.output_folder:{output_folder}")
    log.info("init()")

def run(data):
    """Score every file in the mini-batch with the model named by --model.

    For each entry in ``data`` a child run is opened, the model is loaded,
    ``args.n_predictions`` forecasts are produced, and the forecasts are
    compared with the 'Quantity' column of the CSV at that path.

    Args:
        data: Iterable of file paths; each one is read with ``pd.read_csv``.

    Returns:
        A list with one ``True`` per processed file.
    """
    logger = logging.getLogger(LOG_NAME)
    os.makedirs('./outputs', exist_ok=True)
    resultList = []
    logger.info('making predictions...')
    print("ITERATING THROUGH MODELS")

    for file_path in data:
        # Unique name for the child run and for the values logged under it
        mname = 'arima' + str(uuid.uuid4())[0:16]

        with thisrun.child_run(name=mname) as childrun:
            for w in range(0, 5):
                thisrun.log(mname, str(w))
            date1 = datetime.datetime.now()
            logger.info('starting (' + str(file_path) + ') ' + str(date1))
            childrun.log(mname, 'starttime-' + str(date1))

            # 0. unpickle model
            model_path = Model.get_model_path(args.model)
            print(model_path)
            model = joblib.load(model_path)
            print("UNPICKELED THE MODEL")

            # 1. make predictions
            predictions, conf_int = model.predict(args.n_predictions, return_conf_int = True)
            print("MADE PREDICTIONS")
            print(predictions)

            # 2. Score predictions with test set
            test = pd.read_csv(file_path, header=0)
            # Log a preview of the loaded frame; `data` is the mini-batch
            # itself and has no .head()
            logger.info(test.head())

            # Must be the same column name that is read back just below.
            # NOTE(review): this assignment assumes the CSV has exactly
            # args.n_predictions rows -- confirm against the input data.
            test['Predictions'] = predictions

            # accuracy metrics
            accuracy_metrics = get_accuracy_metrics(test['Quantity'], test['Predictions'])
            print(accuracy_metrics)
            logger.info(accuracy_metrics)

            # 3. Save the output back to blob storage
            # TODO: not implemented yet

            date2 = datetime.datetime.now()
            logger.info('ending (' + str(file_path) + ') ' + str(date2))

            # log some metrics
            childrun.log(mname, 'endtime-' + str(date2))
            childrun.log(mname, 'auc-1')
        resultList.append(True)
    return resultList


Writing ./scripts/score.py
