# Scoring Pipeline

We take a "pipeline-first" approach, i.e. the scoring workflow is built as a production pipeline from the start.

In [1]:
import os

import pandas as pd

import azureml.core
from azureml.core import Workspace, Experiment, Run
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.widgets import RunDetails
from azureml.core import Workspace, Experiment, Datastore
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.widgets import RunDetails
from azureml.data.data_reference import DataReference
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule
from azureml.core.dataset import Dataset
from azureml.core.authentication import InteractiveLoginAuthentication
from azureml.core import Experiment
from azureml.core import Environment
from azureml.core.runconfig import CondaDependencies, DEFAULT_CPU_IMAGE
from azureml.contrib.pipeline.steps import ParallelRunStep, ParallelRunConfig

# Show the version of the installed Azure ML SDK (azureml.core.VERSION).
print("SDK version:", azureml.core.VERSION)

SDK version: 1.0.74


## Set up workspace, datastore, experiment and compute

In [73]:
# Workspace coordinates. Each one can be overridden with an environment variable,
# so the notebook can target another workspace without editing this cell; the
# defaults are the values this notebook originally hard-coded.
subscription_id = os.environ.get("AZUREML_SUBSCRIPTION_ID", "bbd86e7d-3602-4e6d-baa4-40ae2ad9303c")
resource_group = os.environ.get("AZUREML_RESOURCE_GROUP", "ManyModelsSA")
workspace_name = os.environ.get("AZUREML_WORKSPACE_NAME", "ManyModelsSAv1")

# set up workspace
# Alternatives: ws = Workspace.from_config(), or pass
# auth=InteractiveLoginAuthentication(force=True, tenant_id=...) to force an interactive login.
ws = Workspace(subscription_id=subscription_id, resource_group=resource_group, workspace_name=workspace_name)
# The returned details are not used here.
# NOTE(review): presumably kept as an early connectivity check — confirm.
ws.get_details()

# choose a compute target
compute = AmlCompute(ws, "train-many-model")

# choose a datastore
dstore = ws.get_default_datastore()

# choose an experiment
experiment = Experiment(ws, 'automl-ojforecasting')
print(dstore.name, dstore.datastore_type, dstore.account_name, dstore.container_name)

workspaceblobstore AzureBlob manymodelssav16457539585 azureml-blobstore-77752be6-01b4-4a3e-9d42-03c9c0d6248f


## Set up run configuration

Set up the run configuration (Docker base image and Python dependencies) that an experiment uses when it runs on a compute target in Azure Machine Learning.

In [3]:
# create a new runconfig object
run_config = RunConfiguration()
run_config.environment.docker.enabled = True
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
run_config.environment.python.conda_dependencies = CondaDependencies.create(conda_packages=['sklearn','pmdarima'])

## Read the registered dataset from Workspace

We used 12,222 datasets and ParallelRunStep to build 12,222 time-series ARIMA models to predict the quantity of each store brand.

In [5]:
# Look up the dataset registered in the workspace under the name 'Allfiledatasets'.
allfiledst = Dataset.get_by_name(ws, name='Allfiledatasets') 
# Wrap it as a named pipeline input; this variable is not used by the later cells shown here.
allfiledstinput = allfiledst.as_named_input('trainallmodels')

First, read a smaller registered dataset that contains only 3 files.

In [5]:
# Look up the dataset registered under the name '3modelsfiledataset'; it is the input of the scoring step below.
filedst3models = Dataset.get_by_name(ws, name='3modelsfiledataset')

In [7]:
%time files_list=filedst3models.to_path()

print("Files count: {}".format(len(files_list)))

# showing a few example paths
for i in range(min(5, len(files_list))):
    print("{}".format(files_list[i]))

CPU times: user 106 ms, sys: 10.9 ms, total: 117 ms
Wall time: 1.05 s
Files count: 3
/Store2_dominicks.csv
/Store5_tropicana.csv
/Store8_minute.maid.csv


In [19]:
# Expose the 3-file dataset to the pipeline step as the named input 'score3models'.
file3dstinput = filedst3models.as_named_input('score3models')

## Set up environment 

Environment defines a collection of resources that we will need to run our Azure pipelines.

In [9]:
# Python dependencies of the scoring script, installed with pip.
# 'scikit-learn' is the real distribution name; the 'sklearn' alias on PyPI is
# deprecated and no longer installs.
batch_conda_deps = CondaDependencies.create(pip_packages=['scikit-learn','pmdarima'])

# Environment used by the ParallelRunConfig below: the dependencies above on top
# of the default CPU Docker image.
batch_env = Environment(name="manymodels_environment")
batch_env.python.conda_dependencies = batch_conda_deps
batch_env.docker.enabled = True
batch_env.docker.base_image = DEFAULT_CPU_IMAGE

## Define ParallelRunConfig

In [147]:
# Sizing for the parallel run: worker processes per node, node count, and the
# per-invocation timeout passed to ParallelRunConfig.
workercount, nodecount, timeout = 3, 5, 3000

# Pipeline output on the chosen datastore; it is handed to the ParallelRunStep as its output.
output_dir = PipelineData(
    name="3ARIMAmodelsscore",
    datastore=dstore,
    output_name='3modelsscore',
    pipeline_output_name='3arimascore',
)


In [148]:
datasetname = 'store'

# Tags attached to the run when the pipeline is submitted; they record the sizing used.
tags1 = {
    'dataset': datasetname,
    'nodes': nodecount,
    'workers-per-node': workercount,
    'timeout': timeout,
}

# Configuration of the parallel run: which script to execute, in which
# environment, on which compute, and how the work is split and collected.
parallel_run_config = ParallelRunConfig(
    source_directory='./scripts',
    entry_script='score.py',
    environment=batch_env,
    compute_target=compute,
    node_count=nodecount,
    process_count_per_node=workercount,
    mini_batch_size="1",
    run_invocation_timeout=timeout,
    error_threshold=10,
    output_action="append_row",
)

## Set up ParallelRunStep

We added 3 arguments that users can customize based on the prediction goal.

In [150]:
# Single pipeline step that runs score.py over the 3-file dataset.
parallelrun_step = ParallelRunStep(
    name="many-models-scoring",
    parallel_run_config=parallel_run_config,
    inputs=[file3dstinput],
    output=output_dir,
    models=[],
    # These three options are the ones score.py declares with argparse.
    arguments=['--target_column','Quantity', '--n_test_periods',6, '--timestamp_column','WeekStarting'],
    # Always execute the step instead of reusing the output of an earlier run.
    allow_reuse=False
)

## Submit the pipeline to run

In [224]:
# Build a pipeline that contains only the parallel scoring step.
pipeline = Pipeline(workspace=ws, steps=[parallelrun_step])

# Submit the pipeline to the experiment, tagging the run with the sizing values in tags1.
run = experiment.submit(pipeline,tags=tags1)
# Optional notebook widget for following the run (disabled).
#RunDetails(run).show()

Created step many-models-scoring [2750c315][bde62caf-4e16-4418-ae76-ae12c6717259], (This step will run and generate new outputs)
Using data reference score3models_0 for StepId [adc4779d][a713f31d-49cc-4fb4-b0b2-8561d3bc61ce], (Consumers of this data are eligible to reuse prior runs.)
Submitted PipelineRun e22d0cc4-3538-4ad6-b926-20147cbdb6d2




Link to Azure Machine Learning studio: https://ml.azure.com/experiments/automl-ojforecasting/runs/e22d0cc4-3538-4ad6-b926-20147cbdb6d2?wsid=/subscriptions/bbd86e7d-3602-4e6d-baa4-40ae2ad9303c/resourcegroups/ManyModelsSA/workspaces/ManyModelsSAv1


In [225]:
# Block until the pipeline run finishes, streaming its log output into the notebook.
run.wait_for_completion(show_output=True)

PipelineRunId: e22d0cc4-3538-4ad6-b926-20147cbdb6d2
Link to Portal: https://ml.azure.com/experiments/automl-ojforecasting/runs/e22d0cc4-3538-4ad6-b926-20147cbdb6d2?wsid=/subscriptions/bbd86e7d-3602-4e6d-baa4-40ae2ad9303c/resourcegroups/ManyModelsSA/workspaces/ManyModelsSAv1
PipelineRun Status: NotStarted
PipelineRun Status: Running


StepRunId: 3374d17f-4a19-4e75-a751-579edf0519aa
Link to Portal: https://ml.azure.com/experiments/automl-ojforecasting/runs/3374d17f-4a19-4e75-a751-579edf0519aa?wsid=/subscriptions/bbd86e7d-3602-4e6d-baa4-40ae2ad9303c/resourcegroups/ManyModelsSA/workspaces/ManyModelsSAv1
StepRun( many-models-scoring ) Status: NotStarted
StepRun( many-models-scoring ) Status: Running

Streaming azureml-logs/55_azureml-execution-tvmps_8871c3cb012c9a026fee021d64b3ddf54c32b01cf412e073370019e2b556b4f3_d.txt
2019-12-07T03:29:58Z Starting output-watcher...
Login Succeeded
Using default tag: latest
latest: Pulling from azureml/azureml_b55ccba3e015ef6ca93b6a296bf3a2b2
a1298f4ce990: 


Streaming azureml-logs/75_job_post-tvmps_8871c3cb012c9a026fee021d64b3ddf54c32b01cf412e073370019e2b556b4f3_d.txt
bash: /azureml-envs/azureml_c46313de5fc6278ee028076ebb69e934/lib/libtinfo.so.5: no version information available (required by bash)

StepRun(many-models-scoring) Execution Summary
StepRun( many-models-scoring ) Status: Finished
{'runId': '3374d17f-4a19-4e75-a751-579edf0519aa', 'target': 'train-many-model', 'status': 'Completed', 'startTimeUtc': '2019-12-07T03:29:53.927499Z', 'endTimeUtc': '2019-12-07T03:31:51.02773Z', 'properties': {'azureml.runsource': 'azureml.StepRun', 'ContentSnapshotId': '3c1f626f-50b7-481f-ba5c-bfb340570ae9', 'StepType': 'PythonScriptStep', 'ComputeTargetType': 'AmlCompute', 'azureml.pipelinerunid': 'e22d0cc4-3538-4ad6-b926-20147cbdb6d2', '_azureml.ComputeTargetType': 'amlcompute', 'AzureML.DerivedImageName': 'azureml/azureml_b55ccba3e015ef6ca93b6a296bf3a2b2', 'ProcessInfoFile': 'azureml-logs/process_info.json', 'ProcessStatusFile': 'azureml-logs/proce



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': 'e22d0cc4-3538-4ad6-b926-20147cbdb6d2', 'status': 'Completed', 'startTimeUtc': '2019-12-07T03:25:11.433926Z', 'endTimeUtc': '2019-12-07T03:32:13.507678Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': None, 'runType': 'HTTP', 'azureml.parameters': '{"aml_process_count_per_node":"3","aml_node_count":"5"}'}, 'inputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://manymodelssav16457539585.blob.core.windows.net/azureml/ExperimentRun/dcid.e22d0cc4-3538-4ad6-b926-20147cbdb6d2/logs/azureml/executionlogs.txt?sv=2019-02-02&sr=b&sig=F7zF%2B0%2FzXk%2BOSzv1rp9soy0cTOylGHK7ZQhKvUSmLqk%3D&st=2019-12-07T03%3A22%3A16Z&se=2019-12-07T11%3A32%3A16Z&sp=r', 'logs/azureml/stderrlogs.txt': 'https://manymodelssav16457539585.blob.core.windows.net/azureml/ExperimentRun/dcid.e22d0cc4-3538-4ad6-b926-20147cbdb6d2/logs/azureml/stderrlogs.txt?sv=2019-02-02&sr=b&sig=gLWKE2C%2BPabFSVboUrgM0vr5wzyDXKR8H

'Finished'

In [226]:
# Display the pipeline run object (status and links) as the cell output.
run

Experiment,Id,Type,Status,Details Page,Docs Page
automl-ojforecasting,e22d0cc4-3538-4ad6-b926-20147cbdb6d2,azureml.PipelineRun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [227]:
# Take the first child run; the pipeline was built with a single step, the parallel scoring step.
prediction_run = next(run.get_children())
# "3modelsscore" is the output_name given to the PipelineData of that step.
prediction_output = prediction_run.get_output_data("3modelsscore")

In [228]:
# Display where the step output is stored (datastore, path, producing runs).
prediction_output

Name,Datastore,Path on Datastore,Produced By PipelineRun,Produced By StepRun
3modelsscore,workspaceblobstore,azureml/3374d17f-4a19-4e75-a751-579edf0519aa/3modelsscore,e22d0cc4-3538-4ad6-b926-20147cbdb6d2,3374d17f-4a19-4e75-a751-579edf0519aa


In [229]:
# Download the step output into the local folder "scoring_results".
prediction_output.download(local_path="scoring_results")

1

In [230]:
# Locate the ParallelRunStep result file inside the downloaded output folder.
# Start from None so that a value left over from an earlier execution of this
# cell is never reused when the folder contains no result file.
result_file = None
for root, _dirs, files in os.walk("scoring_results"):
    for file_name in files:
        if file_name.endswith('parallel_run_step.txt'):
            # If several files match, the last one visited wins (as before).
            result_file = os.path.join(root, file_name)
            print(result_file)

if result_file is None:
    print("No parallel_run_step.txt found under scoring_results; download the step output first.")

scoring_results/azureml/3374d17f-4a19-4e75-a751-579edf0519aa/3modelsscore/parallel_run_step.txt


In [232]:
# Load the step output: values are separated by single spaces and the file is
# read with header=None, so the column names are assigned explicitly.
# (pandas is imported as pd in the import cell at the top of the notebook.)
df = pd.read_csv(result_file, delimiter=" ", header=None)
df.columns=['Store','Brand','Quantity','Advert','Price','Revenue','Predictions','WeekStarting']
# Show a bounded preview instead of dumping the whole frame into the notebook.
df.head(10)

Unnamed: 0,Store,Brand,Quantity,Advert,Price,Revenue,Predictions,WeekStarting
0,2,dominicks,9024,0,1.19,10738.56,17544.331791,1992-08-27
1,2,dominicks,2048,0,2.09,4280.32,14856.337652,1992-09-03
2,2,dominicks,1984,0,2.09,4146.56,19966.683155,1992-09-10
3,2,dominicks,4160,0,1.77,7363.2,12821.308733,1992-09-17
4,2,dominicks,35264,0,1.49,52543.36,12909.109209,1992-09-24
5,2,dominicks,8640,0,1.82,15724.8,14056.553538,1992-10-01
6,8,minute.maid,18688,0,1.69,31582.72,26480.188275,1992-08-27
7,8,minute.maid,14656,0,1.69,24768.64,27021.250533,1992-09-03
8,8,minute.maid,30144,1,1.99,59986.56,26787.754776,1992-09-10
9,8,minute.maid,6208,0,2.49,15457.92,26606.842329,1992-09-17


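## Score script

Run this cell before submitting the pipeline: the ParallelRunStep executes `./scripts/score.py`, which this cell writes.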

In [223]:
%%writefile ./scripts/score.py

from azureml.core.run import Run
import pandas as pd
import os
import uuid
import argparse
import datetime

from azureml.core.model import Model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
import pickle
from azureml.core import Experiment, Workspace, Run
from azureml.core import ScriptRunConfig
from entry_script_helper import EntryScriptHelper
import logging

from sklearn.externals import joblib
from joblib import dump, load
import pmdarima as pm
import time
from datetime import timedelta

# Handle to the Azure ML run this script executes in; run() uses it to create child runs and log values.
thisrun = Run.get_context()

# Name of the logger that init() configures and run() writes to.
LOG_NAME = "user_log"

# NOTE(review): this banner and the parser name "split" mention splitting, while the
# functions below score data — presumably carried over from another script; confirm.
print("Split the data into train and test")

# Command-line options of the script.
# --target_column is parsed and echoed below but not read by init() or run().
parser = argparse.ArgumentParser("split")
parser.add_argument("--target_column", type=str, help="input target column")
parser.add_argument("--n_test_periods", type=int, help="input number of test periods")
parser.add_argument("--timestamp_column", type=str, help="input timestamp column")

# parse_known_args() accepts arguments this script does not declare; they are
# collected in `unknown`, which is not used afterwards.
args, unknown = parser.parse_known_args()

# Echo the parsed values to standard output.
print("Argument 1(n_test_periods): %s" % args.n_test_periods)
print("Argument 2(target_column): %s" % args.target_column)
print("Argument 3(timestamp_column): %s" % args.timestamp_column)

def init():
    """Configure the user logger and record the output folder location.

    Sets up logging through EntryScriptHelper under LOG_NAME, then logs the
    path built from the AZ_BATCHAI_INPUT_AZUREML environment variable (empty
    string when unset) joined with "temp/output", followed by an "init()" marker.

    Returns:
        None
    """
    EntryScriptHelper().config(LOG_NAME)
    log = logging.getLogger(LOG_NAME)
    batch_input_root = os.environ.get("AZ_BATCHAI_INPUT_AZUREML", "")
    output_folder = os.path.join(batch_input_root, "temp/output")
    log.info(f"{__file__}.output_folder:{output_folder}")
    log.info("init()")

def _model_name_for(csv_file_path):
    """Return the model name for one input file: 'arima_' + file name without its extension."""
    file_stem = os.path.splitext(os.path.basename(csv_file_path))[0]
    return 'arima_' + file_stem


def run(input_data):
    """Score every CSV file of one mini-batch with the ARIMA model named after it.

    For each path in ``input_data`` the function reads the CSV, loads the model
    called ``arima_<file name without extension>``, forecasts
    ``args.n_test_periods`` periods, and attaches the forecast to the rows whose
    timestamp is later than (latest timestamp - 7 * n_test_periods days).

    Args:
        input_data: iterable of CSV file paths that make up the mini-batch.

    Returns:
        pandas.DataFrame with the selected rows of every file, plus a
        'Predictions' column and a 'WeekStarting' column copied from the
        timestamp index. An empty DataFrame when ``input_data`` is empty.

    Raises:
        Whatever pandas raises when the number of selected rows differs from the
        number of forecast values (the column assignment requires equal lengths).
    """
    # 0. Set up logging
    logger = logging.getLogger(LOG_NAME)
    os.makedirs('./outputs', exist_ok=True)
    # Collect per-file frames and concatenate once at the end instead of
    # growing a DataFrame inside the loop.
    scored_frames = []
    logger.info('processing all files')

    for csv_file_path in input_data:
        mname = 'arima' + str(uuid.uuid4())[0:16]
        with thisrun.child_run(name=mname) as childrun:
            # NOTE(review): logs "0".."4" on the parent run for every file; looks like
            # leftover instrumentation — confirm before removing.
            for w in range(0, 5):
                thisrun.log(mname, str(w))
            date1 = datetime.datetime.now()
            logger.info('starting ('+csv_file_path+') ' + str(date1))
            childrun.log(mname, 'starttime-'+str(date1))

            # 1. Read in the data file
            data = pd.read_csv(csv_file_path, header=0)
            logger.info(data.head())

            # 2. Unpickle the model that belongs to THIS file. The name is derived
            #    from the current path, so mini-batches with several files pick the
            #    right model for each of them.
            model_name = _model_name_for(csv_file_path)
            model_path = Model.get_model_path(model_name)
            print(model_path)
            # NOTE(security): joblib.load unpickles the file; only load models from a trusted registry.
            model = joblib.load(model_path)
            print("UNPICKELED THE MODEL")

            # 3. Make predictions (the confidence interval is requested but not used)
            predictions, _conf_int = model.predict(args.n_test_periods, return_conf_int = True)
            print("MADE PREDICTIONS")
            print(predictions)

            # 4. Select the held-out rows. The index is converted to datetimes first so
            #    the latest date is a real date maximum rather than a string maximum.
            data = data.set_index(args.timestamp_column)
            data.index = pd.to_datetime(data.index)
            split_date = data.index.max() - timedelta(days=7*args.n_test_periods)
            test = data[data.index > split_date]

            testcp = test.copy()
            testcp['Predictions'] = predictions
            testcp['WeekStarting'] = testcp.index
            print(testcp)
            scored_frames.append(testcp)

            date2 = datetime.datetime.now()
            logger.info('ending ('+csv_file_path+') ' + str(date2))

            # 5. Log some metrics
            childrun.log(mname, 'endtime-'+str(date2))
            # NOTE(review): 'auc-1' is a fixed string, not a computed metric — confirm intent.
            childrun.log(mname, 'auc-1')

    if not scored_frames:
        # pd.concat rejects an empty list; keep returning an empty frame for an empty batch.
        return pd.DataFrame()

    allpredictions = pd.concat(scored_frames)
    print(allpredictions)
    return allpredictions

Overwriting ./scripts/score.py


## Next step