In [1]:
# Pipeline for Batch Inferencing

In [2]:
from azureml.core import Workspace, Experiment, Datastore, Environment, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE
from azureml.pipeline.core import Pipeline, PipelineParameter, PipelineData
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import PipelineParameter, PipelineData, PipelineEndpoint
from azureml.data.output_dataset_config import OutputTabularDatasetConfig, OutputDatasetConfig, OutputFileDatasetConfig

# `from azureml.core import ...` does not bind the top-level name `azureml`,
# so import the package explicitly; without this the VERSION lookup below
# raises NameError on a freshly restarted kernel.
import azureml.core

# Load the workspace from the saved config file
ws = Workspace.from_config()
print('Ready to use Azure ML {} to work with {}'.format(azureml.core.VERSION, ws.name))

Ready to use Azure ML 1.38.0 to work with mm-hackathon-prep


In [3]:
import os

# The notebook's working directory and its own folder name.
directory_path = os.getcwd()
print("My current directory is : " + directory_path)
folder_name = os.path.basename(directory_path)
print("My directory name is : " + folder_name)

# The parent folder name is used as the user identifier.
parent = os.path.dirname(directory_path)
parent_folder_name = os.path.basename(parent)
print("My user directory name is: " + parent_folder_name)

# Short identifier used to prefix resources: strip '_' and '-', then keep at
# most the first 10 characters.
user = parent_folder_name.replace('_', '').replace('-', '')[:10]
print('user=' + user)

# NOTE: this name is not referenced by the later cells shown in this notebook
# (the run is submitted under user + '_batch_predictions').
experiment_name = parent_folder_name + '-inferencing Pipeline'

My current directory is : /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/Internal-hack-prep
My directory name is : Internal-hack-prep
My user directory name is: memasanz
user=memasanz


In [4]:
from azureml.core import Dataset

#use default datastore
default_ds = ws.get_default_datastore()

# Look the registered datasets up once rather than once per dataset.
registered_datasets = ws.datasets

# Upload and register each inference CSV (Day2_Inference1.csv, Day2_Inference2.csv)
# unless a dataset with the per-user name is already registered in the workspace.
for dataset_index in (1, 2):
    dataset_name = '{}-inference-dataset{}'.format(user, dataset_index)
    if dataset_name in registered_datasets:
        print('Dataset{} already registered.'.format(dataset_index))
        continue

    datastore_folder = '{}-inference{}/'.format(user, dataset_index)
    default_ds.upload_files(files=['./data/Day2_Inference{}.csv'.format(dataset_index)], # Upload the local inference csv from ./data
                        target_path=datastore_folder, # Put it in a folder path in the datastore
                        overwrite=True, # Replace existing files of the same name
                        show_progress=True)

    tab_data_set = Dataset.Tabular.from_delimited_files(path=(default_ds, datastore_folder + '*.csv'))

    # Register the tabular dataset
    try:
        tab_data_set = tab_data_set.register(workspace=ws,
                                name=dataset_name,
                                description='data {}'.format(dataset_index),
                                tags={'format': 'csv'},
                                create_new_version=True)
        print('Dataset registered.')
    except Exception as ex:
        # Best-effort: report the failure and carry on with the next dataset.
        print(ex)

Dataset1 already registered.
Dataset2 already registered.


In [5]:
import os
import shutil

# Folder (relative to the working directory) that holds the files written for
# the pipeline step; created here if it does not exist yet.
folder_name = 'batch-inferencing'
script_folder = os.path.join(
    os.getcwd(),
    folder_name,
)
print(script_folder)
os.makedirs(script_folder, exist_ok=True)

/mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/Internal-hack-prep/batch-inferencing


In [6]:
# Pipeline parameters. `model_name_parm` is passed to the scoring step as
# --model_name_parm; `user_param` is defined here but not passed to any step
# in the cells shown.
model_name_parm = PipelineParameter(
    name='model_name',
    default_value='titanic-model',
)
user_param = PipelineParameter(
    name='user_param',
    default_value=user,
)

In [7]:
%%writefile $script_folder/score_inferencing_data.py

from azureml.core import Run, Workspace, Datastore, Dataset
from azureml.core.model import Model
from azureml.data.datapath import DataPath
import pandas as pd
import os
import argparse
import joblib
import json
import joblib
import numpy as np
from azureml.core.model import Model
import time
import pandas as pd
import azureml.core
from azureml.core import Workspace, Dataset
import os
import math


# Parse input arguments.
# parse_known_args is used so that arguments this script does not declare
# (the step also passes --input-data1 / --input-data2) are ignored rather than
# rejected.
parser = argparse.ArgumentParser("Score Inferencing Data")
parser.add_argument('--model_name_parm', type=str, required=True)
parser.add_argument('--scored_dataset', dest='scored_dataset', required=True)

args, _ = parser.parse_known_args()
model_name = args.model_name_parm
scored_dataset = args.scored_dataset

# Get current run
current_run = Run.get_context()

# Get associated AML workspace
ws = current_run.experiment.workspace

# The two inputs are looked up by the names given to them in the pipeline step
# ('raw_data1' / 'raw_data2') and loaded into pandas.
inferencing_dataset1 = current_run.input_datasets['raw_data1']
df1 = inferencing_dataset1.to_pandas_dataframe()

inferencing_dataset2 = current_run.input_datasets['raw_data2']
df2 = inferencing_dataset2.to_pandas_dataframe()

print('model_name: ' + model_name)

# Get model from workspace - latest=True retrieves the latest version of the
# model; specific versions can be targeted instead.
model_list = Model.list(ws, name=model_name, latest=True)
if not model_list:
    # Fail with an actionable message instead of a bare IndexError on [0].
    raise ValueError(
        "No registered model named '{}' was found in workspace '{}'.".format(model_name, ws.name)
    )
model_path = model_list[0].download(exist_ok=True)
model = joblib.load(model_path)

print(df1.columns)
print(df2.columns)

# Combine the two inputs; only passengers present in both are kept.
df = df1.merge(df2, on='passenger_id', how='inner')

print('***************')
print(df.columns)
print('***************')

# Derived features:
#   loc       - first character of the cabin value, 'X' when the cabin is missing
#   hasFamily - True when sibsp or parch is greater than zero
df['loc'] = df['cabin'].apply(lambda x: x[0] if pd.notnull(x) else 'X')
df['hasFamily'] = (df['sibsp'] > 0) | (df['parch'] > 0)

# Only these columns are handed to the model.
cols_to_keep = ['pclass', 'sex', 'age', 'embarked', 'loc', 'hasFamily']
df_for_pred = df[cols_to_keep]

# Log the null count per feature before predicting.
print(df_for_pred.isnull().sum())
y_pred = model.predict(df_for_pred)

print('made predictions')

print(y_pred)

# Attach the predictions to the full merged frame (all columns are written out).
df['Predictions'] = y_pred

print(df.head(5))

# Save scored dataset: --scored_dataset is treated as a directory path and the
# result is written there as scored_data.csv without the index column.
os.makedirs(scored_dataset, exist_ok=True)
print(scored_dataset)

df.to_csv(os.path.join(scored_dataset, 'scored_data.csv'), index=False)

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/Internal-hack-prep/batch-inferencing/score_inferencing_data.py


In [8]:
# Build the dataset names from `user`, the same way they were registered,
# instead of hard-coding one particular user's prefix.
dataset1 = Dataset.get_by_name(ws, name=user + '-inference-dataset1')
dataset2 = Dataset.get_by_name(ws, name=user + '-inference-dataset2')

In [9]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Per-user cluster name, cut down to its last 16 characters.
# NOTE(review): presumably this is to satisfy a compute-name length limit — confirm.
cluster_name = user + "-cluster"
cluster_name = cluster_name[-16:]
print('trying to create: ' + cluster_name)

try:
    # Check for existing compute target
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    # If it doesn't already exist, create it
    try:
        compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS11_V2', max_nodes=2, idle_seconds_before_scaledown=1800)
        compute_target = ComputeTarget.create(ws, cluster_name, compute_config)
        compute_target.wait_for_completion(show_output=True)
    except Exception as ex:
        # Report the failure, then re-raise: swallowing it would leave
        # `compute_target` unbound (or stale from an earlier kernel state) and
        # the pipeline step definition would fail later with an unrelated error.
        print(ex)
        raise

trying to create: memasanz-cluster
Found existing cluster, use it.


In [10]:
%%writefile $script_folder/experiment_env.yml
# Conda environment specification for the pipeline step; the notebook loads
# this file into the step's run configuration.
name: experiment_env
dependencies:
  # The python interpreter version.
  # Currently Azure ML only supports 3.5.2 and later.
- python=3.6.2
  # The packages below are unpinned, so the resolved versions can differ
  # between environment builds.
  # NOTE(review): the scoring script loads a model with joblib; consider
  # pinning scikit-learn to the version the model was trained with — confirm.
- scikit-learn
- ipykernel
- matplotlib
- pandas
- pip
- pip:
  # Installed with pip rather than conda.
  - azureml-defaults
  - pyarrow

Overwriting /mnt/batch/tasks/shared/LS_root/mounts/clusters/memasanz4/code/Users/memasanz/Internal-hack-prep/batch-inferencing/experiment_env.yml


In [11]:
# Conda dependencies come from the experiment_env.yml file in the script folder.
conda_dependencies_file_path = os.path.join(script_folder, 'experiment_env.yml')
conda_dependencies = CondaDependencies(
    conda_dependencies_file_path=conda_dependencies_file_path,
)

# Run configuration for the step: default CPU base image plus the conda
# dependencies loaded above.
run_config = RunConfiguration()
run_config.environment.docker.base_image = DEFAULT_CPU_IMAGE
run_config.environment.python.conda_dependencies = conda_dependencies

In [12]:
inference_outputfile_name = 'titanic_batch_inference_dataset'
inference_dataset_name = 'titanic_batch_inference_dataset'

# Step output: written under the default datastore (the destination path
# carries a '{run-id}' placeholder), read back as delimited files and
# registered under `inference_dataset_name` when the run completes.
# NOTE(review): there is no separator between `user` and 'inferencing_dataset'
# in the destination path — confirm whether that is intended before changing
# it, since a change would move the output location.
scored_dataset = (
    OutputFileDatasetConfig(
        name=inference_outputfile_name,
        destination=(default_ds, user + 'inferencing_dataset/{run-id}'),
    )
    .read_delimited_files()
    .register_on_complete(name=inference_dataset_name)
)

# Single scoring step. The two registered datasets are attached as named
# inputs ('raw_data1' / 'raw_data2'), which is how the script looks them up.
score_data_step = PythonScriptStep(
    name='Get Inferencing Data',
    script_name='score_inferencing_data.py',
    arguments=[
        '--input-data1', dataset1.as_named_input('raw_data1'),
        '--input-data2', dataset2.as_named_input('raw_data2'),
        '--model_name_parm', model_name_parm,
        '--scored_dataset', scored_dataset
    ],
    outputs=[scored_dataset],
    compute_target=compute_target,
    # Use the path the script was actually written to. `folder_name` is
    # assigned two different meanings in this notebook, so relying on it here
    # depends on cell execution order.
    source_directory=script_folder,
    allow_reuse=False,
    runconfig=run_config
)

In [13]:
# Assemble the pipeline; it consists of the single scoring step.
pipeline = Pipeline(
    workspace=ws,
    steps=[score_data_step],
)

In [14]:
# Submit the pipeline under a per-user experiment and wait for it to finish,
# showing the run output while waiting.
experiment = Experiment(workspace=ws, name='{}_batch_predictions'.format(user))
run = experiment.submit(pipeline)
run.wait_for_completion(show_output=True)

Created step Get Inferencing Data [f22093ff][7c007db2-f127-4456-b4fc-b35ba1c7b26e], (This step will run and generate new outputs)
Submitted PipelineRun 3279b51e-e8a1-416e-84fd-a2fb48dc4ab8
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3279b51e-e8a1-416e-84fd-a2fb48dc4ab8?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cbbb627cf/resourcegroups/mm-hackathon-prep-rg/workspaces/mm-hackathon-prep&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRunId: 3279b51e-e8a1-416e-84fd-a2fb48dc4ab8
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3279b51e-e8a1-416e-84fd-a2fb48dc4ab8?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cbbb627cf/resourcegroups/mm-hackathon-prep-rg/workspaces/mm-hackathon-prep&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRun Status: Running


StepRunId: 3caecda9-2cb9-44dc-a8f4-5442f5752779
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/3caecda9-2cb9-44dc-a8f4-5442f5752779?wsid=/subscriptions/5da07161-3770-4a4b-aa43-418cb



PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '3279b51e-e8a1-416e-84fd-a2fb48dc4ab8', 'status': 'Completed', 'startTimeUtc': '2022-03-08T01:41:41.242658Z', 'endTimeUtc': '2022-03-08T01:42:15.308009Z', 'services': {}, 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': 'SDK', 'runType': 'SDK', 'azureml.parameters': '{"model_name":"titanic-model"}', 'azureml.continue_on_step_failure': 'False', 'azureml.pipelineComponent': 'pipelinerun'}, 'inputDatasets': [], 'outputDatasets': [], 'logFiles': {'logs/azureml/executionlogs.txt': 'https://mmhackathonpre4120422243.blob.core.windows.net/azureml/ExperimentRun/dcid.3279b51e-e8a1-416e-84fd-a2fb48dc4ab8/logs/azureml/executionlogs.txt?sv=2019-07-07&sr=b&sig=K4Eyr8qh7FW5PHXuGDWjQJmBqnRDymuXakBmTkXOmFY%3D&skoid=f676a930-738c-4b9f-82a1-19a06ad4180e&sktid=72f988bf-86f1-41af-91ab-2d7cd011db47&skt=2022-03-07T23%3A49%3A42Z&ske=2022-03-09T07%3A59%3A42Z&sks=b&skv=2019-07-07&st=2022-03-08T01%3A32%3A06Z&se=2022-03-

'Finished'

In [16]:
# Read back the dataset registered by the pipeline output. The name variable
# from the step definition is reused so this lookup cannot drift from it.
dataset = Dataset.get_by_name(ws, name=inference_dataset_name)
df = dataset.to_pandas_dataframe()

In [17]:
# Show a bounded preview rather than the whole scored frame.
df.head(10)

Unnamed: 0,passenger_id,fare,cabin,embarked,home.dest,survived,pclass,name,sex,age,sibsp,parch,ticket,loc,hasFamily,Predictions
0,283,153.4625,C91,S,"Winnipeg, MB",0.0,1.0,"Graham, Mr. George Edward",male,38.0,0.0,1.0,PC 17582,C,True,0
1,82,31.3875,,S,"Sweden Worcester, MA",0.0,3.0,"Asplund, Master. Carl Edgar",male,5.0,4.0,2.0,347077,X,True,0
2,753,15.0,,S,"North Evington, England",0.0,2.0,"Jarvis, Mr. John Denzil",male,47.0,0.0,0.0,237565,X,False,0
3,703,7.75,,Q,,0.0,3.0,"Mangan, Miss. Mary",female,30.5,0.0,0.0,364850,X,False,1
4,533,9.4833,,S,,0.0,3.0,"Larsson, Mr. August Viktor",male,29.0,0.0,0.0,7545,X,False,0
5,548,15.55,,S,,0.0,3.0,"Lindell, Mrs. Edvard Bengtsson (Elin Gerda Per...",female,30.0,1.0,0.0,349910,X,True,1
6,267,91.0792,B49,C,"Dowagiac, MI",1.0,1.0,"Bishop, Mrs. Dickinson H (Helen Walton)",female,19.0,1.0,0.0,11967,B,True,1
7,1149,17.4,,S,"Tampico, MT",1.0,3.0,"de Messemaeker, Mrs. Guillaume Joseph (Emma)",female,36.0,1.0,0.0,345572,X,True,0
8,442,8.05,,S,,0.0,3.0,"Nancarrow, Mr. William Henry",male,33.0,0.0,0.0,A./5. 3338,X,False,0
9,672,15.5792,,C,"New York, NY",0.0,2.0,"Mangiavacchi, Mr. Serafino Emilio",male,,0.0,0.0,SC/A.3 2861,X,False,0
