# Do not start this notebook until the iJungle training pipeline created in the previous step has reached the "Completed" status.

# iJungle Inference pipeline

In [1]:
from azureml.core import Workspace, Environment, Experiment, ScriptRunConfig
from azureml.core.compute import ComputeTarget
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration
from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep, ParallelRunConfig, ParallelRunStep
from azureml.data import OutputFileDatasetConfig

In [2]:
# --- Azure ML resources -------------------------------------------------
# Compute cluster every pipeline step runs on.
cluster_name = "cluster4"
# Name under which the conda environment is registered in the workspace.
environment_name = "ijungle-inference-env"
# Registered dataset fed to the data preparation step.
input_dataset_name = "ijungle-test-dataset"

# --- Storage ------------------------------------------------------------
# Datastore holding intermediate step outputs.
working_datastore_name = "workspaceblobstore"
# Datastore and folder receiving the explainability step output.
output_datastore_name = "workspaceblobstore"
output_path = "iJungle/results/"

# --- Experiment ---------------------------------------------------------
# Experiment name the pipeline run is submitted under.
pipeline_name = "ijungle-inference-pipeline"

# --- Script parameters --------------------------------------------------
# Forwarded to the scripts as --index-feature / --feat-id / --index-id.
index_feature = "index"
# Forwarded to interpret.py as --anomaly-score.
anomaly_score = -0.8

In [3]:
# Connect to the workspace and look up the compute target named in the
# configuration cell.
ws = Workspace.from_config()
pipeline_cluster = ComputeTarget(name=cluster_name, workspace=ws)
print("Cluster configured to execute the pipeline:", cluster_name)

Cluster configured to execute the pipeline: cluster4


In [4]:
# Python dependencies needed by the pipeline scripts.
packages = CondaDependencies.create(
    conda_packages=["pip"],
    pip_packages=[
        "azureml-defaults",
        "azureml-interpret",
        "scikit-learn",
        "pandas",
        "pyarrow",
    ],
)

# Build the environment, attach the dependencies and register it in the
# workspace, then read the registered definition back.
new_env = Environment(environment_name)
new_env.python.conda_dependencies = packages
new_env.register(workspace=ws)
registered_env = Environment.get(ws, environment_name)

# Run configuration shared by every step: the cluster to run on and the
# registered environment to run in.
pipeline_run_config = RunConfiguration()
pipeline_run_config.target = pipeline_cluster
pipeline_run_config.environment = registered_env

print("Run configuration created.")


Run configuration created.


In [5]:
# Get the inference dataset registered in the workspace.
inference_ds = ws.datasets.get(input_dataset_name)
if inference_ds is None:
    # ws.datasets is a dict, so .get() returns None for an unknown name.
    # Fail here with a clear message instead of an AttributeError on
    # `inference_ds.as_named_input` further down.
    raise ValueError(
        f"Dataset '{input_dataset_name}' is not registered in workspace '{ws.name}'."
    )

# Intermediate data: output of the data preparation step, uploaded to the
# working datastore under a per-run folder.
dataprep_output = OutputFileDatasetConfig(
    name="processed_data",
    destination=(
        ws.datastores.get(working_datastore_name),
        "invoices/{run-id}/{output-name}")
).as_upload()

# Step 1: run the data preparation script in inference mode (--training False).
prep_step = PythonScriptStep(
    name = "Inference data preparation Step",
    source_directory = "../scripts",
    script_name = "feat_eng.py",
    arguments = [
        '--input-data', inference_ds.as_named_input('input'),
        '--prepped-data', dataprep_output,
        '--index-feature', index_feature,
        '--training', 'False',
    ],
    outputs=[dataprep_output],
    compute_target = pipeline_cluster,
    runconfig = pipeline_run_config,
    allow_reuse = False
)

# Initial definition of the pipeline steps; later cells append to this list.
pipeline_steps = [prep_step]


In [6]:
# Next step: run the inferencing script on the prepared data.

# Consume the data preparation output as parquet files, exposed to the
# script under the input name "inference_data".
dataprep_input = dataprep_output.read_parquet_files().as_input("inference_data")

# Inference results are uploaded to the working datastore under a per-run folder.
inference_output_dir = OutputFileDatasetConfig(
    name="inference_output",
    destination=(
        ws.datastores.get(working_datastore_name),
        "invoices/{run-id}/{output-name}")
).as_upload()

inference_step = PythonScriptStep(
    name = "Inference Step",
    source_directory = "../scripts",
    script_name = "inference.py",
    arguments = [
        '--input', dataprep_input,
        '--output', inference_output_dir,
        '--feat-id', index_feature
    ],
    inputs=[dataprep_input],
    outputs=[inference_output_dir],
    compute_target = pipeline_cluster,
    runconfig = pipeline_run_config,
    allow_reuse = False
)

pipeline_steps.append(inference_step)


In [7]:
# Explainability step: combines the inference results with the prepared data.

# Inference output, consumed as parquet files under the input name
# "interpret_input".
interpret_input = inference_output_dir.read_parquet_files().as_input("interpret_input")

# Final results are uploaded to the configured output datastore and path.
results_datastore = ws.datastores.get(output_datastore_name)
interpret_output_dir = OutputFileDatasetConfig(
    name="interpret_output",
    destination=(results_datastore, output_path),
).as_upload()

# Command line handed to interpret.py.
interpret_arguments = [
    "--input", interpret_input,
    "--dataprep", dataprep_input,
    "--output", interpret_output_dir,
    "--index-id", index_feature,
    "--anomaly-score", anomaly_score,
]

interpret_step = PythonScriptStep(
    name="Explainability Step",
    source_directory="../scripts",
    script_name="interpret.py",
    arguments=interpret_arguments,
    inputs=[interpret_input, dataprep_input],
    outputs=[interpret_output_dir],
    compute_target=pipeline_cluster,
    runconfig=pipeline_run_config,
    allow_reuse=False,
)
pipeline_steps.append(interpret_step)

NameError: name 'dataprep_output_outliers' is not defined

In [None]:
# Assemble the pipeline from the steps collected in the previous cells.
pipeline = Pipeline(steps=pipeline_steps, workspace=ws)
print("Pipeline is built.")

# Submit the pipeline under the configured experiment name, forcing all
# step outputs to be regenerated.
experiment = Experiment(name=pipeline_name, workspace=ws)
pipeline_run = experiment.submit(pipeline, regenerate_outputs=True)
print("Pipeline submitted for execution.")
