Initialization Steps

In [None]:
import azureml.core
from azureml.core import Workspace, Datastore, Experiment, Dataset
from azureml.data import OutputFileDatasetConfig
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget

# Report the installed azureml-core version so version-specific issues are easy to spot.
print("SDK version:", azureml.core.VERSION)

from azureml.pipeline.core import Pipeline
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core.graph import PipelineParameter

print("Pipeline SDK-specific imports completed")

# Build the workspace handle via Workspace.from_config(), then echo its
# identifying fields one per line.
ws = Workspace.from_config()
for workspace_field in (ws.name, ws.resource_group, ws.location, ws.subscription_id):
    print(workspace_field)


Define the datastore

In [None]:
# Look up the datastore registered under the name "workspaceblobstore".
def_blob_store = Datastore(workspace=ws, name="workspaceblobstore")
print(f"Blobstore's name is : {def_blob_store.name}")


Compute Targets

In [None]:
from azureml.core.compute_target import ComputeTargetException

# Reuse the "cpu-cluster" compute target when the lookup succeeds; if the
# lookup raises ComputeTargetException, provision a new cluster instead.
aml_compute_target = "cpu-cluster"
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
except ComputeTargetException:
    print("creating new compute target")

    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        min_nodes=1,
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    # Wait for provisioning to finish, with a 20-minute timeout and status output shown.
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)
else:
    print("found existing compute target.")

### Building Pipeline step 

Step 1 : Upload data to datastore

In [None]:
# Upload the local pickle file to the "20newsgroups" path of the datastore,
# overwriting any existing copy.
data_path = def_blob_store.upload_files(
    ["./20news.pkl"],
    target_path="20newsgroups",
    overwrite=True,
)


Step 2 : Reference the uploaded data using the `Dataset.File.from_files` method

In [None]:
# Wrap the uploaded data in a file dataset and give it the input name
# "Test_data"; the result is kept in blob_input_data.
blob_input_data = (
    Dataset.File
    .from_files(data_path)
    .as_named_input("Test_data")
)
print("Test dataset created.")

In [None]:
# Declare the first piece of intermediate data as an OutputFileDatasetConfig
# named "processed_data1".
processed_data1 = OutputFileDatasetConfig(
    name="processed_data1",
)
print("Output dataset object created.")

Define a Step that consumes a dataset and produces intermediate data.

In [None]:
# trainStep runs train.py from the publish_run_train folder. It receives the
# mounted input dataset via --input_data and writes to processed_data1 via
# --output_train.
source_directory = "publish_run_train"

train_arguments = [
    "--input_data", blob_input_data.as_mount(),
    "--output_train", processed_data1,
]
trainStep = PythonScriptStep(
    script_name="train.py",
    arguments=train_arguments,
    compute_target=aml_compute,
    source_directory=source_directory,
)

print("Train step is created.")

Define a Step that consumes intermediate data and produces intermediate data

In [None]:
# extractStep runs extract.py from the publish_run_extract folder. It takes
# processed_data1 (the trainStep output) as input and writes a second piece of
# intermediate data, processed_data2.
processed_data2 = OutputFileDatasetConfig(name="processed_data2")
source_directory = "publish_run_extract"

extract_arguments = [
    "--input_extract", processed_data1.as_input(),
    "--output_extract", processed_data2,
]
extractStep = PythonScriptStep(
    script_name="extract.py",
    arguments=extract_arguments,
    compute_target=aml_compute,
    source_directory=source_directory,
)
print("extractStep created")

#### PipelineParameter
This step also takes a PipelineParameter argument, which lets callers supply a value when invoking the REST endpoint of the published pipeline.


In [None]:
# Define a pipeline parameter named "pipeline_arg" with a default value of 10.
pipeline_param = PipelineParameter(
    name="pipeline_arg",
    default_value=10,
)
print("Pipeline parameter created.")

In [None]:
# compareStep runs compare.py from the publish_run_compare folder. It takes
# both intermediate outputs (processed_data1 and processed_data2) as inputs,
# writes processed_data3, and also receives the pipeline parameter.
processed_data3 = OutputFileDatasetConfig(name="processed_data3")

# Ask for the output to be registered as the dataset "compare_result" once the
# job completes.
processed_data3 = processed_data3.register_on_complete("compare_result")

source_directory = "publish_run_compare"

compare_arguments = [
    "--compare_data1", processed_data1.as_input(),
    "--compare_data2", processed_data2.as_input(),
    "--output_compare", processed_data3,
    "--pipeline_param", pipeline_param,
]
compareStep = PythonScriptStep(
    script_name="compare.py",
    arguments=compare_arguments,
    compute_target=aml_compute,
    source_directory=source_directory,
)
print("compareStep created")

### Build the pipeline

In [None]:
# Only compareStep is listed explicitly.
# NOTE(review): presumably trainStep and extractStep are pulled in through the
# data they produce for compareStep; confirm against the SDK documentation.
pipeline_steps = [compareStep]
pipeline1 = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built")

### Run published pipeline

Publish the pipeline


In [None]:
# Publish pipeline1 under the name "My_New_Pipeline", with
# continue_on_step_failure enabled.
published_pipeline1 = pipeline1.publish(
    name="My_New_Pipeline",
    description="My Published Pipeline Description",
    continue_on_step_failure=True,
)
# Leave the object as the last expression so the notebook displays it.
published_pipeline1

Publish the pipeline from a submitted PipelineRun.

It is also possible to publish a pipeline from a submitted PipelineRun, in two steps:

1. submit
2. publish



In [None]:
# Step 1: submit pipeline1 as a run of the 'Pipeline_experiment_sample' experiment.
sample_experiment = Experiment(ws, 'Pipeline_experiment_sample')
pipeline_run1 = sample_experiment.submit(pipeline1)

# Step 2: publish a pipeline from that submitted run.
published_pipeline2 = pipeline_run1.publish_pipeline(
    name="My_New_Pipeline2",
    description="My Published Pipeline Description",
    version="0.1",
    continue_on_step_failure=True,
)
# Leave the object as the last expression so the notebook displays it.
published_pipeline2

### Get published pipeline


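Retrieve a published pipeline by its ID.
To find the IDs of all published pipelines in the workspace, use the following (newer SDK releases deprecate `get_all` in favor of `PublishedPipeline.list(ws)`):
all_pub_pipelines = PublishedPipeline.get_all(ws)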

In [None]:
from azureml.pipeline.core import PublishedPipeline

# Take the id of the pipeline published above (substitute your own published
# pipeline id here if needed) and fetch the pipeline back from the workspace.
pipeline_id = published_pipeline1.id
published_pipeline = PublishedPipeline.get(
    ws,
    pipeline_id,
)
published_pipeline

#### Run published pipeline using its REST endpoint


In [None]:
from azureml.core.authentication import InteractiveLoginAuthentication
import requests

# Log in interactively and obtain the authentication header used for the REST call.
auth = InteractiveLoginAuthentication()
aad_token = auth.get_authentication_header()

rest_endpoint1 = published_pipeline.endpoint

print(f"You can perform HTTP POST on URL {rest_endpoint1} to trigger this pipeline")

# Request body: the experiment name, the run source, and a value of 45 for the
# "pipeline_arg" pipeline parameter.
request_body = {
    "ExperimentName": "My_Pipeline1",
    "RunSource": "SDK",
    "ParameterAssignments": {"pipeline_arg": 45},
}
response = requests.post(rest_endpoint1, headers=aad_token, json=request_body)

In [None]:
# Check the REST response: raise a descriptive error if the endpoint reported
# an HTTP failure, otherwise read the submitted run's id from the JSON body.
try:
    response.raise_for_status()
except Exception as http_error:
    # Format the message with rest_endpoint1, the variable that holds the URL
    # the request was sent to. The earlier reference to the undefined name
    # `rest_endpoint` raised NameError here and hid the real failure.
    # `from http_error` keeps the original exception in the traceback.
    raise Exception('Received bad response from the endpoint: {}\n'
                    'Response Code: {}\n'
                    'Headers: {}\n'
                    'Content: {}'.format(rest_endpoint1, response.status_code, response.headers, response.content)) from http_error

# 'Id' is read with .get(), so run_id is None when the key is absent.
run_id = response.json().get('Id')
print('Submitted pipeline run: ', run_id)