run an already published pipeline or a pipeline endpoint on a schedule.

Initialization Steps

In [None]:
import azureml.core
from azureml.core import Workspace

# Report which core SDK version this notebook is running against.
print("SDK version:", azureml.core.VERSION)

# Load the workspace via Workspace.from_config() and echo its identifying
# details, one value per line.
ws = Workspace.from_config()
workspace_details = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print(*workspace_details, sep='\n')

Compute Targets

In [None]:
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
# Name of the compute cluster used by the pipeline step.
aml_compute_target = "cpu-cluster"

try:
    # Reuse the cluster when one with this name can be loaded from the workspace.
    aml_compute = AmlCompute(ws, aml_compute_target)
except ComputeTargetException:
    # The lookup failed, so provision a new cluster under the same name.
    print("Creating new compute target: {}".format(aml_compute_target))

    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_D2_V2",
        min_nodes=1,
        max_nodes=4,
    )
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    # Block until provisioning finishes, giving up after 20 minutes.
    aml_compute.wait_for_completion(
        show_output=True,
        min_node_count=None,
        timeout_in_minutes=20,
    )
else:
    print("Found existing compute target: {}".format(aml_compute_target))

### Build and Publish Pipeline

Step 1 : Define a pipeline step

In [None]:
from azureml.pipeline.steps import PythonScriptStep

# Folder that holds the script executed by the step.
source_directory = "publish_run_train"

# Single pipeline step: run train.py from the source folder on the
# compute target referenced by name.
trainStep = PythonScriptStep(
    script_name="train.py",
    name="Training_Step",
    source_directory=source_directory,
    compute_target=aml_compute_target,
)
print("TrainStep created")

Step 2 : Build the Pipeline

In [None]:
from azureml.pipeline.core import Pipeline

# Assemble the pipeline from its only step, the training step.
pipeline_steps = [trainStep]
pipeline1 = Pipeline(workspace=ws, steps=pipeline_steps)
print("Pipeline is built")

Step 3 : Publish the pipeline

In [None]:
from datetime import datetime

# Build a timestamped name (month-day-year-hour-minute) for the published pipeline.
timenow = datetime.now().strftime('%m-%d-%Y-%H-%M')
pipeline_name = timenow + "-Pipeline"
print(pipeline_name)

# Publish the pipeline; the generated name doubles as its description.
published_pipeline1 = pipeline1.publish(name=pipeline_name, description=pipeline_name)
published_id = published_pipeline1.id
print("Newly published pipeline id: {}".format(published_id))

Step 4: Create a Pipeline Endpoint

In [None]:
from azureml.pipeline.core import PipelineEndpoint

# Publish the pipeline behind a named pipeline endpoint.
pipeline_endpoint = PipelineEndpoint.publish(
    workspace=ws,
    name="ScheduledPipelineEndpoint",
    pipeline=pipeline1,
    description="Publish pipeline endpoint for schedule test",
)
# Leave the endpoint as the cell's final expression so the notebook displays it.
pipeline_endpoint

### Schedule Operations

Schedule operations require the ID of a published pipeline. You can list all published pipelines and perform schedule operations on any of them or, if you already know the ID of the published pipeline, use it directly.

Step 1 : Get published pipeline ID

In [None]:
from azureml.pipeline.core import PublishedPipeline

# You could retrieve all pipelines that are published, or
# just get the published pipeline object that you have the ID for.

# Get all published pipeline objects in the workspace
all_pub_pipelines = PublishedPipeline.list(ws)

# Reset first so that a stale ID left over from an earlier run of this cell
# is never reused when the workspace listing comes back empty.
pub_pipeline_id = None

# We will iterate through the list of published pipelines and
# use the last ID in the list for Schedule operations:
print("Published pipelines found in the workspace:")
for pub_pipeline in all_pub_pipelines:
    print(pub_pipeline.id)
    pub_pipeline_id = pub_pipeline.id

# Fail with a clear message instead of a NameError further down the notebook.
if pub_pipeline_id is None:
    raise RuntimeError(
        "No published pipelines were found in the workspace; "
        "publish a pipeline before running Schedule operations."
    )

print("Published pipeline id to be used for Schedule operations: {}".format(pub_pipeline_id))

Step 2 : Create a schedule for the published pipeline using a recurrence.

This schedule will run on a specified recurrence interval.

In [None]:
from azureml.pipeline.core.schedule import ScheduleRecurrence, Schedule

# Recurrence: runs every other day at 10:30pm.
recurrence = ScheduleRecurrence(
    frequency="Day",
    interval=2,
    hours=[22],
    minutes=[30],
)

# Attach the recurrence to the published pipeline and wait until the
# schedule has been provisioned before continuing.
schedule = Schedule.create(
    workspace=ws,
    name="My_Schedule",
    description="Schedule Run",
    pipeline_id=pub_pipeline_id,
    experiment_name='Schedule-run-sample',
    recurrence=recurrence,
    wait_for_provisioning=True,
)

print("Created schedule with id: {}".format(schedule.id))

Step 3 : Get all schedules for a given pipeline


In [None]:
# List every schedule attached to the selected published pipeline.
schedules = Schedule.list(ws, pipeline_id=pub_pipeline_id)

# Reset first so that a stale ID from an earlier run of this cell is never
# reused when no matching schedule is found.
schedule_id = None

print("Found these schedules for the pipeline id {}:".format(pub_pipeline_id))
# A dedicated loop name keeps the `schedule` object created earlier in the
# notebook from being overwritten by the iteration.
for pipeline_schedule in schedules:
    print(pipeline_schedule.id)
    # Keep the ID of the last listed schedule that has a recurrence set.
    if pipeline_schedule.recurrence is not None:
        schedule_id = pipeline_schedule.id

# Fail with a clear message instead of a NameError in the cells that follow.
if schedule_id is None:
    raise RuntimeError(
        "No recurrence-based schedule was found for pipeline id {}; "
        "create one before running schedule operations.".format(pub_pipeline_id)
    )

print("Schedule id to be used for schedule operations: {}".format(schedule_id))


Step 4 : Get all active schedules in your workspace

In [None]:
# List the schedules in the workspace; active_only=True restricts the
# listing to schedules that are currently active.
schedules = Schedule.list(ws, active_only=True)
print("Your workspace has the following schedules set up:")
# A dedicated loop name avoids overwriting any existing `schedule` object.
for workspace_schedule in schedules:
    # The closing parenthesis was missing from the original message.
    print("{} (Published pipeline: {})".format(workspace_schedule.id, workspace_schedule.pipeline_id))

Step 5 : Get the schedule

In [None]:
# Retrieve the schedule object for the ID selected earlier and confirm which one is in use.
fetched_schedule = Schedule.get(ws, schedule_id)
fetched_id = fetched_schedule.id
print("Using schedule with id: {}".format(fetched_id))

In [None]:
# Disable the schedule. Pass wait_for_provisioning=False instead if you do
# not want to wait for the backend to finish provisioning the change.
fetched_schedule.disable(wait_for_provisioning=True)

# Fetch the schedule again so the status printed below is the current one.
fetched_schedule = Schedule.get(ws, schedule_id)
status_message = "Disabled schedule {}. New status is: {}".format(
    fetched_schedule.id,
    fetched_schedule.status,
)
print(status_message)


### Create a schedule for the pipeline using a Datastore

This schedule will run when additions or modifications are made to Blobs in the Datastore. By default, the Datastore container is monitored for changes. Use the path_on_datastore parameter to instead specify a path on the Datastore to monitor for changes. 

In [None]:
from azureml.core.datastore import Datastore

# Look up the datastore registered in the workspace under this name.
datastore = Datastore(workspace=ws, name="workspaceblobstore")

# Create a schedule that is triggered by changes in the datastore rather than
# by a recurrence. Optional arguments that can be added to the call:
#   polling_interval=5             how often to poll for blob additions/modifications
#                                  (default value is 5 minutes)
#   path_on_datastore="file/path"  a specific folder to monitor for changes
schedule = Schedule.create(
    workspace=ws,
    name="My_Schedule",
    description="Schedule Run",
    pipeline_id=pub_pipeline_id,
    experiment_name='Schedule-run-sample',
    datastore=datastore,
    wait_for_provisioning=True,
)

print("Created schedule with id: {}".format(schedule.id))