In [1]:
import json
import azureml.core
from azureml.core import Workspace, Datastore, Dataset, Environment, Experiment
from azureml.data import FileDataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.core.compute import ComputeTarget, AmlCompute


# Show the installed SDK version, then keep its dot-separated components
# (as strings) under the keys 'major', 'minor' and 'patch'.
print(azureml.core.VERSION)
version = {
    label: component
    for label, component in zip(('major', 'minor', 'patch'), azureml.core.VERSION.split('.'))
}
# Load the workspace handle from the saved workspace configuration.
ws = Workspace.from_config()

1.10.0


In [2]:
# Import ParallelRunConfig / ParallelRunStep from the main pipeline-steps package
# when the installed SDK provides them there, and fall back to the contrib package
# otherwise. Probing the import directly (instead of testing `minor == 10`) keeps
# this cell working on SDK releases newer than 1.10, which the exact-equality
# check used to route to the contrib package.
try:
    from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
except ImportError:
    from azureml.contrib.pipeline.steps import ParallelRunConfig, ParallelRunStep

In [3]:
# Resolve the compute cluster the batch step will run on.
compute_name = "aml-compute1"
vm_size = "STANDARD_DS1_v2"  # not referenced by the cells shown in this notebook

# Single lookup against the workspace; None means no target with this name is registered.
compute_target = ws.compute_targets.get(compute_name)
if compute_target is None:
    # Fail here rather than letting a later cell hit an undefined (or stale,
    # left over from a previous kernel session) `compute_target`.
    raise LookupError('compute target not found, refer to 02_create_compute_cluster to create compute target...')

if isinstance(compute_target, AmlCompute):
    print('Found compute target: ' + compute_name)
else:
    # The target exists but is not an AmlCompute cluster; surface that instead of staying silent.
    print('compute target ' + compute_name + ' is of type ' + type(compute_target).__name__ + ', expected AmlCompute')

Found compute target: aml-compute1


In [4]:
# Resolve the registered datastore that holds the input images and build a
# FileDataset over its 'images' folder.
datastore_name = 'godzilla'

# NOTE(review): registered datastores appear to be returned as concrete storage
# classes rather than as `Datastore` itself, so an exact `type(...) is Datastore`
# comparison would not match them; a presence check is used instead — confirm
# against the SDK reference.
datastore = ws.datastores.get(datastore_name)
if datastore is None:
    # Stop here: the dataset below cannot be built without the datastore, and
    # continuing would fail on an undefined (or stale) `datastore` name.
    raise LookupError('datastore not found: ' + datastore_name)
print('Found datastore: ' + datastore_name)

images_dataset_name = 'images_partition'  # not referenced by the cells shown in this notebook
path_on_datastore = datastore.path('images')
# validate=False: build the dataset definition without validating the path up front.
input_images_dataset = Dataset.File.from_files(path=path_on_datastore, validate=False)

In [5]:
# Intermediate pipeline output named "scores", stored on the workspace's
# default datastore and written under batchscoring/results on the compute node.
output_dir = PipelineData(
    name="scores",
    output_path_on_compute="batchscoring/results",
    datastore=ws.get_default_datastore(),
)

In [6]:
# Display the PipelineData object; in the recorded run it renders as its
# data-reference placeholder ($AZUREML_DATAREFERENCE_scores).
output_dir

$AZUREML_DATAREFERENCE_scores

# pipeline parameters

In [7]:
# Parameters that can be overridden each time the pipeline is submitted.
# NOTE(review): the two *_kv_* defaults look like Key Vault secret names that the
# entry script resolves at run time — confirm against scripts/minibatch_process.py.
pipeline_inpart = PipelineParameter(
    name="pipeline_inpart",
    default_value="2020/07/28",
)
pipeline_kv_customimg = PipelineParameter(
    name="pipeline_kv_customimg",
    default_value="api-custom-vision",
)
pipeline_kv_readapi = PipelineParameter(
    name="pipeline_kv_readapi",
    default_value="api-readapi",
)
# The input dataset itself is a parameter, defaulting to the images FileDataset.
pipeline_dataset_param = PipelineParameter(
    name="pipeline_dataset_id",
    default_value=input_images_dataset,
)


In [8]:
# Print both parameter objects, one per line, to check their names and defaults.
print(pipeline_inpart, pipeline_dataset_param, sep='\n')

PipelineParameter_Name:pipeline_inpart_Default:2020/07/28
PipelineParameter_Name:pipeline_dataset_id_Default:FileDataset
{
  "source": [
    "('godzilla', 'images')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}


# python environment configuration

In [9]:
# Resolve the registered environment (conda dependencies) used by the batch step.
env_name = 'MAG-ParallelRunEnv'

# One lookup instead of three separate `ws.environments` fetches;
# None means no environment with this name is registered.
env = ws.environments.get(env_name)
print(env is not None)
if env is None:
    # Fail here rather than letting the ParallelRunConfig cell hit an undefined
    # (or stale, left over from a previous kernel session) `env`.
    raise LookupError('environment not found, refer to 01_config_notebook to register envinroment...')
print('Found environment: ' + env_name)


True
Found environment: MAG-ParallelRunEnv


# parallel run config and step

In [12]:
# Configuration shared by every worker of the parallel batch step.
parallelrunconfig = ParallelRunConfig(
    # What to run.
    source_directory='scripts',
    entry_script='minibatch_process.py',
    environment=env,
    description='description of batch step config',
    # Where to run it and how many worker processes to start.
    compute_target=compute_target,
    node_count=1,
    process_count_per_node=2,
    # Batching, failure tolerance, output handling and log verbosity.
    mini_batch_size='2',
    error_threshold=1,
    output_action='append_row',
    logging_level='INFO',
)

# The step mounts the parameterised dataset as its input and forwards the
# remaining pipeline parameters to the entry script as command-line arguments.
parallelrunstep = ParallelRunStep(
    name='cv-detection-batch-dataset-step',
    parallel_run_config=parallelrunconfig,
    inputs=[
        DatasetConsumptionConfig('dataset_param_config', pipeline_dataset_param).as_mount(),
    ],
    side_inputs=[],
    output=output_dir,
    arguments=[
        '--input_partition', pipeline_inpart,
        '--kv_customimage', pipeline_kv_customimg,
        '--kv_readapi', pipeline_kv_readapi,
    ],
    allow_reuse=False,
)


# ('config', 'inputs as mount', 'arguments passing in pipeline args')

# prepare pipeline 

In [14]:
# Assemble the single-step pipeline, submit it under the
# 'MAG-batch-paramdataset' experiment and block until the run ends.
pipeline = Pipeline(steps=[parallelrunstep], workspace=ws)
pipeline_run = Experiment(workspace=ws, name='MAG-batch-paramdataset').submit(pipeline)
# Streams the run log into the cell output; the final status is the cell's result.
pipeline_run.wait_for_completion(show_output=True)

tHandler __enter__
[2020-07-30T12:36:22.755037] TimeoutHandler __exit__
[2020-07-30T12:36:22.755092] _download_tree finished writing file
[2020-07-30T12:36:22.755138] Starting _download_tree for file.
[2020-07-30T12:36:22.755248] _download_tree start request for file
[2020-07-30T12:36:22.780278] _download_tree finished request for file
[2020-07-30T12:36:22.780762] _download_tree start writing file
[2020-07-30T12:36:22.781080] TimeoutHandler __init__
[2020-07-30T12:36:22.781387] TimeoutHandler __enter__
[2020-07-30T12:36:22.803485] TimeoutHandler __exit__
[2020-07-30T12:36:22.804032] _download_tree finished writing file
[2020-07-30T12:36:22.804396] Starting _download_tree for file.
[2020-07-30T12:36:22.804807] _download_tree start request for file
[2020-07-30T12:36:22.832956] _download_tree finished request for file
[2020-07-30T12:36:22.833442] _download_tree start writing file
[2020-07-30T12:36:22.833754] TimeoutHandler __init__
[2020-07-30T12:36:22.834069] TimeoutHandler __enter__
[20

'Finished'

In [15]:
# Publish the pipeline behind the completed run so that it can be
# triggered later through its endpoint.
published_pipeline = pipeline_run.publish_pipeline(
    name='MAG-batchscore-dataset',
    version='2.0',
    description='published pipeline with dataset param 10.30pm',
    continue_on_step_failure=True,
)

In [16]:
# Display the endpoint URL of the published pipeline.
published_pipeline.endpoint

'https://australiaeast.api.azureml.ms/pipelines/v1.0/subscriptions/907c8efc-c2c8-4c49-a4e1-aeb880e10c88/resourceGroups/aml/providers/Microsoft.MachineLearningServices/workspaces/magaml/PipelineRuns/PipelineSubmit/0cae7283-36d6-4b77-84d3-7a38f9c2a0a5'