In [1]:
import json
import azureml.core
from azureml.core import Workspace, Datastore, Dataset, Environment, Experiment
from azureml.data import FileDataset
from azureml.data.dataset_consumption_config import DatasetConsumptionConfig
from azureml.pipeline.core import Pipeline, PipelineData, PipelineParameter
from azureml.core.compute import ComputeTarget, AmlCompute


# Show which azureml-core release this kernel has loaded.
print(azureml.core.VERSION)

# Label the leading dot-separated components of the version string.
# zip() stops after 'patch', so any further components are ignored; values stay strings.
version = {
    label: part
    for label, part in zip(('major', 'minor', 'patch'), azureml.core.VERSION.split('.'))
}

# Workspace handle used by every later cell.
ws = Workspace.from_config()

1.10.0


In [2]:
# ParallelRunConfig / ParallelRunStep are importable from different packages depending
# on the installed SDK release. The earlier version test used the stable package only
# when the minor version was exactly 10, so every other release (including newer ones)
# fell back to the contrib package, and a major version below 1 imported nothing at all.
# Probing the import directly works regardless of the version string.
try:
    from azureml.pipeline.steps import ParallelRunConfig, ParallelRunStep
except ImportError:
    # Fallback for installs where the classes are only available in the contrib package.
    from azureml.contrib.pipeline.steps import ParallelRunConfig, ParallelRunStep

In [3]:
# Resolve the compute cluster the parallel run step will execute on.
compute_name = "aml-compute1"
vm_size = "STANDARD_DS1_v2"  # not referenced in the cells shown here; kept for other consumers

# Single lookup; .get() yields None when no target with this name exists.
compute_target = ws.compute_targets.get(compute_name)

# Fail fast: previously a missing target only printed a hint and left `compute_target`
# undefined, so the notebook crashed later with a NameError when building the config.
if compute_target is None:
    raise RuntimeError(
        'compute target not found, refer to 02_create_compute_cluster to create compute target...'
    )

# isinstance() also accepts subclasses, unlike the exact `type(...) is` comparison.
# A target of another kind used to pass through silently.
if not isinstance(compute_target, AmlCompute):
    raise TypeError(
        'compute target ' + compute_name + ' is not an AmlCompute cluster (got '
        + type(compute_target).__name__ + ')'
    )

print('Found compute target: ' + compute_name)

Found compute target: aml-compute1


In [4]:
# Resolve the datastore holding the images and build the input FileDataset from it.
datastore_name = 'godzilla'

# Single lookup; .get() yields None when the name is not registered in the workspace.
datastore = ws.datastores.get(datastore_name)

# Fail fast: previously a missing datastore only printed a message and the
# `datastore.path(...)` call below then failed with a NameError.
if datastore is None:
    raise RuntimeError('datastore not found...')

# The former `type(datastore) is Datastore` test never matched in the recorded run
# (the cell printed nothing even though the datastore was usable afterwards) --
# presumably the workspace returns a concrete datastore subclass. The registered
# lookup succeeding is the meaningful check, so confirm it unconditionally.
print('Found datastore: ' + datastore_name)

images_dataset_name = 'images_partition'  # not referenced in the cells shown here
path_on_datastore = datastore.path('images')
# validate=False is passed through unchanged from the original call.
input_images_dataset = Dataset.File.from_files(path=path_on_datastore, validate=False)

In [15]:
# Side input: the registered metadata dataset passed to the step alongside the images.
metadata_dataset_name = 'metadata_ds'

try:
    metadata_ds = Dataset.get_by_name(workspace=ws, name=metadata_dataset_name)
except Exception as err:
    # NOTE(review): get_by_name is expected to raise when the name is not registered --
    # confirm against the SDK docs. The original error is chained so other causes
    # (for example authentication problems) remain visible in the traceback.
    raise RuntimeError(
        'dataset not found, refer to 01a_register_metadata_dataset.ipynb to create and register metadata dataset'
    ) from err

# Fail fast: previously a missing or wrongly typed dataset only printed a hint and the
# as_named_input() call below ran anyway. isinstance() also accepts FileDataset subclasses.
if not isinstance(metadata_ds, FileDataset):
    raise TypeError(
        'dataset ' + metadata_dataset_name + ' is not a FileDataset (got '
        + type(metadata_ds).__name__ + '); '
        'refer to 01a_register_metadata_dataset.ipynb to create and register metadata dataset'
    )

print('Found metadata: ' + ', '.join(metadata_ds.to_path()))

# Named input that is later passed to the step as a side input and as an argument.
metadata_config = metadata_ds.as_named_input('metadata_input')

Found metadata: /20200810_images.csv, /images_20200810.csv


In [16]:
# Inspect the object returned by as_named_input('metadata_input'); the recorded run
# shows it is a DatasetConsumptionConfig.
print(metadata_config)

<azureml.data.dataset_consumption_config.DatasetConsumptionConfig object at 0x00000245ED2901C0>


In [9]:
# Pipeline output named "scores", backed by the workspace default datastore.
# It is handed to the parallel run step as its `output`.
output_dir = PipelineData(
    name="scores",
    datastore=ws.get_default_datastore(),
    output_path_on_compute="batchscoring/results",
)

In [10]:
# Echo the PipelineData object to see how it is rendered
# (the recorded run shows a $AZUREML_DATAREFERENCE_<name> placeholder).
output_dir

$AZUREML_DATAREFERENCE_scores

# pipeline parameters

In [11]:
# Values a caller can override when submitting the pipeline.
# The string parameters are forwarded to the entry script as command-line arguments.

# Forwarded as --input_partition.
pipeline_inpart = PipelineParameter(
    name="pipeline_inpart",
    default_value='2020/08/10',
)

# Forwarded as --metadata.
pipeline_metadata = PipelineParameter(
    name="pipeline_metadata",
    default_value='20200810_images.csv',
)

# Forwarded as --kv_customimage / --kv_readapi.
# NOTE(review): presumably key vault secret names -- confirm against the entry script.
pipeline_kv_customimg = PipelineParameter(
    name="pipeline_kv_customimg",
    default_value='api-custom-vision',
)
pipeline_kv_readapi = PipelineParameter(
    name="pipeline_kv_readapi",
    default_value='api-readapi',
)

# Dataset-valued parameter; defaults to the images dataset and is wrapped in a
# DatasetConsumptionConfig when the step inputs are declared.
pipeline_dataset_param = PipelineParameter(
    name='pipeline_dataset_id',
    default_value=input_images_dataset,
)


In [12]:
# Show the string form of one plain and one dataset-valued parameter, one per line.
print(pipeline_inpart, pipeline_dataset_param, sep='\n')

PipelineParameter_Name:pipeline_inpart_Default:2020/08/10
PipelineParameter_Name:pipeline_dataset_id_Default:FileDataset
{
  "source": [
    "('godzilla', 'images')"
  ],
  "definition": [
    "GetDatastoreFiles"
  ]
}


# python environment configuration 

In [14]:
# Look up the registered environment the batch step runs in.
env_name = 'MAG-ParallelRunEnv'

# Query the workspace once instead of three times; .get() yields None when the
# environment is not registered.
env = ws.environments.get(env_name)
print(env is not None)

# Fail fast: previously a missing environment only printed a hint and left `env`
# undefined, so building the ParallelRunConfig later crashed with a NameError.
# isinstance() also accepts Environment subclasses, unlike `type(...) is`.
if not isinstance(env, Environment):
    raise RuntimeError(
        'environment not found, refer to 01_config_notebook to register envinroment...'
    )

print('Found environment: ' + env_name)


True
Found environment: MAG-ParallelRunEnv


# parallel run config and step

In [17]:
# Execution settings for the batch step: which script to run, where, and how the
# work is split up.
parallelrunconfig = ParallelRunConfig(
    # what to run
    source_directory='scripts',
    entry_script='minibatch_process.py',
    environment=env,
    # where to run it
    compute_target=compute_target,
    node_count=1,
    process_count_per_node=2,
    # batching, failure tolerance and output handling
    mini_batch_size='2',
    error_threshold=1,
    output_action='append_row',
    logging_level='INFO',
    description='description of batch step config',
)

# The images dataset arrives through the dataset-valued pipeline parameter and is
# mounted; the metadata dataset is attached as a side input and also passed by name
# in the argument list.
parallelrunstep = ParallelRunStep(
    name='cv-detection-batch-dataset-step',
    parallel_run_config=parallelrunconfig,
    inputs=[
        DatasetConsumptionConfig('dataset_param_config', pipeline_dataset_param).as_mount(),
    ],
    side_inputs=[metadata_config],
    output=output_dir,
    arguments=[
        '--input_partition', pipeline_inpart,
        '--metadata_config', metadata_config,
        '--metadata', pipeline_metadata,
        '--kv_customimage', pipeline_kv_customimg,
        '--kv_readapi', pipeline_kv_readapi,
    ],
    # Always execute the step instead of reusing results from an earlier run.
    allow_reuse=False,
)


# ('config', 'inputs as mount', 'arguments passing in pipeline args')

# data transfer step 

TODO: move the score data from the temporary output storage to blob storage (not implemented yet; the pipeline below contains only the parallel run step).

# prepare pipeline 

In [18]:
# Assemble the single-step pipeline.
pipeline = Pipeline(workspace=ws, steps=[parallelrunstep])

# Submit it under the experiment name and block until the run ends, streaming the logs.
experiment = Experiment(ws, 'MAG-batch-paramdataset')
pipeline_run = experiment.submit(pipeline)

# Last expression of the cell, so the value it returns is shown as the cell output.
pipeline_run.wait_for_completion(show_output=True)

2', '--input_partition', '2020/08/10', '--metadata_config', 'DatasetConsumptionConfig:metadata_input', '--metadata', '20200810_images.csv', '--kv_customimage', 'api-custom-vision', '--kv_readapi', 'api-readapi', '--input_fds_0', 'dataset_param_config', '--input_pipeline_param_0', 'DatasetConsumptionConfig:dataset_param_config'])
Initialize DatasetContextManager.
Starting the daemon thread to refresh tokens in background for process with pid = 101
Set Dataset dataset_param_config's target path to /mnt/batch/tasks/shared/LS_root/jobs/magaml/azureml/192305d5-0f3b-4004-bcce-9f0e4f32a9b7/mounts/workspaceblobstore/azureml/192305d5-0f3b-4004-bcce-9f0e4f32a9b7/b8e8a8ce-d5b8-47bc-a9b4-d412f942b9dd
Set Dataset metadata_input's target path to /mnt/batch/tasks/shared/LS_root/jobs/magaml/azureml/192305d5-0f3b-4004-bcce-9f0e4f32a9b7/mounts/workspaceblobstore/azureml/192305d5-0f3b-4004-bcce-9f0e4f32a9b7/745e9021-8eb9-4ef4-acd5-be122e346086
Enter __enter__ of DatasetContextManager
SDK version: azureml

'Finished'

In [19]:
# Publish the pipeline behind the completed run so it can be triggered through its endpoint.
published_pipeline = pipeline_run.publish_pipeline(
    name='MAG-batchscore-dataset',
    version='2.0',
    description='published pipeline with dataset param and side input 10/08/2020 14.17pm',
    continue_on_step_failure=True,
)

In [20]:
# Show the URL of the published pipeline.
# NOTE(review): the recorded output embeds the subscription id, resource group and
# workspace name -- clear this cell's output before sharing or committing the notebook.
published_pipeline.endpoint

'https://australiaeast.api.azureml.ms/pipelines/v1.0/subscriptions/907c8efc-c2c8-4c49-a4e1-aeb880e10c88/resourceGroups/aml/providers/Microsoft.MachineLearningServices/workspaces/magaml/PipelineRuns/PipelineSubmit/59faefa7-d271-4fb3-8c50-10bfbb9dfe90'