# 1.0 Load libraries

In [48]:
import os
from dotenv import load_dotenv, find_dotenv

import azureml.core
from azureml.core.authentication import AzureCliAuthentication
from azureml.core import Workspace, Experiment, Datastore, Environment, Dataset
from azureml.core.compute import AmlCompute, ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails

from azureml.core.runconfig import RunConfiguration
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import DEFAULT_CPU_IMAGE

from azureml.pipeline.core import Pipeline, PipelineParameter, PipelineData
from azureml.pipeline.steps import PythonScriptStep

from azureml.data import OutputFileDatasetConfig

from azureml.pipeline.steps import ParallelRunStep, ParallelRunConfig
from azureml.data.datapath import DataPath, DataPathComputeBinding

# 1.1 Setup some environment
## 1.1.1 Load variables

In [13]:
load_dotenv()

ws_name = os.environ['AML_WORKSPACE_NAME']
subscription_id = os.environ['AML_SUBSCRIPTION_ID']
resource_group = os.environ['AML_RESOURCE_GROUP']
tenant_id = os.environ['AML_TENANT_ID']
min_nodes = int(os.environ['AML_MIN_NODES'])
max_nodes = int(os.environ['AML_MAX_NODES'])
aml_compute_target = os.environ['AML_COMPUTE_NAME']

order_file = os.environ['MODEL_INPUT_ORDER_FILE']
distance_file = os.environ['MODEL_INPUT_DISTANCE_FILE']
model_output_path = os.environ['MODEL_OUTPUT_PATH']

print('---- Check Azure setting ----')
print(f'AML Workspace name       : {ws_name}')
print(f'Subscription ID          : {subscription_id}')
print(f'Resource group           : {resource_group}')
print(f'tenant id                : {tenant_id}')
print(f'min nodes of AML compute : {min_nodes}')
print(f'max nodes of AML compute : {max_nodes}')
print(f'AML compute target       : {aml_compute_target}')
print(f'Model input order file   : {order_file}')
print(f'Model input distance file: {distance_file}')
print(f'Input output             : {model_output_path}')

---- Check Azure setting ----
AML Workspace name       : amlrouteoptimization
Subscription ID          : 0b3f04a9-6375-4341-a513-dd53731a99a4
Resource group           : dstoolkit-route-optimization
tenant id                : 72f988bf-86f1-41af-91ab-2d7cd011db47
min nodes of AML compute : 0
max nodes of AML compute : 10
AML compute target       : opcluster
Model input order file   : model_input/order_small.csv
Model input distance file: model_input/distance.csv
Input output             : model_output


# 1.1.2 Azure authentication and Load Azure ML Workspace

In [14]:
!az login
# !az login --use-device-code

[
  {
    "cloudName": "AzureCloud",
    "homeTenantId": "72f988bf-86f1-41af-91ab-2d7cd011db47",
    "id": "f501f470-c695-4681-856a-988d86851132",
    "isDefault": false,
    "managedByTenants": [
      {
        "tenantId": "2f4a9838-26b7-47ee-be60-ccc1fdec5953"
      }
    ],
    "name": "Microsoft Azure Internal Consumption",
    "state": "Enabled",
    "tenantId": "72f988bf-86f1-41af-91ab-2d7cd011db47",
    "user": {
      "name": "zhih@microsoft.com",
      "type": "user"
    }
  },
  {
    "cloudName": "AzureCloud",
    "homeTenantId": "72f988bf-86f1-41af-91ab-2d7cd011db47",
    "id": "7fd08dcc-a653-4b0f-8f8c-4dac889fdda4",
    "isDefault": false,
    "managedByTenants": [
      {
        "tenantId": "2f4a9838-26b7-47ee-be60-ccc1fdec5953"
      }
    ],
    "name": "Code generate Test and Infra",
    "state": "Enabled",
    "tenantId": "72f988bf-86f1-41af-91ab-2d7cd011db47",
    "user": {
      "name": "zhih@microsoft.com",
      "type": "user"
    }
  },
  {
    "cloudName": "Az


Trace ID: 61e0c154-e5d9-4038-8d63-70f7b8391b00

Correlation ID: 2462870a-a678-43b2-a90f-5ba8d13e6735

Timestamp: 2022-09-15 05:33:17Z'

Trace ID: 85da8157-2877-474e-af91-a2cb7a2a2700

Correlation ID: df448b62-7b1e-423b-87e3-ad11c84abb23

Timestamp: 2022-09-15 05:33:20Z'


In [15]:
cli_auth = AzureCliAuthentication()
ws =  Workspace.get(name=ws_name
                    ,subscription_id=subscription_id
                    ,resource_group=resource_group
                    ,auth=cli_auth)

print(ws.get_details())

{'id': '/subscriptions/0b3f04a9-6375-4341-a513-dd53731a99a4/resourceGroups/dstoolkit-route-optimization/providers/Microsoft.MachineLearningServices/workspaces/amlrouteoptimization', 'name': 'amlrouteoptimization', 'identity': {'principal_id': '6a398b87-cd4d-4ff3-b66f-344edbdd9bd5', 'tenant_id': '72f988bf-86f1-41af-91ab-2d7cd011db47', 'type': 'SystemAssigned'}, 'location': 'australiaeast', 'type': 'Microsoft.MachineLearningServices/workspaces', 'tags': {}, 'sku': 'Basic', 'workspaceid': 'bb87308f-2811-4bdc-8c73-339822d22a39', 'sdkTelemetryAppInsightsKey': 'bd22fe07-ca7e-4337-bd98-3d5116261c1f', 'description': '', 'friendlyName': 'amlrouteoptimization', 'containerRegistry': '/subscriptions/0b3f04a9-6375-4341-a513-dd53731a99a4/resourceGroups/dstoolkit-route-optimization/providers/Microsoft.ContainerRegistry/registries/bb87308f28114bdc8c73339822d22a39', 'keyVault': '/subscriptions/0b3f04a9-6375-4341-a513-dd53731a99a4/resourceGroups/dstoolkit-route-optimization/providers/Microsoft.Keyvault/

# 1.1.3 Get Compute Cluster

In [16]:
# Retrieve or create an Aml compute
aml_compute_target = os.environ['AML_COMPUTE_NAME']
try:
    aml_compute = AmlCompute(ws, aml_compute_target)
    print("Found existing compute target.")
except ComputeTargetException:
    print("Creating new compute target")
    
    provisioning_config = AmlCompute.provisioning_configuration(vm_size = "STANDARD_D2_V2",
                                                                min_nodes = min_nodes, 
                                                                max_nodes = max_nodes)    
    aml_compute = ComputeTarget.create(ws, aml_compute_target, provisioning_config)
    aml_compute.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

Found existing compute target.


# 1.1.4 Create Run Configuration

In [17]:
# Default datastore (Azure blob storage)
def_blob_store = ws.get_default_datastore()

# source directory
source_directory = '../src/'
print(f'Source code is in {source_directory} directory.')

Source code is in ../src/ directory.


In [18]:
# environment
env_name = 'op-env'

try:
    env = Environment.get(ws, env_name, 2)
    print("Found existing environment.")

except Exception:
    print("Creating new enviroment")
    env = Environment(env_name)

    # enable Docker 
    env.docker.enabled = True
    # set Docker base image to the default CPU-based image
    env.docker.base_image = DEFAULT_CPU_IMAGE
    # use conda_dependencies.yml to create a conda environment in the Docker image for execution
    env.python.user_managed_dependencies = False
    # specify CondaDependencies obj
    env.python.conda_dependencies = CondaDependencies.create(
            conda_packages=['pandas']
            ,pip_packages=['ortools'
                            ,'azureml-defaults']
        )

    env.register(workspace=ws)

Found existing environment.


In [19]:
# create a new runconfig object
run_config = RunConfiguration()

# set environment
run_config.environment = env

# 1.2 Set up Azure ML Pipeline
## 1.2.1 Reduce the search space of the problem

In [49]:
# model_input = DataReference(
#     datastore=def_blob_store,
#     data_reference_name="model_input",
#     path_on_datastore=order_file)

# # The pair-wise distance between cities
# distance = DataReference(
#     datastore=def_blob_store,
#     data_reference_name="distance",
#     path_on_datastore=distance_file)

# ## Order file
# orderpath = DataPath(datastore=def_blob_store, 
#                 path_on_datastore='model_input/order_small.csv')
# order_input = (PipelineParameter(name="order_data", default_value=orderpath),
#                            DataPathComputeBinding(mode='mount'))

# ## Distance file
# datapath = DataPath(datastore=def_blob_store, 
#                 path_on_datastore='model_input/distance.csv')
# distance_input = (PipelineParameter(name="distance_data", default_value=datapath),
#                            DataPathComputeBinding(mode='mount'))

model_input = Dataset.File.from_files((def_blob_store, order_file))
distance = Dataset.File.from_files((def_blob_store, distance_file))

# Naming the intermediate data 
model_result_partial = PipelineData("model_result_partial",datastore=def_blob_store)
model_input_reduced = PipelineData("model_input_reduced",datastore=def_blob_store)

reduce_step = PythonScriptStep(
    script_name="reduce.py", 
    arguments=["--model_input", model_input.as_named_input('model_input').as_download(path_on_compute='order_file'),
                "--distance", distance.as_named_input('distance').as_download(path_on_compute='distance_file'),
                "--model_result_partial", model_result_partial,
                "--model_input_reduced", model_input_reduced],
    # inputs=[model_input, distance],
    outputs=[model_result_partial, model_input_reduced],
    compute_target=aml_compute, 
    source_directory=source_directory,
    runconfig=run_config
)

# 1.2.2 Partition the problem

In [50]:
# Naming the intermediate data 
model_input_list = PipelineData("model_input_list",datastore=def_blob_store).as_dataset()

parition_step = PythonScriptStep(
    script_name="partition.py", 
    arguments=["--model_input_reduced", model_input_reduced,
                "--distance", distance.as_named_input('distance').as_download(path_on_compute='distance_file'),
                "--model_input_list", model_input_list],
    inputs=[model_input_reduced],
    outputs=[model_input_list],
    compute_target=aml_compute, 
    source_directory=source_directory,
    runconfig=run_config
)

## 1.2.3 Solve individual problem

In [54]:
import uuid

# Naming the intermediate data 
model_result_list = PipelineData("model_result_list", datastore=def_blob_store)

# pass distance file as side input
local_path = "/tmp/{}".format(str(uuid.uuid4()))
distance_config = distance.as_named_input("distance").as_mount(local_path)

# Try new model input list 
# model_input_list_new = Dataset.File.from_files((def_blob_store, "azureml/7a497558-971f-4511-ad93-21af71a252a8/model_input_list/"))
# model_input_list_new = Dataset.Tabular.from_delimited_files((def_blob_store, "model_input_list/*.csv"))

# for file_path in model_input_list.to_path():
#     print(file_path)

# print(model_input_list_new.to_pandas_dataframe())

parallel_run_config = ParallelRunConfig(
    source_directory=source_directory,
    entry_script='solve.py',
    mini_batch_size="1",
    error_threshold=-1,
    output_action="append_row",
    append_row_file_name="model_result_list.txt",
    environment=env,
    compute_target=aml_compute,
    process_count_per_node=1,
    node_count=1)

solve_step = ParallelRunStep(
    name="solve",
    inputs=[model_input_list.as_named_input('model_input_list')],
    output=model_result_list,
    arguments=["--distance", distance_config],
    side_inputs=[distance_config],
    parallel_run_config=parallel_run_config,
    allow_reuse=False
)

SyntaxError: invalid syntax (2593426835.py, line 26)

## 1.2.4 Merge the results

In [None]:
# Naming the intermediate data 
model_result_final = OutputFileDatasetConfig(destination=(def_blob_store, 'model_result_final'))

merge_step = PythonScriptStep(
    script_name="merge.py", 
    arguments=["--model_input", model_input.as_named_input('model_input').as_download(path_on_compute='order_file'), 
    "--distance", distance.as_named_input('distance').as_download(path_on_compute='distance_file'),
    "--model_result_partial", model_result_partial, 
    "--model_result_list", model_result_list, 
    "--model_result_final", model_result_final],
    inputs=[model_result_partial, model_result_list],
    outputs=[model_result_final],
    compute_target=aml_compute, 
    source_directory=source_directory,
    runconfig=run_config
)

# 1.3 Create the Pipeline

In [None]:
# parition_step.run_after(reduce_step)
# solve_step.run_after(parition_step)
# merge_step.run_after(solve_step)

pipeline = Pipeline(workspace=ws, steps=[reduce_step, parition_step, solve_step, merge_step])
# pipeline = Pipeline(workspace=ws, steps=[solve_step])
print("Pipeline is built")

pipeline_run = Experiment(ws, 'optimization_example').submit(pipeline)
print("Pipeline is submitted for execution")

RunDetails(pipeline_run).show()

pipeline_run.wait_for_completion(show_output=True)

Pipeline is built
Created step reduce.py [91bfd70f][923bcd7f-93ad-47a6-8e08-3d976c7f3953], (This step will run and generate new outputs)
Created step partition.py [e4ebdaaf][5a8c6c9d-69e1-4ab8-be70-95c4bf869dec], (This step will run and generate new outputs)
Created step solve [392c5cea][cac3e134-9931-4ae1-94c9-0cdf7bf325a7], (This step will run and generate new outputs)
Created step merge.py [476831dc][5721a20a-0fbb-4785-9126-8b8188f0d46d], (This step will run and generate new outputs)
Submitted PipelineRun a1a5df00-0e9c-45ec-9db3-e029af405df1
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/a1a5df00-0e9c-45ec-9db3-e029af405df1?wsid=/subscriptions/0b3f04a9-6375-4341-a513-dd53731a99a4/resourcegroups/dstoolkit-route-optimization/workspaces/amlrouteoptimization&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
Pipeline is submitted for execution


_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', …

PipelineRunId: a1a5df00-0e9c-45ec-9db3-e029af405df1
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/a1a5df00-0e9c-45ec-9db3-e029af405df1?wsid=/subscriptions/0b3f04a9-6375-4341-a513-dd53731a99a4/resourcegroups/dstoolkit-route-optimization/workspaces/amlrouteoptimization&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
PipelineRun Status: Running


StepRunId: 1dc561bf-d987-4674-9f6b-bcc5d5f1b9d7
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/1dc561bf-d987-4674-9f6b-bcc5d5f1b9d7?wsid=/subscriptions/0b3f04a9-6375-4341-a513-dd53731a99a4/resourcegroups/dstoolkit-route-optimization/workspaces/amlrouteoptimization&tid=72f988bf-86f1-41af-91ab-2d7cd011db47
StepRun( reduce.py ) Status: Running

StepRun(reduce.py) Execution Summary
StepRun( reduce.py ) Status: Finished
{'runId': '1dc561bf-d987-4674-9f6b-bcc5d5f1b9d7', 'target': 'opcluster', 'status': 'Completed', 'startTimeUtc': '2022-09-15T07:06:39.690728Z', 'endTimeUtc': '2022-09-15T07:07:41.506505Z', 'services': {}, '

'Finished'