In [11]:
import os

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Workspace handle used by the later cells to talk to Azure ML.
# The subscription ID is read from the AZURE_SUBSCRIPTION_ID environment
# variable so it does not have to be pasted into (or blanked out of) the
# notebook. When the variable is unset the value falls back to "", which is
# what this cell passed before.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=os.environ.get("AZURE_SUBSCRIPTION_ID", ""),
    resource_group_name="databricks-sandbox-rg",
    workspace_name="amlsandbox-eco3",
)

print(ml_client)

MLClient(credential=<azure.identity._credentials.default.DefaultAzureCredential object at 0x7f395be3bf40>,
         subscription_id=2a834239-8f89-42e1-8cf1-c3c10090f51c,
         resource_group_name=databricks-sandbox-rg,
         workspace_name=amlsandbox-eco3)


In [12]:
from azure.ai.ml.entities import AmlCompute

# Name of the compute target to look up and, if the lookup fails, to create.
# The pipeline definition further down passes the same name as its compute.
cpu_compute_target = "ciaranh1"

try:
    # Reuse the compute target when the workspace already has one by this name.
    cpu_cluster = ml_client.compute.get(cpu_compute_target)
    print(
        f"You already have a cluster named {cpu_compute_target}, we'll reuse it as is."
    )

except Exception:
    # NOTE(review): every failure of the lookup lands here, not only
    # "compute not found" (e.g. an authentication error would too).
    print("Creating a new cpu compute target...")

    # Describe the Azure ML compute cluster to create.
    cpu_cluster = AmlCompute(
        # Use the name that was just looked up, so the next run finds and
        # reuses this cluster instead of trying to create it again.
        name=cpu_compute_target,
        # Azure ML Compute is the on-demand VM service
        type="amlcompute",
        # VM size
        size="STANDARD_DS3_V2",
        # Minimum number of nodes kept running when no job is running
        min_instances=0,
        # Maximum number of nodes in the cluster
        max_instances=4,
        # Seconds a node stays idle before it is scaled down
        idle_time_before_scale_down=180,
        # Dedicated or LowPriority. The latter is cheaper but there is a chance of job termination
        tier="Dedicated",
    )

    # Submit the creation request through MLClient.
    submitted = ml_client.begin_create_or_update(cpu_cluster)
    # In azure-ai-ml 1.x this call returns an LROPoller, which has no
    # .name/.size; wait for it so cpu_cluster is the created compute.
    # If the call already returned the compute itself, keep it unchanged.
    cpu_cluster = submitted.result() if hasattr(submitted, "result") else submitted

print(
    f"AMLCompute with name {cpu_cluster.name} is created, the compute size is {cpu_cluster.size}"
)

You already have a cluster named ciaranh1, we'll reuse it as is.
AMLCompute with name ciaranh1 is created, the compute size is STANDARD_DS12_V2


In [13]:
from azure.ai.ml.entities import Environment

# Name under which the custom environment is registered in the workspace.
custom_env_name = "aml-scikit-learn"

# Local description of the environment: a base Docker image plus the conda
# specification stored next to the notebook.
pipeline_job_env = Environment(
    name=custom_env_name,
    version="0.2.0",
    image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
    conda_file=os.path.join("../dependencies/", "conda.yaml"),
    description="Custom environment for Credit Card Defaults pipeline",
    tags={"scikit-learn": "0.24.2"},
)

# Register it; from here on the name refers to the object returned by the workspace.
pipeline_job_env = ml_client.environments.create_or_update(pipeline_job_env)

print(
    f"Environment with name {pipeline_job_env.name} is registered to workspace, the environment version is {pipeline_job_env.version}"
)

Environment with name aml-scikit-learn is registered to workspace, the environment version is 0.2.0


In [14]:
# Relative path to the folder holding the component's source files; it is
# passed as `code=` when the command component is built in a later cell.
data_prep_src_dir: str = "../components/databricks"
print(data_prep_src_dir)

../components/databricks


In [15]:
from azure.ai.ml import command
from azure.ai.ml import Input, Output

# Command component that runs listclusters.py from the component source folder.
list_dbx_clusters = command(
    # Folder that holds the component's source files
    code=data_prep_src_dir,
    # Registered environment, referenced as "<name>:<version>"
    environment=f"{pipeline_job_env.name}:{pipeline_job_env.version}",
    name="Databricks E2E ML Workflow",
    display_name="Databricks E2E ML Workflow",
    description="Invoke a Databricks Pipeline",
    inputs=dict(test_train_ratio=Input(type="number")),
    outputs={
        "train_data": Output(type="uri_folder", mode="rw_mount"),
        "test_data": Output(type="uri_folder", mode="rw_mount"),
    },
    # The trailing `\` on each line joins the lines into a single command line;
    # the indentation of the continuation lines stays in the string as spaces.
    command="""python listclusters.py --test_train_ratio ${{inputs.test_train_ratio}} \
            --train_data ${{outputs.train_data}} --test_data ${{outputs.test_data}} \
            """,
)

In [16]:
# Print the command component built above to inspect its definition.
print(list_dbx_clusters)

type: command
outputs:
  train_data:
    mode: rw_mount
    type: uri_folder
  test_data:
    mode: rw_mount
    type: uri_folder
environment: azureml:aml-scikit-learn:0.2.0
code: /mnt/batch/tasks/shared/LS_root/mounts/clusters/ciaranh1/code/Users/ciaranh/components/databricks
component:
  name: Databricks E2E ML Workflow
  display_name: Databricks E2E ML Workflow
  description: Invoke a Databricks Pipeline
  type: command
  inputs:
    test_train_ratio:
      type: number
  outputs:
    train_data:
      type: uri_folder
      mode: rw_mount
    test_data:
      type: uri_folder
      mode: rw_mount
  command: 'python listclusters.py --test_train_ratio ${{inputs.test_train_ratio}}             --train_data
    ${{outputs.train_data}} --test_data ${{outputs.test_data}}             '
  environment: azureml:aml-scikit-learn:0.2.0
  code: /mnt/batch/tasks/shared/LS_root/mounts/clusters/ciaranh1/code/Users/ciaranh/components/databricks
  is_deterministic: true



In [17]:
# the dsl decorator tells the sdk that we are defining an Azure ML pipeline
from azure.ai.ml import dsl, Input, Output

# Module-level value that the pipeline function below does not read: the
# function's parameter of the same name shadows it inside the body.
pipeline_job_test_train_ratio = 0.2
@dsl.pipeline(
    compute=cpu_compute_target,
    description="E2E data_perp-train pipeline",
)
def dbx_pipeline(
    pipeline_job_test_train_ratio,
):
    # Single-step pipeline: call the `list_dbx_clusters` command component and
    # forward this pipeline's input as the component's `test_train_ratio`.
    # Nothing is returned from the function.
    # NOTE(review): the submitted job lists this step as `data_prep_job`, so the
    # variable name appears to be used as the step name; keep it stable.
    data_prep_job = list_dbx_clusters(
        test_train_ratio=pipeline_job_test_train_ratio,
    )



In [20]:
# Build the pipeline job, supplying 0.3 as its test/train ratio input.
pipeline = dbx_pipeline(pipeline_job_test_train_ratio=0.3)

In [21]:
# Submit the pipeline to the workspace under the "dbx_list_clusters" experiment
# and keep the returned job for inspection.
pipeline_job = ml_client.jobs.create_or_update(
    pipeline, experiment_name="dbx_list_clusters"
)

In [22]:
# Print the submitted job (name, inputs, steps and service endpoints).
print(pipeline_job)

name: witty_grass_hhzx8v120l
display_name: dbx_pipeline
description: E2E data_perp-train pipeline
type: pipeline
inputs:
  pipeline_job_test_train_ratio: '0.3'
jobs:
  data_prep_job:
    type: command
    inputs:
      test_train_ratio:
        path: ${{parent.inputs.pipeline_job_test_train_ratio}}
    outputs:
      train_data:
        mode: rw_mount
        type: uri_folder
      test_data:
        mode: rw_mount
        type: uri_folder
    component: azureml:azureml_anonymous:32bdee22-c855-46f7-9f0f-38725a6253c2
services:
  Tracking:
    job_service_type: Tracking
    endpoint: azureml://eastus.api.azureml.ms/mlflow/v1.0/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourceGroups/databricks-sandbox-rg/providers/Microsoft.MachineLearningServices/workspaces/amlsandbox-eco3?
  Studio:
    job_service_type: Studio
    endpoint: https://ml.azure.com/runs/witty_grass_hhzx8v120l?wsid=/subscriptions/2a834239-8f89-42e1-8cf1-c3c10090f51c/resourcegroups/databricks-sandbox-rg/workspaces/