# Azure ML v2 — End-to-end pipeline submission (uses existing compute)

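This notebook submits an Azure ML (AML) pipeline with three steps to an existing compute target: 00 filters excluded labels and resplits the data 70/30, 01 trains the relevance model and selects the best one, and 02 trains the label models and compares them.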

In [None]:
# =========================
# CONFIG — EDIT THESE
# =========================

# Workspace coordinates and the name of the compute target the steps run on.
SUBSCRIPTION_ID = "00000000-0000-0000-0000-000000000000"
RESOURCE_GROUP = "rg-name"
WORKSPACE_NAME = "aml-workspace-name"
COMPUTE_NAME = "existing-compute"

# Datastore URIs of the three CSV files handed to the pipeline as inputs.
_DATA_ROOT = "azureml://datastores/workspaceblobstore/paths/data"
TRAIN_INPUT = f"{_DATA_ROOT}/train.csv"
TEST_INPUT = f"{_DATA_ROOT}/test.csv"
LABELS_INPUT = f"{_DATA_ROOT}/labels.csv"

# Column names forwarded to the step scripts as command-line arguments.
TEXT_COL = "text"
DEMAND_COL = "demand_id"
GROUP_COL = "group_id"
REL_COL = "relevant"  # or "relevance"
EXCLUDE_COL = "exclude"

# Values forwarded to step 02 as --top_k / --top_n_groups.
TOP_K = 5
TOP_N_GROUPS = 3


In [None]:
import shlex
from pathlib import Path

from azure.ai.ml import MLClient, Input, Output, command
from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import Environment
from azure.identity import DefaultAzureCredential

# Bind a client to the configured workspace using the default Azure credential.
ml_client = MLClient(
    credential=DefaultAzureCredential(),
    subscription_id=SUBSCRIPTION_ID,
    resource_group_name=RESOURCE_GROUP,
    workspace_name=WORKSPACE_NAME,
)

# Look the compute target up by name; the returned object is discarded.
# NOTE(review): presumably this raises when the compute does not exist, which
# would make it an early sanity check — confirm.
_ = ml_client.compute.get(COMPUTE_NAME)
print("Connected. Using compute:", COMPUTE_NAME)


In [None]:
# Create/reuse environment from conda.yaml
# The environment is described locally first (base image + conda spec) and then
# registered in the workspace; `env` ends up holding the object the service returns.
# NOTE(review): the base image uses the floating `latest` tag, so the image
# behind it can change over time — consider pinning a specific tag.
env_definition = Environment(
    image="mcr.microsoft.com/azureml/minimal-ubuntu22.04-py311-cuda11.8-gpu-inference:latest",
    conda_file="conda.yaml",
    name="nlp-pipeline-py311",
    version="1",
)
env = ml_client.environments.create_or_update(env_definition)
print("Environment:", env.name, env.version)


In [None]:
# Define steps (command components)

# Step 00: runs src/step00_filter_resplit.py on the three input CSVs and writes
# two files (out_train / out_test).
# The configured column names are passed through shlex.quote so that a name
# containing spaces or shell metacharacters still reaches the script as one
# argument; plain names such as "text" are left unchanged by shlex.quote.
step00 = command(
    name="step00_filter_resplit",
    display_name="00 Filter excluded labels and resplit 70/30",
    code="./src",
    command=(
        "python step00_filter_resplit.py "
        # ${{...}} placeholders are kept in plain (non-f) strings so the braces survive verbatim.
        "--train_csv ${{inputs.train_csv}} --test_csv ${{inputs.test_csv}} --labels_csv ${{inputs.labels_csv}} "
        f"--text_col {shlex.quote(TEXT_COL)} "
        f"--demand_col {shlex.quote(DEMAND_COL)} "
        f"--exclude_col {shlex.quote(EXCLUDE_COL)} "
        "--test_size 0.30 --seed 42 "
        "--out_train ${{outputs.out_train}} --out_test ${{outputs.out_test}}"
    ),
    inputs={
        "train_csv": Input(type="uri_file"),
        "test_csv": Input(type="uri_file"),
        "labels_csv": Input(type="uri_file"),
    },
    outputs={
        "out_train": Output(type="uri_file"),
        "out_test": Output(type="uri_file"),
    },
    environment=env,
    compute=COMPUTE_NAME,
)

# Step 01: runs src/step01_relevance_train.py on a training CSV and writes a
# model file and a metrics file.
# The configured column names are passed through shlex.quote so that a name
# containing spaces or shell metacharacters still reaches the script as one
# argument; plain names such as "text" are left unchanged by shlex.quote.
step01 = command(
    name="step01_relevance_train",
    display_name="01 Train relevance model and select best",
    code="./src",
    command=(
        "python step01_relevance_train.py "
        # ${{...}} placeholders are kept in plain (non-f) strings so the braces survive verbatim.
        "--train_csv ${{inputs.train_csv}} "
        f"--text_col {shlex.quote(TEXT_COL)} --rel_col {shlex.quote(REL_COL)} "
        "--seed 42 --test_size 0.30 --precision_target 0.90 "
        "--out_model ${{outputs.out_model}} --out_metrics ${{outputs.out_metrics}}"
    ),
    inputs={"train_csv": Input(type="uri_file")},
    outputs={
        "out_model": Output(type="uri_file"),
        "out_metrics": Output(type="uri_file"),
    },
    environment=env,
    compute=COMPUTE_NAME,
)

# Step 02: runs src/step02_label_train_compare.py on a training CSV and the
# labels CSV; writes four files and one folder (out_ce_dir).
# The configured column names are passed through shlex.quote so that a name
# containing spaces or shell metacharacters still reaches the script as one
# argument; plain names such as "text" are left unchanged by shlex.quote.
step02 = command(
    name="step02_label_train_compare",
    display_name="02 Train label models and compare",
    code="./src",
    command=(
        "python step02_label_train_compare.py "
        # ${{...}} placeholders are kept in plain (non-f) strings so the braces survive verbatim.
        "--train_csv ${{inputs.train_csv}} --labels_csv ${{inputs.labels_csv}} "
        f"--text_col {shlex.quote(TEXT_COL)} --demand_col {shlex.quote(DEMAND_COL)} "
        f"--group_col {shlex.quote(GROUP_COL)} --rel_col {shlex.quote(REL_COL)} "
        "--seed 42 --val_size 0.30 "
        # TOP_K / TOP_N_GROUPS are formatted as-is, exactly as before.
        f"--top_k {TOP_K} --top_n_groups {TOP_N_GROUPS} "
        "--out_group_model ${{outputs.out_group_model}} "
        "--out_ce_dir ${{outputs.out_ce_dir}} "
        "--out_ht_path ${{outputs.out_ht_path}} "
        "--out_meta ${{outputs.out_meta}} "
        "--out_metrics ${{outputs.out_metrics}}"
    ),
    inputs={
        "train_csv": Input(type="uri_file"),
        "labels_csv": Input(type="uri_file"),
    },
    outputs={
        "out_group_model": Output(type="uri_file"),
        "out_ce_dir": Output(type="uri_folder"),
        "out_ht_path": Output(type="uri_file"),
        "out_meta": Output(type="uri_file"),
        "out_metrics": Output(type="uri_file"),
    },
    environment=env,
    compute=COMPUTE_NAME,
)


In [None]:
# Pipeline wiring: step 00 runs on the three raw CSVs; steps 01 and 02 each
# consume step 00's `out_train` output and do not depend on one another.
# NOTE(review): documentation is kept in `#` comments on purpose — presumably the
# Azure ML DSL decorator inspects this function's docstring, annotations and
# local variable names (s00/s01/s02) when building the job; confirm before
# adding a docstring/annotations or renaming the locals.
@pipeline(name="nlp_end_to_end_pipeline")
def nlp_pipeline(train_csv, test_csv, labels_csv):
    # 00: takes all three CSV inputs and produces out_train / out_test.
    s00 = step00(train_csv=train_csv, test_csv=test_csv, labels_csv=labels_csv)
    # 01: receives only step 00's out_train.
    s01 = step01(train_csv=s00.outputs.out_train)
    # 02: receives step 00's out_train plus the original labels CSV (not a step 00 output).
    s02 = step02(train_csv=s00.outputs.out_train, labels_csv=labels_csv)
    # Every step output is exposed as a pipeline-level output under a new name.
    # Step 00's out_test is not consumed by any step here; it is only surfaced as "test_clean".
    return {
        "train_clean": s00.outputs.out_train,
        "test_clean": s00.outputs.out_test,
        "relevance_model": s01.outputs.out_model,
        "relevance_metrics": s01.outputs.out_metrics,
        "group_model": s02.outputs.out_group_model,
        "cross_encoder_dir": s02.outputs.out_ce_dir,
        "hier_transformer": s02.outputs.out_ht_path,
        "hier_meta": s02.outputs.out_meta,
        "label_metrics": s02.outputs.out_metrics,
    }

# Bind each configured datastore URI to the pipeline argument of the same role.
pipeline_inputs = {
    arg_name: Input(type="uri_file", path=uri)
    for arg_name, uri in (
        ("train_csv", TRAIN_INPUT),
        ("test_csv", TEST_INPUT),
        ("labels_csv", LABELS_INPUT),
    )
}
job = nlp_pipeline(**pipeline_inputs)

# Job-level settings: default compute target, and force_rerun switched on.
job.settings.default_compute = COMPUTE_NAME
job.settings.force_rerun = True

# Submit the job and report its name and Studio link.
returned = ml_client.jobs.create_or_update(job)
print("Submitted job:", returned.name)
print("Studio URL:", returned.studio_url)
