# Manual HPO Job Runner

This notebook submits a separate Azure ML command job for each hyperparameter trial, so you can inspect every run individually (as opposed to a single `SweepJob`). The structure follows the [`SweepJob` guidance](https://learn.microsoft.com/en-us/azure/machine-learning/how-to-tune-hyperparameters?view=azureml-api-2) but executes the trials one by one for maximum transparency.



In [None]:
# Ensure we run from the project root so component paths resolve correctly
import os
from pathlib import Path


def find_project_root(start: Path) -> Path:
    """Return the project root given the directory the notebook runs from.

    ``start`` is treated as the root when it already contains both ``src/``
    and ``configs/`` (the directories later cells read relative to the working
    directory). Otherwise its parent is returned, which matches a notebook
    stored one level below the root.
    """
    start = start.resolve()
    if (start / "src").is_dir() and (start / "configs").is_dir():
        return start
    return start.parent


# Detect the root instead of unconditionally taking "..": re-running this cell
# must not climb one directory further on every execution.
PROJECT_ROOT = find_project_root(Path.cwd())
os.chdir(PROJECT_ROOT)
print(f"Changed working directory to: {os.getcwd()}")


In [None]:
from __future__ import annotations

import itertools
import json
import os
import time
from pathlib import Path
from typing import Dict, Iterable, List, Optional

import yaml
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.ai.ml import Input, MLClient, load_component

# Allow importing project utilities
import sys

# The first cell already changed into the project root, so the working
# directory is what has to be importable. A relative ".." would now point at
# the root's parent instead.
PROJECT_ROOT = Path.cwd()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

from src.utils import get_data_asset_config, load_azure_config  # noqa: E402
from hpo_utils import build_parameter_space, load_hpo_config  # noqa: E402

In [None]:
# The working directory is the project root after the first cell's chdir, so
# "../config.env" would resolve one level above it.
# NOTE(review): assumes config.env sits in the project root - confirm.
env_file = Path.cwd() / "config.env"
if not load_dotenv(env_file):
    # load_dotenv does not raise on a missing/empty file, so surface it here.
    print(f"Warning: no variables were loaded from {env_file}")

azure_cfg = load_azure_config()
data_asset_cfg = get_data_asset_config()

# Authenticate and bind a client to the configured workspace.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential,
    subscription_id=azure_cfg["subscription_id"],
    resource_group_name=azure_cfg["resource_group"],
    workspace_name=azure_cfg["workspace_name"],
)

# Studio link for the workspace; not referenced by the other visible cells.
workspace_url = f"https://ml.azure.com/?wsid=/subscriptions/{azure_cfg['subscription_id']}/resourcegroups/{azure_cfg['resource_group']}/workspaces/{azure_cfg['workspace_name']}"
print(f"Connected to workspace '{azure_cfg['workspace_name']}'.")



In [None]:
# Job defaults: code folder, environment and compute target. The latter two
# can be overridden through environment variables.
SRC_DIR = str(Path.cwd().joinpath("src").resolve())
DEFAULT_ENV = os.environ.get("AML_DEFAULT_ENV", "azureml:bank-churn-env:1")
DEFAULT_COMPUTE = os.environ.get("AML_COMPUTE_CLUSTER", "cpu-cluster")

# HPO configuration and the parameter space derived from its search space.
hpo_cfg = load_hpo_config()
hpo_search_space = hpo_cfg.get("search_space", {})
parameter_space = build_parameter_space(hpo_search_space)

# Training configuration; an empty YAML document becomes an empty dict.
train_config_path = Path("configs/train.yaml")
with open(train_config_path) as config_stream:
    train_cfg = yaml.safe_load(config_stream) or {}

# Registered data asset (name:version) mounted as the raw input folder.
raw_data_asset = f"azureml:{data_asset_cfg['data_asset_name']}:{data_asset_cfg['data_asset_version']}"
raw_input = Input(type="uri_folder", path=raw_data_asset, mode="mount")



In [None]:
from azure.ai.ml import command, Output


def _extract_uri_from_mapping(data: Optional[Dict]) -> Optional[str]:
    if not data:
        return None
    for key in ("uri", "path", "value"):
        value = data.get(key)
        if value:
            return value
    metadata = data.get("metadata") or {}
    for key in ("uri", "path", "value"):
        value = metadata.get(key)
        if value:
            return value
    return None


def stream_job(job_name: str) -> None:
    """Announce the job being followed, then stream its logs via ``ml_client``."""
    announcement = f"Streaming logs for job: {job_name}"
    print(announcement)
    job_operations = ml_client.jobs
    job_operations.stream(job_name)


def _resolve_output_uri(job_output) -> str:
    """Return the URI behind a job output, trying several representations.

    Lookup order:
      1. the ``uri``, ``path`` and ``value`` attributes of the object itself;
      2. the mapping returned by its ``_to_dict()`` method, when it has one;
      3. the object itself, when it is a plain ``dict``.

    Raises:
        AttributeError: if none of the above yields a truthy value.
    """
    attribute_hit = next(
        filter(
            None,
            (getattr(job_output, field, None) for field in ("uri", "path", "value")),
        ),
        None,
    )
    if attribute_hit is not None:
        return attribute_hit

    if hasattr(job_output, "_to_dict"):
        from_serialized = _extract_uri_from_mapping(job_output._to_dict() or {})
        if from_serialized:
            return from_serialized

    if isinstance(job_output, dict):
        from_mapping = _extract_uri_from_mapping(job_output)
        if from_mapping:
            return from_mapping

    raise AttributeError("Unable to resolve output URI from job output metadata.")


def _get_output_uri(job, output_name: str) -> str:
    """Return the URI of the named output of ``job``.

    The output object is inspected first. If that raises ``AttributeError``,
    the job's ``_to_dict()`` form (when available) is searched under
    ``outputs[output_name]``. The original ``AttributeError`` is re-raised if
    that fallback finds nothing either.
    """
    declared_output = job.outputs[output_name]
    try:
        resolved = _resolve_output_uri(declared_output)
    except AttributeError:
        serialized_job = job._to_dict() if hasattr(job, "_to_dict") else {}
        serialized_outputs = serialized_job.get("outputs") or {}
        resolved = _extract_uri_from_mapping(serialized_outputs.get(output_name))
        if not resolved:
            raise
    return resolved


def _wait_for_job_completion(job_name: str, poll_interval: int = 15):
    """Block until ``job_name`` reaches a terminal status.

    The job is re-fetched every ``poll_interval`` seconds. The refreshed job
    is returned once its status is ``Completed`` or ``Finished``.

    Raises:
        RuntimeError: if the status becomes ``Failed`` or ``Canceled``.
    """
    success_states = {"Completed", "Finished"}
    failure_states = {"Failed", "Canceled"}

    while True:
        latest = ml_client.jobs.get(job_name)
        state = getattr(latest, "status", None)
        if state in failure_states:
            raise RuntimeError(f"Job {job_name} finished with status {state}")
        if state in success_states:
            return latest
        # Still in progress (or status unknown): wait before asking again.
        time.sleep(poll_interval)


def run_data_prep_job(
    *, wait_for_completion: bool = True, stream_logs: bool = True, poll_interval: int = 15
) -> Dict[str, str]:
    """Submit the data prep job and return its metadata.

    Args:
        wait_for_completion: When False, return right after submission with
            only ``job_name`` and ``studio_url``.
        stream_logs: When waiting, stream the job logs before checking the
            final status.
        poll_interval: Seconds between status checks while waiting.

    Returns:
        A dict with ``job_name`` and ``studio_url``; ``processed_data_uri`` is
        added when the function waited for the job to finish.

    Raises:
        RuntimeError: if the job ends as ``Failed`` or ``Canceled`` while
            waiting (raised by ``_wait_for_job_completion``).
    """
    prep_command = command(
        code=SRC_DIR,
        command="python data_prep.py --input ${{inputs.raw_data}} --output ${{outputs.processed_data}}",
        inputs={"raw_data": raw_input},
        outputs={"processed_data": Output(type="uri_folder")},
        environment=DEFAULT_ENV,
        compute=DEFAULT_COMPUTE,
        experiment_name="manual-hpo-data-prep",
        display_name="manual-hpo-data-prep",
    )
    returned_job = ml_client.jobs.create_or_update(prep_command)
    result = {"job_name": returned_job.name, "studio_url": returned_job.studio_url}
    print(f"Data prep job submitted: {returned_job.name} | Studio: {returned_job.studio_url}")

    if not wait_for_completion:
        print(
            "Data prep job is running asynchronously; record the job name above and call "
            "`fetch_processed_data_uri(job_name)` once it finishes to get the output URI."
        )
        return result

    if stream_logs:
        stream_job(returned_job.name)
    # Validate the terminal status on both paths. Previously the streaming
    # path re-fetched the job without checking that it had actually succeeded,
    # so an unsuccessful job could reach the output-URI lookup below.
    completed_job = _wait_for_job_completion(returned_job.name, poll_interval=poll_interval)

    processed_uri = _get_output_uri(completed_job, "processed_data")
    print(f"Processed data available at: {processed_uri}")
    result["processed_data_uri"] = processed_uri
    return result


def fetch_processed_data_uri(
    job_name: str, output_name: str = "processed_data", poll_interval: int = 15
) -> str:
    """Wait for an already-submitted job and return the URI of one of its outputs.

    Polls ``job_name`` every ``poll_interval`` seconds until it finishes, then
    resolves and prints the URI of ``output_name``.
    """
    finished_job = _wait_for_job_completion(job_name, poll_interval=poll_interval)
    output_uri = _get_output_uri(finished_job, output_name)
    message = f"Processed data for {job_name} available at: {output_uri}"
    print(message)
    return output_uri


def _as_choice_list(value):
    """Return ``value`` as an iterable of choices, wrapping scalars and strings."""
    # A bare string would otherwise be expanded character by character, and a
    # bare number would make itertools.product raise TypeError.
    if isinstance(value, (str, bytes)) or not hasattr(value, "__iter__"):
        return [value]
    return value


def build_trial_grid(max_trials: Optional[int] = None) -> List[Dict[str, float]]:
    """Expand the discrete search space into explicit trial configs.

    Every model in ``hpo_cfg["search_space"]`` contributes the Cartesian
    product of its parameter choices. Each trial maps
    ``"<model>_<param>"`` to the chosen value and ``"model_type"`` to the
    model name. A model without parameters yields a single trial holding only
    ``"model_type"``. A parameter given as a single scalar (or string) counts
    as one choice.

    Args:
        max_trials: Keep only the first ``max_trials`` trials. ``None`` or
            ``0`` means no limit.

    Returns:
        The list of trial parameter dicts, in search-space order.

    Raises:
        ValueError: if ``max_trials`` is negative (a negative slice bound
            would silently drop trials from the end).
    """
    if max_trials is not None and max_trials < 0:
        raise ValueError(f"max_trials must be non-negative, got {max_trials}")

    # `or {}` also covers a `search_space:` key that is present but null.
    search_space = hpo_cfg.get("search_space") or {}
    trials: List[Dict[str, float]] = []
    for model_name, model_space in search_space.items():
        if not model_space:
            trials.append({"model_type": model_name})
            continue
        keys = list(model_space)
        choices = [_as_choice_list(model_space[key]) for key in keys]
        for combo in itertools.product(*choices):
            trial_params = {f"{model_name}_{key}": value for key, value in zip(keys, combo)}
            trial_params["model_type"] = model_name
            trials.append(trial_params)
    if max_trials:
        trials = trials[:max_trials]
    print(f"Prepared {len(trials)} trial definitions")
    return trials


def submit_training_trial(trial_idx: int, processed_uri: str, trial_params: Dict[str, float]):
    """Submit a single training job with explicit hyperparameters.

    Args:
        trial_idx: Trial number, used in log output and the job display name.
        processed_uri: URI of the processed data folder used as job input.
        trial_params: Trial definition. ``"model_type"`` is required and is
            passed as ``--model-type``; every other entry whose value is not
            ``None`` becomes a ``--set key=value`` override.

    Returns:
        The name of the submitted job.

    Raises:
        KeyError: if ``trial_params`` has no ``"model_type"`` entry.
    """
    import shlex  # local import: only this function needs it

    model_type = trial_params["model_type"]

    # Quote each override so values containing spaces or shell metacharacters
    # (e.g. a list-valued hyperparameter) reach train.py as one argument.
    # Plain alphanumeric/numeric values are left unchanged by shlex.quote.
    overrides = [
        "--set " + shlex.quote(f"{key}={value}")
        for key, value in trial_params.items()
        if key != "model_type" and value is not None
    ]
    command_line = " ".join(
        [
            "python train.py",
            "--data ${{inputs.processed_data}}",
            "--model-artifact-dir ${{outputs.model_output}}",
            f"--model-type {shlex.quote(str(model_type))}",
            *overrides,
        ]
    )

    train_command = command(
        code=SRC_DIR,
        command=command_line,
        inputs={"processed_data": Input(type="uri_folder", path=processed_uri)},
        outputs={"model_output": Output(type="uri_folder")},
        environment=DEFAULT_ENV,
        compute=DEFAULT_COMPUTE,
        experiment_name=hpo_cfg.get("experiment_name", "manual-hpo-trials"),
        display_name=f"manual-hpo-trial-{trial_idx:03d}-{model_type}",
    )
    returned_job = ml_client.jobs.create_or_update(train_command)
    print(f"Trial {trial_idx} submitted: {returned_job.name} | Studio: {returned_job.studio_url}")
    return returned_job.name



In [None]:
# Opt into fire-and-forget submission with AML_DATA_PREP_ASYNC=true/1/yes.
async_setting = os.getenv("AML_DATA_PREP_ASYNC", "false").lower()
DATA_PREP_ASYNC = async_setting in {"true", "1", "yes"}

# Waiting and log streaming are both switched off together in async mode.
run_blocking = not DATA_PREP_ASYNC
data_prep_result = run_data_prep_job(
    wait_for_completion=run_blocking,
    stream_logs=run_blocking,
)

data_prep_job_name = data_prep_result["job_name"]
processed_data_uri = data_prep_result.get("processed_data_uri")

# No URI yet (async submission): poll the job until its output is available.
if not processed_data_uri:
    print(f"Waiting for processed data from job {data_prep_job_name}...")
    processed_data_uri = fetch_processed_data_uri(data_prep_job_name)

print(f"Processed data URI: {processed_data_uri}")



In [None]:
# processed_data_uri is guaranteed by the previous cell.



In [None]:
# `or {}` also covers a `budget:` key that is present but null in the config,
# which would otherwise make the chained .get() raise AttributeError.
trial_definitions = build_trial_grid(max_trials=(hpo_cfg.get("budget") or {}).get("max_trials"))

In [None]:
# Submit one job per trial definition, numbering trials from 1. Names are
# appended as they are submitted, so the list reflects every job that was
# actually created even if a later submission fails.
trial_jobs: List[str] = []
for trial_number, trial_config in enumerate(trial_definitions, start=1):
    trial_jobs.append(submit_training_trial(trial_number, processed_data_uri, trial_config))
    # To follow a trial's logs synchronously, call stream_job() on the name
    # that was just appended.

print("Submitted trial jobs:")
for submitted_name in trial_jobs:
    print(f"  - {submitted_name}")



## Next steps

- Use `stream_job(job_name)` on any of the trials to follow logs live.
- After all jobs finish, open each run in Azure ML Studio to compare metrics and hyperparameters.
- Optionally rerun `extract_best_params.py` with the best trial's `parent_run_id` (each trial writes its own MLflow run).

