# Hyperparameter Optimization (HPO) - Manual Sweep Trials

This notebook orchestrates hyperparameter optimization sweeps for customer churn prediction models.

## Workflow

1. **Setup**: Configure Azure ML client and load HPO configuration
2. **Data**: Set training data URI (from a previous data prep job, or a manually entered value)
3. **Configure Sweeps**: Build sweep jobs per model type from `configs/hpo.yaml`
4. **Submit Sweeps**: Submit sweep jobs to Azure ML (or load previous submissions)
5. **Analyze Results**: Find best model and export configuration to `configs/train.yaml`



In [122]:
# Ensure we run from the project root so component paths resolve correctly
import os
from pathlib import Path


def _find_project_root(start_dir: Path) -> Path:
    """Locate the repository root so relative component paths resolve.

    A directory counts as the root when it contains at least one of
    ``.git``, ``pyproject.toml`` or ``setup.cfg`` AND all of ``configs``,
    ``src`` and ``notebooks``.

    Search order:
        1. ``start_dir`` itself, then each of its ancestors.
        2. The immediate children of ``start_dir`` (in sorted order, so the
           result does not depend on filesystem listing order).
        3. The ``AML_PROJECT_ROOT`` environment variable, used as-is.

    Args:
        start_dir: Directory to start searching from.

    Returns:
        The detected project root.

    Raises:
        RuntimeError: If no candidate matches and ``AML_PROJECT_ROOT`` is unset.
    """
    any_markers = (".git", "pyproject.toml", "setup.cfg")
    required_entries = ("configs", "src", "notebooks")

    def _looks_like_root(path: Path) -> bool:
        has_marker = any((path / marker).exists() for marker in any_markers)
        return has_marker and all((path / entry).exists() for entry in required_entries)

    for candidate in (start_dir, *start_dir.parents):
        if _looks_like_root(candidate):
            return candidate

    try:
        for child in sorted(start_dir.iterdir()):
            if child.is_dir() and _looks_like_root(child):
                return child
    except OSError:
        # Listing can fail because start_dir is unreadable, missing, or not a
        # directory; none of these should prevent the env-var fallback below.
        pass

    env_override = os.getenv("AML_PROJECT_ROOT")
    if env_override:
        return Path(env_override).resolve()

    raise RuntimeError(
        "Unable to determine project root. Set AML_PROJECT_ROOT or start inside the repo."
    )


# Remember where the kernel started, then resolve the repository root from it.
NOTEBOOK_DIR = Path.cwd().resolve()
PROJECT_ROOT = _find_project_root(NOTEBOOK_DIR)
# Switch the working directory only when the kernel is not already at the root.
if PROJECT_ROOT != Path.cwd():
    os.chdir(PROJECT_ROOT)


In [123]:
from __future__ import annotations

import os
import time
from pathlib import Path
from typing import Dict, List, Optional

import yaml
from dotenv import load_dotenv
from azure.identity import DefaultAzureCredential
from azure.ai.ml import Input, MLClient

# Allow importing project utilities
import sys
if str(PROJECT_ROOT.resolve()) not in sys.path:
    sys.path.append(str(PROJECT_ROOT.resolve()))

import importlib
import hpo_utils  # noqa: E402
hpo_utils = importlib.reload(hpo_utils)
from azure.ai.ml.sweep import Choice

from src.utils import get_data_asset_config, load_azure_config  # noqa: E402

In [124]:
# Load environment variables from the project's config.env before the project
# helpers below are called.
load_dotenv(PROJECT_ROOT / "config.env")

# Workspace coordinates and data-asset settings come from project helpers in
# src.utils (their implementation is not shown in this notebook).
azure_cfg = load_azure_config()
data_asset_cfg = get_data_asset_config()

# Single shared client used by every job submission and lookup below.
credential = DefaultAzureCredential()
ml_client = MLClient(
    credential,
    subscription_id=azure_cfg["subscription_id"],
    resource_group_name=azure_cfg["resource_group"],
    workspace_name=azure_cfg["workspace_name"],
)

# Studio deep link for the configured workspace (built here; not read by any
# other cell visible in this notebook).
workspace_url = f"https://ml.azure.com/?wsid=/subscriptions/{azure_cfg['subscription_id']}/resourcegroups/{azure_cfg['resource_group']}/workspaces/{azure_cfg['workspace_name']}"



Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


## 1. Setup and Configuration

Load Azure ML configuration and initialize the ML client.


In [125]:
# Absolute path of the code folder uploaded with each command job.
SRC_DIR = str((PROJECT_ROOT / "src").resolve())
# Runtime defaults; each can be overridden through the named environment variable.
DEFAULT_ENV = os.getenv("AML_DEFAULT_ENV", "azureml:bank-churn-env:1")
DEFAULT_COMPUTE = os.getenv("AML_COMPUTE_CLUSTER", "cpu-cluster")
PROCESSED_DATA_DATASTORE = os.getenv("AML_PROCESSED_DATA_DATASTORE", "workspaceblobstore")
PROCESSED_DATA_PREFIX = os.getenv("AML_PROCESSED_DATA_PREFIX", "manual-hpo-data")

# HPO settings and the derived search space come from the project's hpo_utils
# helpers (presumably backed by configs/hpo.yaml - confirm in hpo_utils).
hpo_cfg = hpo_utils.load_hpo_config()
search_space_cfg = hpo_utils.build_parameter_space(hpo_cfg.get("search_space", {}))

# Fail early when the training config is missing; an empty file yields {}.
train_config_path = PROJECT_ROOT / "configs" / "train.yaml"
if not train_config_path.exists():
    raise FileNotFoundError(
        f"Expected training config at {train_config_path}. Ensure you're running inside the repo."
    )
with train_config_path.open() as f:
    train_cfg = yaml.safe_load(f) or {}

# Raw data asset (name:version from the data-asset config), mounted as the
# input folder of the data prep job.
raw_input = Input(
    type="uri_folder",
    path=f"azureml:{data_asset_cfg['data_asset_name']}:{data_asset_cfg['data_asset_version']}",
    mode="mount",
)

In [126]:
from azure.ai.ml import command, Output
from azure.ai.ml.sweep import BanditPolicy
from typing import Any


# Attribute / dictionary key names probed, in this order, when looking for the
# URI behind a job output. The first key holding a truthy value wins, so the
# order of this tuple is significant.
_URI_KEYS = (
    "uri",
    "path",
    "value",
    "uri_folder",
    "uri_file",
    "asset_uri",
    "assetUri",
    "location",
)


def _extract_asset_reference(container: Dict[str, Any]) -> Optional[str]:
    """Build an asset reference from a metadata mapping, if it carries one.

    Both snake_case and camelCase spellings of each key are accepted, with the
    snake_case spelling taking precedence.

    Returns:
        ``"azureml:<name>:<version>"`` when both a name and a version are
        present, otherwise the asset id when present, otherwise ``None``.
    """

    def _either(snake: str, camel: str) -> Any:
        return container.get(snake) or container.get(camel)

    name = _either("asset_name", "assetName")
    version = _either("asset_version", "assetVersion")
    if name and version:
        return f"azureml:{name}:{version}"
    return _either("asset_id", "assetId") or None


def _extract_uri_from_mapping(data: Optional[Dict[str, Any]]) -> Optional[str]:
    """Pull a URI (or asset reference) out of a serialized job-output mapping.

    Falsy input yields ``None`` and a plain string is returned unchanged.
    Otherwise the mapping itself is searched first and its ``"metadata"``
    sub-mapping second; within each, the ``_URI_KEYS`` entries are tried in
    order before falling back to an asset reference.
    """
    if not data:
        return None
    if isinstance(data, str):
        return data

    for container in (data, data.get("metadata") or {}):
        candidates = (container.get(key) for key in _URI_KEYS)
        found = next((value for value in candidates if value), None)
        found = found or _extract_asset_reference(container)
        if found:
            return found
    return None


def stream_job(job_name: str) -> None:
    """Stream logs for an Azure ML job.

    Thin wrapper that forwards ``job_name`` to ``ml_client.jobs.stream`` using
    the notebook-level ``ml_client``.

    Args:
        job_name: Name of the job whose logs should be streamed.
    """
    ml_client.jobs.stream(job_name)


def _resolve_output_uri(job_output) -> str:
    """Return the URI behind a job output, trying progressively looser lookups.

    Lookup order: a plain string is returned as-is; then the attributes named
    in ``_URI_KEYS``; then an asset reference assembled from the instance
    ``__dict__``; then the mappings produced by ``as_dict()`` and
    ``_to_dict()`` when the object offers them; finally the object itself when
    it is a dict.

    Raises:
        AttributeError: If none of the lookups produces a value.
    """
    if isinstance(job_output, str):
        return job_output

    attribute_values = (getattr(job_output, name, None) for name in _URI_KEYS)
    direct = next((value for value in attribute_values if value), None)
    if direct:
        return direct

    from_instance = _extract_asset_reference(getattr(job_output, "__dict__", {}) or {})
    if from_instance:
        return from_instance

    for exporter in ("as_dict", "_to_dict"):
        if not hasattr(job_output, exporter):
            continue
        resolved = _extract_uri_from_mapping(getattr(job_output, exporter)() or {})
        if resolved:
            return resolved

    if isinstance(job_output, dict):
        resolved = _extract_uri_from_mapping(job_output)
        if resolved:
            return resolved

    raise AttributeError("Unable to resolve output URI from job output metadata.")


def _get_output_uri(job, output_name: str) -> str:
    """Resolve the storage URI of one named output of a job.

    The output object is inspected first. If that fails, the ``"outputs"`` and
    then ``"job_outputs"`` sections of the job's dictionary form are searched.
    As a last resort a conventional run-output path is synthesized (datastore
    taken from ``AML_ARTIFACT_DATASTORE``) and a notice is printed.

    Raises:
        KeyError: If the job has no output called ``output_name``.
    """
    outputs = getattr(job, "outputs", None) or {}
    if output_name not in outputs:
        raise KeyError(f"Job {job.name} has no output named '{output_name}'.")
    output = outputs[output_name]
    job_dict = job._to_dict() if hasattr(job, "_to_dict") else {}

    try:
        return _resolve_output_uri(output)
    except AttributeError:
        pass

    # The output object gave nothing usable; look in the serialized job.
    for section in ("outputs", "job_outputs"):
        entry = (job_dict.get(section) or {}).get(output_name) or {}
        fallback = _extract_uri_from_mapping(entry)
        if fallback:
            return fallback

    artifact_store = os.getenv("AML_ARTIFACT_DATASTORE", "workspaceartifactstore")
    run_output_path = (
        f"azureml://datastores/{artifact_store}/paths/ExperimentRun/dcid.{job.name}/"
        f"outputs/{output_name}/"
    )
    print(
        f"Unable to locate explicit URI for {job.name}:{output_name}; "
        f"falling back to {run_output_path}. Consider registering the output as a data asset."
    )
    return run_output_path


def _wait_for_job_completion(
    job_name: str, poll_interval: int = 15, timeout: Optional[float] = None
):
    """Poll a job until it finishes and return the refreshed job.

    Args:
        job_name: Name of the job to poll through ``ml_client.jobs.get``.
        poll_interval: Seconds to sleep between polls.
        timeout: Optional upper bound, in seconds, on the total wait. ``None``
            (the default) keeps the original behavior of waiting indefinitely.

    Returns:
        The job object as last fetched, once its status is ``Completed`` or
        ``Finished``.

    Raises:
        RuntimeError: If the job ends with status ``Failed`` or ``Canceled``.
        TimeoutError: If ``timeout`` is set and elapses before the job ends.
    """
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        fresh_job = ml_client.jobs.get(job_name)
        status = getattr(fresh_job, "status", None)
        if status in {"Completed", "Finished"}:
            return fresh_job
        if status in {"Failed", "Canceled"}:
            raise RuntimeError(f"Job {job_name} finished with status {status}")
        # Check the deadline after a poll so the job is always inspected at
        # least once, even with a very small timeout.
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Job {job_name} still has status {status} after {timeout} seconds"
            )
        time.sleep(poll_interval)


def run_data_prep_job(
    *, wait_for_completion: bool = True, stream_logs: bool = True, poll_interval: int = 15
) -> Dict[str, str]:
    """Submit the data prep command job and report where its output lands.

    Args:
        wait_for_completion: When False, return right after submission.
        stream_logs: When waiting, stream the job logs instead of polling.
        poll_interval: Seconds between polls when not streaming.

    Returns:
        A dict with ``job_name`` and ``studio_url``; ``processed_data_uri`` is
        added only when the call waited for the job to finish.
    """
    # A timestamped sub-folder gives each run its own output location.
    timestamp = int(time.time())
    output_uri = (
        f"azureml://datastores/{PROCESSED_DATA_DATASTORE}/paths/"
        f"{PROCESSED_DATA_PREFIX}/{timestamp}"
    )
    submitted = ml_client.jobs.create_or_update(
        command(
            code=SRC_DIR,
            command="python data_prep.py --input ${{inputs.raw_data}} --output ${{outputs.processed_data}}",
            inputs={"raw_data": raw_input},
            outputs={"processed_data": Output(type="uri_folder", path=output_uri)},
            environment=DEFAULT_ENV,
            compute=DEFAULT_COMPUTE,
            experiment_name="manual-hpo-data-prep",
            display_name="manual-hpo-data-prep",
        )
    )
    job_name, studio_url = submitted.name, submitted.studio_url
    result = {"job_name": job_name, "studio_url": studio_url}
    print(f"Data prep job submitted: {job_name} | Studio: {studio_url}")

    if not wait_for_completion:
        print(
            "Data prep job is running asynchronously; record the job name above and call "
            "`fetch_processed_data_uri(job_name)` once it finishes to get the output URI."
        )
        return result

    if stream_logs:
        # Streaming returns once the logs end; re-fetch to read the outputs.
        stream_job(job_name)
        finished = ml_client.jobs.get(job_name)
    else:
        finished = _wait_for_job_completion(job_name, poll_interval=poll_interval)

    result["processed_data_uri"] = _get_output_uri(finished, "processed_data")
    return result


def fetch_processed_data_uri(
    job_name: str, output_name: str = "processed_data", poll_interval: int = 15
) -> str:
    """Block until ``job_name`` finishes, then return the URI of ``output_name``.

    Intended for jobs submitted asynchronously with ``run_data_prep_job``.
    """
    return _get_output_uri(
        _wait_for_job_completion(job_name, poll_interval=poll_interval),
        output_name,
    )


## 2. Training Data

**Option A**: Run a new data prep job (cell below) to create processed data  
**Option B**: Use existing processed data (skip to the training data URI cell)


In [127]:
# OPTIONAL: Run a new data preparation job
# Uncomment and run this cell to create new processed data for training

# DATA_PREP_ASYNC = os.getenv("AML_DATA_PREP_ASYNC", "false").lower() in {"true", "1", "yes"}

# data_prep_result = run_data_prep_job(
#     wait_for_completion=not DATA_PREP_ASYNC,
#     stream_logs=not DATA_PREP_ASYNC,
# )

# data_prep_job_name = data_prep_result["job_name"]
# processed_data_uri = data_prep_result.get("processed_data_uri")
# if not processed_data_uri:
#     print(f"Waiting for processed data from job {data_prep_job_name}...")
#     processed_data_uri = fetch_processed_data_uri(data_prep_job_name)

# print(f"✓ Processed data URI: {processed_data_uri}")
# print(f"✓ Set this as your training_data_uri in the cell below, or set:")
# print(f"  os.environ['AML_PROCESSED_DATA_URI'] = '{processed_data_uri}'")


### Set Training Data URI

Set the training data URI for sweep jobs. This should point to processed data from a data prep job.


In [128]:
# Set training data URI for sweep jobs.
#
# OPTION 1: Set manually by editing the value below. Comment the line out to
#           require OPTION 2 instead.
training_data_uri = "azureml:azureml_sleepy_lime_0cgj10sz9w_output_data_processed_data:1"

# OPTION 2: Get from data prep job cell above (if you ran it)

# `processed_data_uri` exists only when the optional data prep cell was run in
# this kernel session; when it does, it takes precedence over the manual value.
try:
    training_data_uri = processed_data_uri
except NameError:
    # Not set from data prep job, must be manually set above
    try:
        # Bare name lookup: raises NameError only when the manual assignment
        # above has been commented out and no earlier run defined the name.
        training_data_uri
    except NameError:
        raise RuntimeError(
            "No training data URI available. Either:\n"
            "  1. Uncomment and set training_data_uri manually above, or\n"
            "  2. Run the data prep job cell above to create processed data"
        )

## 3. Configure Sweep Jobs

Build sweep job configurations from `configs/hpo.yaml`. Each model type gets its own sweep.


## 4. Submit or Load Sweep Jobs

**Option A**: Submit new sweep jobs (run the cell below)  
**Option B**: Load previous sweep submissions (skip to the next section)


In [None]:
# Configure sweep jobs per model (no cross-model mixing).
# Each entry of search_space_cfg["model_types"] gets its own sweep job, stored
# in `sweep_jobs` under the model name. Nothing is submitted in this cell.
budget_cfg = hpo_cfg.get("budget", {})
timeouts_cfg = hpo_cfg.get("timeouts", {})
early_cfg = hpo_cfg.get("early_stopping", {})

sweep_jobs = {}
for model_name in search_space_cfg.get("model_types", []):
    model_space = search_space_cfg.get(model_name)
    if not model_space:
        print(f"Skipping sweep for {model_name}: no hyperparameters defined in configs/hpo.yaml.")
        continue

    # Fixed part of the trial command; one "--name ${{search_space.name}}"
    # segment is appended below for every swept hyperparameter.
    command_segments = [
        "python run_sweep_trial.py",
        "--data ${{inputs.processed_data}}",
        f"--model-type {model_name}",
        "--model-artifact-dir ${{outputs.model_output}}",
    ]

    base_command_inputs = {
        "processed_data": Input(type="uri_folder", path=training_data_uri),
    }

    sweep_search_space = {}
    # NOTE(review): hyperparam_names is filled in below but never read again
    # in this cell - confirm whether a summary print was intended.
    hyperparam_names = []
    
    # Add training-level parameters (use_smote, class_weight, random_state) if they're in search space
    # These are swept across all models, not model-specific
    training_level_params = ["use_smote", "class_weight", "random_state"]
    for param_name in training_level_params:
        if param_name in search_space_cfg:
            param_values = search_space_cfg[param_name]
            # Only add to sweep if it's a list (for sweeping), not a single value
            if isinstance(param_values, list):
                hyperparam_names.append(param_name)
                # The quadrupled braces render as a literal ${{search_space.<name>}}
                command_segments.append(f"--{param_name} ${{{{search_space.{param_name}}}}}")
                sweep_search_space[param_name] = Choice(values=param_values)
    
    # Add model-specific hyperparameters
    # Names are prefixed with the model name ("<model>_<hyperparameter>") both
    # on the command line and in the search space.
    for hp_name, hp_values in model_space.items():
        prefixed_name = f"{model_name}_{hp_name}"
        hyperparam_names.append(prefixed_name)
        command_segments.append(f"--{prefixed_name} ${{{{search_space.{prefixed_name}}}}}")
        sweep_search_space[prefixed_name] = Choice(values=hp_values)

    sweep_command = " ".join(command_segments)

    base_training_command = command(
        code=SRC_DIR,
        command=sweep_command,
        inputs=base_command_inputs,
        outputs={"model_output": Output(type="uri_folder")},
        environment=DEFAULT_ENV,
        compute=DEFAULT_COMPUTE,
        display_name=f"manual-hpo-sweep-trial-{model_name}",
        experiment_name=hpo_cfg.get("experiment_name", "manual-hpo-sweep"),
    )

    # Early termination is optional; only the "bandit" policy is supported and
    # any other configured name is rejected.
    early_policy = None
    if early_cfg.get("enabled"):
        policy_name = (early_cfg.get("policy", "bandit") or "bandit").lower()
        if policy_name != "bandit":
            raise ValueError(f"Unsupported early stopping policy: {policy_name}")
        # Both intervals are clamped to at least 1; the delay defaults to the
        # evaluation interval.
        eval_interval = max(1, int(early_cfg.get("evaluation_interval", 2)))
        delay_eval = max(1, int(early_cfg.get("delay_evaluation", eval_interval)))
        # Passed through as configured; either may be None when absent.
        slack_factor = early_cfg.get("slack_factor")
        slack_amount = early_cfg.get("slack_amount")
        early_policy = BanditPolicy(
            evaluation_interval=eval_interval,
            delay_evaluation=delay_eval,
            slack_factor=slack_factor,
            slack_amount=slack_amount,
        )

    sweep_kwargs = {
        "primary_metric": hpo_cfg.get("metric", "f1"),
        "goal": "Maximize" if hpo_cfg.get("mode", "max").lower() == "max" else "Minimize",
        "sampling_algorithm": hpo_cfg.get("sampling_algorithm", "random"),
        "search_space": sweep_search_space,
        "early_termination_policy": early_policy,
    }
    # Limits are only passed when configured with a truthy value.
    if budget_cfg.get("max_trials"):
        sweep_kwargs["max_total_trials"] = budget_cfg["max_trials"]
    if budget_cfg.get("max_concurrent"):
        # Concurrency is capped at the total trial budget when one is set.
        sweep_kwargs["max_concurrent_trials"] = min(budget_cfg["max_concurrent"], sweep_kwargs.get("max_total_trials", budget_cfg.get("max_concurrent")))
    # Timeouts are configured in minutes and passed on in seconds.
    if timeouts_cfg.get("total_minutes"):
        sweep_kwargs["timeout"] = int(timeouts_cfg["total_minutes"]) * 60
    if timeouts_cfg.get("trial_minutes"):
        sweep_kwargs["trial_timeout"] = int(timeouts_cfg["trial_minutes"]) * 60

    sweep_job = base_training_command.sweep(**sweep_kwargs)
    # The "<sweep_display_name>-<model>" display name is what
    # load_previous_sweeps matches on during auto-discovery.
    sweep_job.display_name = f"{hpo_cfg.get('sweep_display_name', 'manual-hpo-sweep')}-{model_name}"
    sweep_job.experiment_name = hpo_cfg.get("experiment_name", "manual-hpo-sweep")
    sweep_jobs[model_name] = sweep_job

if not sweep_jobs:
    raise RuntimeError("No sweep jobs were created. Check configs/hpo.yaml::search_space.")

Configured sweep for logreg:
  metric: f1 (Maximize)
  sampling: random
  limits: max_total_trials=2 | max_concurrent=2
  timeouts: total=60 min | trial=20 min
  hyperparameters: use_smote, logreg_C, logreg_solver
-
Configured sweep for rf:
  metric: f1 (Maximize)
  sampling: random
  limits: max_total_trials=2 | max_concurrent=2
  timeouts: total=60 min | trial=20 min
  hyperparameters: use_smote, rf_n_estimators, rf_max_depth, rf_min_samples_split, rf_min_samples_leaf, rf_max_features
-
Configured sweep for xgboost:
  metric: f1 (Maximize)
  sampling: random
  limits: max_total_trials=2 | max_concurrent=2
  timeouts: total=60 min | trial=20 min
  hyperparameters: use_smote, xgboost_n_estimators, xgboost_max_depth, xgboost_learning_rate, xgboost_subsample, xgboost_colsample_bytree
-


## 5. Load Previous Sweep Submissions

Load existing sweep jobs by name or auto-discover from the experiment.


In [None]:
# Submit new sweep jobs and populate sweep_submissions.
# Run this cell to create new sweeps, or skip to load previous submissions below.
# `sweep_submissions` maps each model name to the job object returned by the
# service; it is rebuilt from scratch on every run of this cell.
sweep_submissions = {}
for model_name, sweep_job in sweep_jobs.items():
    submission = ml_client.jobs.create_or_update(sweep_job)
    sweep_submissions[model_name] = submission
    print(f"✓ {model_name}: {submission.name} | {submission.studio_url}")


pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


Sweep job submitted for logreg!
  Name      : frank_tent_7zshzmfxnt
  Status    : Running
  Studio URL: https://ml.azure.com/runs/frank_tent_7zshzmfxnt?wsid=/subscriptions/a23fa87c-802c-4fdf-9e59-e3d7969bcf31/resourcegroups/rg-churn-ml-project-2025-11-15/workspaces/churn-ml-workspace&tid=e7572e92-7aee-4713-a3c4-ba64888ad45f
-


pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


Sweep job submitted for rf!
  Name      : great_kitchen_qxthwhlfy3
  Status    : Running
  Studio URL: https://ml.azure.com/runs/great_kitchen_qxthwhlfy3?wsid=/subscriptions/a23fa87c-802c-4fdf-9e59-e3d7969bcf31/resourcegroups/rg-churn-ml-project-2025-11-15/workspaces/churn-ml-workspace&tid=e7572e92-7aee-4713-a3c4-ba64888ad45f
-


pathOnCompute is not a known attribute of class <class 'azure.ai.ml._restclient.v2023_04_01_preview.models._models_py3.UriFolderJobOutput'> and will be ignored


Sweep job submitted for xgboost!
  Name      : musing_salt_8zbzdr8wtr
  Status    : Running
  Studio URL: https://ml.azure.com/runs/musing_salt_8zbzdr8wtr?wsid=/subscriptions/a23fa87c-802c-4fdf-9e59-e3d7969bcf31/resourcegroups/rg-churn-ml-project-2025-11-15/workspaces/churn-ml-workspace&tid=e7572e92-7aee-4713-a3c4-ba64888ad45f
-


## 6. Analyze Results

Find the best model across all sweeps and display its parameters.


In [131]:
def load_previous_sweeps(ml_client, search_space_cfg: dict, hpo_cfg: dict, specific_sweep_jobs: dict | None = None, auto_discovery: bool | None = None):
    """Load previous sweep jobs by model name using:
       1) Explicit job names (if provided)
       2) Auto-discovery (if enabled via parameter or env var)
    """
    specific_sweep_jobs = specific_sweep_jobs or {}
    if auto_discovery is None:
        use_auto_discovery = os.getenv("AML_USE_PREVIOUS_SWEEPS", "false").lower() in {"true", "1", "yes"}
    else:
        use_auto_discovery = auto_discovery

    sweep_submissions = {}
    model_types = search_space_cfg.get("model_types", [])

    # 1) Explicit job names
    for model_name, job_name in specific_sweep_jobs.items():
        try:
            job = ml_client.jobs.get(job_name)
            if job.type == "sweep":
                sweep_submissions[model_name] = job
        except Exception:
            pass

    # 2) Auto-discovery for missing models
    if use_auto_discovery and len(sweep_submissions) < len(model_types):
        experiment_name = hpo_cfg.get("experiment_name", "manual-hpo-sweep")
        prefix = hpo_cfg.get("sweep_display_name", "manual-hpo-sweep")

        for job in ml_client.jobs.list():
            if (
                job.type == "sweep"
                and getattr(job, "experiment_name", None) == experiment_name
                and getattr(job, "display_name", "").startswith(prefix)
            ):
                for model_name in model_types:
                    if model_name not in sweep_submissions:
                        if f"-{model_name}" in job.display_name or job.display_name.endswith(model_name):
                            sweep_submissions[model_name] = job

    if not sweep_submissions:
        print("No sweeps found.")

    return sweep_submissions


# Example usage
# Optional explicit mapping of model name -> sweep job name. Models listed here
# are loaded by name; the rest are left to auto-discovery.
SPECIFIC_SWEEP_JOBS = {
    # "rf": "gentle_ear_wx2w5x8k5t",
}

# auto_discovery is passed explicitly, so the AML_USE_PREVIOUS_SWEEPS
# environment variable is not consulted here. The result replaces any
# `sweep_submissions` produced by the submission cell.
sweep_submissions = load_previous_sweeps(
    ml_client=ml_client,
    search_space_cfg=search_space_cfg,
    hpo_cfg=hpo_cfg,
    specific_sweep_jobs=SPECIFIC_SWEEP_JOBS,
    auto_discovery=True
)



## 7. Export Best Model Configuration

Export the best model's hyperparameters to `configs/train.yaml` for production training.


In [None]:
# Find and display the best model with its parameters.
import ast

# Metric name used when reporting the best model; same "metric" key (default
# "f1") that the sweep configuration passes as the primary metric.
primary_metric_name = hpo_cfg.get("metric", "f1")
# Case-insensitive spellings that are mapped to Python singletons before any
# literal parsing is attempted.
parameter_coercions = {"true": True, "false": False, "none": None}


def _coerce(value):
    """Convert a string parameter value into the Python value it spells.

    Non-string values are returned unchanged. Strings are first matched
    (trimmed, case-insensitively) against ``parameter_coercions``; otherwise
    they are parsed with ``ast.literal_eval``. Any string that is not a valid
    Python literal is returned as-is.
    """
    if not isinstance(value, str):
        return value
    lowered = value.strip().lower()
    if lowered in parameter_coercions:
        return parameter_coercions[lowered]
    try:
        return ast.literal_eval(value)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        # literal_eval is documented to raise any of these on malformed input
        # (e.g. TypeError for an unhashable dict key); keep the raw string.
        return value


# Pick the best sweep across models. `best_overall` stays None until a sweep
# reports both a best child run and a numeric score.
best_overall = None
# Follow the same "mode" setting that decided Maximize/Minimize when the
# sweeps were configured, so a minimized metric selects the lowest score.
lower_is_better = str(hpo_cfg.get("mode", "max")).lower() != "max"

for model_name, submission in sweep_submissions.items():
    sweep_name = submission.name
    # Re-fetch the sweep so the properties reflect its current state.
    sweep_job = ml_client.jobs.get(sweep_name)
    properties = getattr(sweep_job, "properties", None) or {}
    best_child_run_id = properties.get("best_child_run_id")
    raw_score = properties.get("score")

    if not best_child_run_id or raw_score is None:
        continue

    try:
        metric_value = float(raw_score)
    except (TypeError, ValueError):
        # A non-numeric score can neither be compared nor formatted below.
        print(f"Skipping {model_name}: non-numeric score {raw_score!r} reported by sweep {sweep_name}.")
        continue
    if metric_value != metric_value:
        # NaN compares false against everything and would never be replaced.
        continue

    if best_overall is not None:
        if lower_is_better:
            improved = metric_value < best_overall["metric"]
        else:
            improved = metric_value > best_overall["metric"]
        if not improved:
            continue

    # Only fetch the child run once this sweep is known to be the new best.
    child_job = ml_client.jobs.get(best_child_run_id)
    params = {k: _coerce(v) for k, v in (getattr(child_job, "parameters", {}) or {}).items()}

    best_overall = {
        "model_name": model_name,
        "metric": metric_value,
        "params": params,
    }

if best_overall:
    print(f"Best Model: {best_overall['model_name']} ({primary_metric_name}={best_overall['metric']:.4f})")
    print(f"Parameters: {best_overall['params']}")
else:
    print("No completed trials found. Re-run this cell after sweeps finish.")



Best Model:
  Model: xgboost
  Metric (f1): 0.6362573099415205
  Parameters: {'use_smote': True, 'class_weight': 'balanced', 'random_state': 42, 'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'subsample': 1, 'colsample_bytree': 0.8}


In [None]:
# Export best model configuration to train.yaml
# Rewrites configs/train.yaml (relative to the working directory) so that its
# "training" section selects the best model and carries its hyperparameters.
# All other top-level sections of the existing file are kept.
if best_overall:
    model_name = best_overall["model_name"]
    params = best_overall["params"]

    # Load existing config to preserve structure
    config_path = Path("configs/train.yaml")
    config_path.parent.mkdir(parents=True, exist_ok=True)
    existing_config = {}
    if config_path.exists():
        with open(config_path, "r", encoding="utf-8") as f:
            existing_config = yaml.safe_load(f) or {}

    # An empty "training:" key loads as None; treat it like a missing section.
    training_config = existing_config.get("training") or {}

    # Extract and normalize parameters
    training_level_keys = ["use_smote", "class_weight", "random_state"]
    training_configs = {}
    model_hyperparams = {}
    prefix = f"{model_name}_"

    for key, value in params.items():
        # Convert booleans to quoted strings
        if isinstance(value, bool):
            value = "true" if value else "false"

        if key in training_level_keys:
            training_configs[key] = value
        elif key.startswith(prefix):
            # Strip the "<model>_" prefix that the sweep added to the name.
            model_hyperparams[key[len(prefix):]] = value
        elif not any(key.startswith(f"{p}_") for p in ["rf", "logreg", "xgboost"]):
            # Unprefixed keys are kept; keys prefixed for another model are dropped.
            model_hyperparams[key] = value

    # Update config (preserve existing structure)
    training_config["models"] = [model_name]
    training_config.update(training_configs)
    # Same None-tolerance for an empty "hyperparameters:" key.
    hyperparameters = training_config.get("hyperparameters") or {}
    hyperparameters[model_name] = model_hyperparams
    training_config["hyperparameters"] = hyperparameters

    # Force quote all strings in YAML
    class QuotedString(str):
        pass

    yaml.add_representer(QuotedString, lambda d, v: d.represent_scalar('tag:yaml.org,2002:str', v, style='"'))

    def quote_strings(obj):
        """Recursively wrap every string value in ``QuotedString``."""
        if isinstance(obj, dict):
            return {k: quote_strings(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [quote_strings(item) for item in obj]
        elif isinstance(obj, str):
            return QuotedString(obj)
        return obj

    # Write to file
    # Explicit UTF-8: allow_unicode=True emits raw non-ASCII characters, which
    # a platform-default encoding may not be able to represent.
    final_config = {**existing_config, "training": training_config}
    with open(config_path, "w", encoding="utf-8") as f:
        yaml.dump(quote_strings(final_config), f, default_flow_style=False, sort_keys=False, allow_unicode=True)

    print(f"✓ Exported to {config_path}")
else:
    print("No best model found. Run the best model analysis cell first.")

✓ Exported best model configuration to configs/train.yaml
  Model: xgboost
  Hyperparameters: {'n_estimators': 100, 'max_depth': 4, 'learning_rate': 0.1, 'subsample': 1, 'colsample_bytree': 0.8}


## 8. Run Training Pipeline

Run the training pipeline with the exported best model configuration.


In [139]:
# Run the training pipeline with the exported best model configuration
import subprocess
import sys

# Launch run_pipeline.py from the project root with the current interpreter.
# Only runs when the analysis cell found a best model.
if best_overall:
    # Check for the script up front: a missing script does not make
    # subprocess.run raise FileNotFoundError (the interpreter itself starts
    # fine and merely exits non-zero), so it has to be detected explicitly.
    pipeline_script = Path(PROJECT_ROOT) / "run_pipeline.py"
    if not pipeline_script.is_file():
        print("✗ run_pipeline.py not found. Ensure you're in the project root.")
        raise FileNotFoundError(f"Pipeline script not found: {pipeline_script}")
    try:
        subprocess.run(
            [sys.executable, "run_pipeline.py"],
            cwd=PROJECT_ROOT,
            check=True,
            capture_output=False,
        )
    except subprocess.CalledProcessError as e:
        print(f"✗ Pipeline submission failed with exit code {e.returncode}")
        raise
else:
    print("No best model found. Run the export cell first.")


Class DeploymentTemplateOperations: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class AutoDeleteConditionSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseAutoDeleteSettingSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class IntellectualPropertySchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class ProtectionLevelSchema: This is an experimental class, and may change at any time. Please see https://aka.ms/azuremlexperimental for more information.
Class BaseIntellectualPropertySchema: Th

✓ Job submitted: witty_lizard_0f8czt39nb
  View in Azure ML Studio: https://ml.azure.com/runs/witty_lizard_0f8czt39nb?wsid=/subscriptions/a23fa87c-802c-4fdf-9e59-e3d7969bcf31/resourcegroups/rg-churn-ml-project-2025-11-15/workspaces/churn-ml-workspace&tid=e7572e92-7aee-4713-a3c4-ba64888ad45f


## Next Steps

- **Monitor Progress**: Use `ml_client.jobs.stream(<job.name>)` or the Studio URL to monitor sweep progress
- **Analyze Results**: After sweeps complete, run the analysis cells above to find the best model
- **Export Config**: The best model configuration will be exported to `configs/train.yaml` for production training
- **Run Pipeline**: Use the pipeline cell above to train the best model, or run `run_pipeline.py` manually

