# Phase 1: Local Training Orchestration (Google Colab Compatible)

This notebook orchestrates all training activities for **local execution** with Google Colab GPU compute support.

## Overview

- **Step 1**: Load Centralized Configs
- **Step 2**: Download Dataset from Kaggle & Create dataset-tiny
- **Step 3**: Setup Local Environment
- **Step 4**: The Dry Run
- **Step 5**: The Sweep (HPO) - Local with Optuna
- **Step 6**: Best Configuration Selection (Automated)
- **Step 7**: Final Training (Post-HPO, Single Run)
- **Step 8**: Model Conversion & Optimization

## Important

- This notebook **executes training locally** (not on Azure ML)
- All computation happens on the local machine or Google Colab GPU
- The notebook must be **re-runnable end-to-end**
- Uses Kaggle dataset download instead of Azure ML data assets


## Step P1-3.1: Load Centralized Configs

Load and validate all configuration files. Configs are immutable and will be logged with each job for reproducibility.


In [1]:
# Install required packages for local execution
# These are the third-party packages the notebook imports directly further down
# (kagglehub for the dataset download, optuna for the sweep, mlflow for tracking).
# `%pip` (rather than `!pip`) targets the running kernel's environment; `--quiet`
# trims pip's output. NOTE(review): versions are unpinned — consider pinning.
%pip install kagglehub optuna mlflow --quiet


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m130.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m89.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m51.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.9/114.9 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.0/85.0 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━

In [3]:
import os
import subprocess
from pathlib import Path

# The presence of either Colab accelerator variable is treated as "running in Colab".
IN_COLAB = 'COLAB_GPU' in os.environ or 'COLAB_TPU' in os.environ
REPO_URL = "https://github.com/longdang193/resume-ner-azureml.git"
COLAB_REPO_DIR = Path("/content/resume-ner-azureml")

if IN_COLAB and not COLAB_REPO_DIR.exists():
    # First run in this Colab session: fetch the project sources.
    # check=True turns a failed clone into an immediate CalledProcessError.
    print("Cloning repository in Colab...")
    subprocess.run(
        ["git", "clone", REPO_URL, str(COLAB_REPO_DIR)],
        check=True
    )
    print(f"✓ Repository cloned to {COLAB_REPO_DIR}")
elif IN_COLAB:
    # Re-run in the same session: refresh the existing checkout in place.
    print(f"✓ Repository already exists at {COLAB_REPO_DIR}")
    os.chdir(COLAB_REPO_DIR)
    pull_result = subprocess.run(
        ["git", "pull"], check=False, capture_output=True, text=True
    )
    # The pull stays best-effort (a stale checkout is still usable), but its
    # outcome is reported truthfully instead of always claiming success.
    if pull_result.returncode == 0:
        print("✓ Repository updated")
    else:
        print(
            f"⚠️  Warning: git pull failed (exit code {pull_result.returncode}); "
            "continuing with the existing checkout"
        )
        if pull_result.stderr.strip():
            print(pull_result.stderr.strip())


Cloning repository in Colab...
✓ Repository cloned to /content/resume-ner-azureml


In [None]:
import os
import sys
from pathlib import Path
from typing import Dict, Any

# Repository root: the fixed clone location in Colab, otherwise the parent of
# the kernel's working directory (assumes the kernel runs from the repository's
# notebooks/ folder — TODO confirm).
COLAB_ROOT_DIR = Path("/content/resume-ner-azureml")
LOCAL_ROOT_DIR = Path("..").resolve()

ROOT_DIR = LOCAL_ROOT_DIR if not IN_COLAB else COLAB_ROOT_DIR.resolve()

if not ROOT_DIR.exists():
    raise FileNotFoundError(
        f"Repository directory not found: {ROOT_DIR}\n"
        f"In Colab, make sure the repository was cloned in the previous cell."
    )

SRC_DIR = ROOT_DIR / "src"
# Make the repository root and src/ importable. The membership check keeps the
# cell idempotent: re-running it no longer appends duplicate sys.path entries.
for import_root in (str(ROOT_DIR), str(SRC_DIR)):
    if import_root not in sys.path:
        sys.path.append(import_root)

if IN_COLAB:
    # Work from the repository root so relative paths resolve inside the clone.
    os.chdir(ROOT_DIR)
    print(f"Running in Google Colab. Changed to directory: {ROOT_DIR}")

from shared.yaml_utils import load_yaml
from shared.json_cache import save_json, load_json
from orchestration import (
    STAGE_SMOKE,
    STAGE_HPO,
    STAGE_TRAINING,
    EXPERIMENT_NAME,
    MODEL_NAME,
    PROD_STAGE,
)
from orchestration.config_loader import (
    ExperimentConfig,
    create_config_metadata,
    load_all_configs,
    load_experiment_config,
    compute_config_hashes,
    snapshot_configs,
    validate_config_immutability,
)

# Echo the resolved locations and the detected runtime, one "label: value" per line.
for path_label, path_value in (
    ("Root directory", ROOT_DIR),
    ("Source directory", SRC_DIR),
    ("In Colab", IN_COLAB),
):
    print(f"{path_label}: {path_value}")


FileNotFoundError: [Errno 2] No such file or directory: '/content/resume-ner-azureml'

In [None]:
# Anchor the config directory to the resolved repository root in every
# environment. The previous local value, the relative Path("../config"), only
# resolved correctly from the notebook's own working directory; later cells hand
# str(CONFIG_DIR) to subprocesses started with cwd=ROOT_DIR, where "../config"
# points outside the repository.
CONFIG_DIR = ROOT_DIR / "config"

# Load the experiment definition and every config it references, then record
# hashes and a snapshot of the loaded configs for the later immutability check.
experiment_config: ExperimentConfig = load_experiment_config(CONFIG_DIR, EXPERIMENT_NAME)
configs = load_all_configs(experiment_config)
config_hashes = compute_config_hashes(configs)
original_configs = snapshot_configs(configs)

print(f"Loaded experiment: {experiment_config.name}")
print(f"Config hashes: {config_hashes}")


In [None]:
# Compare the live `configs` against `original_configs`, the snapshot taken right
# after loading. NOTE(review): presumably raises if any config was mutated in
# between — confirm in orchestration.config_loader.
validate_config_immutability(configs, original_configs)


In [None]:
# Build the metadata record from the loaded configs and their hashes, then echo it.
# NOTE(review): the section intro says configs are "logged with each job";
# presumably this record is that payload — confirm against create_config_metadata.
config_metadata = create_config_metadata(configs, config_hashes)
print("Config metadata:", config_metadata)


## Step P1-3.2: Download Dataset from Kaggle & Create dataset-tiny

Download the dataset from Kaggle and create a tiny subset for smoke testing.


In [None]:
import kagglehub
import shutil

# Source dataset on Kaggle and the in-repo folder that receives train.json.
KAGGLE_DATASET_NAME = "yashpwrr/resume-ner-training-dataset"
LOCAL_DATASET_DIR = ROOT_DIR / "dataset"
LOCAL_DATASET_DIR.mkdir(parents=True, exist_ok=True)

print("Downloading dataset from Kaggle...")
kaggle_path = kagglehub.dataset_download(KAGGLE_DATASET_NAME)
print(f"Dataset downloaded to: {kaggle_path}")

# Take the first train.json found anywhere under the download directory.
dataset_path = Path(kaggle_path)
train_json_path = None
for train_json_candidate in dataset_path.rglob("train.json"):
    train_json_path = train_json_candidate
    break

if train_json_path is None:
    raise FileNotFoundError(f"train.json not found in downloaded dataset at {kaggle_path}")

print(f"Found train.json at: {train_json_path}")

local_train_json = LOCAL_DATASET_DIR / "train.json"
shutil.copy(train_json_path, local_train_json)
print(f"Copied train.json to {local_train_json}")

# Point the in-memory data config at the local copy (path relative to the repo root).
# NOTE(review): this mutates `configs` after the snapshot/hashes were taken — confirm
# that is intended given the "configs are immutable" statement earlier in the notebook.
configs["data"]["local_path"] = str(LOCAL_DATASET_DIR.relative_to(ROOT_DIR))


In [None]:
import json
import yaml

# Size of the smoke-test subset and the per-sample text length cap (in characters).
TINY_TRAIN_SAMPLES = 8
TINY_VAL_SAMPLES = 2
MAX_TEXT_LENGTH_CHARS = 1500

RAW_TRAIN_PATH = LOCAL_DATASET_DIR / "train.json"
TINY_DATA_DIR = ROOT_DIR / "dataset_tiny"
TINY_TRAIN_PATH = TINY_DATA_DIR / "train.json"
TINY_VAL_PATH = TINY_DATA_DIR / "validation.json"

if not RAW_TRAIN_PATH.exists():
    raise FileNotFoundError(f"Raw train.json not found at {RAW_TRAIN_PATH}")

full_train = json.loads(RAW_TRAIN_PATH.read_text(encoding="utf-8"))

if not (isinstance(full_train, list) and full_train):
    raise ValueError("Expected train.json to be a non-empty list of samples")

TINY_DATA_DIR.mkdir(parents=True, exist_ok=True)

# Keep only samples whose "text" is a string within the length cap, in file order.
# NOTE(review): a sample with no "text" key defaults to "" and therefore passes
# this filter — confirm that is intended.
short_samples = []
for sample in full_train:
    sample_text = sample.get("text", "")
    if isinstance(sample_text, str) and len(sample_text) <= MAX_TEXT_LENGTH_CHARS:
        short_samples.append(sample)

required_samples = TINY_TRAIN_SAMPLES + TINY_VAL_SAMPLES
if len(short_samples) < required_samples:
    raise ValueError(
        f"Not enough short samples (<= {MAX_TEXT_LENGTH_CHARS} chars). "
        f"Found {len(short_samples)}, need at least {required_samples}."
    )

# The first TINY_TRAIN_SAMPLES go to train; the next TINY_VAL_SAMPLES go to validation.
train_slice = short_samples[:TINY_TRAIN_SAMPLES]
val_slice = short_samples[TINY_TRAIN_SAMPLES:required_samples]

for tiny_path, tiny_payload in ((TINY_TRAIN_PATH, train_slice), (TINY_VAL_PATH, val_slice)):
    with tiny_path.open("w", encoding="utf-8") as f:
        json.dump(tiny_payload, f, ensure_ascii=False, indent=2)

print(f"✓ Created tiny train ({len(train_slice)} samples) at {TINY_TRAIN_PATH}")
print(f"✓ Created tiny validation ({len(val_slice)} samples) at {TINY_VAL_PATH}")


In [None]:
# The tiny data config is derived from the base data config in the same folder.
TINY_CONFIG_PATH = CONFIG_DIR / "data" / "resume_tiny.yaml"
BASE_CONFIG_PATH = CONFIG_DIR / "data" / "resume_v1.yaml"

if BASE_CONFIG_PATH.exists():
    with BASE_CONFIG_PATH.open("r", encoding="utf-8") as f:
        base_cfg = yaml.safe_load(f)

    # An empty or non-mapping YAML file would otherwise fail below with an
    # opaque TypeError on item assignment.
    if not isinstance(base_cfg, dict):
        raise ValueError(f"Expected a YAML mapping in {BASE_CONFIG_PATH}, got {type(base_cfg).__name__}")

    # Override only the fields that distinguish the tiny dataset; every other
    # key is carried over from the base config unchanged.
    base_cfg["name"] = "resume-ner-data-tiny-short"
    base_cfg["version"] = "v2"
    base_cfg["local_path"] = str(TINY_DATA_DIR.relative_to(ROOT_DIR))
    base_cfg["description"] = "Tiny smoke-test subset of Resume NER dataset (short-text version for fast orchestration tests)"

    with TINY_CONFIG_PATH.open("w", encoding="utf-8") as f:
        yaml.safe_dump(base_cfg, f, sort_keys=False)

    print(f"✓ Updated tiny data config at {TINY_CONFIG_PATH}")
else:
    # Still best-effort (the notebook continues), but no longer silent.
    print(
        f"⚠️  Warning: base data config not found at {BASE_CONFIG_PATH}; "
        f"{TINY_CONFIG_PATH.name} was not written"
    )

# Later cells train against the tiny dataset directory.
DATASET_PATH = str(TINY_DATA_DIR)
print(f"Using dataset path: {DATASET_PATH}")


In [None]:
import subprocess

# Packages needed for local execution on top of the pip section of conda.yaml.
LOCAL_REQUIRED_PACKAGES = ["kagglehub", "optuna", "mlflow"]
CONDA_YAML_PATH = CONFIG_DIR / "environment" / "conda.yaml"

if not CONDA_YAML_PATH.exists():
    print("Warning: conda.yaml not found, skipping dependency installation")
else:
    with open(CONDA_YAML_PATH, "r", encoding="utf-8") as f:
        conda_config = yaml.safe_load(f) or {}

    # The pip requirements are the first dependency entry shaped like {"pip": [...]}.
    # Copy the list so extending it does not mutate the parsed YAML structure.
    pip_deps = []
    for dep in conda_config.get("dependencies") or []:
        if isinstance(dep, dict) and "pip" in dep:
            pip_deps = list(dep["pip"] or [])
            break

    pip_deps.extend(LOCAL_REQUIRED_PACKAGES)

    print("Installing dependencies...")
    failed_deps = []
    for dep in pip_deps:
        # Skip empty entries, non-string entries and commented-out requirements.
        if not isinstance(dep, str) or not dep or dep.startswith("#"):
            continue
        try:
            install_result = subprocess.run(
                [sys.executable, "-m", "pip", "install", dep],
                check=False, capture_output=True, text=True,
            )
        except Exception as e:
            # pip could not be launched at all.
            print(f"Warning: Could not install {dep}: {e}")
            failed_deps.append(dep)
            continue

        # With check=False a failing pip only shows up in the return code, so
        # inspect it explicitly instead of assuming success.
        if install_result.returncode != 0:
            stderr_lines = install_result.stderr.strip().splitlines()
            reason = stderr_lines[-1] if stderr_lines else f"exit code {install_result.returncode}"
            print(f"Warning: Could not install {dep}: {reason}")
            failed_deps.append(dep)

    if failed_deps:
        print(f"⚠️  Warning: {len(failed_deps)} dependencies failed to install: {failed_deps}")
    else:
        print("✓ Dependencies installed")


In [None]:
import torch

# Report the accelerator PyTorch can see; the CPU fallback is announced first.
if not torch.cuda.is_available():
    print("⚠️  No GPU available. Training will use CPU (will be slow).")
    if IN_COLAB:
        print("  In Colab, make sure Runtime > Change runtime type > Hardware accelerator = GPU")
else:
    gpu_name = torch.cuda.get_device_name(0)
    print(f"✓ GPU available: {gpu_name}")
    print(f"  CUDA version: {torch.version.cuda}")
    print(f"  GPU count: {torch.cuda.device_count()}")


In [None]:
import mlflow

# Keep MLflow runs in a local file store under the repository root.
MLFLOW_TRACKING_DIR = ROOT_DIR / "mlruns"
MLFLOW_TRACKING_DIR.mkdir(parents=True, exist_ok=True)

# Build the file URI with Path.as_uri() rather than f"file://{path}": string
# concatenation yields a malformed URI for Windows paths and leaves characters
# such as spaces unescaped. ROOT_DIR is resolved, so the path is absolute, which
# as_uri() requires.
mlflow.set_tracking_uri(MLFLOW_TRACKING_DIR.as_uri())
mlflow.set_experiment(experiment_config.name)

print(f"✓ MLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"✓ MLflow experiment: {experiment_config.name}")


## Step P1-3.4: The Dry Run

Run a minimal training job to validate the pipeline before launching HPO.


In [None]:
import subprocess
import sys
from pathlib import Path

# Dry-run artifacts are written under outputs/dry_run.
DRY_RUN_OUTPUT_DIR = ROOT_DIR / "outputs" / "dry_run"
DRY_RUN_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# The first backbone listed in the HPO search space is used for the dry run.
backbone = configs["hpo"]["search_space"]["backbone"]["values"][0]
print(f"Running dry run with backbone: {backbone}")

# Assemble the training command flag by flag.
train_script = ROOT_DIR / "src" / "train.py"
args = [sys.executable, str(train_script)]
args += ["--data-asset", DATASET_PATH]
args += ["--config-dir", str(CONFIG_DIR)]
args += ["--backbone", backbone]
args += ["--epochs", "1"]      # one epoch is enough to exercise the pipeline
args += ["--batch-size", "4"]  # small batch keeps the dry run light

# The output directory is passed through the environment.
# NOTE(review): presumably train.py reads AZURE_ML_OUTPUT_checkpoint — confirm.
env = {**os.environ, "AZURE_ML_OUTPUT_checkpoint": str(DRY_RUN_OUTPUT_DIR)}

print("Starting dry run training...")
print(f"Command: {' '.join(args)}")

result = subprocess.run(args, cwd=ROOT_DIR, env=env, capture_output=True, text=True)

# Fail fast, showing the captured output of the training process.
if result.returncode != 0:
    print(f"❌ Dry run failed with return code {result.returncode}")
    print(f"STDOUT:\n{result.stdout}")
    print(f"STDERR:\n{result.stderr}")
    raise RuntimeError("Dry run training failed")

print("✓ Dry run completed successfully")
print(f"Checkpoint saved to: {DRY_RUN_OUTPUT_DIR}")

# A missing checkpoint folder only produces a warning here, not an error.
checkpoint_dir = DRY_RUN_OUTPUT_DIR / "checkpoint"
if not checkpoint_dir.exists():
    print(f"⚠️  Warning: Checkpoint directory not found at {checkpoint_dir}")
else:
    print(f"✓ Checkpoint directory found: {checkpoint_dir}")


## Step P1-3.5: The Sweep (HPO) - Local with Optuna

Run hyperparameter optimization using Optuna for local execution.


In [None]:
from orchestration.jobs.local_sweeps import run_local_hpo_sweep
from orchestration.jobs.local_selection import select_best_configuration_across_studies
import optuna

# Sweep settings come from the HPO config; one study is run per backbone.
hpo_config = configs["hpo"]
backbone_values = hpo_config["search_space"]["backbone"]["values"]

# Each backbone gets its own sub-folder under outputs/hpo.
HPO_OUTPUT_DIR = ROOT_DIR / "outputs" / "hpo"
HPO_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# HPO runs are tracked in a separate MLflow experiment with an "_hpo" suffix.
mlflow_experiment_name = f"{experiment_config.name}_hpo"

print(f"Starting HPO sweep for backbones: {backbone_values}")
print(f"Max trials: {hpo_config['sampling']['max_trials']}")
print(f"Timeout: {hpo_config['sampling']['timeout_minutes']} minutes")

section_rule = "=" * 60

# Studies are collected by backbone name for the selection step.
hpo_studies = {}

for backbone in backbone_values:
    print(f"\n{section_rule}")
    print(f"Running HPO for backbone: {backbone}")
    print(section_rule)

    study = run_local_hpo_sweep(
        dataset_path=DATASET_PATH,
        config_dir=CONFIG_DIR,
        backbone=backbone,
        hpo_config=hpo_config,
        train_config=configs["train"],
        output_dir=HPO_OUTPUT_DIR / backbone,
        mlflow_experiment_name=mlflow_experiment_name,
    )
    hpo_studies[backbone] = study

    best_trial_number = study.best_trial.number
    print(f"\n✓ Completed HPO for {backbone}")
    print(f"  Best trial: {best_trial_number}")
    print(f"  Best value: {study.best_value:.4f}")
    print(f"  Best params: {study.best_params}")

print(f"\n{section_rule}")
print("HPO completed for all backbones")
print(section_rule)


## Step P1-3.6: Best Configuration Selection (Automated)

Select the best configuration across all HPO studies.


In [None]:
# Pick one winner across the per-backbone studies gathered by the sweep.
best_configuration = select_best_configuration_across_studies(
    studies=hpo_studies,
    hpo_config=hpo_config,
    dataset_version=configs["data"]["version"],
)

print("✓ Best configuration selected:")
best_backbone = best_configuration['backbone']
print(f"  Backbone: {best_backbone}")
best_metric_value = best_configuration['selection_criteria']['best_value']
print(f"  Best metric value: {best_metric_value:.4f}")
print(f"  Hyperparameters: {best_configuration['hyperparameters']}")

# Persist the selection as JSON next to the notebooks.
best_config_cache_file = ROOT_DIR / "notebooks" / "best_configuration_cache.json"
save_json(best_config_cache_file, best_configuration)
print(f"✓ Saved best configuration to {best_config_cache_file}")


In [None]:
# NOTE: This cell used to hold an out-of-order copy of the final-training command.
# It read `final_training_config` and `FINAL_TRAINING_OUTPUT_DIR` before the
# Step P1-3.7 cell below defines them, so a fresh "Restart Kernel -> Run All"
# stopped here with a NameError; on a kernel that still held those names from an
# earlier run it trained the final model a second time.
#
# Final training is launched exactly once, by the Step P1-3.7 cell below, which
# builds the config, runs src/train.py and sets FINAL_CHECKPOINT_PATH.
# This cell is intentionally left without code and can be deleted.


## Step P1-3.7: Final Training (Post-HPO, Single Run)

Train the final production model using the best configuration from HPO.


In [None]:
from orchestration.jobs.training import build_final_training_config

# Merge the winning HPO configuration with the base train config (fixed seed 42).
final_training_config = build_final_training_config(
    best_configuration,
    configs["train"],
    random_seed=42,
)

print("Final training configuration:")
for key, value in final_training_config.items():
    print(f"  {key}: {value}")

# Final-training artifacts are written under outputs/final_training.
FINAL_TRAINING_OUTPUT_DIR = ROOT_DIR / "outputs" / "final_training"
FINAL_TRAINING_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# CLI options as (flag, value) pairs; the boolean settings are passed lower-cased.
final_training_options = [
    ("--data-asset", DATASET_PATH),
    ("--config-dir", str(CONFIG_DIR)),
    ("--backbone", final_training_config["backbone"]),
    ("--learning-rate", str(final_training_config["learning_rate"])),
    ("--batch-size", str(final_training_config["batch_size"])),
    ("--dropout", str(final_training_config["dropout"])),
    ("--weight-decay", str(final_training_config["weight_decay"])),
    ("--epochs", str(final_training_config["epochs"])),
    ("--random-seed", str(final_training_config["random_seed"])),
    ("--early-stopping-enabled", str(final_training_config["early_stopping_enabled"]).lower()),
    ("--use-combined-data", str(final_training_config["use_combined_data"]).lower()),
]
args = [sys.executable, str(ROOT_DIR / "src" / "train.py")]
for option_flag, option_value in final_training_options:
    args.extend((option_flag, option_value))

# The output directory is passed through the environment.
# NOTE(review): presumably train.py reads AZURE_ML_OUTPUT_checkpoint — confirm.
env = {**os.environ, "AZURE_ML_OUTPUT_checkpoint": str(FINAL_TRAINING_OUTPUT_DIR)}

print("\nStarting final training...")
print(f"Command: {' '.join(args)}")

result = subprocess.run(args, cwd=ROOT_DIR, env=env, capture_output=True, text=True)

# Fail fast, showing the captured output of the training process.
if result.returncode != 0:
    print(f"❌ Final training failed with return code {result.returncode}")
    print(f"STDOUT:\n{result.stdout}")
    print(f"STDERR:\n{result.stderr}")
    raise RuntimeError("Final training failed")

print("✓ Final training completed successfully")
print(f"Checkpoint saved to: {FINAL_TRAINING_OUTPUT_DIR}")

# Locate the checkpoint: prefer the checkpoint/ sub-folder, otherwise accept the
# output directory itself when it directly contains pytorch_model.bin.
checkpoint_dir = FINAL_TRAINING_OUTPUT_DIR / "checkpoint"
if checkpoint_dir.exists():
    print(f"✓ Checkpoint directory found: {checkpoint_dir}")
    FINAL_CHECKPOINT_PATH = checkpoint_dir
elif (FINAL_TRAINING_OUTPUT_DIR / "pytorch_model.bin").exists():
    FINAL_CHECKPOINT_PATH = FINAL_TRAINING_OUTPUT_DIR
else:
    raise FileNotFoundError(f"Checkpoint not found in {FINAL_TRAINING_OUTPUT_DIR}")


## Step P1-4: Model Conversion & Optimization

Convert the final training checkpoint to an optimized ONNX model.


In [None]:
# Converted models are written under outputs/onnx_model.
CONVERSION_OUTPUT_DIR = ROOT_DIR / "outputs" / "onnx_model"
CONVERSION_OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Assemble the conversion command: value options first, then the two switches.
conversion_script = ROOT_DIR / "src" / "convert_to_onnx.py"
args = [sys.executable, str(conversion_script)]
args += ["--checkpoint-path", str(FINAL_CHECKPOINT_PATH)]
args += ["--config-dir", str(CONFIG_DIR)]
args += ["--backbone", best_configuration["backbone"]]
args += ["--output-dir", str(CONVERSION_OUTPUT_DIR)]
args += ["--quantize-int8", "--run-smoke-test"]

print("Starting model conversion...")
print(f"Checkpoint: {FINAL_CHECKPOINT_PATH}")
print(f"Output: {CONVERSION_OUTPUT_DIR}")
print(f"Command: {' '.join(args)}")

result = subprocess.run(args, cwd=ROOT_DIR, capture_output=True, text=True)

# Fail fast, showing the captured output of the conversion process.
if result.returncode != 0:
    print(f"❌ Conversion failed with return code {result.returncode}")
    print(f"STDOUT:\n{result.stdout}")
    print(f"STDERR:\n{result.stderr}")
    raise RuntimeError("Model conversion failed")

print("✓ Model conversion completed successfully")

# Prefer the INT8 model and fall back to the plain export; if neither exists,
# the plain path is kept so the warning branch below is taken.
onnx_candidates = (
    CONVERSION_OUTPUT_DIR / "model_int8.onnx",
    CONVERSION_OUTPUT_DIR / "model.onnx",
)
onnx_model_path = next(
    (candidate for candidate in onnx_candidates if candidate.exists()),
    onnx_candidates[-1],
)

if not onnx_model_path.exists():
    print(f"⚠️  Warning: ONNX model not found in {CONVERSION_OUTPUT_DIR}")
else:
    onnx_size_mb = onnx_model_path.stat().st_size / (1024*1024)
    print(f"✓ ONNX model saved to: {onnx_model_path}")
    print(f"  Model size: {onnx_size_mb:.2f} MB")

## Summary

If every cell above ran without errors, the trained model has been converted to ONNX format and is ready for deployment.

### Outputs:
- **Final checkpoint**: `outputs/final_training/checkpoint/` (or `outputs/final_training/` itself when `pytorch_model.bin` is written directly there)
- **ONNX model**: `outputs/onnx_model/model_int8.onnx` (or `model.onnx`)
- **MLflow tracking**: `mlruns/` directory

### Next Steps:
- Review MLflow metrics and training logs
- Test the ONNX model with sample inputs
- Deploy the model for inference (Phase 2)
