In [3]:
#!/usr/bin/env python3
"""
analyze_results.py

Fetch all runs for the 'CitiBike_Forecasting' experiment,
extract MAE for each, compute improvement over the baseline,
and register the best model in MLflow Model Registry.
"""

import mlflow
from mlflow.exceptions import MlflowException
import pandas as pd

# Module-level configuration for the analysis:
#   EXPERIMENT_NAME - MLflow experiment whose runs main() fetches.
#   BASELINE_RUN    - run name (compared against the "mlflow.runName" tag) whose
#                     MAE is the reference for the improvement percentage.
#   MODEL_REG_NAME  - registered-model name under which main() creates a new
#                     model version for the best run.
EXPERIMENT_NAME = "CitiBike_Forecasting"
BASELINE_RUN    = "baseline_mean"      # runName for your baseline script
MODEL_REG_NAME  = "CitiBike_BestModel" # desired name in the Model Registry

def fetch_run_metrics(experiment_name: str) -> pd.DataFrame:
    """Collect the MAE of every run in an MLflow experiment.

    Args:
        experiment_name: Name of the MLflow experiment to look up.

    Returns:
        A DataFrame with one row per run that logged an ``mae`` metric,
        ordered by run start time (oldest first), with the columns:

        * ``run_id``    - the MLflow run id
        * ``model``     - the ``mlflow.runName`` tag, or the run id if unset
        * ``mae``       - the logged ``mae`` metric value
        * ``model_uri`` - ``runs:/<run_id>/model``

        The columns are present even when no run qualifies, so callers can
        safely index them on an empty result.

    Raises:
        ValueError: If no experiment with the given name exists.
    """
    columns = ["run_id", "model", "mae", "model_uri"]

    client = mlflow.tracking.MlflowClient()
    exp = client.get_experiment_by_name(experiment_name)
    if exp is None:
        raise ValueError(f"Experiment '{experiment_name}' not found")

    # search_runs returns one page at a time; follow the page token so that
    # experiments with more runs than a single page holds are not silently
    # truncated (with ASC ordering the newest runs would be the ones lost).
    runs = []
    page_token = None
    while True:
        page = client.search_runs(
            [exp.experiment_id],
            order_by=["start_time ASC"],
            page_token=page_token,
        )
        runs.extend(page)
        page_token = getattr(page, "token", None)
        if not page_token:
            break

    # Runs without an "mae" metric are skipped entirely.
    # NOTE(review): the URI assumes each run logged its model under the
    # artifact path "model" - confirm against the training scripts.
    records = [
        {
            "run_id": run.info.run_id,
            "model": run.data.tags.get("mlflow.runName", run.info.run_id),
            "mae": run.data.metrics["mae"],
            "model_uri": f"runs:/{run.info.run_id}/model",
        }
        for run in runs
        if run.data.metrics.get("mae") is not None
    ]

    # Passing `columns` keeps the schema stable when `records` is empty.
    return pd.DataFrame(records, columns=columns)

def main():
    """Summarise run MAEs against the baseline and register the best run.

    Steps:
      1. Fetch per-run MAE values for ``EXPERIMENT_NAME``.
      2. Compute each run's percentage improvement over the first run named
         ``BASELINE_RUN``.
      3. Print a summary table with the baseline row(s) first.
      4. Create a new version of ``MODEL_REG_NAME`` in the MLflow Model
         Registry from the run with the lowest MAE, creating the registered
         model first if it does not exist yet.

    Raises:
        ValueError: If the experiment has no runs with an ``mae`` metric, or
            if no run is named ``BASELINE_RUN``.
        MlflowException: If the registry lookup fails for any reason other
            than the registered model not existing, or if registration fails.
    """
    df = fetch_run_metrics(EXPERIMENT_NAME)

    # An empty result may come back without any columns at all; fail with a
    # clear message instead of a KeyError on df["model"].
    if df.empty or "model" not in df.columns:
        raise ValueError(
            f"No runs with an 'mae' metric found in experiment '{EXPERIMENT_NAME}'"
        )

    if BASELINE_RUN not in df["model"].values:
        raise ValueError(f"Baseline run '{BASELINE_RUN}' not found among models: {df['model'].tolist()}")

    # Compute improvement % relative to the first baseline run encountered.
    is_baseline = df["model"] == BASELINE_RUN
    baseline_mae = df.loc[is_baseline, "mae"].iloc[0]
    df["improvement_pct"] = (baseline_mae - df["mae"]) / baseline_mae * 100

    # Reorder so baseline appears first. Boolean masks (rather than a
    # label-based .loc lookup on the run name) keep exactly one row per run
    # even when several runs share the same name.
    df = pd.concat([df[is_baseline], df[~is_baseline]], ignore_index=True)

    # Print summary table
    print("\nModel MAE and Improvement Over Baseline\n" + "-"*50)
    print(df[["model", "mae", "improvement_pct"]].to_string(index=False, float_format="%.2f"))

    # Identify best model (lowest MAE; the first such row wins on ties)
    best = df.loc[df["mae"].idxmin()]
    print(f"\nBest model: {best['model']} (run {best['run_id']})")
    print(f"  MAE = {best['mae']:.2f}")
    print(f"  Improvement = {best['improvement_pct']:.2f}% over baseline")

    # Register in Model Registry if missing, then add a new version
    client = mlflow.tracking.MlflowClient()
    try:
        client.get_registered_model(MODEL_REG_NAME)
    except MlflowException as exc:
        # Only a genuine "not found" should trigger creation; auth, network
        # or server errors must surface instead of being masked.
        if getattr(exc, "error_code", None) != "RESOURCE_DOES_NOT_EXIST":
            raise
        client.create_registered_model(MODEL_REG_NAME)

    model_version = client.create_model_version(
        name=MODEL_REG_NAME,
        source=best["model_uri"],
        run_id=best["run_id"]
    )
    print(f"\nRegistered '{MODEL_REG_NAME}' as version {model_version.version}")

# Run the analysis only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()


Model MAE and Improvement Over Baseline
--------------------------------------------------
         model   mae  improvement_pct
 baseline_mean 31.20             0.00
    lgbm_28lag  8.22            73.66
lgbm_top10_imp  8.33            73.28

Best model: lgbm_28lag (run 46db17e7e9ba4389b5a20772abd2d767)
  MAE = 8.22
  Improvement = 73.66% over baseline

Registered 'CitiBike_BestModel' as version 1
