# MLflow Structure

In [1]:
import os, json, math, tempfile
from dataclasses import dataclass
from typing import Dict, List, Iterator, Tuple, Any

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Ridge, ElasticNet

import mlflow

import statsmodels.api as sm



In [2]:
import os
# import mlflow

TRACK_DIR = os.path.abspath("./mlruns_rga_test")
mlflow.set_tracking_uri(f"file:///{TRACK_DIR.replace(os.sep, '/')}") # for local testing. not needed for Databricks


In [3]:
import pandas as pd
import numpy as np

def make_tiny_df():
    weeks = pd.date_range("2023-01-01", "2025-06-30", freq="W-SUN")
    n = len(weeks)

    rng = np.random.default_rng(42)

    df = pd.DataFrame({
        "week_start": weeks,
        # target
        "log_GC": rng.normal(0, 1, n),
    })

    # promo + media features
    for c in ['digital_promo_1','digital_promo_2','digital_promo_3','digital_promo_4','digital_promo_5',
              'media_1','media_2','media_3','media_4','media_5']:
        df[c] = rng.normal(0, 1, n)

    return df

df = make_tiny_df()


1) Configuration objects (easy to extend)

In [5]:
@dataclass(frozen=True)
class TimeSplit:
    time_split_id: str
    train_start: str
    train_end: str
    test_start: str
    test_end: str

@dataclass(frozen=True)
class RunConfig:
    # "fixed" knobs you said exist
    target_node: str              # "GC" or "AC"
    panel_control: str            # "FE" / "Mundlak" / "Bayesian" (placeholder for now)
    algorithm: str                # "OLS" / "Ridge" / "ElasticNet"
    alpha: float = 0.0
    l1_ratio: float = 0.0

    # experiment dimensions
    feature_block_set_id: str = ""
    features: Tuple[str, ...] = tuple()
    time_split: TimeSplit = None
    seed: int = 42

2) MLflow setup (experiment + common tags)

Databricks tip: use a workspace path like "/Users/<you>/RGA/Regressions".

In [6]:
# def setup_mlflow(experiment_name: str, common_tags: Dict[str, str]) -> None:
#     mlflow.set_experiment(experiment_name)
#     mlflow.set_tags(common_tags)

def setup_mlflow(experiment_name: str) -> None:
    mlflow.set_experiment(experiment_name)


3) Search strategy component (swap later)

This yields a stream of RunConfig objects. Keep it dumb/simple now.

In [7]:
def generate_feature_sets(feature_blocks: Dict[str, List[str]]) -> Dict[str, Tuple[str, ...]]:
    """
    Example: feature_blocks = {"promo":[...], "media":[...], "ops":[...]}
    Return a dict of feature_block_set_id -> tuple(features)
    """
    # Minimal example: each single block + all blocks
    out = {}
    for block, feats in feature_blocks.items():
        out[f"block__{block}"] = tuple(feats)
    all_feats = tuple(sorted({f for feats in feature_blocks.values() for f in feats}))
    out["block__ALL"] = all_feats
    return out

def search_space(
    target_node: str,
    panel_controls: List[str],
    algorithms: List[Dict[str, Any]],
    time_splits: List[TimeSplit],
    feature_sets: Dict[str, Tuple[str, ...]],
    seed: int = 42,
) -> Iterator[RunConfig]:
    for pc in panel_controls:
        for algo in algorithms:
            for ts in time_splits:
                for fs_id, feats in feature_sets.items():
                    yield RunConfig(
                        target_node=target_node,
                        panel_control=pc,
                        algorithm=algo["name"],
                        alpha=float(algo.get("alpha", 0.0)),
                        l1_ratio=float(algo.get("l1_ratio", 0.0)),
                        feature_block_set_id=fs_id,
                        features=feats,
                        time_split=ts,
                        seed=seed,
                    )


4) Model factory component (swap later)

This is where FE/Mundlak/Bayesian wrappers will go. For now it’s vanilla sklearn.

In [8]:
def make_model(cfg: RunConfig):
    if cfg.algorithm == "OLS":
        return LinearRegression()
    if cfg.algorithm == "Ridge":
        return Ridge(alpha=cfg.alpha, random_state=cfg.seed)
    if cfg.algorithm == "ElasticNet":
        return ElasticNet(alpha=cfg.alpha, l1_ratio=cfg.l1_ratio, random_state=cfg.seed, max_iter=10000)
    raise ValueError(f"Unknown algorithm: {cfg.algorithm}")


In [9]:
def fit_and_get_coef_table(cfg: RunConfig, X_train: pd.DataFrame, y_train: pd.Series):
    """
    Returns: model_object, coef_df
    coef_df columns: feature, coef, sign, abs_coef, rank_abscoef, t_stat, p_value
    """
    feats = list(X_train.columns)

    if cfg.algorithm == "OLS":
        # statsmodels gives p-values
        X_sm = sm.add_constant(X_train, has_constant="add")
        model = sm.OLS(y_train, X_sm).fit()

        coef = model.params.drop("const", errors="ignore")
        tstat = model.tvalues.drop("const", errors="ignore")
        pval = model.pvalues.drop("const", errors="ignore")

        df = pd.DataFrame({
            "feature": coef.index,
            "coef": coef.values,
            "t_stat": tstat.reindex(coef.index).values,
            "p_value": pval.reindex(coef.index).values,
        })

    else:
        # sklearn (no p-values)
        model = make_model(cfg)
        model.fit(X_train, y_train)

        coefs = np.asarray(model.coef_).ravel()
        df = pd.DataFrame({
            "feature": feats,
            "coef": coefs,
            "t_stat": np.nan,
            "p_value": np.nan,
        })

    df["abs_coef"] = df["coef"].abs()
    df["sign"] = np.sign(df["coef"]).astype(int)
    df = df.sort_values("abs_coef", ascending=False).reset_index(drop=True)
    df["rank_abscoef"] = np.arange(1, len(df) + 1)

    # simple significance flag (customize threshold later)
    df["is_significant_05"] = (df["p_value"] < 0.05)

    return model, df


5) Data prep component (you will replace pieces later)

Assumes:

df_pd includes store_id, week_start, plus feature columns

target columns exist, e.g. log_GC, log_AC (or you can compute)

In [10]:
def slice_by_time(df: pd.DataFrame, ts: TimeSplit) -> Tuple[pd.DataFrame, pd.DataFrame]:
    df = df.copy()
    df["week_start"] = pd.to_datetime(df["week_start"])
    train = df[(df["week_start"] >= ts.train_start) & (df["week_start"] <= ts.train_end)]
    test  = df[(df["week_start"] >= ts.test_start)  & (df["week_start"] <= ts.test_end)]
    return train, test

def get_target_col(target_node: str) -> str:
    # you can change this mapping anytime
    if target_node == "GC":
        return "log_GC"
    if target_node == "AC":
        return "log_AC"
    raise ValueError("target_node must be 'GC' or 'AC'")


6) Logging policy component (what to log)

Keep parameters as params; use tags for “indexing / grouping” fields you’ll filter on later.

In [12]:
def log_run_inputs(cfg: RunConfig) -> None:
    # Params (queryable, shown in UI)
    mlflow.log_params({
        "target_node": cfg.target_node,
        "panel_control": cfg.panel_control,
        "algorithm": cfg.algorithm,
        "alpha": cfg.alpha,
        "l1_ratio": cfg.l1_ratio,
        "feature_block_set_id": cfg.feature_block_set_id,
        "n_features": len(cfg.features),
        "time_split_id": cfg.time_split.time_split_id,
        "train_start": cfg.time_split.train_start,
        "train_end": cfg.time_split.train_end,
        "test_start": cfg.time_split.test_start,
        "test_end": cfg.time_split.test_end,
        "seed": cfg.seed,
    })

def log_metrics(prefix: str, y_true, y_pred) -> None:
    rmse = math.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    mlflow.log_metrics({
        f"{prefix}_rmse": rmse,
        f"{prefix}_mae": mae,
        f"{prefix}_r2": r2,
    })


7) Fit + log artifacts (coeff table, rank table, config snapshot)

In [13]:
def coef_table(model, feature_names: List[str]) -> pd.DataFrame:
    # Works for linear models that have coef_
    coefs = np.asarray(model.coef_).ravel()
    df = pd.DataFrame({"feature": feature_names, "coef": coefs})
    df["abs_coef"] = df["coef"].abs()
    df["sign"] = np.sign(df["coef"]).astype(int)
    df = df.sort_values("abs_coef", ascending=False).reset_index(drop=True)
    df["rank_abscoef"] = np.arange(1, len(df) + 1)
    return df

def log_dataframe_as_csv(df: pd.DataFrame, artifact_path: str, filename: str) -> None:
    with tempfile.TemporaryDirectory() as tmpdir:
        fpath = os.path.join(tmpdir, filename)
        df.to_csv(fpath, index=False)
        mlflow.log_artifact(fpath, artifact_path=artifact_path)

def log_config_snapshot(cfg: RunConfig) -> None:
    payload = {
        "target_node": cfg.target_node,
        "panel_control": cfg.panel_control,
        "algorithm": cfg.algorithm,
        "alpha": cfg.alpha,
        "l1_ratio": cfg.l1_ratio,
        "feature_block_set_id": cfg.feature_block_set_id,
        "features": list(cfg.features),
        "time_split": cfg.time_split.__dict__,
        "seed": cfg.seed,
    }
    mlflow.log_dict(payload, artifact_file="run_config.json")


8) Single run execution (with failure handling)

In [None]:
# def run_one_experiment(df_pd: pd.DataFrame, cfg: RunConfig) -> None:
#     run_name = f"{cfg.target_node}__{cfg.panel_control}__{cfg.algorithm}__{cfg.feature_block_set_id}__{cfg.time_split.time_split_id}"

#     with mlflow.start_run(run_name=run_name, nested=True):
#         try:
#             log_run_inputs(cfg)
#             log_config_snapshot(cfg)

#             # Data slicing
#             train_df, test_df = slice_by_time(df_pd, cfg.time_split)
#             ycol = get_target_col(cfg.target_node)

#             X_train = train_df.loc[:, list(cfg.features)]
#             y_train = train_df[ycol]
#             X_test  = test_df.loc[:, list(cfg.features)]
#             y_test  = test_df[ycol]

#             # Fit
#             model = make_model(cfg)
#             model.fit(X_train, y_train)

#             # Predict + metrics
#             pred_train = model.predict(X_train)
#             pred_test  = model.predict(X_test)
#             log_metrics("train", y_train, pred_train)
#             log_metrics("test", y_test, pred_test)

#             # Artifacts: coefficients + ranks
#             cdf = coef_table(model, list(cfg.features))
#             log_dataframe_as_csv(cdf, artifact_path="artifacts", filename="coefficients.csv")

#             # Optional: log model (safe to keep optional if volume is huge)
#             # mlflow.sklearn.log_model(model, artifact_path="model")

#         except Exception as e:
#             mlflow.set_tag("run_status", "failed")
#             mlflow.log_text(str(e), "error.txt")
#             raise
#         else:
#             mlflow.set_tag("run_status", "ok")


In [None]:
# print("Active run before:", mlflow.active_run())
# print("Active run entering child:", mlflow.active_run())

Active run before: <ActiveRun: >
Active run entering child: <ActiveRun: >


In [38]:
def run_one_experiment(df_pd: pd.DataFrame, cfg: RunConfig) -> None:
    """
    Execute ONE fully-specified trial and log everything needed for later stability aggregation.

    Logs (per run):
      - params: target_node, panel_control, algorithm, hyperparams, feature_set_id, n_features, time split, etc.
      - metrics: train/test r2/rmse/mae
      - artifacts: coefficients.csv (coef/sign/rank + p-values for OLS), run_config.json
      - tags: run_status
    """
    run_name = (
        f"{cfg.target_node}__{cfg.panel_control}__{cfg.algorithm}"
        f"__{cfg.feature_block_set_id}__{cfg.time_split.time_split_id}"
    )

    with mlflow.start_run(run_name=run_name, nested=True):
        mlflow.set_tag("run_type", "trial")
        try:
            # --- 1) Log inputs (params + config snapshot) ---
            log_run_inputs(cfg)
            log_config_snapshot(cfg)

            # --- 2) Slice data ---
            train_df, test_df = slice_by_time(df_pd, cfg.time_split)
            ycol = get_target_col(cfg.target_node)

            # Basic guards (kept simple)
            if len(cfg.features) == 0:
                raise ValueError("cfg.features is empty.")
            missing_cols = [c for c in cfg.features + (ycol,) if c not in train_df.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in df_pd: {missing_cols}")

            X_train = train_df.loc[:, list(cfg.features)]
            y_train = train_df[ycol]
            X_test = test_df.loc[:, list(cfg.features)]
            y_test = test_df[ycol]

            # --- 3) Fit + build coefficients table (with p-values only for OLS) ---
            model, cdf = fit_and_get_coef_table(cfg, X_train, y_train)

            # Add run context directly into the coefficient table (helps aggregation later)
            cdf["target_node"] = cfg.target_node
            cdf["panel_control"] = cfg.panel_control
            cdf["algorithm"] = cfg.algorithm
            cdf["feature_block_set_id"] = cfg.feature_block_set_id
            cdf["time_split_id"] = cfg.time_split.time_split_id

            # --- 4) Predict (handle statsmodels vs sklearn) ---
            if cfg.algorithm == "OLS":
                pred_train = model.predict(sm.add_constant(X_train, has_constant="add"))
                pred_test = model.predict(sm.add_constant(X_test, has_constant="add"))
            else:
                pred_train = model.predict(X_train)
                pred_test = model.predict(X_test)

            # --- 5) Log metrics ---
            log_metrics("train", y_train, pred_train)
            log_metrics("test", y_test, pred_test)

            # Optional: log a couple simple summary metrics from coefficient table
            # (e.g. number significant; works for OLS only)
            if cdf["p_value"].notna().any():
                mlflow.log_metric("n_significant_05", float((cdf["p_value"] < 0.05).sum()))
                mlflow.log_metric("pct_significant_05", float((cdf["p_value"] < 0.05).mean()))
            else:
                mlflow.log_metric("n_significant_05", np.nan)
                mlflow.log_metric("pct_significant_05", np.nan)

            # For ElasticNet, you may care about selection rate (non-zero)
            if cfg.algorithm == "ElasticNet":
                mlflow.log_metric("n_nonzero_coef", float((cdf["coef"].abs() > 1e-12).sum()))
                mlflow.log_metric("pct_nonzero_coef", float((cdf["coef"].abs() > 1e-12).mean()))

            # --- 6) Log artifacts (system of record for per-feature consolidation) ---
            log_dataframe_as_csv(cdf, artifact_path="artifacts", filename="coefficients.csv")

        except Exception as e:
            mlflow.set_tag("run_status", "failed")
            mlflow.log_text(str(e), "error.txt")
            raise
        else:
            mlflow.set_tag("run_status", "ok")

Orchestrator (parent run + many child runs)

This matches the MLflow tutorial pattern (parent run contains the “study”, child runs contain each trial).

In [None]:
# def run_study(
#     df_pd: pd.DataFrame,
#     experiment_name: str,
#     study_name: str,
#     common_tags: Dict[str, str],
#     configs: Iterator[RunConfig],
#     max_runs: int = None,
# ) -> str:
#     setup_mlflow(experiment_name, common_tags)

#     with mlflow.start_run(run_name=study_name) as parent:
#         mlflow.set_tag("run_type", "study")
#         mlflow.log_param("study_name", study_name)

#         n = 0
#         for cfg in configs:
#             if max_runs is not None and n >= max_runs:
#                 break
#             run_one_experiment(df_pd, cfg)
#             n += 1

#         mlflow.log_param("n_child_runs", n)
#         return parent.info.run_id


In [None]:
def run_study(
    df_pd: pd.DataFrame,
    experiment_name: str,
    study_name: str,
    common_tags: Dict[str, str],
    configs: Iterator[RunConfig],
    max_runs: int = None,
) -> str:
    # extra safety for notebooks: close anything dangling
    while mlflow.active_run() is not None:
        mlflow.end_run()

    setup_mlflow(experiment_name)

    # start parent run FIRST, then set tags
    with mlflow.start_run(run_name=study_name) as parent:
        mlflow.set_tags(common_tags)          # ✅ now safe
        mlflow.set_tag("run_type", "study")
        mlflow.log_param("study_name", study_name)

        n = 0
        for cfg in configs:
            if max_runs is not None and n >= max_runs:
                break
            run_one_experiment(df_pd, cfg)     # child run MUST be nested=True
            n += 1

        mlflow.log_param("n_child_runs", n)
        return parent.info.run_id


10) Retrieve results + feature stability summary (simple version)

This is the “later I’ll decide which features to keep” part — implemented minimally using MLflow search + the logged coefficient artifacts.

In [16]:
def load_coefficients_for_run(run_id: str) -> pd.DataFrame:
    # Download the artifact and read it
    local_dir = mlflow.artifacts.download_artifacts(run_id=run_id, artifact_path="artifacts/coefficients.csv")
    return pd.read_csv(local_dir)

def aggregate_feature_stability(
    experiment_name: str,
    filter_query: str = "tags.run_status = 'ok'",
) -> pd.DataFrame:
    exp = mlflow.get_experiment_by_name(experiment_name)
    runs = mlflow.search_runs(experiment_ids=[exp.experiment_id], filter_string=filter_query)

    rows = []
    for _, r in runs.iterrows():
        run_id = r["run_id"]
        try:
            cdf = load_coefficients_for_run(run_id)
            cdf["run_id"] = run_id
            rows.append(cdf[["run_id", "feature", "coef", "sign", "abs_coef", "rank_abscoef"]])
        except Exception:
            # If some run didn't log artifacts, skip (or handle stricter)
            continue

    if not rows:
        return pd.DataFrame()

    allc = pd.concat(rows, ignore_index=True)

    # Stability stats you described (simple baseline)
    g = allc.groupby("feature")
    out = pd.DataFrame({
        "n_runs_appeared": g["run_id"].nunique(),
        "mean_coef": g["coef"].mean(),
        "median_coef": g["coef"].median(),
        "std_coef": g["coef"].std(ddof=1),
        "mean_abscoef": g["abs_coef"].mean(),
        "mean_rank": g["rank_abscoef"].mean(),
        "pct_positive": g["sign"].apply(lambda s: (s > 0).mean()),
        "pct_negative": g["sign"].apply(lambda s: (s < 0).mean()),
    }).reset_index()

    out["coef_cv"] = out["std_coef"] / out["mean_coef"].replace(0, np.nan)
    out = out.sort_values(["n_runs_appeared", "mean_abscoef"], ascending=[False, False]).reset_index(drop=True)
    return out


11) Example usage (plug in your df_pd)

In [16]:
df

Unnamed: 0,week_start,log_GC,digital_promo_1,digital_promo_2,digital_promo_3,digital_promo_4,digital_promo_5,media_1,media_2,media_3,media_4,media_5
0,2023-01-01,0.304717,-1.376686,0.232170,0.459386,1.403821,0.319400,0.044212,0.529413,-0.955625,0.417472,-1.180001
1,2023-01-08,-1.039984,0.635151,-0.555327,0.701954,-0.442536,-0.869047,-0.202914,1.363429,0.437512,-1.320489,0.804570
2,2023-01-15,0.750451,-0.222223,0.471539,0.138241,1.455046,0.177396,-1.082427,-1.880798,-1.241756,0.854686,-0.675114
3,2023-01-22,0.940565,-1.470806,1.012716,0.760133,0.131486,1.212519,-0.151052,-0.317907,-0.204069,-0.800212,0.403954
4,2023-01-29,-1.951035,-1.015579,0.155429,0.229211,0.258229,-0.323792,-0.746098,-0.867005,0.109648,0.632858,0.565460
...,...,...,...,...,...,...,...,...,...,...,...,...
126,2025-06-01,1.463303,-0.376156,0.276274,1.628937,1.847825,-0.079730,0.555582,0.101926,-0.084851,-2.480709,-0.439988
127,2025-06-08,-1.188763,-0.133823,-1.412766,-0.970150,-0.174173,1.797561,-0.622168,-0.762323,-1.600206,-0.996419,-2.955619
128,2025-06-15,-0.639752,-1.374896,-2.310103,-0.887696,1.667888,0.894213,0.987405,-0.859206,-0.761974,1.232902,-1.247317
129,2025-06-22,-0.926576,-0.238174,0.054354,1.335784,-1.103741,0.011445,1.157508,-0.537663,0.148627,-2.777994,1.120841


In [40]:
# import mlflow
# while mlflow.active_run() is not None:
#     mlflow.end_run()
# print("Active run now:", mlflow.active_run())


local testing

In [41]:
feature_blocks = {
    "promo": ['digital_promo_1','digital_promo_2','digital_promo_3','digital_promo_4','digital_promo_5'],
    "media": ['media_1','media_2','media_3','media_4','media_5'],
}

time_splits = [
    TimeSplit("ts1", "2023-01-01", "2024-06-30", "2024-07-01", "2024-12-31"),
]

algorithms = [
    {"name": "OLS"},   # simplest to start (and gives p-values if you used statsmodels in OLS)
]

feature_sets = generate_feature_sets(feature_blocks)

configs_gc = search_space(
    target_node="GC",
    panel_controls=["FE"],     # keep one for smoke test
    algorithms=algorithms,
    time_splits=time_splits,
    feature_sets={"block__promo": feature_sets["block__promo"]},  # one feature set only
    seed=42
)


In [42]:
print("Active run before:", mlflow.active_run())

parent_run_id = run_study(
    df_pd=df,
    experiment_name="RGA_Regression_Local",
    study_name="GC_smoketest_v001",
    common_tags={
        "project": "RevenueGrowthAnalytics",
        "layer": "2",
        "framework": "regression_shell",
        "env": "local",
    },
    configs=configs_gc,
    max_runs=2,   # <-- smallest smoke test
)

stability = aggregate_feature_stability(
    experiment_name="RGA_Regression_Local",
    filter_query="tags.run_status = 'ok' and params.target_node = 'GC'"
)

print(stability.head(30))


Active run before: None
           feature  n_runs_appeared  mean_coef  median_coef  std_coef  \
0  digital_promo_1                2  -0.184221    -0.184221       0.0   
1  digital_promo_5                2  -0.107128    -0.107128       0.0   
2  digital_promo_4                2  -0.052290    -0.052290       0.0   
3  digital_promo_3                2  -0.023930    -0.023930       0.0   
4  digital_promo_2                2   0.011568     0.011568       0.0   

   mean_abscoef  mean_rank  pct_positive  pct_negative  coef_cv  
0      0.184221        1.0           0.0           1.0     -0.0  
1      0.107128        2.0           0.0           1.0     -0.0  
2      0.052290        3.0           0.0           1.0     -0.0  
3      0.023930        4.0           0.0           1.0     -0.0  
4      0.011568        5.0           1.0           0.0      0.0  


In [None]:
# aggregate_feature_stability() slice_by_time(), fit_and_get_coef_table(), and generate_feature_sets()

complex version

In [None]:
# Example component inputs (you will replace these)
# feature_blocks = {
#     "promo": ["promo_depth", "lto_flag", "discount_idx", "bundle_idx", "coupon_rate"],
#     "media": ["tv_grps", "digital_imps", "search_spend", "social_spend", "ooh_spend"],
# }

feature_blocks = {
    "promo": ['digital_promo_1', 'digital_promo_2', 'digital_promo_3',
       'digital_promo_4', 'digital_promo_5'],
    "media": ['media_1', 'media_2', 'media_3', 'media_4',
       'media_5'],
}

time_splits = [
    TimeSplit("ts1", "2023-01-01", "2024-06-30", "2024-07-01", "2024-12-31"),
    TimeSplit("ts2", "2023-07-01", "2024-12-31", "2025-01-01", "2025-06-30"),
]

algorithms = [
    {"name": "OLS"},
    {"name": "Ridge", "alpha": 1.0},
    {"name": "ElasticNet", "alpha": 0.1, "l1_ratio": 0.5},
]

feature_sets = generate_feature_sets(feature_blocks)

configs_gc = search_space(
    target_node="GC",
    panel_controls=["FE", "Mundlak"],      # Bayesian later
    algorithms=algorithms,
    time_splits=time_splits,
    feature_sets=feature_sets,
    seed=42
)

parent_run_id = run_study(
    df_pd=df,
    # experiment_name="/Users/your.name@company.com/RGA_Regression",
    experiment_name="RGA_Regression_Local",
    study_name="GC_study_v001",
    common_tags={
        "project": "RevenueGrowthAnalytics",
        "layer": "2",
        "framework": "regression_shell",
    },
    configs=configs_gc,
    max_runs=50,   # remove later
)

# Aggregate stability
stability = aggregate_feature_stability(
    experiment_name="/Users/your.name@company.com/RGA_Regression",
    filter_query="tags.run_status = 'ok' and params.target_node = 'GC'"
)

display(stability.head(30))


In [26]:
# import mlflow
# from mlflow.tracking import MlflowClient

# print("Tracking URI:", mlflow.get_tracking_uri())
# print("Active run:", mlflow.active_run())

# client = MlflowClient()
# exps = client.search_experiments()
# print("Experiments found:", [e.name for e in exps][:10])

# print(TRACK_DIR)

# import os, mlflow
# TRACK_DIR = os.path.abspath("./mlruns_rga_test")
# mlflow.set_tracking_uri("file:///" + TRACK_DIR.replace("\\", "/"))
# print("Tracking URI set to:", mlflow.get_tracking_uri())

In [27]:
# from mlflow.tracking import MlflowClient
# client = MlflowClient()

# exp = client.get_experiment_by_name("RGA_Regression_Local")
# print("Experiment:", exp)

# runs = client.search_runs([exp.experiment_id], max_results=5)
# print("Found runs:", len(runs))
# print([r.data.tags.get("mlflow.runName") for r in runs])


In [28]:
# print(mlflow.get_tracking_uri())

In [29]:
from mlflow.tracking import MlflowClient
client = MlflowClient()

run = client.get_run("15a6649b2b244ec4b624b0d7bfc0afc2")
print(run.data.metrics)


{}


In [30]:
X_train.shape[0] == 0
X_test.shape[0] == 0


NameError: name 'X_train' is not defined

Final model

In [None]:
from dataclasses import dataclass
from typing import List, Dict, Any
import mlflow
import statsmodels.api as sm

@dataclass(frozen=True)
class FinalSpec:
    name: str                  # e.g., "GC_final_v001"
    target_node: str           # "GC" / "AC"
    panel_control: str         # "FE" / "Mundlak" / "Bayesian"
    algorithm: str             # "OLS" / "Ridge" / "ElasticNet"
    alpha: float = 0.0
    l1_ratio: float = 0.0
    features: List[str] = None
    train_start: str = None
    train_end: str = None

def train_final_and_log_model(df_pd, spec: FinalSpec, experiment_name: str, tags: Dict[str,str]):
    mlflow.set_experiment(experiment_name)

    run_name = spec.name
    with mlflow.start_run(run_name=run_name):
        # Tag this as a promoted/final run
        mlflow.set_tags({**tags, "run_stage": "final", "is_champion": "true"})

        # Params (full snapshot)
        mlflow.log_params({
            "target_node": spec.target_node,
            "panel_control": spec.panel_control,
            "algorithm": spec.algorithm,
            "alpha": spec.alpha,
            "l1_ratio": spec.l1_ratio,
            "n_features": len(spec.features),
            "train_start": spec.train_start,
            "train_end": spec.train_end,
        })

        # Train slice
        df = df_pd.copy()
        df["week_start"] = pd.to_datetime(df["week_start"])
        train_df = df[(df["week_start"] >= spec.train_start) & (df["week_start"] <= spec.train_end)]

        ycol = "log_GC" if spec.target_node == "GC" else "log_AC"
        X_train = train_df[spec.features]
        y_train = train_df[ycol]

        # Fit + coeff artifact (reuse your function)
        cfg = RunConfig(
            target_node=spec.target_node,
            panel_control=spec.panel_control,
            algorithm=spec.algorithm,
            alpha=spec.alpha,
            l1_ratio=spec.l1_ratio,
            feature_block_set_id="FINAL",
            features=tuple(spec.features),
            time_split=TimeSplit("FINAL", spec.train_start, spec.train_end, spec.train_start, spec.train_end),
            seed=42,
        )

        model, cdf = fit_and_get_coef_table(cfg, X_train, y_train)
        log_dataframe_as_csv(cdf, artifact_path="artifacts", filename="coefficients.csv")

        # Log model artifact (ONLY for final)
        if spec.algorithm == "OLS":
            # statsmodels model
            mlflow.statsmodels.log_model(model, artifact_path="model")
        else:
            # sklearn model
            import mlflow.sklearn
            mlflow.sklearn.log_model(model, artifact_path="model")

        # Optional: store the final feature list
        mlflow.log_text("\n".join(spec.features), "final_features.txt")

        return mlflow.active_run().info.run_id


In [None]:
final_gc = FinalSpec(
    name="GC_final_v001",
    target_node="GC",
    panel_control="Mundlak",
    algorithm="Ridge",
    alpha=1.0,
    features=[...],                 # your chosen stable features
    train_start="2023-01-01",
    train_end="2025-06-30",
)

run_id = train_final_and_log_model(
    df_pd=df,
    spec=final_gc,
    experiment_name="RGA_Regression_Local",
    tags={"project":"RevenueGrowthAnalytics", "layer":"2"}
)
print("Final model run_id:", run_id)


In [32]:
runs = client.search_runs(
    experiment_ids=[exp.experiment_id],
    filter_string="tags.run_status = 'ok'",
    order_by=["attributes.start_time DESC"],
    max_results=5
)

for r in runs:
    print(r.info.run_id, r.data.tags.get("mlflow.runName"))


c1195de73a404f6a9e5befacc0862eb4 GC__FE__OLS__block__promo__ts1


In [48]:
run = client.get_run("41894288e1974559b68c3337fb586335")
print(run.data.metrics)


{}


In [46]:
from mlflow.tracking import MlflowClient
client = MlflowClient()

exp = client.get_experiment_by_name("RGA_Regression_Local")

runs = client.search_runs(
    experiment_ids=[exp.experiment_id],
    filter_string="tags.run_status = 'ok' and tags.run_type != 'study'",
    order_by=["attributes.start_time DESC"],
    max_results=20,
)

for r in runs:
    print(r.info.run_id, r.data.tags.get("mlflow.runName"), r.data.metrics.get("test_rmse"))


8510b0a2bd9b431291963245548f5017 GC__FE__OLS__block__promo__ts1 0.7691974572947565
