# Pre-Selection Analysis

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import sys
from pprint import pprint
import json

# Set pandas display options to show full content without truncation
pd.set_option('display.max_columns', None)  # Show all columns
pd.set_option('display.max_colwidth', None)  # Show full content in cells (no truncation)
pd.set_option('display.width', None)  # Use full width
pd.set_option('display.max_rows', 100)  # Show up to 100 rows (adjust as needed)

# Set numpy print options to show full content
np.set_printoptions(threshold=sys.maxsize, linewidth=sys.maxsize)

# Set IPython display options (for Jupyter notebooks)
try:
    from IPython.core.interactiveshell import InteractiveShell
    InteractiveShell.ast_node_interactivity = "all"
    # Set IPython display options
    from IPython.display import display, HTML
    import IPython.core.display as ipd
    # Increase the maximum length of string representations
    sys.displayhook = lambda x: None if x is None else print(repr(x) if len(repr(x)) < 1000 else x)
except ImportError:
    pass

# Custom print function that shows full content. print_full() or pprint() to print full results
def print_full(obj, indent=2):
    """Print object with full content, no truncation"""
    if isinstance(obj, dict):
        print(json.dumps(obj, indent=indent, ensure_ascii=False))
    elif isinstance(obj, (list, tuple)):
        print(json.dumps(obj, indent=indent, ensure_ascii=False))
    else:
        print(obj)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_corr_heatmap(corr_mat, labels, title="", figsize=(16, 12)):
    """
    Plot an UPPER-triangle bivariate correlation heatmap with 2-decimal annotations.

    Parameters
    ----------
    corr_mat : np.ndarray (n x n)
        Correlation matrix.
    labels : list[str]
        Feature names.
    title : str
        Chart title.
    figsize : tuple
        Figure size.
    """

    corr_mat = np.asarray(corr_mat)
    n = corr_mat.shape[0]

    # mask lower triangle + diagonal
    mask = np.tril(np.ones_like(corr_mat, dtype=bool))
    corr_plot = np.where(mask, np.nan, corr_mat)

    fig, ax = plt.subplots(figsize=figsize)

    im = ax.imshow(
        corr_plot,
        interpolation="nearest",
        vmin=-1,
        vmax=1
    )

    # axis ticks + labels
    ax.set_xticks(np.arange(n))
    ax.set_yticks(np.arange(n))
    ax.set_xticklabels(labels, rotation=90, fontsize=7)
    ax.set_yticklabels(labels, fontsize=7)

    ax.set_title(title, fontsize=12)
    plt.colorbar(im, ax=ax, fraction=0.046, pad=0.04)

    # annotate only visible (upper-triangle) cells
    if n <= 35:  # avoid clutter for large matrices
        for i in range(n):
            for j in range(n):
                if j > i and np.isfinite(corr_mat[i, j]):
                    ax.text(
                        j, i,
                        f"{corr_mat[i, j]:.2f}",
                        ha="center",
                        va="center",
                        fontsize=6
                    )

    plt.tight_layout()
    plt.show()
    plt.close(fig)


In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# =========================================================
# Correlation heatmaps by DOMAIN (simple + easy to run)
# =========================================================
# Assumes you already have:
#   - data: your weekly store-level dataframe
#   - feature_catalog: dataframe with at least columns ["feature","domain"]
#
# Output:
#   - Saves one PNG heatmap per domain (and optionally shows it)
#
# Notes:
#   - Uses Pearson correlation (scale-invariant)
#   - Uses matplotlib only (no seaborn)
#   - For large domains, consider setting MAX_FEATURES_PER_DOMAIN
# =========================================================

OUTPUT_DIR = "corr_heatmaps_by_domain"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Optional caps to keep plots readable
MAX_FEATURES_PER_DOMAIN = 80     # set None to plot all (may be huge/unreadable)
MIN_FEATURES_TO_PLOT = 3

def get_domain_features(domain: str, feature_catalog: pd.DataFrame, data: pd.DataFrame) -> list[str]:
    """Return numeric features in this domain that exist in data."""
    feats = feature_catalog.loc[feature_catalog["domain"] == domain, "feature"].tolist()
    feats = [f for f in feats if f in data.columns]
    feats = [f for f in feats if pd.api.types.is_numeric_dtype(data[f])]
    return feats

def plot_corr_heatmap(corr: pd.DataFrame, title: str, outpath: str, show: bool = False):
    """Simple matplotlib heatmap with colorbar."""
    fig, ax = plt.subplots(figsize=(max(8, corr.shape[1] * 0.18), max(6, corr.shape[0] * 0.18)))

    # Heatmap
    im = ax.imshow(corr.values, aspect="auto", interpolation="nearest")

    # Ticks + labels
    ax.set_xticks(np.arange(corr.shape[1]))
    ax.set_yticks(np.arange(corr.shape[0]))
    ax.set_xticklabels(corr.columns, rotation=90, fontsize=7)
    ax.set_yticklabels(corr.index, fontsize=7)

    ax.set_title(title, fontsize=12)
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)

    # Tight layout + save
    plt.tight_layout()
    plt.savefig(outpath, dpi=200, bbox_inches="tight")
    if show:
        plt.show()
    plt.close(fig)

def simple_reorder_by_corr(corr: pd.DataFrame) -> pd.DataFrame:
    """
    Optional: reorder features so similar ones appear close.
    Very simple heuristic: sort by correlation with the first feature.
    (Keeps code lightweight; no clustering dependencies.)
    """
    cols = corr.columns.tolist()
    if len(cols) <= 2:
        return corr
    anchor = cols[0]
    order = corr[anchor].abs().sort_values(ascending=False).index.tolist()
    return corr.loc[order, order]

def generate_domain_corr_heatmaps(data: pd.DataFrame,
                                  feature_catalog: pd.DataFrame,
                                  output_dir: str = OUTPUT_DIR,
                                  max_features: int | None = MAX_FEATURES_PER_DOMAIN,
                                  show: bool = False,
                                  reorder: bool = True):
    domains = sorted(feature_catalog["domain"].dropna().unique().tolist())

    report_rows = []

    for d in domains:
        feats = get_domain_features(d, feature_catalog, data)

        if len(feats) < MIN_FEATURES_TO_PLOT:
            report_rows.append({"domain": d, "n_features": len(feats), "status": "SKIP(<3 features)"})
            continue

        # Cap if needed
        feats_used = feats[:max_features] if (max_features is not None and len(feats) > max_features) else feats

        X = data[feats_used]
        corr = X.corr(method="pearson")

        if reorder:
            corr = simple_reorder_by_corr(corr)

        outpath = os.path.join(output_dir, f"corr_heatmap__{d}__n{len(feats_used)}.png")
        title = f"Correlation Heatmap — Domain: {d} (n={len(feats_used)})"

        plot_corr_heatmap(corr, title=title, outpath=outpath, show=show)

        report_rows.append({
            "domain": d,
            "n_features_total_in_domain": len(feats),
            "n_features_plotted": len(feats_used),
            "status": "OK",
            "file": outpath
        })

    return pd.DataFrame(report_rows).sort_values("n_features_plotted", ascending=False).reset_index(drop=True)

# =========================================================
# RUN
# =========================================================
heatmap_report = generate_domain_corr_heatmaps(
    data=data,
    feature_catalog=feature_catalog,
    output_dir=OUTPUT_DIR,
    max_features=MAX_FEATURES_PER_DOMAIN,
    show=False,       # set True if you want to pop charts in notebook
    reorder=True
)

print(heatmap_report)


In [3]:
# Load the pickle file
import pickle

with open('synth_store_week_2023_2025_200stores_402cols.pkl', 'rb') as f:
    data = pickle.load(f)

# Display basic information about the loaded data
print(f"Type: {type(data)}")
if isinstance(data, pd.DataFrame):
    print(f"Shape: {data.shape}")
    print(f"\nColumns: {data.columns.tolist()}")
    print(f"\nFirst few rows:")
    print(data.head())
elif isinstance(data, dict):
    print(f"Keys: {list(data.keys())}")
    for key, value in data.items():
        print(f"\n{key}: {type(value)}")
        if isinstance(value, pd.DataFrame):
            print(f"  Shape: {value.shape}")
else:
    print(f"Data preview: {data}")


Type: <class 'pandas.core.frame.DataFrame'>
Shape: (31400, 402)

Columns: ['restaurant_id', 'week_start', 'store_open_days', 'holidays_cnt', 'breakfast_gc_share_total', 'breakfast_sales_share_total', 'lunch_gc_share_total', 'lunch_sales_share_total', 'dinner_gc_share_total', 'dinner_sales_share_total', 'snack_gc_share_total', 'snack_sales_share_total', 'late_night_gc_share_total', 'late_night_sales_share_total', 'mobile_gc_share_total', 'mobile_sales_share_total', 'drive_thru_gc_share_total', 'drive_thru_sales_share_total', 'delivery_gc_share_total', 'delivery_sales_share_total', 'front_counter_gc_share_total', 'front_counter_sales_share_total', 'kiosk_gc_share_total', 'kiosk_sales_share_total', 'mccafe_gc_share_total', 'mccafe_sales_share_total', 'other_gc_share_total', 'other_sales_share_total', 'beef_sales_share_total', 'chicken_sales_share_total', 'dessert_sales_share_total', 'drink_sales_share_total', 'ttl_promo_depth', 'mobile_promo_depth', 'drive_thru_promo_depth', 'delivery_pro

  data = pickle.load(f)


### Organize features as catalog

In [None]:
# MANUAL BUILDING OF FEATURE CATALOG

# DOMAIN_MAP = {
#     "promo": [
#         "ttl_promo_depth",
#         "ttl_promo_width",
#         "digital_ttl_promo_depth",
#         "nondigital_ttl_promo_depth",
#         "avg_promo_applied",
#         "ttl_gc_promo_pct",
#         "single_item_discount_gc_promo_pct",
#     ],
#     "media": [
#         "media_spend_tv",
#         "media_spend_digital",
#         "media_spend_search",
#         "media_spend_social",
#         "total_daily_spend",
#         "unique_campaigns",
#     ],
#     "csat": [
#         "avg_overall_rating",
#         "avg_fast_rating",
#         "avg_accuracy_rating",
#         "avg_friendly_rating",
#         "avg_quality_rating",
#     ],
#     "pricing": [
#         "pricing_index_method1",
#         "pricing_index_method2",
#         "pricing_index_method3_competitor_adj",
#     ],
# }

# STRUCTURAL_TOKENS = {
#     "daypart": {"breakfast", "lunch", "dinner", "snack", "late_night"},
#     "channel": {"mobile", "drive_thru", "delivery", "front_counter", "kiosk", "mccafe", "other"},
#     "category": {"beef", "chicken", "dessert", "drink"},
#     "product_level": {"premium", "core"},
# }

# def extract_structure(feature):
#     tokens = set(feature.split("_"))
#     return {
#         dim: sorted(tokens & vocab)
#         for dim, vocab in STRUCTURAL_TOKENS.items()
#     }

# rows = []
# for domain, features in DOMAIN_MAP.items():
#     for f in features:
#         structure = extract_structure(f)
#         rows.append({
#             "feature": f,
#             "domain": domain,
#             "mechanism_group": f.split("_")[0],  # or manual map
#             "dayparts": structure["daypart"],
#             "channels": structure["channel"],
#             "categories": structure["category"],
#             "product_levels": structure["product_level"],
#             "has_lag": "wk_lag" in f,
#         })

# feature_catalog = pd.DataFrame(rows)

In [None]:
# IF EXCEL

# # make 'mobile' input as a list
# def parse_list_cell(x):
#     if pd.isna(x) or str(x).strip() == "":
#         return []
#     return [v.strip() for v in str(x).split(",")]

# catalog["channels"] = catalog["channels"].apply(parse_list_cell)
# catalog["dayparts"] = catalog["dayparts"].apply(parse_list_cell)
# catalog["categories"] = catalog["categories"].apply(parse_list_cell)

# VALID_CHANNELS = {
#     "mobile", "drive_thru", "delivery",
#     "front_counter", "kiosk", "mccafe", "other"
# }

# bad = catalog.explode("channels").query(
#     "channels.notnull() and channels not in @VALID_CHANNELS"
# )

# assert bad.empty, f"Invalid channel values:\n{bad}"


In [13]:
import re
import pandas as pd
import numpy as np

# =========================
# Step A: Build Feature Catalog
# =========================

# Expect you already have `data` loaded
# e.g., data = pd.read_pickle("...pkl")

# ---------- Structural vocab ----------
STRUCTURAL_TOKENS = {
    "daypart": {"breakfast", "lunch", "dinner", "snack", "late_night"},
    "channel": {"mobile", "drive_thru", "delivery", "front_counter", "kiosk", "mccafe", "other"},
    "category": {"beef", "chicken", "dessert", "drink"},
    "product_level": {"premium", "core"},
    "media_channel": {"tv", "digital", "search", "social", "ooh", "radio", "in_app"},
}

LAG_PATTERN = re.compile(r"_(\d+)wk_lag$")


def extract_structure(feature: str) -> dict:
    """
    Parse structural dimension tokens from a feature name.
    Returns lists per dimension (possibly empty).
    """
    tokens = set(feature.split("_"))
    return {dim: sorted(tokens & vocab) for dim, vocab in STRUCTURAL_TOKENS.items()}


# ---------- Domain rules (editable) ----------
DOMAIN_RULES = [
    ("id", re.compile(r"^(restaurant_id|week_start)$")),
    ("ops", re.compile(r"(store_open_days|holidays_cnt|service_time_min|error_rate)")),
    ("media", re.compile(r"^(media_spend_.*|total_daily_spend|unique_campaigns|campaigns_.*|spend_primary_objective_.*)$")),
    ("promo", re.compile(r"(promo_depth|promo_width|gc_promo_pct|promo_applied)")),
    ("pricing", re.compile(r"(pricing_index)")),
    ("csat", re.compile(r"(rating)")),
    ("menu", re.compile(r"(lto|new_products|core_products_cnt|promo_.*_cnt|^lto_.*_cnt$|_cnt$)")),
    ("mix_share", re.compile(r"(gc_share|sales_share)")),
]


def assign_domain(feature: str) -> str:
    for domain, pat in DOMAIN_RULES:
        if pat.search(feature):
            return domain
    return "other"


# ---------- Mechanism rules (editable) ----------
def assign_mechanism(feature: str, domain: str) -> str:
    base = LAG_PATTERN.sub("", feature)  # strip lag suffix for mechanism naming

    if domain == "promo":
        if "promo_depth" in base: return "promo_depth"
        if "promo_width" in base: return "promo_width"
        if "gc_promo_pct" in base: return "gc_promo_pct"
        if "promo_applied" in base: return "promo_applied"
        return "promo_other"

    if domain == "media":
        if base.startswith("media_spend_"): return "media_spend"
        if base == "total_daily_spend": return "media_spend_total"
        if base == "unique_campaigns": return "campaign_volume"
        if base.startswith("campaigns_partner_"): return "campaign_partner_volume"
        if base.startswith("campaigns_subcategory_"): return "campaign_subcategory_volume"
        if base.startswith("spend_primary_objective_"): return "spend_objective_share"
        return "media_other"

    if domain == "mix_share":
        if "gc_share" in base: return "gc_share"
        if "sales_share" in base: return "sales_share"
        return "mix_other"

    if domain == "pricing":
        if "method1" in base: return "pricing_index_m1"
        if "method2" in base: return "pricing_index_m2"
        if "method3" in base or "competitor_adj" in base: return "pricing_index_m3_comp_adj"
        return "pricing_index"

    if domain == "csat":
        if "overall" in base: return "rating_overall"
        if "fast" in base: return "rating_fast"
        if "accuracy" in base: return "rating_accuracy"
        if "friendly" in base: return "rating_friendly"
        if "quality" in base: return "rating_quality"
        return "rating_other"

    if domain == "menu":
        if base.endswith("_cnt") or base == "core_products_cnt":
            if base.startswith("promo_") and base.endswith("_cnt"): return "promo_item_count"
            if base.startswith("lto_") and base.endswith("_cnt"): return "lto_count_by_slice"
            if base == "lto_cnt": return "lto_count_total"
            if base == "new_products_cnt": return "new_product_count"
            if base == "core_products_cnt": return "core_menu_count"
            return "menu_count_other"
        return "menu_other"

    if domain == "ops":
        if "service_time_min" in base: return "service_time"
        if "error_rate" in base: return "error_rate"
        if base in {"store_open_days", "holidays_cnt"}: return "calendar"
        return "ops_other"

    if domain == "id":
        return "id"

    return "other"


def infer_unit(feature: str, domain: str, mechanism: str):
    if domain in {"mix_share", "promo"}: return "share"
    if domain == "pricing": return "index"
    if domain == "csat": return "rating_1_5"
    if domain == "media":
        if mechanism.startswith("media_spend"): return "currency"
        if "share" in mechanism: return "share"
        return "count"
    if domain == "menu": return "count"
    if domain == "ops":
        if mechanism == "service_time": return "minutes"
        if mechanism == "error_rate": return "share"
        return "count"
    return None


# ---------- Build feature catalog ----------
def build_feature_catalog(data: pd.DataFrame) -> tuple[pd.DataFrame, dict, list]:
    dtypes_map = data.dtypes.astype(str).to_dict()
    missing_map = data.isna().mean().to_dict()

    num = data.select_dtypes(include=[np.number])
    var_map = num.var(numeric_only=True).to_dict()
    nunique_map = num.nunique(dropna=True).to_dict()

    rows = []
    for f in data.columns:
        st = extract_structure(f)
        m = LAG_PATTERN.search(f)
        lag_weeks = int(m.group(1)) if m else None

        domain = assign_domain(f)
        mechanism = assign_mechanism(f, domain)
        unit = infer_unit(f, domain, mechanism)

        dims = []
        if st["daypart"]: dims.append("daypart")
        if st["channel"]: dims.append("channel")
        if st["category"]: dims.append("category")
        if st["product_level"]: dims.append("product_level")
        if st["media_channel"]: dims.append("media_channel")
        structure_signature = "+".join(dims) if dims else "none"

        rows.append({
            "feature": f,
            "domain": domain,
            "mechanism_group": mechanism,
            "unit": unit,
            "has_lag": lag_weeks is not None,
            "lag_weeks": lag_weeks,
            "dayparts": st["daypart"],
            "channels": st["channel"],
            "categories": st["category"],
            "product_levels": st["product_level"],
            "media_channels": st["media_channel"],
            "structure_signature": structure_signature,
            "dtype": dtypes_map.get(f),
            "missing_rate": float(missing_map.get(f, 0.0)),
            "variance": var_map.get(f, None),
            "nunique": nunique_map.get(f, None),
        })

    feature_catalog = pd.DataFrame(rows)

    feature_catalog["is_id_like"] = feature_catalog["domain"].eq("id")
    feature_catalog["near_constant_flag"] = (
        feature_catalog["nunique"].fillna(999999) <= 2
    ) | (
        feature_catalog["variance"].fillna(0.0) < 1e-8
    )

    feature_catalog = feature_catalog.sort_values(
        ["domain", "mechanism_group", "structure_signature", "has_lag", "feature"]
    ).reset_index(drop=True)

    DOMAIN_MAP = feature_catalog.groupby("domain")["feature"].apply(list).to_dict()
    GROUP_KEYS = ["domain", "mechanism_group", "structure_signature", "has_lag", "lag_weeks"]

    return feature_catalog, DOMAIN_MAP, GROUP_KEYS


feature_catalog, DOMAIN_MAP, GROUP_KEYS = build_feature_catalog(data)

# quick sanity checks
print(feature_catalog["domain"].value_counts())
print(feature_catalog["structure_signature"].value_counts().head(10))


domain
mix_share    171
promo        134
csat          32
media         29
pricing       17
menu          13
ops            4
id             2
Name: count, dtype: int64
structure_signature
none                     87
channel                  52
category                 52
daypart                  52
daypart+category         48
channel+category         40
daypart+channel          36
media_channel            28
product_level             5
channel+media_channel     2
Name: count, dtype: int64


In [30]:

feature_catalog

Unnamed: 0,feature,domain,mechanism_group,unit,has_lag,lag_weeks,dayparts,channels,categories,product_levels,media_channels,structure_signature,dtype,missing_rate,variance,nunique,is_id_like,near_constant_flag
0,avg_accuracy_rating_delivery,csat,rating_accuracy,rating_1_5,False,,[],[delivery],[],[],[],channel,float32,0.000000,0.063373,31156.0,False,False
1,avg_accuracy_rating_kiosk,csat,rating_accuracy,rating_1_5,False,,[],[kiosk],[],[],[],channel,float32,0.000000,0.063324,31131.0,False,False
2,avg_accuracy_rating_mccafe,csat,rating_accuracy,rating_1_5,False,,[],[mccafe],[],[],[],channel,float32,0.000000,0.063107,31154.0,False,False
3,avg_accuracy_rating_mobile,csat,rating_accuracy,rating_1_5,False,,[],[mobile],[],[],[],channel,float32,0.000000,0.063280,31140.0,False,False
4,avg_accuracy_rating_other,csat,rating_accuracy,rating_1_5,False,,[],[other],[],[],[],channel,float32,0.000000,0.063362,31155.0,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397,nondigital_ttl_promo_width,promo,promo_width,share,False,,[],[],[],[],[],none,float32,0.000000,0.012989,30988.0,False,False
398,ttl_promo_width,promo,promo_width,share,False,,[],[],[],[],[],none,float32,0.000000,0.012609,30882.0,False,False
399,ttl_promo_width_12wk_lag,promo,promo_width,share,True,12.0,[],[],[],[],[],none,float32,0.076433,0.012651,28532.0,False,False
400,ttl_promo_width_4wk_lag,promo,promo_width,share,True,4.0,[],[],[],[],[],none,float32,0.025478,0.012603,30086.0,False,False


### Covariate matrices

In [34]:
import numpy as np
import pandas as pd
import re

# -------------------------
# Parameters (tune later)
# -------------------------
GROUP_KEYS = ["domain", "mechanism_group", "structure_signature", "has_lag", "lag_weeks"]
MIN_FEATURES_PER_GROUP = 3
MAX_FEATURES_PER_GROUP = 80          # safety cap; for real use, split large groups instead of truncating
CORR_METHODS = ["pearson"]           # add "spearman" if you want
HIGH_CORR_FLAG = 0.90                # flag threshold (not a drop rule)
TOP_K_HIGH_CORR_PAIRS = 50           # diagnostics only

# -------------------------
# Helper: missingness co-pattern
# -------------------------
def missingness_correlation(X: pd.DataFrame) -> pd.DataFrame:
    """
    Correlation of missingness indicators.
    Great for detecting duplicated pipelines / shared lag generation.
    """
    M = X.isna().astype(np.int8)
    corr = M.corr(method="pearson")
    return corr.fillna(0.0)

# -------------------------
# Helper: matrices + diagnostics for one group
# -------------------------
def build_group_matrices(X: pd.DataFrame,
                         corr_methods=("pearson",),
                         high_corr_flag=0.9,
                         top_k_pairs=50) -> dict:
    out = {}

    # covariate matrices
    for method in corr_methods:
        out[f"corr_{method}"] = X.corr(method=method)

    # missingness matrix
    out["missing_corr"] = missingness_correlation(X)

    # diagnostics: top abs correlations (off-diagonal)
    corr = out["corr_pearson"].copy()
    np.fill_diagonal(corr.values, 0.0)
    abs_corr = corr.abs()

    iu = np.triu_indices_from(abs_corr.values, k=1)
    vals = abs_corr.values[iu]

    if len(vals) == 0:
        out["top_abs_corr_pairs"] = pd.DataFrame(columns=["feature_a", "feature_b", "abs_corr"])
        out["max_abs_corr_offdiag"] = np.nan
        out["n_pairs_flagged_ge_threshold"] = 0
        return out

    top_idx = np.argsort(vals)[::-1][:min(top_k_pairs, len(vals))]
    cols = abs_corr.columns.to_list()

    pairs = []
    for idx in top_idx:
        i, j = iu[0][idx], iu[1][idx]
        pairs.append((cols[i], cols[j], float(vals[idx])))

    out["top_abs_corr_pairs"] = pd.DataFrame(pairs, columns=["feature_a", "feature_b", "abs_corr"])
    out["max_abs_corr_offdiag"] = float(vals.max())
    out["n_pairs_flagged_ge_threshold"] = int((vals >= high_corr_flag).sum())

    return out

# -------------------------
# Build covariate matrices within groups
# -------------------------
numeric_features = set(data.select_dtypes(include=[np.number]).columns)

eligible_catalog = (
    feature_catalog
    .loc[feature_catalog["feature"].isin(numeric_features)]
    .copy()
)

grouped = eligible_catalog.groupby(GROUP_KEYS, dropna=False)

covariate_matrices = {}
inventory_rows = []

for gkey, gdf in grouped:
    feats = gdf["feature"].tolist()
    if len(feats) < MIN_FEATURES_PER_GROUP:
        continue

    # safety cap for very large groups
    feats_for_matrix = feats if len(feats) <= MAX_FEATURES_PER_GROUP else feats[:MAX_FEATURES_PER_GROUP]
    X = data[feats_for_matrix]

    mats = build_group_matrices(
        X=X,
        corr_methods=CORR_METHODS,
        high_corr_flag=HIGH_CORR_FLAG,
        top_k_pairs=TOP_K_HIGH_CORR_PAIRS,
    )

    covariate_matrices[gkey] = mats

    inventory_rows.append({
        "group_key": gkey,
        "n_features": len(feats),
        "n_features_used": len(feats_for_matrix),
        "missing_rate_mean": float(X.isna().mean().mean()),
        "missing_rate_max": float(X.isna().mean().max()),
        "max_abs_corr_offdiag": mats["max_abs_corr_offdiag"],
        "n_pairs_flagged_ge_0p90": mats["n_pairs_flagged_ge_threshold"],
    })

group_inventory = (
    pd.DataFrame(inventory_rows)
    .sort_values(["n_features", "n_pairs_flagged_ge_0p90", "max_abs_corr_offdiag"],
                 ascending=[False, False, False])
    .reset_index(drop=True)
)

# -------------------------
# Outputs
# -------------------------
group_inventory.head(20)


Unnamed: 0,group_key,n_features,n_features_used,missing_rate_mean,missing_rate_max,max_abs_corr_offdiag,n_pairs_flagged_ge_0p90
0,"(mix_share, gc_share, daypart+channel, False, nan)",20,20,0.0,0.0,0.692244,0
1,"(mix_share, sales_share, channel+category, False, nan)",20,20,0.0,0.0,0.6356,0
2,"(mix_share, gc_share, channel+category, False, nan)",20,20,0.0,0.0,0.629526,0
3,"(promo, promo_width, daypart+category, False, nan)",16,16,0.0,0.0,0.935748,120
4,"(promo, promo_depth, daypart+category, False, nan)",16,16,0.0,0.0,0.886086,0
5,"(mix_share, sales_share, category, False, nan)",16,16,0.0,0.0,0.730638,0
6,"(mix_share, sales_share, daypart+category, False, nan)",16,16,0.0,0.0,0.531328,0
7,"(promo, promo_width, daypart, False, nan)",12,12,0.0,0.0,0.957651,66
8,"(promo, promo_depth, daypart, False, nan)",12,12,0.0,0.0,0.923604,8
9,"(mix_share, gc_share, daypart, False, nan)",12,12,0.0,0.0,0.638598,0


In [26]:
def find_group_key(domain, mechanism, structure, has_lag, lag_weeks=None):
    for k in covariate_matrices.keys():
        d, m, s, hl, lw = k
        if (
            d == domain and
            m == mechanism and
            s == structure and
            hl == has_lag and
            (
                (pd.isna(lw) and lag_weeks is None) or
                (lw == lag_weeks)
            )
        ):
            return k
    raise KeyError("Group key not found")


In [None]:
key = find_group_key(
    domain="mix_share",
    mechanism="gc_share",
    structure="channel",
    has_lag=False,
    lag_weeks=None
)

covariate_matrices[key]["corr_pearson"]
covariate_matrices[key]["top_abs_corr_pairs"]

Unnamed: 0,delivery_gc_share_total,kiosk_gc_share_total,late_night_delivery_gc_share,late_night_kiosk_gc_share,late_night_mccafe_gc_share,late_night_mobile_gc_share,late_night_other_gc_share,mccafe_gc_share_total,mobile_gc_share_total,other_gc_share_total
delivery_gc_share_total,1.0,0.235246,0.824576,0.180672,0.098,-0.230424,-0.103553,0.134465,-0.273751,-0.123117
kiosk_gc_share_total,0.235246,1.0,0.195425,0.816425,0.079769,-0.180092,-0.050068,0.106182,-0.217764,-0.061707
late_night_delivery_gc_share,0.824576,0.195425,1.0,0.33603,0.262278,-0.038251,0.046746,0.114331,-0.235755,-0.098924
late_night_kiosk_gc_share,0.180672,0.816425,0.33603,1.0,0.233907,0.005113,0.096689,0.084474,-0.183399,-0.04561
late_night_mccafe_gc_share,0.098,0.079769,0.262278,0.233907,1.0,0.056531,0.131859,0.796739,-0.145252,-0.015532
late_night_mobile_gc_share,-0.230424,-0.180092,-0.038251,0.005113,0.056531,1.0,0.120541,-0.139855,0.802342,-0.048775
late_night_other_gc_share,-0.103553,-0.050068,0.046746,0.096689,0.131859,0.120541,1.0,-0.017888,-0.058559,0.806117
mccafe_gc_share_total,0.134465,0.106182,0.114331,0.084474,0.796739,-0.139855,-0.017888,1.0,-0.177366,-0.020483
mobile_gc_share_total,-0.273751,-0.217764,-0.235755,-0.183399,-0.145252,0.802342,-0.058559,-0.177366,1.0,-0.067351
other_gc_share_total,-0.123117,-0.061707,-0.098924,-0.04561,-0.015532,-0.048775,0.806117,-0.020483,-0.067351,1.0


Unnamed: 0,feature_a,feature_b,abs_corr
0,delivery_gc_share_total,late_night_delivery_gc_share,0.824576
1,kiosk_gc_share_total,late_night_kiosk_gc_share,0.816425
2,late_night_other_gc_share,other_gc_share_total,0.806117
3,late_night_mobile_gc_share,mobile_gc_share_total,0.802342
4,late_night_mccafe_gc_share,mccafe_gc_share_total,0.796739
5,late_night_delivery_gc_share,late_night_kiosk_gc_share,0.33603
6,delivery_gc_share_total,mobile_gc_share_total,0.273751
7,late_night_delivery_gc_share,late_night_mccafe_gc_share,0.262278
8,late_night_delivery_gc_share,mobile_gc_share_total,0.235755
9,delivery_gc_share_total,kiosk_gc_share_total,0.235246


### VIF:

Correlation: “Are any two features very similar?”
VIF: “Is one feature almost a linear combination of several others?”
VIF catches:
promo_depth_total ≈ weighted_sum(daypart promo depths)
category shares ≈ function of other category shares
lagged features that are jointly redundant even if no single pair is >0.9

VIF within groups

Finds multi-way redundancy

Confirms whether “total + components” can coexist

Helps distinguish:

“keep 2–3 reps” vs

“collapse everything into an index”

Correlation asks
“Are these features similar?”

VIF asks
“Is this feature necessary given the others?”

In [35]:
# =========================
# Step D: VIF screening (within groups)
# =========================

# ---- Parameters ----
MAX_FEATURES_PER_GROUP_FOR_VIF = 50   # VIF can be heavy; if groups > 50, consider sub-grouping instead of truncation
MISSINGNESS_ROW_DROP_THRESHOLD = 0.50 # drop rows with >50% missing within group (for stability)
RIDGE_EPS = 1e-8                      # numerical stabilizer for near-singular corr matrices

VIF_FLAG_MODERATE = 5.0
VIF_FLAG_HIGH = 10.0

def zscore_df(X: pd.DataFrame) -> pd.DataFrame:
    """Column-wise z-score; ignores NaNs. Constant cols become NaN and are dropped later."""
    mu = X.mean(axis=0, skipna=True)
    sd = X.std(axis=0, ddof=0, skipna=True).replace(0, np.nan)
    return (X - mu) / sd

def compute_vif_from_corr(C: np.ndarray, feature_names: list[str]) -> pd.Series:
    """
    VIF via inverse correlation matrix:
        VIF_j = diag(inv(Corr(Z)))_j
    where Z are standardized features.
    """
    C = np.nan_to_num(C, nan=0.0, posinf=0.0, neginf=0.0)
    C = (C + C.T) / 2.0
    np.fill_diagonal(C, 1.0)

    # ridge stabilization
    C = C + np.eye(C.shape[0]) * RIDGE_EPS

    try:
        invC = np.linalg.inv(C)
    except np.linalg.LinAlgError:
        invC = np.linalg.pinv(C)

    return pd.Series(np.diag(invC), index=feature_names)

def compute_group_vif(X_raw: pd.DataFrame) -> pd.DataFrame:
    """
    Compute VIF for one group:
    - remove near-constant columns
    - drop rows with too much missingness within group
    - z-score
    - invert correlation matrix
    """
    # drop near-constant cols
    nunique = X_raw.nunique(dropna=True)
    var = X_raw.var(numeric_only=True)
    keep_cols = X_raw.columns[(nunique > 2) & (var > 1e-8)]
    X = X_raw[keep_cols].copy()

    if X.shape[1] < 2:
        return pd.DataFrame()

    # drop heavily-missing rows
    row_missing = X.isna().mean(axis=1)
    X = X.loc[row_missing <= MISSINGNESS_ROW_DROP_THRESHOLD]
    if len(X) < 100:  # minimum obs for stable corr
        return pd.DataFrame()

    # z-score
    Z = zscore_df(X)
    Z = Z.loc[:, Z.notna().any(axis=0)]  # drop all-NaN cols after standardization
    if Z.shape[1] < 2:
        return pd.DataFrame()

    C = Z.corr(method="pearson").to_numpy()
    vif = compute_vif_from_corr(C, Z.columns.to_list())

    out = pd.DataFrame({
        "feature": vif.index,
        "vif": vif.values,
        "n_obs_used": len(Z),
        "n_features_used": Z.shape[1],
    }).sort_values("vif", ascending=False).reset_index(drop=True)

    return out

# -------------------------
# Run VIF by iterating the same groups you already used in Step B
# (Use group_inventory order so you prioritize big / redundant groups first)
# -------------------------
vif_rows = []
vif_group_rows = []

for _, row in group_inventory.iterrows():
    gkey = row["group_key"]

    # gkey structure: (domain, mechanism_group, structure_signature, has_lag, lag_weeks)
    # gdf is the subset of eligible_catalog for that group
    try:
        gdf = grouped.get_group(gkey)
    except KeyError:
        # If nan in key causes mismatch, fall back to mask-based lookup
        domain, mech, sig, has_lag, lag_weeks = gkey
        gdf = eligible_catalog[
            (eligible_catalog["domain"] == domain) &
            (eligible_catalog["mechanism_group"] == mech) &
            (eligible_catalog["structure_signature"] == sig) &
            (eligible_catalog["has_lag"] == has_lag) &
            (
                (eligible_catalog["lag_weeks"].isna() & pd.isna(lag_weeks)) |
                (eligible_catalog["lag_weeks"] == lag_weeks)
            )
        ]

    feats = gdf["feature"].tolist()
    if len(feats) < MIN_FEATURES_PER_GROUP:
        continue

    feats_for_vif = feats if len(feats) <= MAX_FEATURES_PER_GROUP_FOR_VIF else feats[:MAX_FEATURES_PER_GROUP_FOR_VIF]
    X_raw = data[feats_for_vif]

    res = compute_group_vif(X_raw)
    if res.empty:
        continue

    # Attach group metadata
    domain, mech, sig, has_lag, lag_weeks = gkey
    res["group_key"] = [gkey] * len(res)
    res["domain"] = domain
    res["mechanism_group"] = mech
    res["structure_signature"] = sig
    res["has_lag"] = has_lag
    res["lag_weeks"] = lag_weeks

    # VIF flags (still not drop rules)
    res["vif_flag"] = np.select(
        [res["vif"] >= VIF_FLAG_HIGH, res["vif"] >= VIF_FLAG_MODERATE],
        ["HIGH(>=10)", "MOD(>=5)"],
        default="OK(<5)"
    )

    vif_rows.append(res)

    vif_group_rows.append({
        "group_key": gkey,
        "n_features_in_group": len(feats),
        "n_features_used_for_vif": int(res["n_features_used"].iloc[0]),
        "n_obs_used": int(res["n_obs_used"].iloc[0]),
        "vif_max": float(res["vif"].max()),
        "vif_p95": float(res["vif"].quantile(0.95)),
        "vif_median": float(res["vif"].median()),
        "n_vif_ge_10": int((res["vif"] >= VIF_FLAG_HIGH).sum()),
        "n_vif_ge_5": int((res["vif"] >= VIF_FLAG_MODERATE).sum()),
        "top_vif_feature": str(res.iloc[0]["feature"]),
        "top_vif_value": float(res.iloc[0]["vif"]),
    })

vif_results = pd.concat(vif_rows, ignore_index=True) if vif_rows else pd.DataFrame()
vif_group_summary = (
    pd.DataFrame(vif_group_rows)
      .sort_values(["vif_max", "n_vif_ge_10", "n_features_in_group"], ascending=[False, False, False])
      .reset_index(drop=True)
)

# Optional: merge VIF summaries back into group_inventory for one unified "group dashboard"
group_inventory = group_inventory.merge(
    vif_group_summary[["group_key", "vif_max", "vif_p95", "vif_median", "n_vif_ge_10", "n_vif_ge_5", "top_vif_feature", "top_vif_value"]],
    on="group_key",
    how="left"
)

# Quick view
group_inventory.head(20)


Unnamed: 0,group_key,n_features,n_features_used,missing_rate_mean,missing_rate_max,max_abs_corr_offdiag,n_pairs_flagged_ge_0p90,vif_max,vif_p95,vif_median,n_vif_ge_10,n_vif_ge_5,top_vif_feature,top_vif_value
0,"(mix_share, gc_share, daypart+channel, False, nan)",20,20,0.0,0.0,0.692244,0,4.937222,4.797272,3.871291,0,0,lunch_delivery_gc_share,4.937222
1,"(mix_share, sales_share, channel+category, False, nan)",20,20,0.0,0.0,0.6356,0,3.787306,3.755306,3.255677,0,0,beef_delivery_sales_share,3.787306
2,"(mix_share, gc_share, channel+category, False, nan)",20,20,0.0,0.0,0.629526,0,3.928634,3.924958,3.383712,0,0,delivery_beef_gc_share_proxy,3.928634
3,"(promo, promo_width, daypart+category, False, nan)",16,16,0.0,0.0,0.935748,120,12.99563,12.97429,12.883199,16,16,dinner_chicken_promo_width,12.99563
4,"(promo, promo_depth, daypart+category, False, nan)",16,16,0.0,0.0,0.886086,0,7.315831,7.313122,7.26519,0,16,breakfast_dessert_promo_depth,7.315831
5,"(mix_share, sales_share, category, False, nan)",16,16,0.0,0.0,0.730638,0,31893480.0,30029610.0,5.555015,4,12,beef_sales_share_total,31893480.0
6,"(mix_share, sales_share, daypart+category, False, nan)",16,16,0.0,0.0,0.531328,0,4.645561,4.622084,4.501066,0,0,lunch_chicken_sales_share,4.645561
7,"(promo, promo_width, daypart, False, nan)",12,12,0.0,0.0,0.957651,66,23.422,23.40414,13.866882,12,12,snack_promo_width,23.422
8,"(promo, promo_depth, daypart, False, nan)",12,12,0.0,0.0,0.923604,8,12.94001,12.88602,7.826147,4,12,breakfast_promo_depth,12.94001
9,"(mix_share, gc_share, daypart, False, nan)",12,12,0.0,0.0,0.638598,0,7.415176,7.358845,6.66667,0,11,snack_drive_thru_gc_share,7.415176


### Feature Redundancy and Collapse

In [36]:
import numpy as np
import pandas as pd
from dataclasses import dataclass
from typing import Dict, List, Tuple, Any, Optional

# =========================================================
# Step E: Redundancy clustering + collapse decisions
# =========================================================
# Expected inputs already exist from your flow:
#   - data
#   - feature_catalog
#   - group_inventory
#   - covariate_matrices
#   - vif_results
#
# Outputs:
#   - selected_features: list[str]
#   - collapse_definitions: dict[str, dict]   # index_name -> recipe
#   - mapping_table: pd.DataFrame             # original -> action -> representative/index + cluster_id
#   - group_decisions: pd.DataFrame           # group -> decision + rationale
# =========================================================

# -------------------------
# Tunable policies
# -------------------------
CORR_CLUSTER_THRESHOLD = 0.90  # connect features into redundancy cluster if abs(corr) >= this
AUTO_COLLAPSE_IF_ANY = {
    # If (domain, mechanism_group) in this set and redundancy is high, default collapse to an index
    ("promo", "promo_width"),
    ("promo", "promo_depth"),
}
VIF_HIGH = 10.0
VIF_MOD = 5.0

# Per-group "decision triggers"
PAIR_REDUNDANCY_TRIGGER = 10        # if >= this many pairs abs(corr)>=0.90, treat as "explosion"
VIF_REDUNDANCY_TRIGGER = 0.50       # if >= 50% of features have VIF>=10, treat as "explosion"

# For share-simplex groups (category/daypart/channel shares), recommended keep strategy
# (exec-friendly, avoids feeding full simplex)
STRATEGIC_CHANNELS = ["delivery", "mobile", "drive_thru"]
STRATEGIC_DAYPARTS = ["breakfast", "lunch", "dinner"]
STRATEGIC_CATEGORIES = ["beef", "chicken"]

# -------------------------
# Helpers: cluster by correlation (connected components)
# -------------------------
def corr_connected_components(corr_abs: pd.DataFrame, threshold: float) -> List[List[str]]:
    """
    Build clusters using abs(corr) adjacency >= threshold.
    Returns list of clusters (list of feature names).
    """
    cols = corr_abs.columns.tolist()
    n = len(cols)
    if n == 0:
        return []

    # adjacency list
    adj = {c: set() for c in cols}
    M = corr_abs.values
    for i in range(n):
        for j in range(i + 1, n):
            if M[i, j] >= threshold:
                a, b = cols[i], cols[j]
                adj[a].add(b)
                adj[b].add(a)

    # connected components
    seen = set()
    comps = []
    for c in cols:
        if c in seen:
            continue
        stack = [c]
        comp = []
        seen.add(c)
        while stack:
            u = stack.pop()
            comp.append(u)
            for v in adj[u]:
                if v not in seen:
                    seen.add(v)
                    stack.append(v)
        comps.append(sorted(comp))
    return comps

# -------------------------
# Helpers: representative selection (target-free)
# -------------------------
def choose_representative(features: List[str], feature_catalog: pd.DataFrame) -> str:
    """
    Pick one representative feature from a cluster, target-free.
    Heuristics:
      1) lowest missing_rate
      2) higher variance
      3) prefer canonical names: ttl_/avg_/overall (if present)
    """
    fc = feature_catalog.set_index("feature").loc[features].copy()

    # canonical name bonus
    name = fc.index.to_series()
    bonus = (
        name.str.startswith("ttl_").astype(int) * 3
        + name.str.contains("avg_").astype(int) * 2
        + name.str.contains("overall").astype(int) * 2
        + name.str.contains("_total").astype(int) * 1
    )
    fc["name_bonus"] = bonus.values

    # fill missing stats
    fc["missing_rate"] = fc["missing_rate"].fillna(1.0)
    fc["variance"] = fc["variance"].fillna(0.0)

    # sort: prefer canonical, then low missing, then high variance
    fc = fc.sort_values(["name_bonus", "missing_rate", "variance"], ascending=[False, True, False])
    return fc.index[0]

# -------------------------
# Helpers: build weights for index (exec-meaningful)
# -------------------------
def find_weight_series_for_feature(feature: str, data: pd.DataFrame) -> Optional[pd.Series]:
    """
    For a feature like 'breakfast_delivery_promo_width', we try to find a weight:
      - daypart weight: breakfast_gc_share_total
      - channel weight: delivery_gc_share_total
      - category weight: chicken_sales_share_total (if category present)
    If multiple dims present, use product of weights (still interpretable as exposure proxy).
    If no applicable weights exist, return None.
    """
    tokens = feature.split("_")

    dayparts = {"breakfast", "lunch", "dinner", "snack", "late_night"}
    channels = {"mobile", "drive_thru", "delivery", "front_counter", "kiosk", "mccafe", "other"}
    categories = {"beef", "chicken", "dessert", "drink"}

    dp = [t for t in tokens if t in dayparts]
    ch = [t for t in tokens if t in channels]
    cat = [t for t in tokens if t in categories]

    weights = []
    # daypart exposure proxy: gc share
    if dp:
        col = f"{dp[0]}_gc_share_total"
        if col in data.columns:
            weights.append(data[col])
    # channel exposure proxy: gc share
    if ch:
        col = f"{ch[0]}_gc_share_total"
        if col in data.columns:
            weights.append(data[col])
    # category exposure proxy: sales share
    if cat:
        col = f"{cat[0]}_sales_share_total"
        if col in data.columns:
            weights.append(data[col])

    if not weights:
        return None

    w = weights[0].astype(float)
    for ww in weights[1:]:
        w = w * ww.astype(float)

    # guardrail: if all zeros, treat as no weights
    if np.nanmax(w.values) <= 0:
        return None
    return w

def weighted_index(df: pd.DataFrame, feature_list: List[str], data: pd.DataFrame, index_name: str) -> pd.Series:
    """
    Compute an index as weighted mean across feature_list using exposure weights inferred per feature.
    If weights unavailable, fallback to simple mean.
    """
    X = df[feature_list].astype(float)

    # build per-feature weights (vector per row)
    W_list = []
    for f in feature_list:
        w = find_weight_series_for_feature(f, data)
        if w is None:
            W_list.append(None)
        else:
            W_list.append(w.astype(float))

    if all(w is None for w in W_list):
        return X.mean(axis=1, skipna=True)

    # Replace missing weights with 1.0 (so those features still contribute)
    W = np.column_stack([
        (np.ones(len(df)) if w is None else w.values) for w in W_list
    ])
    W = np.nan_to_num(W, nan=0.0, posinf=0.0, neginf=0.0)

    # Normalize weights per row (avoid divide by 0)
    denom = W.sum(axis=1)
    denom_safe = np.where(denom == 0, 1.0, denom)

    # weighted mean
    num = np.nansum(X.values * W, axis=1)
    return pd.Series(num / denom_safe, index=df.index, name=index_name)

# -------------------------
# Helpers: handle share-simplex groups (exec-friendly)
# -------------------------
def simplex_keep_policy(features: List[str], structure_signature: str) -> List[str]:
    """
    For shares that are constrained (sum ~1), keep a subset:
    - Keep strategic slices if present (delivery/mobile/drive_thru, breakfast/lunch/dinner, beef/chicken)
    - Otherwise keep K-1 by dropping a "residual" slice (last alphabetical)
    """
    feats = sorted(features)

    def pick_by_token(token_list):
        keep = []
        for t in token_list:
            for f in feats:
                if f.startswith(t + "_") or f.endswith("_" + t) or f"_{t}_" in f:
                    keep.append(f)
        return list(dict.fromkeys(keep))  # preserve order, unique

    keep = []
    if structure_signature == "channel":
        keep = pick_by_token(STRATEGIC_CHANNELS)
    elif structure_signature == "daypart":
        keep = pick_by_token(STRATEGIC_DAYPARTS)
    elif structure_signature == "category":
        keep = pick_by_token(STRATEGIC_CATEGORIES)

    # If we didn't match anything, do drop-one
    if not keep:
        # drop last alphabetical as residual
        return feats[:-1]

    # Ensure we don't keep everything (avoid full simplex)
    if len(keep) >= len(feats):
        return feats[:-1]

    return keep

# -------------------------
# Main Step E runner
# -------------------------
def step_e_collapse(
    data: pd.DataFrame,
    feature_catalog: pd.DataFrame,
    group_inventory: pd.DataFrame,
    covariate_matrices: Dict[Tuple, Dict[str, Any]],
    vif_results: pd.DataFrame,
    corr_cluster_threshold: float = CORR_CLUSTER_THRESHOLD,
) -> Tuple[List[str], Dict[str, Dict[str, Any]], pd.DataFrame, pd.DataFrame]:
    """
    Returns:
      selected_features, collapse_definitions, mapping_table, group_decisions
    """
    fc = feature_catalog.copy()
    fc_idx = fc.set_index("feature")

    # Build quick VIF lookup: (group_key, feature) -> vif
    vif_lookup = {}
    if vif_results is not None and len(vif_results) > 0:
        for _, r in vif_results.iterrows():
            vif_lookup[(r["group_key"], r["feature"])] = float(r["vif"])

    selected_features = set()
    collapse_definitions: Dict[str, Dict[str, Any]] = {}
    mapping_rows = []
    decision_rows = []

    # Iterate groups in the same order you’ve been using (biggest / most redundant first)
    for _, gr in group_inventory.iterrows():
        gkey = gr["group_key"]
        n_features = int(gr["n_features"])
        if n_features < 1:
            continue

        domain, mechanism, sig, has_lag, lag_weeks = gkey

        # Get all features in this group from feature_catalog (NaN-safe for lag)
        mask = (
            (fc["domain"] == domain) &
            (fc["mechanism_group"] == mechanism) &
            (fc["structure_signature"] == sig) &
            (fc["has_lag"] == has_lag) &
            (
                (fc["lag_weeks"].isna() & pd.isna(lag_weeks)) |
                (fc["lag_weeks"] == lag_weeks)
            )
        )
        feats = fc.loc[mask, "feature"].tolist()

        # If group missing (e.g. non-numeric), skip
        feats = [f for f in feats if f in data.columns and np.issubdtype(data[f].dtype, np.number)]
        if len(feats) < 2:
            # keep singletons
            for f in feats:
                selected_features.add(f)
                mapping_rows.append({"group_key": gkey, "feature": f, "action": "KEEP_SINGLE", "representative": f, "index_name": None, "cluster_id": None})
            continue

        # Get group corr matrix if available
        mats = covariate_matrices.get(gkey, None)
        corr = None
        clusters = [sorted(feats)]
        if mats is not None and "corr_pearson" in mats:
            corr = mats["corr_pearson"].loc[feats, feats]
            corr_abs = corr.abs()
            clusters = corr_connected_components(corr_abs, threshold=corr_cluster_threshold)

        # Redundancy signals from inventory
        pairs90 = int(gr.get("n_pairs_flagged_ge_0p90", 0))
        vif_max = gr.get("vif_max", np.nan)
        n_vif_ge_10 = int(gr.get("n_vif_ge_10", 0))
        frac_vif_ge_10 = (n_vif_ge_10 / len(feats)) if len(feats) else 0.0

        # Decide group-level strategy
        is_explosion = (pairs90 >= PAIR_REDUNDANCY_TRIGGER) or (frac_vif_ge_10 >= VIF_REDUNDANCY_TRIGGER)
        is_simplex_share = (domain == "mix_share" and mechanism in {"gc_share", "sales_share"} and sig in {"category", "daypart", "channel"})

        decision = "KEEP"
        rationale = []

        if is_simplex_share:
            decision = "SIMPLEX_KEEP_SUBSET"
            rationale.append("Share simplex detected (likely sum-to-1); avoid full set to prevent multi-way collinearity.")
        elif (domain, mechanism) in AUTO_COLLAPSE_IF_ANY and is_explosion:
            decision = "COLLAPSE_TO_INDEX"
            rationale.append("High redundancy (pairs>=0.90 and/or many VIF>=10) in promo mechanism; collapse to interpretable index.")
        elif is_explosion:
            decision = "COLLAPSE_OR_REP"
            rationale.append("High redundancy detected; either collapse to index or keep 1-2 reps per cluster.")
        else:
            decision = "KEEP_WITHIN_CLUSTERS"
            rationale.append("No strong redundancy; keep features (or cluster reps if needed).")

        # Apply decision
        if decision == "SIMPLEX_KEEP_SUBSET":
            keep = simplex_keep_policy(feats, sig)
            keep_set = set(keep)
            for f in feats:
                if f in keep_set:
                    selected_features.add(f)
                    mapping_rows.append({"group_key": gkey, "feature": f, "action": "KEEP_SIMPLEX", "representative": f, "index_name": None, "cluster_id": None})
                else:
                    mapping_rows.append({"group_key": gkey, "feature": f, "action": "DROP_SIMPLEX_RESIDUAL", "representative": None, "index_name": None, "cluster_id": None})

        elif decision in {"COLLAPSE_TO_INDEX"}:
            # One index per group (exec-friendly), built from all features in group
            index_name = f"{domain}__{mechanism}__{sig}__{'lag'+str(int(lag_weeks)) if has_lag else 'nolag'}__idx"
            idx_series = weighted_index(data, feats, data, index_name=index_name)
            data[index_name] = idx_series.astype("float32")

            selected_features.add(index_name)
            collapse_definitions[index_name] = {
                "type": "weighted_mean_index",
                "domain": domain,
                "mechanism_group": mechanism,
                "structure_signature": sig,
                "has_lag": bool(has_lag),
                "lag_weeks": None if pd.isna(lag_weeks) else float(lag_weeks),
                "inputs": feats,
                "weighting": "auto (gc_share_total for channel/daypart; sales_share_total for category; product for multi-dim; fallback mean)",
            }
            for f in feats:
                mapping_rows.append({"group_key": gkey, "feature": f, "action": "COLLAPSE_TO_INDEX", "representative": None, "index_name": index_name, "cluster_id": "ALL"})

        else:
            # KEEP_WITHIN_CLUSTERS or COLLAPSE_OR_REP:
            # Use corr clusters; for each cluster, either keep rep or (if cluster large) collapse that cluster to an index.
            for ci, cl in enumerate(clusters):
                # Determine cluster redundancy via VIF/corr
                cl_size = len(cl)
                # If cluster is big OR contains high VIF features, collapse that cluster; else keep representative
                high_vif_in_cluster = any(vif_lookup.get((gkey, f), 0.0) >= VIF_HIGH for f in cl)
                should_collapse_cluster = (decision == "COLLAPSE_OR_REP" and (cl_size >= 6 or high_vif_in_cluster))

                if should_collapse_cluster:
                    index_name = f"{domain}__{mechanism}__{sig}__c{ci}__{'lag'+str(int(lag_weeks)) if has_lag else 'nolag'}__idx"
                    idx_series = weighted_index(data, cl, data, index_name=index_name)
                    data[index_name] = idx_series.astype("float32")
                    selected_features.add(index_name)
                    collapse_definitions[index_name] = {
                        "type": "weighted_mean_index",
                        "domain": domain,
                        "mechanism_group": mechanism,
                        "structure_signature": sig,
                        "cluster_id": ci,
                        "has_lag": bool(has_lag),
                        "lag_weeks": None if pd.isna(lag_weeks) else float(lag_weeks),
                        "inputs": cl,
                        "weighting": "auto (see above)",
                    }
                    for f in cl:
                        mapping_rows.append({"group_key": gkey, "feature": f, "action": "COLLAPSE_CLUSTER_TO_INDEX", "representative": None, "index_name": index_name, "cluster_id": ci})
                else:
                    rep = choose_representative(cl, fc)
                    selected_features.add(rep)
                    for f in cl:
                        action = "KEEP_REP" if f == rep else "MAP_TO_REP"
                        mapping_rows.append({"group_key": gkey, "feature": f, "action": action, "representative": rep, "index_name": None, "cluster_id": ci})

        decision_rows.append({
            "group_key": gkey,
            "domain": domain,
            "mechanism_group": mechanism,
            "structure_signature": sig,
            "has_lag": bool(has_lag),
            "lag_weeks": None if pd.isna(lag_weeks) else float(lag_weeks),
            "n_features": len(feats),
            "pairs_ge_0p90": pairs90,
            "vif_max": float(vif_max) if pd.notna(vif_max) else np.nan,
            "n_vif_ge_10": n_vif_ge_10,
            "decision": decision,
            "rationale": " | ".join(rationale),
            "n_clusters": len(clusters),
        })

    mapping_table = pd.DataFrame(mapping_rows)
    group_decisions = pd.DataFrame(decision_rows)

    # Also always keep IDs
    for id_col in ["restaurant_id", "week_start"]:
        if id_col in data.columns:
            selected_features.add(id_col)

    return sorted(selected_features), collapse_definitions, mapping_table, group_decisions


# -------------------------
# Run Step E
# -------------------------
selected_features, collapse_definitions, mapping_table, group_decisions = step_e_collapse(
    data=data,
    feature_catalog=feature_catalog,
    group_inventory=group_inventory,
    covariate_matrices=covariate_matrices,
    vif_results=vif_results,
    corr_cluster_threshold=CORR_CLUSTER_THRESHOLD,
)

# Quick outputs
print("Selected feature count:", len(selected_features))
print("Number of indices created:", len(collapse_definitions))
group_decisions.head(20), mapping_table.head(20)


Selected feature count: 236
Number of indices created: 7


(                                                 group_key     domain  \
 0       (mix_share, gc_share, daypart+channel, False, nan)  mix_share   
 1   (mix_share, sales_share, channel+category, False, nan)  mix_share   
 2      (mix_share, gc_share, channel+category, False, nan)  mix_share   
 3       (promo, promo_width, daypart+category, False, nan)      promo   
 4       (promo, promo_depth, daypart+category, False, nan)      promo   
 5           (mix_share, sales_share, category, False, nan)  mix_share   
 6   (mix_share, sales_share, daypart+category, False, nan)  mix_share   
 7                (promo, promo_width, daypart, False, nan)      promo   
 8                (promo, promo_depth, daypart, False, nan)      promo   
 9               (mix_share, gc_share, daypart, False, nan)  mix_share   
 10              (mix_share, gc_share, channel, False, nan)  mix_share   
 11                  (promo, promo_depth, none, False, nan)      promo   
 12               (promo, promo_depth,

In [37]:
selected_features

['avg_accuracy_rating_delivery',
 'avg_accuracy_rating_front_counter',
 'avg_accuracy_rating_kiosk',
 'avg_accuracy_rating_mccafe',
 'avg_accuracy_rating_mobile',
 'avg_accuracy_rating_other',
 'avg_fast_rating_delivery',
 'avg_fast_rating_front_counter',
 'avg_fast_rating_mccafe',
 'avg_fast_rating_mobile',
 'avg_overall_rating_delivery',
 'avg_overall_rating_front_counter',
 'avg_overall_rating_kiosk',
 'avg_overall_rating_mccafe',
 'avg_overall_rating_mobile',
 'avg_overall_rating_other',
 'beef_delivery_sales_share',
 'beef_drive_thru_sales_share',
 'beef_front_counter_sales_share',
 'beef_kiosk_sales_share',
 'beef_mccafe_sales_share',
 'beef_mobile_sales_share',
 'beef_other_sales_share',
 'beef_sales_share_total',
 'breakfast_beef_promo_depth',
 'breakfast_beef_sales_share',
 'breakfast_chicken_promo_depth',
 'breakfast_chicken_sales_share',
 'breakfast_delivery_gc_share',
 'breakfast_delivery_promo_depth',
 'breakfast_dessert_promo_depth',
 'breakfast_dessert_sales_share',
 'br

In [38]:
collapse_definitions

{'promo__promo_width__daypart+category__nolag__idx': {'type': 'weighted_mean_index',
  'domain': 'promo',
  'mechanism_group': 'promo_width',
  'structure_signature': 'daypart+category',
  'has_lag': False,
  'lag_weeks': None,
  'inputs': ['breakfast_beef_promo_width',
   'breakfast_chicken_promo_width',
   'breakfast_dessert_promo_width',
   'breakfast_drink_promo_width',
   'dinner_beef_promo_width',
   'dinner_chicken_promo_width',
   'dinner_dessert_promo_width',
   'dinner_drink_promo_width',
   'lunch_beef_promo_width',
   'lunch_chicken_promo_width',
   'lunch_dessert_promo_width',
   'lunch_drink_promo_width',
   'snack_beef_promo_width',
   'snack_chicken_promo_width',
   'snack_dessert_promo_width',
   'snack_drink_promo_width'],
  'weighting': 'auto (gc_share_total for channel/daypart; sales_share_total for category; product for multi-dim; fallback mean)'},
 'promo__promo_width__daypart__nolag__idx': {'type': 'weighted_mean_index',
  'domain': 'promo',
  'mechanism_group': 

In [39]:
group_decisions

Unnamed: 0,group_key,domain,mechanism_group,structure_signature,has_lag,lag_weeks,n_features,pairs_ge_0p90,vif_max,n_vif_ge_10,decision,rationale,n_clusters
0,"(mix_share, gc_share, daypart+channel, False, nan)",mix_share,gc_share,daypart+channel,False,,20,0,4.937222,0,KEEP_WITHIN_CLUSTERS,No strong redundancy; keep features (or cluster reps if needed).,20
1,"(mix_share, sales_share, channel+category, False, nan)",mix_share,sales_share,channel+category,False,,20,0,3.787306,0,KEEP_WITHIN_CLUSTERS,No strong redundancy; keep features (or cluster reps if needed).,20
2,"(mix_share, gc_share, channel+category, False, nan)",mix_share,gc_share,channel+category,False,,20,0,3.928634,0,KEEP_WITHIN_CLUSTERS,No strong redundancy; keep features (or cluster reps if needed).,20
3,"(promo, promo_width, daypart+category, False, nan)",promo,promo_width,daypart+category,False,,16,120,12.99563,16,COLLAPSE_TO_INDEX,High redundancy (pairs>=0.90 and/or many VIF>=10) in promo mechanism; collapse to interpretable index.,1
4,"(promo, promo_depth, daypart+category, False, nan)",promo,promo_depth,daypart+category,False,,16,0,7.315831,0,KEEP_WITHIN_CLUSTERS,No strong redundancy; keep features (or cluster reps if needed).,16
5,"(mix_share, sales_share, category, False, nan)",mix_share,sales_share,category,False,,16,0,31893480.0,4,SIMPLEX_KEEP_SUBSET,Share simplex detected (likely sum-to-1); avoid full set to prevent multi-way collinearity.,16
6,"(mix_share, sales_share, daypart+category, False, nan)",mix_share,sales_share,daypart+category,False,,16,0,4.645561,0,KEEP_WITHIN_CLUSTERS,No strong redundancy; keep features (or cluster reps if needed).,16
7,"(promo, promo_width, daypart, False, nan)",promo,promo_width,daypart,False,,12,66,23.422,12,COLLAPSE_TO_INDEX,High redundancy (pairs>=0.90 and/or many VIF>=10) in promo mechanism; collapse to interpretable index.,1
8,"(promo, promo_depth, daypart, False, nan)",promo,promo_depth,daypart,False,,12,8,12.94001,4,KEEP_WITHIN_CLUSTERS,No strong redundancy; keep features (or cluster reps if needed).,4
9,"(mix_share, gc_share, daypart, False, nan)",mix_share,gc_share,daypart,False,,12,0,7.415176,0,SIMPLEX_KEEP_SUBSET,Share simplex detected (likely sum-to-1); avoid full set to prevent multi-way collinearity.,12


### Domain & mechanism feature budgeting

In [42]:
import numpy as np
import pandas as pd

# =========================================================
# Step F: Domain & mechanism feature budgeting (target-free)
# =========================================================
# Assumes you already have from previous steps:
#   - data (DataFrame)
#   - feature_catalog (DataFrame)
#   - selected_features (list[str]) from Step E
#   - collapse_definitions (dict) from Step E (index_name -> recipe)
#   - mapping_table (DataFrame) from Step E
#   - group_decisions (DataFrame) from Step E  (corr+vif summary per group)
#
# Outputs:
#   - final_features (list[str]) capped to your budgets
#   - budget_report (DataFrame) summary counts by domain/mechanism/type
#   - feature_budget_table (DataFrame) transparent scoring + reasons
# =========================================================


# -------------------------
# 1) Set your budgets here
# -------------------------
DOMAIN_BUDGETS = {
    "promo": 12,
    "media": 10,
    "mix_share": 35,   # usually biggest; tune down for ~80 total
    "csat": 8,
    "pricing": 6,
    "menu": 8,
    "ops": 6,
    "other": 5,
    "id": 2,           # restaurant_id, week_start
}

# Optional: enforce caps inside domain
MECHANISM_BUDGETS = {
    # Examples (uncomment / adjust):
    # ("promo", "promo_width"): 4,
    # ("promo", "promo_depth"): 4,
    # ("mix_share", "gc_share"): 18,
    # ("mix_share", "sales_share"): 18,
}

ALWAYS_KEEP = {"restaurant_id", "week_start"}  # IDs always kept


# -------------------------
# 2) Utilities
# -------------------------
def safe_domain(x: str) -> str:
    return x if x in DOMAIN_BUDGETS else "other"

def get_domain_mech(feature: str) -> tuple[str, str]:
    """Prefer collapse_definitions for indices; else use feature_catalog."""
    if feature in collapse_definitions:
        d = collapse_definitions[feature].get("domain", "other")
        m = collapse_definitions[feature].get("mechanism_group", "other")
        return safe_domain(d), m
    if feature in set(feature_catalog["feature"]):
        row = feature_catalog.loc[feature_catalog["feature"] == feature].iloc[0]
        d = row.get("domain", "other")
        m = row.get("mechanism_group", "other")
        return safe_domain(d), m
    return "other", "other"

def get_group_key_for_feature(feature: str):
    """Get a group_key for scoring diagnostics. If multiple, take first."""
    if feature in collapse_definitions:
        # Synthetic key for indices (good enough for scoring)
        r = collapse_definitions[feature]
        return (
            r.get("domain"),
            r.get("mechanism_group"),
            r.get("structure_signature"),
            r.get("has_lag"),
            r.get("lag_weeks"),
        )
    m = mapping_table[mapping_table["feature"] == feature]
    if not m.empty:
        return m.iloc[0]["group_key"]
    return None

def build_group_diag_lookup(group_decisions: pd.DataFrame) -> dict:
    """group_key -> diagnostics used for scoring."""
    diag = {}
    if group_decisions is None or len(group_decisions) == 0:
        return diag
    for _, r in group_decisions.iterrows():
        diag[r["group_key"]] = {
            "pairs_ge_0p90": float(r.get("pairs_ge_0p90", r.get("n_pairs_flagged_ge_0p90", 0)) or 0),
            "vif_max": float(r.get("vif_max", np.nan)),
            "n_features": float(r.get("n_features", np.nan)),
        }
    return diag

GROUP_DIAG = build_group_diag_lookup(group_decisions)

def score_feature(feature: str, ftype: str, group_key):
    """
    Target-free scoring:
      - indices highest (already collapsed redundancy)
      - reps next
      - raw next
      - add bonus if originating group was highly redundant (corr/VIF)
      - small penalty for very wide groups to encourage diversity across mechanisms
    """
    base = {"id": 1_000, "index": 100, "rep": 80, "raw": 60}.get(ftype, 50)

    bonus = 0
    penalty = 0
    if group_key in GROUP_DIAG:
        d = GROUP_DIAG[group_key]
        pairs90 = d["pairs_ge_0p90"]
        vif_max = d["vif_max"]
        width = d["n_features"]

        # redundancy bonus
        if pairs90 >= 50: bonus += 8
        elif pairs90 >= 20: bonus += 6
        elif pairs90 >= 10: bonus += 4
        elif pairs90 >= 5: bonus += 2

        if np.isfinite(vif_max):
            if vif_max >= 30: bonus += 8
            elif vif_max >= 10: bonus += 5
            elif vif_max >= 5: bonus += 2

        # width penalty (avoid a single explosion consuming budget)
        if np.isfinite(width):
            if width >= 30: penalty += 6
            elif width >= 20: penalty += 4
            elif width >= 12: penalty += 2

    # small “canonical” name bonus
    name_bonus = 0
    if feature.startswith("ttl_"): name_bonus += 2
    if "overall" in feature: name_bonus += 1
    if "method1" in feature: name_bonus += 1

    return base + bonus + name_bonus - penalty


# -------------------------
# 3) Build a transparent feature table
# -------------------------
sel = set(selected_features)

index_features = set(collapse_definitions.keys()) & sel
rep_features = set(mapping_table["representative"].dropna().unique()) & sel
rep_features = rep_features - index_features
id_features = (ALWAYS_KEEP & set(data.columns)) if "data" in globals() else ALWAYS_KEEP
raw_features = (sel - index_features - rep_features) - set(id_features)

rows = []

# IDs
for f in sorted(id_features):
    d, m = ("id", "id")
    rows.append({
        "feature": f, "domain": d, "mechanism_group": m,
        "type": "id", "group_key": None,
        "score": score_feature(f, "id", None),
        "reason": "ALWAYS_KEEP"
    })

# Indices
for f in sorted(index_features):
    d, m = get_domain_mech(f)
    gk = get_group_key_for_feature(f)
    rows.append({
        "feature": f, "domain": d, "mechanism_group": m,
        "type": "index", "group_key": gk,
        "score": score_feature(f, "index", gk),
        "reason": "INDEX_CREATED"
    })

# Representatives
for f in sorted(rep_features):
    d, m = get_domain_mech(f)
    gk = get_group_key_for_feature(f)
    rows.append({
        "feature": f, "domain": d, "mechanism_group": m,
        "type": "rep", "group_key": gk,
        "score": score_feature(f, "rep", gk),
        "reason": "REPRESENTATIVE"
    })

# Raw
for f in sorted(raw_features):
    d, m = get_domain_mech(f)
    gk = get_group_key_for_feature(f)
    rows.append({
        "feature": f, "domain": d, "mechanism_group": m,
        "type": "raw", "group_key": gk,
        "score": score_feature(f, "raw", gk),
        "reason": "RAW_SELECTED"
    })

feature_budget_table = pd.DataFrame(rows)

# Normalize domain labels
feature_budget_table["domain"] = feature_budget_table["domain"].map(safe_domain)

# -------------------------
# 4) Budgeted selection engine
# -------------------------
def apply_budgets(df: pd.DataFrame, domain_budgets: dict, mech_budgets: dict) -> list[str]:
    """
    Greedy:
      1) keep IDs
      2) keep all indices (by default)
      3) fill remaining by score subject to domain & optional mechanism caps
    """
    chosen = set(df[df["type"] == "id"]["feature"].tolist())

    # keep indices first
    for f in df[df["type"] == "index"].sort_values("score", ascending=False)["feature"]:
        chosen.add(f)

    # counts so far
    chosen_df = df[df["feature"].isin(chosen)]
    domain_counts = chosen_df.groupby("domain").size().to_dict()
    mech_counts = chosen_df.groupby(["domain", "mechanism_group"]).size().to_dict()

    def domain_cap(domain):
        return domain_budgets.get(domain, domain_budgets.get("other", 0))

    def mech_cap(domain, mech):
        return mech_budgets.get((domain, mech), None)

    # candidates: reps then raw, sorted by score
    candidates = df[~df["feature"].isin(chosen)].copy()
    candidates = candidates.sort_values("score", ascending=False)

    for _, r in candidates.iterrows():
        f = r["feature"]
        d = safe_domain(r["domain"])
        m = r["mechanism_group"]

        # domain budget check
        cap_d = domain_cap(d)
        if cap_d is not None and domain_counts.get(d, 0) >= cap_d:
            continue

        # mechanism budget check
        cap_m = mech_cap(d, m)
        if cap_m is not None and mech_counts.get((d, m), 0) >= cap_m:
            continue

        chosen.add(f)
        domain_counts[d] = domain_counts.get(d, 0) + 1
        mech_counts[(d, m)] = mech_counts.get((d, m), 0) + 1

    # return sorted for stability (keep high score first for readability)
    out = df[df["feature"].isin(chosen)].sort_values(
        ["type", "domain", "mechanism_group", "score"],
        ascending=[False, True, True, False]
    )["feature"].tolist()
    return out

final_features = apply_budgets(feature_budget_table, DOMAIN_BUDGETS, MECHANISM_BUDGETS)

# -------------------------
# 5) Reporting
# -------------------------
budget_report = (
    feature_budget_table[feature_budget_table["feature"].isin(final_features)]
    .groupby(["domain", "mechanism_group", "type"])
    .size()
    .reset_index(name="n_selected")
    .sort_values(["domain", "n_selected"], ascending=[True, False])
    .reset_index(drop=True)
)

domain_counts = (
    feature_budget_table[feature_budget_table["feature"].isin(final_features)]
    .groupby("domain")
    .size()
    .sort_values(ascending=False)
)

print("Selected before budgeting:", len(selected_features))
print("Selected after budgeting:", len(final_features))
print("\nDomain counts after budgeting:")
print(domain_counts)

# Optional: inspect what got selected for a domain
# feature_budget_table[feature_budget_table["feature"].isin(final_features) & (feature_budget_table["domain"]=="mix_share")].head(50)

budget_report.head(40)


Selected before budgeting: 236
Selected after budgeting: 81

Domain counts after budgeting:
domain
mix_share    35
promo        12
media        10
menu          8
csat          8
pricing       6
id            2
dtype: int64


Unnamed: 0,domain,mechanism_group,type,n_selected
0,csat,rating_overall,rep,6
1,csat,rating_accuracy,rep,1
2,csat,rating_fast,rep,1
3,id,id,id,2
4,media,campaign_subcategory_volume,rep,4
5,media,campaign_partner_volume,rep,3
6,media,media_spend,rep,3
7,menu,lto_count_by_slice,rep,4
8,menu,promo_item_count,rep,4
9,mix_share,gc_share,rep,21


In [None]:
feature_budget_table[feature_budget_table["feature"].isin(final_features) & (feature_budget_table["domain"]=="mix_share")].head(25)

Unnamed: 0,feature,domain,mechanism_group,type,group_key,score,reason
26,beef_drive_thru_sales_share,mix_share,sales_share,rep,"(mix_share, sales_share, category, False, nan)",86,REPRESENTATIVE
27,beef_front_counter_sales_share,mix_share,sales_share,rep,"(mix_share, sales_share, category, False, nan)",86,REPRESENTATIVE
32,beef_sales_share_total,mix_share,sales_share,rep,"(mix_share, sales_share, category, False, nan)",86,REPRESENTATIVE
43,breakfast_drive_thru_gc_share,mix_share,gc_share,rep,"(mix_share, gc_share, daypart, False, nan)",80,REPRESENTATIVE
45,breakfast_front_counter_gc_share,mix_share,gc_share,rep,"(mix_share, gc_share, daypart, False, nan)",80,REPRESENTATIVE
46,breakfast_gc_share_total,mix_share,gc_share,rep,"(mix_share, gc_share, daypart, False, nan)",80,REPRESENTATIVE
49,breakfast_gc_share_total_8wk_lag,mix_share,gc_share,rep,"(mix_share, gc_share, daypart, True, 8.0)",80,REPRESENTATIVE
64,chicken_drive_thru_sales_share,mix_share,sales_share,rep,"(mix_share, sales_share, category, False, nan)",86,REPRESENTATIVE
65,chicken_front_counter_sales_share,mix_share,sales_share,rep,"(mix_share, sales_share, category, False, nan)",86,REPRESENTATIVE
70,chicken_sales_share_total,mix_share,sales_share,rep,"(mix_share, sales_share, category, False, nan)",86,REPRESENTATIVE
