In [2]:
# --- Parámetros (se pueden sobreescribir en CI) ---
from pathlib import Path
from datetime import datetime
import os
import pandas as pd
import pytz

# Zona horaria para "hoy"
TZ = pytz.timezone("Europe/Madrid")

def _today_tz(tz=TZ) -> str:
    return datetime.now(tz).date().strftime("%Y-%m-%d")

# RUN_DATE: prioridad -> valor ya definido (papermill/globals) -> env -> hoy (Europe/Madrid)
_run_injected = globals().get("RUN_DATE", None)
if _run_injected not in (None, "", "auto", "today"):
    RUN_DATE = str(_run_injected)
else:
    RUN_DATE = os.environ.get("RUN_DATE", _today_tz())

# Normaliza a YYYY-MM-DD
RUN_DATE = pd.to_datetime(RUN_DATE, errors="coerce").date().strftime("%Y-%m-%d")

# SEASON: si no viene dada, se calcula a partir de RUN_DATE (formato 2025_26)
if "SEASON" in globals() and globals()["SEASON"]:
    SEASON = globals()["SEASON"]
else:
    _dt = pd.to_datetime(RUN_DATE)
    _y = int(_dt.year) if _dt.month >= 7 else int(_dt.year) - 1
    SEASON = f"{_y}_{(_y+1) % 100:02d}"

# MATCHDAY (jornada): permite inyección externa; por defecto None
MATCHDAY = globals().get("MATCHDAY", os.environ.get("MATCHDAY", None))

# Versión de modelo: respeta inyección / env, si no usa por defecto
MODEL_VERSION = globals().get("MODEL_VERSION", os.environ.get("MODEL_VERSION", "xgb-local"))

# --- Rutas coherentes local/CI ---
ROOT   = Path.cwd()
DATA   = ROOT / "data"
RAW    = DATA / "01_raw"
PROC   = DATA / "02_processed"
FEAT   = DATA / "03_features"
MODELS = DATA / "04_models"
OUT    = ROOT / "outputs"

for p in [RAW, PROC, FEAT, MODELS, OUT]:
    p.mkdir(parents=True, exist_ok=True)

# Reproducibilidad
import random, numpy as np
random.seed(42); np.random.seed(42)

print(f"RUN_DATE = {RUN_DATE} | SEASON = {SEASON} | MATCHDAY = {MATCHDAY} | MODEL_VERSION = {MODEL_VERSION}")
print(f"ROOT = {ROOT}")

RUN_DATE = 2025-11-06 | SEASON = 2025_26 | MATCHDAY = None | MODEL_VERSION = xgb-local
ROOT = /content


In [3]:
import pandas as pd, json

def load_feat(name: str):
    return pd.read_parquet(FEAT / name)

def save_model(obj, name: str):
    from joblib import dump
    MODELS.mkdir(parents=True, exist_ok=True)
    dump(obj, MODELS / name)

def save_predictions(df: pd.DataFrame, name: str = "predictions_next.csv"):
    OUT.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUT / name, index=False)

def save_json(obj, name: str = "metrics_overview.json"):
    OUT.mkdir(parents=True, exist_ok=True)
    with open(OUT / name, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

# **MODELOS**

In [4]:
import json
from collections import defaultdict
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
import time
import hashlib
import re

# **EL MODELO**

In [4]:
IN_PATH = FEAT / "df_final.parquet"
df = pd.read_parquet(IN_PATH)

print("Leído:", IN_PATH, "· filas=", len(df), "· cols=", df.shape[1])
df.head(2)

Leído: /content/data/03_features/df_final.parquet · filas= 7340 · cols= 121


Unnamed: 0,B365A,B365D,B365H,Date,FTR,HomeTeam_norm,AwayTeam_norm,h_elo,a_elo,Season,...,form_gd_6_diff,effectiveness_diff,relative_perf_diff,target,home_playstyle_defensivo,home_playstyle_equilibrado,home_playstyle_ofensivo,away_playstyle_defensivo,away_playstyle_equilibrado,away_playstyle_ofensivo
0,6.0,3.6,1.57,2006-08-26,H,valencia,betis,1857.375122,1726.076904,2006,...,0.0,0.0,0.05756,2.0,False,False,True,True,False,False
1,4.33,3.3,1.83,2006-08-27,A,osasuna,getafe,1756.190308,1762.177246,2006,...,0.0,0.0,0.02968,0.0,False,True,False,False,True,False


In [5]:
# Logit de mercado (home vs away)
df['market_home_logit'] = np.log((df['pimp1'] + 1e-9) / (df['pimp2'] + 1e-9))
df['market_draw_logit'] = np.log((df['pimpx'] + 1e-9) / ((df['pimp1'] + df['pimp2'])/2 + 1e-9))

# Diferencial de Elo
df['elo_diff'] = df['h_elo'] - df['a_elo']

In [1]:
FEATURES_S0   = ['pimp1', 'pimpx', 'pimp2']
FEATURES_S0p  = FEATURES_S0 + ['elo_diff']

FEATURES_S1 = ['pimp1','pimpx','pimp2','relative_perf_diff']
FEATURES_S1p = FEATURES_S1 + ['elo_diff']

FEATURES_S2 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff']
FEATURES_S2p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','has_xg_data']

FEATURES_S3 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff', 'form_points_6_diff']
FEATURES_S3p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff', 'form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum']

FEATURES_S4 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff', 'form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'prev_position_diff']
FEATURES_S4p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff', 'form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'home_prev_position', 'away_prev_position', 'elo_diff']

FEATURES_S5 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff']
FEATURES_S5p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'h2h_draw_rate_ewm_diff', 'h2h_loss_rate_ewm_diff']

FEATURES_S6 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev']

FEATURES_S7 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff']

FEATURES_S8 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'h2h_draw_rate_ewm_diff', 'h2h_loss_rate_ewm_diff','home_playstyle_equilibrado', 'home_playstyle_ofensivo', 'away_playstyle_defensivo', 'away_playstyle_ofensivo']

FEATURES_S9 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'h2h_draw_rate_ewm_diff', 'h2h_loss_rate_ewm_diff','home_playstyle_equilibrado', 'home_playstyle_ofensivo', 'away_playstyle_defensivo', 'away_playstyle_ofensivo', 'a_elo']

FEATURES_S10 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'h2h_draw_rate_ewm_diff', 'h2h_loss_rate_ewm_diff','home_playstyle_defensivo', 'home_playstyle_ofensivo', 'away_playstyle_defensivo', 'away_playstyle_ofensivo', 'a_elo', 'h2h_draw_rate_roll8_diff']

FEATURES_S11 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff', 'away_playstyle_equilibrado']
FEATURES_S11p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff', 'away_gd_cum']

FEATURES_S12 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff', 'away_playstyle_equilibrado', 'home_prev_big_odds_win_any']

FEATURES_S13 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff', 'away_playstyle_equilibrado', 'home_prev_big_odds_win_any', 'total_gd_cum_diff']



## Sin SMOTE:

In [7]:
# ============================================================
# 1) Unicidad de clave (solo para la clave "humana")
# ============================================================
def enforce_unique_pred_key(df_in, key_col="pred_key"):
    """
    Si hay claves duplicadas en `key_col`, añade '#k' (k=0,1,2,...) por orden estable
    dentro de cada grupo duplicado. Devuelve df modificado y nº de filas afectadas.
    """
    d = df_in.copy()
    base = d[key_col].astype(str)
    grp_sizes = base.map(base.value_counts())
    pos = base.groupby(base).cumcount()
    suffix = np.where(grp_sizes > 1, "#" + pos.astype(str), "")
    d[key_col] = base + suffix
    affected = int((grp_sizes > 1).sum())
    return d, affected

# ============================================================
# 2) Walk-forward con proba_H/D/A y claves estables
#    - Genera pred_key (humana) y pred_key_match (estable)
# ============================================================
def walkforward_multinomial_accuracy(
    df,
    feature_cols,
    date_col='Date',
    label_col='FTR',
    n_seasons_window=4,
    season_size=380,
    recent_weight=3.0,
    older_weight=1.0,
    C=1.0,
    max_iter=1000,
    verbose_every=0  # pon >0 para logs cada N días
):
    """
    Evaluación día a día, añade proba_H/D/A y claves:
      - pred_key        = Season|YYYY-MM-DD|HomeTeam_norm|AwayTeam_norm (legible, puede llevar #k)
      - pred_key_match  = Season|YYYY-MM-DD|home_norm|away_norm         (estable para merges)
    """
    import re

    def _norm_name(s: str) -> str:
        return re.sub(r'[^a-z0-9]+', '', str(s).strip().lower())

    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # Feature derivada opcional
    if 'market_home_logit' in feature_cols and 'market_home_logit' not in df.columns:
        if {'pimp1','pimp2'}.issubset(df.columns):
            df['market_home_logit'] = np.log(
                (pd.to_numeric(df['pimp1'], errors='coerce') + 1e-9) /
                (pd.to_numeric(df['pimp2'], errors='coerce') + 1e-9)
            )
        else:
            raise ValueError("market_home_logit pedido en feature_cols pero faltan pimp1/pimp2 en df.")

    df = df.sort_values(date_col).reset_index(drop=True)
    uniq_dates = df[date_col].sort_values().unique()

    train_window = n_seasons_window * season_size
    recent_block = season_size

    pipe = Pipeline(steps=[
        ('imp', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('logit', LogisticRegression(solver='lbfgs', C=C, max_iter=max_iter))
    ])

    preds_all = []

    for d_i, current_date in enumerate(uniq_dates):
        test_mask = df[date_col] == current_date
        test_idx = np.where(test_mask)[0]
        if test_idx.size == 0:
            continue

        train_mask = df[date_col] < current_date
        train_idx_all = np.where(train_mask)[0]
        if train_idx_all.size < train_window:
            continue
        train_idx = train_idx_all[-train_window:]

        # Pesos
        sample_weight = np.full(train_idx.shape[0], older_weight, dtype=float)
        if recent_block > 0:
            sample_weight[-recent_block:] = recent_weight

        # X, y
        X_train = df.iloc[train_idx][feature_cols]
        y_train = df.iloc[train_idx][label_col]
        X_test  = df.iloc[test_idx][feature_cols]
        y_test  = df.iloc[test_idx][label_col]

        # Entrena y predice
        pipe.fit(X_train, y_train, **{'logit__sample_weight': sample_weight})
        y_pred  = pipe.predict(X_test)
        y_proba = pipe.predict_proba(X_test)

        # --- METADATA para merges (claves estable y legible) ---
        meta = df.iloc[test_idx][['Season','Date','HomeTeam_norm','AwayTeam_norm']].copy()
        meta['_date_key'] = pd.to_datetime(meta['Date'], errors='coerce')\
                               .dt.tz_localize(None, nonexistent='NaT', ambiguous='NaT')\
                               .dt.floor('D')

        home_raw = meta['HomeTeam_norm'].astype(str)
        away_raw = meta['AwayTeam_norm'].astype(str)
        home_norm = home_raw.map(_norm_name)
        away_norm = away_raw.map(_norm_name)

        # Clave legible (se puede forzar unicidad con #k)
        meta['pred_key'] = (
            meta['Season'].astype('Int64').astype(str) + "|" +
            meta['_date_key'].dt.strftime("%Y-%m-%d") + "|" +
            home_raw + "|" +
            away_raw
        )
        # Clave para MERGE estable (NO poner #k)
        meta['pred_key_match'] = (
            meta['Season'].astype('Int64').astype(str) + "|" +
            meta['_date_key'].dt.strftime("%Y-%m-%d") + "|" +
            home_norm + "|" +
            away_norm
        )

        # --- Probabilidades en orden fijo H/D/A ---
        classes = pipe.named_steps['logit'].classes_.astype(str)
        proba_cols_map = {c: y_proba[:, i] for i, c in enumerate(classes)}
        proba_H = proba_cols_map.get('H', np.full(len(test_idx), np.nan))
        proba_D = proba_cols_map.get('D', np.full(len(test_idx), np.nan))
        proba_A = proba_cols_map.get('A', np.full(len(test_idx), np.nan))

        day_res = pd.DataFrame({
            'Date': meta['Date'].values,
            'y_true': y_test.values,
            'y_pred': y_pred,
            'proba_H': proba_H,
            'proba_D': proba_D,
            'proba_A': proba_A
        })

        # etiqueta válida (para accuracy)
        y_true_clean = day_res['y_true'].astype(str).str.upper().str.strip()
        day_res['has_label'] = y_true_clean.isin(['H', 'D', 'A']).astype(int)

        # anexamos Season/Home/Away/pred_keys
        day_res[['Season','HomeTeam_norm','AwayTeam_norm','pred_key']] = \
            meta[['Season','HomeTeam_norm','AwayTeam_norm','pred_key']].values
        day_res['pred_key_match'] = meta['pred_key_match'].values

        preds_all.append(day_res)

        if verbose_every and (d_i % verbose_every == 0):
            mask_lbl = day_res['has_label'] == 1
            if mask_lbl.any():
                acc_day = (day_res.loc[mask_lbl, 'y_true'] == day_res.loc[mask_lbl, 'y_pred']).mean()
                print(f"[{d_i+1}/{len(uniq_dates)}] {str(current_date)[:10]}  "
                      f"test_n={len(day_res)}  scored_n={int(mask_lbl.sum())}  acc={acc_day:.3f}")
            else:
                print(f"[{d_i+1}/{len(uniq_dates)}] {str(current_date)[:10]}  "
                      f"test_n={len(day_res)}  (sin labels válidas)")

    if not preds_all:
        raise RuntimeError("No se generaron predicciones; ¿hay suficientes datos previos para armar ventanas?")

    preds_all = pd.concat(preds_all, ignore_index=True)

    # Unicidad de la clave legible (NO tocar pred_key_match)
    if 'pred_key' in preds_all.columns:
        preds_all, _ = enforce_unique_pred_key(preds_all, key_col='pred_key')

    # Accuracy oficial (sin cuotas)
    scored_mask = preds_all['has_label'] == 1
    if scored_mask.any():
        accuracy = (preds_all.loc[scored_mask, 'y_true'] == preds_all.loc[scored_mask, 'y_pred']).mean()
    else:
        raise RuntimeError("No hay partidos con etiqueta válida para calcular accuracy.")

    return float(accuracy), preds_all

In [8]:
# ============================================================
# 3) Elige features y parámetros WF (ajusta a tu proyecto)
# ============================================================
FEATURES = FEATURES_S13

WF_KWARGS = dict(
    n_seasons_window=4,
    season_size=380,
    recent_weight=3.0,
    older_weight=1.0,
    C=1.0,
    max_iter=1000,
    verbose_every=0
)

acc_global_oficial, preds = walkforward_multinomial_accuracy(
    df,
    feature_cols=FEATURES,
    **WF_KWARGS
)

In [9]:
# ============================================================
# 4) Alineación ROBUSTA: por pred_key_match y fallback por (Date,row_in_date)
#    Sustituye a align_preds_by_date_order_and_build_predkey
# ============================================================
def align_preds_by_key_then_fallback(preds: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    1) Asegura que preds y df tengan 'pred_key_match' (Season|YYYY-MM-DD|home_norm|away_norm).
    2) MERGE por 'pred_key_match' para traer Season/Home/Away/B365/pimp*.
    3) Fallback SOLO para filas sin casar: (Date,row_in_date) con orden estable.
    4) Reconstruye 'pred_key' (humana) y fuerza unicidad SOLO en 'pred_key'.
    """
    import re

    def _norm_name(s: str) -> str:
        return re.sub(r'[^a-z0-9]+', '', str(s).strip().lower())

    def _ensure_keys(x: pd.DataFrame) -> pd.DataFrame:
        x = x.copy()
        x['Date'] = pd.to_datetime(x['Date'], errors='coerce')
        date_key = x['Date'].dt.tz_localize(None, nonexistent='NaT', ambiguous='NaT').dt.floor('D')
        # inferir columnas de equipos
        home_col = next((c for c in ['HomeTeam_norm','HomeTeam','home_team','Home'] if c in x.columns), None)
        away_col = next((c for c in ['AwayTeam_norm','AwayTeam','away_team','Away'] if c in x.columns), None)
        if home_col is None or away_col is None:
            raise KeyError("No se hallaron columnas Home/Away para construir la clave.")

        home_raw = x[home_col].astype(str)
        away_raw = x[away_col].astype(str)
        home_norm = home_raw.map(_norm_name)
        away_norm = away_raw.map(_norm_name)

        if 'Season' not in x.columns:
            x['Season'] = pd.NA

        x['pred_key'] = (
            x['Season'].astype('Int64').astype(str) + "|" +
            date_key.dt.strftime("%Y-%m-%d") + "|" +
            home_raw + "|" +
            away_raw
        )
        x['pred_key_match'] = (
            x['Season'].astype('Int64').astype(str) + "|" +
            date_key.dt.strftime("%Y-%m-%d") + "|" +
            home_norm + "|" +
            away_norm
        )
        return x

    # 1) Claves en ambos
    p = _ensure_keys(preds)
    d = _ensure_keys(df)

    # 2) Merge por clave estable
    need_cols = [
        'Season','HomeTeam_norm','AwayTeam_norm',
        'B365H','B365D','B365A','pimp1','pimpx','pimp2'
    ]
    take = ['pred_key_match'] + [c for c in need_cols if c in d.columns]
    m = p.merge(d[take].drop_duplicates('pred_key_match'),
                on='pred_key_match', how='left', suffixes=('', '_from_df'))

    # Si falta Season tras el merge, toma Season_from_df
    if 'Season_from_df' in m.columns:
        if 'Season' in m.columns:
            m['Season'] = m['Season'].fillna(m['Season_from_df'])
        else:
            m['Season'] = m['Season_from_df']
        m.drop(columns=['Season_from_df'], inplace=True)

    # 3) Fallback por (Date,row_in_date) SOLO para los que no consiguieron cuotas ni pimps
    need_any = ['B365H','B365D','B365A','pimp1','pimpx','pimp2']
    missing = np.ones(len(m), dtype=bool)
    for c in [c for c in need_any if c in m.columns]:
        missing &= m[c].isna().to_numpy()

    if missing.any():
        p2 = p.sort_values('Date', kind='mergesort').reset_index(drop=True)
        p2['row_in_date'] = p2.groupby(p2['Date']).cumcount()
        d2 = d.sort_values('Date', kind='mergesort').reset_index(drop=True)
        d2['row_in_date'] = d2.groupby(d2['Date']).cumcount()

        fb_cols = ['Date','row_in_date'] + [c for c in need_any if c in d2.columns]
        fb = p2[['Date','row_in_date']].merge(d2[fb_cols], on=['Date','row_in_date'], how='left')

        for c in [c for c in need_any if c in m.columns and c in fb.columns]:
            m.loc[missing, c] = m.loc[missing, c].where(m.loc[missing, c].notna(), fb.loc[missing, c])

    # 4) Rehacer pred_key legible (puede haber cambiado Season)
    date_key = pd.to_datetime(m['Date'], errors='coerce').dt.tz_localize(None).dt.floor('D')
    m['pred_key'] = (
        m['Season'].astype('Int64').astype(str) + "|" +
        date_key.dt.strftime("%Y-%m-%d") + "|" +
        m['HomeTeam_norm'].astype(str) + "|" +
        m['AwayTeam_norm'].astype(str)
    )

    m, _ = enforce_unique_pred_key(m, key_col='pred_key')
    return m

# ============================================================
# 5) Ejecutar alineación robusta y chequeos de sanidad
# ============================================================
merged = align_preds_by_key_then_fallback(preds, df)

def sanity_checks(m: pd.DataFrame):
    # 1) Filas sin cuotas tras merge
    miss_odds = m[['B365H','B365D','B365A']].isna().any(axis=1).sum() if \
        {'B365H','B365D','B365A'}.issubset(m.columns) else None
    print(f"[CHECK] Filas sin cuotas: {miss_odds}")

    # 2) Duplicados en clave estable (no deberían)
    dup_match = m['pred_key_match'].duplicated(keep=False).sum() if 'pred_key_match' in m.columns else 0
    print(f"[CHECK] Duplicados pred_key_match: {dup_match}")

    # 3) Overround razonable
    if {'B365H','B365D','B365A'}.issubset(m.columns):
        inv = 1.0/np.clip(pd.to_numeric(m['B365H'], errors='coerce'),1.0,None) + \
              1.0/np.clip(pd.to_numeric(m['B365D'], errors='coerce'),1.0,None) + \
              1.0/np.clip(pd.to_numeric(m['B365A'], errors='coerce'),1.0,None)
        bad = inv[(inv<1.0) | (inv>1.3)].count()
        print(f"[CHECK] Overround fuera de [1.00, 1.30]: {bad}")

sanity_checks(merged)

# ============================================================
# 6) Accuracy & ROI ENTRE APUESTAS (usa merged ya alineado)
# ============================================================
def compute_accuracy_roi(merged_df, pred_col='y_pred'):
    """
    Accuracy & ROI ENTRE APUESTAS (solo filas con label H/D/A y cuota válida >= 1.01).
    No modifica el accuracy oficial de walkforward (que no depende de cuotas).
    """
    m = merged_df.copy()
    n = len(m)

    y_true_arr = m['y_true'].astype(str).str.upper().str.strip().to_numpy()
    pred_arr   = m[pred_col].astype(str).str.upper().str.strip().to_numpy()

    valid_label = np.isin(y_true_arr, ['H','D','A'])

    odds_pred = np.where(
        pred_arr == 'H', m['B365H'].to_numpy() if 'B365H' in m.columns else np.nan,
        np.where(pred_arr == 'D', m['B365D'].to_numpy() if 'B365D' in m.columns else np.nan,
                 np.where(pred_arr == 'A', m['B365A'].to_numpy() if 'B365A' in m.columns else np.nan, np.nan))
    ).astype(float)

    valid_odds = np.isfinite(odds_pred) & (odds_pred >= 1.01)
    scored = valid_label & valid_odds

    is_correct = np.zeros(n, dtype=bool)
    is_correct[scored] = (pred_arr[scored] == y_true_arr[scored])

    acc_bets = is_correct[scored].mean() if scored.any() else np.nan

    profit = np.full(n, np.nan, dtype=float)
    profit[scored] = -1.0
    profit[scored & is_correct] = odds_pred[scored & is_correct] - 1.0

    n_bets = int(np.isfinite(profit).sum())
    total_profit = float(np.nansum(profit))
    roi_global = (total_profit / n_bets) if n_bets > 0 else np.nan

    if 'Season' in m.columns:
        scored_idx = np.isfinite(profit)
        by_season = m.loc[scored_idx, ['Season']].copy()
        by_season['correct'] = is_correct[scored_idx].astype(int)
        by_season['profit']  = profit[scored_idx]

        acc_by_season_bets = (
            by_season.groupby('Season', dropna=True)['correct']
                     .agg(matches='size', accuracy='mean')
                     .reset_index()
                     .sort_values('Season')
        )
        roi_by_season = (
            by_season.groupby('Season', dropna=True)['profit']
                     .agg(bets='size', total_profit='sum')
                     .reset_index()
                     .sort_values('Season')
        )
        roi_by_season['roi'] = roi_by_season['total_profit'] / roi_by_season['bets']
    else:
        acc_by_season_bets = pd.DataFrame(columns=['Season','matches','accuracy'])
        roi_by_season = pd.DataFrame(columns=['Season','bets','total_profit','roi'])

    return acc_bets, roi_global, n_bets, total_profit, acc_by_season_bets, roi_by_season

# Métricas de tu modelo (ENTRE apuestas)
acc_bets_model, roi_g_model, bets_model, prof_model, acc_seas_bets_model, roi_seas_model = compute_accuracy_roi(
    merged, pred_col='y_pred'
)

# ============================================================
# 7) Baseline mercado (argmax pimp1/pimpx/pimp2) y métricas
#    (pimp* ya alineados en 'merged')
# ============================================================
market_labels = np.array(['H','D','A'])
if {'pimp1','pimpx','pimp2'}.issubset(merged.columns):
    probs = merged[['pimp1','pimpx','pimp2']].to_numpy(dtype=float)
    probs_filled = np.where(np.isnan(probs), -np.inf, probs)
    argmax_idx = np.argmax(probs_filled, axis=1)

    merged_market = merged.copy()
    merged_market['y_pred_market'] = market_labels[argmax_idx]

    acc_bets_mkt, roi_g_mkt, bets_mkt, prof_mkt, acc_seas_bets_mkt, roi_seas_mkt = compute_accuracy_roi(
        merged_market, pred_col='y_pred_market'
    )
else:
    acc_bets_mkt = roi_g_mkt = prof_mkt = np.nan
    bets_mkt = 0
    acc_seas_bets_mkt = pd.DataFrame(columns=['Season','matches','accuracy'])
    roi_seas_mkt = pd.DataFrame(columns=['Season','bets','total_profit','roi'])

# ============================================================
# 8) Reporting
# ============================================================
print("\n=== CONFIGURACIÓN ===")
print("Features:", FEATURES)
print("WF kwargs:", WF_KWARGS)

print("\n=== ACCURACY OFICIAL (función walkforward, SIN cuotas) ===")
print(f"Global: {acc_global_oficial:.4f}")
print("\nAccuracy por temporada (oficial):")
date_season = df[['Date','Season']].copy()
date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce')
date_season = date_season.drop_duplicates(subset=['Date'])

preds_seas = preds.copy()
preds_seas['Date'] = pd.to_datetime(preds_seas['Date'], errors='coerce')
preds_seas = preds_seas.merge(date_season, on='Date', how='left', validate='m:1')

if 'Season' not in preds_seas.columns:
    if 'Season_y' in preds_seas.columns:
        preds_seas['Season'] = preds_seas['Season_y']
    elif 'Season_x' in preds_seas.columns:
        preds_seas['Season'] = preds_seas['Season_x']
preds_seas.drop(columns=[c for c in ['Season_x','Season_y'] if c in preds_seas.columns], inplace=True)
preds_seas['Season'] = pd.to_numeric(preds_seas['Season'], errors='coerce').astype('Int64')

preds_seas['correct'] = (preds_seas['y_true'] == preds_seas['y_pred']).astype(int)
acc_by_season_oficial = (
    preds_seas[preds_seas['has_label'] == 1]
    .groupby('Season', dropna=True)['correct']
    .agg(matches='size', accuracy='mean')
    .reset_index()
    .sort_values('Season')
)
print(acc_by_season_oficial.to_string(index=False))

print("\n=== TU MODELO — ENTRE APUESTAS (CON cuotas) ===")
print(f"Accuracy entre apuestas: {acc_bets_model:.4f}")
print(f"ROI global             : {roi_g_model:.4f}   |  Bets: {bets_model}   |  Profit: {prof_model:.2f}")
print("\nROI por temporada (tu modelo):")
print(roi_seas_model[['Season','bets','roi','total_profit']].to_string(index=False))

print("\n=== BASELINE MERCADO (argmax pimp1/pimpx/pimp2) — ENTRE APUESTAS ===")
print(f"Accuracy entre apuestas: {acc_bets_mkt:.4f}")
print(f"ROI global             : {roi_g_mkt:.4f}   |  Bets: {bets_mkt}   |  Profit: {prof_mkt:.2f}")
print("\nROI por temporada (mercado):")
print(roi_seas_mkt[['Season','bets','roi','total_profit']].to_string(index=False))

[CHECK] Filas sin cuotas: 0
[CHECK] Duplicados pred_key_match: 0
[CHECK] Overround fuera de [1.00, 1.30]: 0

=== CONFIGURACIÓN ===
Features: ['pimp1', 'pimpx', 'pimp2', 'relative_perf_diff', 'avg_xg_last7_diff', 'form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff', 'away_playstyle_equilibrado', 'home_prev_big_odds_win_any']
WF kwargs: {'n_seasons_window': 4, 'season_size': 380, 'recent_weight': 3.0, 'older_weight': 1.0, 'C': 1.0, 'max_iter': 1000, 'verbose_every': 0}

=== ACCURACY OFICIAL (función walkforward, SIN cuotas) ===
Global: 0.5511

Accuracy por temporada (oficial):
 Season  matches  accuracy
   2010      380  0.613158
   2011      380  0.536842
   2012      380  0.528947
   2013      380  0.547368
   2014      380  0.544737
   2015      380  0.557895
   2016      380  0.597368
   2017      380  0.552632
   2018      380  0.494737
   

  m['Season'] = m['Season'].fillna(m['Season_from_df'])


## Con SMOTE:

In [None]:
# # ============================================
# # Walk-forward multinomial con SMOTE + calibración
# # - Usa TU enforce_unique_pred_key (con sufijo "#k")
# # - Mantiene misma clave pred_key y estructura de salida
# # ============================================

# def walkforward_multinomial_accuracy_smote_calibrated(
#     df,
#     feature_cols,
#     date_col='Date',
#     label_col='FTR',
#     n_seasons_window=4,
#     season_size=380,
#     recent_weight=3.0,
#     older_weight=1.0,
#     C=1.0,
#     max_iter=1000,
#     # --- SMOTE & Calibración ---
#     smote_k_neighbors=5,
#     smote_sampling_strategy='auto',
#     smote_random_state=42,
#     calibrate=True,
#     calibration_method='sigmoid',   # 'sigmoid' (Platt) o 'isotonic'
#     calibration_cv=3,
#     # --- Otros ---
#     random_state=42,
#     verbose_every=0
# ):
#     """
#     Igual que tu walkforward_multinomial_accuracy, pero:
#       - Aplica SMOTE en el ENTRENAMIENTO de cada día (tras imputar y escalar).
#       - Aplica calibración de probabilidades multiclase con CalibratedClassifierCV.
#       - Mantiene la ventana temporal y una aproximación al peso temporal reciente
#         replicando el último bloque de season_size muestras antes de SMOTE.

#     Devuelve:
#       accuracy (float), preds_all (DataFrame con y_true, y_pred, proba, pred_key, meta)
#     """
#     d0 = df.copy()
#     d0[date_col] = pd.to_datetime(d0[date_col], errors='coerce')

#     # Feature derivada opcional (misma lógica que tu función)
#     if 'market_home_logit' in feature_cols and 'market_home_logit' not in d0.columns:
#         if {'pimp1','pimp2'}.issubset(d0.columns):
#             d0['market_home_logit'] = np.log(
#                 (pd.to_numeric(d0['pimp1'], errors='coerce') + 1e-9) /
#                 (pd.to_numeric(d0['pimp2'], errors='coerce') + 1e-9)
#             )
#         else:
#             raise ValueError("market_home_logit pedido en feature_cols pero faltan pimp1/pimp2 en df.")

#     d0 = d0.sort_values(date_col).reset_index(drop=True)
#     uniq_dates = d0[date_col].sort_values().unique()

#     train_window = n_seasons_window * season_size
#     recent_block = season_size

#     # Clasificador base
#     base_logit = LogisticRegression(
#         solver='lbfgs',
#         C=C,
#         max_iter=max_iter,
#         random_state=random_state
#     )

#     # Pipeline con SMOTE (imputar, escalar, smote, logit)
#     # OJO: usamos ImbPipeline para que SMOTE actúe solo en entrenamiento
#     pipe_base = ImbPipeline(steps=[
#         ('imp', SimpleImputer(strategy='median')),
#         ('scaler', StandardScaler(with_mean=True, with_std=True)),
#         ('smote', SMOTE(
#             sampling_strategy=smote_sampling_strategy,
#             k_neighbors=smote_k_neighbors,
#             random_state=smote_random_state
#         )),
#         ('logit', base_logit)
#     ])

#     preds_all = []

#     # Factor de replicación aproximado para el bloque reciente
#     # (equivale a recent_weight/older_weight redondeado al entero más cercano, >=1)
#     # Si older_weight es 0 (raro), por seguridad fijamos a 1.
#     denom = older_weight if older_weight > 0 else 1.0
#     recent_dup_factor = int(max(1, round(float(recent_weight) / float(denom))))

#     for d_i, current_date in enumerate(uniq_dates):
#         test_mask = d0[date_col] == current_date
#         test_idx = np.where(test_mask)[0]
#         if test_idx.size == 0:
#             continue

#         # Entrenamiento ANTERIOR al día
#         train_mask = d0[date_col] < current_date
#         train_idx_all = np.where(train_mask)[0]
#         if train_idx_all.size < train_window:
#             continue

#         train_idx = train_idx_all[-train_window:]

#         # --- Construir X_train / y_train con replicación del bloque reciente ---
#         X_train_full = d0.iloc[train_idx][feature_cols]
#         y_train_full = d0.iloc[train_idx][label_col].astype(str).str.upper().str.strip()

#         # Asegura que solo entrenamos con H/D/A
#         valid_mask = y_train_full.isin(['H', 'D', 'A'])
#         X_train_full = X_train_full.loc[valid_mask]
#         y_train_full = y_train_full.loc[valid_mask]
#         if len(y_train_full) < 3:
#             # no hay suficiente para multiclass este día
#             continue

#         # Índices relativos del bloque reciente en el training recortado
#         # (últimos 'recent_block' partidos dentro de X_train_full)
#         if recent_block > 0 and recent_dup_factor > 1:
#             n_train = len(X_train_full)
#             cut = max(0, n_train - recent_block)
#             X_older = X_train_full.iloc[:cut]
#             y_older = y_train_full.iloc[:cut]
#             X_recent = X_train_full.iloc[cut:]
#             y_recent = y_train_full.iloc[cut:]

#             # Replicamos el bloque reciente para aproximar pesos temporales
#             X_recent_dup = pd.concat([X_recent] * recent_dup_factor, axis=0, ignore_index=True)
#             y_recent_dup = pd.concat([y_recent] * recent_dup_factor, axis=0, ignore_index=True)

#             X_train_w = pd.concat([X_older, X_recent_dup], axis=0, ignore_index=True)
#             y_train_w = pd.concat([y_older, y_recent_dup], axis=0, ignore_index=True)
#         else:
#             X_train_w = X_train_full
#             y_train_w = y_train_full

#         # Test del día
#         X_test = d0.iloc[test_idx][feature_cols]
#         y_test = d0.iloc[test_idx][label_col]

#         # --- Ajuste con SMOTE ---
#         if calibrate:
#             # Calibración multiclase (One-vs-Rest internamente)
#             # CalibratedClassifierCV clona el estimador y aplica el pipeline por fold (SMOTE en train-fold).
#             clf = CalibratedClassifierCV(
#                 estimator=pipe_base,
#                 method=calibration_method,
#                 cv=calibration_cv
#             )
#         else:
#             clf = pipe_base

#         # Fit y predicción
#         clf.fit(X_train_w, y_train_w)

#         # Probabilidades multiclase (garantizamos orden H/D/A)
#         proba = clf.predict_proba(X_test)
#         # CalibratedClassifierCV devuelve lista de proba por clase; si multiclass, predict_proba es (n, n_classes)
#         # Aseguramos mapeo en el mismo orden que las clases que expone el último paso
#         # Obtenemos las clases de forma segura:
#         if hasattr(clf, "classes_"):
#             classes = list(clf.classes_)
#         else:
#             # fallback para estimador interno
#             classes = list(clf.estimator.named_steps['logit'].classes_)

#         idx_map = {c: classes.index(c) for c in classes}
#         def colp(c):
#             return proba[:, idx_map[c]] if c in idx_map else np.full(proba.shape[0], np.nan)

#         pH = colp('H'); pD = colp('D'); pA = colp('A')
#         y_pred = np.array(['H','D','A'])[np.nanargmax(np.vstack([pH, pD, pA]), axis=0)]

#         # --- METADATA / pred_key idéntico a tu función ---
#         meta = d0.iloc[test_idx][['Season','Date','HomeTeam_norm','AwayTeam_norm']].copy()
#         meta['_date_key'] = pd.to_datetime(meta['Date'], errors='coerce')\
#                                 .dt.tz_localize(None, nonexistent='NaT', ambiguous='NaT')\
#                                 .dt.floor('D')
#         meta['pred_key'] = (
#             meta['Season'].astype('Int64').astype(str) + "|" +
#             meta['_date_key'].dt.strftime("%Y-%m-%d") + "|" +
#             meta['HomeTeam_norm'].astype(str) + "|" +
#             meta['AwayTeam_norm'].astype(str)
#         )

#         day_res = pd.DataFrame({
#             'Date': meta['Date'].values,
#             'y_true': y_test.values.astype(object),  # conserva NaN/strings
#             'y_pred': y_pred,
#             'pH_pred': pH,
#             'pD_pred': pD,
#             'pA_pred': pA,
#         })

#         # etiqueta válida (para accuracy)
#         y_true_clean = day_res['y_true'].astype(str).str.upper().str.strip()
#         day_res['has_label'] = y_true_clean.isin(['H', 'D', 'A']).astype(int)

#         # anexamos Season/Home/Away/pred_key
#         day_res[['Season','HomeTeam_norm','AwayTeam_norm','pred_key']] = \
#             meta[['Season','HomeTeam_norm','AwayTeam_norm','pred_key']].values

#         preds_all.append(day_res)

#         if verbose_every and (d_i % verbose_every == 0):
#             mask_lbl = day_res['has_label'] == 1
#             if mask_lbl.any():
#                 acc_day = (day_res.loc[mask_lbl, 'y_true'] == day_res.loc[mask_lbl, 'y_pred']).mean()
#                 print(f"[SMOTE+Calib {d_i+1}/{len(uniq_dates)}] {str(current_date)[:10]}  "
#                       f"test_n={len(day_res)}  scored_n={int(mask_lbl.sum())}  acc={acc_day:.3f}")

#     if not preds_all:
#         raise RuntimeError("No se generaron predicciones; ¿hay suficientes datos previos para armar ventanas?")

#     preds_all = pd.concat(preds_all, ignore_index=True)

#     # Fuerza unicidad de pred_key CON TU VERSIÓN (sufijo '#k')
#     if 'pred_key' in preds_all.columns:
#         preds_all, _ = enforce_unique_pred_key(preds_all, key_col='pred_key')

#     # Accuracy SOLO sobre filas con etiqueta válida
#     scored_mask = preds_all['has_label'] == 1
#     if scored_mask.any():
#         accuracy = (preds_all.loc[scored_mask, 'y_true'] == preds_all.loc[scored_mask, 'y_pred']).mean()
#     else:
#         raise RuntimeError("No hay partidos con etiqueta válida para calcular accuracy.")

#     return float(accuracy), preds_all

In [None]:
# # ===================== 1) ELIGE TU SET DE FEATURES =====================
# FEATURES = FEATURES_S11p

# # ===================== 2) PARÁMETROS WALK-FORWARD =======================
# WF_KWARGS = dict(
#     n_seasons_window=4,
#     season_size=380,
#     recent_weight=3.0,
#     older_weight=1.0,
#     C=1.0,
#     max_iter=1000,
#     verbose_every=0
# )

# # ===================== 3) EJECUCIÓN WALK-FORWARD (TU MODELO) ===========
# acc_global_oficial, preds = walkforward_multinomial_accuracy_smote_calibrated(
#     df,
#     feature_cols=FEATURES,
#     **WF_KWARGS
# )

# # ---------- util: alinear por (Date + orden en esa fecha) y CONSTRUIR pred_key ----------
# def align_preds_by_date_order_and_build_predkey(preds, df):
#     """
#     1) Alinea preds con df por (Date, row_in_date) para añadir Season/Home/Away/B365/pimp*.
#     2) Construye pred_key = Season|YYYY-MM-DD|Home|Away normalizando Date al DÍA y tz-naive.
#     3) Si Season quedara NaN tras el merge principal, la rellena con un fallback Date->Season.
#     4) NEW: fuerza unicidad de pred_key con sufijo '#k' si hay colisiones.
#     """
#     p = preds.copy()
#     p['Date'] = pd.to_datetime(p['Date'], errors='coerce')
#     # orden estable para que 'row_in_date' sea reproducible
#     p = p.sort_values('Date', kind='mergesort').reset_index(drop=True)
#     p['row_in_date'] = p.groupby('Date').cumcount()

#     d = df.copy()
#     d['Date'] = pd.to_datetime(d['Date'], errors='coerce')
#     d = d.sort_values('Date', kind='mergesort').reset_index(drop=True)
#     d['row_in_date'] = d.groupby('Date').cumcount()

#     need_cols = [
#         'Season','HomeTeam_norm','AwayTeam_norm',
#         'B365H','B365D','B365A','pimp1','pimpx','pimp2'
#     ]
#     need_cols = [c for c in need_cols if c in d.columns]

#     # Merge determinista por (Date + row_in_date)
#     m = p.merge(
#         d[['Date','row_in_date'] + need_cols],
#         on=['Date','row_in_date'],
#         how='left',
#         validate='1:1'
#     )

#     # Fallback Season por día si faltara
#     if ('Season' not in m.columns) or (m['Season'].isna().any()):
#         date_season = df[['Date','Season']].copy()
#         date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce').dt.floor('D')
#         date_season = date_season.drop_duplicates(subset=['Date'])

#         m['_Date_day'] = m['Date'].dt.floor('D')
#         m = m.merge(date_season.rename(columns={'Date':'_Date_day','Season':'Season_from_day'}),
#                     on='_Date_day', how='left')
#         if 'Season' in m.columns:
#             m['Season'] = m['Season'].fillna(m['Season_from_day'])
#         else:
#             m['Season'] = m['Season_from_day']
#         m = m.drop(columns=['_Date_day','Season_from_day'])

#     # Tipos numéricos robustos
#     m['Season'] = pd.to_numeric(m['Season'], errors='coerce').astype('Int64')
#     for col in ['B365H','B365D','B365A','pimp1','pimpx','pimp2']:
#         if col in m.columns:
#             m[col] = pd.to_numeric(m[col], errors='coerce')

#     # pred_key estable (Season|YYYY-MM-DD|Home|Away) con Date al DÍA y tz-naive
#     if {'Season','HomeTeam_norm','AwayTeam_norm'}.issubset(m.columns):
#         date_key = m['Date'].dt.tz_localize(None).dt.floor('D')
#         m['pred_key'] = (
#             m['Season'].astype('Int64').astype(str) + "|" +
#             date_key.dt.strftime("%Y-%m-%d") + "|" +
#             m['HomeTeam_norm'].astype(str) + "|" +
#             m['AwayTeam_norm'].astype(str)
#         )
#     else:
#         m['pred_key'] = pd.NA

#     # --- NEW: asegurar pred_key ÚNICA en el merged ---
#     m, _ = enforce_unique_pred_key(m, key_col='pred_key')

#     return m

# merged = align_preds_by_date_order_and_build_predkey(preds, df)

# # ===================== 4) ACCURACY OFICIAL POR TEMPORADA (SIN cuotas) ========
# date_season = df[['Date','Season']].copy()
# date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce')
# date_season = date_season.drop_duplicates(subset=['Date'])

# preds_seas = preds.copy()
# preds_seas['Date'] = pd.to_datetime(preds_seas['Date'], errors='coerce')
# preds_seas = preds_seas.merge(date_season, on='Date', how='left', validate='m:1')

# if 'Season' not in preds_seas.columns:
#     if 'Season_y' in preds_seas.columns:
#         preds_seas['Season'] = preds_seas['Season_y']
#     elif 'Season_x' in preds_seas.columns:
#         preds_seas['Season'] = preds_seas['Season_x']
# preds_seas.drop(columns=[c for c in ['Season_x','Season_y'] if c in preds_seas.columns], inplace=True)
# preds_seas['Season'] = pd.to_numeric(preds_seas['Season'], errors='coerce').astype('Int64')

# preds_seas['correct'] = (preds_seas['y_true'] == preds_seas['y_pred']).astype(int)
# acc_by_season_oficial = (
#     preds_seas[preds_seas['has_label'] == 1]
#     .groupby('Season', dropna=True)['correct']
#     .agg(matches='size', accuracy='mean')
#     .reset_index()
#     .sort_values('Season')
# )

# # ===================== 5) ROI Y ACCURACY ENTRE APUESTAS (CON cuotas) =========
# def compute_accuracy_roi(merged_df, pred_col='y_pred'):
#     """
#     Accuracy & ROI ENTRE APUESTAS (solo filas con label H/D/A y cuota válida >= 1.01).
#     No modifica el accuracy oficial de tu función (que no depende de cuotas).
#     """
#     m = merged_df.copy()
#     n = len(m)

#     y_true_arr = m['y_true'].astype(str).str.upper().str.strip().to_numpy()
#     pred_arr   = m[pred_col].astype(str).str.upper().str.strip().to_numpy()

#     valid_label = np.isin(y_true_arr, ['H','D','A'])

#     odds_pred = np.where(
#         pred_arr == 'H', m['B365H'].to_numpy() if 'B365H' in m.columns else np.nan,
#         np.where(pred_arr == 'D', m['B365D'].to_numpy() if 'B365D' in m.columns else np.nan,
#                  np.where(pred_arr == 'A', m['B365A'].to_numpy() if 'B365A' in m.columns else np.nan, np.nan))
#     ).astype(float)

#     valid_odds = np.isfinite(odds_pred) & (odds_pred >= 1.01)
#     scored = valid_label & valid_odds

#     is_correct = np.zeros(n, dtype=bool)
#     is_correct[scored] = (pred_arr[scored] == y_true_arr[scored])

#     acc_bets = is_correct[scored].mean() if scored.any() else np.nan

#     profit = np.full(n, np.nan, dtype=float)
#     profit[scored] = -1.0
#     profit[scored & is_correct] = odds_pred[scored & is_correct] - 1.0

#     n_bets = int(np.isfinite(profit).sum())
#     total_profit = float(np.nansum(profit))
#     roi_global = (total_profit / n_bets) if n_bets > 0 else np.nan

#     if 'Season' in m.columns:
#         scored_idx = np.isfinite(profit)
#         by_season = m.loc[scored_idx, ['Season']].copy()
#         by_season['correct'] = is_correct[scored_idx].astype(int)
#         by_season['profit']  = profit[scored_idx]

#         acc_by_season_bets = (
#             by_season.groupby('Season', dropna=True)['correct']
#                      .agg(matches='size', accuracy='mean')
#                      .reset_index()
#                      .sort_values('Season')
#         )
#         roi_by_season = (
#             by_season.groupby('Season', dropna=True)['profit']
#                      .agg(bets='size', total_profit='sum')
#                      .reset_index()
#                      .sort_values('Season')
#         )
#         roi_by_season['roi'] = roi_by_season['total_profit'] / roi_by_season['bets']
#     else:
#         acc_by_season_bets = pd.DataFrame(columns=['Season','matches','accuracy'])
#         roi_by_season = pd.DataFrame(columns=['Season','bets','total_profit','roi'])

#     return acc_bets, roi_global, n_bets, total_profit, acc_by_season_bets, roi_by_season

# # Métricas de tu modelo (ENTRE apuestas)
# acc_bets_model, roi_g_model, bets_model, prof_model, acc_seas_bets_model, roi_seas_model = compute_accuracy_roi(
#     merged, pred_col='y_pred'
# )

# # Baseline mercado
# market_labels = np.array(['H','D','A'])
# probs = merged[['pimp1','pimpx','pimp2']].to_numpy(dtype=float)
# probs_filled = np.where(np.isnan(probs), -np.inf, probs)
# argmax_idx = np.argmax(probs_filled, axis=1)

# merged_market = merged.copy()
# merged_market['y_pred_market'] = market_labels[argmax_idx]

# acc_bets_mkt, roi_g_mkt, bets_mkt, prof_mkt, acc_seas_bets_mkt, roi_seas_mkt = compute_accuracy_roi(
#     merged_market, pred_col='y_pred_market'
# )

# # ===================== 6) REPORTING =========================================
# print("\n=== CONFIGURACIÓN ===")
# print("Features:", FEATURES)
# print("WF kwargs:", WF_KWARGS)

# print("\n=== ACCURACY OFICIAL (función walkforward, SIN cuotas) ===")
# print(f"Global: {acc_global_oficial:.4f}")
# print("\nAccuracy por temporada (oficial):")
# print(acc_by_season_oficial.to_string(index=False))

# print("\n=== TU MODELO — ENTRE APUESTAS (CON cuotas) ===")
# print(f"Accuracy entre apuestas: {acc_bets_model:.4f}")
# print(f"ROI global             : {roi_g_model:.4f}   |  Bets: {bets_model}   |  Profit: {prof_model:.2f}")
# print("\nROI por temporada (tu modelo):")
# print(roi_seas_model[['Season','bets','roi','total_profit']].to_string(index=False))

# print("\n=== BASELINE MERCADO (argmax pimp1/pimpx/pimp2) — ENTRE APUESTAS ===")
# print(f"Accuracy entre apuestas: {acc_bets_mkt:.4f}")
# print(f"ROI global             : {roi_g_mkt:.4f}   |  Bets: {bets_mkt}   |  Profit: {prof_mkt:.2f}")
# print("\nROI por temporada (mercado):")
# print(roi_seas_mkt[['Season','bets','roi','total_profit']].to_string(index=False))

# **PREDICCIÓN: Logistic Regression multinomial**

## Sin SMOTE:

In [None]:
# ============================================================
# CELDA ÚNICA: Future Predictions Exporter + Reproducibilidad 100% (+ Matchday)
# ============================================================

# ---------- Imports necesarios ----------
import os, re, json, random, hashlib
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression

# ---------- Reproducibilidad absoluta ----------
os.environ["PYTHONHASHSEED"] = "0"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
random.seed(42)
np.random.seed(42)

# --------------------- utils de claves/duplicados ---------------------
def _norm_name(s: str) -> str:
    """Normaliza nombres de equipo: minúsculas y solo a-z0-9."""
    return re.sub(r'[^a-z0-9]+', '', str(s).strip().lower())

def _find_col(df: pd.DataFrame, candidates: list[str]) -> str | None:
    """Devuelve el nombre real de la primera columna candidata que exista (case/espacios robusto)."""
    norm2real = {re.sub(r'[^a-z0-9]+', '', str(c).strip().lower()): c for c in df.columns}
    for cand in candidates:
        norm_cand = re.sub(r'[^a-z0-9]+', '', cand.strip().lower())
        if norm_cand in norm2real:
            return norm2real[norm_cand]
    return None

def enforce_unique_pred_key(df_in: pd.DataFrame, key_col: str = "pred_key"):
    """
    Si hay claves duplicadas en `key_col`, añade '#k' (k=0,1,2,...) por orden estable
    dentro de cada grupo duplicado. Devuelve df modificado y un informe mínimo.
    """
    d = df_in.copy()
    base = d[key_col].astype(str)
    grp_sizes = base.map(base.value_counts())
    pos = base.groupby(base).cumcount()
    suffix = np.where(grp_sizes > 1, "#" + pos.astype(str), "")
    d[key_col] = base + suffix
    affected = int((grp_sizes > 1).sum())
    return d, {"collisions_augmented": affected}

def _build_pred_keys(df: pd.DataFrame, season_col="Season", date_col="Date",
                     home_col="HomeTeam_norm", away_col="AwayTeam_norm"):
    """Añade pred_key (humana) y pred_key_match (estable) usando fecha al día y tz-naive."""
    d = df.copy()
    d[date_col] = pd.to_datetime(d[date_col], errors="coerce")
    day = d[date_col].dt.tz_localize(None, nonexistent="NaT", ambiguous="NaT").dt.floor("D")
    home_raw = d[home_col].astype(str)
    away_raw = d[away_col].astype(str)
    home_norm = home_raw.map(_norm_name)
    away_norm = away_raw.map(_norm_name)

    d["pred_key"] = (
        d[season_col].astype("Int64").astype(str) + "|" +
        day.dt.strftime("%Y-%m-%d") + "|" +
        home_raw + "|" + away_raw
    )
    d["pred_key_match"] = (
        d[season_col].astype("Int64").astype(str) + "|" +
        day.dt.strftime("%Y-%m-%d") + "|" +
        home_norm + "|" + away_norm
    )
    return d


# --------------------- función principal: predicciones futuras ------------------------
def generate_future_predictions(
    df: pd.DataFrame,
    feature_cols,
    outputs_dir="outputs",
    date_col="Date",
    label_col="FTR",
    season_col="Season",
    home_col="HomeTeam_norm",
    away_col="AwayTeam_norm",
    n_seasons_window=4,
    season_size=380,
    recent_weight=3.0,
    older_weight=1.0,
    C=1.0,
    max_iter=1000,
    season_filter: int | None = None,   # si None, exporta por cada temporada detectada en futuros
    verbose_every=0
):
    """
    Predice FUTUROS (sin etiqueta H/D/A) y exporta columnas garantizadas:
      Season, Date, Matchday, HomeTeam_norm, AwayTeam_norm, pred_key,
      [B365H,B365D,B365A si existen], y_pred, pH_pred, pD_pred, pA_pred, conf_maxprob, entropy, margin_top12

    Genera, por temporada:
      - outputs/future_predictions_<SEASON>.csv
      - outputs/future_predictions_<SEASON>.json
    Además:
      - outputs/future_predictions_summary_<YYYYMMDD>-<RUN>.json (summary determinista del run)
    """
    df = df.copy()
    outputs_dir = str(outputs_dir)
    Path(outputs_dir).mkdir(parents=True, exist_ok=True)

    # Fecha y orden (estable)
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.sort_values(date_col, kind="mergesort").reset_index(drop=True)

    # --- detectar columna de jornada (Matchday) en df ---
    mw_col = _find_col(
        df,
        ["Matchday","Matchweek","matchweek","Jornada","Gameweek","GW","Week","MD"]
    )

    # Feature derivada opcional
    if "market_home_logit" in feature_cols and "market_home_logit" not in df.columns:
        if {"pimp1","pimp2"}.issubset(df.columns):
            df["market_home_logit"] = np.log(
                (pd.to_numeric(df["pimp1"], errors="coerce") + 1e-9) /
                (pd.to_numeric(df["pimp2"], errors="coerce") + 1e-9)
            )
        else:
            raise ValueError("market_home_logit está en feature_cols pero faltan pimp1/pimp2 en df.")

    # Futuro = sin etiqueta válida H/D/A
    y = df[label_col].astype(str).str.upper().str.strip()
    is_valid = y.isin(["H","D","A"])
    future_mask = ~is_valid
    if season_filter is not None:
        future_mask &= (pd.to_numeric(df[season_col], errors="coerce").astype("Int64") == int(season_filter))

    future_df = df.loc[future_mask].copy()
    if future_df.empty:
        raise RuntimeError("No hay partidos futuros (sin etiqueta H/D/A) con los filtros actuales.")

    # Ventanas de entrenamiento
    train_window = n_seasons_window * season_size
    recent_block = season_size

    pipe = Pipeline(steps=[
        ("imp", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler(with_mean=True, with_std=True)),
        ("logit", LogisticRegression(
            solver="lbfgs",
            C=C,
            max_iter=max_iter,
            random_state=42
        ))
    ])

    # Fechas futuras únicas (por día)
    future_dates = np.sort(future_df[date_col].unique())

    all_rows = []
    last_classes = ["H","D","A"]  # por si alguna iteración no asigna
    for i, fut_date in enumerate(future_dates):
        test_mask = (df[date_col] == fut_date) & (~is_valid)
        test_idx = np.where(test_mask)[0]
        if test_idx.size == 0:
            continue

        # Entrena con todo lo anterior etiquetado
        train_mask = (df[date_col] < fut_date) & is_valid
        train_idx_all = np.where(train_mask)[0]
        if train_idx_all.size < train_window:
            if verbose_every and (i % verbose_every == 0):
                print(f"[{i+1}/{len(future_dates)}] {str(fut_date)[:10]} -> histórico insuficiente: "
                      f"{train_idx_all.size} < {train_window}")
            continue

        train_idx = train_idx_all[-train_window:]

        X_train = df.iloc[train_idx][feature_cols]
        y_train = df.iloc[train_idx][label_col].astype(str).str.upper().str.strip()
        X_test  = df.iloc[test_idx][feature_cols]

        # Pesos deterministas (vector fijo por orden estable)
        sw = np.full(len(train_idx), older_weight, dtype=float)
        if recent_block > 0:
            sw[-recent_block:] = recent_weight

        pipe.fit(X_train, y_train, **{"logit__sample_weight": sw})
        proba = pipe.predict_proba(X_test)
        classes = list(pipe.named_steps["logit"].classes_)
        last_classes = classes  # guarda el último orden visto
        idx_map = {cls: classes.index(cls) for cls in classes}

        def col(c):
            return proba[:, idx_map[c]] if c in idx_map else np.full(proba.shape[0], np.nan)

        pH = col("H"); pD = col("D"); pA = col("A")
        y_pred = np.array(["H","D","A"])[np.nanargmax(np.vstack([pH, pD, pA]), axis=0)]

        # Métricas de confianza
        maxp = np.nanmax(np.vstack([pH, pD, pA]), axis=0)
        with np.errstate(divide="ignore", invalid="ignore"):
            ent = -(pH*np.log(pH + 1e-15) + pD*np.log(pD + 1e-15) + pA*np.log(pA + 1e-15))
        sorted_ps = np.sort(np.vstack([pH, pD, pA]), axis=0)
        margin = sorted_ps[-1, :] - sorted_ps[-2, :]

        # Meta, claves y Matchday (si existe)
        meta_cols = [season_col, date_col, home_col, away_col] + ([mw_col] if mw_col else [])
        meta = df.iloc[test_idx][meta_cols].copy()
        meta = _build_pred_keys(meta, season_col=season_col, date_col=date_col, home_col=home_col, away_col=away_col)

        out = pd.DataFrame({
            "Season": meta[season_col].values,
            "Date": meta[date_col].values,
            "Matchday": (pd.to_numeric(meta[mw_col], errors="coerce").astype("Int64").values
                         if mw_col else pd.Series([pd.NA]*len(meta)).values),
            "HomeTeam_norm": meta[home_col].values,
            "AwayTeam_norm": meta[away_col].values,
            "pred_key": meta["pred_key"].values,  # humana (con #k si colisiona)
            "y_pred": y_pred,
            "pH_pred": pH,
            "pD_pred": pD,
            "pA_pred": pA,
            "conf_maxprob": maxp,
            "entropy": ent,
            "margin_top12": margin,
        })

        all_rows.append(out)

        if verbose_every and (i % verbose_every == 0):
            print(f"[{i+1}/{len(future_dates)}] {str(fut_date)[:10]}  "
                  f"test_n={len(out)}  mean_conf={np.nanmean(maxp):.3f}  mean_entropy={np.nanmean(ent):.3f}")

    if not all_rows:
        raise RuntimeError("No se generaron predicciones (¿histórico insuficiente o no hay futuros?).")

    preds_all = pd.concat(all_rows, ignore_index=True)
    preds_all = preds_all.sort_values(["Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").reset_index(drop=True)
    preds_all, uniq_report_all = enforce_unique_pred_key(preds_all, key_col="pred_key")

    # --------------------- Export por temporada (convención de nombres) ---------------------
    created = []

    # columnas base garantizadas (ahora incluye Matchday)
    cols_out = [
        "Season","Date","Matchday","HomeTeam_norm","AwayTeam_norm","pred_key",
        "y_pred","pH_pred","pD_pred","pA_pred",
        "conf_maxprob","entropy","margin_top12"
    ]

    # si hay cuotas Bet365, las añadimos al export
    for c in ["B365H","B365D","B365A"]:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")
            if not df[c].isna().all():
                insert_at = cols_out.index("y_pred")
                if c not in cols_out:
                    cols_out.insert(insert_at, c)

    if season_filter is not None:
        seasons_to_write = [int(season_filter)]
    else:
        seasons_to_write = (
            pd.to_numeric(preds_all["Season"], errors="coerce")
            .dropna().astype(int).sort_values().unique().tolist()
        )

    for seas in seasons_to_write:
        sub = preds_all[pd.to_numeric(preds_all["Season"], errors="coerce").astype("Int64") == int(seas)].copy()

        # añadir cuotas desde df original (alineación por pred_key) — si existen
        if {"B365H","B365D","B365A"}.issubset(df.columns):
            add_cols = ["B365H","B365D","B365A"]
            daux = df[[season_col, date_col, home_col, away_col] + add_cols].copy()
            daux = _build_pred_keys(daux, season_col=season_col, date_col=date_col, home_col=home_col, away_col=away_col)
            sub = sub.merge(
                daux[["pred_key"] + add_cols],
                on="pred_key", how="left", validate="m:1"
            )

        # ordenar columnas finales
        sub = sub[cols_out] if set(cols_out).issubset(sub.columns) else sub

        csv_path  = Path(outputs_dir) / f"future_predictions_{int(seas)}.csv"
        json_path = Path(outputs_dir) / f"future_predictions_{int(seas)}.json"

        sub.to_csv(csv_path, index=False)
        sub.to_json(json_path, orient="records", date_format="iso")

        created.append({
            "season": int(seas),
            "csv": str(csv_path),
            "json": str(json_path),
            "n_rows": int(len(sub))
        })

    # --------------------- Summary único del run (YYYYMMDD-código NUMÉRICO determinista) ------------
    def _safe_mean(s):
        s = pd.to_numeric(s, errors="coerce")
        return float(np.nanmean(s)) if s.notna().any() else np.nan

    by_date = preds_all.groupby(pd.to_datetime(preds_all["Date"]).dt.strftime("%Y-%m-%d")).agg(
        n_matches=("pred_key","count"),
        mean_conf=("conf_maxprob", _safe_mean),
        mean_entropy=("entropy", _safe_mean),
        mean_margin=("margin_top12", _safe_mean),
        pct_pick_H=("y_pred", lambda s: float((s == "H").mean())),
        pct_pick_D=("y_pred", lambda s: float((s == "D").mean())),
        pct_pick_A=("y_pred", lambda s: float((s == "A").mean())),
    ).reset_index().rename(columns={"index": "date", 0: "date"})

    gen_date = datetime.now().strftime("%Y%m%d")  # solo fecha

    # Código determinista del run a partir del contenido ordenado -> 6 dígitos numéricos
    digest_src = preds_all[
        ["Season","Date","HomeTeam_norm","AwayTeam_norm","y_pred","pH_pred","pD_pred","pA_pred"]
    ].copy()
    digest_src["Date"] = pd.to_datetime(digest_src["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
    for c in ["pH_pred","pD_pred","pA_pred"]:
        digest_src[c] = pd.to_numeric(digest_src[c], errors="coerce").round(10)
    digest_src = digest_src.sort_values(["Season","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort")
    payload = digest_src.to_csv(index=False).encode("utf-8")
    h = hashlib.md5(payload).hexdigest()
    run_code = f"{int(h[:12], 16) % (10**6):06d}"   # ← SOLO NÚMEROS (000000–999999)

    summary_path = Path(outputs_dir) / f"future_predictions_summary_{gen_date}-{run_code}.json"
    summary = {
        "generated_at": gen_date,
        "run_code": run_code,
        "model": {
            "type": "LogisticRegression(multinomial)",
            "C": C, "max_iter": max_iter,
            "n_seasons_window": n_seasons_window, "season_size": season_size,
            "recent_weight": recent_weight, "older_weight": older_weight,
            "features": list(feature_cols),
            "classes_order": last_classes,
            "proba_mapping": {"pH_pred": "H", "pD_pred": "D", "pA_pred": "A"},
        },
        "filters": {"season_filter": season_filter},
        "data": {
            "n_future_rows_out": int(len(preds_all)),
            "future_min_date": str(pd.to_datetime(preds_all["Date"]).min()),
            "future_max_date": str(pd.to_datetime(preds_all["Date"]).max()),
            "unique_key_report": uniq_report_all,
            "per_season_exports": created,
        },
        "by_date": by_date.to_dict(orient="records"),
    }

    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    return {
        "created": created,                 # lista por temporada con paths y n_rows
        "summary_path": str(summary_path),  # path del resumen
        "total_rows": int(len(preds_all))
    }

# ===================== USO (ejemplo) =====================
OUT = Path("outputs")
FEATURES = FEATURES_S11  # tu lista definitiva
CURRENT_SEASON = int(df["Season"].max())
res = generate_future_predictions(
    df=df,
    feature_cols=FEATURES,
    outputs_dir=str(OUT),
    n_seasons_window=4,
    season_size=380,
    recent_weight=3.0,
    older_weight=1.0,
    C=1.0,
    max_iter=1000,
    season_filter=CURRENT_SEASON,  # solo temporada en curso; si None, exporta por cada temporada detectada
    verbose_every=0
)
print(res)

{'created': [{'season': 2025, 'csv': 'outputs/future_predictions_2025.csv', 'json': 'outputs/future_predictions_2025.json', 'n_rows': 10}], 'summary_path': 'outputs/future_predictions_summary_20251019-841232.json', 'total_rows': 10}


## Con SMOTE:

# **EVALUACIÓN HISTÓRICA: Logistic Regression multinomial**

In [None]:
IN_PATH = FEAT / "df_final.parquet"
df = pd.read_parquet(IN_PATH)

## Sin SMOTE:

In [None]:
# ============================================================
# MÉTRICAS PRINCIPALES POR TEMPORADA → CSV (extendido, armónico)
# Requiere en memoria: df, preds, merged, acc_by_season_oficial, roi_seas_model
# Salida: outputs/metrics_main_by_season.csv
# Columnas: Season,accuracy,logloss,brier,roi,n_bets,n_wins,hit_rate,
#           avg_odds_win,avg_overround,avg_conf,avg_entropy,avg_margin
# ============================================================

EPS = 1e-15

def _log_loss_mc_vec(y_true_series, P_mat, classes=("H","D","A")):
    y = y_true_series.astype(str).str.upper().str.strip()
    mask = y.isin(classes)
    if not mask.any():
        return np.nan
    y = y[mask].to_numpy()
    idx = {c:i for i,c in enumerate(classes)}
    P = np.clip(P_mat[mask, :], EPS, 1.0-EPS)
    p_true = P[np.arange(P.shape[0]), [idx[c] for c in y]]
    return float(-np.mean(np.log(p_true)))

def _brier_mc_vec(y_true_series, P_mat, classes=("H","D","A")):
    y = y_true_series.astype(str).str.upper().str.strip()
    mask = y.isin(classes)
    if not mask.any():
        return np.nan
    y = y[mask].to_numpy()
    idx = {c:i for i,c in enumerate(classes)}
    P = np.clip(P_mat[mask, :], 0.0, 1.0)
    Y = np.zeros_like(P)
    Y[np.arange(P.shape[0]), [idx[c] for c in y]] = 1.0
    return float(np.mean(np.sum((P - Y)**2, axis=1)))

# --- 1) Asegurar Season en preds (igual que en bloques previos) ---
date_season = df[['Date','Season']].copy()
date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce')
date_season = date_season.drop_duplicates(subset=['Date'])

preds_seas = preds.copy()
preds_seas['Date'] = pd.to_datetime(preds_seas['Date'], errors='coerce')
preds_seas = preds_seas.merge(date_season, on='Date', how='left', validate='m:1')

if 'Season' not in preds_seas.columns:
    if 'Season_y' in preds_seas.columns:
        preds_seas['Season'] = preds_seas['Season_y']
    elif 'Season_x' in preds_seas.columns:
        preds_seas['Season'] = preds_seas['Season_x']
preds_seas.drop(columns=[c for c in ['Season_x','Season_y'] if c in preds_seas.columns], inplace=True)
preds_seas['Season'] = pd.to_numeric(preds_seas['Season'], errors='coerce').astype('Int64')

# --- 2) Validar probabilidades H/D/A presentes en preds ---
for col in ['proba_H','proba_D','proba_A']:
    if col not in preds_seas.columns:
        raise ValueError(f"Falta la columna {col} en preds. Usa la versión que añade proba_H/D/A.")

# --- 3) Filas con etiqueta válida y métricas de confianza ---
valid_mask = preds_seas['has_label'] == 1
preds_scored = preds_seas.loc[valid_mask].copy()

probs_mat = preds_scored[['proba_H','proba_D','proba_A']].to_numpy(dtype=float)
conf_maxprob = np.nanmax(probs_mat, axis=1)
sorted_ps = np.sort(probs_mat, axis=1)
margin_top12 = sorted_ps[:, -1] - sorted_ps[:, -2]
entropy = -(probs_mat * np.log(np.clip(probs_mat, EPS, 1.0))).sum(axis=1)

preds_scored['conf_maxprob'] = conf_maxprob
preds_scored['entropy'] = entropy
preds_scored['margin_top12'] = margin_top12

# --- 4) Accuracy oficial por temporada (del bloque previo) ---
if not {'Season','accuracy'}.issubset(acc_by_season_oficial.columns):
    raise ValueError("acc_by_season_oficial debe contener columnas ['Season','accuracy'].")
acc_by_season = acc_by_season_oficial[['Season','accuracy']].copy()
acc_by_season['Season'] = pd.to_numeric(acc_by_season['Season'], errors='coerce').astype('Int64')

# --- 5) LogLoss y Brier por temporada (desde preds_scored) ---
logloss_rows, brier_rows = [], []
for s, grp in preds_scored.groupby('Season', dropna=True):
    P = grp[['proba_H','proba_D','proba_A']].to_numpy(dtype=float)
    ll = _log_loss_mc_vec(grp['y_true'], P)
    br = _brier_mc_vec(grp['y_true'], P)
    logloss_rows.append({'Season': int(s), 'logloss': ll})
    brier_rows.append({'Season': int(s), 'brier': br})
logloss_by_season = pd.DataFrame(logloss_rows)
brier_by_season  = pd.DataFrame(brier_rows)

# --- 6) ROI y estadísticas de apuestas por temporada (desde merged alineado) ---
m = merged.copy()

# Normalizaciones
y_true_arr = m['y_true'].astype(str).str.upper().str.strip().to_numpy()
pred_arr   = m['y_pred'].astype(str).str.upper().str.strip().to_numpy()
valid_label = np.isin(y_true_arr, ['H','D','A'])

B365H = pd.to_numeric(m.get('B365H', np.nan), errors='coerce').to_numpy()
B365D = pd.to_numeric(m.get('B365D', np.nan), errors='coerce').to_numpy()
B365A = pd.to_numeric(m.get('B365A', np.nan), errors='coerce').to_numpy()

odds_pred = np.where(pred_arr=='H', B365H,
             np.where(pred_arr=='D', B365D,
                      np.where(pred_arr=='A', B365A, np.nan))).astype(float)
valid_odds = np.isfinite(odds_pred) & (odds_pred >= 1.01)
bet_mask = valid_label & valid_odds

m['__bet__']  = bet_mask
m['__win__']  = False
m.loc[bet_mask, '__win__'] = (pred_arr[bet_mask] == y_true_arr[bet_mask])
m['__odds__'] = odds_pred

# Overround por fila (si hay cuotas)
overround_row = (1/np.clip(B365H, 1.0, None)) + (1/np.clip(B365D, 1.0, None)) + (1/np.clip(B365A, 1.0, None))
overround_row[~np.isfinite(overround_row)] = np.nan
m['__overround__'] = overround_row

stats_rows = []
for s, grp in m.groupby('Season', dropna=True):
    g = grp[grp['__bet__'] == True]
    n_bets = int(len(g))
    if n_bets == 0:
        stats_rows.append({
            'Season': int(s), 'n_bets': 0, 'n_wins': 0,
            'hit_rate': np.nan, 'avg_odds_win': np.nan, 'avg_overround': float(np.nan)
        })
        continue
    n_wins = int(g['__win__'].sum())
    hit_rate = n_wins / n_bets if n_bets > 0 else np.nan
    avg_odds_win = float(pd.to_numeric(g.loc[g['__win__'], '__odds__'], errors='coerce').mean()) if n_wins > 0 else np.nan

    # promedio de overround sobre las filas apostadas (coherente con ROI entre apuestas)
    avg_overround = float(pd.to_numeric(g['__overround__'], errors='coerce').mean())

    stats_rows.append({
        'Season': int(s),
        'n_bets': n_bets,
        'n_wins': n_wins,
        'hit_rate': float(hit_rate) if np.isfinite(hit_rate) else np.nan,
        'avg_odds_win': avg_odds_win,
        'avg_overround': avg_overround
    })
stats_by_season = pd.DataFrame(stats_rows)
stats_by_season['Season'] = pd.to_numeric(stats_by_season['Season'], errors='coerce').astype('Int64')

# --- 7) Medias de confianza/entropía/margen por temporada ---
conf_agg = (
    preds_scored.groupby('Season', dropna=True)[['conf_maxprob','entropy','margin_top12']]
               .mean()
               .reset_index()
               .rename(columns={'conf_maxprob':'avg_conf','entropy':'avg_entropy','margin_top12':'avg_margin'})
)
conf_agg['Season'] = pd.to_numeric(conf_agg['Season'], errors='coerce').astype('Int64')

# --- 8) ROI por temporada (de compute_accuracy_roi) ---
if not {'Season','roi'}.issubset(roi_seas_model.columns):
    raise ValueError("roi_seas_model debe contener columnas ['Season','roi'].")
roi_by_season = roi_seas_model[['Season','roi']].copy()
roi_by_season['Season'] = pd.to_numeric(roi_by_season['Season'], errors='coerce').astype('Int64')

# --- 9) Unir TODO en un solo DataFrame ordenado ---
metrics_all = (
    acc_by_season
    .merge(logloss_by_season, on='Season', how='left')
    .merge(brier_by_season,  on='Season', how='left')
    .merge(roi_by_season,    on='Season', how='left')
    .merge(stats_by_season,  on='Season', how='left')
    .merge(conf_agg,         on='Season', how='left')
    .sort_values('Season')
    .reset_index(drop=True)
)

# Asegurar orden y tipos finales
cols_final = [
    'Season','accuracy','logloss','brier','roi',
    'n_bets','n_wins','hit_rate','avg_odds_win','avg_overround',
    'avg_conf','avg_entropy','avg_margin'
]
for c in cols_final:
    if c not in metrics_all.columns:
        metrics_all[c] = np.nan
metrics_all = metrics_all[cols_final]
metrics_all['Season'] = pd.to_numeric(metrics_all['Season'], errors='coerce').astype('Int64')

# --- 10) Exportar a CSV ---
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "metrics_main_by_season.csv"
metrics_all.to_csv(out_path, index=False)

print("✔ CSV generado con métricas extendidas por temporada:")
print(out_path)
display(metrics_all.head(20))

✔ CSV generado con métricas extendidas por temporada:
outputs/metrics_main_by_season.csv


Unnamed: 0,Season,accuracy,logloss,brier,roi,n_bets,n_wins,hit_rate,avg_odds_win,avg_overround,avg_conf,avg_entropy,avg_margin
0,2010,0.618421,0.925345,0.538282,0.126289,380,235,0.618421,1.821234,1.065656,0.579876,0.917463,0.336271
1,2011,0.542105,0.954313,0.566552,-0.053921,380,206,0.542105,1.745194,1.064498,0.581461,0.910054,0.337554
2,2012,0.534211,0.96743,0.572264,-0.078026,380,203,0.534211,1.725862,1.063707,0.574298,0.924929,0.33008
3,2013,0.555263,0.962451,0.56951,-0.038211,380,211,0.555263,1.732133,1.06363,0.578612,0.905347,0.343965
4,2014,0.55,0.917879,0.542395,-0.062132,380,209,0.55,1.705215,1.055137,0.587763,0.882519,0.349188
5,2015,0.557895,0.943671,0.556749,-0.012474,380,212,0.557895,1.770094,1.051227,0.559915,0.91993,0.306148
6,2016,0.589474,0.915753,0.538011,0.022289,380,224,0.589474,1.734241,1.050764,0.579215,0.903447,0.341286
7,2017,0.544737,0.975661,0.579182,-0.033368,380,207,0.544737,1.774493,1.052719,0.563537,0.928061,0.314689
8,2018,0.492105,1.022301,0.61122,-0.1005,380,187,0.492105,1.827861,1.052628,0.525382,0.976305,0.259489
9,2019,0.528947,0.983904,0.586535,0.012316,380,201,0.528947,1.913831,1.054675,0.522596,0.973573,0.247345


## Con SMOTE:

Con este modelo obtengo el mejor **Accuracy** (porcentaje de aciertos totales), pero esta métrica ignora como de seguras son esas esas predicciones.

$$
\text{Accuracy} = \frac{\text{Número de aciertos}}{\text{Número total de predicciones}}
$$

Para ello se utiliza el **Log Loss** (Cross-Entropy Loss), métrica que mide qué tan buenas son las probabilidades que predice mi modelo de clasificación. A esta métrica no solo le importa acertar la clase, sino cuán seguro está el modelo.

$$
\text{LogLoss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{K} y_{ij} \cdot \log(p_{ij})
$$

donde:

- $y_{ij}$ = 1 si la clase real del ejemplo $i$ es la clase $j$, y 0 en caso contrario.
- $p_{ij}$ es la probabilidad predicha por el modelo de que el ejemplo $i$ pertenezca a la clase $j$.

Tener un Log Loss alto en este caso significaría dar una probabilidad alta a la clase incorrecta, o lo que es lo mismo, dar una probabilidad baja a la clase correcta.

Por último añadí también el **Brier Score**, que es una métrica que evalúa cuán cercanas están las probabilidades predichas por tu modelo respecto a la realidad, comparando la distribución de probabilidades contra la clase real (codificada en one-hot). Es como un error cuadrático medio (MSE) para probabilidades.

$$
\text{Brier Score} = \frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{K} (p_{ij} - y_{ij})^2
$$

donde:

- $N$ es el número de ejemplos.
- $K$ es el número de clases (en este caso 3: victoria local, empate, victoria visitante).
- $p_{ij}$ es la probabilidad predicha por el modelo de que el ejemplo $i$ pertenezca a la clase $j$.
- $y_{ij}$ es 1 si la clase real del ejemplo $i$ es la clase $j$, y 0 en caso contrario.

Un Brier Score de 0 significa que las probabilidades dadas por el modelo son perfectas, mientras que uno del 0.66 en nuestro caso sería un modelo completamente aleatorio.


## Selección de variables

La función `forward_selection` implementa un algoritmo clásico de selección de variables hacia adelante (**forward feature selection**) sobre un modelo de regresión logística multiclase con escalado de variables.

Va añadiendo sucesivamente la variable que mejor mejora el rendimiento del modelo (según accuracy o log_loss), una por una.





In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# from sklearn.metrics import accuracy_score, log_loss
# import numpy as np

# def forward_selection(X, y, max_features=20, scoring='accuracy'):
#     selected_features = []
#     remaining_features = list(X.columns)
#     scores = []

#     for i in range(min(max_features, len(remaining_features))):
#         best_score = -np.inf if scoring == 'accuracy' else np.inf
#         best_feature = None

#         for feature in remaining_features:
#             current_features = selected_features + [feature]

#             model = make_pipeline(
#                 StandardScaler(),
#                 LogisticRegression(max_iter=1000, solver='lbfgs')
#             )

#             model.fit(X[current_features], y)
#             y_pred = model.predict(X[current_features])
#             y_proba = model.predict_proba(X[current_features])

#             if scoring == 'accuracy':
#                 score = accuracy_score(y, y_pred)
#                 if score > best_score:
#                     best_score = score
#                     best_feature = feature
#             elif scoring == 'log_loss':
#                 score = log_loss(y, y_proba)
#                 if score < best_score:
#                     best_score = score
#                     best_feature = feature
#             else:
#                 raise ValueError("scoring debe ser 'accuracy' o 'log_loss'.")

#         if best_feature is not None:
#             selected_features.append(best_feature)
#             remaining_features.remove(best_feature)
#             scores.append(best_score)

#         print(f"[{i+1}] Añadida: {best_feature} | Score: {best_score:.4f}")

#     return selected_features, scores

In [None]:
# selected, scores = forward_selection(X_train, y_train, max_features=81, scoring='accuracy')

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np

# # Suponemos que tienes las listas: selected (variables) y scores (métricas acumuladas)

# # Calcular diferencia respecto al valor anterior
# deltas = np.diff([0] + scores)
# colors = ['blue' if delta >= 0 else 'red' for delta in deltas]

# plt.figure(figsize=(12,6))
# bar_width = 0.6  # Reducir ancho de barra para separarlas
# indices = np.arange(len(selected))

# plt.bar(indices, scores, color=colors, width=bar_width)
# plt.xticks(indices, selected, rotation=90)
# plt.xlabel('Variables añadidas')
# plt.ylabel('Valor de la métrica')
# plt.title('Evolución del rendimiento al añadir variables')

# plt.ylim(min(scores) - 0.01, max(scores) + 0.01)
# plt.tight_layout()
# plt.show()


Se implementó un proceso de selección hacia adelante (forward selection) sobre el modelo de regresión logística con variables estandarizadas. Este procedimiento consiste en partir sin predictores y añadir, en cada iteración, la variable que mayor mejora produce en el rendimiento del modelo. Se evaluaron dos métricas complementarias como criterio de selección: el accuracy (para priorizar aciertos de clasificación) y el log loss (para priorizar la calibración de las probabilidades). Esta técnica permitió reducir la dimensionalidad del conjunto original y determinar el orden de relevancia de las variables desde el punto de vista predictivo.

# **Resultados**

## **MATRIZ DE CONFUSIÓN**

In [None]:
# ============================================================
# MATRICES DE CONFUSIÓN POR TEMPORADA → JSON (flujo armonizado)
# Requiere en memoria: df (calendario con Date/Season) y preds
#   - preds debe traer: y_true, y_pred, has_label (o se infiere), Date
# Salida: outputs/confusion_matrices_by_season.json
# Convenciones:
#   - Orden de etiquetas: ["H","D","A"]
#   - Filtrado: solo filas con etiqueta válida (y_true ∈ {H,D,A})
#   - Ejes: filas = y_true, columnas = y_pred
# ============================================================

LABELS = ["H", "D", "A"]
label_to_idx = {c: i for i, c in enumerate(LABELS)}

def _ensure_season_in_preds(preds: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """Si preds no trae Season, lo añade vía merge por Date (día), tal como en el resto del pipeline."""
    if "Season" in preds.columns:
        p = preds.copy()
        p["Season"] = pd.to_numeric(p["Season"], errors="coerce").astype("Int64")
        return p

    date_season = df[["Date", "Season"]].copy()
    date_season["Date"] = pd.to_datetime(date_season["Date"], errors="coerce").dt.floor("D")
    date_season = date_season.drop_duplicates(subset=["Date"])

    p = preds.copy()
    p["Date"] = pd.to_datetime(p["Date"], errors="coerce").dt.floor("D")
    p = p.merge(date_season, on="Date", how="left", validate="m:1")

    if "Season" not in p.columns:
        if "Season_y" in p.columns:
            p["Season"] = p["Season_y"]
        elif "Season_x" in p.columns:
            p["Season"] = p["Season_x"]

    p.drop(columns=[c for c in ["Season_x", "Season_y"] if c in p.columns], inplace=True)
    p["Season"] = pd.to_numeric(p["Season"], errors="coerce").astype("Int64")
    return p

def _confusion_counts(y_true_s: pd.Series, y_pred_s: pd.Series, labels=LABELS):
    """
    Devuelve (M, support) donde:
      - M es matriz len(labels) x len(labels) con filas=y_true y columnas=y_pred
      - support es dict label->n_true
    Solo considera pares (y_true, y_pred) válidos dentro de 'labels'.
    """
    y_true = y_true_s.astype(str).str.upper().str.strip().to_numpy()
    y_pred = y_pred_s.astype(str).str.upper().str.strip().to_numpy()

    it = np.array([label_to_idx.get(x, -1) for x in y_true], dtype=int)
    ip = np.array([label_to_idx.get(x, -1) for x in y_pred], dtype=int)
    mask = (it >= 0) & (ip >= 0)

    M = np.zeros((len(labels), len(labels)), dtype=int)
    if mask.any():
        flat = it[mask] * len(labels) + ip[mask]
        counts = np.bincount(flat, minlength=len(labels) * len(labels))
        M = counts.reshape((len(labels), len(labels)))

    support = {lab: int(np.sum(it == label_to_idx[lab])) for lab in labels}
    return M.tolist(), support

# ---- 1) Asegurar df existe (para recuperar Season si hiciera falta) ----
try:
    _ = df  # noqa: F401
except NameError:
    raise RuntimeError("Se necesita 'df' en memoria para asegurar Season si falta en 'preds'.")

# ---- 2) Preds con Season garantizada y filtrado de filas evaluables ----
preds_cm = _ensure_season_in_preds(preds, df).copy()

# Filtra filas con etiqueta válida (y_true ∈ {H,D,A})
if "has_label" in preds_cm.columns:
    preds_cm = preds_cm[preds_cm["has_label"] == 1].copy()
else:
    vt = preds_cm["y_true"].astype(str).str.upper().str.strip()
    preds_cm = preds_cm[vt.isin(LABELS)].copy()

# Normaliza columnas clave
for col in ["y_true", "y_pred"]:
    if col not in preds_cm.columns:
        raise ValueError(f"Falta la columna '{col}' en preds.")

preds_cm["Season"] = pd.to_numeric(preds_cm["Season"], errors="coerce").astype("Int64")

# Orden estable previo a los groupby (determinismo total)
preds_cm = preds_cm.sort_values(
    ["Season", "Date", "HomeTeam_norm", "AwayTeam_norm"],
    kind="mergesort"
).reset_index(drop=True)

# ---- 3) Construcción por temporada y global ----
by_season = []
for s, grp in preds_cm.groupby("Season", dropna=True):
    M, support = _confusion_counts(grp["y_true"], grp["y_pred"], labels=LABELS)
    by_season.append({
        "Season": int(s),
        "labels": LABELS,
        "matrix": M,            # filas = verdaderas (H,D,A), columnas = predichas (H,D,A)
        "support": support,     # nº de verdaderos por clase
        "n_scored": int(len(grp))
    })

M_overall, support_overall = _confusion_counts(preds_cm["y_true"], preds_cm["y_pred"], labels=LABELS)
overall = {
    "labels": LABELS,
    "matrix": M_overall,
    "support": support_overall,
    "n_scored": int(len(preds_cm))
}

# ---- 4) Export JSON ----
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "confusion_matrices_by_season.json"

payload = {
    "meta": {
        "row_axis": "y_true",
        "col_axis": "y_pred",
        "labels_order": LABELS
    },
    "by_season": sorted(by_season, key=lambda x: x["Season"]),
    "overall": overall
}

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)

print(f"✔ Confusion matrices guardadas en: {out_path}")

✔ Confusion matrices guardadas en: outputs/confusion_matrices_by_season.json


## **METRICAS DE CLASIFICACIÓN**

In [None]:
# ============================================================
# CLF METRICS POR TEMPORADA → CSV (macro / weighted, soporte)
# Requiere en memoria: df (calendario con Date/Season) y preds (walkforward)
# Salida: outputs/classification_report_by_season.csv
# Notas de robustez:
#  - Alineación de Season por fecha al DÍA (tz-naive), como en el resto del flujo
#  - Filtrado estricto a filas evaluables (y_true ∈ {H,D,A})
#  - Orden estable (mergesort) antes de groupby para determinismo bit-a-bit
# ============================================================

from sklearn.metrics import precision_recall_fscore_support

LABELS = ["H", "D", "A"]

def _ensure_season_in_preds(preds: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    Añade/normaliza Season en preds mediante merge por Date (al día, tz-naive),
    replicando la misma lógica que en las otras celdas del pipeline.
    """
    p = preds.copy()

    # Normaliza fechas a día
    p["Date"] = pd.to_datetime(p["Date"], errors="coerce").dt.floor("D")

    if "Season" not in p.columns:
        date_season = df[["Date", "Season"]].copy()
        date_season["Date"] = pd.to_datetime(date_season["Date"], errors="coerce").dt.floor("D")
        date_season = date_season.drop_duplicates(subset=["Date"])

        p = p.merge(date_season, on="Date", how="left", validate="m:1")

        if "Season" not in p.columns:  # por si viene como _x/_y
            if "Season_y" in p.columns:
                p["Season"] = p["Season_y"]
            elif "Season_x" in p.columns:
                p["Season"] = p["Season_x"]

        p.drop(columns=[c for c in ["Season_x", "Season_y"] if c in p.columns], inplace=True)

    p["Season"] = pd.to_numeric(p["Season"], errors="coerce").astype("Int64")
    return p

# 1) Season garantizada + filtrado evaluable
preds_seas = _ensure_season_in_preds(preds, df)

# Filas con etiqueta válida (H/D/A). Si existe has_label, úsalo; si no, infiere.
if "has_label" in preds_seas.columns:
    preds_scored = preds_seas[preds_seas["has_label"] == 1].copy()
else:
    vt = preds_seas["y_true"].astype(str).str.upper().str.strip()
    preds_scored = preds_seas[vt.isin(LABELS)].copy()

# Normaliza etiquetas/preds a mayúsculas limpias
preds_scored["y_true_norm"] = preds_scored["y_true"].astype(str).str.upper().str.strip()
preds_scored["y_pred_norm"] = preds_scored["y_pred"].astype(str).str.upper().str.strip()

# Orden estable antes de agrupar (determinismo)
preds_scored = preds_scored.sort_values(
    ["Season", "Date", "HomeTeam_norm", "AwayTeam_norm"],
    kind="mergesort"
).reset_index(drop=True)

# 2) Métricas por temporada
rows = []
for s, grp in preds_scored.groupby("Season", dropna=True):
    y_true = grp["y_true_norm"].to_numpy()
    y_pred = grp["y_pred_norm"].to_numpy()

    # macro
    p_mac, r_mac, f_mac, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=LABELS, average="macro", zero_division=0
    )
    # weighted
    p_w, r_w, f_w, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=LABELS, average="weighted", zero_division=0
    )

    rows.append({
        "Season": int(s),
        "precision_macro":   float(p_mac),
        "recall_macro":      float(r_mac),
        "f1_macro":          float(f_mac),
        "precision_weighted": float(p_w),
        "recall_weighted":    float(r_w),
        "f1_weighted":        float(f_w),
        "support": int(len(grp))  # nº de partidos evaluados en esa temporada
    })

report_df = pd.DataFrame(rows).sort_values("Season", kind="mergesort").reset_index(drop=True)

# 3) Exportar CSV con columnas EXACTAS pedidas
out_dir = Path("outputs"); out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "classification_report_by_season.csv"

cols_final = [
    "Season",
    "precision_macro","recall_macro","f1_macro",
    "precision_weighted","recall_weighted","f1_weighted",
    "support"
]
for c in cols_final:
    if c not in report_df.columns:
        report_df[c] = np.nan
report_df = report_df[cols_final]
report_df["Season"] = pd.to_numeric(report_df["Season"], errors="coerce").astype("Int64")

report_df.to_csv(out_path, index=False)

print("✔ Classification report por temporada guardado en:")
print(out_path)
display(report_df.head(20))

✔ Classification report por temporada guardado en:
outputs/classification_report_by_season.csv


Unnamed: 0,Season,precision_macro,recall_macro,f1_macro,precision_weighted,recall_weighted,f1_weighted,support
0,2010,0.415005,0.474798,0.431096,0.491348,0.618421,0.535531,380
1,2011,0.515888,0.438812,0.393218,0.526273,0.542105,0.464013,380
2,2012,0.401772,0.406184,0.360659,0.44894,0.534211,0.452183,380
3,2013,0.505063,0.455964,0.41566,0.523077,0.555263,0.488526,380
4,2014,0.488896,0.48138,0.464318,0.513199,0.55,0.513386,380
5,2015,0.500609,0.476344,0.446678,0.526628,0.557895,0.508639,380
6,2016,0.523399,0.500021,0.466012,0.547983,0.589474,0.532149,380
7,2017,0.49242,0.45272,0.409371,0.510908,0.544737,0.479561,380
8,2018,0.411808,0.431706,0.379042,0.430483,0.492105,0.4194,380
9,2019,0.479357,0.4665,0.441199,0.497059,0.528947,0.483088,380


## **AUC Y CURVA ROC**

In [None]:
# ============================================================
# ROC CURVES + AUC → JSON (overall y por temporada)
# Requiere: df, preds (con y_true, y_pred, has_label, proba_H/D/A)
# Salida: outputs/roc_curves_by_season.json
# ============================================================

from sklearn.metrics import roc_curve, auc

LABELS = ["H", "D", "A"]
EPS = 1e-15


# ---------------------- UTILIDADES DE ALINEACIÓN ----------------------
def _ensure_season_in_preds(preds: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    Garantiza columna Season en preds, alineando por Date (al día, tz-naive).
    Misma lógica usada en todas las demás celdas.
    """
    p = preds.copy()
    p["Date"] = pd.to_datetime(p["Date"], errors="coerce").dt.floor("D")

    if "Season" not in p.columns:
        date_season = df[["Date", "Season"]].copy()
        date_season["Date"] = pd.to_datetime(date_season["Date"], errors="coerce").dt.floor("D")
        date_season = date_season.drop_duplicates(subset=["Date"])

        p = p.merge(date_season, on="Date", how="left", validate="m:1")

        if "Season" not in p.columns:
            if "Season_y" in p.columns:
                p["Season"] = p["Season_y"]
            elif "Season_x" in p.columns:
                p["Season"] = p["Season_x"]

        p.drop(columns=[c for c in ["Season_x", "Season_y"] if c in p.columns], inplace=True)

    p["Season"] = pd.to_numeric(p["Season"], errors="coerce").astype("Int64")
    return p


def _prepare_scored(preds: pd.DataFrame) -> pd.DataFrame:
    """Filtra etiquetas válidas y probabilidades finitas; devuelve DF limpio para ROC."""
    for col in ["proba_H", "proba_D", "proba_A"]:
        if col not in preds.columns:
            raise ValueError(f"Falta {col} en preds. Usa la versión del pipeline que añade proba_H/D/A.")

    p = preds.copy()
    y = p["y_true"].astype(str).str.upper().str.strip()
    mask_lbl = y.isin(LABELS)

    # Asegura que las proba_* son numéricas y finitas
    probs = p[["proba_H", "proba_D", "proba_A"]].apply(pd.to_numeric, errors="coerce")
    mask_prob = np.isfinite(probs).all(axis=1)

    if "has_label" in p.columns:
        mask = (p["has_label"] == 1) & mask_prob
    else:
        mask = mask_lbl & mask_prob

    p = p.loc[mask].copy()
    p["y_true_norm"] = p["y_true"].astype(str).str.upper().str.strip()

    # Orden estable (determinismo)
    p = p.sort_values(
        ["Season", "Date", "HomeTeam_norm", "AwayTeam_norm"],
        kind="mergesort"
    ).reset_index(drop=True)
    return p


def _binarize_labels(y_true, labels=LABELS):
    """Codificación one-hot (n × C) para etiquetas verdaderas."""
    idx_map = {c: i for i, c in enumerate(labels)}
    it = np.array([idx_map.get(val, -1) for val in y_true], dtype=int)
    Y = np.zeros((len(y_true), len(labels)), dtype=int)
    valid_rows = it >= 0
    Y[np.where(valid_rows)[0], it[valid_rows]] = 1
    return Y


def _multiclass_roc_block(y_true_series, P_mat, labels=LABELS):
    """
    Devuelve dict con:
      - per_class[label]: {fpr, tpr, thresholds, auc}
      - micro-average
      - macro_auc: media simple de AUCs por clase
      - n_scored
    """
    y_true = y_true_series.astype(str).str.upper().str.strip().to_numpy()
    P = np.clip(P_mat.astype(float), 0.0, 1.0)

    # Normaliza por fila (softmax-like)
    row_sums = P.sum(axis=1, keepdims=True)
    ok = row_sums.squeeze() > 0
    P[ok] = P[ok] / np.clip(row_sums[ok], EPS, None)

    Y = _binarize_labels(y_true, labels=labels)

    per_class = {}
    aucs = []

    for j, lab in enumerate(labels):
        fpr, tpr, thr = roc_curve(Y[:, j], P[:, j], drop_intermediate=True)
        auc_j = auc(fpr, tpr) if len(fpr) > 1 else np.nan
        per_class[lab] = {
            "fpr": fpr.tolist(),
            "tpr": tpr.tolist(),
            "thresholds": thr.tolist(),
            "auc": float(auc_j) if np.isfinite(auc_j) else np.nan
        }
        if np.isfinite(auc_j):
            aucs.append(auc_j)

    # Micro-average (flatten)
    fpr_micro, tpr_micro, thr_micro = roc_curve(Y.ravel(), P.ravel(), drop_intermediate=True)
    auc_micro = auc(fpr_micro, tpr_micro) if len(fpr_micro) > 1 else np.nan
    macro_auc = float(np.mean(aucs)) if aucs else np.nan

    return {
        "per_class": per_class,
        "micro": {
            "fpr": fpr_micro.tolist(),
            "tpr": tpr_micro.tolist(),
            "thresholds": thr_micro.tolist(),
            "auc": float(auc_micro) if np.isfinite(auc_micro) else np.nan
        },
        "macro_auc": float(macro_auc) if np.isfinite(macro_auc) else np.nan,
        "n_scored": int(len(y_true))
    }


# ---------------------- CONSTRUCCIÓN DEL PAYLOAD ----------------------
preds_seas = _ensure_season_in_preds(preds, df)
preds_scored = _prepare_scored(preds_seas)

# Overall
P_all = preds_scored[["proba_H", "proba_D", "proba_A"]].to_numpy(dtype=float)
overall_block = _multiclass_roc_block(preds_scored["y_true_norm"], P_all, labels=LABELS)

# Por temporada
by_season = []
for s, grp in preds_scored.groupby("Season", dropna=True):
    P = grp[["proba_H", "proba_D", "proba_A"]].to_numpy(dtype=float)
    block = _multiclass_roc_block(grp["y_true_norm"], P, labels=LABELS)
    block["Season"] = int(s)
    by_season.append(block)

# Payload final (orden estable)
payload = {
    "meta": {
        "labels": LABELS,
        "proba_cols": ["proba_H", "proba_D", "proba_A"],
        "row_axis": "y_true (one-vs-rest)",
        "col_axis": "score",
        "generated_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    },
    "overall": overall_block,
    "by_season": sorted(by_season, key=lambda x: x["Season"])
}

# ---------------------- GUARDAR JSON ----------------------
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "roc_curves_by_season.json"

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)

print(f"✔ ROC + AUC guardado en: {out_path}")

✔ ROC + AUC guardado en: outputs/roc_curves_by_season.json


  "generated_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")


## **BENEFICIOS**

Por último, pero no por ello menos importante vamos a estudiar la última métrica: El **ROI (Return on Investment)**.

$$
ROI = \frac{\text{Beneficio}}{\text{Inversión}}
$$

Con el código siguiente lo que estoy haciendo es simular una apuesta de un euro al resultado que predice mi modelo, en todos los partidos que hay en test. Si se acierta sumamos la cuota que ofrece Bet365 pero si falla se resta la unidad apostada. Con esto calculamos el beneficio neto y el ROI.

### Sin SMOTE

In [None]:
# ============================================================
# MATCHLOGS POR TEMPORADA → CSV
#  - Matchday desde df[Matchweek] por (Date,row_in_date) estable
#  - Re-adjunta cuotas Bet365 por pred_key_match con fallback (Date,row_in_date)
#  - Exporta: outputs/matchlogs_<Season>.csv
# ============================================================

OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def _norm_name(s: str) -> str:
    return re.sub(r'[^a-z0-9]+', '', str(s).strip().lower())

def _find_col(df, candidates):
    norm2real = {_norm_name(c): c for c in df.columns}
    for cand in candidates:
        if _norm_name(cand) in norm2real:
            return norm2real[_norm_name(cand)]
    return None

def _infer_team_cols(df: pd.DataFrame):
    home_candidates = ["HomeTeam_norm","HomeTeam","home_team","Home","local"]
    away_candidates = ["AwayTeam_norm","AwayTeam","away_team","Away","visitor","visiting"]
    home_col = _find_col(df, home_candidates)
    away_col = _find_col(df, away_candidates)
    if home_col is None or away_col is None:
        raise KeyError(f"No encuentro columnas Home/Away. Cols: {list(df.columns)[:40]}")
    return home_col, away_col

def _coalesce_suffix(mdf: pd.DataFrame, base: str) -> pd.DataFrame:
    cx, cy = f"{base}_x", f"{base}_y"
    if cx in mdf.columns or cy in mdf.columns:
        if cx in mdf.columns and cy in mdf.columns:
            mdf[base] = mdf[cx].where(mdf[cx].notna(), mdf[cy])
        elif cx in mdf.columns:
            mdf[base] = mdf[cx]
        else:
            mdf[base] = mdf[cy]
        mdf.drop(columns=[c for c in (cx, cy) if c in mdf.columns], inplace=True)
    return mdf

def _build_pred_key_like_pipeline(df_in: pd.DataFrame, home_col=None, away_col=None) -> pd.DataFrame:
    d = df_in.copy()
    d["Date"] = pd.to_datetime(d["Date"], errors="coerce").dt.tz_localize(None, nonexistent="NaT", ambiguous="NaT").dt.floor("D")
    if home_col is None or away_col is None:
        home_col, away_col = _infer_team_cols(d)
    d["Season"] = pd.to_numeric(d["Season"], errors="coerce").astype("Int64")

    home_norm = d[home_col].astype(str).map(_norm_name)
    away_norm = d[away_col].astype(str).map(_norm_name)
    d["pred_key_match"] = (
        d["Season"].astype("Int64").astype(str) + "|" +
        d["Date"].dt.strftime("%Y-%m-%d") + "|" +
        home_norm + "|" + away_norm
    )
    d["pred_key"] = (
        d["Season"].astype("Int64").astype(str) + "|" +
        d["Date"].dt.strftime("%Y-%m-%d") + "|" +
        d[home_col].astype(str) + "|" +
        d[away_col].astype(str)
    )
    return d

def _attach_matchday_from_df(merged_in: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    m = merged_in.copy()
    m["Date"] = pd.to_datetime(m["Date"], errors="coerce").dt.floor("D")
    df2 = df.copy()
    df2["Date"] = pd.to_datetime(df2["Date"], errors="coerce").dt.floor("D")

    mw_col = _find_col(df2, ["Matchweek","MatchWeek","matchweek","Jornada","Gameweek","GW","Week","MD"])
    if mw_col is None:
        raise KeyError("No se encontró columna de jornada (Matchweek) en df.")

    df2_sorted = df2.sort_values("Date", kind="mergesort").reset_index(drop=True)
    df2_sorted["row_in_date"] = df2_sorted.groupby("Date").cumcount()

    m_sorted = m.sort_values("Date", kind="mergesort").reset_index(drop=True)
    m_sorted["row_in_date"] = m_sorted.groupby("Date").cumcount()

    bring = df2_sorted[["Date","row_in_date", mw_col]].rename(columns={mw_col: "Matchday"})
    m_sorted = m_sorted.merge(bring, on=["Date","row_in_date"], how="left", validate="1:1")

    if m_sorted["Matchday"].isna().any():
        missing = m_sorted["Matchday"].isna()
        if "pred_key" not in m_sorted.columns or "pred_key" not in df2_sorted.columns:
            home_m, away_m = _infer_team_cols(m_sorted)
            m_sorted = _build_pred_key_like_pipeline(m_sorted, home_m, away_m)
            home_d, away_d = _infer_team_cols(df2_sorted)
            df2_sorted = _build_pred_key_like_pipeline(df2_sorted, home_d, away_d)
        m_sorted["pred_key_base"] = m_sorted["pred_key"].astype(str).str.split("#", n=1, expand=True)[0]
        df2_sorted["pred_key_base"] = df2_sorted["pred_key"].astype(str).str.split("#", n=1, expand=True)[0]

        aux = (df2_sorted[["pred_key_base", mw_col]]
               .drop_duplicates("pred_key_base")
               .rename(columns={mw_col: "Matchday_fb"}))
        m_sorted = m_sorted.merge(aux, on="pred_key_base", how="left")
        m_sorted.loc[missing, "Matchday"] = m_sorted.loc[missing, "Matchday_fb"]
        m_sorted.drop(columns=["pred_key_base","Matchday_fb"], inplace=True, errors="ignore")

    return m_sorted

# ---------- 1) Carga y saneo de merged ----------
m = merged.copy()
for base in ["Season","HomeTeam_norm","AwayTeam_norm","HomeTeam","AwayTeam","Date"]:
    m = _coalesce_suffix(m, base)

home_col_real, away_col_real = _infer_team_cols(m)
if "HomeTeam_norm" not in m.columns:
    m["HomeTeam_norm"] = m[home_col_real]
if "AwayTeam_norm" not in m.columns:
    m["AwayTeam_norm"] = m[away_col_real]

m["Date"] = pd.to_datetime(m["Date"], errors="coerce").dt.floor("D")
for c in ["B365H","B365D","B365A","pimp1","pimpx","pimp2","proba_H","proba_D","proba_A"]:
    if c in m.columns:
        m[c] = pd.to_numeric(m[c], errors="coerce")
m["Season"] = pd.to_numeric(m["Season"], errors="coerce").astype("Int64")

# ---------- 2) Matchday desde df ----------
m = _attach_matchday_from_df(m, df)

# ---------- 3) Claves y df con claves para re-mapear cuotas ----------
m = _build_pred_key_like_pipeline(m, "HomeTeam_norm", "AwayTeam_norm")
df_keyed = _build_pred_key_like_pipeline(df, None, None)
for c in ["B365H","B365D","B365A"]:
    if c in df_keyed.columns:
        df_keyed[c] = pd.to_numeric(df_keyed[c], errors="coerce")

odds_cols = ["B365H","B365D","B365A"]
have_odds_in_df = all(c in df_keyed.columns for c in odds_cols)

# Diagnóstico de colisiones
if have_odds_in_df:
    dup = (df_keyed
           .dropna(subset=["pred_key_match"])
           .groupby("pred_key_match")[odds_cols]
           .nunique(dropna=True)
           .max(axis=1))
    collisions = int((dup > 1).sum())
    if collisions > 0:
        print(f"⚠️  Aviso: {collisions} pred_key_match con cuotas inconsistentes en df. Se usará la primera aparición (orden estable).")

# ---------- 4) Re-adjuntar cuotas por pred_key_match (fix: incluye Date si existe) ----------
if have_odds_in_df:
    cols_for_map = ["pred_key_match", "Date"] + odds_cols
    cols_for_map = [c for c in cols_for_map if c in df_keyed.columns]  # por si acaso
    odds_map = (df_keyed[cols_for_map]
                .dropna(subset=["pred_key_match"])
                .sort_values(cols_for_map if "Date" in cols_for_map else ["pred_key_match"], kind="mergesort")
                .drop_duplicates("pred_key_match", keep="first"))
    m = m.merge(odds_map, on="pred_key_match", how="left", suffixes=("", "_dfmap"))
    for c in odds_cols:
        c_map = f"{c}_dfmap"
        if c_map in m.columns:
            m[c] = m[c_map].where(m[c_map].notna(), m.get(c))
            m.drop(columns=[c_map], inplace=True)
    # Limpia Date extra mapeada si viniera de odds_map
    if "Date_dfmap" in m.columns:
        m.drop(columns=["Date_dfmap"], inplace=True)

    # Fallback: (Date,row_in_date)
    if m[odds_cols].isna().any().any():
        df2_sorted = df_keyed.sort_values("Date", kind="mergesort").reset_index(drop=True)
        df2_sorted["row_in_date"] = df2_sorted.groupby("Date").cumcount()
        m2_sorted = m.sort_values("Date", kind="mergesort").reset_index(drop=True)
        m2_sorted["row_in_date"] = m2_sorted.groupby("Date").cumcount()
        bring_odds = df2_sorted[["Date","row_in_date"] + odds_cols]
        m2_sorted = m2_sorted.merge(bring_odds, on=["Date","row_in_date"], how="left", suffixes=("", "_fb2"))
        for c in odds_cols:
            c_fb = f"{c}_fb2"
            if c_fb in m2_sorted.columns:
                m2_sorted[c] = m2_sorted[c].where(m2_sorted[c].notna(), m2_sorted[c_fb])
        m = m2_sorted.drop(columns=[c for c in m2_sorted.columns if c.endswith("_fb2")], errors="ignore")

# ---------- 5) Métricas de probas ----------
if {"proba_H","proba_D","proba_A"}.issubset(m.columns):
    probs = m[["proba_H","proba_D","proba_A"]].to_numpy(dtype=float)
    m["conf_maxprob"] = np.nanmax(probs, axis=1)
    sorted_p = np.sort(probs, axis=1)
    m["margin_top12"] = sorted_p[:, -1] - sorted_p[:, -2]
    m["entropy"] = -(probs * np.log(np.clip(probs, 1e-15, 1.0))).sum(axis=1)
else:
    m["conf_maxprob"] = np.nan
    m["entropy"] = np.nan
    m["margin_top12"] = np.nan

# ---------- 6) Mercado y overround ----------
if {"B365H","B365D","B365A"}.issubset(m.columns):
    pH_imp = 1.0 / np.clip(m["B365H"].astype(float), 1.0, None)
    pD_imp = 1.0 / np.clip(m["B365D"].astype(float), 1.0, None)
    pA_imp = 1.0 / np.clip(m["B365A"].astype(float), 1.0, None)
    s_imp = pH_imp.fillna(0) + pD_imp.fillna(0) + pA_imp.fillna(0)
    m["overround"] = s_imp.where(s_imp > 0, np.nan)
    m["pH_mkt"] = (pH_imp / s_imp).where(s_imp > 0, np.nan)
    m["pD_mkt"] = (pD_imp / s_imp).where(s_imp > 0, np.nan)
    m["pA_mkt"] = (pA_imp / s_imp).where(s_imp > 0, np.nan)
else:
    m["overround"] = np.nan
    m["pH_mkt"] = np.nan
    m["pD_mkt"] = np.nan
    m["pA_mkt"] = np.nan

# ---------- 7) Pick: odds, prob, EV, Kelly ----------
def _pick_odds(row):
    if row.get("y_pred") == "H": return row.get("B365H", np.nan)
    if row.get("y_pred") == "D": return row.get("B365D", np.nan)
    if row.get("y_pred") == "A": return row.get("B365A", np.nan)
    return np.nan

def _pick_prob(row):
    y = str(row.get("y_pred"))
    if y == "H": return row.get("proba_H", np.nan)
    if y == "D": return row.get("proba_D", np.nan)
    if y == "A": return row.get("proba_A", np.nan)
    return np.nan

m["odds_pick"] = m.apply(_pick_odds, axis=1).astype(float)
m["p_pick"]    = m.apply(_pick_prob,  axis=1).astype(float)

b = np.where(np.isfinite(m["odds_pick"]), m["odds_pick"] - 1.0, np.nan)
m["ev_pick"] = m["p_pick"] * b - (1 - m["p_pick"])
kelly_raw = (m["p_pick"] * b - (1 - m["p_pick"])) / b
m["kelly_pick"] = np.clip(kelly_raw, 0.0, 1.0)
m.loc[~np.isfinite(b), "kelly_pick"] = np.nan

# ---------- 8) Resultado y profit (stake 1) ----------
y_true_norm = m["y_true"].astype(str).str.upper().str.strip()
pred_norm   = m["y_pred"].astype(str).str.upper().str.strip()
valid_label = y_true_norm.isin(["H","D","A"])
valid_odds  = np.isfinite(m["odds_pick"]) & (m["odds_pick"] >= 1.01)

m["bet_placed"] = (valid_label & valid_odds).astype(int)
m["correct"]    = ((y_true_norm == pred_norm) & (m["bet_placed"] == 1)).astype(int)
m["profit"]     = np.where(m["bet_placed"] == 1, -1.0, np.nan)
m.loc[m["correct"] == 1, "profit"] = m.loc[m["correct"] == 1, "odds_pick"] - 1.0

# ---------- 9) Profit acumulado por temporada ----------
m = m.sort_values(["Season","Matchday","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").reset_index(drop=True)
m["profit_filled"] = pd.to_numeric(m["profit"], errors="coerce").fillna(0.0)
m["cum_profit_season"] = m.groupby("Season", sort=False)["profit_filled"].transform("cumsum")
m.drop(columns=["profit_filled"], inplace=True)

# ---------- 10) Selección de columnas ----------
cols_head = [
    "Season","Matchday","Date","HomeTeam_norm","AwayTeam_norm","pred_key","pred_key_match",
    "y_true","y_pred",
    "proba_H","proba_D","proba_A","conf_maxprob","entropy","margin_top12",
    "B365H","B365D","B365A","overround","pH_mkt","pD_mkt","pA_mkt",
    "odds_pick","p_pick","ev_pick","kelly_pick",
    "bet_placed","correct","profit","cum_profit_season"
]
cols_exist = [c for c in cols_head if c in m.columns]
log = m[cols_exist].copy()

# ---------- 11) Exportar CSV por temporada ----------
for s, grp in log.groupby("Season", dropna=True):
    out_path = OUT_DIR / f"matchlogs_{int(s)}.csv"
    grp.sort_values(["Matchday","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").to_csv(out_path, index=False)

print("✔ Matchlogs por temporada generados en 'outputs/'. Matchday por (Date,row_in_date); cuotas re-adjuntadas por 'pred_key_match' con fallback por (Date,row_in_date).")

✔ Matchlogs por temporada generados en 'outputs/'. Matchday por (Date,row_in_date); cuotas re-adjuntadas por 'pred_key_match' con fallback por (Date,row_in_date).


### Con SMOTE:

## **COMPARACIÓN CON EL MODELO DE BET365**

El modelo basado en las cuotas de Bet365 consiste en predecir siempre el resultado más probable según la probabilidad implícita.

In [None]:
# ============================================================
# MÉTRICAS PRINCIPALES — MODELO MERCADO (argmax pimp1/pimpx/pimp2)
# Salidas:
#   - outputs/metrics_market_by_season.csv
#   - outputs/metrics_market_overall.json
# Requiere: merged (con y_true, Season, B365H/D/A, pimp1/pimpx/pimp2)
# ============================================================

OUT = Path("outputs")
OUT.mkdir(parents=True, exist_ok=True)

EPS = 1e-15
LABELS = np.array(["H","D","A"])

# ---------- 0) Validaciones y preparación ----------
m = merged.copy()

need_cols = ["y_true","Season","pimp1","pimpx","pimp2"]
missing = [c for c in need_cols if c not in m.columns]
if missing:
    raise KeyError(f"Faltan columnas en merged: {missing}")

# Tipos
m["Season"] = pd.to_numeric(m["Season"], errors="coerce").astype("Int64")
for c in ["pimp1","pimpx","pimp2","B365H","B365D","B365A"]:
    if c in m.columns:
        m[c] = pd.to_numeric(m[c], errors="coerce")
m["y_true_norm"] = m["y_true"].astype(str).str.upper().str.strip()

# ---------- 1) Probabilidades de mercado normalizadas y pick ----------
# (H, D, A) = (pimp1, pimpx, pimp2)
P_raw = m[["pimp1","pimpx","pimp2"]].to_numpy(dtype=float)

# Normalización por fila (respetando NaN en filas inválidas)
row_sum = np.nansum(P_raw, axis=1, keepdims=True)
row_sum = np.where(row_sum <= 0, np.nan, row_sum)
P_mkt = P_raw / row_sum

m["pH_mkt_pred"] = P_mkt[:, 0]
m["pD_mkt_pred"] = P_mkt[:, 1]
m["pA_mkt_pred"] = P_mkt[:, 2]

# Pick = argmax (si toda la fila es NaN, queda NaN)
with np.errstate(invalid="ignore"):
    best_idx = np.nanargmax(np.where(np.isnan(P_mkt), -np.inf, P_mkt), axis=1)
mask_valid = np.isfinite(P_mkt).any(axis=1)

y_pred_mkt = pd.Series(LABELS[best_idx], dtype="object")
y_pred_mkt = y_pred_mkt.where(mask_valid, np.nan)
m["y_pred_market"] = y_pred_mkt

# Confianza / entropía / margen
probs = np.column_stack([
    m["pH_mkt_pred"].to_numpy(dtype=float),
    m["pD_mkt_pred"].to_numpy(dtype=float),
    m["pA_mkt_pred"].to_numpy(dtype=float),
])
m["conf_maxprob"] = np.nanmax(probs, axis=1)
sorted_p = np.sort(probs, axis=1)
m["margin_top12"] = sorted_p[:, -1] - sorted_p[:, -2]
m["entropy"] = -(probs * np.log(np.clip(probs, EPS, 1.0))).sum(axis=1)

# ---------- 2) Filtro de filas evaluables ----------
valid_label = m["y_true_norm"].isin(["H","D","A"])
valid_prob = np.isfinite(probs).all(axis=1)
scored_mask = valid_label & valid_prob
scored = m.loc[scored_mask].copy()

# ---------- 3) Accuracy, LogLoss, Brier por temporada ----------
def brier_mc(y_true_series, P, labels=("H","D","A")):
    """Brier multiclas clásico, labels en orden fijo."""
    y = y_true_series.astype(str).str.upper().str.strip().to_numpy()
    idx = {c: i for i, c in enumerate(labels)}
    Y = np.zeros_like(P)
    Y[np.arange(len(y)), [idx[c] for c in y]] = 1.0
    return float(np.mean(np.sum((P - Y) ** 2, axis=1)))

rows = []
for s, g in scored.groupby("Season", dropna=True):
    y = g["y_true_norm"]
    P = g[["pH_mkt_pred","pD_mkt_pred","pA_mkt_pred"]].to_numpy(dtype=float)
    acc = float((g["y_pred_market"].astype(str).str.upper().str.strip() == y).mean())
    ll = float(log_loss(y, P, labels=["H","D","A"]))
    br = brier_mc(y, P, labels=("H","D","A"))
    rows.append({
        "Season": int(s),
        "accuracy": acc,
        "logloss": ll,
        "brier": br,
        "n_scored": int(len(g))
    })
metrics_by_season = pd.DataFrame(rows).sort_values("Season").reset_index(drop=True)

# ---------- 4) ROI y estadísticas de apuestas por temporada ----------
B365H = pd.to_numeric(m.get("B365H", np.nan), errors="coerce").to_numpy()
B365D = pd.to_numeric(m.get("B365D", np.nan), errors="coerce").to_numpy()
B365A = pd.to_numeric(m.get("B365A", np.nan), errors="coerce").to_numpy()

pred_arr = m["y_pred_market"].astype("object").astype(str).str.upper().str.strip().to_numpy()
yt_arr   = m["y_true_norm"].to_numpy()

odds_pick = np.where(
    pred_arr == "H", B365H,
    np.where(pred_arr == "D", B365D, np.where(pred_arr == "A", B365A, np.nan))
).astype(float)
valid_odds = np.isfinite(odds_pick) & (odds_pick >= 1.01)
bet_mask = valid_label.to_numpy() & valid_odds

m["__bet__"] = bet_mask
m["__win__"] = False
mask_bet_idx = np.where(bet_mask)[0]
m.loc[mask_bet_idx, "__win__"] = (pred_arr[bet_mask] == yt_arr[bet_mask])
m["__odds__"] = odds_pick

# Overround por fila (para promediar sobre apostados)
overround_row = (1 / np.clip(B365H, 1.0, None)) + (1 / np.clip(B365D, 1.0, None)) + (1 / np.clip(B365A, 1.0, None))
overround_row[~np.isfinite(overround_row)] = np.nan
m["__overround__"] = overround_row

roi_rows = []
for s, g in m.groupby("Season", dropna=True):
    gb = g[g["__bet__"] == True]
    n_bets = int(len(gb))
    if n_bets == 0:
        roi_rows.append({
            "Season": int(s),
            "roi": np.nan,
            "n_bets": 0,
            "n_wins": 0,
            "hit_rate": np.nan,
            "avg_odds_win": np.nan,
            "avg_overround": np.nan
        })
        continue
    n_wins = int(gb["__win__"].sum())
    profit = np.where(gb["__win__"], gb["__odds__"] - 1.0, -1.0)
    roi = float(profit.sum() / n_bets)
    hit_rate = n_wins / n_bets if n_bets > 0 else np.nan
    avg_odds_win = float(pd.to_numeric(gb.loc[gb["__win__"], "__odds__"], errors="coerce").mean()) if n_wins > 0 else np.nan
    avg_overround = float(pd.to_numeric(gb["__overround__"], errors="coerce").mean())
    roi_rows.append({
        "Season": int(s),
        "roi": roi,
        "n_bets": n_bets,
        "n_wins": n_wins,
        "hit_rate": float(hit_rate) if np.isfinite(hit_rate) else np.nan,
        "avg_odds_win": avg_odds_win,
        "avg_overround": avg_overround
    })
roi_by_season = pd.DataFrame(roi_rows)

# ---------- 5) Métricas finales por temporada (merge y columnas ordenadas) ----------
final_by_season = (
    metrics_by_season
    .merge(roi_by_season, on="Season", how="left")
    .sort_values("Season")
    .reset_index(drop=True)
)

# Orden de columnas EXACTO para el CSV:
cols_order = [
    "Season", "accuracy", "logloss", "brier", "n_scored",
    "roi", "n_bets", "n_wins", "hit_rate", "avg_odds_win", "avg_overround"
]
final_by_season = final_by_season.reindex(columns=cols_order)

# ---------- 6) Guardar CSV por temporada y resumen overall ----------
csv_path = OUT / "metrics_market_by_season.csv"
final_by_season.to_csv(csv_path, index=False)

def wavg(col, weight):
    c = pd.to_numeric(final_by_season[col], errors="coerce")
    w = pd.to_numeric(final_by_season[weight], errors="coerce").fillna(0)
    return float(np.nansum(c * w) / np.nansum(w)) if np.nansum(w) > 0 else np.nan

overall = {
    "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model": "market_argmax(pimp1,pimpx,pimp2)",
    "overall": {
        "n_scored_total": int(final_by_season["n_scored"].fillna(0).sum()),
        "n_bets_total": int(final_by_season["n_bets"].fillna(0).sum()),
        "accuracy_overall": wavg("accuracy", "n_scored"),
        "logloss_overall":  wavg("logloss",  "n_scored"),
        "brier_overall":    wavg("brier",    "n_scored"),
        "roi_overall":      wavg("roi",      "n_bets"),
        "hit_rate_overall": wavg("hit_rate", "n_bets"),
        "avg_overround_overall": float(pd.to_numeric(final_by_season["avg_overround"], errors="coerce").mean()),
        "avg_conf_overall": float(pd.to_numeric(m.loc[scored_mask, "conf_maxprob"], errors="coerce").mean()),
        "avg_entropy_overall": float(pd.to_numeric(m.loc[scored_mask, "entropy"], errors="coerce").mean()),
        "avg_margin_overall": float(pd.to_numeric(m.loc[scored_mask, "margin_top12"], errors="coerce").mean()),
    }
}

json_path = OUT / "metrics_market_overall.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(overall, f, ensure_ascii=False, indent=2)

print("✔ Métricas del modelo de mercado guardadas:")
print(" -", csv_path)
print(" -", json_path)
display(final_by_season.head(20))

✔ Métricas del modelo de mercado guardadas:
 - outputs/metrics_market_by_season.csv
 - outputs/metrics_market_overall.json


Unnamed: 0,Season,accuracy,logloss,brier,n_scored,roi,n_bets,n_wins,hit_rate,avg_odds_win,avg_overround
0,2010,0.610526,1.468413,0.54574,380,0.097342,380,232,0.610526,1.797371,1.065656
1,2011,0.536842,1.517757,0.563149,380,-0.079105,380,204,0.536842,1.715392,1.064498
2,2012,0.547368,1.458616,0.556594,380,-0.051132,380,208,0.547368,1.73351,1.063707
3,2013,0.539474,1.5105,0.564819,380,-0.092184,380,205,0.539474,1.68278,1.06363
4,2014,0.568421,1.581797,0.530062,380,-0.053,380,216,0.568421,1.666019,1.055137
5,2015,0.547368,1.541064,0.550708,380,-0.062579,380,208,0.547368,1.712596,1.051227
6,2016,0.584211,1.585006,0.531931,380,-0.006658,380,222,0.584211,1.700315,1.050764
7,2017,0.547368,1.481166,0.570501,380,-0.036026,380,208,0.547368,1.761106,1.052719
8,2018,0.481579,1.396761,0.602516,380,-0.141237,380,183,0.481579,1.783224,1.052628
9,2019,0.518421,1.391247,0.588021,380,-0.040605,380,197,0.518421,1.850609,1.054675


In [None]:
# ============================================================
# MATCHLOGS — MODELO MERCADO (argmax pimp1/pimpx/pimp2) → CSV por temporada
# Genera: outputs/matchlogs_market_<Season>.csv
# ============================================================

OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- Helpers robustos ----------
def _norm_name(s: str) -> str:
    return re.sub(r'[^a-z0-9]+', '', str(s).strip().lower())

def _find_col(df, candidates):
    norm2real = {_norm_name(c): c for c in df.columns}
    for cand in candidates:
        if _norm_name(cand) in norm2real:
            return norm2real[_norm_name(cand)]
    return None

def _infer_team_cols(df):
    home_candidates = ["HomeTeam_norm","HomeTeam","home_team","Home","local"]
    away_candidates = ["AwayTeam_norm","AwayTeam","away_team","Away","visitor","visiting"]
    home_col = _find_col(df, home_candidates)
    away_col = _find_col(df, away_candidates)
    if home_col is None or away_col is None:
        raise KeyError(f"No encuentro columnas Home/Away. Cols: {list(df.columns)[:40]}")
    return home_col, away_col

def _coalesce_suffix(mdf: pd.DataFrame, base: str) -> pd.DataFrame:
    cx, cy = f"{base}_x", f"{base}_y"
    if cx in mdf.columns or cy in mdf.columns:
        if cx in mdf.columns and cy in mdf.columns:
            mdf[base] = mdf[cx].where(mdf[cx].notna(), mdf[cy])
        elif cx in mdf.columns:
            mdf[base] = mdf[cx]
        else:
            mdf[base] = mdf[cy]
        mdf.drop(columns=[c for c in (cx, cy) if c in mdf.columns], inplace=True)
    return mdf

def _build_pred_key_like_pipeline(df_in, home_col=None, away_col=None):
    d = df_in.copy()
    d["Date"] = pd.to_datetime(d["Date"], errors="coerce")
    if home_col is None or away_col is None:
        home_col, away_col = _infer_team_cols(d)
    d["Season"] = pd.to_numeric(d["Season"], errors="coerce").astype("Int64")
    date_key = d["Date"].dt.tz_localize(None, nonexistent="NaT", ambiguous="NaT").dt.floor("D")
    d["pred_key"] = (
        d["Season"].astype("Int64").astype(str) + "|" +
        date_key.dt.strftime("%Y-%m-%d") + "|" +
        d[home_col].astype(str) + "|" +
        d[away_col].astype(str)
    )
    return d

def _attach_matchday_from_df(merged_in: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    Trae Matchday (= df[Matchweek]) usando alineación determinista por (Date,row_in_date).
    Fallback: por pred_key base (sin '#k') si hiciera falta.
    """
    m = merged_in.copy()
    m["Date"] = pd.to_datetime(m["Date"], errors="coerce")
    df2 = df.copy()
    df2["Date"] = pd.to_datetime(df2["Date"], errors="coerce")

    # Detecta columna Matchweek en df
    mw_col = _find_col(df2, ["Matchweek","MatchWeek","matchweek","Jornada","Gameweek","GW","Week","MD"])
    if mw_col is None:
        raise KeyError("No se encontró columna de jornada (Matchweek) en df.")

    # Alineación por (Date,row_in_date) con orden estable (determinista)
    df2_sorted = df2.sort_values("Date", kind="mergesort").reset_index(drop=True)
    df2_sorted["row_in_date"] = df2_sorted.groupby("Date").cumcount()

    m_sorted = m.sort_values("Date", kind="mergesort").reset_index(drop=True)
    m_sorted["row_in_date"] = m_sorted.groupby("Date").cumcount()

    bring = df2_sorted[["Date","row_in_date", mw_col]].rename(columns={mw_col: "Matchday"})
    m_sorted = m_sorted.merge(bring, on=["Date","row_in_date"], how="left", validate="1:1")

    # Fallback por pred_key base si quedaran NaN
    if m_sorted["Matchday"].isna().any():
        missing = m_sorted["Matchday"].isna()
        if "pred_key" not in m_sorted.columns or "pred_key" not in df2_sorted.columns:
            hm_m, aw_m = _infer_team_cols(m_sorted)
            m_sorted = _build_pred_key_like_pipeline(m_sorted, hm_m, aw_m)
            hm_d, aw_d = _infer_team_cols(df2_sorted)
            df2_sorted = _build_pred_key_like_pipeline(df2_sorted, hm_d, aw_d)
        m_sorted["pred_key_base"] = m_sorted["pred_key"].astype(str).str.split("#", n=1, expand=True)[0]
        df2_sorted["pred_key_base"] = df2_sorted["pred_key"].astype(str).str.split("#", n=1, expand=True)[0]
        aux = (df2_sorted[["pred_key_base", mw_col]]
               .drop_duplicates("pred_key_base")
               .rename(columns={mw_col:"Matchday_fb"}))
        m_sorted = m_sorted.merge(aux, on="pred_key_base", how="left")
        m_sorted.loc[missing, "Matchday"] = m_sorted.loc[missing, "Matchday_fb"]
        m_sorted.drop(columns=["pred_key_base","Matchday_fb"], inplace=True, errors="ignore")

    return m_sorted

# ---------- Carga y saneo ----------
m = merged.copy()

# Coalesce _x/_y si existen (incluye Date para máxima robustez)
for base in ["Season","HomeTeam_norm","AwayTeam_norm","HomeTeam","AwayTeam","Date"]:
    m = _coalesce_suffix(m, base)

# Home/Away canónicas
home_col_real, away_col_real = _infer_team_cols(m)
if "HomeTeam_norm" not in m.columns:
    m["HomeTeam_norm"] = m[home_col_real]
if "AwayTeam_norm" not in m.columns:
    m["AwayTeam_norm"] = m[away_col_real]

# Tipos/numéricos
m["Date"] = pd.to_datetime(m["Date"], errors="coerce")
for c in ["pimp1","pimpx","pimp2","B365H","B365D","B365A"]:
    if c in m.columns:
        m[c] = pd.to_numeric(m[c], errors="coerce")
m["Season"] = pd.to_numeric(m["Season"], errors="coerce").astype("Int64")
m["y_true"] = m["y_true"].astype(str).str.upper().str.strip()

# Construye pred_key (estable) para trazabilidad y fallback
m = _build_pred_key_like_pipeline(m, "HomeTeam_norm", "AwayTeam_norm")

# ---------- Probabilidades de mercado (normalizadas) y pick ----------
P_raw = m[["pimp1","pimpx","pimp2"]].to_numpy(dtype=float)  # (H, D, A)
row_sum = np.nansum(P_raw, axis=1, keepdims=True)
row_sum = np.where(row_sum <= 0, np.nan, row_sum)
P_mkt = P_raw / row_sum

m["pH_mkt_pred"] = P_mkt[:,0]
m["pD_mkt_pred"] = P_mkt[:,1]
m["pA_mkt_pred"] = P_mkt[:,2]

with np.errstate(invalid="ignore"):
    best_idx = np.nanargmax(np.where(np.isnan(P_mkt), -np.inf, P_mkt), axis=1)
mask_valid_row = np.isfinite(P_mkt).any(axis=1)
LABELS = np.array(["H","D","A"])
y_pred_market = pd.Series(LABELS[best_idx], dtype="object").where(mask_valid_row, np.nan)
m["y_pred_market"] = y_pred_market

# Confianza/entropía/margen sobre probs de mercado
probs = np.column_stack([m["pH_mkt_pred"], m["pD_mkt_pred"], m["pA_mkt_pred"]]).astype(float)
m["conf_maxprob"] = np.nanmax(probs, axis=1)
sorted_p = np.sort(probs, axis=1)
m["margin_top12"] = sorted_p[:,-1] - sorted_p[:,-2]
m["entropy"] = -(probs * np.log(np.clip(probs, 1e-15, 1.0))).sum(axis=1)

# ---------- Matchday real desde df ----------
m = _attach_matchday_from_df(m, df)

# ---------- Mercado: overround e implícitas (1/odds) ----------
if {"B365H","B365D","B365A"}.issubset(m.columns):
    pH_imp = 1.0/np.clip(m["B365H"].astype(float), 1.0, None)
    pD_imp = 1.0/np.clip(m["B365D"].astype(float), 1.0, None)
    pA_imp = 1.0/np.clip(m["B365A"].astype(float), 1.0, None)
    s_imp = pH_imp.fillna(0) + pD_imp.fillna(0) + pA_imp.fillna(0)
    m["overround"] = s_imp.where(s_imp > 0, np.nan)
else:
    m["overround"] = np.nan

# ---------- Pick: odds, prob, EV, Kelly ----------
def _pick_odds(row):
    if row.get("y_pred_market") == "H": return row.get("B365H", np.nan)
    if row.get("y_pred_market") == "D": return row.get("B365D", np.nan)
    if row.get("y_pred_market") == "A": return row.get("B365A", np.nan)
    return np.nan

def _pick_prob(row):
    y = str(row.get("y_pred_market"))
    if y == "H": return row.get("pH_mkt_pred", np.nan)
    if y == "D": return row.get("pD_mkt_pred", np.nan)
    if y == "A": return row.get("pA_mkt_pred", np.nan)
    return np.nan

m["odds_pick"] = m.apply(_pick_odds, axis=1).astype(float)
m["p_pick"]    = m.apply(_pick_prob,  axis=1).astype(float)

b = np.where(np.isfinite(m["odds_pick"]), m["odds_pick"] - 1.0, np.nan)
m["ev_pick"] = m["p_pick"] * b - (1 - m["p_pick"])
kelly_raw = (m["p_pick"]*b - (1 - m["p_pick"])) / b
m["kelly_pick"] = np.clip(kelly_raw, 0.0, 1.0)
m.loc[~np.isfinite(b), "kelly_pick"] = np.nan

# ---------- Resultado y profit (stake 1) ----------
valid_label = m["y_true"].isin(["H","D","A"])
valid_odds  = np.isfinite(m["odds_pick"]) & (m["odds_pick"] >= 1.01)

m["bet_placed"] = (valid_label & valid_odds).astype(int)
m["correct"]    = ((m["y_true"] == m["y_pred_market"].astype(str).str.upper().str.strip()) & (m["bet_placed"]==1)).astype(int)
m["profit"]     = np.where(m["bet_placed"]==1, -1.0, np.nan)
m.loc[m["correct"]==1, "profit"] = m.loc[m["correct"]==1, "odds_pick"] - 1.0

# ---------- Profit acumulado por temporada ----------
m = m.sort_values(["Season","Matchday","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").reset_index(drop=True)
m["profit_filled"] = pd.to_numeric(m["profit"], errors="coerce").fillna(0.0)
m["cum_profit_season"] = m.groupby("Season", sort=False)["profit_filled"].transform("cumsum")
m.drop(columns=["profit_filled"], inplace=True)

# ---------- Selección de columnas ----------
cols_head = [
    "Season","Matchday","Date","HomeTeam_norm","AwayTeam_norm","pred_key",
    "y_true","y_pred_market",
    "pH_mkt_pred","pD_mkt_pred","pA_mkt_pred","conf_maxprob","entropy","margin_top12",
    "B365H","B365D","B365A","overround",
    "odds_pick","p_pick","ev_pick","kelly_pick",
    "bet_placed","correct","profit","cum_profit_season"
]
cols_exist = [c for c in cols_head if c in m.columns]
log = m[cols_exist].copy()

# ---------- Exportar CSV por temporada ----------
for s, grp in log.groupby("Season", dropna=True):
    out_path = OUT_DIR / f"matchlogs_market_{int(s)}.csv"
    grp.sort_values(["Matchday","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").to_csv(out_path, index=False)

print("✔ Matchlogs del modelo de mercado generados en 'outputs/' (uno por temporada).")

✔ Matchlogs del modelo de mercado generados en 'outputs/' (uno por temporada).


# Radar plot

In [None]:
# ============================================================
# OUTPUTS PARA STREAMLIT — RADAR + BARRAS (SOLO PARTIDOS FUTUROS)
# Genera por temporada: outputs/radar_prematch/radar_prematch_{SEASON}.csv
# Contiene columnas brutas y normalizadas para radar y barras
# ============================================================

from __future__ import annotations
from pathlib import Path
import pandas as pd
import numpy as np
import json

# ---------------- RUTAS ----------------
try:
    ROOT
except NameError:
    ROOT = Path(".")
DATA = ROOT / "data" / "03_features"
SRC_PATH = DATA / "df_final.parquet"

OUT_DIR = ROOT / "outputs" / "radar_prematch"
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------------- CARGA ----------------
if not SRC_PATH.exists():
    raise FileNotFoundError(f"No se encuentra {SRC_PATH}")
df = pd.read_parquet(SRC_PATH).reset_index(drop=True)
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# ---------------- SEASON: normaliza nombre y tipo ----------------
if "Season" not in df.columns and "season" in df.columns:
    df = df.rename(columns={"season": "Season"})
df["Season"] = pd.to_numeric(df["Season"], errors="coerce").astype("Int64")

# ---------------- FUTUROS (arreglo tz) ----------------
today_naive = pd.Timestamp.now(tz="Europe/Madrid").normalize().tz_localize(None)
futuros = df.loc[df["Date"] >= today_naive].copy()
if futuros.empty:
    print("No hay partidos futuros. Nada que exportar.")
    raise SystemExit

# ---------------- MATCH_ID ESTABLE ----------------
def _mk_match_id(r):
    return f"{int(r['Season'])}__{pd.to_datetime(r['Date']).date()}__{r['HomeTeam_norm']}__{r['AwayTeam_norm']}"
futuros["match_id"] = futuros.apply(_mk_match_id, axis=1)

# ---------------- ESQUEMAS ----------------
# RADAR: 8 ejes (normalización fija a [0,1])
RADAR_METRICS = {
    "xG7": ("home_avg_xg_last7", "away_avg_xg_last7"),                              # 0–4
    "OnTarget7": ("home_avg_shotsontarget_last7", "away_avg_shotsontarget_last7"),  # 0–12
    "Corners7": ("home_avg_corners_last7", "away_avg_corners_last7"),               # 0–12
    "Effectiveness": ("home_effectiveness", "away_effectiveness"),                  # 0–1
    "FormPts6": ("home_form_points_6", "away_form_points_6"),                       # 0–18
    "FormGD6": ("home_form_gd_6", "away_form_gd_6"),                                # -10–10
    "Elo": ("h_elo", "a_elo"),                                                      # 1450–2150
    "RelPerf": ("home_relative_perf", "away_relative_perf"),                        # 0–2
}
RADAR_RANGES = {
    "xG7": (0.0, 4.0),
    "OnTarget7": (0.0, 12.0),
    "Corners7": (0.0, 12.0),
    "Effectiveness": (0.0, 1.0),
    "FormPts6": (0.0, 18.0),
    "FormGD6": (-10.0, 10.0),
    "Elo": (1450.0, 2150.0),
    "RelPerf": (0.0, 2.0),
}
NORM_VERSION = "radar_v1.0"

# BARRAS: valores brutos + normalizados (algunos invertidos)
BARS_RANGES = {
    "TotalPoints": (0.0, 114.0),    # 38*3
    "PointsPct": (0.0, 1.0),
    "Position": (1.0, 20.0),        # invertida
    "GDCum": (-30.0, 30.0),
    "Shots7": (0.0, 30.0),
    "Corners7_bar": (0.0, 12.0),
    "Fouls7": (5.0, 25.0),          # invertida
    "Yellows7": (0.0, 5.0),         # invertida
    "ImpProb": (0.0, 1.0),
}
BARS_INVERT = {"Position", "Fouls7", "Yellows7"}

# Persistimos los esquemas (trazabilidad)
schema_path = OUT_DIR / "schemas.json"
with open(schema_path, "w", encoding="utf-8") as f:
    json.dump({
        "norm_version": NORM_VERSION,
        "radar_ranges": RADAR_RANGES,
        "bars_ranges": BARS_RANGES,
        "bars_invert": sorted(list(BARS_INVERT)),
    }, f, ensure_ascii=False, indent=2)

# ---------------- HELPERS ----------------
def _norm(v, lo, hi, invert=False):
    if pd.isna(v):
        return np.nan
    x = (v - lo) / (hi - lo + 1e-12)
    x = float(np.clip(x, 0, 1))
    return 1.0 - x if invert else x

# % de puntos posibles (VERSIÓN VECTORIZADA)
def _points_pct_series(points: pd.Series, matches: pd.Series) -> pd.Series:
    points = pd.to_numeric(points, errors="coerce")
    matches = pd.to_numeric(matches, errors="coerce")
    out = pd.Series(np.nan, index=points.index, dtype=float)
    valid = matches > 0
    out.loc[valid] = points.loc[valid] / (3.0 * matches.loc[valid])
    return out.clip(lower=0.0, upper=1.0)

# Asegura columnas faltantes
def _ensure_cols(df_in, cols):
    for c in cols:
        if c not in df_in.columns:
            df_in[c] = np.nan

def _dedup(seq):
    seen = set()
    out = []
    for x in seq:
        if x not in seen:
            out.append(x)
            seen.add(x)
    return out

# ---------------- SELECCIÓN Y COLUMNAS BASE ----------------
id_cols = ["Season","Date","Matchweek","HomeTeam_norm","AwayTeam_norm","match_id"]
market_cols = ["B365H","B365D","B365A","pimp1","pimpx","pimp2","overround"]
_ensure_cols(futuros, id_cols + market_cols)

# ---------------- RADAR: brutos + norm ----------------
for label, (h_col, a_col) in RADAR_METRICS.items():
    _ensure_cols(futuros, [h_col, a_col])
    lo, hi = RADAR_RANGES[label]
    futuros[f"{h_col}_norm"] = futuros[h_col].map(lambda x: _norm(x, lo, hi))
    futuros[f"{a_col}_norm"] = futuros[a_col].map(lambda x: _norm(x, lo, hi))

# ---------------- BARRAS: cálculos y norm ----------------
_ensure_cols(futuros, ["home_total_matches_prev","away_total_matches_prev",
                       "home_total_points_cum","away_total_points_cum",
                       "home_prev_position","away_prev_position",
                       "home_gd_cum","away_gd_cum",
                       "home_avg_shots_last7","away_avg_shots_last7",
                       "home_avg_corners_last7","away_avg_corners_last7",
                       "home_avg_fouls_last7","away_avg_fouls_last7",
                       "home_avg_yellows_last7","away_avg_yellows_last7",
                       "pimp1","pimp2"])

# % puntos posibles
futuros["home_points_pct"] = _points_pct_series(
    futuros["home_total_points_cum"], futuros["home_total_matches_prev"]
)
futuros["away_points_pct"] = _points_pct_series(
    futuros["away_total_points_cum"], futuros["away_total_matches_prev"]
)

# Normalización barras (brutos + *_norm)
def _ensure_bar_norm(df_in, raw, label):
    col_norm = f"{raw}_norm"
    if col_norm in df_in.columns:
        return
    lo, hi = BARS_RANGES[label]
    inv = label in BARS_INVERT
    df_in[col_norm] = df_in[raw].map(lambda x: _norm(x, lo, hi, invert=inv))

_raw_label_map = {
    "home_total_points_cum": "TotalPoints", "away_total_points_cum": "TotalPoints",
    "home_points_pct": "PointsPct", "away_points_pct": "PointsPct",
    "home_prev_position": "Position", "away_prev_position": "Position",
    "home_gd_cum": "GDCum", "away_gd_cum": "GDCum",
    "home_avg_shots_last7": "Shots7", "away_avg_shots_last7": "Shots7",
    "home_avg_corners_last7": "Corners7_bar", "away_avg_corners_last7": "Corners7_bar",
    "home_avg_fouls_last7": "Fouls7", "away_avg_fouls_last7": "Fouls7",
    "home_avg_yellows_last7": "Yellows7", "away_avg_yellows_last7": "Yellows7",
    "pimp1": "ImpProb", "pimp2": "ImpProb",
}
for raw, lab in _raw_label_map.items():
    _ensure_bar_norm(futuros, raw, lab)

# ---------------- FLAGS & META ----------------
futuros["has_odds"] = futuros[["B365H","B365D","B365A"]].notna().all(axis=1)
futuros["generated_at"] = pd.Timestamp.utcnow()
futuros["norm_version"] = NORM_VERSION
futuros["schema_path"] = str(schema_path)

# ---------------- SELECCIÓN DE COLUMNAS PARA CSV ----------------
# Radar columns
radar_raw_cols, radar_norm_cols = [], []
for _, (h,a) in RADAR_METRICS.items():
    radar_raw_cols += [h, a]
    radar_norm_cols += [f"{h}_norm", f"{a}_norm"]

# Bars columns
bars_raw_cols = [
    "home_total_points_cum","away_total_points_cum",
    "home_points_pct","away_points_pct",
    "home_prev_position","away_prev_position",
    "home_gd_cum","away_gd_cum",
    "home_avg_shots_last7","away_avg_shots_last7",
    "home_avg_corners_last7","away_avg_corners_last7",
    "home_avg_fouls_last7","away_avg_fouls_last7",
    "home_avg_yellows_last7","away_avg_yellows_last7",
    "pimp1","pimp2","overround"
]
bars_norm_cols = [
    "home_total_points_cum_norm","away_total_points_cum_norm",
    "home_points_pct_norm","away_points_pct_norm",
    "home_prev_position_norm","away_prev_position_norm",
    "home_gd_cum_norm","away_gd_cum_norm",
    "home_avg_shots_last7_norm","away_avg_shots_last7_norm",
    "home_avg_corners_last7_norm","away_avg_corners_last7_norm",
    "home_avg_fouls_last7_norm","away_avg_fouls_last7_norm",
    "home_avg_yellows_last7_norm","away_avg_yellows_last7_norm",
    "pimp1_norm","pimp2_norm"
]

id_market_cols = ["Season","Date","Matchweek","HomeTeam_norm","AwayTeam_norm","match_id",
                  "B365H","B365D","B365A","pimp1","pimpx","pimp2","overround","has_odds"]
meta_cols = ["generated_at","norm_version","schema_path"]

# Deduplicar columnas finales para evitar InvalidIndexError
final_cols = [c for c in id_market_cols
              + radar_raw_cols + radar_norm_cols
              + bars_raw_cols + bars_norm_cols
              + meta_cols if c in futuros.columns]
final_cols = _dedup(final_cols)

# ---------------- ESCRITURA CSV POR TEMPORADA (idempotente) ----------------
summary = []
for season in sorted(futuros["Season"].dropna().unique().tolist()):
    part = futuros.loc[futuros["Season"] == season, final_cols].copy()
    out_path = OUT_DIR / f"radar_prematch_{int(season)}.csv"
    part = part.sort_values(["Date","HomeTeam_norm","AwayTeam_norm"]).reset_index(drop=True)

    # (1) Asegura unicidad de columnas y normaliza tipos antes de merge
    part = part.loc[:, _dedup(list(part.columns))]
    # generated_at a datetime para ordenar
    if "generated_at" in part.columns:
        part["generated_at"] = pd.to_datetime(part["generated_at"], errors="coerce", utc=True)

    if out_path.exists():
        prev = pd.read_csv(out_path)
        prev = prev.loc[:, _dedup(list(prev.columns))]
        # generated_at del CSV a datetime para ordenar
        if "generated_at" in prev.columns:
            prev["generated_at"] = pd.to_datetime(prev["generated_at"], errors="coerce", utc=True)

        merged = pd.concat([prev, part], ignore_index=True)

        # (2) Ordena por generated_at (NaT al final), dedup por match_id
        if "generated_at" in merged.columns:
            merged = merged.sort_values("generated_at", na_position="last")
        merged = merged.drop_duplicates(subset=["match_id"], keep="last")

        # (3) generated_at a ISO string estable antes de guardar
        if "generated_at" in merged.columns:
            merged["generated_at"] = merged["generated_at"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")

        merged.to_csv(out_path, index=False, encoding="utf-8-sig")
        n_new = max(0, len(merged) - len(prev))
    else:
        # generated_at a ISO string estable en primera escritura
        if "generated_at" in part.columns:
            part["generated_at"] = part["generated_at"].dt.strftime("%Y-%m-%dT%H:%M:%SZ")
        part.to_csv(out_path, index=False, encoding="utf-8-sig")
        n_new = len(part)

    summary.append((season, out_path, n_new))

print("STREAMLIT OUTPUT — RADAR + BARRAS")
for s, p, n in summary:
    print(f"  • Season {int(s)} → {p} (filas nuevas: {n})")
print(f"Esquemas guardados en: {schema_path}")

STREAMLIT OUTPUT — RADAR + BARRAS
  • Season 2025 → /content/outputs/radar_prematch/radar_prematch_2025.csv (filas nuevas: 0)
Esquemas guardados en: /content/outputs/radar_prematch/schemas.json
