<a href="https://colab.research.google.com/github/manuferrod/laliga-predictor-engine/blob/main/MODELOS_(8).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# --- Parámetros (se pueden sobreescribir en CI) ---
from pathlib import Path
from datetime import datetime
import os
import pandas as pd
import pytz

# Zona horaria para "hoy"
TZ = pytz.timezone("Europe/Madrid")

def _today_tz(tz=TZ) -> str:
    return datetime.now(tz).date().strftime("%Y-%m-%d")

# RUN_DATE: prioridad -> valor ya definido (papermill/globals) -> env -> hoy (Europe/Madrid)
_run_injected = globals().get("RUN_DATE", None)
if _run_injected not in (None, "", "auto", "today"):
    RUN_DATE = str(_run_injected)
else:
    RUN_DATE = os.environ.get("RUN_DATE", _today_tz())

# Normaliza a YYYY-MM-DD
RUN_DATE = pd.to_datetime(RUN_DATE, errors="coerce").date().strftime("%Y-%m-%d")

# SEASON: si no viene dada, se calcula a partir de RUN_DATE (formato 2025_26)
if "SEASON" in globals() and globals()["SEASON"]:
    SEASON = globals()["SEASON"]
else:
    _dt = pd.to_datetime(RUN_DATE)
    _y = int(_dt.year) if _dt.month >= 7 else int(_dt.year) - 1
    SEASON = f"{_y}_{(_y+1) % 100:02d}"

# MATCHDAY (jornada): permite inyección externa; por defecto None
MATCHDAY = globals().get("MATCHDAY", os.environ.get("MATCHDAY", None))

# Versión de modelo: respeta inyección / env, si no usa por defecto
MODEL_VERSION = globals().get("MODEL_VERSION", os.environ.get("MODEL_VERSION", "xgb-local"))

# --- Rutas coherentes local/CI ---
ROOT   = Path.cwd()
DATA   = ROOT / "data"
RAW    = DATA / "01_raw"
PROC   = DATA / "02_processed"
FEAT   = DATA / "03_features"
MODELS = DATA / "04_models"
OUT    = ROOT / "outputs"

for p in [RAW, PROC, FEAT, MODELS, OUT]:
    p.mkdir(parents=True, exist_ok=True)

# Reproducibilidad
import random, numpy as np
random.seed(42); np.random.seed(42)

print(f"RUN_DATE = {RUN_DATE} | SEASON = {SEASON} | MATCHDAY = {MATCHDAY} | MODEL_VERSION = {MODEL_VERSION}")
print(f"ROOT = {ROOT}")

RUN_DATE = 2025-10-14 | SEASON = 2025_26 | MATCHDAY = None | MODEL_VERSION = xgb-local
ROOT = /content


In [2]:
import pandas as pd, json

def load_feat(name: str):
    return pd.read_parquet(FEAT / name)

def save_model(obj, name: str):
    from joblib import dump
    MODELS.mkdir(parents=True, exist_ok=True)
    dump(obj, MODELS / name)

def save_predictions(df: pd.DataFrame, name: str = "predictions_next.csv"):
    OUT.mkdir(parents=True, exist_ok=True)
    df.to_csv(OUT / name, index=False)

def save_json(obj, name: str = "metrics_overview.json"):
    OUT.mkdir(parents=True, exist_ok=True)
    with open(OUT / name, "w", encoding="utf-8") as f:
        json.dump(obj, f, ensure_ascii=False, indent=2)

# **MODELOS**

In [13]:
import json
from collections import defaultdict
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix
import time
import hashlib

# **EL MODELO**

In [4]:
IN_PATH = FEAT / "df_final.parquet"
df = pd.read_parquet(IN_PATH)

print("Leído:", IN_PATH, "· filas=", len(df), "· cols=", df.shape[1])
df.head(2)

Leído: /content/data/03_features/df_final.parquet · filas= 7310 · cols= 121


Unnamed: 0,B365A,B365D,B365H,Date,FTR,HomeTeam_norm,AwayTeam_norm,h_elo,a_elo,Season,...,form_gd_6_diff,effectiveness_diff,relative_perf_diff,target,home_playstyle_defensivo,home_playstyle_equilibrado,home_playstyle_ofensivo,away_playstyle_defensivo,away_playstyle_equilibrado,away_playstyle_ofensivo
0,6.0,3.6,1.57,2006-08-26,H,valencia,betis,1857.375122,1726.076904,2006,...,0.0,0.0,0.05756,2.0,False,False,True,True,False,False
1,3.0,3.25,2.3,2006-08-27,D,recreativo,mallorca,1701.504761,1723.469849,2006,...,0.0,0.0,-1.263211,1.0,False,False,False,False,True,False


In [5]:
# Logit de mercado (home vs away)
df['market_home_logit'] = np.log((df['pimp1'] + 1e-9) / (df['pimp2'] + 1e-9))
df['market_draw_logit'] = np.log((df['pimpx'] + 1e-9) / ((df['pimp1'] + df['pimp2'])/2 + 1e-9))

# Diferencial de Elo
df['elo_diff'] = df['h_elo'] - df['a_elo']

In [6]:
FEATURES_S0   = ['pimp1', 'pimpx', 'pimp2']
FEATURES_S0p  = FEATURES_S0 + ['elo_diff']

FEATURES_S1 = ['pimp1','pimpx','pimp2','relative_perf_diff']
FEATURES_S1p = FEATURES_S1 + ['elo_diff']

FEATURES_S2 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff']
FEATURES_S2p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','has_xg_data']

FEATURES_S3 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff', 'form_points_6_diff']
FEATURES_S3p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff', 'form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum']

FEATURES_S4 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff', 'form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'prev_position_diff']
FEATURES_S4p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff', 'form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'home_prev_position', 'away_prev_position', 'elo_diff']

FEATURES_S5 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff']
FEATURES_S5p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'h2h_draw_rate_ewm_diff', 'h2h_loss_rate_ewm_diff']

FEATURES_S6 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev']

FEATURES_S7 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff']

FEATURES_S8 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'h2h_draw_rate_ewm_diff', 'h2h_loss_rate_ewm_diff','home_playstyle_equilibrado', 'home_playstyle_ofensivo', 'away_playstyle_defensivo', 'away_playstyle_ofensivo']

FEATURES_S9 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'h2h_draw_rate_ewm_diff', 'h2h_loss_rate_ewm_diff','home_playstyle_equilibrado', 'home_playstyle_ofensivo', 'away_playstyle_defensivo', 'away_playstyle_ofensivo', 'a_elo']

FEATURES_S10 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'h2h_draw_rate_ewm_diff', 'h2h_loss_rate_ewm_diff','home_playstyle_defensivo', 'home_playstyle_ofensivo', 'away_playstyle_defensivo', 'away_playstyle_ofensivo', 'a_elo', 'h2h_draw_rate_roll8_diff']

FEATURES_S11 = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff', 'away_playstyle_equilibrado']
FEATURES_S11p = ['pimp1','pimpx','pimp2','relative_perf_diff','avg_xg_last7_diff','form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff', 'away_gd_cum']


## Sin SMOTE:

In [65]:
# --- NEW: fuerza unicidad de pred_key con sufijo estable "#k" ---
def enforce_unique_pred_key(df_in, key_col="pred_key"):
    """
    Si hay claves duplicadas en `key_col`, añade '#k' (k=0,1,2,...) por orden estable
    dentro de cada grupo duplicado. Devuelve df modificado y nº de filas afectadas.
    """
    d = df_in.copy()
    base = d[key_col].astype(str)
    grp_sizes = base.map(base.value_counts())
    pos = base.groupby(base).cumcount()
    suffix = np.where(grp_sizes > 1, "#" + pos.astype(str), "")
    d[key_col] = base + suffix
    affected = int((grp_sizes > 1).sum())
    return d, affected


def walkforward_multinomial_accuracy(
    df,
    feature_cols,
    date_col='Date',
    label_col='FTR',
    n_seasons_window=4,
    season_size=380,
    recent_weight=3.0,
    older_weight=1.0,
    C=1.0,
    max_iter=1000,
    verbose_every=0  # pon >0 para logs cada N días
):
    """
    Evaluación día a día (igual que antes), pero ahora añade proba_H/D/A a la salida.
    """
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

    # Feature derivada opcional (igual que antes)
    if 'market_home_logit' in feature_cols and 'market_home_logit' not in df.columns:
        if {'pimp1','pimp2'}.issubset(df.columns):
            df['market_home_logit'] = np.log(
                (pd.to_numeric(df['pimp1'], errors='coerce') + 1e-9) /
                (pd.to_numeric(df['pimp2'], errors='coerce') + 1e-9)
            )
        else:
            raise ValueError("market_home_logit pedido en feature_cols pero faltan pimp1/pimp2 en df.")

    df = df.sort_values(date_col).reset_index(drop=True)
    uniq_dates = df[date_col].sort_values().unique()

    train_window = n_seasons_window * season_size
    recent_block = season_size

    pipe = Pipeline(steps=[
        ('imp', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('logit', LogisticRegression(solver='lbfgs', C=C, max_iter=max_iter))
    ])

    preds_all = []

    for d_i, current_date in enumerate(uniq_dates):
        test_mask = df[date_col] == current_date
        test_idx = np.where(test_mask)[0]
        if test_idx.size == 0:
            continue

        train_mask = df[date_col] < current_date
        train_idx_all = np.where(train_mask)[0]
        if train_idx_all.size < train_window:
            continue

        train_idx = train_idx_all[-train_window:]

        # Pesos
        sample_weight = np.full(train_idx.shape[0], older_weight, dtype=float)
        if recent_block > 0:
            sample_weight[-recent_block:] = recent_weight

        # X, y
        X_train = df.iloc[train_idx][feature_cols]
        y_train = df.iloc[train_idx][label_col]
        X_test  = df.iloc[test_idx][feature_cols]
        y_test  = df.iloc[test_idx][label_col]

        # Entrena y predice
        pipe.fit(X_train, y_train, **{'logit__sample_weight': sample_weight})
        y_pred  = pipe.predict(X_test)
        y_proba = pipe.predict_proba(X_test)

        # --- METADATA para merge estable (sin cambios) ---
        meta = df.iloc[test_idx][['Season','Date','HomeTeam_norm','AwayTeam_norm']].copy()
        meta['_date_key'] = pd.to_datetime(meta['Date'], errors='coerce')\
                               .dt.tz_localize(None, nonexistent='NaT', ambiguous='NaT')\
                               .dt.floor('D')
        meta['pred_key'] = (
            meta['Season'].astype('Int64').astype(str) + "|" +
            meta['_date_key'].dt.strftime("%Y-%m-%d") + "|" +
            meta['HomeTeam_norm'].astype(str) + "|" +
            meta['AwayTeam_norm'].astype(str)
        )

        # --- NUEVO: añadir columnas de probabilidad H/D/A en orden fijo ---
        classes = pipe.named_steps['logit'].classes_.astype(str)
        proba_cols_map = {c: y_proba[:, i] for i, c in enumerate(classes)}
        # Inicializa como NaN por si alguna clase no aparece en este fold
        proba_H = proba_cols_map.get('H', np.full(len(test_idx), np.nan))
        proba_D = proba_cols_map.get('D', np.full(len(test_idx), np.nan))
        proba_A = proba_cols_map.get('A', np.full(len(test_idx), np.nan))

        day_res = pd.DataFrame({
            'Date': meta['Date'].values,
            'y_true': y_test.values,
            'y_pred': y_pred,
            'proba_H': proba_H,
            'proba_D': proba_D,
            'proba_A': proba_A
        })

        # etiqueta válida (para accuracy)
        y_true_clean = day_res['y_true'].astype(str).str.upper().str.strip()
        day_res['has_label'] = y_true_clean.isin(['H', 'D', 'A']).astype(int)

        # anexamos Season/Home/Away/pred_key
        day_res[['Season','HomeTeam_norm','AwayTeam_norm','pred_key']] = \
            meta[['Season','HomeTeam_norm','AwayTeam_norm','pred_key']].values

        preds_all.append(day_res)

        if verbose_every and (d_i % verbose_every == 0):
            mask_lbl = day_res['has_label'] == 1
            if mask_lbl.any():
                acc_day = (day_res.loc[mask_lbl, 'y_true'] == day_res.loc[mask_lbl, 'y_pred']).mean()
                print(f"[{d_i+1}/{len(uniq_dates)}] {str(current_date)[:10]}  "
                      f"test_n={len(day_res)}  scored_n={int(mask_lbl.sum())}  acc={acc_day:.3f}")
            else:
                print(f"[{d_i+1}/{len(uniq_dates)}] {str(current_date)[:10]}  "
                      f"test_n={len(day_res)}  (sin labels válidas)")

    if not preds_all:
        raise RuntimeError("No se generaron predicciones; ¿hay suficientes datos previos para armar ventanas?")

    preds_all = pd.concat(preds_all, ignore_index=True)

    # Unicidad de pred_key (igual que antes)
    if 'pred_key' in preds_all.columns:
        preds_all, _ = enforce_unique_pred_key(preds_all, key_col='pred_key')

    # Accuracy oficial (sin cuotas)
    scored_mask = preds_all['has_label'] == 1
    if scored_mask.any():
        accuracy = (preds_all.loc[scored_mask, 'y_true'] == preds_all.loc[scored_mask, 'y_pred']).mean()
    else:
        raise RuntimeError("No hay partidos con etiqueta válida para calcular accuracy.")

    return float(accuracy), preds_all

In [66]:
# ===================== 1) ELIGE TU SET DE FEATURES =====================
FEATURES = FEATURES_S11p

# ===================== 2) PARÁMETROS WALK-FORWARD =======================
WF_KWARGS = dict(
    n_seasons_window=4,
    season_size=380,
    recent_weight=3.0,
    older_weight=1.0,
    C=1.0,
    max_iter=1000,
    verbose_every=0
)

# ===================== 3) EJECUCIÓN WALK-FORWARD (TU MODELO) ===========
acc_global_oficial, preds = walkforward_multinomial_accuracy(
    df,
    feature_cols=FEATURES,
    **WF_KWARGS
)

# ---------- util: alinear por (Date + orden en esa fecha) y CONSTRUIR pred_key ----------
def align_preds_by_date_order_and_build_predkey(preds, df):
    """
    1) Alinea preds con df por (Date, row_in_date) para añadir Season/Home/Away/B365/pimp*.
    2) Construye pred_key = Season|YYYY-MM-DD|Home|Away normalizando Date al DÍA y tz-naive.
    3) Si Season quedara NaN tras el merge principal, la rellena con un fallback Date->Season.
    4) NEW: fuerza unicidad de pred_key con sufijo '#k' si hay colisiones.
    """
    p = preds.copy()
    p['Date'] = pd.to_datetime(p['Date'], errors='coerce')
    # orden estable para que 'row_in_date' sea reproducible
    p = p.sort_values('Date', kind='mergesort').reset_index(drop=True)
    p['row_in_date'] = p.groupby('Date').cumcount()

    d = df.copy()
    d['Date'] = pd.to_datetime(d['Date'], errors='coerce')
    d = d.sort_values('Date', kind='mergesort').reset_index(drop=True)
    d['row_in_date'] = d.groupby('Date').cumcount()

    need_cols = [
        'Season','HomeTeam_norm','AwayTeam_norm',
        'B365H','B365D','B365A','pimp1','pimpx','pimp2'
    ]
    need_cols = [c for c in need_cols if c in d.columns]

    # Merge determinista por (Date + row_in_date)
    m = p.merge(
        d[['Date','row_in_date'] + need_cols],
        on=['Date','row_in_date'],
        how='left',
        validate='1:1'
    )

    # Fallback Season por día si faltara
    if ('Season' not in m.columns) or (m['Season'].isna().any()):
        date_season = df[['Date','Season']].copy()
        date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce').dt.floor('D')
        date_season = date_season.drop_duplicates(subset=['Date'])

        m['_Date_day'] = m['Date'].dt.floor('D')
        m = m.merge(date_season.rename(columns={'Date':'_Date_day','Season':'Season_from_day'}),
                    on='_Date_day', how='left')
        if 'Season' in m.columns:
            m['Season'] = m['Season'].fillna(m['Season_from_day'])
        else:
            m['Season'] = m['Season_from_day']
        m = m.drop(columns=['_Date_day','Season_from_day'])

    # Tipos numéricos robustos
    m['Season'] = pd.to_numeric(m['Season'], errors='coerce').astype('Int64')
    for col in ['B365H','B365D','B365A','pimp1','pimpx','pimp2']:
        if col in m.columns:
            m[col] = pd.to_numeric(m[col], errors='coerce')

    # pred_key estable (Season|YYYY-MM-DD|Home|Away) con Date al DÍA y tz-naive
    if {'Season','HomeTeam_norm','AwayTeam_norm'}.issubset(m.columns):
        date_key = m['Date'].dt.tz_localize(None).dt.floor('D')
        m['pred_key'] = (
            m['Season'].astype('Int64').astype(str) + "|" +
            date_key.dt.strftime("%Y-%m-%d") + "|" +
            m['HomeTeam_norm'].astype(str) + "|" +
            m['AwayTeam_norm'].astype(str)
        )
    else:
        m['pred_key'] = pd.NA

    # --- NEW: asegurar pred_key ÚNICA en el merged ---
    m, _ = enforce_unique_pred_key(m, key_col='pred_key')

    return m

merged = align_preds_by_date_order_and_build_predkey(preds, df)

# ===================== 4) ACCURACY OFICIAL POR TEMPORADA (SIN cuotas) ========
date_season = df[['Date','Season']].copy()
date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce')
date_season = date_season.drop_duplicates(subset=['Date'])

preds_seas = preds.copy()
preds_seas['Date'] = pd.to_datetime(preds_seas['Date'], errors='coerce')
preds_seas = preds_seas.merge(date_season, on='Date', how='left', validate='m:1')

if 'Season' not in preds_seas.columns:
    if 'Season_y' in preds_seas.columns:
        preds_seas['Season'] = preds_seas['Season_y']
    elif 'Season_x' in preds_seas.columns:
        preds_seas['Season'] = preds_seas['Season_x']
preds_seas.drop(columns=[c for c in ['Season_x','Season_y'] if c in preds_seas.columns], inplace=True)
preds_seas['Season'] = pd.to_numeric(preds_seas['Season'], errors='coerce').astype('Int64')

preds_seas['correct'] = (preds_seas['y_true'] == preds_seas['y_pred']).astype(int)
acc_by_season_oficial = (
    preds_seas[preds_seas['has_label'] == 1]
    .groupby('Season', dropna=True)['correct']
    .agg(matches='size', accuracy='mean')
    .reset_index()
    .sort_values('Season')
)

# ===================== 5) ROI Y ACCURACY ENTRE APUESTAS (CON cuotas) =========
def compute_accuracy_roi(merged_df, pred_col='y_pred'):
    """
    Accuracy & ROI ENTRE APUESTAS (solo filas con label H/D/A y cuota válida >= 1.01).
    No modifica el accuracy oficial de tu función (que no depende de cuotas).
    """
    m = merged_df.copy()
    n = len(m)

    y_true_arr = m['y_true'].astype(str).str.upper().str.strip().to_numpy()
    pred_arr   = m[pred_col].astype(str).str.upper().str.strip().to_numpy()

    valid_label = np.isin(y_true_arr, ['H','D','A'])

    odds_pred = np.where(
        pred_arr == 'H', m['B365H'].to_numpy() if 'B365H' in m.columns else np.nan,
        np.where(pred_arr == 'D', m['B365D'].to_numpy() if 'B365D' in m.columns else np.nan,
                 np.where(pred_arr == 'A', m['B365A'].to_numpy() if 'B365A' in m.columns else np.nan, np.nan))
    ).astype(float)

    valid_odds = np.isfinite(odds_pred) & (odds_pred >= 1.01)
    scored = valid_label & valid_odds

    is_correct = np.zeros(n, dtype=bool)
    is_correct[scored] = (pred_arr[scored] == y_true_arr[scored])

    acc_bets = is_correct[scored].mean() if scored.any() else np.nan

    profit = np.full(n, np.nan, dtype=float)
    profit[scored] = -1.0
    profit[scored & is_correct] = odds_pred[scored & is_correct] - 1.0

    n_bets = int(np.isfinite(profit).sum())
    total_profit = float(np.nansum(profit))
    roi_global = (total_profit / n_bets) if n_bets > 0 else np.nan

    if 'Season' in m.columns:
        scored_idx = np.isfinite(profit)
        by_season = m.loc[scored_idx, ['Season']].copy()
        by_season['correct'] = is_correct[scored_idx].astype(int)
        by_season['profit']  = profit[scored_idx]

        acc_by_season_bets = (
            by_season.groupby('Season', dropna=True)['correct']
                     .agg(matches='size', accuracy='mean')
                     .reset_index()
                     .sort_values('Season')
        )
        roi_by_season = (
            by_season.groupby('Season', dropna=True)['profit']
                     .agg(bets='size', total_profit='sum')
                     .reset_index()
                     .sort_values('Season')
        )
        roi_by_season['roi'] = roi_by_season['total_profit'] / roi_by_season['bets']
    else:
        acc_by_season_bets = pd.DataFrame(columns=['Season','matches','accuracy'])
        roi_by_season = pd.DataFrame(columns=['Season','bets','total_profit','roi'])

    return acc_bets, roi_global, n_bets, total_profit, acc_by_season_bets, roi_by_season

# Métricas de tu modelo (ENTRE apuestas)
acc_bets_model, roi_g_model, bets_model, prof_model, acc_seas_bets_model, roi_seas_model = compute_accuracy_roi(
    merged, pred_col='y_pred'
)

# Baseline mercado
market_labels = np.array(['H','D','A'])
probs = merged[['pimp1','pimpx','pimp2']].to_numpy(dtype=float)
probs_filled = np.where(np.isnan(probs), -np.inf, probs)
argmax_idx = np.argmax(probs_filled, axis=1)

merged_market = merged.copy()
merged_market['y_pred_market'] = market_labels[argmax_idx]

acc_bets_mkt, roi_g_mkt, bets_mkt, prof_mkt, acc_seas_bets_mkt, roi_seas_mkt = compute_accuracy_roi(
    merged_market, pred_col='y_pred_market'
)

# ===================== 6) REPORTING =========================================
print("\n=== CONFIGURACIÓN ===")
print("Features:", FEATURES)
print("WF kwargs:", WF_KWARGS)

print("\n=== ACCURACY OFICIAL (función walkforward, SIN cuotas) ===")
print(f"Global: {acc_global_oficial:.4f}")
print("\nAccuracy por temporada (oficial):")
print(acc_by_season_oficial.to_string(index=False))

print("\n=== TU MODELO — ENTRE APUESTAS (CON cuotas) ===")
print(f"Accuracy entre apuestas: {acc_bets_model:.4f}")
print(f"ROI global             : {roi_g_model:.4f}   |  Bets: {bets_model}   |  Profit: {prof_model:.2f}")
print("\nROI por temporada (tu modelo):")
print(roi_seas_model[['Season','bets','roi','total_profit']].to_string(index=False))

print("\n=== BASELINE MERCADO (argmax pimp1/pimpx/pimp2) — ENTRE APUESTAS ===")
print(f"Accuracy entre apuestas: {acc_bets_mkt:.4f}")
print(f"ROI global             : {roi_g_mkt:.4f}   |  Bets: {bets_mkt}   |  Profit: {prof_mkt:.2f}")
print("\nROI por temporada (mercado):")
print(roi_seas_mkt[['Season','bets','roi','total_profit']].to_string(index=False))


=== CONFIGURACIÓN ===
Features: ['pimp1', 'pimpx', 'pimp2', 'relative_perf_diff', 'avg_xg_last7_diff', 'form_points_6_diff', 'home_total_gd_cum', 'away_total_gd_cum', 'h2h_win_rate_ewm_diff', 'home_total_matches_prev', 'away_total_matches_prev', 'home_avg_shotsontarget_last7', 'avg_shots_last7_diff', 'away_gd_cum']
WF kwargs: {'n_seasons_window': 4, 'season_size': 380, 'recent_weight': 3.0, 'older_weight': 1.0, 'C': 1.0, 'max_iter': 1000, 'verbose_every': 0}

=== ACCURACY OFICIAL (función walkforward, SIN cuotas) ===
Global: 0.5458

Accuracy por temporada (oficial):
 Season  matches  accuracy
   2010      380  0.602632
   2011      380  0.531579
   2012      380  0.531579
   2013      380  0.555263
   2014      380  0.552632
   2015      380  0.542105
   2016      380  0.594737
   2017      380  0.536842
   2018      380  0.497368
   2019      380  0.518421
   2020      380  0.523684
   2021      380  0.526316
   2022      380  0.544737
   2023      380  0.573684
   2024      380  0.5

## Con SMOTE:

In [44]:
# # ============================================
# # Walk-forward multinomial con SMOTE + calibración
# # - Usa TU enforce_unique_pred_key (con sufijo "#k")
# # - Mantiene misma clave pred_key y estructura de salida
# # ============================================

# def walkforward_multinomial_accuracy_smote_calibrated(
#     df,
#     feature_cols,
#     date_col='Date',
#     label_col='FTR',
#     n_seasons_window=4,
#     season_size=380,
#     recent_weight=3.0,
#     older_weight=1.0,
#     C=1.0,
#     max_iter=1000,
#     # --- SMOTE & Calibración ---
#     smote_k_neighbors=5,
#     smote_sampling_strategy='auto',
#     smote_random_state=42,
#     calibrate=True,
#     calibration_method='sigmoid',   # 'sigmoid' (Platt) o 'isotonic'
#     calibration_cv=3,
#     # --- Otros ---
#     random_state=42,
#     verbose_every=0
# ):
#     """
#     Igual que tu walkforward_multinomial_accuracy, pero:
#       - Aplica SMOTE en el ENTRENAMIENTO de cada día (tras imputar y escalar).
#       - Aplica calibración de probabilidades multiclase con CalibratedClassifierCV.
#       - Mantiene la ventana temporal y una aproximación al peso temporal reciente
#         replicando el último bloque de season_size muestras antes de SMOTE.

#     Devuelve:
#       accuracy (float), preds_all (DataFrame con y_true, y_pred, proba, pred_key, meta)
#     """
#     d0 = df.copy()
#     d0[date_col] = pd.to_datetime(d0[date_col], errors='coerce')

#     # Feature derivada opcional (misma lógica que tu función)
#     if 'market_home_logit' in feature_cols and 'market_home_logit' not in d0.columns:
#         if {'pimp1','pimp2'}.issubset(d0.columns):
#             d0['market_home_logit'] = np.log(
#                 (pd.to_numeric(d0['pimp1'], errors='coerce') + 1e-9) /
#                 (pd.to_numeric(d0['pimp2'], errors='coerce') + 1e-9)
#             )
#         else:
#             raise ValueError("market_home_logit pedido en feature_cols pero faltan pimp1/pimp2 en df.")

#     d0 = d0.sort_values(date_col).reset_index(drop=True)
#     uniq_dates = d0[date_col].sort_values().unique()

#     train_window = n_seasons_window * season_size
#     recent_block = season_size

#     # Clasificador base
#     base_logit = LogisticRegression(
#         solver='lbfgs',
#         C=C,
#         max_iter=max_iter,
#         random_state=random_state
#     )

#     # Pipeline con SMOTE (imputar, escalar, smote, logit)
#     # OJO: usamos ImbPipeline para que SMOTE actúe solo en entrenamiento
#     pipe_base = ImbPipeline(steps=[
#         ('imp', SimpleImputer(strategy='median')),
#         ('scaler', StandardScaler(with_mean=True, with_std=True)),
#         ('smote', SMOTE(
#             sampling_strategy=smote_sampling_strategy,
#             k_neighbors=smote_k_neighbors,
#             random_state=smote_random_state
#         )),
#         ('logit', base_logit)
#     ])

#     preds_all = []

#     # Factor de replicación aproximado para el bloque reciente
#     # (equivale a recent_weight/older_weight redondeado al entero más cercano, >=1)
#     # Si older_weight es 0 (raro), por seguridad fijamos a 1.
#     denom = older_weight if older_weight > 0 else 1.0
#     recent_dup_factor = int(max(1, round(float(recent_weight) / float(denom))))

#     for d_i, current_date in enumerate(uniq_dates):
#         test_mask = d0[date_col] == current_date
#         test_idx = np.where(test_mask)[0]
#         if test_idx.size == 0:
#             continue

#         # Entrenamiento ANTERIOR al día
#         train_mask = d0[date_col] < current_date
#         train_idx_all = np.where(train_mask)[0]
#         if train_idx_all.size < train_window:
#             continue

#         train_idx = train_idx_all[-train_window:]

#         # --- Construir X_train / y_train con replicación del bloque reciente ---
#         X_train_full = d0.iloc[train_idx][feature_cols]
#         y_train_full = d0.iloc[train_idx][label_col].astype(str).str.upper().str.strip()

#         # Asegura que solo entrenamos con H/D/A
#         valid_mask = y_train_full.isin(['H', 'D', 'A'])
#         X_train_full = X_train_full.loc[valid_mask]
#         y_train_full = y_train_full.loc[valid_mask]
#         if len(y_train_full) < 3:
#             # no hay suficiente para multiclass este día
#             continue

#         # Índices relativos del bloque reciente en el training recortado
#         # (últimos 'recent_block' partidos dentro de X_train_full)
#         if recent_block > 0 and recent_dup_factor > 1:
#             n_train = len(X_train_full)
#             cut = max(0, n_train - recent_block)
#             X_older = X_train_full.iloc[:cut]
#             y_older = y_train_full.iloc[:cut]
#             X_recent = X_train_full.iloc[cut:]
#             y_recent = y_train_full.iloc[cut:]

#             # Replicamos el bloque reciente para aproximar pesos temporales
#             X_recent_dup = pd.concat([X_recent] * recent_dup_factor, axis=0, ignore_index=True)
#             y_recent_dup = pd.concat([y_recent] * recent_dup_factor, axis=0, ignore_index=True)

#             X_train_w = pd.concat([X_older, X_recent_dup], axis=0, ignore_index=True)
#             y_train_w = pd.concat([y_older, y_recent_dup], axis=0, ignore_index=True)
#         else:
#             X_train_w = X_train_full
#             y_train_w = y_train_full

#         # Test del día
#         X_test = d0.iloc[test_idx][feature_cols]
#         y_test = d0.iloc[test_idx][label_col]

#         # --- Ajuste con SMOTE ---
#         if calibrate:
#             # Calibración multiclase (One-vs-Rest internamente)
#             # CalibratedClassifierCV clona el estimador y aplica el pipeline por fold (SMOTE en train-fold).
#             clf = CalibratedClassifierCV(
#                 estimator=pipe_base,
#                 method=calibration_method,
#                 cv=calibration_cv
#             )
#         else:
#             clf = pipe_base

#         # Fit y predicción
#         clf.fit(X_train_w, y_train_w)

#         # Probabilidades multiclase (garantizamos orden H/D/A)
#         proba = clf.predict_proba(X_test)
#         # CalibratedClassifierCV devuelve lista de proba por clase; si multiclass, predict_proba es (n, n_classes)
#         # Aseguramos mapeo en el mismo orden que las clases que expone el último paso
#         # Obtenemos las clases de forma segura:
#         if hasattr(clf, "classes_"):
#             classes = list(clf.classes_)
#         else:
#             # fallback para estimador interno
#             classes = list(clf.estimator.named_steps['logit'].classes_)

#         idx_map = {c: classes.index(c) for c in classes}
#         def colp(c):
#             return proba[:, idx_map[c]] if c in idx_map else np.full(proba.shape[0], np.nan)

#         pH = colp('H'); pD = colp('D'); pA = colp('A')
#         y_pred = np.array(['H','D','A'])[np.nanargmax(np.vstack([pH, pD, pA]), axis=0)]

#         # --- METADATA / pred_key idéntico a tu función ---
#         meta = d0.iloc[test_idx][['Season','Date','HomeTeam_norm','AwayTeam_norm']].copy()
#         meta['_date_key'] = pd.to_datetime(meta['Date'], errors='coerce')\
#                                 .dt.tz_localize(None, nonexistent='NaT', ambiguous='NaT')\
#                                 .dt.floor('D')
#         meta['pred_key'] = (
#             meta['Season'].astype('Int64').astype(str) + "|" +
#             meta['_date_key'].dt.strftime("%Y-%m-%d") + "|" +
#             meta['HomeTeam_norm'].astype(str) + "|" +
#             meta['AwayTeam_norm'].astype(str)
#         )

#         day_res = pd.DataFrame({
#             'Date': meta['Date'].values,
#             'y_true': y_test.values.astype(object),  # conserva NaN/strings
#             'y_pred': y_pred,
#             'pH_pred': pH,
#             'pD_pred': pD,
#             'pA_pred': pA,
#         })

#         # etiqueta válida (para accuracy)
#         y_true_clean = day_res['y_true'].astype(str).str.upper().str.strip()
#         day_res['has_label'] = y_true_clean.isin(['H', 'D', 'A']).astype(int)

#         # anexamos Season/Home/Away/pred_key
#         day_res[['Season','HomeTeam_norm','AwayTeam_norm','pred_key']] = \
#             meta[['Season','HomeTeam_norm','AwayTeam_norm','pred_key']].values

#         preds_all.append(day_res)

#         if verbose_every and (d_i % verbose_every == 0):
#             mask_lbl = day_res['has_label'] == 1
#             if mask_lbl.any():
#                 acc_day = (day_res.loc[mask_lbl, 'y_true'] == day_res.loc[mask_lbl, 'y_pred']).mean()
#                 print(f"[SMOTE+Calib {d_i+1}/{len(uniq_dates)}] {str(current_date)[:10]}  "
#                       f"test_n={len(day_res)}  scored_n={int(mask_lbl.sum())}  acc={acc_day:.3f}")

#     if not preds_all:
#         raise RuntimeError("No se generaron predicciones; ¿hay suficientes datos previos para armar ventanas?")

#     preds_all = pd.concat(preds_all, ignore_index=True)

#     # Fuerza unicidad de pred_key CON TU VERSIÓN (sufijo '#k')
#     if 'pred_key' in preds_all.columns:
#         preds_all, _ = enforce_unique_pred_key(preds_all, key_col='pred_key')

#     # Accuracy SOLO sobre filas con etiqueta válida
#     scored_mask = preds_all['has_label'] == 1
#     if scored_mask.any():
#         accuracy = (preds_all.loc[scored_mask, 'y_true'] == preds_all.loc[scored_mask, 'y_pred']).mean()
#     else:
#         raise RuntimeError("No hay partidos con etiqueta válida para calcular accuracy.")

#     return float(accuracy), preds_all

In [18]:
# # ===================== 1) ELIGE TU SET DE FEATURES =====================
# FEATURES = FEATURES_S11p

# # ===================== 2) PARÁMETROS WALK-FORWARD =======================
# WF_KWARGS = dict(
#     n_seasons_window=4,
#     season_size=380,
#     recent_weight=3.0,
#     older_weight=1.0,
#     C=1.0,
#     max_iter=1000,
#     verbose_every=0
# )

# # ===================== 3) EJECUCIÓN WALK-FORWARD (TU MODELO) ===========
# acc_global_oficial, preds = walkforward_multinomial_accuracy_smote_calibrated(
#     df,
#     feature_cols=FEATURES,
#     **WF_KWARGS
# )

# # ---------- util: alinear por (Date + orden en esa fecha) y CONSTRUIR pred_key ----------
# def align_preds_by_date_order_and_build_predkey(preds, df):
#     """
#     1) Alinea preds con df por (Date, row_in_date) para añadir Season/Home/Away/B365/pimp*.
#     2) Construye pred_key = Season|YYYY-MM-DD|Home|Away normalizando Date al DÍA y tz-naive.
#     3) Si Season quedara NaN tras el merge principal, la rellena con un fallback Date->Season.
#     4) NEW: fuerza unicidad de pred_key con sufijo '#k' si hay colisiones.
#     """
#     p = preds.copy()
#     p['Date'] = pd.to_datetime(p['Date'], errors='coerce')
#     # orden estable para que 'row_in_date' sea reproducible
#     p = p.sort_values('Date', kind='mergesort').reset_index(drop=True)
#     p['row_in_date'] = p.groupby('Date').cumcount()

#     d = df.copy()
#     d['Date'] = pd.to_datetime(d['Date'], errors='coerce')
#     d = d.sort_values('Date', kind='mergesort').reset_index(drop=True)
#     d['row_in_date'] = d.groupby('Date').cumcount()

#     need_cols = [
#         'Season','HomeTeam_norm','AwayTeam_norm',
#         'B365H','B365D','B365A','pimp1','pimpx','pimp2'
#     ]
#     need_cols = [c for c in need_cols if c in d.columns]

#     # Merge determinista por (Date + row_in_date)
#     m = p.merge(
#         d[['Date','row_in_date'] + need_cols],
#         on=['Date','row_in_date'],
#         how='left',
#         validate='1:1'
#     )

#     # Fallback Season por día si faltara
#     if ('Season' not in m.columns) or (m['Season'].isna().any()):
#         date_season = df[['Date','Season']].copy()
#         date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce').dt.floor('D')
#         date_season = date_season.drop_duplicates(subset=['Date'])

#         m['_Date_day'] = m['Date'].dt.floor('D')
#         m = m.merge(date_season.rename(columns={'Date':'_Date_day','Season':'Season_from_day'}),
#                     on='_Date_day', how='left')
#         if 'Season' in m.columns:
#             m['Season'] = m['Season'].fillna(m['Season_from_day'])
#         else:
#             m['Season'] = m['Season_from_day']
#         m = m.drop(columns=['_Date_day','Season_from_day'])

#     # Tipos numéricos robustos
#     m['Season'] = pd.to_numeric(m['Season'], errors='coerce').astype('Int64')
#     for col in ['B365H','B365D','B365A','pimp1','pimpx','pimp2']:
#         if col in m.columns:
#             m[col] = pd.to_numeric(m[col], errors='coerce')

#     # pred_key estable (Season|YYYY-MM-DD|Home|Away) con Date al DÍA y tz-naive
#     if {'Season','HomeTeam_norm','AwayTeam_norm'}.issubset(m.columns):
#         date_key = m['Date'].dt.tz_localize(None).dt.floor('D')
#         m['pred_key'] = (
#             m['Season'].astype('Int64').astype(str) + "|" +
#             date_key.dt.strftime("%Y-%m-%d") + "|" +
#             m['HomeTeam_norm'].astype(str) + "|" +
#             m['AwayTeam_norm'].astype(str)
#         )
#     else:
#         m['pred_key'] = pd.NA

#     # --- NEW: asegurar pred_key ÚNICA en el merged ---
#     m, _ = enforce_unique_pred_key(m, key_col='pred_key')

#     return m

# merged = align_preds_by_date_order_and_build_predkey(preds, df)

# # ===================== 4) ACCURACY OFICIAL POR TEMPORADA (SIN cuotas) ========
# date_season = df[['Date','Season']].copy()
# date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce')
# date_season = date_season.drop_duplicates(subset=['Date'])

# preds_seas = preds.copy()
# preds_seas['Date'] = pd.to_datetime(preds_seas['Date'], errors='coerce')
# preds_seas = preds_seas.merge(date_season, on='Date', how='left', validate='m:1')

# if 'Season' not in preds_seas.columns:
#     if 'Season_y' in preds_seas.columns:
#         preds_seas['Season'] = preds_seas['Season_y']
#     elif 'Season_x' in preds_seas.columns:
#         preds_seas['Season'] = preds_seas['Season_x']
# preds_seas.drop(columns=[c for c in ['Season_x','Season_y'] if c in preds_seas.columns], inplace=True)
# preds_seas['Season'] = pd.to_numeric(preds_seas['Season'], errors='coerce').astype('Int64')

# preds_seas['correct'] = (preds_seas['y_true'] == preds_seas['y_pred']).astype(int)
# acc_by_season_oficial = (
#     preds_seas[preds_seas['has_label'] == 1]
#     .groupby('Season', dropna=True)['correct']
#     .agg(matches='size', accuracy='mean')
#     .reset_index()
#     .sort_values('Season')
# )

# # ===================== 5) ROI Y ACCURACY ENTRE APUESTAS (CON cuotas) =========
# def compute_accuracy_roi(merged_df, pred_col='y_pred'):
#     """
#     Accuracy & ROI ENTRE APUESTAS (solo filas con label H/D/A y cuota válida >= 1.01).
#     No modifica el accuracy oficial de tu función (que no depende de cuotas).
#     """
#     m = merged_df.copy()
#     n = len(m)

#     y_true_arr = m['y_true'].astype(str).str.upper().str.strip().to_numpy()
#     pred_arr   = m[pred_col].astype(str).str.upper().str.strip().to_numpy()

#     valid_label = np.isin(y_true_arr, ['H','D','A'])

#     odds_pred = np.where(
#         pred_arr == 'H', m['B365H'].to_numpy() if 'B365H' in m.columns else np.nan,
#         np.where(pred_arr == 'D', m['B365D'].to_numpy() if 'B365D' in m.columns else np.nan,
#                  np.where(pred_arr == 'A', m['B365A'].to_numpy() if 'B365A' in m.columns else np.nan, np.nan))
#     ).astype(float)

#     valid_odds = np.isfinite(odds_pred) & (odds_pred >= 1.01)
#     scored = valid_label & valid_odds

#     is_correct = np.zeros(n, dtype=bool)
#     is_correct[scored] = (pred_arr[scored] == y_true_arr[scored])

#     acc_bets = is_correct[scored].mean() if scored.any() else np.nan

#     profit = np.full(n, np.nan, dtype=float)
#     profit[scored] = -1.0
#     profit[scored & is_correct] = odds_pred[scored & is_correct] - 1.0

#     n_bets = int(np.isfinite(profit).sum())
#     total_profit = float(np.nansum(profit))
#     roi_global = (total_profit / n_bets) if n_bets > 0 else np.nan

#     if 'Season' in m.columns:
#         scored_idx = np.isfinite(profit)
#         by_season = m.loc[scored_idx, ['Season']].copy()
#         by_season['correct'] = is_correct[scored_idx].astype(int)
#         by_season['profit']  = profit[scored_idx]

#         acc_by_season_bets = (
#             by_season.groupby('Season', dropna=True)['correct']
#                      .agg(matches='size', accuracy='mean')
#                      .reset_index()
#                      .sort_values('Season')
#         )
#         roi_by_season = (
#             by_season.groupby('Season', dropna=True)['profit']
#                      .agg(bets='size', total_profit='sum')
#                      .reset_index()
#                      .sort_values('Season')
#         )
#         roi_by_season['roi'] = roi_by_season['total_profit'] / roi_by_season['bets']
#     else:
#         acc_by_season_bets = pd.DataFrame(columns=['Season','matches','accuracy'])
#         roi_by_season = pd.DataFrame(columns=['Season','bets','total_profit','roi'])

#     return acc_bets, roi_global, n_bets, total_profit, acc_by_season_bets, roi_by_season

# # Métricas de tu modelo (ENTRE apuestas)
# acc_bets_model, roi_g_model, bets_model, prof_model, acc_seas_bets_model, roi_seas_model = compute_accuracy_roi(
#     merged, pred_col='y_pred'
# )

# # Baseline mercado
# market_labels = np.array(['H','D','A'])
# probs = merged[['pimp1','pimpx','pimp2']].to_numpy(dtype=float)
# probs_filled = np.where(np.isnan(probs), -np.inf, probs)
# argmax_idx = np.argmax(probs_filled, axis=1)

# merged_market = merged.copy()
# merged_market['y_pred_market'] = market_labels[argmax_idx]

# acc_bets_mkt, roi_g_mkt, bets_mkt, prof_mkt, acc_seas_bets_mkt, roi_seas_mkt = compute_accuracy_roi(
#     merged_market, pred_col='y_pred_market'
# )

# # ===================== 6) REPORTING =========================================
# print("\n=== CONFIGURACIÓN ===")
# print("Features:", FEATURES)
# print("WF kwargs:", WF_KWARGS)

# print("\n=== ACCURACY OFICIAL (función walkforward, SIN cuotas) ===")
# print(f"Global: {acc_global_oficial:.4f}")
# print("\nAccuracy por temporada (oficial):")
# print(acc_by_season_oficial.to_string(index=False))

# print("\n=== TU MODELO — ENTRE APUESTAS (CON cuotas) ===")
# print(f"Accuracy entre apuestas: {acc_bets_model:.4f}")
# print(f"ROI global             : {roi_g_model:.4f}   |  Bets: {bets_model}   |  Profit: {prof_model:.2f}")
# print("\nROI por temporada (tu modelo):")
# print(roi_seas_model[['Season','bets','roi','total_profit']].to_string(index=False))

# print("\n=== BASELINE MERCADO (argmax pimp1/pimpx/pimp2) — ENTRE APUESTAS ===")
# print(f"Accuracy entre apuestas: {acc_bets_mkt:.4f}")
# print(f"ROI global             : {roi_g_mkt:.4f}   |  Bets: {bets_mkt}   |  Profit: {prof_mkt:.2f}")
# print("\nROI por temporada (mercado):")
# print(roi_seas_mkt[['Season','bets','roi','total_profit']].to_string(index=False))

# **PREDICCIÓN: Logistic Regression multinomial**

## Sin SMOTE:

In [71]:
# ================= FUTURE PREDICTIONS EXPORTER (solo columnas garantizadas) =================

# --------------------- util: clave estable y saneo de duplicados ---------------------
def enforce_unique_pred_key(df_in, key_col="pred_key"):
    """
    Si hay claves duplicadas en `key_col`, añade '#k' (k=0,1,2,...) por orden estable
    dentro de cada grupo duplicado. Devuelve df modificado y nº de filas afectadas.
    """
    d = df_in.copy()
    base = d[key_col].astype(str)
    grp_sizes = base.map(base.value_counts())
    pos = base.groupby(base).cumcount()
    suffix = np.where(grp_sizes > 1, "#" + pos.astype(str), "")
    d[key_col] = base + suffix
    affected = int((grp_sizes > 1).sum())
    return d, {"collisions_augmented": affected}


# --------------------- función principal: predicciones futuras ------------------------
def generate_future_predictions(
    df: pd.DataFrame,
    feature_cols,
    outputs_dir="outputs",
    date_col="Date",
    label_col="FTR",
    season_col="Season",
    home_col="HomeTeam_norm",
    away_col="AwayTeam_norm",
    n_seasons_window=4,
    season_size=380,
    recent_weight=3.0,
    older_weight=1.0,
    C=1.0,
    max_iter=1000,
    season_filter: int | None = None,   # si quieres limitar solo a la temporada en curso
    verbose_every=0
):
    """
    Predice resultados para partidos FUTUROS (sin etiqueta válida H/D/A)
    y exporta SOLO columnas sin vacíos:
      Season, Date, HomeTeam_norm, AwayTeam_norm, pred_key,
      y_pred, pH_pred, pD_pred, pA_pred, conf_maxprob, entropy, margin_top12
    """
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col], errors="coerce")
    df = df.sort_values(date_col).reset_index(drop=True)

    # Feature derivada opcional
    if 'market_home_logit' in feature_cols and 'market_home_logit' not in df.columns:
        if {'pimp1','pimp2'}.issubset(df.columns):
            df['market_home_logit'] = np.log(
                (pd.to_numeric(df['pimp1'], errors='coerce') + 1e-9) /
                (pd.to_numeric(df['pimp2'], errors='coerce') + 1e-9)
            )
        else:
            raise ValueError("market_home_logit está en feature_cols pero faltan pimp1/pimp2 en df.")

    # Futuro = sin etiqueta válida H/D/A
    y = df[label_col].astype(str).str.upper().str.strip()
    is_valid = y.isin(['H', 'D', 'A'])
    future_mask = ~is_valid
    if season_filter is not None:
        future_mask &= (df[season_col] == season_filter)
    if not future_mask.any():
        raise RuntimeError("No hay partidos futuros (sin etiqueta H/D/A) que predecir con los filtros actuales.")

    future_dates = np.sort(df.loc[future_mask, date_col].unique())

    train_window = n_seasons_window * season_size
    recent_block = season_size

    pipe = Pipeline(steps=[
        ('imp', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler(with_mean=True, with_std=True)),
        ('logit', LogisticRegression(solver='lbfgs', C=C, max_iter=max_iter))
    ])

    all_rows = []
    for i, fut_date in enumerate(future_dates):
        test_mask = (df[date_col] == fut_date) & future_mask
        test_idx = np.where(test_mask)[0]
        if test_idx.size == 0:
            continue

        train_mask = (df[date_col] < fut_date) & is_valid
        train_idx_all = np.where(train_mask)[0]
        if train_idx_all.size < train_window:
            if verbose_every and (i % verbose_every == 0):
                print(f"[{i+1}/{len(future_dates)}] {str(fut_date)[:10]} -> insuf. histórico: "
                      f"{train_idx_all.size}<{train_window}")
            continue

        train_idx = train_idx_all[-train_window:]

        # Pesos de muestra
        sample_weight = np.full(train_idx.shape[0], older_weight, dtype=float)
        if recent_block > 0:
            sample_weight[-recent_block:] = recent_weight

        # X/y
        X_train = df.iloc[train_idx][feature_cols]
        y_train = df.iloc[train_idx][label_col].astype(str).str.upper().str.strip()
        X_test  = df.iloc[test_idx][feature_cols]

        # Entrena y predice
        pipe.fit(X_train, y_train, **{'logit__sample_weight': sample_weight})
        proba = pipe.predict_proba(X_test)
        classes = list(pipe.named_steps['logit'].classes_)
        idx_map = {cls: classes.index(cls) for cls in classes}

        def col(c):
            return proba[:, idx_map[c]] if c in idx_map else np.full(proba.shape[0], np.nan)

        pH = col('H'); pD = col('D'); pA = col('A')
        y_pred = np.array(['H','D','A'])[np.nanargmax(np.vstack([pH, pD, pA]), axis=0)]

        # Métricas de confianza
        maxp = np.nanmax(np.vstack([pH, pD, pA]), axis=0)
        with np.errstate(divide='ignore', invalid='ignore'):
            ent = -(pH*np.log(pH + 1e-15) + pD*np.log(pD + 1e-15) + pA*np.log(pA + 1e-15))
        sorted_ps = np.sort(np.vstack([pH, pD, pA]), axis=0)
        margin = sorted_ps[-1, :] - sorted_ps[-2, :]

        # Meta y pred_key
        meta = df.iloc[test_idx][[season_col, date_col, home_col, away_col]].copy()
        _date_key = pd.to_datetime(meta[date_col], errors='coerce')\
                       .dt.tz_localize(None, nonexistent='NaT', ambiguous='NaT')\
                       .dt.floor('D')
        meta['_date_key'] = _date_key
        meta['pred_key'] = (
            meta[season_col].astype('Int64').astype(str) + "|" +
            meta['_date_key'].dt.strftime("%Y-%m-%d") + "|" +
            meta[home_col].astype(str) + "|" +
            meta[away_col].astype(str)
        )

        out = pd.DataFrame({
            "Season": meta[season_col].values,
            "Date": meta[date_col].values,
            "HomeTeam_norm": meta[home_col].values,
            "AwayTeam_norm": meta[away_col].values,
            "pred_key": meta['pred_key'].values,
            "y_pred": y_pred,
            "pH_pred": pH,
            "pD_pred": pD,
            "pA_pred": pA,
            "conf_maxprob": maxp,
            "entropy": ent,
            "margin_top12": margin,
        })

        all_rows.append(out)

        if verbose_every and (i % verbose_every == 0):
            print(f"[{i+1}/{len(future_dates)}] {str(fut_date)[:10]}  "
                  f"test_n={len(out)}  mean_conf={np.nanmean(maxp):.3f}  mean_entropy={np.nanmean(ent):.3f}")

    if not all_rows:
        raise RuntimeError("No se generaron predicciones. (¿Faltó histórico suficiente o no hay futuros?)")

    preds = pd.concat(all_rows, ignore_index=True)

    # Orden y unicidad de clave
    preds = preds.sort_values(['Date','HomeTeam_norm','AwayTeam_norm'], kind='mergesort').reset_index(drop=True)
    preds, uniq_report = enforce_unique_pred_key(preds, key_col='pred_key')

    # Metadatos del run y export
    season_tag = str(season_filter) if season_filter is not None else "all"
    ts = datetime.now().strftime("%Y%m%d-%H%M%S")
    Path(outputs_dir).mkdir(parents=True, exist_ok=True)

    csv_path = Path(outputs_dir) / f"future_predictions_{season_tag}.csv"
    json_path = Path(outputs_dir) / f"future_predictions_{season_tag}.json"

    # Export SOLO columnas garantizadas
    cols_out = [
        "Season","Date","HomeTeam_norm","AwayTeam_norm","pred_key",
        "y_pred","pH_pred","pD_pred","pA_pred","conf_maxprob","entropy","margin_top12"
    ]
    preds[cols_out].to_csv(csv_path, index=False)
    preds[cols_out].to_json(json_path, orient="records", date_format="iso")

    # Summary (sin EV/overround, solo métricas de confianza y distribución de picks)
    def _safe_mean(s):
        s = pd.to_numeric(s, errors='coerce')
        return float(np.nanmean(s)) if s.notna().any() else np.nan

    by_date = preds.groupby(pd.to_datetime(preds['Date']).dt.strftime("%Y-%m-%d")).agg(
        n_matches=('pred_key','count'),
        mean_conf=('conf_maxprob', _safe_mean),
        mean_entropy=('entropy', _safe_mean),
        mean_margin=('margin_top12', _safe_mean),
        pct_pick_H=('y_pred', lambda s: float((s=='H').mean())),
        pct_pick_D=('y_pred', lambda s: float((s=='D').mean())),
        pct_pick_A=('y_pred', lambda s: float((s=='A').mean())),
    ).reset_index().rename(columns={'Date':'date'})

    summary = {
        "generated_at": ts,
        "model": {
            "type": "LogisticRegression(multinomial)",
            "C": C, "max_iter": max_iter,
            "n_seasons_window": n_seasons_window, "season_size": season_size,
            "recent_weight": recent_weight, "older_weight": older_weight,
            "features": list(feature_cols),
            # NEW: orden de clases visto por el modelo en el último fit del bucle
            "classes_order": classes,   # <- ya la tienes en ese scope
            "proba_mapping": {"pH_pred": "H", "pD_pred": "D", "pA_pred": "A"}
        },
        "filters": {"season_filter": season_filter},
        "data": {
            "n_future_rows_out": int(len(preds)),
            "future_min_date": str(pd.to_datetime(preds['Date']).min()),
            "future_max_date": str(pd.to_datetime(preds['Date']).max()),
            "unique_key_report": uniq_report,
        },
        "by_date": by_date.to_dict(orient="records")
    }

    summary_path = Path(outputs_dir) / f"future_predictions_summary_{ts}.json"
    with open(summary_path, "w", encoding="utf-8") as f:
        json.dump(summary, f, ensure_ascii=False, indent=2)

    return {
        "csv_path": str(csv_path),
        "json_path": str(json_path),
        "summary_path": str(summary_path),
        "n_rows": int(len(preds))
    }

In [91]:
# ===  PARÁMETROS DEL MODELO (ALINEADOS CON TU WALK-FORWARD) ===============
MODEL_NAME        = "base"
N_SEASONS_WINDOW  = 4
SEASON_SIZE       = 380
RECENT_WEIGHT     = 3.0
OLDER_WEIGHT      = 1.0
LOGREG_C          = 1.0
MAX_ITER          = 1000

# Tu lista fija de features (usa la misma que en tu entrenamiento/productivo)
# SUSTITUYE la línea de abajo por tu lista definitiva
FEATURES = FEATURES_S11p

# ===  PREDICCIONES FUTURAS PARA APP (SÓLO TEMPORADA EN CURSO) =============
CURRENT_SEASON = int(df["Season"].max())

future_result = generate_future_predictions(
    df=df,
    feature_cols=FEATURES,
    outputs_dir=str(OUT),
    n_seasons_window=N_SEASONS_WINDOW,
    season_size=SEASON_SIZE,
    recent_weight=RECENT_WEIGHT,
    older_weight=OLDER_WEIGHT,
    C=LOGREG_C,
    max_iter=MAX_ITER,
    season_filter=CURRENT_SEASON,  # sólo la temporada en juego
    verbose_every=0
)
print("Future preds:", future_result)

Future preds: {'csv_path': '/content/outputs/future_predictions_2025.csv', 'json_path': '/content/outputs/future_predictions_2025.json', 'summary_path': '/content/outputs/future_predictions_summary_20251014-192942.json', 'n_rows': 10}


## Con SMOTE:

# **EVALUACIÓN HISTÓRICA: Logistic Regression multinomial**

In [73]:
IN_PATH = FEAT / "df_final.parquet"
df = pd.read_parquet(IN_PATH)

## Sin SMOTE:

In [92]:
# ============================================================
# MÉTRICAS PRINCIPALES POR TEMPORADA → CSV (extendido)
# Añade: n_bets, n_wins, hit_rate, avg_odds_win, avg_conf, avg_entropy, avg_margin, avg_overround
# Requiere en memoria: df, preds, merged, acc_by_season_oficial, roi_seas_model
# ============================================================

EPS = 1e-15

def _log_loss_mc_vec(y_true_series, P_mat, classes=("H","D","A")):
    y = y_true_series.astype(str).str.upper().str.strip()
    mask = y.isin(classes)
    if not mask.any():
        return np.nan
    y = y[mask].to_numpy()
    idx = {c:i for i,c in enumerate(classes)}
    P = np.clip(P_mat[mask, :], EPS, 1.0-EPS)
    p_true = P[np.arange(P.shape[0]), [idx[c] for c in y]]
    return float(-np.mean(np.log(p_true)))

def _brier_mc_vec(y_true_series, P_mat, classes=("H","D","A")):
    y = y_true_series.astype(str).str.upper().str.strip()
    mask = y.isin(classes)
    if not mask.any():
        return np.nan
    y = y[mask].to_numpy()
    idx = {c:i for i,c in enumerate(classes)}
    P = np.clip(P_mat[mask, :], 0.0, 1.0)
    Y = np.zeros_like(P)
    Y[np.arange(P.shape[0]), [idx[c] for c in y]] = 1.0
    return float(np.mean(np.sum((P - Y)**2, axis=1)))

# --- Asegurar Season en preds (igual que tu bloque) ---
date_season = df[['Date','Season']].copy()
date_season['Date'] = pd.to_datetime(date_season['Date'], errors='coerce')
date_season = date_season.drop_duplicates(subset=['Date'])

preds_seas = preds.copy()
preds_seas['Date'] = pd.to_datetime(preds_seas['Date'], errors='coerce')
preds_seas = preds_seas.merge(date_season, on='Date', how='left', validate='m:1')

if 'Season' not in preds_seas.columns:
    if 'Season_y' in preds_seas.columns:
        preds_seas['Season'] = preds_seas['Season_y']
    elif 'Season_x' in preds_seas.columns:
        preds_seas['Season'] = preds_seas['Season_x']
preds_seas.drop(columns=[c for c in ['Season_x','Season_y'] if c in preds_seas.columns], inplace=True)
preds_seas['Season'] = pd.to_numeric(preds_seas['Season'], errors='coerce').astype('Int64')

# --- Preparar matriz de probabilidades (H,D,A) ---
for col in ['proba_H','proba_D','proba_A']:
    if col not in preds_seas.columns:
        raise ValueError(f"Falta la columna {col} en preds. Asegúrate de usar la versión que añade proba_H/D/A.")
P_all = np.column_stack([
    pd.to_numeric(preds_seas['proba_H'], errors='coerce').to_numpy(),
    pd.to_numeric(preds_seas['proba_D'], errors='coerce').to_numpy(),
    pd.to_numeric(preds_seas['proba_A'], errors='coerce').to_numpy()
])

# --- Filas con etiqueta válida ---
valid_mask = preds_seas['has_label'] == 1
preds_scored = preds_seas.loc[valid_mask].copy()

# --- Métricas derivadas de probas (conf, entropy, margin) ---
probs_mat = preds_scored[['proba_H','proba_D','proba_A']].to_numpy(dtype=float)
conf_maxprob = np.nanmax(probs_mat, axis=1)
sorted_ps = np.sort(probs_mat, axis=1)
margin_top12 = sorted_ps[:, -1] - sorted_ps[:, -2]
entropy = -(probs_mat * np.log(np.clip(probs_mat, EPS, 1.0))).sum(axis=1)

preds_scored['conf_maxprob'] = conf_maxprob
preds_scored['entropy'] = entropy
preds_scored['margin_top12'] = margin_top12

# --- Accuracy por temporada (reutilizamos lo tuyo) ---
acc_by_season = acc_by_season_oficial[['Season','accuracy']].copy()

# --- LogLoss y Brier por temporada ---
logloss_rows = []
brier_rows = []
for s, grp in preds_scored.groupby('Season', dropna=True):
    P = grp[['proba_H','proba_D','proba_A']].to_numpy(dtype=float)
    ll = _log_loss_mc_vec(grp['y_true'], P)
    br = _brier_mc_vec(grp['y_true'], P)
    logloss_rows.append({'Season': int(s), 'logloss': ll})
    brier_rows.append({'Season': int(s), 'brier': br})
logloss_by_season = pd.DataFrame(logloss_rows)
brier_by_season  = pd.DataFrame(brier_rows)

# --- ROI y estadísticas de apuestas por temporada (mismas reglas que compute_accuracy_roi) ---
m = merged.copy()
y_true_arr = m['y_true'].astype(str).str.upper().str.strip().to_numpy()
pred_arr   = m['y_pred'].astype(str).str.upper().str.strip().to_numpy()
valid_label = np.isin(y_true_arr, ['H','D','A'])

B365H = pd.to_numeric(m.get('B365H', np.nan), errors='coerce').to_numpy()
B365D = pd.to_numeric(m.get('B365D', np.nan), errors='coerce').to_numpy()
B365A = pd.to_numeric(m.get('B365A', np.nan), errors='coerce').to_numpy()
odds_pred = np.where(pred_arr=='H', B365H,
             np.where(pred_arr=='D', B365D,
                      np.where(pred_arr=='A', B365A, np.nan))).astype(float)

valid_odds = np.isfinite(odds_pred) & (odds_pred >= 1.01)
bet_mask = valid_label & valid_odds

# per-season aggregates
m['__bet__'] = bet_mask
m['__win__'] = False
m.loc[bet_mask, '__win__'] = (pred_arr[bet_mask] == y_true_arr[bet_mask])
m['__odds__'] = odds_pred

# overround por fila (si hay cuotas)
overround_row = (1/np.clip(B365H, 1.0, None)) + (1/np.clip(B365D, 1.0, None)) + (1/np.clip(B365A, 1.0, None))
overround_row[~np.isfinite(overround_row)] = np.nan
m['__overround__'] = overround_row

stats_rows = []
for s, grp in m.groupby('Season', dropna=True):
    g = grp[grp['__bet__'] == True]
    n_bets = int(len(g))
    if n_bets == 0:
        stats_rows.append({
            'Season': int(s), 'n_bets': 0, 'n_wins': 0, 'hit_rate': np.nan,
            'avg_odds_win': np.nan, 'avg_overround': float(np.nan)
        })
        continue
    n_wins = int(g['__win__'].sum())
    hit_rate = n_wins / n_bets if n_bets > 0 else np.nan
    avg_odds_win = float(pd.to_numeric(g.loc[g['__win__'], '__odds__'], errors='coerce').mean()) if n_wins>0 else np.nan

    # promedio de overround solo en filas con etiqueta válida (opcionalmente también exigir bet_mask)
    g_over = grp[grp['__bet__'] == True]
    avg_overround = float(pd.to_numeric(g_over['__overround__'], errors='coerce').mean())

    stats_rows.append({
        'Season': int(s),
        'n_bets': n_bets,
        'n_wins': n_wins,
        'hit_rate': float(hit_rate) if np.isfinite(hit_rate) else np.nan,
        'avg_odds_win': avg_odds_win,
        'avg_overround': avg_overround
    })
stats_by_season = pd.DataFrame(stats_rows)

# --- Medias de confianza/entropía/margen por temporada ---
conf_agg = (
    preds_scored.groupby('Season', dropna=True)[['conf_maxprob','entropy','margin_top12']]
               .mean()
               .reset_index()
               .rename(columns={'conf_maxprob':'avg_conf','entropy':'avg_entropy','margin_top12':'avg_margin'})
)
conf_agg['Season'] = pd.to_numeric(conf_agg['Season'], errors='coerce').astype('Int64')

# --- ROI por temporada (desde tu compute_accuracy_roi) ---
roi_by_season = roi_seas_model[['Season','roi']].copy()
roi_by_season['Season'] = pd.to_numeric(roi_by_season['Season'], errors='coerce').astype('Int64')

# --- Unir TODO en un solo DataFrame ordenado ---
metrics_all = (
    acc_by_season
    .merge(logloss_by_season, on='Season', how='left')
    .merge(brier_by_season,  on='Season', how='left')
    .merge(roi_by_season,    on='Season', how='left')
    .merge(stats_by_season,  on='Season', how='left')
    .merge(conf_agg,         on='Season', how='left')
    .sort_values('Season')
    .reset_index(drop=True)
)

# --- Exportar a CSV (mismo nombre que antes) ---
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "metrics_main_by_season.csv"
metrics_all.to_csv(out_path, index=False)

print("✔ CSV generado con métricas extendidas por temporada:")
print(out_path)
display(metrics_all.head(20))

✔ CSV generado con métricas extendidas por temporada:
outputs/metrics_main_by_season.csv


Unnamed: 0,Season,accuracy,logloss,brier,roi,n_bets,n_wins,hit_rate,avg_odds_win,avg_overround,avg_conf,avg_entropy,avg_margin
0,2010,0.602632,0.923591,0.536898,0.372105,380,229,0.602632,2.276856,1.065656,0.576866,0.920726,0.330923
1,2011,0.531579,0.952837,0.565318,0.285132,380,202,0.531579,2.417574,1.064498,0.580066,0.911441,0.334069
2,2012,0.531579,0.964567,0.57072,0.168447,380,202,0.531579,2.198069,1.063707,0.576237,0.923658,0.334983
3,2013,0.555263,0.956975,0.565876,0.367,380,211,0.555263,2.461896,1.06363,0.582032,0.902447,0.348603
4,2014,0.552632,0.918473,0.542505,0.696421,380,210,0.552632,3.069714,1.055137,0.594109,0.87442,0.358429
5,2015,0.542105,0.947823,0.56015,0.474763,380,206,0.542105,2.720437,1.051227,0.56001,0.918395,0.307672
6,2016,0.594737,0.918901,0.541469,0.656079,380,226,0.594737,2.784558,1.050764,0.578259,0.905668,0.3409
7,2017,0.536842,0.974213,0.578583,0.321605,380,204,0.536842,2.461814,1.052719,0.56377,0.928536,0.317175
8,2018,0.497368,1.016991,0.606917,0.110237,380,189,0.497368,2.232222,1.052628,0.529775,0.971869,0.266231
9,2019,0.518421,0.983434,0.586713,0.202658,380,197,0.518421,2.319848,1.054675,0.518819,0.977766,0.241377


## Con SMOTE:

Con este modelo obtengo el mejor **Accuracy** (porcentaje de aciertos totales), pero esta métrica ignora como de seguras son esas esas predicciones.

$$
\text{Accuracy} = \frac{\text{Número de aciertos}}{\text{Número total de predicciones}}
$$

Para ello se utiliza el **Log Loss** (Cross-Entropy Loss), métrica que mide qué tan buenas son las probabilidades que predice mi modelo de clasificación. A esta métrica no solo le importa acertar la clase, sino cuán seguro está el modelo.

$$
\text{LogLoss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{K} y_{ij} \cdot \log(p_{ij})
$$

donde:

- $y_{ij}$ = 1 si la clase real del ejemplo $i$ es la clase $j$, y 0 en caso contrario.
- $p_{ij}$ es la probabilidad predicha por el modelo de que el ejemplo $i$ pertenezca a la clase $j$.

Tener un Log Loss alto en este caso significaría dar una probabilidad alta a la clase incorrecta, o lo que es lo mismo, dar una probabilidad baja a la clase correcta.

Por último añadí también el **Brier Score**, que es una métrica que evalúa cuán cercanas están las probabilidades predichas por tu modelo respecto a la realidad, comparando la distribución de probabilidades contra la clase real (codificada en one-hot). Es como un error cuadrático medio (MSE) para probabilidades.

$$
\text{Brier Score} = \frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{K} (p_{ij} - y_{ij})^2
$$

donde:

- $N$ es el número de ejemplos.
- $K$ es el número de clases (en este caso 3: victoria local, empate, victoria visitante).
- $p_{ij}$ es la probabilidad predicha por el modelo de que el ejemplo $i$ pertenezca a la clase $j$.
- $y_{ij}$ es 1 si la clase real del ejemplo $i$ es la clase $j$, y 0 en caso contrario.

Un Brier Score de 0 significa que las probabilidades dadas por el modelo son perfectas, mientras que uno del 0.66 en nuestro caso sería un modelo completamente aleatorio.


## Selección de variables

La función `forward_selection` implementa un algoritmo clásico de selección de variables hacia adelante (**forward feature selection**) sobre un modelo de regresión logística multiclase con escalado de variables.

Va añadiendo sucesivamente la variable que mejor mejora el rendimiento del modelo (según accuracy o log_loss), una por una.





In [None]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# from sklearn.metrics import accuracy_score, log_loss
# import numpy as np

# def forward_selection(X, y, max_features=20, scoring='accuracy'):
#     selected_features = []
#     remaining_features = list(X.columns)
#     scores = []

#     for i in range(min(max_features, len(remaining_features))):
#         best_score = -np.inf if scoring == 'accuracy' else np.inf
#         best_feature = None

#         for feature in remaining_features:
#             current_features = selected_features + [feature]

#             model = make_pipeline(
#                 StandardScaler(),
#                 LogisticRegression(max_iter=1000, solver='lbfgs')
#             )

#             model.fit(X[current_features], y)
#             y_pred = model.predict(X[current_features])
#             y_proba = model.predict_proba(X[current_features])

#             if scoring == 'accuracy':
#                 score = accuracy_score(y, y_pred)
#                 if score > best_score:
#                     best_score = score
#                     best_feature = feature
#             elif scoring == 'log_loss':
#                 score = log_loss(y, y_proba)
#                 if score < best_score:
#                     best_score = score
#                     best_feature = feature
#             else:
#                 raise ValueError("scoring debe ser 'accuracy' o 'log_loss'.")

#         if best_feature is not None:
#             selected_features.append(best_feature)
#             remaining_features.remove(best_feature)
#             scores.append(best_score)

#         print(f"[{i+1}] Añadida: {best_feature} | Score: {best_score:.4f}")

#     return selected_features, scores

In [None]:
# selected, scores = forward_selection(X_train, y_train, max_features=81, scoring='accuracy')

In [None]:
# import matplotlib.pyplot as plt
# import numpy as np

# # Suponemos que tienes las listas: selected (variables) y scores (métricas acumuladas)

# # Calcular diferencia respecto al valor anterior
# deltas = np.diff([0] + scores)
# colors = ['blue' if delta >= 0 else 'red' for delta in deltas]

# plt.figure(figsize=(12,6))
# bar_width = 0.6  # Reducir ancho de barra para separarlas
# indices = np.arange(len(selected))

# plt.bar(indices, scores, color=colors, width=bar_width)
# plt.xticks(indices, selected, rotation=90)
# plt.xlabel('Variables añadidas')
# plt.ylabel('Valor de la métrica')
# plt.title('Evolución del rendimiento al añadir variables')

# plt.ylim(min(scores) - 0.01, max(scores) + 0.01)
# plt.tight_layout()
# plt.show()


Se implementó un proceso de selección hacia adelante (forward selection) sobre el modelo de regresión logística con variables estandarizadas. Este procedimiento consiste en partir sin predictores y añadir, en cada iteración, la variable que mayor mejora produce en el rendimiento del modelo. Se evaluaron dos métricas complementarias como criterio de selección: el accuracy (para priorizar aciertos de clasificación) y el log loss (para priorizar la calibración de las probabilidades). Esta técnica permitió reducir la dimensionalidad del conjunto original y determinar el orden de relevancia de las variables desde el punto de vista predictivo.

# **Resultados**

## **MATRIZ DE CONFUSIÓN**

In [93]:
# ============================================================
# MATRICES DE CONFUSIÓN POR TEMPORADA → JSON
# Requiere: preds (con y_true, y_pred, has_label, y preferible Season)
# ============================================================

LABELS = ["H","D","A"]
label_to_idx = {c:i for i,c in enumerate(LABELS)}

def _ensure_season_in_preds(preds, df):
    """Si preds no trae Season, lo añade vía merge por Date (como en tu bloque)."""
    if "Season" in preds.columns:
        return preds
    date_season = df[["Date","Season"]].copy()
    date_season["Date"] = pd.to_datetime(date_season["Date"], errors="coerce")
    date_season = date_season.drop_duplicates(subset=["Date"])
    p = preds.copy()
    p["Date"] = pd.to_datetime(p["Date"], errors="coerce")
    p = p.merge(date_season, on="Date", how="left", validate="m:1")
    # Normaliza columna Season
    if "Season" not in p.columns:
        if "Season_y" in p.columns:
            p["Season"] = p["Season_y"]
        elif "Season_x" in p.columns:
            p["Season"] = p["Season_x"]
    p.drop(columns=[c for c in ["Season_x","Season_y"] if c in p.columns], inplace=True)
    p["Season"] = pd.to_numeric(p["Season"], errors="coerce").astype("Int64")
    return p

def _confusion_counts(y_true_s, y_pred_s, labels=LABELS):
    """
    Devuelve (M, support) donde:
      - M es matriz 3x3 (filas = y_true, columnas = y_pred) en orden labels
      - support es dict label->n_true
    """
    y_true = y_true_s.astype(str).str.upper().str.strip().to_numpy()
    y_pred = y_pred_s.astype(str).str.upper().str.strip().to_numpy()

    it = np.array([label_to_idx.get(x, -1) for x in y_true], dtype=int)
    ip = np.array([label_to_idx.get(x, -1) for x in y_pred], dtype=int)
    mask = (it >= 0) & (ip >= 0)

    M = np.zeros((len(labels), len(labels)), dtype=int)
    if mask.any():
        flat = it[mask] * len(labels) + ip[mask]
        counts = np.bincount(flat, minlength=len(labels)*len(labels))
        M = counts.reshape((len(labels), len(labels)))

    support = {lab: int(np.sum(it == label_to_idx[lab])) for lab in labels}
    return M.tolist(), support

# 1) Asegura Season en preds
try:
    _ = df  # por si no está en el entorno
except NameError:
    raise RuntimeError("Se necesita 'df' en memoria para asegurar Season si falta en 'preds'.")

preds_cm = _ensure_season_in_preds(preds, df)

# 2) Filtra filas con etiqueta válida (como en tu pipeline)
if "has_label" in preds_cm.columns:
    preds_cm = preds_cm[preds_cm["has_label"] == 1].copy()
else:
    # fallback si no hubiera 'has_label'
    vt = preds_cm["y_true"].astype(str).str.upper().str.strip()
    preds_cm = preds_cm[vt.isin(LABELS)].copy()

# 3) Construye matrices por temporada y overall
preds_cm["Season"] = pd.to_numeric(preds_cm["Season"], errors="coerce").astype("Int64")

by_season = []
for s, grp in preds_cm.groupby("Season", dropna=True):
    M, support = _confusion_counts(grp["y_true"], grp["y_pred"], labels=LABELS)
    by_season.append({
        "Season": int(s),
        "labels": LABELS,
        "matrix": M,              # filas = verdaderas (H,D,A), columnas = predichas (H,D,A)
        "support": support,       # nº de verdaderos por clase
        "n_scored": int(len(grp))
    })

# Overall
M_overall, support_overall = _confusion_counts(preds_cm["y_true"], preds_cm["y_pred"], labels=LABELS)
overall = {
    "labels": LABELS,
    "matrix": M_overall,
    "support": support_overall,
    "n_scored": int(len(preds_cm))
}

# 4) Exporta JSON
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "confusion_matrices_by_season.json"

payload = {
    "meta": {
        "row_axis": "y_true",
        "col_axis": "y_pred",
        "labels_order": LABELS
    },
    "by_season": sorted(by_season, key=lambda x: x["Season"]),
    "overall": overall
}

with open(out_path, "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)

print(f"✔ Confusion matrices guardadas en: {out_path}")

✔ Confusion matrices guardadas en: outputs/confusion_matrices_by_season.json


## **METRICAS DE CLASIFICACIÓN**

In [94]:
# ============================================================
# CLF METRICS POR TEMPORADA → CSV (precision/recall/f1/support: macro y weighted)
# Requiere en memoria: df, preds (de walkforward_multinomial_accuracy)
# ============================================================

from sklearn.metrics import precision_recall_fscore_support

LABELS = ["H","D","A"]

def _ensure_season_in_preds(preds, df):
    """Igual que en tus celdas: añade Season a preds vía Date si hiciera falta."""
    if "Season" in preds.columns:
        p = preds.copy()
    else:
        date_season = df[["Date","Season"]].copy()
        date_season["Date"] = pd.to_datetime(date_season["Date"], errors="coerce")
        date_season = date_season.drop_duplicates(subset=["Date"])
        p = preds.copy()
        p["Date"] = pd.to_datetime(p["Date"], errors="coerce")
        p = p.merge(date_season, on="Date", how="left", validate="m:1")
        if "Season" not in p.columns:
            if "Season_y" in p.columns:
                p["Season"] = p["Season_y"]
            elif "Season_x" in p.columns:
                p["Season"] = p["Season_x"]
        p.drop(columns=[c for c in ["Season_x","Season_y"] if c in p.columns], inplace=True)
    p["Season"] = pd.to_numeric(p["Season"], errors="coerce").astype("Int64")
    return p

# 1) Asegura Season y filtra filas con etiqueta válida H/D/A
preds_seas = _ensure_season_in_preds(preds, df).copy()

if "has_label" in preds_seas.columns:
    preds_scored = preds_seas[preds_seas["has_label"] == 1].copy()
else:
    vt = preds_seas["y_true"].astype(str).str.upper().str.strip()
    preds_scored = preds_seas[vt.isin(LABELS)].copy()

# Normaliza etiquetas/predicciones
preds_scored["y_true_norm"] = preds_scored["y_true"].astype(str).str.upper().str.strip()
preds_scored["y_pred_norm"] = preds_scored["y_pred"].astype(str).str.upper().str.strip()

# 2) Métricas por temporada
rows = []
for s, grp in preds_scored.groupby("Season", dropna=True):
    y_true = grp["y_true_norm"].to_numpy()
    y_pred = grp["y_pred_norm"].to_numpy()

    # macro
    p_mac, r_mac, f_mac, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=LABELS, average="macro", zero_division=0
    )
    # weighted
    p_w, r_w, f_w, _ = precision_recall_fscore_support(
        y_true, y_pred, labels=LABELS, average="weighted", zero_division=0
    )
    rows.append({
        "Season": int(s),
        "precision_macro": float(p_mac),
        "recall_macro":    float(r_mac),
        "f1_macro":        float(f_mac),
        "precision_weighted": float(p_w),
        "recall_weighted":    float(r_w),
        "f1_weighted":        float(f_w),
        "support": int(len(grp))  # nº de partidos evaluados en la temporada
    })

report_df = pd.DataFrame(rows).sort_values("Season").reset_index(drop=True)

# 3) Exportar CSV
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "classification_report_by_season.csv"
report_df.to_csv(out_path, index=False)

print("✔ Classification report por temporada guardado en:")
print(out_path)
display(report_df.head(20))

✔ Classification report por temporada guardado en:
outputs/classification_report_by_season.csv


Unnamed: 0,Season,precision_macro,recall_macro,f1_macro,precision_weighted,recall_weighted,f1_weighted,support
0,2010,0.408714,0.464646,0.424988,0.485277,0.602632,0.527475,380
1,2011,0.342178,0.428173,0.374833,0.39595,0.531579,0.448562,380
2,2012,0.337042,0.402215,0.353841,0.406365,0.531579,0.447784,380
3,2013,0.488688,0.454928,0.415107,0.512201,0.555263,0.488311,380
4,2014,0.488545,0.484204,0.466274,0.514689,0.552632,0.516396,380
5,2015,0.452173,0.463614,0.434079,0.489625,0.542105,0.495843,380
6,2016,0.558332,0.507512,0.479183,0.572793,0.594737,0.541543,380
7,2017,0.355595,0.441033,0.38997,0.415347,0.536842,0.464305,380
8,2018,0.409885,0.434628,0.377372,0.429228,0.497368,0.419109,380
9,2019,0.468891,0.459971,0.439398,0.488271,0.518421,0.479705,380


## **AUC Y CURVA ROC**

In [96]:
# ============================================================
# ROC CURVES + AUC → JSON (overall y por temporada)
# Requiere en memoria: df, preds (con y_true, y_pred, has_label, proba_H/D/A)
# ============================================================

from sklearn.metrics import roc_curve, auc

LABELS = ["H","D","A"]
EPS = 1e-15

def _ensure_season_in_preds(preds, df):
    """Si falta Season en preds, la trae por merge con Date (idéntico a tu patrón)."""
    if "Season" in preds.columns:
        p = preds.copy()
    else:
        date_season = df[["Date","Season"]].copy()
        date_season["Date"] = pd.to_datetime(date_season["Date"], errors="coerce")
        date_season = date_season.drop_duplicates(subset=["Date"])
        p = preds.copy()
        p["Date"] = pd.to_datetime(p["Date"], errors="coerce")
        p = p.merge(date_season, on="Date", how="left", validate="m:1")
        if "Season" not in p.columns:
            if "Season_y" in p.columns:
                p["Season"] = p["Season_y"]
            elif "Season_x" in p.columns:
                p["Season"] = p["Season_x"]
        p.drop(columns=[c for c in ["Season_x","Season_y"] if c in p.columns], inplace=True)
    p["Season"] = pd.to_numeric(p["Season"], errors="coerce").astype("Int64")
    return p

def _prepare_scored(preds):
    """Filtra etiquetas válidas y probabilidades finitas; devuelve DataFrame listo para ROC."""
    for col in ["proba_H","proba_D","proba_A"]:
        if col not in preds.columns:
            raise ValueError(f"Falta {col} en preds. Usa la función que añade proba_H/D/A.")
    p = preds.copy()
    y = p["y_true"].astype(str).str.upper().str.strip()
    mask_lbl = y.isin(LABELS)
    # probs finitas (evitamos NaN/inf)
    probs = p[["proba_H","proba_D","proba_A"]].apply(pd.to_numeric, errors="coerce")
    mask_prob = np.isfinite(probs).all(axis=1)
    if "has_label" in p.columns:
        mask = (p["has_label"] == 1) & mask_prob
    else:
        mask = mask_lbl & mask_prob
    p = p.loc[mask].copy()
    p["y_true_norm"] = p["y_true"].astype(str).str.upper().str.strip()
    return p

def _binarize_labels(y_true, labels=LABELS):
    """One-hot (n x C) para las verdaderas."""
    idx_map = {c:i for i,c in enumerate(labels)}
    it = np.array([idx_map.get(val, -1) for val in y_true], dtype=int)
    Y = np.zeros((len(y_true), len(labels)), dtype=int)
    valid_rows = it >= 0
    Y[np.where(valid_rows)[0], it[valid_rows]] = 1
    return Y

def _multiclass_roc_block(y_true_series, P_mat, labels=LABELS):
    """
    Devuelve dict con:
      per_class[label]: {fpr, tpr, thresholds, auc}
      micro: {fpr, tpr, thresholds, auc}
      macro_auc: media de AUC por clase
      n_scored
    """
    y_true = y_true_series.astype(str).str.upper().str.strip().to_numpy()
    P = np.clip(P_mat.astype(float), 0.0, 1.0)
    # normaliza filas por seguridad (no debería ser necesario si viene de softmax)
    row_sums = P.sum(axis=1, keepdims=True)
    ok = row_sums > 0
    P[ok.squeeze()] = P[ok.squeeze()] / np.clip(row_sums[ok.squeeze()], EPS, None)

    Y = _binarize_labels(y_true, labels=labels)

    per_class = {}
    aucs = []
    for j, lab in enumerate(labels):
        fpr, tpr, thr = roc_curve(Y[:, j], P[:, j], drop_intermediate=True)
        auc_j = auc(fpr, tpr) if len(fpr) > 1 else np.nan
        per_class[lab] = {
            "fpr": fpr.tolist(),
            "tpr": tpr.tolist(),
            "thresholds": thr.tolist(),
            "auc": float(auc_j) if np.isfinite(auc_j) else np.nan
        }
        if np.isfinite(auc_j):
            aucs.append(auc_j)

    # micro-average
    fpr_micro, tpr_micro, thr_micro = roc_curve(Y.ravel(), P.ravel(), drop_intermediate=True)
    auc_micro = auc(fpr_micro, tpr_micro) if len(fpr_micro) > 1 else np.nan

    macro_auc = float(np.mean(aucs)) if len(aucs) else np.nan

    return {
        "per_class": per_class,
        "micro": {
            "fpr": fpr_micro.tolist(),
            "tpr": tpr_micro.tolist(),
            "thresholds": thr_micro.tolist(),
            "auc": float(auc_micro) if np.isfinite(auc_micro) else np.nan
        },
        "macro_auc": float(macro_auc) if np.isfinite(macro_auc) else np.nan,
        "n_scored": int(len(y_true))
    }

# ------------------ Construcción del payload ------------------
preds_seas = _ensure_season_in_preds(preds, df)
preds_scored = _prepare_scored(preds_seas)

# OVERALL
P_all = preds_scored[["proba_H","proba_D","proba_A"]].to_numpy()
overall_block = _multiclass_roc_block(preds_scored["y_true_norm"], P_all, labels=LABELS)

# BY SEASON
by_season = []
for s, grp in preds_scored.groupby("Season", dropna=True):
    P = grp[["proba_H","proba_D","proba_A"]].to_numpy()
    block = _multiclass_roc_block(grp["y_true_norm"], P, labels=LABELS)
    block["Season"] = int(s)
    by_season.append(block)

payload = {
    "meta": {
        "labels": LABELS,
        "proba_cols": ["proba_H","proba_D","proba_A"],
        "row_axis": "y_true (one-vs-rest)",
        "col_axis": "score",
        "generated_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")
    },
    "overall": overall_block,
    "by_season": sorted(by_season, key=lambda x: x["Season"])
}

# ------------------ Guardar JSON ------------------
out_dir = Path("outputs")
out_dir.mkdir(parents=True, exist_ok=True)
out_path = out_dir / "roc_curves_by_season.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(payload, f, ensure_ascii=False, indent=2)

print(f"✔ ROC + AUC guardado en: {out_path}")

  "generated_at": datetime.utcnow().strftime("%Y-%m-%dT%H:%M:%SZ")


✔ ROC + AUC guardado en: outputs/roc_curves_by_season.json


## **BENEFICIOS**

Por último, pero no por ello menos importante vamos a estudiar la última métrica: El **ROI (Return on Investment)**.

$$
ROI = \frac{\text{Beneficio}}{\text{Inversión}}
$$

Con el código siguiente lo que estoy haciendo es simular una apuesta de un euro al resultado que predice mi modelo, en todos los partidos que hay en test. Si se acierta sumamos la cuota que ofrece Bet365 pero si falla se resta la unidad apostada. Con esto calculamos el beneficio neto y el ROI.

### Sin SMOTE

In [104]:
# ============================================================
# MATCHLOGS POR TEMPORADA → CSV (Matchday desde df[Matchweek] con alineación Date+row_in_date)
# Requiere: merged (ya alineado con df en tu pipeline) y df (con columna Matchweek)
# Genera: outputs/matchlogs_<Season>.csv
# ============================================================


OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

def _norm_name(s: str) -> str:
    return re.sub(r'[^a-z0-9]+', '', str(s).strip().lower())

def _find_col(df, candidates):
    norm2real = {_norm_name(c): c for c in df.columns}
    for cand in candidates:
        if _norm_name(cand) in norm2real:
            return norm2real[_norm_name(cand)]
    return None

def _infer_team_cols(df):
    home_candidates = ["HomeTeam_norm","HomeTeam","home_team","Home","local"]
    away_candidates = ["AwayTeam_norm","AwayTeam","away_team","Away","visitor","visiting"]
    home_col = _find_col(df, home_candidates)
    away_col = _find_col(df, away_candidates)
    if home_col is None or away_col is None:
        raise KeyError(f"No encuentro columnas Home/Away. Cols: {list(df.columns)[:40]}")
    return home_col, away_col

def _coalesce_suffix(mdf: pd.DataFrame, base: str) -> pd.DataFrame:
    cx, cy = f"{base}_x", f"{base}_y"
    if cx in mdf.columns or cy in mdf.columns:
        if cx in mdf.columns and cy in mdf.columns:
            mdf[base] = mdf[cx].where(mdf[cx].notna(), mdf[cy])
        elif cx in mdf.columns:
            mdf[base] = mdf[cx]
        else:
            mdf[base] = mdf[cy]
        mdf.drop(columns=[c for c in (cx, cy) if c in mdf.columns], inplace=True)
    return mdf

def _build_pred_key_like_pipeline(df_in, home_col=None, away_col=None):
    d = df_in.copy()
    d["Date"] = pd.to_datetime(d["Date"], errors="coerce")
    if home_col is None or away_col is None:
        home_col, away_col = _infer_team_cols(d)
    d["Season"] = pd.to_numeric(d["Season"], errors="coerce").astype("Int64")
    date_key = d["Date"].dt.tz_localize(None, nonexistent="NaT", ambiguous="NaT").dt.floor("D")
    d["pred_key"] = (
        d["Season"].astype("Int64").astype(str) + "|" +
        date_key.dt.strftime("%Y-%m-%d") + "|" +
        d[home_col].astype(str) + "|" +
        d[away_col].astype(str)
    )
    return d

def _attach_matchday_from_df(merged_in: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    Trae Matchday (= df[Matchweek]) usando alineación determinista por (Date,row_in_date).
    Fallback: si quedan NaN, intenta por pred_key 'base' (sin sufijo #k).
    """
    m = merged_in.copy()
    # Normaliza tipos
    m["Date"] = pd.to_datetime(m["Date"], errors="coerce")
    df2 = df.copy()
    df2["Date"] = pd.to_datetime(df2["Date"], errors="coerce")

    # Detecta columna Matchweek en df
    mw_col = _find_col(df2, ["Matchweek","MatchWeek","matchweek","Jornada","Gameweek","GW","Week","MD"])
    if mw_col is None:
        raise KeyError("No se encontró columna de jornada (Matchweek) en df.")

    # 1) Alineación por (Date,row_in_date) con orden estable
    #    (mismo criterio mergesort que usaste en tu pipeline)
    df2_sorted = df2.sort_values("Date", kind="mergesort").reset_index(drop=True)
    df2_sorted["row_in_date"] = df2_sorted.groupby("Date").cumcount()

    m_sorted = m.sort_values("Date", kind="mergesort").reset_index(drop=True)
    m_sorted["row_in_date"] = m_sorted.groupby("Date").cumcount()

    bring = df2_sorted[["Date","row_in_date", mw_col]].rename(columns={mw_col: "Matchday"})
    m_sorted = m_sorted.merge(bring, on=["Date","row_in_date"], how="left", validate="1:1")

    # 2) Fallback: para los pocos que queden NaN, emparejar por pred_key base (sin '#k')
    if m_sorted["Matchday"].isna().any():
        missing = m_sorted["Matchday"].isna()
        # pred_key base en m
        if "pred_key" not in m_sorted.columns or "pred_key" not in df2_sorted.columns:
            # asegúrate de tener pred_key en ambos
            home_m, away_m = _infer_team_cols(m_sorted)
            m_sorted = _build_pred_key_like_pipeline(m_sorted, home_m, away_m)
            home_d, away_d = _infer_team_cols(df2_sorted)
            df2_sorted = _build_pred_key_like_pipeline(df2_sorted, home_d, away_d)
        m_sorted["pred_key_base"] = m_sorted["pred_key"].astype(str).str.split("#", n=1, expand=True)[0]
        df2_sorted["pred_key_base"] = df2_sorted["pred_key"].astype(str).str.split("#", n=1, expand=True)[0]
        aux = (df2_sorted[["pred_key_base", mw_col]]
               .drop_duplicates("pred_key_base")
               .rename(columns={mw_col:"Matchday_fb"}))
        m_sorted = m_sorted.merge(aux, on="pred_key_base", how="left")
        # rellena solo los que faltaban
        m_sorted.loc[missing, "Matchday"] = m_sorted.loc[missing, "Matchday_fb"]
        m_sorted.drop(columns=["pred_key_base","Matchday_fb"], inplace=True, errors="ignore")

    return m_sorted

# ---------- Carga y saneo de merged ----------
m = merged.copy()

# Coalesce posibles _x/_y
for base in ["Season","HomeTeam_norm","AwayTeam_norm","HomeTeam","AwayTeam"]:
    m = _coalesce_suffix(m, base)

# Home/Away canónicas
home_col_real, away_col_real = _infer_team_cols(m)
if "HomeTeam_norm" not in m.columns:
    m["HomeTeam_norm"] = m[home_col_real]
if "AwayTeam_norm" not in m.columns:
    m["AwayTeam_norm"] = m[away_col_real]

# Tipos/numéricos
m["Date"] = pd.to_datetime(m["Date"], errors="coerce")
for c in ["B365H","B365D","B365A","pimp1","pimpx","pimp2","proba_H","proba_D","proba_A"]:
    if c in m.columns:
        m[c] = pd.to_numeric(m[c], errors="coerce")
m["Season"] = pd.to_numeric(m["Season"], errors="coerce").astype("Int64")

# ---------- Matchday real desde df ----------
m = _attach_matchday_from_df(m, df)

# ---------- Métricas de probas ----------
if {"proba_H","proba_D","proba_A"}.issubset(m.columns):
    probs = m[["proba_H","proba_D","proba_A"]].to_numpy(dtype=float)
    m["conf_maxprob"] = np.nanmax(probs, axis=1)
    sorted_p = np.sort(probs, axis=1)
    m["margin_top12"] = sorted_p[:,-1] - sorted_p[:,-2]
    m["entropy"] = -(probs * np.log(np.clip(probs, 1e-15, 1.0))).sum(axis=1)
else:
    m["conf_maxprob"] = np.nan
    m["entropy"] = np.nan
    m["margin_top12"] = np.nan

# ---------- Mercado y overround ----------
if {"B365H","B365D","B365A"}.issubset(m.columns):
    pH_imp = 1.0/np.clip(m["B365H"].astype(float), 1.0, None)
    pD_imp = 1.0/np.clip(m["B365D"].astype(float), 1.0, None)
    pA_imp = 1.0/np.clip(m["B365A"].astype(float), 1.0, None)
    s_imp = pH_imp.fillna(0) + pD_imp.fillna(0) + pA_imp.fillna(0)
    m["overround"] = s_imp.where(s_imp > 0, np.nan)
    m["pH_mkt"] = (pH_imp / s_imp).where(s_imp > 0, np.nan)
    m["pD_mkt"] = (pD_imp / s_imp).where(s_imp > 0, np.nan)
    m["pA_mkt"] = (pA_imp / s_imp).where(s_imp > 0, np.nan)
else:
    m["overround"] = np.nan
    m["pH_mkt"] = np.nan
    m["pD_mkt"] = np.nan
    m["pA_mkt"] = np.nan

# ---------- Pick: odds, prob, EV, Kelly ----------
def _pick_odds(row):
    if row.get("y_pred") == "H": return row.get("B365H", np.nan)
    if row.get("y_pred") == "D": return row.get("B365D", np.nan)
    if row.get("y_pred") == "A": return row.get("B365A", np.nan)
    return np.nan

def _pick_prob(row):
    y = str(row.get("y_pred"))
    if y == "H": return row.get("proba_H", np.nan)
    if y == "D": return row.get("proba_D", np.nan)
    if y == "A": return row.get("proba_A", np.nan)
    return np.nan

m["odds_pick"] = m.apply(_pick_odds, axis=1).astype(float)
m["p_pick"]    = m.apply(_pick_prob,  axis=1).astype(float)

b = np.where(np.isfinite(m["odds_pick"]), m["odds_pick"] - 1.0, np.nan)
m["ev_pick"] = m["p_pick"] * b - (1 - m["p_pick"])
kelly_raw = (m["p_pick"]*b - (1 - m["p_pick"])) / b
m["kelly_pick"] = np.clip(kelly_raw, 0.0, 1.0)
m.loc[~np.isfinite(b), "kelly_pick"] = np.nan

# ---------- Resultado y profit (stake 1) ----------
y_true_norm = m["y_true"].astype(str).str.upper().str.strip()
pred_norm   = m["y_pred"].astype(str).str.upper().str.strip()
valid_label = y_true_norm.isin(["H","D","A"])
valid_odds  = np.isfinite(m["odds_pick"]) & (m["odds_pick"] >= 1.01)

m["bet_placed"] = (valid_label & valid_odds).astype(int)
m["correct"]    = ((y_true_norm == pred_norm) & (m["bet_placed"]==1)).astype(int)
m["profit"]     = np.where(m["bet_placed"]==1, -1.0, np.nan)
m.loc[m["correct"]==1, "profit"] = m.loc[m["correct"]==1, "odds_pick"] - 1.0

# ---------- Profit acumulado por temporada ----------
m = m.sort_values(["Season","Matchday","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").reset_index(drop=True)
m["profit_filled"] = pd.to_numeric(m["profit"], errors="coerce").fillna(0.0)
m["cum_profit_season"] = m.groupby("Season", sort=False)["profit_filled"].transform("cumsum")
m.drop(columns=["profit_filled"], inplace=True)

# ---------- Selección de columnas ----------
cols_head = [
    "Season","Matchday","Date","HomeTeam_norm","AwayTeam_norm","pred_key",
    "y_true","y_pred",
    "proba_H","proba_D","proba_A","conf_maxprob","entropy","margin_top12",
    "B365H","B365D","B365A","overround","pH_mkt","pD_mkt","pA_mkt",
    "odds_pick","p_pick","ev_pick","kelly_pick",
    "bet_placed","correct","profit","cum_profit_season"
]
cols_exist = [c for c in cols_head if c in m.columns]
log = m[cols_exist].copy()

# ---------- Exportar CSV por temporada ----------
for s, grp in log.groupby("Season", dropna=True):
    out_path = OUT_DIR / f"matchlogs_{int(s)}.csv"
    grp.sort_values(["Matchday","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").to_csv(out_path, index=False)

print("✔ Matchlogs por temporada generados en 'outputs/'. Matchday tomado de df[Matchweek] por (Date,row_in_date) con fallback por pred_key base.")

✔ Matchlogs por temporada generados en 'outputs/'. Matchday tomado de df[Matchweek] por (Date,row_in_date) con fallback por pred_key base.


### Con SMOTE:

## **COMPARACIÓN CON EL MODELO DE BET365**

El modelo basado en las cuotas de Bet365 consiste en predecir siempre el resultado más probable según la probabilidad implícita.

In [107]:
# ============================================================
# MÉTRICAS PRINCIPALES — MODELO MERCADO (argmax pimp1/pimpx/pimp2)
# Salidas:
#   - outputs/metrics_market_by_season.csv
#   - outputs/metrics_market_overall.json
# Requiere: merged (con y_true, Season, B365H/D/A, pimp1/pimpx/pimp2)
# ============================================================

OUT = Path("outputs")
OUT.mkdir(parents=True, exist_ok=True)
EPS = 1e-15
LABELS = np.array(["H","D","A"])

# ---------- 0) Validaciones y preparación ----------
m = merged.copy()

need_cols = ["y_true","Season","pimp1","pimpx","pimp2"]
missing = [c for c in need_cols if c not in m.columns]
if missing:
    raise KeyError(f"Faltan columnas en merged: {missing}")

# Tipos
m["Season"] = pd.to_numeric(m["Season"], errors="coerce").astype("Int64")
for c in ["pimp1","pimpx","pimp2","B365H","B365D","B365A"]:
    if c in m.columns:
        m[c] = pd.to_numeric(m[c], errors="coerce")
m["y_true_norm"] = m["y_true"].astype(str).str.upper().str.strip()

# ---------- 1) Probabilidades de mercado normalizadas y pick ----------
P_raw = m[["pimp1","pimpx","pimp2"]].to_numpy(dtype=float)  # (H, D, A) = (1, x, 2)
row_sum = np.nansum(P_raw, axis=1, keepdims=True)
row_sum = np.where(row_sum <= 0, np.nan, row_sum)
P_mkt = P_raw / row_sum  # normaliza por fila (deja NaN si fila inválida)

m["pH_mkt_pred"] = P_mkt[:,0]
m["pD_mkt_pred"] = P_mkt[:,1]
m["pA_mkt_pred"] = P_mkt[:,2]

# pick = argmax; si la fila es inválida (todas NaN), dejamos NaN (dtype=object)
with np.errstate(invalid="ignore"):
    best_idx = np.nanargmax(np.where(np.isnan(P_mkt), -np.inf, P_mkt), axis=1)
mask_valid = np.isfinite(P_mkt).any(axis=1)

y_pred_mkt = pd.Series(LABELS[best_idx], dtype="object")
y_pred_mkt = y_pred_mkt.where(mask_valid, np.nan)   # <- evita mezclar str/float en np.where
m["y_pred_market"] = y_pred_mkt

# Confianza/entropía/margen
probs = np.column_stack([m["pH_mkt_pred"], m["pD_mkt_pred"], m["pA_mkt_pred"]]).astype(float)
m["conf_maxprob"] = np.nanmax(probs, axis=1)
sorted_p = np.sort(probs, axis=1)
m["margin_top12"] = sorted_p[:,-1] - sorted_p[:,-2]
m["entropy"] = -(probs * np.log(np.clip(probs, EPS, 1.0))).sum(axis=1)

# ---------- 2) Filtro de filas evaluables ----------
valid_label = m["y_true_norm"].isin(["H","D","A"])
valid_prob = np.isfinite(probs).all(axis=1)
scored_mask = valid_label & valid_prob
scored = m.loc[scored_mask].copy()

# ---------- 3) Accuracy, LogLoss, Brier por temporada ----------
def brier_mc(y_true, P, labels=("H","D","A")):
    idx = {c:i for i,c in enumerate(labels)}
    Y = np.zeros_like(P)
    y = y_true.astype(str).str.upper().str.strip().to_numpy()
    Y[np.arange(len(y)), [idx[c] for c in y]] = 1.0
    return float(np.mean(np.sum((P - Y)**2, axis=1)))

rows = []
for s, g in scored.groupby("Season", dropna=True):
    y = g["y_true_norm"]
    P = g[["pH_mkt_pred","pD_mkt_pred","pA_mkt_pred"]].to_numpy(dtype=float)
    acc = float((g["y_pred_market"].astype(str).str.upper().str.strip() == y).mean())
    ll = float(log_loss(y, P, labels=["H","D","A"]))
    br = brier_mc(y, P, labels=("H","D","A"))
    rows.append({"Season": int(s), "accuracy": acc, "logloss": ll, "brier": br, "n_scored": int(len(g))})
metrics_by_season = pd.DataFrame(rows).sort_values("Season").reset_index(drop=True)

# ---------- 4) ROI y estadísticas de apuestas por temporada ----------
B365H = pd.to_numeric(m.get("B365H", np.nan), errors="coerce").to_numpy()
B365D = pd.to_numeric(m.get("B365D", np.nan), errors="coerce").to_numpy()
B365A = pd.to_numeric(m.get("B365A", np.nan), errors="coerce").to_numpy()

pred_arr = m["y_pred_market"].astype("object").astype(str).str.upper().str.strip().to_numpy()
yt_arr   = m["y_true_norm"].to_numpy()

odds_pick = np.where(pred_arr=="H", B365H,
              np.where(pred_arr=="D", B365D,
                       np.where(pred_arr=="A", B365A, np.nan))).astype(float)
valid_odds = np.isfinite(odds_pick) & (odds_pick >= 1.01)
bet_mask = valid_label.to_numpy() & valid_odds

m["__bet__"] = bet_mask
m["__win__"] = False
mask_bet_idx = np.where(bet_mask)[0]
m.loc[mask_bet_idx, "__win__"] = (pred_arr[bet_mask] == yt_arr[bet_mask])
m["__odds__"] = odds_pick

# overround promedio en partidos apostados
overround_row = (1/np.clip(B365H, 1.0, None)) + (1/np.clip(B365D, 1.0, None)) + (1/np.clip(B365A, 1.0, None))
overround_row[~np.isfinite(overround_row)] = np.nan
m["__overround__"] = overround_row

roi_rows = []
for s, g in m.groupby("Season", dropna=True):
    gb = g[g["__bet__"] == True]
    n_bets = int(len(gb))
    if n_bets == 0:
        roi_rows.append({
            "Season": int(s), "roi": np.nan, "n_bets": 0, "n_wins": 0,
            "hit_rate": np.nan, "avg_odds_win": np.nan, "avg_overround": np.nan
        })
        continue
    n_wins = int(gb["__win__"].sum())
    profit = np.where(gb["__win__"], gb["__odds__"] - 1.0, -1.0)
    roi = float(profit.sum() / n_bets)
    hit_rate = n_wins / n_bets if n_bets > 0 else np.nan
    avg_odds_win = float(pd.to_numeric(gb.loc[gb["__win__"], "__odds__"], errors="coerce").mean()) if n_wins>0 else np.nan
    avg_overround = float(pd.to_numeric(gb["__overround__"], errors="coerce").mean())
    roi_rows.append({
        "Season": int(s),
        "roi": roi,
        "n_bets": n_bets,
        "n_wins": n_wins,
        "hit_rate": float(hit_rate) if np.isfinite(hit_rate) else np.nan,
        "avg_odds_win": avg_odds_win,
        "avg_overround": avg_overround
    })
roi_by_season = pd.DataFrame(roi_rows)

# ---------- 5) Métricas finales por temporada (merge) ----------
final_by_season = (
    metrics_by_season
    .merge(roi_by_season, on="Season", how="left")
    .sort_values("Season")
    .reset_index(drop=True)
)

# ---------- 6) Guardar CSV por temporada y resumen overall ----------
csv_path = OUT / "metrics_market_by_season.csv"
final_by_season.to_csv(csv_path, index=False)

def wavg(col, weight):
    c = pd.to_numeric(final_by_season[col], errors="coerce")
    w = pd.to_numeric(final_by_season[weight], errors="coerce").fillna(0)
    return float(np.nansum(c*w) / np.nansum(w)) if np.nansum(w) > 0 else np.nan

overall = {
    "generated_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    "model": "market_argmax(pimp1,pimpx,pimp2)",
    "overall": {
        "n_scored_total": int(final_by_season["n_scored"].fillna(0).sum()),
        "n_bets_total": int(final_by_season["n_bets"].fillna(0).sum()),
        "accuracy_overall": wavg("accuracy","n_scored"),
        "logloss_overall":  wavg("logloss","n_scored"),
        "brier_overall":    wavg("brier","n_scored"),
        "roi_overall":      wavg("roi","n_bets"),
        "hit_rate_overall": wavg("hit_rate","n_bets"),
        "avg_overround_overall": float(pd.to_numeric(final_by_season["avg_overround"], errors="coerce").mean()),
        "avg_conf_overall": float(pd.to_numeric(m.loc[scored_mask, "conf_maxprob"], errors="coerce").mean()),
        "avg_entropy_overall": float(pd.to_numeric(m.loc[scored_mask, "entropy"], errors="coerce").mean()),
        "avg_margin_overall": float(pd.to_numeric(m.loc[scored_mask, "margin_top12"], errors="coerce").mean()),
    }
}

json_path = OUT / "metrics_market_overall.json"
with open(json_path, "w", encoding="utf-8") as f:
    json.dump(overall, f, ensure_ascii=False, indent=2)

print("✔ Métricas del modelo de mercado guardadas:")
print(" -", csv_path)
print(" -", json_path)
display(final_by_season.head(20))

✔ Métricas del modelo de mercado guardadas:
 - outputs/metrics_market_by_season.csv
 - outputs/metrics_market_overall.json


Unnamed: 0,Season,accuracy,logloss,brier,n_scored,roi,n_bets,n_wins,hit_rate,avg_odds_win,avg_overround
0,2010,0.565789,1.391905,0.595885,380,0.031737,380,215,0.565789,1.823535,1.065656
1,2011,0.478947,1.424223,0.631134,380,-0.160789,380,182,0.478947,1.752198,1.064498
2,2012,0.5,1.388738,0.601052,380,-0.126,380,190,0.5,1.748,1.063707
3,2013,0.492105,1.424631,0.617362,380,-0.157053,380,187,0.492105,1.712941,1.06363
4,2014,0.465789,1.414498,0.644767,380,-0.200526,380,177,0.465789,1.716384,1.055137
5,2015,0.502632,1.445791,0.61687,380,-0.106737,380,191,0.502632,1.777173,1.051227
6,2016,0.526316,1.472547,0.612039,380,-0.071474,380,200,0.526316,1.7642,1.050764
7,2017,0.510526,1.424211,0.608548,380,-0.089553,380,194,0.510526,1.783351,1.052719
8,2018,0.455263,1.372568,0.62977,380,-0.180105,380,173,0.455263,1.800925,1.052628
9,2019,0.497368,1.356642,0.617235,380,-0.0605,380,189,0.497368,1.888942,1.054675


In [108]:
# ============================================================
# MATCHLOGS — MODELO MERCADO (argmax pimp1/pimpx/pimp2) → CSV por temporada
# Genera: outputs/matchlogs_market_<Season>.csv
# ============================================================

OUT_DIR = Path("outputs")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# ---------- Helpers robustos ----------
def _norm_name(s: str) -> str:
    return re.sub(r'[^a-z0-9]+', '', str(s).strip().lower())

def _find_col(df, candidates):
    norm2real = {_norm_name(c): c for c in df.columns}
    for cand in candidates:
        if _norm_name(cand) in norm2real:
            return norm2real[_norm_name(cand)]
    return None

def _infer_team_cols(df):
    home_candidates = ["HomeTeam_norm","HomeTeam","home_team","Home","local"]
    away_candidates = ["AwayTeam_norm","AwayTeam","away_team","Away","visitor","visiting"]
    home_col = _find_col(df, home_candidates)
    away_col = _find_col(df, away_candidates)
    if home_col is None or away_col is None:
        raise KeyError(f"No encuentro columnas Home/Away. Cols: {list(df.columns)[:40]}")
    return home_col, away_col

def _coalesce_suffix(mdf: pd.DataFrame, base: str) -> pd.DataFrame:
    cx, cy = f"{base}_x", f"{base}_y"
    if cx in mdf.columns or cy in mdf.columns:
        if cx in mdf.columns and cy in mdf.columns:
            mdf[base] = mdf[cx].where(mdf[cx].notna(), mdf[cy])
        elif cx in mdf.columns:
            mdf[base] = mdf[cx]
        else:
            mdf[base] = mdf[cy]
        mdf.drop(columns=[c for c in (cx, cy) if c in mdf.columns], inplace=True)
    return mdf

def _build_pred_key_like_pipeline(df_in, home_col=None, away_col=None):
    d = df_in.copy()
    d["Date"] = pd.to_datetime(d["Date"], errors="coerce")
    if home_col is None or away_col is None:
        home_col, away_col = _infer_team_cols(d)
    d["Season"] = pd.to_numeric(d["Season"], errors="coerce").astype("Int64")
    date_key = d["Date"].dt.tz_localize(None, nonexistent="NaT", ambiguous="NaT").dt.floor("D")
    d["pred_key"] = (
        d["Season"].astype("Int64").astype(str) + "|" +
        date_key.dt.strftime("%Y-%m-%d") + "|" +
        d[home_col].astype(str) + "|" +
        d[away_col].astype(str)
    )
    return d

def _attach_matchday_from_df(merged_in: pd.DataFrame, df: pd.DataFrame) -> pd.DataFrame:
    """
    Trae Matchday (= df[Matchweek]) usando alineación determinista por (Date,row_in_date).
    Fallback: por pred_key base (sin '#k') si hiciera falta.
    """
    m = merged_in.copy()
    m["Date"] = pd.to_datetime(m["Date"], errors="coerce")
    df2 = df.copy()
    df2["Date"] = pd.to_datetime(df2["Date"], errors="coerce")

    # Detecta columna Matchweek en df
    mw_col = _find_col(df2, ["Matchweek","MatchWeek","matchweek","Jornada","Gameweek","GW","Week","MD"])
    if mw_col is None:
        raise KeyError("No se encontró columna de jornada (Matchweek) en df.")

    # Alineación por (Date,row_in_date)
    df2_sorted = df2.sort_values("Date", kind="mergesort").reset_index(drop=True)
    df2_sorted["row_in_date"] = df2_sorted.groupby("Date").cumcount()

    m_sorted = m.sort_values("Date", kind="mergesort").reset_index(drop=True)
    m_sorted["row_in_date"] = m_sorted.groupby("Date").cumcount()

    bring = df2_sorted[["Date","row_in_date", mw_col]].rename(columns={mw_col: "Matchday"})
    m_sorted = m_sorted.merge(bring, on=["Date","row_in_date"], how="left", validate="1:1")

    # Fallback por pred_key base para los NaN
    if m_sorted["Matchday"].isna().any():
        missing = m_sorted["Matchday"].isna()
        if "pred_key" not in m_sorted.columns or "pred_key" not in df2_sorted.columns:
            hm_m, aw_m = _infer_team_cols(m_sorted)
            m_sorted = _build_pred_key_like_pipeline(m_sorted, hm_m, aw_m)
            hm_d, aw_d = _infer_team_cols(df2_sorted)
            df2_sorted = _build_pred_key_like_pipeline(df2_sorted, hm_d, aw_d)
        m_sorted["pred_key_base"] = m_sorted["pred_key"].astype(str).str.split("#", n=1, expand=True)[0]
        df2_sorted["pred_key_base"] = df2_sorted["pred_key"].astype(str).str.split("#", n=1, expand=True)[0]
        aux = (df2_sorted[["pred_key_base", mw_col]]
               .drop_duplicates("pred_key_base")
               .rename(columns={mw_col:"Matchday_fb"}))
        m_sorted = m_sorted.merge(aux, on="pred_key_base", how="left")
        m_sorted.loc[missing, "Matchday"] = m_sorted.loc[missing, "Matchday_fb"]
        m_sorted.drop(columns=["pred_key_base","Matchday_fb"], inplace=True, errors="ignore")

    return m_sorted

# ---------- Carga y saneo ----------
m = merged.copy()

# Coalesce _x/_y si existen
for base in ["Season","HomeTeam_norm","AwayTeam_norm","HomeTeam","AwayTeam"]:
    m = _coalesce_suffix(m, base)

# Home/Away canónicas
home_col_real, away_col_real = _infer_team_cols(m)
if "HomeTeam_norm" not in m.columns:
    m["HomeTeam_norm"] = m[home_col_real]
if "AwayTeam_norm" not in m.columns:
    m["AwayTeam_norm"] = m[away_col_real]

# Tipos/numéricos
m["Date"] = pd.to_datetime(m["Date"], errors="coerce")
for c in ["pimp1","pimpx","pimp2","B365H","B365D","B365A"]:
    if c in m.columns:
        m[c] = pd.to_numeric(m[c], errors="coerce")
m["Season"] = pd.to_numeric(m["Season"], errors="coerce").astype("Int64")
m["y_true"] = m["y_true"].astype(str).str.upper().str.strip()

# ---------- Probabilidades de mercado (normalizadas) y pick ----------
P_raw = m[["pimp1","pimpx","pimp2"]].to_numpy(dtype=float)  # (H, D, A)
row_sum = np.nansum(P_raw, axis=1, keepdims=True)
row_sum = np.where(row_sum <= 0, np.nan, row_sum)
P_mkt = P_raw / row_sum

m["pH_mkt_pred"] = P_mkt[:,0]
m["pD_mkt_pred"] = P_mkt[:,1]
m["pA_mkt_pred"] = P_mkt[:,2]

with np.errstate(invalid="ignore"):
    best_idx = np.nanargmax(np.where(np.isnan(P_mkt), -np.inf, P_mkt), axis=1)
mask_valid_row = np.isfinite(P_mkt).any(axis=1)
LABELS = np.array(["H","D","A"])
y_pred_market = pd.Series(LABELS[best_idx], dtype="object").where(mask_valid_row, np.nan)
m["y_pred_market"] = y_pred_market

# Confianza/entropía/margen sobre probs de mercado
probs = np.column_stack([m["pH_mkt_pred"], m["pD_mkt_pred"], m["pA_mkt_pred"]]).astype(float)
m["conf_maxprob"] = np.nanmax(probs, axis=1)
sorted_p = np.sort(probs, axis=1)
m["margin_top12"] = sorted_p[:,-1] - sorted_p[:,-2]
m["entropy"] = -(probs * np.log(np.clip(probs, 1e-15, 1.0))).sum(axis=1)

# ---------- Matchday real desde df ----------
m = _attach_matchday_from_df(m, df)

# ---------- Mercado: overround e implícitas (1/odds) ----------
if {"B365H","B365D","B365A"}.issubset(m.columns):
    pH_imp = 1.0/np.clip(m["B365H"].astype(float), 1.0, None)
    pD_imp = 1.0/np.clip(m["B365D"].astype(float), 1.0, None)
    pA_imp = 1.0/np.clip(m["B365A"].astype(float), 1.0, None)
    s_imp = pH_imp.fillna(0) + pD_imp.fillna(0) + pA_imp.fillna(0)
    m["overround"] = s_imp.where(s_imp > 0, np.nan)
else:
    m["overround"] = np.nan

# ---------- Pick: odds, prob, EV, Kelly ----------
def _pick_odds(row):
    if row.get("y_pred_market") == "H": return row.get("B365H", np.nan)
    if row.get("y_pred_market") == "D": return row.get("B365D", np.nan)
    if row.get("y_pred_market") == "A": return row.get("B365A", np.nan)
    return np.nan

def _pick_prob(row):
    y = str(row.get("y_pred_market"))
    if y == "H": return row.get("pH_mkt_pred", np.nan)
    if y == "D": return row.get("pD_mkt_pred", np.nan)
    if y == "A": return row.get("pA_mkt_pred", np.nan)
    return np.nan

m["odds_pick"] = m.apply(_pick_odds, axis=1).astype(float)
m["p_pick"]    = m.apply(_pick_prob,  axis=1).astype(float)

b = np.where(np.isfinite(m["odds_pick"]), m["odds_pick"] - 1.0, np.nan)
m["ev_pick"] = m["p_pick"] * b - (1 - m["p_pick"])
kelly_raw = (m["p_pick"]*b - (1 - m["p_pick"])) / b
m["kelly_pick"] = np.clip(kelly_raw, 0.0, 1.0)
m.loc[~np.isfinite(b), "kelly_pick"] = np.nan

# ---------- Resultado y profit (stake 1) ----------
valid_label = m["y_true"].isin(["H","D","A"])
valid_odds  = np.isfinite(m["odds_pick"]) & (m["odds_pick"] >= 1.01)

m["bet_placed"] = (valid_label & valid_odds).astype(int)
m["correct"]    = ((m["y_true"] == m["y_pred_market"].astype(str).str.upper().str.strip()) & (m["bet_placed"]==1)).astype(int)
m["profit"]     = np.where(m["bet_placed"]==1, -1.0, np.nan)
m.loc[m["correct"]==1, "profit"] = m.loc[m["correct"]==1, "odds_pick"] - 1.0

# ---------- Profit acumulado por temporada ----------
m = m.sort_values(["Season","Matchday","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").reset_index(drop=True)
m["profit_filled"] = pd.to_numeric(m["profit"], errors="coerce").fillna(0.0)
m["cum_profit_season"] = m.groupby("Season", sort=False)["profit_filled"].transform("cumsum")
m.drop(columns=["profit_filled"], inplace=True)

# ---------- Selección de columnas ----------
cols_head = [
    "Season","Matchday","Date","HomeTeam_norm","AwayTeam_norm","pred_key",
    "y_true","y_pred_market",
    "pH_mkt_pred","pD_mkt_pred","pA_mkt_pred","conf_maxprob","entropy","margin_top12",
    "B365H","B365D","B365A","overround",
    "odds_pick","p_pick","ev_pick","kelly_pick",
    "bet_placed","correct","profit","cum_profit_season"
]
cols_exist = [c for c in cols_head if c in m.columns]
log = m[cols_exist].copy()

# ---------- Exportar CSV por temporada ----------
for s, grp in log.groupby("Season", dropna=True):
    out_path = OUT_DIR / f"matchlogs_market_{int(s)}.csv"
    grp.sort_values(["Matchday","Date","HomeTeam_norm","AwayTeam_norm"], kind="mergesort").to_csv(out_path, index=False)

print("✔ Matchlogs del modelo de mercado generados en 'outputs/' (uno por temporada).")

✔ Matchlogs del modelo de mercado generados en 'outputs/' (uno por temporada).
