In [1]:
# --- Parámetros (se pueden sobreescribir en CI) ---
from pathlib import Path
from datetime import datetime
import os
import pandas as pd
import pytz

# Zona horaria para "hoy"
TZ = pytz.timezone("Europe/Madrid")

def _today_tz(tz=TZ) -> str:
    """Return the current calendar date in timezone ``tz`` as a ``YYYY-MM-DD`` string."""
    local_today = datetime.now(tz).date()
    return local_today.isoformat()

# RUN_DATE: priority -> value already defined (papermill/globals) -> env var -> today (Europe/Madrid)
# An empty or placeholder value ("", "auto", "today") means "not set" at either level,
# so a CI job that exports RUN_DATE="" falls through to today's date instead of failing.
_run_injected = globals().get("RUN_DATE", None)
_run_env = os.environ.get("RUN_DATE")
if _run_injected not in (None, "", "auto", "today"):
    RUN_DATE = str(_run_injected)
elif _run_env is not None and _run_env.strip().lower() not in ("", "auto", "today"):
    RUN_DATE = _run_env.strip()
else:
    RUN_DATE = _today_tz()

# Normalize to YYYY-MM-DD; an unparseable value is reported explicitly instead of
# surfacing as an opaque NaT error further down.
_run_ts = pd.to_datetime(RUN_DATE, errors="coerce")
if pd.isna(_run_ts):
    raise ValueError(f"RUN_DATE no es una fecha válida: {RUN_DATE!r}")
RUN_DATE = _run_ts.date().strftime("%Y-%m-%d")

# SEASON: keep an externally provided (truthy) value; otherwise derive it from RUN_DATE (format 2025_26)
SEASON = globals().get("SEASON")
if not SEASON:
    _dt = pd.to_datetime(RUN_DATE)
    # Months before July are assigned to the season that started the previous year
    _y = int(_dt.year) - (0 if _dt.month >= 7 else 1)
    SEASON = "{}_{:02d}".format(_y, (_y + 1) % 100)

# MATCHDAY (jornada): allows external injection (already-defined global first, then env var); defaults to None
MATCHDAY = globals().get("MATCHDAY", os.environ.get("MATCHDAY", None))

# Model version: honors an injected global / env var, otherwise falls back to "xgb-local"
MODEL_VERSION = globals().get("MODEL_VERSION", os.environ.get("MODEL_VERSION", "xgb-local"))

# --- Paths, consistent between local runs and CI (everything hangs off the current working directory) ---
ROOT   = Path.cwd()
DATA   = ROOT / "data"
RAW    = DATA / "01_raw"
PROC   = DATA / "02_processed"
FEAT   = DATA / "03_features"
MODELS = DATA / "04_models"
OUT    = ROOT / "outputs"

# Create every working directory up front so later cells can read/write without checks
for p in [RAW, PROC, FEAT, MODELS, OUT]:
    p.mkdir(parents=True, exist_ok=True)

# Reproducibility: seed both the stdlib and the NumPy global random generators
import random, numpy as np
random.seed(42); np.random.seed(42)

# Echo the resolved run parameters
print(f"RUN_DATE = {RUN_DATE} | SEASON = {SEASON} | MATCHDAY = {MATCHDAY} | MODEL_VERSION = {MODEL_VERSION}")
print(f"ROOT = {ROOT}")

RUN_DATE = 2025-09-28 | SEASON = 2025_26 | MATCHDAY = None | MODEL_VERSION = xgb-local
ROOT = /content


In [2]:
import pandas as pd, json

def load_feat(name: str):
    """Read the parquet file ``name`` located in the FEAT directory and return it."""
    feat_path = FEAT / name
    return pd.read_parquet(feat_path)

def save_model(obj, name: str):
    """Serialize ``obj`` with joblib to MODELS/``name``, creating MODELS if needed."""
    import joblib
    MODELS.mkdir(parents=True, exist_ok=True)
    model_path = MODELS / name
    joblib.dump(obj, model_path)

def save_predictions(df: pd.DataFrame, name: str = "predictions_next.csv"):
    """Write ``df`` as CSV (without the index) to OUT/``name``, creating OUT if needed."""
    OUT.mkdir(parents=True, exist_ok=True)
    csv_path = OUT / name
    df.to_csv(csv_path, index=False)

def save_json(obj, name: str = "metrics_overview.json"):
    """Dump ``obj`` as indented, non-ASCII-escaped UTF-8 JSON to OUT/``name``."""
    OUT.mkdir(parents=True, exist_ok=True)
    json_path = OUT / name
    with json_path.open("w", encoding="utf-8") as handle:
        json.dump(obj, handle, ensure_ascii=False, indent=2)

# **MODELOS**

In [3]:
import json
from collections import defaultdict
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.linear_model import LogisticRegression
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import accuracy_score, log_loss
from imblearn.over_sampling import SMOTE
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import confusion_matrix

# **PREDICCIÓN: Logistic Regression multinomial**

In [4]:
# Load the final feature table from the features directory
IN_PATH = FEAT / "df_final.parquet"
df = pd.read_parquet(IN_PATH)

# Report what was read and preview the first two rows
print("Leído:", IN_PATH, "· filas=", len(df), "· cols=", df.shape[1])
df.head(2)

Leído: /content/data/03_features/df_final.parquet · filas= 7290 · cols= 76


Unnamed: 0,B365A,B365D,B365H,Date,FTR,HomeTeam_norm,AwayTeam_norm,h_elo,a_elo,Season,...,a_squad_size_prev_season,a_pct_foreigners_prev_season,has_xg_data,target,home_playstyle_defensivo,home_playstyle_equilibrado,home_playstyle_ofensivo,away_playstyle_defensivo,away_playstyle_equilibrado,away_playstyle_ofensivo
0,6.0,3.6,1.57,2006-08-26,H,valencia,betis,1857.375122,1726.076904,2006,...,33.0,24.24,0,2.0,False,False,True,True,False,False
1,3.75,3.2,2.0,2006-08-27,D,ath bilbao,sociedad,1755.359253,1701.137573,2006,...,31.0,22.58,0,1.0,False,True,False,False,True,False


Sin SMOTE:

In [5]:
# =========================
# PREDICCIÓN (BASELINE, sin SMOTE) + B365 + export (sin df_old)
# =========================

# --- Detectar automáticamente la próxima jornada COMPLETA (10) ---
def _season_from_run_date(run_date_str: str) -> int:
    d = pd.to_datetime(run_date_str)
    return int(d.year) if d.month >= 7 else int(d.year) - 1

def _find_next_full_matchday(fixtures: pd.DataFrame, season: int, today, min_matches: int = 10):
    """Pick the first complete matchday of ``season`` that has not finished by ``today``.

    ``fixtures`` must have 'Season', 'Wk' and 'Date' columns, with 'Date' already
    converted to ``datetime.date`` values. A matchday qualifies when it has at least
    ``min_matches`` rows and its latest date is on or after ``today``. Candidates are
    ordered by earliest date, then by 'Wk'.

    Returns ``(wk, first_date, last_date)``, or ``None`` when nothing qualifies.
    """
    per_wk = (fixtures[fixtures["Season"] == season]
              .groupby("Wk")
              .agg(n=("Wk", "size"), dmin=("Date", "min"), dmax=("Date", "max"))
              .reset_index()
              .sort_values(["dmin", "Wk"]))
    if per_wk.empty:
        return None
    upcoming = per_wk[(per_wk["n"] >= min_matches) & (per_wk["dmax"] >= today)]
    if upcoming.empty:
        return None
    first = upcoming.iloc[0]
    return int(first["Wk"]), first["dmin"], first["dmax"]

# df_final is already loaded as df
_df_dates = df.copy()
_df_dates["Date"] = pd.to_datetime(_df_dates["Date"]).dt.date

season_auto = _season_from_run_date(RUN_DATE)
today_d = pd.to_datetime(RUN_DATE).date()

wk_next = None; start_date = None; end_date = None

# 1) Try with what df already holds (no date pre-filter, so a matchday that started on Friday is not lost)
_found = _find_next_full_matchday(_df_dates, season_auto, today_d)

# 2) Fallback: matchday-calendar parquet, for when the matchday is not complete in df yet
if _found is None:
    PROC = Path(PROC) if "PROC" in globals() else Path("./data/02_processed")
    for wk_path in [PROC/"wk_actualizado_2005_2025.parquet", PROC/"wk_2005_2025.parquet"]:
        if not wk_path.exists():
            continue
        wk = pd.read_parquet(wk_path)
        wk = wk[wk["Season"] == season_auto].copy()
        wk["Date"] = pd.to_datetime(wk["Date"], errors="coerce").dt.date
        _found = _find_next_full_matchday(wk, season_auto, today_d)
        if _found is not None:
            break

if _found is not None:
    wk_next, start_date, end_date = _found

assert wk_next is not None, "No pude detectar la próxima jornada."
PRED_SEASON = season_auto
print(f"[AUTO] Próxima jornada: Season={PRED_SEASON}  Wk={wk_next}  ({start_date}–{end_date})")

# --- Normalize dates in df (df_final already loaded) ---
df = df.copy()
df["Date"] = pd.to_datetime(df["Date"]).dt.date

# --- Rows to predict: selected by full matchday (not by date range) ---
mask_pred = (
    (df["Season"] == PRED_SEASON) &
    (df["Wk"] == wk_next)
)
pred_idx_sorted = (
    df.loc[mask_pred]
      .assign(_idx=lambda x: x.index)
      .sort_values(["Date","_idx"]).index.tolist()
)
print(f"[BASE] partidos a predecir: {len(pred_idx_sorted)} en jornada {wk_next}")

# Fail early and clearly: the matchday may have been detected from the fallback
# calendar while df itself holds no rows for it, which would otherwise only
# surface later as an empty-input error inside the model.
if not pred_idx_sorted:
    raise ValueError(
        f"No hay partidos de la jornada {wk_next} (Season={PRED_SEASON}) en df_final; no se puede predecir."
    )

# --- X, y avoiding leakage (team names are listed so they are NOT used as features) ---
drop_cols = [
    'FTR','target','Date','has_xg_data','overround','pimp2','B365D',
    'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
    'HomeTeam_norm','AwayTeam_norm',  # keep team names out of the model
    'row_id'
]
drop_cols = [c for c in drop_cols if c in df.columns]

X = df.drop(columns=drop_cols)
y = df["target"]

# Train only on labelled matches played strictly before the matchday starts
df_dates = pd.to_datetime(df["Date"], errors="coerce").dt.date
mask_train = (y.notna()) & (df_dates < start_date)
# 'Season' is not a feature: drop it if present (non-mutating, returns new frames)
X_train = X.loc[mask_train].drop(columns=["Season"], errors="ignore")
y_train = y.loc[mask_train].astype(int)

# Prediction matrix in the SAME row order that will be exported, with columns aligned to training
X_pred = X.loc[pred_idx_sorted].drop(columns=["Season"], errors="ignore")
X_pred = X_pred.reindex(columns=X_train.columns, fill_value=np.nan)

# --- Baseline model: median imputation -> standardization -> logistic regression ---
pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler()),
    ("logreg",  LogisticRegression(solver="lbfgs", max_iter=1000, random_state=42))
])
pipe.fit(X_train, y_train)

# --- Prediction (rows are already in the final export order) ---
proba_pred  = pipe.predict_proba(X_pred)
pred_labels = pipe.predict(X_pred)

# Map encoded classes to 1X2 labels (0 -> "A", 1 -> "D", 2 -> "H")
class_map = {0:"A", 1:"D", 2:"H"}
classes    = list(pipe.named_steps["logreg"].classes_)  # e.g. [0,1,2]
pred_1x2   = pd.Series(pred_labels).map(class_map).values

# Probabilities per H/D/A, robust to the class order reported by the model;
# a label the model never saw gets a NaN column
proba_df = pd.DataFrame(proba_pred, columns=[class_map[c] for c in classes])
for lab in ["H","D","A"]:
    if lab not in proba_df.columns:
        proba_df[lab] = np.nan
proba_df = proba_df[["H","D","A"]].reset_index(drop=True)

# --- Names, odds, matchday and dates taken from df in prediction order ---
need_cols = ["Date","HomeTeam_norm","AwayTeam_norm","Wk","B365H","B365D","B365A"]
missing = [c for c in need_cols if c not in df.columns]
assert not missing, f"Faltan columnas en df_final: {missing}"

meta_ord = df.loc[pred_idx_sorted, need_cols].copy().reset_index(drop=True)
meta_ord = meta_ord.rename(columns={"Wk": "jornada"})

# Implied probabilities and overround: inverse odds, their row sum, and the
# inverse odds normalized by that sum (division warnings are silenced)
with np.errstate(divide="ignore", invalid="ignore"):
    inv = 1.0 / meta_ord[["B365H","B365D","B365A"]]
overround = inv.sum(axis=1)
imp = inv.div(overround, axis=0)
imp.columns = ["Imp_H","Imp_D","Imp_A"]

# --- Final result + export ---
# All pieces share a 0..n-1 index, so the column-wise concat lines rows up positionally
out_base = pd.concat([
    meta_ord[["Date","jornada","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]],
    pd.Series(pred_1x2, name="Pred"),
    proba_df.rename(columns={"H":"Prob_H","D":"Prob_D","A":"Prob_A"}),
    imp,
    overround.rename("Overround"),
], axis=1)

# Make sure the OUT folder exists (define it relative to "." if the config cell was not run)
try:
    OUT
except NameError:
    ROOT = Path(".")
    OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

suffix = f"{PRED_SEASON}_{start_date}_{end_date}"

# Suffixed copies (historical record)
out_base.to_csv( OUT / f"predictions_{suffix}_base.csv", index=False)
out_base.to_json(OUT / f"predictions_{suffix}_base.json", orient="records", force_ascii=False, indent=2)

# "current" copies (for the app)
out_base.to_csv( OUT / "predictions_current_base.csv", index=False)
out_base.to_json(OUT / "predictions_current_base.json", orient="records", force_ascii=False, indent=2)

display(out_base.head(10))
print("Exportado BASE en:", OUT)

[AUTO] Próxima jornada: Season=2025  Wk=7  (2025-09-26–2025-09-29)
[BASE] partidos a predecir: 10 en jornada 7


Unnamed: 0,Date,jornada,HomeTeam_norm,AwayTeam_norm,B365H,B365D,B365A,Pred,Prob_H,Prob_D,Prob_A,Imp_H,Imp_D,Imp_A,Overround
0,2025-09-26,7,girona,espanol,2.55,3.3,2.75,D,0.284438,0.406349,0.309214,0.37037,0.286195,0.343434,1.058824
1,2025-09-27,7,getafe,levante,2.05,3.1,4.1,D,0.330293,0.348261,0.321446,0.462687,0.30597,0.231343,1.054288
2,2025-09-27,7,ath madrid,real madrid,3.0,3.6,2.25,A,0.236097,0.340303,0.4236,0.315789,0.263158,0.421053,1.055556
3,2025-09-27,7,villarreal,ath bilbao,2.1,3.25,3.7,H,0.400736,0.355985,0.243279,0.451728,0.291886,0.256386,1.054153
4,2025-09-27,7,mallorca,alaves,2.35,3.0,3.4,D,0.39483,0.40234,0.20283,0.40412,0.316561,0.279319,1.052983
5,2025-09-28,7,vallecano,sevilla,2.0,3.3,4.0,H,0.467457,0.345514,0.187029,0.47482,0.28777,0.23741,1.05303
6,2025-09-28,7,betis,osasuna,1.75,3.8,4.5,H,0.458432,0.338874,0.202694,0.540711,0.249012,0.210277,1.056809
7,2025-09-28,7,elche,celta,2.87,3.1,2.6,A,0.263479,0.317903,0.418618,0.330071,0.305582,0.364347,1.055628
8,2025-09-28,7,barcelona,sociedad,1.3,5.5,9.5,H,0.739343,0.178818,0.081839,0.728223,0.172125,0.099652,1.056312
9,2025-09-29,7,valencia,real oviedo,1.7,3.5,5.5,H,0.47951,0.359695,0.160795,0.557164,0.270622,0.172214,1.055768


Exportado BASE en: /content/outputs


Con SMOTE:

In [6]:
# =========================
# PREDICCIÓN (SMOTE) + B365 + export (sin df_old)
# =========================

# --- Detectar automáticamente la próxima jornada COMPLETA (10), como en baseline actualizado ---
def _season_from_run_date(run_date_str: str) -> int:
    d = pd.to_datetime(run_date_str)
    return int(d.year) if d.month >= 7 else int(d.year) - 1

def _find_next_full_matchday(fixtures: pd.DataFrame, season: int, today, min_matches: int = 10):
    """Pick the first complete matchday of ``season`` that has not finished by ``today``.

    ``fixtures`` must have 'Season', 'Wk' and 'Date' columns, with 'Date' already
    converted to ``datetime.date`` values. A matchday qualifies when it has at least
    ``min_matches`` rows and its latest date is on or after ``today``. Candidates are
    ordered by earliest date, then by 'Wk'.

    Returns ``(wk, first_date, last_date)``, or ``None`` when nothing qualifies.
    """
    per_wk = (fixtures[fixtures["Season"] == season]
              .groupby("Wk")
              .agg(n=("Wk", "size"), dmin=("Date", "min"), dmax=("Date", "max"))
              .reset_index()
              .sort_values(["dmin", "Wk"]))
    if per_wk.empty:
        return None
    upcoming = per_wk[(per_wk["n"] >= min_matches) & (per_wk["dmax"] >= today)]
    if upcoming.empty:
        return None
    first = upcoming.iloc[0]
    return int(first["Wk"]), first["dmin"], first["dmax"]

_df_dates = df.copy()
_df_dates["Date"] = pd.to_datetime(_df_dates["Date"]).dt.date

season_auto = _season_from_run_date(RUN_DATE)
today_d = pd.to_datetime(RUN_DATE).date()

wk_next = None; start_date = None; end_date = None

# 1) First with what df already holds (no future-date filter, so a matchday that started on Friday is not lost)
_found = _find_next_full_matchday(_df_dates, season_auto, today_d)

# 2) Fallback: matchday-calendar parquet, for when df is not complete
if _found is None:
    PROC = Path(PROC) if "PROC" in globals() else Path("./data/02_processed")
    for wk_path in [PROC/"wk_actualizado_2005_2025.parquet", PROC/"wk_2005_2025.parquet"]:
        if not wk_path.exists():
            continue
        wk = pd.read_parquet(wk_path)
        wk = wk[wk["Season"] == season_auto].copy()
        wk["Date"] = pd.to_datetime(wk["Date"], errors="coerce").dt.date
        _found = _find_next_full_matchday(wk, season_auto, today_d)
        if _found is not None:
            break

if _found is not None:
    wk_next, start_date, end_date = _found

assert wk_next is not None, "No pude detectar la próxima jornada."
PRED_SEASON = season_auto
print(f"[AUTO] Próxima jornada: Season={PRED_SEASON}  Wk={wk_next}  ({start_date}–{end_date})")

# --- Normalize dates in df (df_final already loaded) ---
df = df.copy()
df["Date"] = pd.to_datetime(df["Date"]).dt.date

# --- Rows to predict: selected by full matchday (not by date range) ---
mask_pred = (
    (df["Season"] == PRED_SEASON) &
    (df["Wk"] == wk_next)
)
pred_idx_sorted = (
    df.loc[mask_pred]
      .assign(_idx=lambda x: x.index)
      .sort_values(["Date","_idx"]).index.tolist()
)
print(f"[SMOTE] partidos a predecir: {len(pred_idx_sorted)} en jornada {wk_next}")

# Fail early and clearly: the matchday may have been detected from the fallback
# calendar while df itself holds no rows for it, which would otherwise only
# surface later as an empty-input error inside the model.
if not pred_idx_sorted:
    raise ValueError(
        f"No hay partidos de la jornada {wk_next} (Season={PRED_SEASON}) en df_final; no se puede predecir."
    )

# --- X, y avoiding leakage (team names are excluded from the features) ---
drop_cols = [
    'FTR','target','Date','has_xg_data','overround','pimp2','B365D',
    'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
    'HomeTeam_norm','AwayTeam_norm',  # <- keep team names out of the model
    'row_id'
]
drop_cols = [c for c in drop_cols if c in df.columns]

X = df.drop(columns=drop_cols)
y = df["target"]

# === KEY CHANGE (aligned with baseline): TRAIN ONLY ON THE PAST ===
df_dates = pd.to_datetime(df["Date"], errors="coerce").dt.date
mask_train = (y.notna()) & (df_dates < start_date)

# 'Season' is not a feature: drop it if present (non-mutating, returns new frames)
X_train = X.loc[mask_train].drop(columns=["Season"], errors="ignore")
y_train = y.loc[mask_train].astype(int)

# Prediction matrix in the SAME row order that will be exported, with columns aligned to training
X_pred = X.loc[pred_idx_sorted].drop(columns=["Season"], errors="ignore")
X_pred = X_pred.reindex(columns=X_train.columns, fill_value=np.nan)

# --- SMOTE model: median imputation -> standardization -> SMOTE oversampling -> logistic regression ---
pipe_sm = ImbPipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler",  StandardScaler()),
    ("smote",   SMOTE(random_state=42)),
    ("logreg",  LogisticRegression(solver="saga", penalty="l2", max_iter=1000, random_state=42))
])
pipe_sm.fit(X_train, y_train)

# --- Prediction (rows are already in the final export order) ---
proba_pred_sm  = pipe_sm.predict_proba(X_pred)
pred_labels_sm = pipe_sm.predict(X_pred)

# Map encoded classes to 1X2 labels (0 -> "A", 1 -> "D", 2 -> "H")
class_map = {0:"A", 1:"D", 2:"H"}
classes_sm = list(pipe_sm.named_steps["logreg"].classes_)
pred_1x2_sm = pd.Series(pred_labels_sm).map(class_map).values

# Probabilities per H/D/A in a fixed column order; a label the model never saw gets a NaN column
proba_df_sm = pd.DataFrame(proba_pred_sm, columns=[class_map[c] for c in classes_sm])
for lab in ["H","D","A"]:
    if lab not in proba_df_sm.columns:
        proba_df_sm[lab] = np.nan
proba_df_sm = proba_df_sm[["H","D","A"]].reset_index(drop=True)

# --- Names, odds, matchday and dates taken directly from df (pred_idx_sorted order) ---
need_cols = ["Date","HomeTeam_norm","AwayTeam_norm","Wk","B365H","B365D","B365A"]
missing = [c for c in need_cols if c not in df.columns]
assert not missing, f"Faltan columnas en df_final: {missing}"

meta_ord = df.loc[pred_idx_sorted, need_cols].copy().reset_index(drop=True)
meta_ord = meta_ord.rename(columns={"Wk": "jornada"})

# Implied probabilities and overround: inverse odds, their row sum, and the
# inverse odds normalized by that sum (division warnings are silenced)
with np.errstate(divide="ignore", invalid="ignore"):
    inv = 1.0 / meta_ord[["B365H","B365D","B365A"]]
overround = inv.sum(axis=1)
imp = inv.div(overround, axis=0)
imp.columns = ["Imp_H","Imp_D","Imp_A"]

# --- Final result + export ---
# All pieces share a 0..n-1 index, so the column-wise concat lines rows up positionally
out_sm = pd.concat([
    meta_ord[["Date","jornada","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]],
    pd.Series(pred_1x2_sm, name="Pred"),
    proba_df_sm.rename(columns={"H":"Prob_H","D":"Prob_D","A":"Prob_A"}),
    imp,
    overround.rename("Overround"),
], axis=1)

# Make sure the OUT folder exists (same one as the baseline)
try:
    OUT
except NameError:
    ROOT = Path(".")
    OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

suffix = f"{PRED_SEASON}_{start_date}_{end_date}"

# Suffixed copies (historical record)
out_sm.to_csv( OUT / f"predictions_{suffix}_smote.csv", index=False)
out_sm.to_json(OUT / f"predictions_{suffix}_smote.json", orient="records", force_ascii=False, indent=2)

# "current" copies (for the app)
out_sm.to_csv( OUT / "predictions_current_smote.csv", index=False)
out_sm.to_json(OUT / "predictions_current_smote.json", orient="records", force_ascii=False, indent=2)

display(out_sm.head(10))
print("Exportado SMOTE en:", OUT)

[AUTO] Próxima jornada: Season=2025  Wk=7  (2025-09-26–2025-09-29)
[SMOTE] partidos a predecir: 10 en jornada 7


Unnamed: 0,Date,jornada,HomeTeam_norm,AwayTeam_norm,B365H,B365D,B365A,Pred,Prob_H,Prob_D,Prob_A,Imp_H,Imp_D,Imp_A,Overround
0,2025-09-26,7,girona,espanol,2.55,3.3,2.75,D,0.165208,0.534987,0.299805,0.37037,0.286195,0.343434,1.058824
1,2025-09-27,7,getafe,levante,2.05,3.1,4.1,D,0.199491,0.447436,0.353072,0.462687,0.30597,0.231343,1.054288
2,2025-09-27,7,ath madrid,real madrid,3.0,3.6,2.25,D,0.120587,0.472663,0.40675,0.315789,0.263158,0.421053,1.055556
3,2025-09-27,7,villarreal,ath bilbao,2.1,3.25,3.7,D,0.228825,0.519527,0.251649,0.451728,0.291886,0.256386,1.054153
4,2025-09-27,7,mallorca,alaves,2.35,3.0,3.4,D,0.2401,0.557328,0.202573,0.40412,0.316561,0.279319,1.052983
5,2025-09-28,7,vallecano,sevilla,2.0,3.3,4.0,D,0.278788,0.536563,0.184649,0.47482,0.28777,0.23741,1.05303
6,2025-09-28,7,betis,osasuna,1.75,3.8,4.5,D,0.281199,0.501805,0.216996,0.540711,0.249012,0.210277,1.056809
7,2025-09-28,7,elche,celta,2.87,3.1,2.6,A,0.15343,0.415525,0.431045,0.330071,0.305582,0.364347,1.055628
8,2025-09-28,7,barcelona,sociedad,1.3,5.5,9.5,H,0.570768,0.306818,0.122414,0.728223,0.172125,0.099652,1.056312
9,2025-09-29,7,valencia,real oviedo,1.7,3.5,5.5,D,0.314457,0.515624,0.169919,0.557164,0.270622,0.172214,1.055768


Exportado SMOTE en: /content/outputs


# **EVALUACIÓN HISTÓRICA: Logistic Regression multinomial**

In [7]:
# Reload df_final from disk for the historical evaluation; this replaces the
# in-memory df whose 'Date' column was converted by the prediction cells
IN_PATH = FEAT / "df_final.parquet"
df = pd.read_parquet(IN_PATH)

Sin SMOTE:

In [8]:
# ============================================
# Eval LogReg (SIN SMOTE) walk-forward por jornada → métricas POR TEMPORADA
# ============================================

# --- if df is not defined, try to load it from the project layout ---
try:
    df
except NameError:
    # Define only the path pieces that are missing, then read the feature table
    try:
        ROOT
    except NameError:
        ROOT = Path(".")
    try:
        DATA
    except NameError:
        DATA = ROOT / "data"
    FEAT = DATA / "03_features"
    df = pd.read_parquet(FEAT / "df_final.parquet").reset_index(drop=True)

# ---------- util: asegurar orden [0,1,2] en y_proba ----------
def _ensure_probs_012(y_proba: np.ndarray, classes_model: np.ndarray) -> np.ndarray:
    """Devuelve matriz (N,3) en orden fijo [0,1,2]; si falta alguna clase en el modelo, rellena con NaN."""
    pos = {int(c): i for i, c in enumerate(classes_model)}
    out = np.full((y_proba.shape[0], 3), np.nan, dtype=float)
    for cls in (0, 1, 2):
        if cls in pos:
            out[:, cls] = y_proba[:, pos[cls]]
    return out

# ===== Eval: LogReg SIN SMOTE (walk-forward por jornada, salida por temporada) =====
def run_logreg_eval_no_smote(
    df: pd.DataFrame,
    train_until_season: int = 2023,
    test_until_season: int | None = None,
    with_odds: bool = True,
    random_state: int = 42,
):
    """
    SALIDA (igual que antes): métricas agregadas POR TEMPORADA.
    NOVEDAD: el TEST se evalúa jornada a jornada (walk-forward).
    Para cada jornada del test, se entrena SOLO con partidos anteriores a la fecha mínima de esa jornada.
    """

    # --- columnas a excluir de X (mismas reglas que usas en todo el notebook) ---
    drop_cols_common = [
        'FTR','target','Date','has_xg_data',
        'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
        'HomeTeam_norm','AwayTeam_norm','row_id'
    ]
    drop_cols_mode = (['overround','pimp2','B365D'] if with_odds else
                      ['fase_temporada_inicio','fase_temporada_mitad',
                       'B365H','B365D','B365A','overround','pimp1','pimpx','pimp2'])
    drop_cols = list(dict.fromkeys(drop_cols_common + drop_cols_mode))

    # --- X/y + filas válidas ---
    y_all = df['target']
    X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

    valid = y_all.notna()
    if with_odds:
        # si usamos cuotas como features (H y A), exige que existan
        for c in ['B365H','B365A']:
            if c in df.columns:
                valid &= df[c].notna()
    valid &= X_all.notna().all(axis=1)

    X_all = X_all.loc[valid].copy()
    y_all = y_all.loc[valid].astype(int)

    if 'Season' not in X_all.columns:
        raise ValueError("Falta 'Season' en los datos.")
    if 'Wk' not in df.columns:
        raise ValueError("Falta 'Wk' para el walk-forward por jornada.")

    # fechas reales para el corte temporal
    dates_all = pd.to_datetime(df.loc[X_all.index, 'Date'], errors='coerce')

    # --- seasons de test (como en tu código original) ---
    test_mask_season = X_all['Season'] > train_until_season
    if test_until_season is not None:
        test_mask_season &= (X_all['Season'] <= test_until_season)
    seasons_test = sorted(X_all.loc[test_mask_season, 'Season'].dropna().astype(int).unique())

    if not seasons_test:
        print("⚠️ TEST vacío tras filtrar seasons.")
        return None, None, None, None, None, None, None

    # acumuladores (test de toda la season)
    all_idx_test, all_y_true, all_y_pred, all_y_proba = [], [], [], []
    train_metrics_per_wk = []
    last_model = None
    last_scaler = None

    for seas in seasons_test:
        idx_season = X_all.index[X_all['Season'] == seas]
        wk_info = (pd.DataFrame({
                        'idx': idx_season,
                        'Wk':  df.loc[idx_season, 'Wk'].values,
                        'Date': dates_all.loc[idx_season].values
                   })
                   .dropna(subset=['Wk','Date']))
        if wk_info.empty:
            continue

        # orden de jornadas según la fecha mínima (y Wk como desempate natural del groupby)
        wk_order = (wk_info.groupby('Wk')['Date']
                            .min()
                            .sort_values(kind='mergesort')
                            .index.tolist())

        for wk in wk_order:
            idx_wk = wk_info.loc[wk_info['Wk'] == wk, 'idx'].tolist()
            if not idx_wk:
                continue

            cut_date = pd.to_datetime(wk_info.loc[wk_info['Wk'] == wk, 'Date']).min()

            # TRAIN: todo lo anterior al primer partido de la jornada
            train_mask = (dates_all < cut_date)
            X_tr_full = X_all.loc[train_mask].copy()
            y_tr_full = y_all.loc[train_mask].copy()

            # TEST: solo la jornada wk
            X_te_full = X_all.loc[idx_wk].copy()
            y_te_full = y_all.loc[idx_wk].copy()

            # quitar Season de features
            X_tr = X_tr_full.drop(columns=['Season']) if 'Season' in X_tr_full.columns else X_tr_full
            X_te = X_te_full.drop(columns=['Season']) if 'Season' in X_te_full.columns else X_te_full

            if (len(X_tr) == 0) or (len(np.unique(y_tr_full)) < 2):
                continue

            # escalado + modelo de la jornada
            scaler = StandardScaler()
            X_tr_s = scaler.fit_transform(X_tr)
            X_te_s = scaler.transform(X_te)

            model = LogisticRegression(solver='saga', penalty='l2', max_iter=1000, random_state=random_state)
            model.fit(X_tr_s, y_tr_full)

            # métricas de TRAIN (por jornada)
            ytr_pred  = model.predict(X_tr_s)
            ytr_proba = model.predict_proba(X_tr_s)
            classes_used = model.classes_
            ytr_bin  = label_binarize(y_tr_full, classes=classes_used)
            brier_tr = float(np.mean(np.sum((ytr_proba - ytr_bin)**2, axis=1)))
            acc_tr   = float(accuracy_score(y_tr_full, ytr_pred))
            ll_tr    = float(log_loss(y_tr_full, ytr_proba, labels=classes_used))
            train_metrics_per_wk.append({
                "n_train": int(len(y_tr_full)),
                "accuracy": acc_tr,
                "log_loss": ll_tr,
                "brier": brier_tr
            })

            # predicción TEST (jornada)
            yte_pred  = model.predict(X_te_s)
            yte_proba = model.predict_proba(X_te_s)
            yte_proba_012 = _ensure_probs_012(yte_proba, classes_model=classes_used)

            all_idx_test.extend(idx_wk)
            all_y_true.extend(y_te_full.tolist())
            all_y_pred.extend(yte_pred.tolist())
            all_y_proba.append(yte_proba_012)

            last_model = model
            last_scaler = scaler

    if not all_idx_test:
        print("⚠️ No hubo jornadas válidas en test.")
        return None, None, None, None, None, None, None

    # agregación de TEST por temporada (formato idéntico al original)
    y_test_concat  = np.array(all_y_true, dtype=int)
    y_pred_concat  = np.array(all_y_pred, dtype=int)
    y_proba_concat = np.vstack(all_y_proba)  # (N,3) con posibles NaN si faltó alguna clase

    # proba segura para log_loss (sin NaN y normalizada por fila)
    proba_safe = y_proba_concat.copy()
    proba_safe[np.isnan(proba_safe)] = 0.0
    row_sums = proba_safe.sum(axis=1, keepdims=True)
    zero_rows = (row_sums == 0).ravel()
    if zero_rows.any():
        proba_safe[zero_rows, :] = 1.0/3.0
        row_sums[zero_rows, :] = 1.0
    proba_safe = proba_safe / row_sums

    y_bin_full = label_binarize(y_test_concat, classes=[0,1,2])
    brier_te = float(np.mean(np.sum((proba_safe - y_bin_full)**2, axis=1)))
    acc_te   = float(accuracy_score(y_test_concat, y_pred_concat))
    ll_te    = float(log_loss(y_test_concat, proba_safe, labels=[0,1,2]))

    # TRAIN agregado (promedio ponderado por nº de train de cada jornada)
    if train_metrics_per_wk:
        w = np.array([m["n_train"] for m in train_metrics_per_wk], dtype=float)
        w /= w.sum()
        acc_tr_w = float(np.sum([m["accuracy"] * w[i] for i, m in enumerate(train_metrics_per_wk)]))
        ll_tr_w  = float(np.sum([m["log_loss"] * w[i]  for i, m in enumerate(train_metrics_per_wk)]))
        br_tr_w  = float(np.sum([m["brier"] * w[i]     for i, m in enumerate(train_metrics_per_wk)]))
        n_tr_last = int(train_metrics_per_wk[-1]["n_train"])
    else:
        acc_tr_w = ll_tr_w = br_tr_w = np.nan
        n_tr_last = 0

    metrics_train = {
        "accuracy": acc_tr_w,
        "log_loss": ll_tr_w,
        "brier":    br_tr_w,
        "n_train":  n_tr_last
    }
    seasons_text = f"{train_until_season+1}..{test_until_season}" if test_until_season is not None else f">{train_until_season}"
    metrics_test = {
        "accuracy": acc_te,
        "log_loss": ll_te,
        "brier":    brier_te,
        "n_test":   int(len(y_test_concat)),
        "season_min": int(min(seasons_test)),
        "season_max": int(max(seasons_test)),
    }

    print("Logistic Regression (sin SMOTE)", "(con cuotas)" if with_odds else "(sin cuotas)")
    print("\n=== Train (promedio ponderado por jornada) ==="); print(metrics_train)
    print(f"\n=== Test (Seasons {seasons_text}, walk-forward por jornada) ==="); print(metrics_test)

    return last_model, last_scaler, (metrics_train, metrics_test), \
           pd.Series(y_test_concat, index=all_idx_test), \
           y_pred_concat, proba_safe, np.array(all_idx_test)

# ===== Loop that writes eval_grid.json and metrics_by_season.csv =====
# Reuse the notebook-wide OUT when it is already defined (same guard as the
# prediction cells) instead of unconditionally rebinding ROOT/OUT.
try:
    OUT
except NameError:
    ROOT = Path(".")
    OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

seasons_all = sorted(df["Season"].dropna().astype(int).unique())

rows = []
for test_season in seasons_all:
    # each season is tested with a model history that ends the season before
    train_until = test_season - 1
    if train_until < seasons_all[0]:
        continue

    try:
        result = run_logreg_eval_no_smote(
            df,
            train_until_season=train_until,
            test_until_season=test_season,
            with_odds=True,
            random_state=42
        )
        # An empty evaluation carries no metrics (None or (None, None)); check
        # BEFORE unpacking so it is skipped silently rather than reported as an error.
        metrics = result[2]
        if metrics is None or metrics[1] is None:
            continue
        model, scaler, (mtr_tr, mtr_te), y_test, yte_pred, yte_proba, idx_test = result

        # range of matchdays present in that season's test set (for reference)
        wk_min = wk_max = None
        if "Wk" in df.columns and len(idx_test):
            wks = pd.to_numeric(df.loc[idx_test, "Wk"], errors="coerce").dropna().astype(int)
            if len(wks):
                wk_min = int(wks.min())
                wk_max = int(wks.max())

        # plain int/float casts keep the record JSON-serializable
        rows.append({
            "train_until": int(train_until),
            "test_season": int(test_season),
            "metrics_train": {
                "accuracy": float(mtr_tr["accuracy"]),
                "log_loss": float(mtr_tr["log_loss"]),
                "brier":    float(mtr_tr["brier"]),
                "n_train":  int(mtr_tr["n_train"]),
            },
            "metrics_test": {
                "accuracy": float(mtr_te["accuracy"]),
                "log_loss": float(mtr_te["log_loss"]),
                "brier":    float(mtr_te["brier"]),
                "n_test":   int(mtr_te["n_test"]),
                "season_min": int(mtr_te["season_min"]),
                "season_max": int(mtr_te["season_max"]),
                "wk_min": wk_min,
                "wk_max": wk_max,
            }
        })
    except Exception as e:
        # best effort: a season that fails is reported and skipped, the grid continues
        print(f"[SKIP] test={test_season} → {e}")

# save outputs (eval_grid.json is always written, even when rows is empty)
with open(OUT / "eval_grid.json", "w", encoding="utf-8") as f:
    json.dump(rows, f, ensure_ascii=False, indent=2)

if rows:
    # one flat record per test season with the test metrics only
    flat = [
        {
            "test_season": r["test_season"],
            "train_until": r["train_until"],
            "acc_test":    r["metrics_test"]["accuracy"],
            "logloss_test":r["metrics_test"]["log_loss"],
            "brier_test":  r["metrics_test"]["brier"],
            "n_test":      r["metrics_test"]["n_test"],
            "wk_min":      r["metrics_test"]["wk_min"],
            "wk_max":      r["metrics_test"]["wk_max"],
        }
        for r in rows
    ]
    pd.DataFrame(flat).sort_values("test_season").to_csv(
        OUT / "metrics_by_season.csv", index=False
    )

print(f"Guardados:\n- {OUT/'eval_grid.json'}\n- {OUT/'metrics_by_season.csv'}")

Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.6, 'log_loss': 0.8503040399360499, 'brier': 0.5104307269727479, 'n_train': 380}

=== Test (Seasons 2007..2007, walk-forward por jornada) ===
{'accuracy': 0.4263157894736842, 'log_loss': 1.2349125387397448, 'brier': 0.7084656252008951, 'n_test': 380, 'season_min': 2007, 'season_max': 2007}
Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5565789473684211, 'log_loss': 0.9214888832439952, 'brier': 0.552017798278286, 'n_train': 760}

=== Test (Seasons 2008..2008, walk-forward por jornada) ===
{'accuracy': 0.48157894736842105, 'log_loss': 1.0946781196370998, 'brier': 0.6514989943768685, 'n_test': 380, 'season_min': 2008, 'season_max': 2008}
Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5482456140350878, 'log_loss': 0.9341333936534477, 'brier': 0.5582402681906519, 'n

In [9]:
# LOCAL: single evaluation run whose test set is the seasons after 2023 up to 2024,
# keeping the odds features; the results stay in notebook-level variables
model, scaler, (mtr_tr, mtr_te), y_test, y_pred, y_proba, idx_test = \
    run_logreg_eval_no_smote(df, train_until_season=2023, test_until_season=2024, with_odds=True)

Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5497076023391813, 'log_loss': 0.9484854524823103, 'brier': 0.562807791558073, 'n_train': 6840}

=== Test (Seasons 2024..2024, walk-forward por jornada) ===
{'accuracy': 0.5736842105263158, 'log_loss': 0.9582196819419253, 'brier': 0.5659048337474383, 'n_test': 380, 'season_min': 2024, 'season_max': 2024}


Con SMOTE:

In [10]:
# ============================================
# Eval LogReg (CON SMOTE) walk-forward por jornada → métricas POR TEMPORADA
# ============================================

# --- SMOTE (imbalanced-learn) ---
try:
    from imblearn.over_sampling import SMOTE
except Exception as e:
    raise ImportError("⚠️ Necesitas 'imbalanced-learn' para usar SMOTE. Instálalo e inténtalo de nuevo.") from e

# --- if df is not defined, try to load it from the project layout ---
try:
    df
except NameError:
    # Define only the path pieces that are missing, then read the feature table
    try:
        ROOT
    except NameError:
        ROOT = Path(".")
    try:
        DATA
    except NameError:
        DATA = ROOT / "data"
    FEAT = DATA / "03_features"
    df = pd.read_parquet(FEAT / "df_final.parquet").reset_index(drop=True)

# ---------- util: asegurar orden [0,1,2] en y_proba ----------
def _ensure_probs_012(y_proba: np.ndarray, classes_model: np.ndarray) -> np.ndarray:
    """Devuelve matriz (N,3) en orden fijo [0,1,2]; si falta alguna clase en el modelo, rellena con NaN."""
    pos = {int(c): i for i, c in enumerate(classes_model)}
    out = np.full((y_proba.shape[0], 3), np.nan, dtype=float)
    for cls in (0, 1, 2):
        if cls in pos:
            out[:, cls] = y_proba[:, pos[cls]]
    return out

# ===== Eval: LogReg CON SMOTE (walk-forward por jornada, salida por temporada) =====
def run_logreg_eval(
    df: pd.DataFrame,
    train_until_season: int = 2023,
    test_until_season: int | None = None,
    with_odds: bool = True,
    random_state: int = 42,
):
    """
    Walk-forward evaluation of a logistic regression trained with SMOTE.

    For every matchday (``Wk``) of each test season the model is refit from
    scratch on all valid rows dated strictly before the first date of that
    matchday. Features are standardised, the scaled training set is
    oversampled with SMOTE, and the matchday is predicted. The predictions of
    all matchdays are concatenated and scored together.

    Parameters
    ----------
    df : DataFrame containing at least 'target', 'Season', 'Wk' and 'Date'.
    train_until_season : seasons strictly greater than this value are test seasons.
    test_until_season : optional inclusive upper bound for the test seasons.
    with_odds : selects which odds-related columns are dropped from the features.
    random_state : seed passed to SMOTE and to LogisticRegression.

    Returns
    -------
    A 7-tuple:
      last_model, last_scaler, (metrics_train, metrics_test),
      y_test (Series indexed by idx_test), y_pred (ndarray),
      proba (ndarray N x 3 in class order [0, 1, 2], NaN-free and row-normalised),
      idx_test (ndarray).
    Seven ``None`` values are returned when there is no usable test data, so
    callers must check the third element before unpacking it.

    Raises
    ------
    ValueError if 'Season' is not among the features or 'Wk' is not in ``df``.
    """

    # --- columns excluded from X ---
    drop_cols_common = [
        'FTR','target','Date','has_xg_data',
        'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
        'HomeTeam_norm','AwayTeam_norm','row_id'
    ]
    drop_cols_mode = (['overround','pimp2','B365D'] if with_odds else
                      ['fase_temporada_inicio','fase_temporada_mitad',
                       'B365H','B365D','B365A','overround','pimp1','pimpx','pimp2'])
    # dict.fromkeys de-duplicates while keeping the original order
    drop_cols = list(dict.fromkeys(drop_cols_common + drop_cols_mode))

    # --- X / y and the mask of fully observed rows ---
    y_all = df['target']
    X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

    valid = y_all.notna()
    if with_odds:
        for c in ['B365H','B365A']:
            if c in df.columns:
                valid &= df[c].notna()
    valid &= X_all.notna().all(axis=1)

    X_all = X_all.loc[valid].copy()
    y_all = y_all.loc[valid].astype(int)

    if 'Season' not in X_all.columns:
        raise ValueError("Falta 'Season' en los datos.")
    if 'Wk' not in df.columns:
        raise ValueError("Falta 'Wk' en df para el walk-forward por jornada.")

    # real match dates drive the temporal cut (unparseable dates become NaT)
    dates_all = pd.to_datetime(df.loc[X_all.index, 'Date'], errors='coerce')

    # --- test seasons ---
    test_mask_season = X_all['Season'] > train_until_season
    if test_until_season is not None:
        test_mask_season &= (X_all['Season'] <= test_until_season)
    seasons_test = sorted(X_all.loc[test_mask_season, 'Season'].dropna().astype(int).unique())
    if not seasons_test:
        print("⚠️ TEST vacío tras filtrar seasons.")
        return None, None, None, None, None, None, None

    # accumulators for the TEST predictions of all season(s)
    all_idx_test, all_y_true, all_y_pred, all_y_proba = [], [], [], []
    train_metrics_per_wk = []
    last_model = None
    last_scaler = None

    for seas in seasons_test:
        idx_season = X_all.index[X_all['Season'] == seas]
        wk_info = (pd.DataFrame({
                        'idx': idx_season,
                        'Wk':  df.loc[idx_season, 'Wk'].values,
                        'Date': dates_all.loc[idx_season].values
                   })
                   .dropna(subset=['Wk','Date']))
        if wk_info.empty:
            continue

        # matchdays ordered by their earliest date (stable sort keeps ties in place)
        wk_order = (wk_info.groupby('Wk')['Date']
                            .min()
                            .sort_values(kind='mergesort')
                            .index.tolist())

        for wk in wk_order:
            wk_rows = wk_info.loc[wk_info['Wk'] == wk]
            idx_wk = wk_rows['idx'].tolist()
            if not idx_wk:
                continue

            cut_date = pd.to_datetime(wk_rows['Date']).min()

            # TRAIN: everything strictly before the first date of the matchday
            train_mask = (dates_all < cut_date)
            X_tr_full = X_all.loc[train_mask].copy()
            y_tr_full = y_all.loc[train_mask].copy()

            # TEST: only this matchday
            X_te_full = X_all.loc[idx_wk].copy()
            y_te_full = y_all.loc[idx_wk].copy()

            # Season is only used for splitting, never as a feature
            X_tr = X_tr_full.drop(columns=['Season']) if 'Season' in X_tr_full.columns else X_tr_full
            X_te = X_te_full.drop(columns=['Season']) if 'Season' in X_te_full.columns else X_te_full

            # a classifier needs data and at least two classes
            if (len(X_tr) == 0) or (len(np.unique(y_tr_full)) < 2):
                continue

            # scaling is fit on the training rows only
            scaler = StandardScaler()
            X_tr_s = scaler.fit_transform(X_tr)
            X_te_s = scaler.transform(X_te)

            # SMOTE with k bounded by the minority class size. On any failure we
            # keep training on the original (non-resampled) data, but report it.
            _, counts = np.unique(y_tr_full, return_counts=True)
            min_count = int(counts.min()) if len(counts) else 0
            X_res, y_res = X_tr_s, y_tr_full
            if min_count > 1:
                k = max(1, min(5, min_count - 1))
                try:
                    sm = SMOTE(random_state=random_state, k_neighbors=k)
                    X_res, y_res = sm.fit_resample(X_tr_s, y_tr_full)
                except Exception as exc:
                    print(f"⚠️ SMOTE falló (Season {seas}, Wk {wk}): {exc} → se entrena sin sobremuestreo.")
                    X_res, y_res = X_tr_s, y_tr_full

            # model
            model = LogisticRegression(
                solver='saga', penalty='l2', max_iter=1000, random_state=random_state
            )
            model.fit(X_res, y_res)

            # TRAIN metrics: model fitted on the resampled data, scored on the
            # original (non-resampled) training rows
            ytr_pred  = model.predict(X_tr_s)
            ytr_proba = model.predict_proba(X_tr_s)
            classes_used = model.classes_
            # Explicit one-hot with one column per model class. label_binarize
            # collapses to a single column when only two classes are present,
            # which would broadcast silently against the (N, 2) probabilities.
            ytr_onehot = (
                np.asarray(y_tr_full)[:, None] == np.asarray(classes_used)[None, :]
            ).astype(float)
            brier_tr = float(np.mean(np.sum((ytr_proba - ytr_onehot)**2, axis=1)))
            acc_tr   = float(accuracy_score(y_tr_full, ytr_pred))
            ll_tr    = float(log_loss(y_tr_full, ytr_proba, labels=classes_used))
            train_metrics_per_wk.append({
                "n_train": int(len(y_tr_full)),
                "accuracy": acc_tr,
                "log_loss": ll_tr,
                "brier": brier_tr
            })

            # TEST prediction for the matchday
            yte_pred  = model.predict(X_te_s)
            yte_proba = model.predict_proba(X_te_s)
            yte_proba_012 = _ensure_probs_012(yte_proba, classes_model=classes_used)

            all_idx_test.extend(idx_wk)
            all_y_true.extend(y_te_full.tolist())
            all_y_pred.extend(yte_pred.tolist())
            all_y_proba.append(yte_proba_012)

            last_model = model
            last_scaler = scaler

    if not all_idx_test:
        print("⚠️ No hubo jornadas válidas en test.")
        return None, None, None, None, None, None, None

    # TEST aggregated over all evaluated matchdays
    y_test_concat  = np.array(all_y_true, dtype=int)
    y_pred_concat  = np.array(all_y_pred, dtype=int)
    y_proba_concat = np.vstack(all_y_proba)  # (N,3), NaN where a class was unknown to the model

    # NaN-free, row-normalised probabilities for log_loss / Brier
    proba_safe = y_proba_concat.copy()
    proba_safe[np.isnan(proba_safe)] = 0.0
    row_sums = proba_safe.sum(axis=1, keepdims=True)
    zero_rows = (row_sums == 0).ravel()
    if zero_rows.any():
        # rows with no information at all fall back to the uniform distribution
        proba_safe[zero_rows, :] = 1.0/3.0
        row_sums[zero_rows, :] = 1.0
    proba_safe = proba_safe / row_sums

    y_bin_full = label_binarize(y_test_concat, classes=[0,1,2])
    brier_te = float(np.mean(np.sum((proba_safe - y_bin_full)**2, axis=1)))
    acc_te   = float(accuracy_score(y_test_concat, y_pred_concat))
    ll_te    = float(log_loss(y_test_concat, proba_safe, labels=[0,1,2]))

    # TRAIN aggregated: average weighted by the training size of each matchday
    if train_metrics_per_wk:
        w = np.array([m["n_train"] for m in train_metrics_per_wk], dtype=float)
        w /= w.sum()
        acc_tr_w = float(np.sum([m["accuracy"] * w[i] for i, m in enumerate(train_metrics_per_wk)]))
        ll_tr_w  = float(np.sum([m["log_loss"] * w[i]  for i, m in enumerate(train_metrics_per_wk)]))
        br_tr_w  = float(np.sum([m["brier"] * w[i]     for i, m in enumerate(train_metrics_per_wk)]))
        # n_train reports the size of the LAST training set, not a total
        n_tr_last = int(train_metrics_per_wk[-1]["n_train"])
    else:
        acc_tr_w = ll_tr_w = br_tr_w = np.nan
        n_tr_last = 0

    metrics_train = {
        "accuracy": acc_tr_w,
        "log_loss": ll_tr_w,
        "brier":    br_tr_w,
        "n_train":  n_tr_last
    }
    seasons_text = f"{train_until_season+1}..{test_until_season}" if test_until_season is not None else f">{train_until_season}"
    metrics_test = {
        "accuracy": acc_te,
        "log_loss": ll_te,
        "brier":    brier_te,
        "n_test":   int(len(y_test_concat)),
        "season_min": int(min(seasons_test)),
        "season_max": int(max(seasons_test)),
    }

    print("Logistic Regression con SMOTE", "(con cuotas)" if with_odds else "(sin cuotas)")
    print("\n=== Train (promedio ponderado por jornada) ==="); print(metrics_train)
    print(f"\n=== Test (Seasons {seasons_text}, walk-forward por jornada) ==="); print(metrics_test)

    return last_model, last_scaler, (metrics_train, metrics_test), \
           pd.Series(y_test_concat, index=all_idx_test), \
           y_pred_concat, proba_safe, np.array(all_idx_test)


# ===== Bucle que guarda eval_grid_smote.json y metrics_by_season_smote.csv =====
try:
    ROOT
except NameError:
    ROOT = Path(".")
OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

seasons_all = sorted(df["Season"].dropna().astype(int).unique())

rows_sm = []
for test_season in seasons_all:
    train_until = test_season - 1
    # the earliest season has no previous season to train on
    if train_until < seasons_all[0]:
        continue
    try:
        eval_out = run_logreg_eval(
            df,
            train_until_season=train_until,
            test_until_season=test_season,
            with_odds=True,
            random_state=42
        )
        # run_logreg_eval returns seven Nones when there is no usable test data;
        # check the metrics pair BEFORE unpacking it, otherwise the unpacking
        # itself raises TypeError and the season is reported as a failure.
        if eval_out[2] is None:
            continue
        model, scaler, (mtr_tr, mtr_te), y_test, yte_pred, yte_proba, idx_test = eval_out
        if mtr_te is None:
            continue

        # matchday range present in this season's test set (informational only)
        wk_min = wk_max = None
        if "Wk" in df.columns and idx_test is not None and len(idx_test):
            wks = pd.to_numeric(df.loc[idx_test, "Wk"], errors="coerce").dropna().astype(int)
            if len(wks):
                wk_min = int(wks.min())
                wk_max = int(wks.max())

        rows_sm.append({
            "train_until": int(train_until),
            "test_season": int(test_season),
            "metrics_train": {
                "accuracy": float(mtr_tr["accuracy"]),
                "log_loss": float(mtr_tr["log_loss"]),
                "brier":    float(mtr_tr["brier"]),
                "n_train":  int(mtr_tr["n_train"]),
            },
            "metrics_test": {
                "accuracy": float(mtr_te["accuracy"]),
                "log_loss": float(mtr_te["log_loss"]),
                "brier":    float(mtr_te["brier"]),
                "n_test":   int(mtr_te["n_test"]),
                "season_min": int(mtr_te["season_min"]),
                "season_max": int(mtr_te["season_max"]),
                "wk_min": wk_min,
                "wk_max": wk_max,
            }
        })
    except Exception as e:
        # best effort: one failing season must not abort the whole grid
        print(f"[SMOTE SKIP] test={test_season} → {e}")

# save outputs; the JSON is always written, the CSV only when there are rows
with open(OUT / "eval_grid_smote.json", "w", encoding="utf-8") as f:
    json.dump(rows_sm, f, ensure_ascii=False, indent=2)
saved_sm = ["outputs/eval_grid_smote.json"]

if rows_sm:
    flat_sm = []
    for r in rows_sm:
        te = r["metrics_test"]
        flat_sm.append({
            "test_season": r["test_season"],
            "train_until": r["train_until"],
            "acc_test":    te["accuracy"],
            "logloss_test":te["log_loss"],
            "brier_test":  te["brier"],
            "n_test":      te["n_test"],
            "wk_min":      te["wk_min"],
            "wk_max":      te["wk_max"],
        })
    pd.DataFrame(flat_sm).sort_values("test_season").to_csv(
        OUT / "metrics_by_season_smote.csv", index=False
    )
    saved_sm.append("outputs/metrics_by_season_smote.csv")

# only announce the files that were actually written
print("Guardados:\n- " + "\n- ".join(saved_sm))

Logistic Regression con SMOTE (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.6052631578947368, 'log_loss': 0.8793798310218244, 'brier': 0.523692392112868, 'n_train': 380}

=== Test (Seasons 2007..2007, walk-forward por jornada) ===
{'accuracy': 0.3973684210526316, 'log_loss': 1.3494414120417446, 'brier': 0.7661194141297701, 'n_test': 380, 'season_min': 2007, 'season_max': 2007}
Logistic Regression con SMOTE (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5210526315789473, 'log_loss': 0.9647718217284392, 'brier': 0.582134391362193, 'n_train': 760}

=== Test (Seasons 2008..2008, walk-forward por jornada) ===
{'accuracy': 0.3973684210526316, 'log_loss': 1.1495529048749027, 'brier': 0.6940270060264039, 'n_test': 380, 'season_min': 2008, 'season_max': 2008}
Logistic Regression con SMOTE (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5140350877192983, 'log_loss': 0.9766692149791395, 'brier': 0.5859414553822

In [11]:
# LOCAL: one-off SMOTE evaluation, called with train_until_season=2024,
# test_until_season=2025 and odds features enabled. The 7-element result is
# kept in the *_sm notebook-level names.
model_sm, scaler_sm, (mtr_tr_sm, mtr_te_sm), y_test_sm, y_pred_sm, y_proba_sm, idx_test_sm = \
    run_logreg_eval(df, train_until_season=2024, test_until_season=2025, with_odds=True)

Logistic Regression con SMOTE (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5013850415512465, 'log_loss': 0.9841414945443792, 'brier': 0.5860157980797667, 'n_train': 7220}

=== Test (Seasons 2025..2025, walk-forward por jornada) ===
{'accuracy': 0.43333333333333335, 'log_loss': 1.0173246489644299, 'brier': 0.6160581127766295, 'n_test': 60, 'season_min': 2025, 'season_max': 2025}


Con este modelo obtengo el mejor **Accuracy** (porcentaje de aciertos totales), pero esta métrica ignora cómo de seguras son esas predicciones.

$$
\text{Accuracy} = \frac{\text{Número de aciertos}}{\text{Número total de predicciones}}
$$

Para ello se utiliza el **Log Loss** (Cross-Entropy Loss), métrica que mide qué tan buenas son las probabilidades que predice mi modelo de clasificación. A esta métrica no solo le importa acertar la clase, sino cuán seguro está el modelo.

$$
\text{LogLoss} = -\frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{K} y_{ij} \cdot \log(p_{ij})
$$

donde:

- $y_{ij}$ = 1 si la clase real del ejemplo $i$ es la clase $j$, y 0 en caso contrario.
- $p_{ij}$ es la probabilidad predicha por el modelo de que el ejemplo $i$ pertenezca a la clase $j$.

Tener un Log Loss alto en este caso significaría dar una probabilidad alta a la clase incorrecta, o lo que es lo mismo, dar una probabilidad baja a la clase correcta.

Por último añadí también el **Brier Score**, que es una métrica que evalúa cuán cercanas están las probabilidades predichas por tu modelo respecto a la realidad, comparando la distribución de probabilidades contra la clase real (codificada en one-hot). Es como un error cuadrático medio (MSE) para probabilidades.

$$
\text{Brier Score} = \frac{1}{N} \sum_{i=1}^{N} \sum_{j=1}^{K} (p_{ij} - y_{ij})^2
$$

donde:

- $N$ es el número de ejemplos.
- $K$ es el número de clases (en este caso 3: victoria local, empate, victoria visitante).
- $p_{ij}$ es la probabilidad predicha por el modelo de que el ejemplo $i$ pertenezca a la clase $j$.
- $y_{ij}$ es 1 si la clase real del ejemplo $i$ es la clase $j$, y 0 en caso contrario.

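Un Brier Score de 0 significa que las probabilidades dadas por el modelo son perfectas, mientras que uno de 0.667 (≈ 2/3) correspondería, en nuestro caso de 3 clases, a un modelo no informativo que asigna siempre 1/3 a cada resultado: $(1-\tfrac{1}{3})^2 + 2\cdot(\tfrac{1}{3})^2 = \tfrac{2}{3}$.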


## Selección de variables

La función `forward_selection` implementa un algoritmo clásico de selección de variables hacia adelante (**forward feature selection**) sobre un modelo de regresión logística multiclase con escalado de variables.

Va añadiendo sucesivamente, una a una, la variable que más mejora el rendimiento del modelo (según accuracy o log_loss).





In [12]:
# from sklearn.linear_model import LogisticRegression
# from sklearn.preprocessing import StandardScaler
# from sklearn.pipeline import make_pipeline
# from sklearn.metrics import accuracy_score, log_loss
# import numpy as np

# def forward_selection(X, y, max_features=20, scoring='accuracy'):
#     selected_features = []
#     remaining_features = list(X.columns)
#     scores = []

#     for i in range(min(max_features, len(remaining_features))):
#         best_score = -np.inf if scoring == 'accuracy' else np.inf
#         best_feature = None

#         for feature in remaining_features:
#             current_features = selected_features + [feature]

#             model = make_pipeline(
#                 StandardScaler(),
#                 LogisticRegression(max_iter=1000, solver='lbfgs')
#             )

#             model.fit(X[current_features], y)
#             y_pred = model.predict(X[current_features])
#             y_proba = model.predict_proba(X[current_features])

#             if scoring == 'accuracy':
#                 score = accuracy_score(y, y_pred)
#                 if score > best_score:
#                     best_score = score
#                     best_feature = feature
#             elif scoring == 'log_loss':
#                 score = log_loss(y, y_proba)
#                 if score < best_score:
#                     best_score = score
#                     best_feature = feature
#             else:
#                 raise ValueError("scoring debe ser 'accuracy' o 'log_loss'.")

#         if best_feature is not None:
#             selected_features.append(best_feature)
#             remaining_features.remove(best_feature)
#             scores.append(best_score)

#         print(f"[{i+1}] Añadida: {best_feature} | Score: {best_score:.4f}")

#     return selected_features, scores

In [13]:
# selected, scores = forward_selection(X_train, y_train, max_features=81, scoring='accuracy')

In [14]:
# import matplotlib.pyplot as plt
# import numpy as np

# # Suponemos que tienes las listas: selected (variables) y scores (métricas acumuladas)

# # Calcular diferencia respecto al valor anterior
# deltas = np.diff([0] + scores)
# colors = ['blue' if delta >= 0 else 'red' for delta in deltas]

# plt.figure(figsize=(12,6))
# bar_width = 0.6  # Reducir ancho de barra para separarlas
# indices = np.arange(len(selected))

# plt.bar(indices, scores, color=colors, width=bar_width)
# plt.xticks(indices, selected, rotation=90)
# plt.xlabel('Variables añadidas')
# plt.ylabel('Valor de la métrica')
# plt.title('Evolución del rendimiento al añadir variables')

# plt.ylim(min(scores) - 0.01, max(scores) + 0.01)
# plt.tight_layout()
# plt.show()


Se implementó un proceso de selección hacia adelante (forward selection) sobre el modelo de regresión logística con variables estandarizadas. Este procedimiento consiste en partir sin predictores y añadir, en cada iteración, la variable que mayor mejora produce en el rendimiento del modelo. Se evaluaron dos métricas complementarias como criterio de selección: el accuracy (para priorizar aciertos de clasificación) y el log loss (para priorizar la calibración de las probabilidades). Esta técnica permitió reducir la dimensionalidad del conjunto original y determinar el orden de relevancia de las variables desde el punto de vista predictivo.

# **Resultados**

## **MATRIZ DE CONFUSIÓN**

In [15]:
# ============================================
# Confusion matrices por temporada (walk-forward jornada a jornada)
# - BASE (sin SMOTE) y SMOTE
# - Salida: JSON por modelo en outputs/
# ============================================

# --- df y rutas (por si no están en el entorno) ---
# Load `df` from the project layout only when no earlier cell defined it.
if "df" not in globals():
    if "ROOT" not in globals():
        ROOT = Path(".")
    if "DATA" not in globals():
        DATA = ROOT / "data"
    FEAT = DATA / "03_features"
    df = pd.read_parquet(FEAT / "df_final.parquet").reset_index(drop=True)

# Output directory (created on demand).
if "ROOT" not in globals():
    ROOT = Path(".")
OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

# --- helper: construir y guardar grid de confusiones por temporada ---
def build_confusion_grid(df: pd.DataFrame, out_dir: Path, model_type: str = "base", random_state: int = 42):
    """
    Build and save one confusion matrix per season using the walk-forward evaluators.

    Parameters
    ----------
    df : DataFrame with a 'Season' column (and optionally 'Wk').
    out_dir : directory where ``confusion_grid_<model_type>.json`` is written.
    model_type : "base" uses ``run_logreg_eval_no_smote``; any other value uses
        ``run_logreg_eval`` (SMOTE). Matching is case-insensitive.
    random_state : seed forwarded to the evaluator.

    For every season S except the earliest one, the evaluator is called with
    train_until_season=S-1, test_until_season=S and with_odds=True. Seasons for
    which the evaluator reports no usable test data are omitted; seasons that
    raise are reported on stdout and omitted.

    Raises
    ------
    RuntimeError if the evaluator required by "base"/"smote" is not defined.
    """
    kind = model_type.lower()

    # dependency check (the evaluation functions must already exist)
    if kind == "base" and "run_logreg_eval_no_smote" not in globals():
        raise RuntimeError("Falta 'run_logreg_eval_no_smote' en el entorno (baseline walk-forward).")
    if kind == "smote" and "run_logreg_eval" not in globals():
        raise RuntimeError("Falta 'run_logreg_eval' en el entorno (SMOTE walk-forward).")

    seasons_all = sorted(df["Season"].dropna().astype(int).unique())
    rows = []

    for test_season in seasons_all:
        train_until = test_season - 1
        # the earliest season has no previous season to train on
        if train_until < seasons_all[0]:
            continue

        try:
            evaluate = run_logreg_eval_no_smote if kind == "base" else run_logreg_eval
            result = evaluate(
                df,
                train_until_season=train_until,
                test_until_season=test_season,
                with_odds=True,
                random_state=random_state
            )

            # The evaluators return seven Nones when there is no usable test
            # data, so the metrics pair is inspected BEFORE being unpacked
            # (unpacking None would raise TypeError and look like a failure).
            metrics_pair = result[2]
            if metrics_pair is None:
                continue
            mtr_te = metrics_pair[1]
            y_test, y_pred, idx_test = result[3], result[4], result[6]

            # no valid test -> skip the season
            if (mtr_te is None) or (y_test is None) or (y_pred is None) or (len(y_test) == 0):
                continue

            y_true = np.asarray(y_test, dtype=int)
            y_hat  = np.asarray(y_pred, dtype=int)

            # matrix in fixed order [0=Away, 1=Draw, 2=Home]
            cm = confusion_matrix(y_true, y_hat, labels=[0, 1, 2]).tolist()

            # matchday range covered by this test set (informational)
            wk_min = wk_max = None
            if "Wk" in df.columns and idx_test is not None and len(idx_test):
                wks = pd.to_numeric(df.loc[idx_test, "Wk"], errors="coerce").dropna().astype(int)
                if len(wks):
                    wk_min = int(wks.min())
                    wk_max = int(wks.max())

            rows.append({
                "model": model_type,
                "train_until": int(train_until),
                "test_season": int(test_season),
                "labels": ["A","D","H"],              # mapping 0,1,2 -> A,D,H
                "matrix": cm,                         # 3x3
                "n_test": int(mtr_te["n_test"]),
                "wk_min": wk_min,
                "wk_max": wk_max,
            })

        except Exception as e:
            # best effort: one failing season must not abort the whole grid
            print(f"[CONF {model_type.upper()} SKIP] test={test_season} → {e}")

    out_path = out_dir / f"confusion_grid_{model_type}.json"
    out_path.write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Guardado: {out_path}  ({len(rows)} temporadas)")

# --- (opcional) plot de una temporada concreta para inspección rápida ---
def plot_confusion_for_season(df: pd.DataFrame, test_season: int, model_type: str = "base", random_state: int = 42):
    """
    Plot the aggregated confusion matrix of one season (walk-forward evaluation).

    Parameters
    ----------
    df : DataFrame passed through to the evaluator.
    test_season : season to evaluate; the evaluator is called with
        train_until_season=test_season-1 and test_until_season=test_season.
    model_type : "base" uses ``run_logreg_eval_no_smote``; any other value uses
        ``run_logreg_eval`` (SMOTE). Matching is case-insensitive.
    random_state : seed forwarded to the evaluator.

    Nothing is saved and nothing is returned; when the evaluator reports no
    usable test data a message is printed instead of plotting.
    """
    if model_type.lower() == "base":
        evaluate, title_model = run_logreg_eval_no_smote, "Baseline (sin SMOTE)"
    else:
        evaluate, title_model = run_logreg_eval, "SMOTE"

    result = evaluate(
        df, train_until_season=test_season-1, test_until_season=test_season,
        with_odds=True, random_state=random_state
    )

    # The evaluators return seven Nones when there is no usable test data.
    # Unpacking `(_, mtr_te)` directly from that None would raise TypeError,
    # so the metrics pair is inspected before it is taken apart.
    metrics_pair, y_test, y_pred, idx_test = result[2], result[3], result[4], result[6]
    mtr_te = metrics_pair[1] if metrics_pair is not None else None

    if (mtr_te is None) or (y_test is None) or (y_pred is None) or (len(y_test) == 0):
        print("Sin test disponible para esa temporada.")
        return

    y_true = np.asarray(y_test, dtype=int)
    y_hat  = np.asarray(y_pred, dtype=int)
    disp = ConfusionMatrixDisplay.from_predictions(
        y_true, y_hat, labels=[0,1,2], display_labels=["Away","Draw","Home"],
        cmap="Blues", colorbar=False
    )

    # add the matchday range to the title when it is available
    wk_txt = ""
    if "Wk" in df.columns and idx_test is not None and len(idx_test):
        wks = pd.to_numeric(df.loc[idx_test, "Wk"], errors="coerce").dropna().astype(int)
        if len(wks):
            wk_txt = f" | Jornadas {int(wks.min())}–{int(wks.max())}"

    # Address the display's own axes/figure instead of pyplot's implicit
    # "current axes", so the title always lands on this plot.
    disp.ax_.set_title(f"Season {test_season} · {title_model}{wk_txt}")
    disp.figure_.tight_layout()
    plt.show()

# --- RUN: build the confusion grid for both model types; each call writes its
# --- own confusion_grid_<model_type>.json into OUT ---
build_confusion_grid(df, OUT, model_type="base")
build_confusion_grid(df, OUT, model_type="smote")

Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.6, 'log_loss': 0.8503040399360499, 'brier': 0.5104307269727479, 'n_train': 380}

=== Test (Seasons 2007..2007, walk-forward por jornada) ===
{'accuracy': 0.4263157894736842, 'log_loss': 1.2349125387397448, 'brier': 0.7084656252008951, 'n_test': 380, 'season_min': 2007, 'season_max': 2007}
Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5565789473684211, 'log_loss': 0.9214888832439952, 'brier': 0.552017798278286, 'n_train': 760}

=== Test (Seasons 2008..2008, walk-forward por jornada) ===
{'accuracy': 0.48157894736842105, 'log_loss': 1.0946781196370998, 'brier': 0.6514989943768685, 'n_test': 380, 'season_min': 2008, 'season_max': 2008}
Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5482456140350878, 'log_loss': 0.9341333936534477, 'brier': 0.5582402681906519, 'n

In [16]:
# (opcional) ejemplo de plot:
# plot_confusion_for_season(df, test_season=2025, model_type="base")

## **MÉTRICAS DE CLASIFICACIÓN**

In [17]:
# ============================================
# Métricas de clasificación por temporada (walk-forward por jornada)
# - BASE (sin SMOTE) y SMOTE
# - Salida: JSON + CSV por modelo en outputs/
# ============================================

# -------------------------
# Carga df y rutas (fallback)
# -------------------------
# Fallback: read `df` from disk only when it is not already in the notebook namespace.
if "df" not in globals():
    if "ROOT" not in globals():
        ROOT = Path(".")
    if "DATA" not in globals():
        DATA = ROOT / "data"
    FEAT = DATA / "03_features"
    df = pd.read_parquet(FEAT / "df_final.parquet").reset_index(drop=True)

# Make sure the output directory exists.
if "ROOT" not in globals():
    ROOT = Path(".")
OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

# -------------------------
# Helpers para uso local (reportar un rango)
# -------------------------
def _prep_test_split(df: pd.DataFrame, train_until_season: int, with_odds: bool, test_until_season: int | None = None):
    """Split TEST (devuelve X_test, y_test, idx_test) excluyendo nombres/IDs de las features."""
    drop_common = [
        'FTR','target','Date','has_xg_data',
        'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
        'HomeTeam_norm','AwayTeam_norm','row_id'
    ]
    drop_mode = (['overround','pimp2','B365D'] if with_odds else
                 ['fase_temporada_inicio','fase_temporada_mitad',
                  'B365H','B365D','B365A','overround','pimp1','pimpx','pimp2'])
    drop_cols = list(dict.fromkeys(drop_common + drop_mode))

    y_all = df['target']
    X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

    valid = y_all.notna()
    if with_odds:
        for c in ['B365H','B365A']:
            if c in X_all.columns:
                valid &= X_all[c].notna()
    valid &= X_all.notna().all(axis=1)

    X_all = X_all.loc[valid].copy()
    y_all = y_all.loc[valid].astype(int)

    if 'Season' not in X_all.columns:
        raise ValueError("Falta 'Season' para hacer el split temporal.")

    test_mask  = X_all['Season'] > train_until_season
    if test_until_season is not None:
        test_mask &= (X_all['Season'] <= test_until_season)

    idx_test = X_all.loc[test_mask].index
    X_test = X_all.loc[test_mask].drop(columns=['Season'])
    y_test = y_all.loc[test_mask]
    return X_test, y_test, idx_test

def _align_to_fit_columns(X: pd.DataFrame, fitter, feature_names: list[str] | None = None) -> pd.DataFrame:
    """Alinea X a las columnas usadas en el fit; elimina extras y lanza si faltan."""
    cols_fit = feature_names if feature_names is not None else getattr(fitter, "feature_names_in_", None)
    if cols_fit is None:
        return X
    cols_fit = list(cols_fit)
    missing = [c for c in cols_fit if c not in X.columns]
    extra   = [c for c in X.columns   if c not in cols_fit]
    if extra:
        X = X.drop(columns=extra)
    if missing:
        raise ValueError(
            "X_test no contiene columnas usadas al entrenar:\n"
            f"- Faltan: {missing}\n"
            "Usa el MISMO esquema (with_odds/drop_cols) o pasa 'feature_names' del entrenamiento."
        )
    return X[cols_fit]

def print_classification_report_for_logreg(
    df: pd.DataFrame, mdl, scaler,
    train_until_season: int = 2023,
    test_until_season: int | None = None,
    with_odds: bool = True,
    digits: int = 3,
    feature_names: list[str] | None = None
):
    """Print a classification report for a season range using an already fitted model.

    The test rows come from ``_prep_test_split``; their columns are aligned to
    the ones seen at fit time, transformed with ``scaler`` and predicted with
    ``mdl``. Only the classes among 0/1/2 (Away/Draw/Home) that the model
    knows are reported. Nothing is returned.
    """
    X_test, y_test, idx_test = _prep_test_split(
        df, train_until_season=train_until_season,
        with_odds=with_odds, test_until_season=test_until_season
    )

    # season-range label shared by both messages below
    if test_until_season is not None:
        season_range = f"{train_until_season+1}..{test_until_season}"
    else:
        season_range = f">{train_until_season}"

    if len(X_test) == 0:
        print(f"⚠️ No hay TEST disponible tras filtrar (Seasons {season_range}).")
        return

    aligned = _align_to_fit_columns(X_test, scaler, feature_names=feature_names)
    y_pred = mdl.predict(scaler.transform(aligned))

    label_text = {0:'Away', 1:'Draw', 2:'Home'}
    model_classes = getattr(mdl, "classes_", np.array([0,1,2]))
    present = [c for c in (0, 1, 2) if c in model_classes]
    names = [label_text[c] for c in present]

    # matchday range covered by the test rows, when 'Wk' is available
    matchdays = ""
    if "Wk" in df.columns and len(idx_test):
        wk_values = pd.to_numeric(df.loc[idx_test, "Wk"], errors="coerce").dropna().astype(int)
        if len(wk_values):
            matchdays = f" | Jornadas {int(wk_values.min())}–{int(wk_values.max())}"

    print(f"[Classification report] Seasons {season_range}{matchdays}\n")
    report = classification_report(
        y_test, y_pred,
        labels=present,
        target_names=names,
        zero_division=0,
        digits=digits
    )
    print(report)

# -------------------------
# Grid de métricas por temporada (BASE / SMOTE)
# -------------------------
def build_classification_grid(
    df: pd.DataFrame,
    out_dir: Path,
    model_type: str = "base",   # "base" (no SMOTE) | "smote"
    with_odds: bool = True,
    random_state: int = 42
):
    """
    Export per-season classification metrics (train <= S-1, test = S).

    For every season S in ``df["Season"]`` except the earliest one, the
    selected evaluator (``run_logreg_eval_no_smote`` for "base",
    ``run_logreg_eval`` for "smote") is called with ``train_until_season=S-1``
    and ``test_until_season=S``; the returned test labels and predictions are
    summarised with ``classification_report``. A season whose evaluation raises
    is reported on stdout and skipped (best-effort).

    Parameters
    ----------
    df : DataFrame with at least a ``Season`` column (``Wk`` is optional and
        only used to record the matchday range of the test rows).
    out_dir : directory where the files are written (created if missing).
    model_type : "base" or "smote", case-insensitive. The value is used
        verbatim in the output file names and in the ``model`` field.
    with_odds, random_state : forwarded unchanged to the evaluator.

    Raises
    ------
    ValueError
        If ``model_type`` is not "base"/"smote". Previously any other value
        silently ran the SMOTE evaluator.
    RuntimeError
        If the evaluator needed for ``model_type`` is not defined in globals.

    Output files
    ------------
      - classification_grid_<model_type>.json       (nested, one entry per season)
      - classification_by_season_<model_type>.csv   (flat table; only if any season succeeded)
    """
    # Validate once so a typo cannot silently select the wrong evaluator.
    kind = str(model_type).lower()
    if kind not in ("base", "smote"):
        raise ValueError(f"model_type debe ser 'base' o 'smote'; recibido: {model_type!r}")

    # Dependency check: the evaluators live in other notebook cells.
    if kind == "base" and "run_logreg_eval_no_smote" not in globals():
        raise RuntimeError("Falta 'run_logreg_eval_no_smote' (baseline walk-forward) en el entorno.")
    if kind == "smote" and "run_logreg_eval" not in globals():
        raise RuntimeError("Falta 'run_logreg_eval' (SMOTE walk-forward) en el entorno.")
    evaluate = run_logreg_eval_no_smote if kind == "base" else run_logreg_eval

    label_name = {0:"A", 1:"D", 2:"H"}  # target encoding: 0=Away, 1=Draw, 2=Home
    seasons_all = sorted(df["Season"].dropna().astype(int).unique())

    rows, flat = [], []

    for test_season in seasons_all:
        train_until = test_season - 1
        # The earliest season has nothing before it to train on.
        if train_until < seasons_all[0]:
            continue

        try:
            mdl, _, (_, mtr_te), y_test, y_pred, _, idx_test = evaluate(
                df,
                train_until_season=train_until,
                test_until_season=test_season,
                with_odds=with_odds,
                random_state=random_state
            )

            if (mtr_te is None) or (y_test is None) or (y_pred is None) or (len(y_test) == 0):
                continue

            # Stable 0/1/2 class order, restricted to what the fitted model knows.
            classes_used = list(getattr(mdl, "classes_", np.array([0,1,2])))
            classes_used = [c for c in [0,1,2] if c in classes_used]
            target_names = [label_name[c] for c in classes_used]

            rep = classification_report(
                y_test, y_pred,
                labels=classes_used,
                target_names=target_names,
                output_dict=True,
                zero_division=0
            )

            # Per-class metrics (only for classes present in the report).
            per_class = {}
            for c in classes_used:
                nm = label_name[c]
                if nm in rep:
                    per_class[nm] = {
                        "precision": float(rep[nm]["precision"]),
                        "recall":    float(rep[nm]["recall"]),
                        "f1":        float(rep[nm]["f1-score"]),
                        "support":   int(rep[nm]["support"]),
                    }

            # Matchday range covered by the test rows.
            wk_min = wk_max = None
            if "Wk" in df.columns and idx_test is not None and len(idx_test):
                wks = pd.to_numeric(df.loc[idx_test, "Wk"], errors="coerce").dropna().astype(int)
                if len(wks):
                    wk_min = int(wks.min())
                    wk_max = int(wks.max())

            overall = {
                # Fall back to the evaluator's own accuracy when the report
                # dict carries no "accuracy" key.
                "accuracy":     float(rep.get("accuracy", mtr_te.get("accuracy", float("nan")))),
                "macro_avg": {
                    "precision": float(rep["macro avg"]["precision"]),
                    "recall":    float(rep["macro avg"]["recall"]),
                    "f1":        float(rep["macro avg"]["f1-score"]),
                    "support":   int(rep["macro avg"]["support"]),
                },
                "weighted_avg": {
                    "precision": float(rep["weighted avg"]["precision"]),
                    "recall":    float(rep["weighted avg"]["recall"]),
                    "f1":        float(rep["weighted avg"]["f1-score"]),
                    "support":   int(rep["weighted avg"]["support"]),
                },
                "n_test": int(mtr_te["n_test"]),
                "wk_min": wk_min,
                "wk_max": wk_max,
            }

            rows.append({
                "model": model_type,
                "train_until": int(train_until),
                "test_season": int(test_season),
                "per_class": per_class,
                "overall": overall,
            })

            # Flat row for the CSV table.
            row_flat = {
                "test_season": int(test_season),
                "train_until": int(train_until),
                "accuracy": overall["accuracy"],
                "macro_f1": overall["macro_avg"]["f1"],
                "n_test": overall["n_test"],
                "wk_min": overall["wk_min"],
                "wk_max": overall["wk_max"],
            }
            for nm in ["A","D","H"]:
                if nm in per_class:
                    row_flat[f"precision_{nm}"] = per_class[nm]["precision"]
                    row_flat[f"recall_{nm}"]    = per_class[nm]["recall"]
                    row_flat[f"f1_{nm}"]        = per_class[nm]["f1"]
                    row_flat[f"support_{nm}"]   = per_class[nm]["support"]
            flat.append(row_flat)

        except Exception as e:
            # Deliberate best-effort: one failing season must not abort the grid.
            print(f"[CLASS {model_type.upper()} SKIP] test={test_season} → {e}")

    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / f"classification_grid_{model_type}.json").write_text(
        json.dumps(rows, ensure_ascii=False, indent=2),
        encoding="utf-8"
    )
    print(f"Guardado: {out_dir / f'classification_grid_{model_type}.json'}  ({len(rows)} temporadas)")

    if flat:
        pd.DataFrame(flat).sort_values("test_season").to_csv(
            out_dir / f"classification_by_season_{model_type}.csv", index=False
        )
        print(f"Guardado: {out_dir / f'classification_by_season_{model_type}.csv'}")

# -------------------------
# EXECUTION (writes the output files for both models)
# -------------------------
# Relies on `df` and `OUT` already existing in the notebook namespace.
build_classification_grid(df, OUT, model_type="base",  with_odds=True)
build_classification_grid(df, OUT, model_type="smote", with_odds=True)

Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.6, 'log_loss': 0.8503040399360499, 'brier': 0.5104307269727479, 'n_train': 380}

=== Test (Seasons 2007..2007, walk-forward por jornada) ===
{'accuracy': 0.4263157894736842, 'log_loss': 1.2349125387397448, 'brier': 0.7084656252008951, 'n_test': 380, 'season_min': 2007, 'season_max': 2007}
Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5565789473684211, 'log_loss': 0.9214888832439952, 'brier': 0.552017798278286, 'n_train': 760}

=== Test (Seasons 2008..2008, walk-forward por jornada) ===
{'accuracy': 0.48157894736842105, 'log_loss': 1.0946781196370998, 'brier': 0.6514989943768685, 'n_test': 380, 'season_min': 2008, 'season_max': 2008}
Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5482456140350878, 'log_loss': 0.9341333936534477, 'brier': 0.5582402681906519, 'n

In [18]:
# # (Opcional) Ejecución local para inspección rápida de un rango concreto:
# mdl_base, scaler_base, *_ = run_logreg_eval_no_smote(df, train_until_season=2024, test_until_season=2025, with_odds=True)
# print_classification_report_for_logreg(df, mdl_base, scaler_base, train_until_season=2024, test_until_season=2025, with_odds=True)

## **AUC Y CURVA ROC**

In [19]:
# ============================================
# ROC & AUC per season (walk-forward, matchday by matchday)
# - BASE (no SMOTE) and SMOTE
# - Output: JSON + CSV per model in outputs/
# ============================================

# -------------------------
# Load df and paths (fallback)
# -------------------------
# A bare name inside `try` raises NameError when the variable is not defined
# yet, so each probe only fills in what earlier cells did not provide.
try:
    df
except NameError:
    try:
        ROOT
    except NameError:
        ROOT = Path(".")
    try:
        DATA
    except NameError:
        DATA = ROOT / "data"
    # FEAT is (re)assigned only on this fallback path, i.e. when df was missing.
    FEAT = DATA / "03_features"
    df = pd.read_parquet(FEAT / "df_final.parquet").reset_index(drop=True)

# ROOT may still be undefined when df already existed, so probe it again.
try:
    ROOT
except NameError:
    ROOT = Path(".")
# OUT is reassigned unconditionally from ROOT and created if missing.
OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

# ---------- Split de TEST con tope de temporada (devuelve idx_test para jornadas) ----------
def _prep_test_split(
    df: pd.DataFrame,
    train_until_season: int,
    with_odds: bool,
    test_until_season: int | None = None
):
    # Excluir nombres de equipo/ids para que NO entren como features
    drop_common = [
        'FTR','target','Date','has_xg_data',
        'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
        'HomeTeam_norm','AwayTeam_norm','row_id'
    ]
    drop_mode = (['overround','pimp2','B365D'] if with_odds else
                 ['fase_temporada_inicio','fase_temporada_mitad',
                  'B365H','B365D','B365A','overround','pimp1','pimpx','pimp2'])
    drop_cols = list(dict.fromkeys(drop_common + drop_mode))

    y_all = df['target']
    X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

    valid = y_all.notna()
    if with_odds:
        for c in ['B365H','B365A']:
            if c in X_all.columns:
                valid &= X_all[c].notna()
    valid &= X_all.notna().all(axis=1)

    X_all = X_all.loc[valid].copy()
    y_all = y_all.loc[valid].astype(int)

    if 'Season' not in X_all.columns:
        raise ValueError("Falta 'Season' para el split temporal.")

    test_mask  = X_all['Season'] > train_until_season
    if test_until_season is not None:
        test_mask &= (X_all['Season'] <= test_until_season)

    idx_test = X_all.loc[test_mask].index  # <- para jornadas
    X_test = X_all.loc[test_mask].drop(columns=['Season'])
    y_test = y_all.loc[test_mask]
    return X_test, y_test, idx_test

# ---------- Alinear columnas de X a las usadas en el fit ----------
def _align_to_fit_columns(X: pd.DataFrame, fitter, feature_names: list[str] | None = None) -> pd.DataFrame:
    cols_fit = feature_names if feature_names is not None else getattr(fitter, "feature_names_in_", None)
    if cols_fit is None:
        return X  # entrenaste con arrays; asumimos que X ya coincide
    cols_fit = list(cols_fit)
    missing = [c for c in cols_fit if c not in X.columns]
    extra   = [c for c in X.columns   if c not in cols_fit]
    if extra:
        X = X.drop(columns=extra)
    if missing:
        raise ValueError(
            "X_test no contiene columnas usadas al entrenar:\n"
            f"- Faltan: {missing}\n"
            "Usa el mismo esquema (with_odds/drop_cols) que en el fit, "
            "o pasa 'feature_names' con la lista exacta de columnas del entrenamiento."
        )
    return X[cols_fit]

# ---------- Curvas ROC multiclase (muestra rango de jornadas del TEST) ----------
def plot_multiclass_roc(
    df: pd.DataFrame,
    model,
    scaler,
    train_until_season: int = 2023,
    test_until_season: int | None = None,
    with_odds: bool = True,
    feature_names: list[str] | None = None
):
    """Plot one-vs-rest ROC curves for an already-fitted model on the TEST range.

    The test rows come from ``_prep_test_split`` (seasons after
    ``train_until_season``, optionally capped by ``test_until_season``). The
    columns of ``model.predict_proba`` are interpreted in ``model.classes_``
    order. A class with no positives or no negatives in TEST is skipped with a
    note. After the figure, macro and support-weighted AUC are printed.
    Returns None.
    """
    # 1) TEST rows
    X_test, y_test, idx_test = _prep_test_split(
        df, train_until_season=train_until_season,
        with_odds=with_odds, test_until_season=test_until_season
    )
    rango = (f"{train_until_season+1}..{test_until_season}"
             if test_until_season is not None else f">{train_until_season}")
    if len(X_test) == 0:
        print(f"⚠️ No hay TEST disponible tras filtrar (Seasons {rango}).")
        return

    # 2) Align the columns with those used at fit time
    X_test = _align_to_fit_columns(X_test, scaler, feature_names=feature_names)

    # 3) Probabilities
    y_proba = model.predict_proba(scaler.transform(X_test))

    # 4) Class order as reported by the model (this is the column order of y_proba)
    classes_used = list(getattr(model, "classes_", [0,1,2]))
    class2label = {0:'Away', 1:'Draw', 2:'Home'}
    labels_text = [class2label.get(c, str(c)) for c in classes_used]
    # One-vs-rest indicators are built per class below instead of using
    # label_binarize, which returns a single column for two classes and would
    # make the per-class column lookup fail.
    y_true = np.asarray(y_test)

    # Matchday range for the title, when 'Wk' is available
    wk_txt = ""
    if "Wk" in df.columns and len(idx_test):
        wks = pd.to_numeric(df.loc[idx_test, "Wk"], errors="coerce").dropna().astype(int)
        if len(wks):
            wk_txt = f" | Jornadas {int(wks.min())}–{int(wks.max())}"

    # 5) One curve per class
    fig, ax = plt.subplots()
    auc_per_class, weights = [], []
    n = len(y_true)

    for k, cls in enumerate(classes_used):
        y_true_k = (y_true == cls).astype(int)
        y_score_k = y_proba[:, k]
        pos = int(y_true_k.sum())
        neg = n - pos
        if pos > 0 and neg > 0:
            fpr, tpr, _ = roc_curve(y_true_k, y_score_k)
            auc_k = roc_auc_score(y_true_k, y_score_k)
            auc_per_class.append(auc_k)
            weights.append(pos)  # weight = number of positives of the class
            ax.plot(fpr, tpr, label=f"{labels_text[k]} (AUC = {auc_k:.2f})")
        else:
            print(f"Nota: '{labels_text[k]}' no tiene suficientes positivos/negativos en TEST; omito su curva.")

    ax.plot([0, 1], [0, 1], 'k--', label='Aleatorio')
    ax.set_xlabel("False Positive Rate")
    ax.set_ylabel("True Positive Rate")
    ax.set_title(f"Curvas ROC por clase (Seasons {rango}){wk_txt}")
    ax.legend()
    ax.grid(True)
    fig.tight_layout()
    plt.show()

    # 6) Macro and weighted AUC over the classes that could be evaluated
    if auc_per_class:
        auc_macro = float(np.mean(auc_per_class))
        auc_weighted = float(np.average(auc_per_class, weights=weights)) if sum(weights) > 0 else auc_macro
        print(f"\nAUC macro: {auc_macro:.3f}")
        print(f"AUC weighted: {auc_weighted:.3f}")
    else:
        print("\nNo se pudieron calcular AUCs (todas las clases carecen de positivos/negativos suficientes en TEST).")

# === Util para reducir puntos en curvas guardadas ===
def _downsample_curve(x: np.ndarray, y: np.ndarray, max_points: int = 200):
    if len(x) <= max_points:
        return x.tolist(), y.tolist()
    idx = np.linspace(0, len(x) - 1, max_points).round().astype(int)
    return x[idx].tolist(), y[idx].tolist()

# === ROC por temporada (train ≤ S-1, test = S) → outputs/roc_grid_<modelo>.json ===
def build_roc_grid(
    df: pd.DataFrame,
    out_dir: Path,
    model: str = "base",        # "base" (no SMOTE) | "smote"
    with_odds: bool = True,
    random_state: int = 42,
    max_points: int = 200       # max number of points stored per curve
):
    """
    Export per-season one-vs-rest ROC curves and AUCs (train <= S-1, test = S).

    For every season S in ``df["Season"]`` except the earliest one, the
    selected evaluator (``run_logreg_eval_no_smote`` for "base",
    ``run_logreg_eval`` for "smote") is called with ``train_until_season=S-1``
    and ``test_until_season=S``. The columns of the returned probabilities are
    read in ``mdl.classes_`` order. Classes without both positives and
    negatives in the test labels are left out. A season whose evaluation
    raises is reported on stdout and skipped (best-effort).

    Parameters
    ----------
    df : DataFrame with at least a ``Season`` column (``Wk`` optional).
    out_dir : directory where the files are written (created if missing).
    model : "base" or "smote", case-insensitive. Used verbatim in the file
        names and in the ``model`` field of each entry.
    with_odds, random_state : forwarded unchanged to the evaluator.
    max_points : upper bound of points kept per stored curve.

    Raises
    ------
    ValueError
        If ``model`` is not "base"/"smote". Previously any value other than
        the exact string "base" silently ran the SMOTE evaluator.

    Output files
    ------------
      - roc_grid_<model>.json        (curves + AUCs, one entry per season)
      - roc_by_season_<model>.csv    (flat AUC table; only if any season succeeded)
    """
    kind = str(model).lower()
    if kind not in ("base", "smote"):
        raise ValueError(f"model debe ser 'base' o 'smote'; recibido: {model!r}")

    label_name = {0: "A", 1: "D", 2: "H"}  # target encoding 0/1/2

    seasons_all = sorted(df["Season"].dropna().astype(int).unique())
    rows = []
    flat = []

    for test_season in seasons_all:
        train_until = test_season - 1
        # The earliest season has nothing before it to train on.
        if train_until < seasons_all[0]:
            continue

        try:
            # Looked up inside the try so a missing evaluator is reported as a
            # skipped season, exactly as before.
            evaluate = run_logreg_eval_no_smote if kind == "base" else run_logreg_eval
            mdl, _, (_, mtr_te), y_test, _, y_proba, idx_test = evaluate(
                df,
                train_until_season=train_until,
                test_until_season=test_season,
                with_odds=with_odds,
                random_state=random_state
            )

            if (mtr_te is None) or (y_test is None) or (y_proba is None) or (len(y_test) == 0):
                continue

            # Actual column order of y_proba:
            classes_used = list(getattr(mdl, "classes_", [0,1,2]))

            # One-vs-rest indicators are built per class instead of using
            # label_binarize, which returns a single column for two classes.
            y_true = np.asarray(y_test)
            per_class = {}
            aucs, weights = [], []

            for k, cls in enumerate(classes_used):
                nm = label_name.get(cls, str(cls))
                y_true_k = (y_true == cls).astype(int)
                y_score_k = y_proba[:, k]
                pos = int(y_true_k.sum())
                neg = int(len(y_true_k) - pos)
                # ROC is undefined without both positives and negatives.
                if pos > 0 and neg > 0:
                    fpr, tpr, _ = roc_curve(y_true_k, y_score_k)
                    auc_k = float(roc_auc_score(y_true_k, y_score_k))
                    fpr_l, tpr_l = _downsample_curve(fpr, tpr, max_points=max_points)
                    per_class[nm] = {
                        "auc": auc_k,
                        "support_pos": pos,
                        "fpr": fpr_l,
                        "tpr": tpr_l,
                    }
                    aucs.append(auc_k)
                    weights.append(pos)

            if not per_class:
                continue

            auc_macro = float(np.mean(aucs))
            auc_weighted = float(np.average(aucs, weights=weights)) if sum(weights) > 0 else auc_macro

            # Matchday range covered by the test rows.
            wk_min = wk_max = None
            if "Wk" in df.columns and idx_test is not None and len(idx_test):
                wks = pd.to_numeric(df.loc[idx_test, "Wk"], errors="coerce").dropna().astype(int)
                if len(wks):
                    wk_min = int(wks.min())
                    wk_max = int(wks.max())

            rows.append({
                "model": model,
                "train_until": int(train_until),
                "test_season": int(test_season),
                "per_class": per_class,     # dict with the A/D/H classes present
                "overall": {
                    "auc_macro": auc_macro,
                    "auc_weighted": auc_weighted,
                    "n_test": int(mtr_te["n_test"]),
                    "wk_min": wk_min,
                    "wk_max": wk_max,
                }
            })

            # Flat row for the CSV table.
            rowf = {
                "test_season": int(test_season),
                "train_until": int(train_until),
                "auc_macro": auc_macro,
                "auc_weighted": auc_weighted,
                "n_test": int(mtr_te["n_test"]),
                "wk_min": wk_min,
                "wk_max": wk_max,
            }
            for nm in ["A","D","H"]:
                if nm in per_class:
                    rowf[f"auc_{nm}"] = per_class[nm]["auc"]
                    rowf[f"support_pos_{nm}"] = per_class[nm]["support_pos"]
            flat.append(rowf)

        except Exception as e:
            # Deliberate best-effort: one failing season must not abort the grid.
            print(f"[ROC {model.upper()} SKIP] test={test_season} → {e}")

    out_dir.mkdir(parents=True, exist_ok=True)
    (out_dir / f"roc_grid_{model}.json").write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    print(f"Guardado: {out_dir / f'roc_grid_{model}.json'}  ({len(rows)} temporadas)")

    if flat:
        pd.DataFrame(flat).sort_values("test_season").to_csv(out_dir / f"roc_by_season_{model}.csv", index=False)
        print(f"Guardado: {out_dir / f'roc_by_season_{model}.csv'}")

# --- EXECUTION ---
# Writes the ROC/AUC files for both model variants into OUT.
OUT.mkdir(parents=True, exist_ok=True)
build_roc_grid(df, OUT, model="base",  with_odds=True)
build_roc_grid(df, OUT, model="smote", with_odds=True)

Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.6, 'log_loss': 0.8503040399360499, 'brier': 0.5104307269727479, 'n_train': 380}

=== Test (Seasons 2007..2007, walk-forward por jornada) ===
{'accuracy': 0.4263157894736842, 'log_loss': 1.2349125387397448, 'brier': 0.7084656252008951, 'n_test': 380, 'season_min': 2007, 'season_max': 2007}
Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5565789473684211, 'log_loss': 0.9214888832439952, 'brier': 0.552017798278286, 'n_train': 760}

=== Test (Seasons 2008..2008, walk-forward por jornada) ===
{'accuracy': 0.48157894736842105, 'log_loss': 1.0946781196370998, 'brier': 0.6514989943768685, 'n_test': 380, 'season_min': 2008, 'season_max': 2008}
Logistic Regression (sin SMOTE) (con cuotas)

=== Train (promedio ponderado por jornada) ===
{'accuracy': 0.5482456140350878, 'log_loss': 0.9341333936534477, 'brier': 0.5582402681906519, 'n

In [20]:
# (Opcional) Visualización local rápida de una season concreta:
# mdl_base, scaler_base, *_ = run_logreg_eval(df, train_until_season=2024, test_until_season=2025, with_odds=True)
# plot_multiclass_roc(df, mdl_base, scaler_base, train_until_season=2024, test_until_season=2025, with_odds=True)

## **BENEFICIOS**

Por último, pero no por ello menos importante, vamos a estudiar la última métrica: el **ROI (Return on Investment)**.

$$
ROI = \frac{\text{Beneficio}}{\text{Inversión}}
$$

Con el código siguiente simulo una apuesta de un euro al resultado que predice mi modelo en todos los partidos del conjunto de test. Si se acierta, se cobra la cuota que ofrece Bet365 (beneficio neto = cuota − 1 por unidad apostada); si se falla, se pierde la unidad apostada. Con esto calculamos el beneficio neto y el ROI.

In [21]:
# ============================================
# ROI per season (walk-forward, matchday by matchday) - single cell
# ============================================

# SMOTE is optional: any failure while importing it leaves SMOTE = None.
try:
    from imblearn.over_sampling import SMOTE
except Exception:
    SMOTE = None  # if it is not installed, only a "smote" request will fail

# --- Paths (fallback when the project variables do not exist) ---
# A bare name inside `try` raises NameError when the variable is undefined.
try:
    ROOT
except NameError:
    ROOT = Path(".")
try:
    DATA
except NameError:
    DATA = ROOT / "data"
# FEAT is reassigned unconditionally from DATA.
FEAT = DATA / "03_features"

# --- Base load: df_final already includes the team names ---
try:
    df
except NameError:
    df = pd.read_parquet(FEAT / "df_final.parquet").reset_index(drop=True)

# --- Useful constants ---
# Mapping between the numeric target encoding and the A/D/H letters.
CLASS2TXT = {0: "A", 1: "D", 2: "H"}   # 0=Away, 1=Draw, 2=Home
# Column position of each letter in arrays ordered as A, D, H.
TXT2IDX   = {'A':0, 'D':1, 'H':2}

# ---------- Utilidades ----------
def _max_drawdown(equity: pd.Series):
    if equity.empty:
        return 0.0, 0.0, None, None
    running_max = equity.cummax()
    drawdown = running_max - equity
    trough_idx = drawdown.idxmax()
    peak_idx = equity.loc[:trough_idx].idxmax() if trough_idx is not None else None
    mdd_abs = float(drawdown.max())
    peak_val = float(equity.loc[peak_idx]) if peak_idx is not None else 1.0
    mdd_pct = float(mdd_abs / peak_val) if peak_val > 0 else 0.0
    return mdd_abs, mdd_pct, peak_idx, trough_idx

def _edge_bins(edge: pd.Series, bins=(-np.inf, 0.0, 0.02, 0.05, np.inf),
               labels=("<0%", "0–2%", "2–5%", "≥5%")):
    return pd.cut(edge, bins=bins, labels=labels, include_lowest=True, right=False)

def _drop_and_validate(df: pd.DataFrame, with_odds: bool = True):
    """
    Aplica el mismo esquema de columnas que en el resto del notebook.
    Devuelve:
      X_all (con 'Season' para poder filtrar por fecha/temporada),
      y_all,
      meta (Season, Date, Wk, nombres y cuotas) alineado con X_all
    """
    drop_common = [
        'FTR','target','Date','has_xg_data',
        'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
        'HomeTeam_norm','AwayTeam_norm','row_id'  # fuera de X para evitar fugas
    ]
    drop_mode = (['overround','pimp2','B365D'] if with_odds else
                 ['fase_temporada_inicio','fase_temporada_mitad',
                  'B365H','B365D','B365A','overround','pimp1','pimpx','pimp2'])
    drop_cols = list(dict.fromkeys(drop_common + drop_mode))

    y_all = df['target']
    X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore')

    # válidas: sin NaN en y ni en X; si with_odds, exige B365H y B365A
    valid = y_all.notna()
    if with_odds:
        for c in ['B365H','B365A']:
            if c in X_all.columns:
                valid &= X_all[c].notna()
    valid &= X_all.notna().all(axis=1)

    X_all = X_all.loc[valid].copy()
    y_all = y_all.loc[valid].astype(int)

    need = ["Season","Date","Wk","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]
    missing = [c for c in need if c not in df.columns]
    if missing:
        raise ValueError(f"Faltan columnas en df: {missing}")

    meta = df.loc[X_all.index, need].copy()
    meta["Date"] = pd.to_datetime(meta["Date"], errors="coerce")

    if "Season" not in X_all.columns:
        raise ValueError("Falta 'Season' en X_all para el control temporal.")
    return X_all, y_all, meta

def attach_names_and_odds(df: pd.DataFrame, idx: pd.Index) -> pd.DataFrame:
    """Return season, date, team names, matchday and B365 odds for rows ``idx``.

    The result is a new frame (``df`` is not modified) whose 'Date' column is
    parsed to datetime, with unparseable values turned into NaT.
    """
    wanted = ["Season","Date","HomeTeam_norm","AwayTeam_norm","Wk","B365H","B365D","B365A"]
    selection = df.loc[idx, wanted]
    return selection.assign(Date=pd.to_datetime(selection["Date"], errors="coerce"))

# ---------- Simulación ROI temporada, ENTRENANDO JORNADA A JORNADA ----------
def _simulate_roi_season_walkforward(
    df: pd.DataFrame,
    test_season: int,
    with_odds: bool = True,
    stake: float = 1.0,
    min_edge: float = 0.00,
    use_smote: bool = False,
    random_state: int = 42
):
    """Simulate flat-stake betting on one season, refitting before every matchday.

    For each matchday ('Wk') of ``test_season``, taken in order of its earliest
    match date, a scaler and a logistic regression are fitted on every valid
    match dated strictly before that date. One bet of ``stake`` is then placed
    on the predicted outcome of each match of the matchday: a hit pays
    ``odds * stake``, so the net profit is ``odds * stake - stake`` on a hit and
    ``-stake`` on a miss.

    Parameters
    ----------
    df : full dataset; validated and split by ``_drop_and_validate``.
    test_season : season whose matchdays are simulated.
    with_odds : forwarded to ``_drop_and_validate`` (feature scheme).
    stake : amount bet per match.
    min_edge : when > 0, only bets with
        ``predicted_prob * predicted_odds - 1 >= min_edge`` are kept.
    use_smote : oversample the training set of each matchday with SMOTE. If
        SMOTE fails for a matchday, a warning is emitted and that matchday is
        trained without oversampling.
    random_state : seed for SMOTE and the logistic regression.

    Returns
    -------
    (bets, roi, total_net)
        ``bets`` is a DataFrame with one row per bet (metadata, prediction,
        edge, informative "value" pick and net profit), ``roi`` is
        ``total_net / (stake * n_bets)``. ``(None, nan, nan)`` is returned when
        the season has no valid matches or no bet could be placed.

    Raises
    ------
    ImportError
        If ``use_smote`` is requested but imbalanced-learn is not available.
    """
    import warnings

    if use_smote and SMOTE is None:
        raise ImportError("Para SMOTE necesitas 'imbalanced-learn' instalado.")

    # 0) Global matrices and metadata
    X_all, y_all, meta = _drop_and_validate(df, with_odds=with_odds)

    # 1) Matchdays of the season, ordered by their first match date
    g = (meta[meta["Season"] == test_season]
         .groupby("Wk")
         .agg(dmin=("Date","min"), n=("Wk","size"))
         .reset_index()
         .sort_values(["dmin","Wk"]))
    if g.empty:
        return None, np.nan, np.nan

    # Loop invariants
    feat_cols = [c for c in X_all.columns if c != "Season"]  # Season stays out of the fit
    name_map = {0:'A',1:'D',2:'H'}
    outcome_labels = np.array(['A','D','H'])

    out_parts = []

    # 2) Walk forward, one matchday at a time
    for wk_raw, d_start in zip(g["Wk"], g["dmin"]):
        wk = int(wk_raw)

        # Test rows: exactly this matchday, among the valid rows
        idx_test_wk = meta.index[(meta["Season"] == test_season) & (meta["Wk"] == wk)]
        if len(idx_test_wk) == 0:
            continue

        # Train rows: everything dated before the matchday starts
        idx_train_wk = meta.index[(meta["Date"] < d_start)]
        if len(idx_train_wk) == 0:
            continue

        X_tr = X_all.loc[idx_train_wk, feat_cols]
        y_tr = y_all.loc[idx_train_wk]
        X_te = X_all.loc[idx_test_wk,  feat_cols]
        y_te = y_all.loc[idx_test_wk]

        # A classifier needs at least two classes in its history
        if len(np.unique(y_tr)) < 2:
            continue

        # Scaling is refitted on the training rows of this matchday only
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_te_s = scaler.transform(X_te)

        # Optional SMOTE, with k adapted to the size of the minority class
        if use_smote:
            _, counts = np.unique(y_tr, return_counts=True)
            min_count = int(counts.min())
            if min_count > 1:
                k = max(1, min(5, min_count - 1))
                try:
                    sm = SMOTE(random_state=random_state, k_neighbors=k)
                    X_tr_s, y_tr = sm.fit_resample(X_tr_s, y_tr)
                except Exception as exc:
                    # Best-effort: keep going without oversampling, but say so.
                    warnings.warn(
                        f"SMOTE falló en Season {test_season}, Wk {wk}; "
                        f"se entrena sin sobremuestreo ({exc}).",
                        RuntimeWarning
                    )

        # Model
        mdl = LogisticRegression(solver='saga', penalty='l2', max_iter=1000, random_state=random_state)
        mdl.fit(X_tr_s, y_tr)

        # Predictions for the matchday
        proba  = mdl.predict_proba(X_te_s)
        y_pred = mdl.predict(X_te_s)

        # Metadata + ROI inputs for the matchday
        res = attach_names_and_odds(df, idx_test_wk)
        classes   = list(mdl.classes_)  # typically [0,1,2]
        proba_df  = pd.DataFrame(proba, index=idx_test_wk,
                                 columns=[name_map.get(c, str(c)) for c in classes]).loc[res.index]
        # A class the model never saw gets probability 0.0 rather than NaN:
        # argmax over NaN expected values would select that unseen class.
        proba_fix = proba_df.reindex(columns=['A','D','H'], fill_value=0.0)
        odds_fix  = res[['B365A','B365D','B365H']].rename(columns={'B365A':'A','B365D':'D','B365H':'H'})[['A','D','H']]

        pred_series = pd.Series(y_pred, index=idx_test_wk).loc[res.index]
        res['true_result']      = y_te.loc[res.index].values
        res['predicted_result'] = pred_series.map(int).values
        pred_txt = pred_series.map(name_map)
        pred_idx = pred_txt.map(TXT2IDX).to_numpy()

        P, O = proba_fix.to_numpy(), odds_fix.to_numpy()
        row_pos = np.arange(len(res))
        res['Pred']           = pred_txt
        res['predicted_prob'] = P[row_pos, pred_idx]
        res['predicted_odds'] = O[row_pos, pred_idx]
        res['edge']           = res['predicted_prob'] * res['predicted_odds'] - 1.0

        # Informative value-bet columns: outcome with the highest expected value
        EV = (proba_fix * odds_fix - 1.0).to_numpy()
        best_idx = EV.argmax(axis=1)
        res['value_pick'] = outcome_labels[best_idx]
        res['value_ev']   = EV[row_pos, best_idx]
        res['value_prob'] = P[row_pos, best_idx]
        res['value_odds'] = O[row_pos, best_idx]

        # Odds and edge filters
        mask_odds = res[['B365H','B365D','B365A']].notna().all(axis=1)
        res = res.loc[mask_odds].copy()
        if min_edge > 0:
            res = res.loc[res['edge'] >= min_edge].copy()
        if res.empty:
            continue

        # The bet is ALWAYS on the predicted outcome
        res['bet_outcome'] = np.where(
            res['predicted_result'] == res['true_result'],
            res['predicted_odds'] * stake, 0.0
        )
        res['net_profit'] = res['bet_outcome'] - stake

        out_parts.append(res)

    # 3) Join all matchdays of the season
    if not out_parts:
        return None, np.nan, np.nan
    out = pd.concat(out_parts, axis=0).sort_index()
    out['Date'] = pd.to_datetime(out['Date'], errors='coerce').dt.strftime('%Y-%m-%d')

    # 4) Season aggregates
    total_net = float(out['net_profit'].sum())
    n_bets    = int(len(out))
    roi       = total_net / (stake * n_bets) if n_bets > 0 else np.nan
    return out, roi, total_net

# ---------- ROI por temporada (resumen + CSV/JSON) ----------
def build_roi_grid(
    df: pd.DataFrame,
    model=None, scaler=None,               # se ignoran (se reentrena jornada a jornada)
    seasons: list[int] | None = None,
    with_odds: bool = True,
    stake: float = 1.0,
    feature_names: list[str] | None = None, # mantenido por compatibilidad
    min_edge: float = 0.00,
    model_name: str = "base",              # "base" | "smote"
    out_dir: Path | None = None,
    random_state: int = 42
):
    seasons_all = sorted(df["Season"].dropna().astype(int).unique())
    if seasons is None:
        seasons = seasons_all

    OUT = (out_dir or (ROOT / "outputs"))
    OUT.mkdir(parents=True, exist_ok=True)

    rows = []
    flat_for_csv = []

    for test_season in seasons:
        # Walk-forward por jornada dentro de la season 'test_season'
        res, roi, total_net = _simulate_roi_season_walkforward(
            df,
            test_season=test_season,
            with_odds=with_odds,
            stake=stake,
            min_edge=min_edge,
            use_smote=(str(model_name).lower() == "smote"),
            random_state=random_state
        )
        if res is None or len(res) == 0:
            continue

        # Orden por fecha para equity y métricas
        tmp = res.copy()
        tmp['_Date'] = pd.to_datetime(tmp['Date'], errors='coerce')
        tmp = tmp.sort_values('_Date').drop(columns=['_Date'])

        equity = tmp['net_profit'].cumsum()
        mdd_abs, mdd_pct, *_ = _max_drawdown(equity)

        hit_rate = float((tmp['predicted_result'] == tmp['true_result']).mean())
        avg_odds = float(tmp['predicted_odds'].mean())
        avg_edge = float(tmp['edge'].mean())
        avg_value_ev = float(tmp['value_ev'].mean())

        by_class = tmp.groupby(tmp['predicted_result']).agg(
            profit=('net_profit','sum'), n=('net_profit','size')
        )
        profit_by_class = {CLASS2TXT.get(int(k), str(k)): float(v) for k, v in by_class['profit'].items()}

        # Rango de jornadas en ese test
        wk_min = wk_max = None
        if 'Wk' in tmp.columns and len(tmp):
            wks = pd.to_numeric(tmp['Wk'], errors='coerce').dropna().astype(int)
            if len(wks):
                wk_min = int(wks.min())
                wk_max = int(wks.max())

        bins = _edge_bins(tmp['edge'])
        by_bin = tmp.groupby(bins, observed=True).agg(
            n=('net_profit','size'),
            profit=('net_profit','sum'),
            avg_prob=('predicted_prob','mean'),
            avg_odds=('predicted_odds','mean'),
            avg_edge=('edge','mean')
        ).reset_index(names='edge_bin')
        by_bin['roi'] = by_bin.apply(lambda r: (r['profit']/(stake*r['n'])) if r['n']>0 else np.nan, axis=1)
        roi_by_edge_bins = [
            {
                "bin": str(row['edge_bin']),
                "n": int(row['n']),
                "roi": float(row['roi']),
                "profit_total": float(row['profit']),
                "avg_prob": float(row['avg_prob']),
                "avg_odds": float(row['avg_odds']),
                "avg_edge": float(row['avg_edge']),
            }
            for _, row in by_bin.iterrows()
        ]

        rows.append({
            "model": model_name,
            "train_until": int(test_season),  # referencia temporal en esta configuración walk-forward
            "test_season": int(test_season),
            "n_bets": int(len(tmp)),
            "profit_total": float(total_net),
            "roi": float(roi),
            "hit_rate": float(hit_rate),
            "avg_odds": float(avg_odds),
            "avg_edge": float(avg_edge),
            "avg_value_ev": float(avg_value_ev),
            "profit_by_class": profit_by_class,
            "equity": [float(x) for x in equity.tolist()],
            "max_drawdown_abs": float(mdd_abs),
            "max_drawdown_pct": float(mdd_pct),
            "roi_by_edge_bins": roi_by_edge_bins,
            "stake": float(stake),
            "min_edge": float(min_edge),
            "wk_min": wk_min,
            "wk_max": wk_max,
        })

        flat_for_csv.append({
            "model": model_name,
            "test_season": int(test_season),
            "train_until": int(test_season),
            "n_bets": int(len(tmp)),
            "roi": float(roi),
            "profit_total": float(total_net),
            "hit_rate": float(hit_rate),
            "avg_odds": float(avg_odds),
            "avg_edge": float(avg_edge),
            "avg_value_ev": float(avg_value_ev),
            "max_drawdown_pct": float(mdd_pct),
            "stake": float(stake),
            "min_edge": float(min_edge),
            "wk_min": wk_min,
            "wk_max": wk_max,
        })

    tag = f"{model_name}".replace(" ", "_").lower()
    (OUT / f"roi_by_season_{tag}.json").write_text(json.dumps(rows, ensure_ascii=False, indent=2), encoding="utf-8")
    if flat_for_csv:
        pd.DataFrame(flat_for_csv).sort_values("test_season").to_csv(OUT / f"roi_by_season_{tag}.csv", index=False)

    print(f"Guardados:\n- {OUT/f'roi_by_season_{tag}.json'}\n- {OUT/f'roi_by_season_{tag}.csv'}")
    return rows

# =========================
# EJEMPLOS DE USO (comenta/ajusta)
# =========================
OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

# Baseline: retrained matchday by matchday WITHOUT SMOTE.
# model / scaler / seasons are left at their None defaults
# (model and scaler are ignored by build_roi_grid; seasons=None means all seasons).
_ = build_roi_grid(
    df=df,
    with_odds=True,
    stake=1.0,
    min_edge=0.00,
    model_name="base",
    out_dir=OUT,
)

# SMOTE: retrained matchday by matchday WITH SMOTE (same settings otherwise).
_ = build_roi_grid(
    df=df,
    with_odds=True,
    stake=1.0,
    min_edge=0.00,
    model_name="smote",
    out_dir=OUT,
)

Guardados:
- outputs/roi_by_season_base.json
- outputs/roi_by_season_base.csv
Guardados:
- outputs/roi_by_season_smote.json
- outputs/roi_by_season_smote.csv


Sin SMOTE:

In [22]:
# # ==========================================================
# # MATCH-LOG (walk-forward por jornada) — target robusto + sin Wk en outputs
# # ==========================================================
# import pandas as pd
# import numpy as np
# from pathlib import Path
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# import json

# # -------------------------
# # Rutas y carga base
# # -------------------------
# ROOT = Path(".")
# DATA = ROOT / "data"
# FEAT = DATA / "03_features"
# PROC = DATA / "02_processed"
# OUT  = ROOT / "outputs"
# OUT.mkdir(parents=True, exist_ok=True)

# df_path = FEAT / "df_final.parquet"
# cal_paths = [PROC / "wk_actualizado_2005_2025.parquet", PROC / "wk_2005_2025.parquet"]

# df = pd.read_parquet(df_path).reset_index(drop=True)

# # -------------------------
# # Utilidades calendario/jornada
# # -------------------------
# def _safe_to_datetime(s):
#     return pd.to_datetime(s, errors="coerce")

# def _load_calendar_unique(paths):
#     """Calendario único por (Season, Date_day) con Wk_cal entero."""
#     for p in paths:
#         if p.exists():
#             cal = pd.read_parquet(p).copy()
#             need = {"Season","Date","Wk"}
#             if not need.issubset(cal.columns):
#                 continue
#             cal["Date"] = _safe_to_datetime(cal["Date"])
#             cal["Date_day"] = cal["Date"].dt.date
#             cal["Wk"] = pd.to_numeric(cal["Wk"], errors="coerce")
#             cal = cal.dropna(subset=["Season","Date_day"])

#             cal["Wk_pos"] = cal["Wk"].where(cal["Wk"] > 0)
#             g = cal.groupby(["Season","Date_day"], as_index=False).agg(Wk_cal=("Wk_pos","median"))
#             # si no hay Wk > 0 ese día, usa mediana de Wk (aunque <=0)
#             nan_mask = g["Wk_cal"].isna()
#             if nan_mask.any():
#                 g2 = cal.groupby(["Season","Date_day"], as_index=False).agg(Wk_cal=("Wk","median"))
#                 g2 = g2.set_index(["Season","Date_day"])
#                 g.loc[nan_mask, "Wk_cal"] = g2.loc[
#                     g.loc[nan_mask, ["Season","Date_day"]].set_index(["Season","Date_day"]).index
#                 ].to_numpy()
#             g["Wk_cal"] = pd.to_numeric(g["Wk_cal"], errors="coerce").round().astype("Int64")
#             return g[["Season","Date_day","Wk_cal"]]
#     return None

# def build_jornada(meta: pd.DataFrame, cal_unique: pd.DataFrame | None) -> pd.Series:
#     """
#     Devuelve 'jornada' con prioridad:
#       1) Wk propio > 0 (si existiera),
#       2) calendario por (Season, Date_day),
#       3) fallback por orden de días dentro de cada Season (1..N).
#     Nunca devuelve 0/negativos. Tipo Int64.
#     """
#     m = meta.copy()
#     m["Date"] = _safe_to_datetime(m["Date"])
#     m["Date_day"] = m["Date"].dt.date

#     if "Wk" in m.columns:
#         wk_own = pd.to_numeric(m["Wk"], errors="coerce").where(lambda x: x > 0)
#     else:
#         wk_own = pd.Series(np.nan, index=m.index, dtype="float64")

#     if cal_unique is not None:
#         m = m.merge(cal_unique, on=["Season","Date_day"], how="left")
#         wk_cal = pd.to_numeric(m["Wk_cal"], errors="coerce")
#         jornada = wk_own.fillna(wk_cal)
#     else:
#         jornada = wk_own

#     if jornada.isna().any():
#         tmp = (m[["Season","Date_day"]]
#                .drop_duplicates()
#                .sort_values(["Season","Date_day"]))
#         tmp["j_fallback"] = tmp.groupby("Season").cumcount() + 1
#         m = m.merge(tmp, on=["Season","Date_day"], how="left")
#         jornada = jornada.fillna(m["j_fallback"])

#     jornada = pd.to_numeric(jornada, errors="coerce")
#     jornada = jornada.where(jornada > 0)
#     jornada = jornada.round().astype("Int64")
#     return jornada

# # -------------------------
# # Construcción de 'target' robusto
# # -------------------------
# CLASS2TXT = {0:"A", 1:"D", 2:"H"}
# TXT2IDX   = {"A":0, "D":1, "H":2}

# def build_target(df_in: pd.DataFrame) -> pd.Series:
#     """
#     Construye etiqueta 0/1/2 (A/D/H) desde:
#       - 'target' si existe (numérico 0/1/2 o convertible),
#       - si no, 'FTR' con mapping {'A':0,'D':1,'H':2}.
#     Devuelve Int64 con NaNs donde no se pueda mapear.
#     """
#     t = None
#     if "target" in df_in.columns:
#         t_num = pd.to_numeric(df_in["target"], errors="coerce")
#         # Si hay valores fuera de {0,1,2}, intentamos FTR como respaldo
#         bad = ~t_num.isin([0,1,2])
#         if bad.any() and "FTR" in df_in.columns:
#             t_ftr = df_in["FTR"].map(TXT2IDX).astype("Int64")
#             t = t_num.astype("Int64")
#             t = t.mask(bad, t_ftr)
#         else:
#             t = t_num.astype("Int64")
#     elif "FTR" in df_in.columns:
#         t = df_in["FTR"].map(TXT2IDX).astype("Int64")
#     else:
#         raise ValueError("No encuentro 'target' ni 'FTR' para construir la etiqueta.")

#     # Limita a {0,1,2}; el resto queda NaN
#     t = t.where(t.isin([0,1,2]))
#     return t

# # -------------------------
# # Preparar datos + inyectar 'jornada'
# # -------------------------
# cal_u = _load_calendar_unique(cal_paths)

# df["Date"] = _safe_to_datetime(df["Date"])
# meta_cols = ["Season","Date","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]
# missing = [c for c in meta_cols if c not in df.columns]
# if missing:
#     raise ValueError(f"Faltan columnas en df_final: {missing}")

# # Crea 'jornada' UNA VEZ y úsala en todo el pipeline
# df["jornada"] = build_jornada(df[["Season","Date","Wk"] if "Wk" in df.columns else ["Season","Date"]], cal_u)
# if (df["jornada"].fillna(0) <= 0).any():
#     raise RuntimeError("Jornadas no válidas detectadas (<=0). Revisa calendario/fechas.")

# # Row_id único
# df = df.reset_index(drop=False).rename(columns={"index":"row_id"})
# assert df["row_id"].is_unique, "row_id no es único."

# # -------------------------
# # Construcción de matrices X/y y meta
# # -------------------------
# # Columnas a descartar de features (ajústalo a tus features reales)
# drop_common = [
#     'FTR','target','Date','has_xg_data',
#     'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
#     'HomeTeam_norm','AwayTeam_norm','row_id'  # meta/no-features
# ]
# # Asumimos que NO se usan cuotas en X (quedan solo en meta)
# drop_mode = ['B365H','B365D','B365A','overround','pimp1','pimpx','pimp2']
# drop_cols = list(dict.fromkeys(drop_common + drop_mode))

# # TARGET robusto (Int64 con NaNs si falla)
# target_ser = build_target(df)

# # Features
# X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore").copy()
# # Evitar infs
# X_all = X_all.replace([np.inf, -np.inf], np.nan)

# # Meta
# meta_all = df.loc[:, ["row_id"] + meta_cols + ["jornada"]].copy()
# for c in ["B365H","B365D","B365A"]:
#     meta_all[c] = pd.to_numeric(meta_all[c], errors="coerce")

# # Validación/filtrado
# valid = target_ser.notna()
# valid &= X_all.notna().all(axis=1)
# valid &= meta_all[["B365H","B365D","B365A"]].notna().all(axis=1)

# # Subset final y casteo a int (ya sin NaNs)
# X_all = X_all.loc[valid].copy()
# y_all = target_ser.loc[valid].astype(int)   # <-- aquí ya no habrá NaNs
# meta_all = meta_all.loc[valid].copy()

# # Asegúrate de llevar 'Season' en X (para seleccionar features más abajo)
# if "Season" not in X_all.columns:
#     X_all["Season"] = df.loc[valid, "Season"].values

# # -------------------------
# # Helper bin de edge
# # -------------------------
# def _edge_bins(edge: pd.Series,
#                bins=(-np.inf, 0.0, 0.02, 0.05, np.inf),
#                labels=("<0%","0–2%","2–5%","≥5%")):
#     return pd.cut(edge, bins=bins, labels=labels, include_lowest=True, right=False)

# # -------------------------
# # Walk-forward por jornada (por temporada)
# # -------------------------
# def _walkforward_one_season(test_season: int,
#                             *,
#                             stake=1.0,
#                             min_edge_pred=0.0,
#                             min_edge_value=None,
#                             random_state=42,
#                             use_smote=False):
#     m_season = meta_all[meta_all["Season"] == test_season].copy()
#     if m_season.empty:
#         return pd.DataFrame()

#     g = (m_season.groupby("jornada", dropna=True)
#                 .agg(dmin=("Date","min"), n=("jornada","size"))
#                 .reset_index()
#                 .sort_values(["dmin","jornada"]))
#     if g.empty:
#         return pd.DataFrame()

#     parts = []
#     for _, row in g.iterrows():
#         wk = int(row["jornada"])
#         d_start = row["dmin"]

#         idx_te_mask = (meta_all["Season"] == test_season) & (meta_all["jornada"] == wk)
#         idx_tr_mask = (meta_all["Date"] < d_start)
#         if not idx_te_mask.any() or not idx_tr_mask.any():
#             continue

#         feat_cols = [c for c in X_all.columns if c != "Season"]
#         X_tr = X_all.loc[idx_tr_mask, feat_cols].to_numpy()
#         y_tr = y_all.loc[idx_tr_mask].to_numpy()

#         X_te = X_all.loc[idx_te_mask, feat_cols].to_numpy()
#         y_te = y_all.loc[idx_te_mask].to_numpy()

#         if len(np.unique(y_tr)) < 2:
#             continue

#         scaler = StandardScaler()
#         X_tr_s = scaler.fit_transform(X_tr)
#         X_te_s = scaler.transform(X_te)

#         if use_smote:
#             try:
#                 from imblearn.over_sampling import SMOTE
#                 _, counts = np.unique(y_tr, return_counts=True)
#                 minc = int(counts.min())
#                 if minc > 1:
#                     k = max(1, min(5, minc - 1))
#                     sm = SMOTE(random_state=random_state, k_neighbors=k)
#                     X_tr_s, y_tr = sm.fit_resample(X_tr_s, y_tr)
#             except Exception:
#                 pass

#         mdl = LogisticRegression(solver="saga", penalty="l2", max_iter=1000, random_state=random_state)
#         mdl.fit(X_tr_s, y_tr)

#         proba = mdl.predict_proba(X_te_s)   # (n_te, 3)
#         yhat  = mdl.predict(X_te_s)         # (n_te,)

#         # Meta y odds POSICIONALES
#         meta_te = meta_all.loc[idx_te_mask, ["Season","Date","jornada","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]].reset_index(drop=True)
#         odds_te = meta_te[["B365A","B365D","B365H"]].to_numpy()   # orden A,D,H

#         # Reordenar proba a columnas A,D,H según clases del modelo
#         CLASS2TXT = {0:"A", 1:"D", 2:"H"}
#         P = np.full((proba.shape[0], 3), np.nan, dtype=float)
#         for col_idx, cls in enumerate(mdl.classes_):
#             label = CLASS2TXT.get(int(cls))
#             if label == "A": P[:,0] = proba[:, col_idx]
#             if label == "D": P[:,1] = proba[:, col_idx]
#             if label == "H": P[:,2] = proba[:, col_idx]

#         # Predicción textual y edge
#         idx_of = {"A":0,"D":1,"H":2}
#         pred_txt = np.vectorize({0:"A",1:"D",2:"H"}.get)(yhat)
#         pred_idx = np.vectorize(idx_of.get)(pred_txt)
#         pred_prob = P[np.arange(P.shape[0]), pred_idx]
#         pred_odds = odds_te[np.arange(odds_te.shape[0]), pred_idx]
#         edge_pred = pred_prob * pred_odds - 1.0

#         # Apuesta de valor
#         EV = P * odds_te - 1.0
#         best_idx = EV.argmax(axis=1)                # 0=A,1=D,2=H
#         labels = np.array(["A","D","H"])
#         value_pick = labels[best_idx]
#         value_ev   = EV[np.arange(EV.shape[0]), best_idx]
#         value_prob = P[np.arange(P.shape[0]), best_idx]
#         value_odds = odds_te[np.arange(odds_te.shape[0]), best_idx]

#         # Métricas de retorno
#         true_result = y_te
#         predicted_result = yhat
#         correct = (predicted_result == true_result)
#         value_hit = (np.vectorize(idx_of.get)(value_pick) == true_result)

#         stake = 1.0
#         bet_return = np.where(correct, pred_odds * stake, 0.0)
#         net_profit = bet_return - stake

#         thr_val = 0.0 if (min_edge_value is None) else min_edge_value
#         use_value = (value_ev >= (0.0 if min_edge_value is None else min_edge_value)) if (thr_val and thr_val > 0) else np.ones(len(value_ev), dtype=bool)
#         value_bet_return = np.where(value_hit, value_odds * stake, 0.0)
#         value_bet_return = np.where(use_value, value_bet_return, 0.0)
#         value_net_profit = value_bet_return - np.where(use_value, stake, 0.0)

#         out = meta_te.copy()
#         out["true_result"]      = true_result
#         out["predicted_result"] = predicted_result
#         out["Pred"]             = pred_txt
#         out["predicted_prob"]   = pred_prob
#         out["predicted_odds"]   = pred_odds
#         out["edge"]             = edge_pred

#         out["value_pick"]       = value_pick
#         out["value_ev"]         = value_ev
#         out["value_prob"]       = value_prob
#         out["value_odds"]       = value_odds
#         out["use_value"]        = use_value

#         out["bet_return"]       = bet_return
#         out["net_profit"]       = net_profit
#         out["value_bet_return"] = value_bet_return
#         out["value_net_profit"] = value_net_profit

#         out["Correct"]          = np.where(correct, "✓", "✗")
#         out["value_correct"]    = np.where(value_hit, "✓", "✗")

#         out["edge_bin"]  = _edge_bins(out["edge"])
#         out["value_bin"] = _edge_bins(out["value_ev"])

#         parts.append(out)

#     if not parts:
#         return pd.DataFrame()

#     ml = pd.concat(parts, axis=0, ignore_index=True)
#     ml["Date"] = pd.to_datetime(ml["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
#     ml["jornada"] = pd.to_numeric(ml["jornada"], errors="coerce").round().astype("Int64")
#     return ml

# def build_matchlog_grid(df_source: pd.DataFrame,
#                         out_dir: Path,
#                         *,
#                         model_name="base",
#                         stake=1.0,
#                         min_edge_pred=0.0,
#                         min_edge_value=None,
#                         random_state=42,
#                         use_smote=False):

#     per_season_dir = out_dir / f"matchlogs_{model_name}"
#     per_season_dir.mkdir(parents=True, exist_ok=True)

#     seasons_all = sorted(df_source["Season"].dropna().astype(int).unique())
#     season_summary = []

#     for season in seasons_all:
#         try:
#             ml = _walkforward_one_season(
#                 season,
#                 stake=stake,
#                 min_edge_pred=min_edge_pred,
#                 min_edge_value=min_edge_value,
#                 random_state=random_state,
#                 use_smote=use_smote
#             )
#             if ml.empty:
#                 print(f"[{model_name}] Season {season}: sin filas válidas.")
#                 continue

#             # Verificación: NO debe haber jornada <= 0 ni columna Wk
#             if (ml["jornada"].fillna(0) <= 0).any():
#                 raise RuntimeError(f"Season {season}: detectadas jornadas <= 0 en output.")

#             n_pred = len(ml)
#             roi_pred = float(ml["net_profit"].sum() / (stake * n_pred)) if n_pred > 0 else np.nan
#             n_val = int(ml["use_value"].sum())
#             roi_val = float(ml.loc[ml["use_value"], "value_net_profit"].sum() / (stake * n_val)) if n_val > 0 else np.nan

#             csv_path  = per_season_dir / f"matchlog_{season}.csv"
#             json_path = per_season_dir / f"matchlog_{season}.json"
#             ml.to_csv(csv_path, index=False)
#             ml.to_json(json_path, orient="records", force_ascii=False, indent=2)
#             print(f"[{model_name}] Season {season}: guardado match-log ({len(ml)} filas)")

#             season_summary.append({
#                 "model": model_name,
#                 "train_mode": "walk-forward por jornada",
#                 "test_season": int(season),
#                 "n_pred_bets": int(n_pred),
#                 "roi_pred": roi_pred,
#                 "profit_pred": float(ml["net_profit"].sum()),
#                 "n_value_bets": int(n_val),
#                 "roi_value": roi_val,
#                 "profit_value": float(ml.loc[ml['use_value'], 'value_net_profit'].sum() if n_val > 0 else 0.0),
#                 "min_edge_pred": float(min_edge_pred),
#                 "min_edge_value": float(min_edge_pred if (min_edge_value is None) else min_edge_value),
#                 "stake": float(stake),
#             })
#         except Exception as e:
#             print(f"[MATCHLOG {model_name.upper()} SKIP] Season {season} → {e}")

#     if season_summary:
#         df_sum = pd.DataFrame(season_summary).sort_values("test_season")
#         df_sum.to_csv(out_dir / f"matchlog_season_summary_{model_name}.csv", index=False)
#         (out_dir / f"matchlog_season_summary_{model_name}.json").write_text(
#             json.dumps(season_summary, ensure_ascii=False, indent=2),
#             encoding="utf-8"
#         )
#         print(f"Guardados:\n- {out_dir/f'matchlog_season_summary_{model_name}.csv'}\n- {out_dir/f'matchlog_season_summary_{model_name}.json'}")
#     else:
#         print(f"Sin temporadas válidas para exportar matchlogs ({model_name}).")

# # -------------------------
# # EJECUCIÓN
# # -------------------------
# build_matchlog_grid(
#     df_source=df,
#     out_dir=OUT,
#     model_name="base",
#     stake=1.0,
#     min_edge_pred=0.00,
#     min_edge_value=None,
#     random_state=42,
#     use_smote=False
# )

# # -------------------------
# # CHEQUEO FINAL: no hay 'Wk' en outputs y 'jornada' es válida
# # -------------------------
# for f in sorted((OUT / "matchlogs_base").glob("matchlog_*.csv"))[:3]:
#     tmp = pd.read_csv(f)
#     assert "Wk" not in tmp.columns, f"{f} contiene Wk."
#     assert (tmp["jornada"].fillna(0) > 0).all(), f"{f} tiene jornada <= 0."
# print("Chequeo final OK: 'jornada' presente y válida en los outputs; 'Wk' eliminado.")

In [23]:
# ============================================================
# UPDATE MATCHLOG (INCREMENTAL) SOLO PARA LA ÚLTIMA TEMPORADA
# - Actualiza matchlog_{SEASON}.csv/json añadiendo SOLO jornadas nuevas
# - Usa relleno robusto de 'jornada' (calendario) y LBFGS (rápido)
# - Evita duplicados al guardar
# ============================================================
import warnings
from sklearn.exceptions import ConvergenceWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)

# ---------- QUICK PARAMS (adjust if needed) ----------
MODEL_NAME = "base"        # folder: outputs/matchlogs_base
WITH_ODDS  = True          # same as the original pipeline
STAKE      = 1.0
MIN_EDGE_PRED  = 0.00
MIN_EDGE_VALUE = None      # set a threshold to filter value picks
RANDOM_STATE   = 42

# ---------- PATHS ----------
# Reuse ROOT / DATA when an earlier cell defined them; otherwise fall back
# to paths relative to the current directory.
if "ROOT" not in globals():
    ROOT = Path(".")
if "DATA" not in globals():
    DATA = ROOT / "data"
FEAT, PROC = DATA / "03_features", DATA / "02_processed"

OUT = ROOT / "outputs"
PER_SEASON_DIR = OUT / f"matchlogs_{MODEL_NAME}"
# parents=True creates OUT as well, so a single mkdir covers both folders
PER_SEASON_DIR.mkdir(parents=True, exist_ok=True)

# ---------- LOAD DF ----------
df_path = FEAT / "df_final.parquet"
print(f"==> Cargando df_final: {df_path}")
df = pd.read_parquet(df_path).reset_index(drop=True)
print(f"Filas df: {len(df):,} | Columnas: {len(df.columns)}")

# ---------- CALENDARIO ----------
def _load_calendar():
    """Load the matchday calendar as one row per (Season, Date_day).

    Tries the known calendar files under ``PROC`` in order and uses the first
    one that exists and carries the columns ``Season``, ``Date`` and ``Wk``.

    Returns
    -------
    DataFrame with columns ``Season``, ``Date_day`` (``datetime.date``) and
    ``Wk`` (float), or None when no usable file is found.

    A calendar file may list several fixtures for the same day; those rows are
    collapsed to a single ``Wk`` per day (rounded median) so that a later
    left-merge on (Season, Date_day) cannot multiply the rows of the frame
    being enriched.
    """
    required = {"Season", "Date", "Wk"}
    for name in ["wk_actualizado_2005_2025.parquet", "wk_2005_2025.parquet"]:
        p = PROC / name
        if not p.exists():
            continue
        cal = pd.read_parquet(p)
        if not required.issubset(cal.columns):
            continue
        cal = cal.loc[:, ["Season", "Date", "Wk"]].copy()
        cal["Date_day"] = pd.to_datetime(cal["Date"], errors="coerce").dt.date
        cal["Wk"] = pd.to_numeric(cal["Wk"], errors="coerce")
        cal = cal[["Season", "Date_day", "Wk"]].dropna()
        # One matchday per calendar day: median across that day's fixtures
        per_day = cal.groupby(["Season", "Date_day"], as_index=False)["Wk"].median()
        per_day["Wk"] = per_day["Wk"].round()
        return per_day[["Season", "Date_day", "Wk"]]
    return None

# Calendar lookup loaded once when the cell runs; None when no usable calendar file was found.
_CAL = _load_calendar()

def _fill_jornada(meta: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `meta` with an added nullable-integer ``jornada`` column.

    The matchday is resolved per row with this priority:
      1. the row's own ``Wk`` when it is > 0,
      2. the calendar (``_CAL``) matched on (Season, calendar day),
      3. the ordinal position of the day within its season (1..N).

    Parameters
    ----------
    meta : frame with ``Season`` and ``Date`` columns; ``Wk`` is optional.

    Returns
    -------
    DataFrame with the same rows, row order and index as `meta`, ``Date``
    converted to datetime, and ``jornada`` as ``Int64``.

    All lookups are done positionally on a RangeIndex key frame and the result
    is written back by position. This keeps the caller's index intact (a plain
    ``merge`` would reset it and misalign index-based fills), copes with a
    missing ``Wk`` column, and tolerates calendars that list several rows per
    day.
    """
    meta = meta.copy()
    meta["Date"] = pd.to_datetime(meta["Date"], errors="coerce")

    # Positional key frame: row i here is row i of `meta`
    keys = pd.DataFrame({
        "Season": meta["Season"].reset_index(drop=True),
        "Date_day": meta["Date"].dt.date.reset_index(drop=True),
    })

    if "Wk" in meta.columns:
        wk = (pd.to_numeric(meta["Wk"], errors="coerce")
                .reset_index(drop=True)
                .astype("float64"))
    else:
        wk = pd.Series(float("nan"), index=keys.index, dtype="float64")
    wk = wk.where(wk > 0)  # 0 / negatives -> NaN

    if _CAL is not None and len(_CAL):
        # Collapse to one value per day so the left-merge stays one-to-one per row
        cal_day = (_CAL.groupby(["Season", "Date_day"], as_index=False)["Wk"]
                       .median()
                       .rename(columns={"Wk": "Wk_cal"}))
        wk_cal = keys.merge(cal_day, on=["Season", "Date_day"], how="left")["Wk_cal"]
        wk = wk.fillna(wk_cal)

    if wk.isna().any():
        # Fallback: rank the distinct days inside each season
        days = keys.drop_duplicates().sort_values(["Season", "Date_day"])
        days["jornada_fallback"] = days.groupby("Season").cumcount() + 1
        wk_fb = keys.merge(days, on=["Season", "Date_day"], how="left")["jornada_fallback"]
        wk = wk.fillna(wk_fb)

    # round() guards the Int64 cast against half-values produced by the median;
    # assigning the extension array writes by position, preserving meta's index.
    meta["jornada"] = wk.round().astype("Int64").array
    return meta.drop(columns=["Date_day"], errors="ignore")

def _edge_bins(edge: pd.Series,
               bins=(-np.inf, 0.0, 0.02, 0.05, np.inf),
               labels=("<0%","0–2%","2–5%","≥5%")):
    return pd.cut(edge, bins=bins, labels=labels, include_lowest=True, right=False)

# ---------- FEATURE AND META PREPARATION ----------
# Mapping between the integer class codes and their one-letter labels
# (presumably Away / Draw / Home — confirm against how 'target' is encoded).
CLASS2TXT = {0:"A", 1:"D", 2:"H"}
TXT2IDX   = {"A":0, "D":1, "H":2}

# Columns removed from the feature set (same decisions as in the earlier runs).
drop_common = [
    'FTR','target','Date','has_xg_data',
    'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
    'HomeTeam_norm','AwayTeam_norm','row_id'
]
# Extra columns to drop depending on whether the odds-based variant is active.
drop_mode = (['overround','pimp2','B365D'] if WITH_ODDS else
             ['fase_temporada_inicio','fase_temporada_mitad',
              'B365H','B365D','B365A','overround','pimp1','pimpx','pimp2'])
# dict.fromkeys removes duplicates while keeping the original order.
drop_cols = list(dict.fromkeys(drop_common + drop_mode))

# Columns carried alongside the features for reporting; only those present in df are kept.
need_meta = ["Season","Date","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A","Wk"]

y_all = df['target']
X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors='ignore').copy()

# A row is usable only if it has a target, has every remaining feature and,
# in the odds variant, has both B365H and B365A.
valid = y_all.notna()
if WITH_ODDS:
    for c in ['B365H','B365A']:
        if c in df.columns:
            valid &= df[c].notna()
valid &= X_all.notna().all(axis=1)

X_all = X_all.loc[valid].copy()
y_all = y_all.loc[valid].astype(int)
meta_all = df.loc[valid, [c for c in need_meta if c in df.columns]].copy()
meta_all["Date"] = pd.to_datetime(meta_all["Date"], errors="coerce")

# Build the robust 'jornada' column ONCE.
# NOTE(review): boolean masks built from meta_all are later applied to X_all / y_all
# with .loc, so the frame returned here must keep the same index — confirm.
meta_all = _fill_jornada(meta_all)

# ---------- DETECT LATEST SEASON AND PENDING MATCHDAYS ----------
latest_season = int(meta_all["Season"].dropna().max())
TEST_SEASON   = latest_season  # to pin a season instead, e.g. 2025: TEST_SEASON = 2025

# Per-season match-log files (CSV and JSON) that this cell updates.
ml_path_csv  = PER_SEASON_DIR / f"matchlog_{TEST_SEASON}.csv"
ml_path_json = PER_SEASON_DIR / f"matchlog_{TEST_SEASON}.json"

# last_done = highest matchday already stored in the CSV (0 when there is no
# file, no 'jornada' column, or no numeric value in it).
if ml_path_csv.exists():
    ml_exist = pd.read_csv(ml_path_csv)
    last_done = pd.to_numeric(ml_exist.get("jornada", pd.Series(dtype="Int64")), errors="coerce").max()
    if pd.isna(last_done):
        last_done = 0
else:
    ml_exist = pd.DataFrame()
    last_done = 0

# Actual calendar of the season: one row per matchday with its first date (dmin)
# and number of matches (n), ordered chronologically.
g = (meta_all[meta_all["Season"] == TEST_SEASON]
         .groupby("jornada", dropna=True)
         .agg(dmin=("Date","min"), n=("jornada","size"))
         .reset_index()
         .sort_values(["dmin","jornada"]))

# Pending = matchdays numbered above the last stored one.
# NOTE(review): a matchday numbered below last_done that is missing from the CSV
# is never picked up by this rule — confirm that is acceptable.
pending = g.loc[g["jornada"] > int(last_done), "jornada"].astype(int).tolist()
print(f"[UPDATE] Temporada {TEST_SEASON} | Última jornada guardada: {int(last_done)} | Pendientes: {pending}")

# Incremental update: for each pending matchday, fit a scaler + logistic regression
# on every row dated before that matchday, predict its matches, compute betting
# metrics and append the rows to the season match-log (CSV + JSON).
if not pending:
    print("No hay jornadas nuevas que calcular. Salgo sin cambios.")
else:
    # ---------- TRAIN ONLY THE PENDING MATCHDAYS ----------
    feat_cols = [c for c in X_all.columns if c != "Season"]
    idx_of = {"A":0,"D":1,"H":2}
    labels = np.array(["A","D","H"])
    updates = []

    # Fast/stable configuration
    logreg_kw = dict(
        solver="lbfgs",
        multi_class="multinomial",
        penalty="l2",
        C=0.5,
        tol=1e-3,
        max_iter=300,
        random_state=RANDOM_STATE,
    )

    t0_all = time.time()
    for wk in pending:
        # First date of this matchday; it is the cut-off for the training set.
        d_start = g.loc[g["jornada"] == wk, "dmin"].iloc[0]

        # Test = matches of this matchday; train = every row (any season) dated
        # strictly before the matchday starts.
        # NOTE(review): both masks are indexed like meta_all and are applied to
        # X_all / y_all below, which requires the three objects to share an index.
        te_mask = (meta_all["Season"] == TEST_SEASON) & (meta_all["jornada"] == wk)
        tr_mask = (meta_all["Date"] < d_start)

        if not te_mask.any() or not tr_mask.any():
            continue

        # NumPy float32 for speed
        X_tr = X_all.loc[tr_mask, feat_cols].to_numpy(dtype=np.float32)
        y_tr = y_all.loc[tr_mask].to_numpy()
        X_te = X_all.loc[te_mask, feat_cols].to_numpy(dtype=np.float32)
        y_te = y_all.loc[te_mask].to_numpy()

        # A classifier cannot be fitted with fewer than two classes.
        if np.unique(y_tr).size < 2:
            continue

        # The scaler is fitted on the training rows only and then applied to the test rows.
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr).astype(np.float32, copy=False)
        X_te_s = scaler.transform(X_te).astype(np.float32, copy=False)

        mdl = LogisticRegression(**logreg_kw)
        mdl.fit(X_tr_s, y_tr)

        proba = mdl.predict_proba(X_te_s)
        yhat  = mdl.predict(X_te_s)

        # Test-set metadata; odds columns are taken in A, D, H order to match P below.
        meta_te = meta_all.loc[te_mask, ["Season","Date","jornada","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]].reset_index(drop=True)
        odds_te = meta_te[["B365A","B365D","B365H"]].to_numpy(dtype=np.float32)

        # Probabilities ordered A, D, H; a class absent from mdl.classes_ stays NaN.
        P = np.full((proba.shape[0], 3), np.nan, dtype=np.float32)
        for col_idx, cls in enumerate(mdl.classes_):
            lab = CLASS2TXT.get(int(cls))
            if lab == "A": P[:,0] = proba[:, col_idx]
            if lab == "D": P[:,1] = proba[:, col_idx]
            if lab == "H": P[:,2] = proba[:, col_idx]

        # Predicted class: label, probability, odds and edge (prob * odds - 1).
        pred_txt = np.vectorize({0:"A",1:"D",2:"H"}.get)(yhat)
        pred_idx = np.vectorize(idx_of.get)(pred_txt)
        pred_prob = P[np.arange(P.shape[0]), pred_idx]
        pred_odds = odds_te[np.arange(odds_te.shape[0]), pred_idx]
        edge_pred = pred_prob * pred_odds - 1.0

        # Value pick: the outcome with the highest expected value per match.
        EV = P * odds_te - 1.0
        best_idx = EV.argmax(axis=1)
        value_pick = labels[best_idx]
        value_ev   = EV[np.arange(EV.shape[0]), best_idx]
        value_prob = P[np.arange(P.shape[0]), best_idx]
        value_odds = odds_te[np.arange(odds_te.shape[0]), best_idx]

        correct   = (yhat == y_te)
        value_hit = (np.vectorize(idx_of.get)(value_pick) == y_te)

        # Flat-stake bet on the predicted class for every match.
        bet_return = np.where(correct, pred_odds * STAKE, 0.0)
        net_profit = bet_return - STAKE

        # Value bets: with a positive threshold only picks whose EV reaches it are
        # played; with None or a threshold <= 0 every match is played.
        thr_val   = 0.0 if (MIN_EDGE_VALUE is None) else float(MIN_EDGE_VALUE)
        use_value = (value_ev >= thr_val) if (thr_val > 0.0) else np.ones(len(value_ev), dtype=bool)
        value_bet_return = np.where(value_hit, value_odds * STAKE, 0.0)
        value_bet_return = np.where(use_value, value_bet_return, 0.0)
        # The stake is only charged on matches where a value bet is placed.
        value_net_profit = value_bet_return - np.where(use_value, STAKE, 0.0)

        out = meta_te.copy()
        out["true_result"]      = y_te
        out["predicted_result"] = yhat
        out["Pred"]             = pred_txt
        out["predicted_prob"]   = pred_prob
        out["predicted_odds"]   = pred_odds
        out["edge"]             = edge_pred

        out["value_pick"]       = value_pick
        out["value_ev"]         = value_ev
        out["value_prob"]       = value_prob
        out["value_odds"]       = value_odds
        out["use_value"]        = use_value

        out["bet_return"]       = bet_return
        out["net_profit"]       = net_profit
        out["value_bet_return"] = value_bet_return
        out["value_net_profit"] = value_net_profit

        out["Correct"]          = np.where(correct, "✓", "✗")
        out["value_correct"]    = np.where(value_hit, "✓", "✗")
        out["edge_bin"]         = _edge_bins(out["edge"])
        out["value_bin"]        = _edge_bins(out["value_ev"])

        # Final formatting: dates as YYYY-MM-DD strings, matchday as nullable integer.
        out["Date"]    = pd.to_datetime(out["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
        out["jornada"] = pd.to_numeric(out["jornada"], errors="coerce").astype("Int64")

        updates.append(out)
        print(f"[UPDATE] Jornada {wk}: añadidas {len(out)} filas.")

    ml_new = pd.concat(updates, axis=0, ignore_index=True) if updates else pd.DataFrame()
    if ml_new.empty:
        print("No se generó ninguna fila nueva (¿sin partidos en pendientes?).")
    else:
        # If a stored log exists, concatenate and avoid duplicates by natural keys.
        if not ml_exist.empty:
            # Align columns: any column missing on either side is added as NaN.
            for c in ml_exist.columns:
                if c not in ml_new.columns:
                    ml_new[c] = np.nan
            for c in ml_new.columns:
                if c not in ml_exist.columns:
                    ml_exist[c] = np.nan
            # Order the columns like the stored log.
            ml_new = ml_new[ml_exist.columns.tolist()]
            ml_all = pd.concat([ml_exist, ml_new], ignore_index=True)
        else:
            ml_all = ml_new

        # Avoid duplicates (same Season + Date + home + away); the newest row wins.
        key_cols = ["Season","Date","HomeTeam_norm","AwayTeam_norm"]
        key_cols = [c for c in key_cols if c in ml_all.columns]
        ml_all = ml_all.drop_duplicates(subset=key_cols, keep="last").sort_values(["Season","jornada","Date"]).reset_index(drop=True)

        # IMPORTANT: only 'jornada' (not 'Wk') is kept in the outputs.
        if "Wk" in ml_all.columns:
            ml_all = ml_all.drop(columns=["Wk"])

        # Save
        ml_all.to_csv(ml_path_csv, index=False)
        ml_all.to_json(ml_path_json, orient="records", force_ascii=False, indent=2)
        print(f"[OK] Guardado actualizado:\n- {ml_path_csv}\n- {ml_path_json}\nTotal filas {len(ml_all):,} (Temporada {TEST_SEASON})")

    print(f"Tiempo total update: {time.time()-t0_all:,.1f}s")

==> Cargando df_final: /content/data/03_features/df_final.parquet
Filas df: 7,290 | Columnas: 76
[UPDATE] Temporada 2025 | Última jornada guardada: 6 | Pendientes: []
No hay jornadas nuevas que calcular. Salgo sin cambios.


  wk_series = wk_series.combine_first(meta["Wk_cal"])


Con SMOTE:

In [24]:
# # ==========================================================
# # MATCH-LOG (walk-forward por jornada) — versión SMOTE
# # (construcción completa, una sola vez)
# # ==========================================================
# import pandas as pd
# import numpy as np
# from pathlib import Path
# from sklearn.preprocessing import StandardScaler
# from sklearn.linear_model import LogisticRegression
# from sklearn.exceptions import ConvergenceWarning
# import warnings, json, sys, subprocess

# # ---------- dependencias SMOTE ----------
# warnings.filterwarnings("ignore", category=ConvergenceWarning)
# try:
#     from imblearn.over_sampling import SMOTE
# except Exception:
#     subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "imbalanced-learn"])
#     from imblearn.over_sampling import SMOTE

# # -------------------------
# # Rutas y carga base
# # -------------------------
# ROOT = Path(".")
# DATA = ROOT / "data"
# FEAT = DATA / "03_features"
# PROC = DATA / "02_processed"
# OUT  = ROOT / "outputs"
# OUT.mkdir(parents=True, exist_ok=True)

# df_path = FEAT / "df_final.parquet"
# cal_paths = [PROC / "wk_actualizado_2005_2025.parquet", PROC / "wk_2005_2025.parquet"]

# assert df_path.exists(), f"No existe {df_path}"
# df = pd.read_parquet(df_path).reset_index(drop=True)
# print(f"==> Cargando df_final: {df_path}\nFilas: {len(df):,} | Columnas: {len(df.columns)}")

# # -------------------------
# # Utilidades calendario/jornada
# # -------------------------
# def _safe_to_datetime(s):
#     return pd.to_datetime(s, errors="coerce")

# def _load_calendar_unique(paths):
#     """Calendario único por (Season, Date_day) con Wk_cal entero."""
#     for p in paths:
#         if p.exists():
#             cal = pd.read_parquet(p).copy()
#             need = {"Season","Date","Wk"}
#             if not need.issubset(cal.columns):
#                 continue
#             cal["Date"] = _safe_to_datetime(cal["Date"])
#             cal["Date_day"] = cal["Date"].dt.date
#             cal["Wk"] = pd.to_numeric(cal["Wk"], errors="coerce")
#             cal = cal.dropna(subset=["Season","Date_day"])

#             # Elegir Wk > 0 si existe para ese día, si no, cualquier Wk disponible.
#             cal.sort_values(["Season","Date_day","Wk"], inplace=True)
#             cal_pos = cal[cal["Wk"] > 0].drop_duplicates(["Season","Date_day"], keep="first")
#             cal_any = cal.drop_duplicates(["Season","Date_day"], keep="first")
#             g = cal_pos.set_index(["Season","Date_day"]).combine_first(
#                     cal_any.set_index(["Season","Date_day"])
#                 ).reset_index()
#             g.rename(columns={"Wk":"Wk_cal"}, inplace=True)
#             g["Wk_cal"] = pd.to_numeric(g["Wk_cal"], errors="coerce").round().astype("Int64")
#             return g[["Season","Date_day","Wk_cal"]]
#     return None

# def build_jornada(meta: pd.DataFrame, cal_unique: pd.DataFrame | None) -> pd.Series:
#     """
#     Devuelve 'jornada' con prioridad:
#       1) Wk propio > 0 (si existiera),
#       2) calendario por (Season, Date_day),
#       3) fallback por orden de días dentro de cada Season (1..N).
#     Nunca devuelve 0/negativos. Tipo Int64.
#     """
#     m = meta.copy()
#     m["Date"] = _safe_to_datetime(m["Date"])
#     m["Date_day"] = m["Date"].dt.date

#     if "Wk" in m.columns:
#         wk_own = pd.to_numeric(m["Wk"], errors="coerce").where(lambda x: x > 0)
#     else:
#         wk_own = pd.Series(np.nan, index=m.index, dtype="float64")

#     if cal_unique is not None:
#         m = m.merge(cal_unique, on=["Season","Date_day"], how="left")
#         wk_cal = pd.to_numeric(m["Wk_cal"], errors="coerce")
#         jornada = wk_own.fillna(wk_cal)
#     else:
#         jornada = wk_own

#     if jornada.isna().any():
#         tmp = (m[["Season","Date_day"]]
#                .drop_duplicates()
#                .sort_values(["Season","Date_day"]))
#         tmp["j_fallback"] = tmp.groupby("Season").cumcount() + 1
#         m = m.merge(tmp, on=["Season","Date_day"], how="left")
#         jornada = jornada.fillna(m["j_fallback"])

#     jornada = pd.to_numeric(jornada, errors="coerce")
#     jornada = jornada.where(jornada > 0)
#     jornada = jornada.round().astype("Int64")
#     return jornada

# # -------------------------
# # Construcción de 'target' robusto
# # -------------------------
# CLASS2TXT = {0:"A", 1:"D", 2:"H"}
# TXT2IDX   = {"A":0, "D":1, "H":2}

# def build_target(df_in: pd.DataFrame) -> pd.Series:
#     """
#     Construye etiqueta 0/1/2 (A/D/H) desde:
#       - 'target' si existe,
#       - si no, 'FTR' con mapping {'A':0,'D':1,'H':2}.
#     Devuelve Int64 con NaNs donde no se pueda mapear.
#     """
#     if "target" in df_in.columns:
#         t = pd.to_numeric(df_in["target"], errors="coerce").astype("Int64")
#         bad = ~t.isin([0,1,2])
#         if bad.any() and "FTR" in df_in.columns:
#             t_ftr = df_in["FTR"].map(TXT2IDX).astype("Int64")
#             t = t.mask(bad, t_ftr)
#     elif "FTR" in df_in.columns:
#         t = df_in["FTR"].map(TXT2IDX).astype("Int64")
#     else:
#         raise ValueError("No encuentro 'target' ni 'FTR' para construir la etiqueta.")
#     return t.where(t.isin([0,1,2]))

# # -------------------------
# # Preparar datos + inyectar 'jornada'
# # -------------------------
# cal_u = _load_calendar_unique(cal_paths)

# df["Date"] = _safe_to_datetime(df["Date"])
# meta_cols = ["Season","Date","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]
# missing = [c for c in meta_cols if c not in df.columns]
# if missing:
#     raise ValueError(f"Faltan columnas en df_final: {missing}")

# df["jornada"] = build_jornada(df[["Season","Date","Wk"] if "Wk" in df.columns else ["Season","Date"]], cal_u)
# if (df["jornada"].fillna(0) <= 0).any():
#     raise RuntimeError("Jornadas no válidas detectadas (<=0). Revisa calendario/fechas.")

# # Row_id único para trazabilidad (no entra en X)
# df = df.reset_index(drop=False).rename(columns={"index":"row_id"})
# assert df["row_id"].is_unique, "row_id no es único."

# # -------------------------
# # Construcción X / y / meta
# # -------------------------
# drop_common = [
#     'FTR','target','Date','has_xg_data',
#     'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
#     'HomeTeam_norm','AwayTeam_norm','row_id'
# ]
# # Cuotas solo en meta (no en X)
# drop_mode = ['B365H','B365D','B365A','overround','pimp1','pimpx','pimp2']
# drop_cols = list(dict.fromkeys(drop_common + drop_mode))

# target_ser = build_target(df)

# X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore").copy()
# X_all = X_all.replace([np.inf, -np.inf], np.nan)

# meta_all = df.loc[:, ["row_id"] + meta_cols + ["jornada"]].copy()
# for c in ["B365H","B365D","B365A"]:
#     meta_all[c] = pd.to_numeric(meta_all[c], errors="coerce")

# valid = target_ser.notna()
# valid &= X_all.notna().all(axis=1)
# valid &= meta_all[["B365H","B365D","B365A"]].notna().all(axis=1)

# X_all = X_all.loc[valid].copy()
# y_all = target_ser.loc[valid].astype(int)
# meta_all = meta_all.loc[valid].copy()

# if "Season" not in X_all.columns:
#     X_all["Season"] = df.loc[valid, "Season"].values

# # -------------------------
# # Helper bin de edge
# # -------------------------
# def _edge_bins(edge: pd.Series,
#                bins=(-np.inf, 0.0, 0.02, 0.05, np.inf),
#                labels=("<0%","0–2%","2–5%","≥5%")):
#     return pd.cut(edge, bins=bins, labels=labels, include_lowest=True, right=False)

# # -------------------------
# # Walk-forward por jornada (con SMOTE)
# # -------------------------
# def _walkforward_one_season(test_season: int,
#                             *,
#                             stake=1.0,
#                             min_edge_pred=0.0,
#                             min_edge_value=None,
#                             random_state=42,
#                             jornadas_limit: set | None = None,
#                             use_smote=True):
#     m_season = meta_all[meta_all["Season"] == test_season].copy()
#     if m_season.empty:
#         return pd.DataFrame()

#     g = (m_season.groupby("jornada", dropna=True)
#                  .agg(dmin=("Date","min"), n=("jornada","size"))
#                  .reset_index()
#                  .sort_values(["dmin","jornada"]))
#     if g.empty:
#         return pd.DataFrame()

#     parts = []
#     for _, row in g.iterrows():
#         wk = int(row["jornada"])
#         if (jornadas_limit is not None) and (wk not in jornadas_limit):
#             continue

#         d_start = row["dmin"]
#         idx_te_mask = (meta_all["Season"] == test_season) & (meta_all["jornada"] == wk)
#         idx_tr_mask = (meta_all["Date"] < d_start)
#         if not idx_te_mask.any() or not idx_tr_mask.any():
#             continue

#         feat_cols = [c for c in X_all.columns if c != "Season"]

#         # Aseguramos ARRAYS NumPy (evita desalineaciones y shapes raras)
#         X_tr = X_all.loc[idx_tr_mask, feat_cols].to_numpy()
#         y_tr = y_all.loc[idx_tr_mask].to_numpy()
#         X_te = X_all.loc[idx_te_mask, feat_cols].to_numpy()
#         y_te = y_all.loc[idx_te_mask].to_numpy()

#         if len(np.unique(y_tr)) < 2:
#             continue

#         scaler = StandardScaler()
#         X_tr_s = scaler.fit_transform(X_tr)
#         X_te_s = scaler.transform(X_te)

#         # ---------- SMOTE ----------
#         if use_smote:
#             try:
#                 _, counts = np.unique(y_tr, return_counts=True)
#                 minc = int(counts.min())
#                 if minc > 1:
#                     k = max(1, min(5, minc - 1))  # seguro (1..5)
#                     sm = SMOTE(random_state=random_state, k_neighbors=k)
#                     X_tr_s, y_tr = sm.fit_resample(X_tr_s, y_tr)
#             except Exception:
#                 pass

#         mdl = LogisticRegression(
#             solver="saga", penalty="l2", max_iter=1000, random_state=random_state
#         )
#         mdl.fit(X_tr_s, y_tr)

#         proba = mdl.predict_proba(X_te_s)   # (n_te, n_clases)
#         yhat  = mdl.predict(X_te_s)         # (n_te,)

#         # Meta y odds POSICIONALES (con reset_index para longitud EXACTA)
#         meta_te = m_season.loc[idx_te_mask, ["Season","Date","jornada","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]].reset_index(drop=True)
#         odds_te = meta_te[["B365A","B365D","B365H"]].to_numpy()   # orden A,D,H

#         # Reordenar proba a columnas A,D,H según clases del modelo
#         P = np.full((proba.shape[0], 3), np.nan, dtype=float)
#         for col_idx, cls in enumerate(mdl.classes_):
#             label = CLASS2TXT.get(int(cls))
#             if label == "A": P[:,0] = proba[:, col_idx]
#             if label == "D": P[:,1] = proba[:, col_idx]
#             if label == "H": P[:,2] = proba[:, col_idx]

#         # Predicción textual y edge
#         idx_of = {"A":0,"D":1,"H":2}
#         pred_txt = np.vectorize({0:"A",1:"D",2:"H"}.get)(yhat)
#         pred_idx = np.vectorize(idx_of.get)(pred_txt)
#         pred_prob = P[np.arange(P.shape[0]), pred_idx]
#         pred_odds = odds_te[np.arange(odds_te.shape[0]), pred_idx]
#         edge_pred = pred_prob * pred_odds - 1.0

#         # Apuesta de valor
#         EV = P * odds_te - 1.0
#         best_idx = EV.argmax(axis=1)                # 0=A,1=D,2=H
#         labels = np.array(["A","D","H"])
#         value_pick = labels[best_idx]
#         value_ev   = EV[np.arange(EV.shape[0]), best_idx]
#         value_prob = P[np.arange(P.shape[0]), best_idx]
#         value_odds = odds_te[np.arange(odds_te.shape[0]), best_idx]

#         # Métricas
#         true_result = y_te
#         predicted_result = yhat
#         correct = (predicted_result == true_result)
#         value_hit = (np.vectorize(idx_of.get)(value_pick) == true_result)

#         stake = 1.0
#         bet_return = np.where(correct, pred_odds * stake, 0.0)
#         net_profit = bet_return - stake

#         thr_val = 0.0 if (min_edge_value is None) else min_edge_value
#         use_value = (value_ev >= (0.0 if min_edge_value is None else min_edge_value)) if (thr_val and thr_val > 0) else np.ones(len(value_ev), dtype=bool)
#         value_bet_return = np.where(value_hit, value_odds * stake, 0.0)
#         value_bet_return = np.where(use_value, value_bet_return, 0.0)
#         value_net_profit = value_bet_return - np.where(use_value, stake, 0.0)

#         out = meta_te.copy()  # NO incluimos 'Wk', solo 'jornada'
#         out["true_result"]      = true_result
#         out["predicted_result"] = predicted_result
#         out["Pred"]             = pred_txt
#         out["predicted_prob"]   = pred_prob
#         out["predicted_odds"]   = pred_odds
#         out["edge"]             = edge_pred

#         out["value_pick"]       = value_pick
#         out["value_ev"]         = value_ev
#         out["value_prob"]       = value_prob
#         out["value_odds"]       = value_odds
#         out["use_value"]        = use_value

#         out["bet_return"]       = bet_return
#         out["net_profit"]       = net_profit
#         out["value_bet_return"] = value_bet_return
#         out["value_net_profit"] = value_net_profit

#         out["Correct"]          = np.where(correct, "✓", "✗")
#         out["value_correct"]    = np.where(value_hit, "✓", "✗")

#         out["edge_bin"]  = _edge_bins(out["edge"])
#         out["value_bin"] = _edge_bins(out["value_ev"])

#         parts.append(out)

#     if not parts:
#         return pd.DataFrame()

#     ml = pd.concat(parts, axis=0, ignore_index=True)
#     ml["Date"] = pd.to_datetime(ml["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
#     ml["jornada"] = pd.to_numeric(ml["jornada"], errors="coerce").round().astype("Int64")
#     return ml

# def build_matchlog_grid_smote(df_source: pd.DataFrame,
#                               out_dir: Path,
#                               *,
#                               model_name="smote",
#                               stake=1.0,
#                               min_edge_pred=0.0,
#                               min_edge_value=None,
#                               random_state=42):

#     per_season_dir = out_dir / f"matchlogs_{model_name}"
#     per_season_dir.mkdir(parents=True, exist_ok=True)

#     seasons_all = sorted(df_source["Season"].dropna().astype(int).unique())
#     season_summary = []

#     for season in seasons_all:
#         try:
#             ml = _walkforward_one_season(
#                 season,
#                 stake=stake,
#                 min_edge_pred=min_edge_pred,
#                 min_edge_value=min_edge_value,
#                 random_state=random_state,
#                 jornadas_limit=None,
#                 use_smote=True
#             )
#             if ml.empty:
#                 print(f"[{model_name}] Season {season}: sin filas válidas.")
#                 continue

#             if (ml["jornada"].fillna(0) <= 0).any():
#                 raise RuntimeError(f"Season {season}: detectadas jornadas <= 0 en output.")

#             n_pred = len(ml)
#             roi_pred = float(ml["net_profit"].sum() / (stake * n_pred)) if n_pred > 0 else np.nan
#             n_val = int(ml["use_value"].sum())
#             roi_val = float(ml.loc[ml["use_value"], "value_net_profit"].sum() / (stake * n_val)) if n_val > 0 else np.nan

#             csv_path  = per_season_dir / f"matchlog_{season}.csv"
#             json_path = per_season_dir / f"matchlog_{season}.json"
#             ml.to_csv(csv_path, index=False)
#             ml.to_json(json_path, orient="records", force_ascii=False, indent=2)
#             print(f"[{model_name}] Season {season}: guardado match-log ({len(ml)} filas)")

#             season_summary.append({
#                 "model": model_name,
#                 "train_mode": "walk-forward por jornada (SMOTE)",
#                 "test_season": int(season),
#                 "n_pred_bets": int(n_pred),
#                 "roi_pred": roi_pred,
#                 "profit_pred": float(ml["net_profit"].sum()),
#                 "n_value_bets": int(n_val),
#                 "roi_value": roi_val,
#                 "profit_value": float(ml.loc[ml['use_value'], 'value_net_profit'].sum() if n_val > 0 else 0.0),
#                 "min_edge_pred": float(min_edge_pred),
#                 "min_edge_value": float(min_edge_pred if (min_edge_value is None) else min_edge_value),
#                 "stake": float(stake),
#             })
#         except Exception as e:
#             print(f"[MATCHLOG {model_name.upper()} SKIP] Season {season} → {e}")

#     if season_summary:
#         df_sum = pd.DataFrame(season_summary).sort_values("test_season")
#         df_sum.to_csv(out_dir / f"matchlog_season_summary_{model_name}.csv", index=False)
#         (out_dir / f"matchlog_season_summary_{model_name}.json").write_text(
#             json.dumps(season_summary, ensure_ascii=False, indent=2),
#             encoding="utf-8"
#         )
#         print(f"Guardados:\n- {out_dir/f'matchlog_season_summary_{model_name}.csv'}\n- {out_dir/f'matchlog_season_summary_{model_name}.json'}")
#     else:
#         print(f"Sin temporadas válidas para exportar matchlogs ({model_name}).")

# # -------------------------
# # EJECUCIÓN COMPLETA (SMOTE)
# # -------------------------
# build_matchlog_grid_smote(
#     df_source=df,
#     out_dir=OUT,
#     model_name="smote",
#     stake=1.0,
#     min_edge_pred=0.00,
#     min_edge_value=None,
#     random_state=42,
# )

# # -------------------------
# # CHEQUEO FINAL
# # -------------------------
# for f in sorted((OUT / "matchlogs_smote").glob("matchlog_*.csv"))[:3]:
#     tmp = pd.read_csv(f)
#     assert "Wk" not in tmp.columns, f"{f} contiene Wk."
#     assert (tmp["jornada"].fillna(0) > 0).all(), f"{f} tiene jornada <= 0."
# print("Chequeo final OK (SMOTE): 'jornada' presente y válida en outputs; 'Wk' eliminado.")


In [25]:
# ==========================================================
# ACTUALIZACIÓN INCREMENTAL 2025 — versión SMOTE
# (añade ÚNICAMENTE las jornadas nuevas)
# ==========================================================
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.exceptions import ConvergenceWarning
import warnings, json, sys, subprocess

# Silence scikit-learn convergence warnings for the whole cell.
warnings.filterwarnings("ignore", category=ConvergenceWarning)
# Import SMOTE; if that fails for any reason, install imbalanced-learn (unpinned)
# into the running interpreter and retry the import once.
try:
    from imblearn.over_sampling import SMOTE
except Exception:
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", "imbalanced-learn"])
    from imblearn.over_sampling import SMOTE

# Project paths, relative to the current working directory.
# NOTE(review): these reassign ROOT/DATA/FEAT/PROC/OUT, which the first cell of the
# notebook already defines from Path.cwd() — confirm the override is intended.
ROOT = Path(".")
DATA = ROOT / "data"
FEAT = DATA / "03_features"
PROC = DATA / "02_processed"
OUT  = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

# Load the final feature table with a fresh 0..n-1 index.
df_path = FEAT / "df_final.parquet"
assert df_path.exists(), f"No existe {df_path}"
df = pd.read_parquet(df_path).reset_index(drop=True)

# ---------- helpers reusados ----------
def _safe_to_datetime(s):
    return pd.to_datetime(s, errors="coerce")

def _load_calendar_unique(paths):
    for p in paths:
        if p.exists():
            cal = pd.read_parquet(p).copy()
            need = {"Season","Date","Wk"}
            if not need.issubset(cal.columns):
                continue
            cal["Date"] = _safe_to_datetime(cal["Date"])
            cal["Date_day"] = cal["Date"].dt.date
            cal["Wk"] = pd.to_numeric(cal["Wk"], errors="coerce")
            cal = cal.dropna(subset=["Season","Date_day"])
            cal.sort_values(["Season","Date_day","Wk"], inplace=True)
            cal_pos = cal[cal["Wk"] > 0].drop_duplicates(["Season","Date_day"], keep="first")
            cal_any = cal.drop_duplicates(["Season","Date_day"], keep="first")
            g = cal_pos.set_index(["Season","Date_day"]).combine_first(
                    cal_any.set_index(["Season","Date_day"])
                ).reset_index()
            g.rename(columns={"Wk":"Wk_cal"}, inplace=True)
            g["Wk_cal"] = pd.to_numeric(g["Wk_cal"], errors="coerce").round().astype("Int64")
            return g[["Season","Date_day","Wk_cal"]]
    return None

def build_jornada(meta: pd.DataFrame, cal_unique: pd.DataFrame | None) -> pd.Series:
    m = meta.copy()
    m["Date"] = _safe_to_datetime(m["Date"])
    m["Date_day"] = m["Date"].dt.date
    if "Wk" in m.columns:
        wk_own = pd.to_numeric(m["Wk"], errors="coerce").where(lambda x: x > 0)
    else:
        wk_own = pd.Series(np.nan, index=m.index, dtype="float64")
    if cal_unique is not None:
        m = m.merge(cal_unique, on=["Season","Date_day"], how="left")
        wk_cal = pd.to_numeric(m["Wk_cal"], errors="coerce")
        jornada = wk_own.fillna(wk_cal)
    else:
        jornada = wk_own
    if jornada.isna().any():
        tmp = (m[["Season","Date_day"]]
               .drop_duplicates()
               .sort_values(["Season","Date_day"]))
        tmp["j_fallback"] = tmp.groupby("Season").cumcount() + 1
        m = m.merge(tmp, on=["Season","Date_day"], how="left")
        jornada = jornada.fillna(m["j_fallback"])
    jornada = pd.to_numeric(jornada, errors="coerce").where(lambda x: x>0).round().astype("Int64")
    return jornada

# Mapping between the integer class codes and their one-letter labels
# (presumably Away / Draw / Home — confirm against how 'target' is encoded).
CLASS2TXT = {0:"A", 1:"D", 2:"H"}
TXT2IDX   = {"A":0, "D":1, "H":2}

# Minimal preparation (same as the full-build cell)
df["Date"] = _safe_to_datetime(df["Date"])
meta_cols = ["Season","Date","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]
# Fail fast if any metadata column is missing from the feature table.
for c in meta_cols:
    if c not in df.columns:
        raise ValueError(f"Falta {c} en df_final")

# Candidate calendar files, in priority order.
cal_paths = [PROC / "wk_actualizado_2005_2025.parquet", PROC / "wk_2005_2025.parquet"]
cal_u = _load_calendar_unique(cal_paths)
# 'Wk' is passed to build_jornada only when df has it.
df["jornada"] = build_jornada(df[["Season","Date","Wk"] if "Wk" in df.columns else ["Season","Date"]], cal_u)
# Missing matchdays are treated as 0 here, so they also trigger the error.
if (df["jornada"].fillna(0) <= 0).any():
    raise RuntimeError("Jornadas no válidas detectadas (<=0). Revisa calendario/fechas.")

# Keep the current positional index as a 'row_id' column for traceability.
df = df.reset_index(drop=False).rename(columns={"index":"row_id"})

# Columns removed from the feature matrix.
drop_common = [
    'FTR','target','Date','has_xg_data',
    'a_squad_size_prev_season','away_form_gd_6','home_form_gd_6',
    'HomeTeam_norm','AwayTeam_norm','row_id'
]
# Odds and odds-derived columns are kept out of X in this variant.
drop_mode = ['B365H','B365D','B365A','overround','pimp1','pimpx','pimp2']
# dict.fromkeys removes duplicates while keeping the original order.
drop_cols = list(dict.fromkeys(drop_common + drop_mode))

# Target as nullable integer; anything other than 0/1/2 becomes <NA>.
target_ser = (pd.to_numeric(df["target"], errors="coerce")
              .where(lambda x: x.isin([0,1,2])).astype("Int64"))

# NOTE(review): 'jornada' (and 'Wk', if present) are not in drop_cols, so they
# remain in X_all as features — confirm this is intended.
X_all = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore").copy()
# Infinite values are treated as missing so the validity filter below removes those rows.
X_all = X_all.replace([np.inf, -np.inf], np.nan)
meta_all = df.loc[:, ["row_id"] + meta_cols + ["jornada"]].copy()
for c in ["B365H","B365D","B365A"]:
    meta_all[c] = pd.to_numeric(meta_all[c], errors="coerce")

# A row is usable only with a valid target, complete features and all three odds.
valid = target_ser.notna() & X_all.notna().all(axis=1) & meta_all[["B365H","B365D","B365A"]].notna().all(axis=1)
X_all = X_all.loc[valid].copy()
y_all = target_ser.loc[valid].astype(int)
meta_all = meta_all.loc[valid].copy()
# Make sure 'Season' is available in X_all even if it was dropped above.
if "Season" not in X_all.columns:
    X_all["Season"] = df.loc[valid, "Season"].values

def _edge_bins(edge: pd.Series,
               bins=(-np.inf, 0.0, 0.02, 0.05, np.inf),
               labels=("<0%","0–2%","2–5%","≥5%")):
    return pd.cut(edge, bins=bins, labels=labels, include_lowest=True, right=False)

def _walkforward_one_season_smote_incremental(test_season: int, jornadas_limit: set):
    """
    Walk-forward match log for one season, restricted to selected matchdays.

    For every matchday ("jornada") of `test_season` contained in `jornadas_limit`,
    a StandardScaler -> optional SMOTE -> LogisticRegression pipeline is fitted on
    all valid matches dated strictly before the first match of that matchday and
    used to predict the matches of the matchday.

    Reads the module-level `meta_all`, `X_all` and `y_all` (which share one index).

    Parameters
    ----------
    test_season : season whose matchdays are predicted.
    jornadas_limit : matchday numbers to compute; all others are skipped.

    Returns
    -------
    DataFrame with one row per predicted match (model pick, value pick, odds,
    edge, flat-stake returns and edge buckets), with "Date" formatted as
    YYYY-MM-DD and "jornada" as Int64. An empty DataFrame when nothing could be
    predicted.
    """
    import warnings  # local import: only used to report SMOTE failures

    m_season = meta_all[meta_all["Season"] == test_season]
    if m_season.empty:
        return pd.DataFrame()
    # One row per matchday, ordered by the date of its first match.
    g = (m_season.groupby("jornada", dropna=True)
                 .agg(dmin=("Date","min"), n=("jornada","size"))
                 .reset_index()
                 .sort_values(["dmin","jornada"]))
    if g.empty:
        return pd.DataFrame()

    # Loop invariants. "Season" is kept in X_all for bookkeeping but is not a feature.
    feat_cols = [c for c in X_all.columns if c != "Season"]
    out_cols = ["Season","Date","jornada","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]
    labels = np.array(["A","D","H"])  # position == class id (0=A, 1=D, 2=H)
    stake = 1.0

    parts = []
    for _, row in g.iterrows():
        wk = int(row["jornada"])
        if wk not in jornadas_limit:
            continue

        d_start = row["dmin"]
        idx_te_mask = (meta_all["Season"] == test_season) & (meta_all["jornada"] == wk)
        # Train only on matches played strictly before this matchday starts.
        idx_tr_mask = (meta_all["Date"] < d_start)
        if not idx_te_mask.any() or not idx_tr_mask.any():
            continue

        X_tr = X_all.loc[idx_tr_mask, feat_cols].to_numpy()
        y_tr = y_all.loc[idx_tr_mask].to_numpy()
        X_te = X_all.loc[idx_te_mask, feat_cols].to_numpy()
        y_te = y_all.loc[idx_te_mask].to_numpy()
        if len(np.unique(y_tr)) < 2:
            continue

        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_te_s = scaler.transform(X_te)

        # SMOTE is best-effort: it needs at least 2 samples in the rarest class and
        # k_neighbors < that count. A failure falls back to the unbalanced training
        # set, but is reported instead of being silently swallowed.
        _, counts = np.unique(y_tr, return_counts=True)
        minc = int(counts.min())
        if minc > 1:
            k = max(1, min(5, minc - 1))
            try:
                X_tr_s, y_tr = SMOTE(random_state=42, k_neighbors=k).fit_resample(X_tr_s, y_tr)
            except Exception as exc:
                warnings.warn(
                    f"SMOTE failed for season {test_season}, jornada {wk}; "
                    f"training without resampling: {exc}",
                    RuntimeWarning,
                )

        mdl = LogisticRegression(solver="saga", penalty="l2", max_iter=1000, random_state=42)
        mdl.fit(X_tr_s, y_tr)

        proba = mdl.predict_proba(X_te_s)
        yhat  = mdl.predict(X_te_s)

        # Select from meta_all (the frame the mask was built on) rather than from
        # the season slice; the selected rows are the same.
        meta_te = meta_all.loc[idx_te_mask, out_cols].reset_index(drop=True)
        odds_te = meta_te[["B365A","B365D","B365H"]].to_numpy()

        # Probabilities in fixed [A, D, H] order. A class never seen in training
        # gets probability 0 (not NaN): np.argmax would otherwise select the NaN
        # column as the "best" value bet.
        P = np.zeros((proba.shape[0], 3), dtype=float)
        for col_idx, cls in enumerate(mdl.classes_):
            if 0 <= int(cls) <= 2:
                P[:, int(cls)] = proba[:, col_idx]

        rows = np.arange(P.shape[0])

        # Model pick: class id doubles as the column index into P / odds_te.
        pred_idx = yhat.astype(int)
        pred_txt = labels[pred_idx]
        pred_prob = P[rows, pred_idx]
        pred_odds = odds_te[rows, pred_idx]
        edge_pred = pred_prob * pred_odds - 1.0

        # Value pick: outcome with the highest expected value per unit staked.
        EV = P * odds_te - 1.0
        best_idx = EV.argmax(axis=1)
        value_pick = labels[best_idx]
        value_ev   = EV[rows, best_idx]
        value_prob = P[rows, best_idx]
        value_odds = odds_te[rows, best_idx]

        true_result = y_te
        predicted_result = yhat
        correct = (predicted_result == true_result)
        value_hit = (best_idx == true_result)

        # Flat-stake returns: odds * stake when the pick hits, 0 otherwise.
        bet_return = np.where(correct, pred_odds * stake, 0.0)
        net_profit = bet_return - stake
        # Every match is currently treated as a value bet (no EV threshold).
        use_value = np.ones(len(value_ev), dtype=bool)
        value_bet_return = np.where(value_hit, value_odds * stake, 0.0)
        value_bet_return = np.where(use_value, value_bet_return, 0.0)
        value_net_profit = value_bet_return - np.where(use_value, stake, 0.0)

        out = meta_te.copy()
        out["true_result"]      = true_result
        out["predicted_result"] = predicted_result
        out["Pred"]             = pred_txt
        out["predicted_prob"]   = pred_prob
        out["predicted_odds"]   = pred_odds
        out["edge"]             = edge_pred

        out["value_pick"]       = value_pick
        out["value_ev"]         = value_ev
        out["value_prob"]       = value_prob
        out["value_odds"]       = value_odds
        out["use_value"]        = use_value

        out["bet_return"]       = bet_return
        out["net_profit"]       = net_profit
        out["value_bet_return"] = value_bet_return
        out["value_net_profit"] = value_net_profit

        out["Correct"]          = np.where(correct, "✓", "✗")
        out["value_correct"]    = np.where(value_hit, "✓", "✗")

        out["edge_bin"]  = _edge_bins(out["edge"])
        out["value_bin"] = _edge_bins(out["value_ev"])

        parts.append(out)

    if not parts:
        return pd.DataFrame()

    ml = pd.concat(parts, axis=0, ignore_index=True)
    ml["Date"] = pd.to_datetime(ml["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
    ml["jornada"] = pd.to_numeric(ml["jornada"], errors="coerce").round().astype("Int64")
    return ml

# ---------- detect which 2025 matchdays are still missing ----------
season = 2025
per_season_dir = OUT / "matchlogs_smote"
per_season_dir.mkdir(parents=True, exist_ok=True)

path_csv  = per_season_dir / f"matchlog_{season}.csv"
path_json = per_season_dir / f"matchlog_{season}.json"

# Reuse the "jornada" column already computed and validated on the full `df`
# instead of rebuilding it from a filtered slice: that avoids reloading the
# calendar, does not depend on how build_jornada treats a non-default index, and
# no longer fails when df has no "Wk" column.
_cols_2025 = ["Season","Date"] + (["Wk"] if "Wk" in df.columns else []) + ["jornada"]
meta_2025 = df.loc[df["Season"].astype(int) == season, _cols_2025].copy()
j_all = sorted(int(j) for j in meta_2025["jornada"].dropna().unique())

# Matchdays already present in the saved match log (if any).
if path_csv.exists():
    old = pd.read_csv(path_csv)
    j_done = sorted(int(j) for j in old["jornada"].dropna().unique())
else:
    old = pd.DataFrame()
    j_done = []

# NOTE(review): a matchday counts as done as soon as one of its matches is saved,
# so matches of that matchday added later (e.g. postponed games) are not picked up.
j_todo = sorted(set(j_all) - set(j_done))
print(f"Jornadas en df 2025: {j_all}")
print(f"Jornadas ya guardadas: {j_done}")
print(f"Jornadas por calcular ahora: {j_todo}")

if not j_todo:
    print("No hay jornadas nuevas para 2025. Nada que actualizar.")
else:
    ml_new = _walkforward_one_season_smote_incremental(season, jornadas_limit=set(j_todo))
    if ml_new.empty:
        print("No se generaron filas nuevas (¿odds faltantes o datos insuficientes?).")
    else:
        if old.empty:
            out = ml_new
        else:
            # Append, then keep the newest row per match (keep="last" favours ml_new).
            out = pd.concat([old, ml_new], axis=0)
            out["Date_dt"] = pd.to_datetime(out["Date"], errors="coerce")
            out = (out
                   .sort_values(["Season","jornada","Date_dt","HomeTeam_norm","AwayTeam_norm"])
                   .drop_duplicates(["Season","jornada","Date","HomeTeam_norm","AwayTeam_norm"], keep="last")
                   .drop(columns=["Date_dt"]))
        out.to_csv(path_csv, index=False)
        out.to_json(path_json, orient="records", force_ascii=False, indent=2)
        print(f"[smote] Season {season}: actualizado match-log → {len(out)} filas totales (añadidas {len(ml_new)})")

Jornadas en df 2025: [7]
Jornadas ya guardadas: [1, 2, 3, 4, 5, 6]
Jornadas por calcular ahora: [7]
No se generaron filas nuevas (¿odds faltantes o datos insuficientes?).


## **COMPARACIÓN CON EL MODELO DE BET365**

El modelo de referencia basado en las cuotas de Bet365 predice siempre el resultado con mayor probabilidad implícita, calculada como 1/cuota y normalizada para que las tres probabilidades (local, empate, visitante) sumen 1.

In [26]:
# ==========================================================
# Bet365 baseline + export + comparisons
#   - Includes "jornada" (no Wk column in the outputs)
#   - Paths/outputs consistent with the rest of the pipeline
# ==========================================================
import pandas as pd
import numpy as np
from pathlib import Path
import json

from sklearn.metrics import accuracy_score, log_loss
from sklearn.preprocessing import label_binarize

# -------------------------
# Paths and base load
# -------------------------
# Paths are relative to the current working directory.
ROOT = Path(".")
DATA = ROOT / "data"
FEAT = DATA / "03_features"
PROC = DATA / "02_processed"
OUT  = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

df_path = FEAT / "df_final.parquet"
# Calendar candidates, in priority order.
cal_paths = [PROC / "wk_actualizado_2005_2025.parquet", PROC / "wk_2005_2025.parquet"]

# Reload the feature table from disk with a fresh 0..n-1 index; this replaces
# any `df` left over from earlier cells.
df = pd.read_parquet(df_path).reset_index(drop=True)

# -------------------------
# Utilities: date, target and matchday ("jornada")
# -------------------------
def _safe_to_datetime(s):
    return pd.to_datetime(s, errors="coerce")

TXT2IDX = {"A":0, "D":1, "H":2}

def build_target(df_in: pd.DataFrame) -> pd.Series:
    """
    Build the 0/1/2 (A/D/H) label for each row.

    Source priority:
      - 'target', when present and equal to 0, 1 or 2 after numeric coercion;
      - otherwise 'FTR' mapped with {'A':0,'D':1,'H':2} (used both when 'target'
        is absent and to fill rows whose 'target' is invalid).

    Returns an Int64 Series aligned with `df_in`, with <NA> where no label can be
    derived. Raises ValueError when neither 'target' nor 'FTR' exists.
    """
    has_target = "target" in df_in.columns
    has_ftr = "FTR" in df_in.columns
    if not (has_target or has_ftr):
        raise ValueError("No encuentro 'target' ni 'FTR' para construir la etiqueta.")

    if not has_target:
        return df_in["FTR"].map(TXT2IDX).astype("Int64")

    t_num = pd.to_numeric(df_in["target"], errors="coerce")
    # Blank out anything that is not exactly 0/1/2 *before* casting: casting a
    # non-integral float such as 1.5 to Int64 raises instead of yielding <NA>.
    t = t_num.where(t_num.isin([0, 1, 2])).astype("Int64")

    missing = t.isna()
    if has_ftr and missing.any():
        t_ftr = df_in["FTR"].map(TXT2IDX).astype("Int64")
        t = t.mask(missing, t_ftr)
    return t

def _load_calendar_unique(paths):
    """Calendario único por (Season, Date_day) con Wk_cal entero."""
    for p in paths:
        if p.exists():
            cal = pd.read_parquet(p).copy()
            need = {"Season","Date","Wk"}
            if not need.issubset(cal.columns):
                continue
            cal["Date"] = _safe_to_datetime(cal["Date"])
            cal["Date_day"] = cal["Date"].dt.date
            cal["Wk"] = pd.to_numeric(cal["Wk"], errors="coerce")
            cal = cal.dropna(subset=["Season","Date_day"])

            cal["Wk_pos"] = cal["Wk"].where(cal["Wk"] > 0)
            g = cal.groupby(["Season","Date_day"], as_index=False).agg(Wk_cal=("Wk_pos","median"))
            # si no hay Wk > 0 ese día, usa mediana de Wk (aunque <=0)
            nan_mask = g["Wk_cal"].isna()
            if nan_mask.any():
                g2 = cal.groupby(["Season","Date_day"], as_index=False).agg(Wk_cal=("Wk","median"))
                g2 = g2.set_index(["Season","Date_day"])
                g.loc[nan_mask, "Wk_cal"] = g2.loc[
                    g.loc[nan_mask, ["Season","Date_day"]].set_index(["Season","Date_day"]).index
                ].to_numpy()
            g["Wk_cal"] = pd.to_numeric(g["Wk_cal"], errors="coerce").round().astype("Int64")
            return g[["Season","Date_day","Wk_cal"]]
    return None

def build_jornada(meta: pd.DataFrame, cal_unique: pd.DataFrame | None) -> pd.Series:
    """
    Devuelve 'jornada' con prioridad:
      1) Wk propio > 0 (si existiera),
      2) calendario por (Season, Date_day),
      3) fallback por orden de días dentro de cada Season (1..N).
    Nunca devuelve 0/negativos. Tipo Int64.
    """
    m = meta.copy()
    m["Date"] = _safe_to_datetime(m["Date"])
    m["Date_day"] = m["Date"].dt.date

    if "Wk" in m.columns:
        wk_own = pd.to_numeric(m["Wk"], errors="coerce").where(lambda x: x > 0)
    else:
        wk_own = pd.Series(np.nan, index=m.index, dtype="float64")

    if cal_unique is not None:
        m = m.merge(cal_unique, on=["Season","Date_day"], how="left")
        wk_cal = pd.to_numeric(m["Wk_cal"], errors="coerce")
        jornada = wk_own.fillna(wk_cal)
        m.drop(columns=["Wk_cal"], inplace=True, errors="ignore")
    else:
        jornada = wk_own

    if jornada.isna().any():
        tmp = (m[["Season","Date_day"]]
               .drop_duplicates()
               .sort_values(["Season","Date_day"]))
        tmp["j_fallback"] = tmp.groupby("Season").cumcount() + 1
        m = m.merge(tmp, on=["Season","Date_day"], how="left")
        jornada = jornada.fillna(m["j_fallback"])

    jornada = pd.to_numeric(jornada, errors="coerce")
    jornada = jornada.where(jornada > 0)
    jornada = jornada.round().astype("Int64")
    return jornada

# -------------------------
# Prepare the DataFrame with target and matchday (done once)
# -------------------------
df["Date"] = _safe_to_datetime(df["Date"])
# Labels aligned with df's index.
target_ser = build_target(df)
cal_u = _load_calendar_unique(cal_paths)
# "Wk" is passed only when df has it; build_jornada treats it as optional.
df["jornada"] = build_jornada(df[["Season","Date","Wk"] if "Wk" in df.columns else ["Season","Date"]], cal_u)
# Missing matchdays count as invalid too (NaN -> 0 -> fails the check).
if (df["jornada"].fillna(0) <= 0).any():
    raise RuntimeError("Jornadas no válidas detectadas (<=0). Revisa calendario/fechas.")

# -------------------------
# Baseline Bet365
# -------------------------
def evaluate_bet365_baseline(
    df_full: pd.DataFrame,
    train_until_season: int = 2023,
    test_until_season: int | None = None,
    round_decimals: int = 4,
    stake: float = 1.0,
):
    """
    Evaluate the Bet365 "back the favourite" baseline on a range of seasons.

    The test set is every season in (train_until_season, test_until_season]
    (open-ended when `test_until_season` is None). Implied probabilities are
    1/odds normalised by their sum; the pick is the most probable outcome.

    Parameters
    ----------
    df_full : frame with Season, Date, HomeTeam_norm, AwayTeam_norm, jornada,
        B365H/B365D/B365A and a label source ('target' and/or 'FTR').
    train_until_season : last season excluded from the test set.
    test_until_season : last season included in the test set, or None.
    round_decimals : decimals used for odds and probabilities in the table.
    stake : flat stake per match.

    Returns
    -------
    (table, metrics): a match-by-match DataFrame and a dict with accuracy,
    log_loss, brier, n_test_with_odds, roi, profit_total, investment_total and
    stake. Returns (empty DataFrame, {}) when no match can be evaluated.

    Raises
    ------
    ValueError when a required column is missing.
    """
    odds_cols = ['B365H','B365D','B365A']
    rng = (f"{train_until_season+1}..{test_until_season}"
           if test_until_season is not None else f">{train_until_season}")

    # 1) Select the test seasons
    assert "Season" in df_full.columns, "df debe contener 'Season'."
    mask_test = df_full["Season"] > train_until_season
    if test_until_season is not None:
        mask_test = mask_test & (df_full["Season"] <= test_until_season)
    df_te = df_full.loc[mask_test].copy()
    if df_te.empty:
        print(f"⚠️ No hay TEST disponible tras filtrar (Seasons {rng}).")
        return pd.DataFrame(), {}

    # 2) Label and complete odds are required
    need_cols = odds_cols + ['Date','HomeTeam_norm','AwayTeam_norm','jornada']
    for c in need_cols:
        if c not in df_te.columns:
            raise ValueError(f"Falta columna necesaria en df: {c}")

    # Derive the label from the rows being evaluated rather than from a
    # module-level Series, which is only aligned with the global `df`.
    y_te = build_target(df_te)

    for c in odds_cols:
        df_te[c] = pd.to_numeric(df_te[c], errors="coerce")

    keep = (
        y_te.notna()
        & df_te[odds_cols].notna().all(axis=1)
        & (df_te[odds_cols] > 0).all(axis=1)
    )
    df_te = df_te.loc[keep].copy()
    y_te = y_te.loc[keep].astype(int)

    if df_te.empty:
        print("⚠️ No hay partidos con cuotas B365 completas en el TEST.")
        return pd.DataFrame(), {}

    # 3) Normalised implied probabilities
    inv = 1.0 / df_te[odds_cols]
    overround = inv.sum(axis=1)
    overround = overround.replace(0, np.nan)
    prob_norm = inv.div(overround, axis=0)

    # 4) Probabilities in class order (0=A, 1=D, 2=H) and favourite pick
    bet365_proba = np.column_stack([
        prob_norm['B365A'].to_numpy(),
        prob_norm['B365D'].to_numpy(),
        prob_norm['B365H'].to_numpy()
    ])
    bet365_pred = bet365_proba.argmax(axis=1)

    # 5) Metrics
    classes = [0,1,2]
    acc = float(accuracy_score(y_te, bet365_pred))
    ll  = float(log_loss(y_te, bet365_proba, labels=classes))
    y_bin = label_binarize(y_te, classes=classes)
    brier = float(np.mean(np.sum((bet365_proba - y_bin)**2, axis=1)))

    # 6) Match-by-match table (with jornada)
    out = pd.DataFrame({
        "Date": _safe_to_datetime(df_te["Date"]).dt.strftime('%Y-%m-%d'),
        "Season": df_te["Season"].astype("Int64"),
        "jornada": pd.to_numeric(df_te["jornada"], errors="coerce").round().astype("Int64"),
        "HomeTeam_norm": df_te["HomeTeam_norm"].astype("string"),
        "AwayTeam_norm": df_te["AwayTeam_norm"].astype("string"),
        "B365H": df_te["B365H"].round(round_decimals),
        "B365D": df_te["B365D"].round(round_decimals),
        "B365A": df_te["B365A"].round(round_decimals),
        "p_H":   prob_norm["B365H"].round(round_decimals),
        "p_D":   prob_norm["B365D"].round(round_decimals),
        "p_A":   prob_norm["B365A"].round(round_decimals),
        "true_result": y_te.values,
        "bet365_pred": bet365_pred
    })

    # 7) ROI of backing the Bet365 favourite (odds matrix in [A, D, H] order)
    pick_idx = bet365_pred
    odds_mat = np.column_stack([df_te['B365A'].to_numpy(), df_te['B365D'].to_numpy(), df_te['B365H'].to_numpy()])
    picked_odds = odds_mat[np.arange(len(odds_mat)), pick_idx]
    out['picked_odds'] = picked_odds
    out['bet_return']  = np.where(out['bet365_pred'] == out['true_result'], out['picked_odds'] * stake, 0.0)
    out['net_profit']  = out['bet_return'] - stake
    # NOTE(review): accumulated in input row order, which is chronological only
    # if df_full is already sorted by date - confirm.
    out['Cum_net_profit'] = out['net_profit'].cumsum()

    # Informative edge of the pick
    out['edge_b365_pick'] = (bet365_proba[np.arange(len(bet365_proba)), pick_idx] * picked_odds) - 1.0

    n_eval = int(len(out))
    total_profit = float(out['net_profit'].sum())
    investment_total = float(stake * n_eval)
    roi = float(total_profit / investment_total) if investment_total > 0 else np.nan

    metrics = {
        "accuracy": acc,
        "log_loss": ll,
        "brier": brier,
        "n_test_with_odds": n_eval,
        "roi": roi,
        "profit_total": total_profit,
        "investment_total": investment_total,
        "stake": float(stake)
    }

    print("Baseline Bet365 — Prob. implícitas normalizadas")
    print(f"Rango TEST: Seasons {rng} | n={n_eval} | ROI: {roi*100:.2f}% | Profit: {total_profit:.2f}")

    return out.reset_index(drop=True), metrics

# -------------------------
# Grid por temporada + export
# -------------------------
def build_bet365_grid(
    df_source: pd.DataFrame,
    out_dir: Path,
    seasons: list[int] | None = None,
    stake: float = 1.0,
    round_decimals: int = 4,
    save_matchlogs: bool = True
):
    """
    Para cada temporada S (train ≤ S-1, test = S):
      - matchlog Bet365 (opcional CSV/JSON, con 'jornada')
      - resumen por temporada (JSON+CSV) con ROI e investment_total
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    per_season_dir = out_dir / "bet365_matchlogs"
    if save_matchlogs:
        per_season_dir.mkdir(parents=True, exist_ok=True)

    seasons_all = sorted(df_source["Season"].dropna().astype(int).unique())
    if seasons is None:
        seasons = seasons_all

    rows_json, rows_flat = [], []

    for test_season in seasons:
        train_until = test_season - 1
        if train_until < seasons_all[0]:  # sin historial
            continue

        try:
            tbl, met = evaluate_bet365_baseline(
                df_source,
                train_until_season=train_until,
                test_until_season=test_season,
                round_decimals=round_decimals,
                stake=stake
            )
            if tbl.empty:
                continue

            if save_matchlogs:
                tbl.to_csv(per_season_dir / f"matchlog_{test_season}.csv", index=False)
                (per_season_dir / f"matchlog_{test_season}.json").write_text(
                    tbl.to_json(orient="records", force_ascii=False, indent=2),
                    encoding="utf-8"
                )

            rows_json.append({
                "train_until": int(train_until),
                "test_season": int(test_season),
                "metrics": {
                    "accuracy": float(met["accuracy"]),
                    "log_loss": float(met["log_loss"]),
                    "brier":    float(met["brier"]),
                    "roi":      float(met["roi"]),
                    "profit_total": float(met["profit_total"]),
                    "investment_total": float(met["investment_total"]),
                    "n_test":   int(met["n_test_with_odds"]),
                    "stake":    float(met["stake"])
                }
            })
            rows_flat.append({
                "test_season": int(test_season),
                "train_until": int(train_until),
                "acc": float(met["accuracy"]),
                "logloss": float(met["log_loss"]),
                "brier": float(met["brier"]),
                "roi": float(met["roi"]),
                "profit_total": float(met["profit_total"]),
                "investment_total": float(met["investment_total"]),
                "n_test": int(met["n_test_with_odds"]),
            })

            print(f"[Bet365] Season {test_season}: OK ({len(tbl)} partidos)")

        except Exception as e:
            print(f"[BET365 SKIP] test={test_season} → {e}")

    (out_dir / "bet365_grid.json").write_text(json.dumps(rows_json, ensure_ascii=False, indent=2), encoding="utf-8")
    pd.DataFrame(rows_flat).sort_values("test_season").to_csv(out_dir / "bet365_metrics_by_season.csv", index=False)

    print("Guardados:")
    print(f"- {out_dir/'bet365_grid.json'}")
    print(f"- {out_dir/'bet365_metrics_by_season.csv'}")
    if save_matchlogs:
        print(f"- {out_dir/'bet365_matchlogs'}/matchlog_<SEASON>.csv/json")

# -------------------------
# Comparaciones
# -------------------------
def build_season_comparison_model_vs_bet365(
    out_dir: Path,
    model_tag: str = "base"  # matches roi_by_season_<tag>.csv
):
    """
    Compare the model against the Bet365 baseline season by season.

    Reads out_dir/roi_by_season_<model_tag>.csv and
    out_dir/bet365_metrics_by_season.csv, joins them on
    (test_season, train_until), adds delta_roi / delta_profit (model minus
    Bet365) and writes comparison_season_<model_tag>_vs_bet365.csv/.json.
    """
    model_renames = {"profit_total":"profit_model", "roi":"roi_model", "n_bets":"n_bets_model"}
    b365_renames = {"profit_total":"profit_bet365", "roi":"roi_bet365", "n_test":"n_bets_bet365"}

    # Harmonise column names in case they differ between the two sources.
    model_df = pd.read_csv(out_dir / f"roi_by_season_{model_tag}.csv").rename(columns=model_renames)
    b365_df = pd.read_csv(out_dir / "bet365_metrics_by_season.csv").rename(columns=b365_renames)

    # A missing stake column means a flat stake of 1.0.
    for frame in (model_df, b365_df):
        if "stake" not in frame.columns:
            frame["stake"] = 1.0

    model_df["investment_total_model"] = model_df["stake"] * model_df["n_bets_model"]
    b365_df["investment_total_bet365"] = b365_df["stake"] * b365_df["n_bets_bet365"]

    merged = pd.merge(
        model_df,
        b365_df,
        on=["test_season","train_until"],
        how="inner",
        suffixes=("_m","_b"),
    )
    merged["delta_roi"] = merged["roi_model"] - merged["roi_bet365"]
    merged["delta_profit"] = merged["profit_model"] - merged["profit_bet365"]
    ordered = merged.sort_values("test_season")

    csv_path = out_dir / f"comparison_season_{model_tag}_vs_bet365.csv"
    json_path = out_dir / f"comparison_season_{model_tag}_vs_bet365.json"
    ordered.to_csv(csv_path, index=False)
    json_path.write_text(
        ordered.to_json(orient="records", force_ascii=False, indent=2),
        encoding="utf-8"
    )
    print(f"Guardados comparativos temporada:\n- {csv_path}\n- {json_path}")

def build_match_comparison_for_season(
    out_dir: Path,
    season: int,
    model_tag: str = "base"
):
    """
    Compare model and Bet365 match logs for one season, match by match.

    Joins
      - out_dir/matchlogs_<model_tag>/matchlog_<season>.csv
      - out_dir/bet365_matchlogs/matchlog_<season>.csv
    on (Date, HomeTeam_norm, AwayTeam_norm), adds delta_profit (model net
    profit minus Bet365 net profit) and writes
    comparison_matchlog_<season>_<model_tag>_vs_bet365.csv/.json sorted by date.

    When the Bet365 log has no net-profit column its profit is taken as 0.0.
    Raises KeyError when the model log has no net-profit column.
    """
    ml_model = pd.read_csv(out_dir / f"matchlogs_{model_tag}" / f"matchlog_{season}.csv")
    ml_b365  = pd.read_csv(out_dir / "bet365_matchlogs" / f"matchlog_{season}.csv")

    key = ["Date","HomeTeam_norm","AwayTeam_norm"]
    both = pd.merge(ml_model, ml_b365, on=key, how="inner", suffixes=("_model","_b365"))

    # merge() only adds suffixes to column names present on BOTH sides, so which
    # name holds each side's net profit depends on what the inputs contained.
    model_has = "net_profit" in ml_model.columns
    b365_has = "net_profit" in ml_b365.columns
    suffixed = model_has and b365_has

    if model_has:
        model_profit = both["net_profit_model" if suffixed else "net_profit"]
    elif "net_profit_model" in both.columns:
        model_profit = both["net_profit_model"]
    else:
        raise KeyError("Model match log has no 'net_profit' column; cannot compute delta_profit.")

    if b365_has:
        b365_profit = both["net_profit_b365" if suffixed else "net_profit"]
    elif "net_profit_b365" in both.columns:
        b365_profit = both["net_profit_b365"]
    else:
        b365_profit = 0.0

    # Per-match delta
    both["delta_profit"] = model_profit - b365_profit

    # Chronological order (stable, so same-day matches keep their merge order)
    both["Date"] = pd.to_datetime(both["Date"], errors="coerce")
    both = both.sort_values(["Date"], kind="mergesort").reset_index(drop=True)
    both["Date"] = both["Date"].dt.strftime("%Y-%m-%d")

    out_csv  = out_dir / f"comparison_matchlog_{season}_{model_tag}_vs_bet365.csv"
    out_json = out_dir / f"comparison_matchlog_{season}_{model_tag}_vs_bet365.json"
    both.to_csv(out_csv, index=False)
    both.to_json(out_json, orient="records", force_ascii=False, indent=2)
    print(f"Guardados comparativos por partido ({season}):\n- {out_csv}\n- {out_json}")

# -------------------------
# EXECUTION (comment out whatever you do not need)
# -------------------------
# 1) Build the Bet365 baseline per season (includes investment_total and match logs with 'jornada')
build_bet365_grid(df, out_dir=OUT, seasons=None, stake=1.0, save_matchlogs=True)

# 2) Compare the model against Bet365 per season (reads outputs/roi_by_season_base.csv)
build_season_comparison_model_vs_bet365(OUT, model_tag="base")

# 3) Compare match by match for one specific season
#    Change the season to compare a different one.
build_match_comparison_for_season(OUT, season=2025, model_tag="base")

# 4) Quick check: Bet365 match logs contain 'jornada' and do not contain 'Wk'
#    (only the first match log, in sorted file-name order, is inspected)
check = list(sorted((OUT / "bet365_matchlogs").glob("matchlog_*.csv")))
if check:
    tmp = pd.read_csv(check[0])
    assert "jornada" in tmp.columns, "El matchlog Bet365 no contiene 'jornada'."
    assert "Wk" not in tmp.columns, "El matchlog Bet365 no debe contener 'Wk'."
    print("Chequeo OK: 'jornada' presente en outputs de Bet365; 'Wk' ausente.")

Baseline Bet365 — Prob. implícitas normalizadas
Rango TEST: Seasons 2007..2007 | n=380 | ROI: -3.47% | Profit: -13.19
[Bet365] Season 2007: OK (380 partidos)
Baseline Bet365 — Prob. implícitas normalizadas
Rango TEST: Seasons 2008..2008 | n=380 | ROI: 7.91% | Profit: 30.05
[Bet365] Season 2008: OK (380 partidos)
Baseline Bet365 — Prob. implícitas normalizadas
Rango TEST: Seasons 2009..2009 | n=380 | ROI: 4.28% | Profit: 16.28
[Bet365] Season 2009: OK (380 partidos)
Baseline Bet365 — Prob. implícitas normalizadas
Rango TEST: Seasons 2010..2010 | n=380 | ROI: 9.04% | Profit: 34.36
[Bet365] Season 2010: OK (380 partidos)
Baseline Bet365 — Prob. implícitas normalizadas
Rango TEST: Seasons 2011..2011 | n=380 | ROI: -7.91% | Profit: -30.06
[Bet365] Season 2011: OK (380 partidos)
Baseline Bet365 — Prob. implícitas normalizadas
Rango TEST: Seasons 2012..2012 | n=380 | ROI: -3.74% | Profit: -14.20
[Bet365] Season 2012: OK (380 partidos)
Baseline Bet365 — Prob. implícitas normalizadas
Rango TEST

In [28]:
# ============================================
# FIX cumulative curves: recompute returns from the match logs (model vs Bet365)
# - Avoids depending on 'net_profit' columns already suffixed/renamed by the merge
# - Recomputes from true_result, pick and B365 odds
# - Includes 'jornada'
# Outputs: outputs/cumprofit_curves_<model_tag>/cumprofit_<SEASON>.csv/.json + index
# ============================================
import json
from pathlib import Path
import numpy as np
import pandas as pd

# Reuse ROOT when an earlier cell defined it; otherwise fall back to the
# current working directory.
try:
    ROOT
except NameError:
    ROOT = Path(".")
OUT = ROOT / "outputs"
OUT.mkdir(parents=True, exist_ok=True)

# Class id -> readable outcome, and textual pick (A/D/H) -> class id.
CLASS2LABEL = {0: "Away", 1: "Draw", 2: "Home"}
TXT2IDX = {"A":0, "D":1, "H":2}

def _safe_dt(x):
    return pd.to_datetime(x, errors="coerce")

def _pick_first_col(df: pd.DataFrame, candidates: list[str], default=None):
    for c in candidates:
        if c in df.columns:
            return df[c]
    if default is not None:
        return pd.Series(default, index=df.index)
    raise KeyError(f"No encontré ninguna de estas columnas: {candidates}")

def _load_matchlogs_pair(out_dir: Path, model_tag: str, season: int) -> pd.DataFrame:
    """Merge de matchlogs (modelo vs Bet365) por clave (Date, HomeTeam_norm, AwayTeam_norm)."""
    p_model = out_dir / f"matchlogs_{model_tag}" / f"matchlog_{season}.csv"
    p_b365  = out_dir / "bet365_matchlogs" / f"matchlog_{season}.csv"
    if not (p_model.exists() and p_b365.exists()):
        return pd.DataFrame()

    ml_model = pd.read_csv(p_model)
    ml_b365  = pd.read_csv(p_b365)

    key = ["Date","HomeTeam_norm","AwayTeam_norm"]
    for c in key:
        ml_model[c] = ml_model[c].astype("string")
        ml_b365[c]  = ml_b365[c].astype("string")

    both = pd.merge(ml_model, ml_b365, on=key, how="inner", suffixes=("_model","_b365"))
    if both.empty:
        return both

    # Orden temporal
    both["Date"] = _safe_dt(both["Date"])
    both = both.sort_values("Date").reset_index(drop=True)
    return both

def _recalc_returns_from_merged(both: pd.DataFrame, stake: float = 1.0, round_decimals: int = 3):
    """
    Recompute per-match and cumulative returns (model and Bet365) from a merged match log.

    Columns used, first match wins:
      - true result:  true_result_model | true_result
      - model pick:   predicted_result_model | predicted_result | Pred (A/D/H)
      - Bet365 pick:  bet365_pred
      - B365 odds:    B365A/B365D/B365H, preferring *_model, then unsuffixed, then *_b365
      - matchday:     jornada_model | jornada | jornada_b365 (optional)

    Rows without three strictly positive odds are dropped. Class ids are
    0=Away, 1=Draw, 2=Home and double as column index into the odds matrix.

    Parameters
    ----------
    both : merged match log; "Date" may be datetime or parseable strings.
    stake : flat stake per match.
    round_decimals : decimals for the returned profit columns.

    Returns
    -------
    DataFrame (one row per match, in input order) with match_num, date, jornada,
    model_cum, bet365_cum, model_ret, bet365_ret, home, away and the textual
    true/model/bet365 outcomes. Empty DataFrame when `both` is empty.

    Raises
    ------
    KeyError when the true result, the model pick or 'bet365_pred' is missing.
    """
    if both.empty:
        return pd.DataFrame()

    # Positional index: the arrays below are positional, and the output frame
    # mixes them with Series taken from `df`, so the labels must be 0..n-1.
    df = both.reset_index(drop=True)

    # True result
    true_ = _pick_first_col(df, ["true_result_model","true_result"]).astype(int).to_numpy()

    # Model prediction (numeric 0/1/2, or mapped from the textual 'Pred')
    if "predicted_result_model" in df.columns:
        pred_m = df["predicted_result_model"].astype(int).to_numpy()
    elif "predicted_result" in df.columns:
        pred_m = df["predicted_result"].astype(int).to_numpy()
    elif "Pred" in df.columns:
        pred_m = df["Pred"].map(TXT2IDX).astype(int).to_numpy()
    else:
        raise KeyError("No encuentro la predicción del modelo (predicted_result[_model] o Pred).")

    # Bet365 prediction
    if "bet365_pred" in df.columns:
        pred_b = df["bet365_pred"].astype(int).to_numpy()
    else:
        raise KeyError("No encuentro 'bet365_pred' en el matchlog de Bet365.")

    # B365 odds: the model match log is preferred
    B365A = _pick_first_col(df, ["B365A_model","B365A","B365A_b365"]).astype(float).to_numpy()
    B365D = _pick_first_col(df, ["B365D_model","B365D","B365D_b365"]).astype(float).to_numpy()
    B365H = _pick_first_col(df, ["B365H_model","B365H","B365H_b365"]).astype(float).to_numpy()

    # Keep only rows with three positive odds (NaN compares False and is dropped)
    mask_ok = (B365A > 0) & (B365D > 0) & (B365H > 0)
    if not np.all(mask_ok):
        df  = df.loc[mask_ok].reset_index(drop=True)
        true_ = true_[mask_ok]
        pred_m = pred_m[mask_ok]
        pred_b = pred_b[mask_ok]
        B365A = B365A[mask_ok]; B365D = B365D[mask_ok]; B365H = B365H[mask_ok]

    # Odds matrix in [A, D, H] order
    odds_mat = np.column_stack([B365A, B365D, B365H])
    rows = np.arange(len(odds_mat))

    # Per-match net return: odds*stake - stake on a hit, -stake on a miss
    model_ret = np.where(pred_m == true_, odds_mat[rows, pred_m] * stake - stake, -stake)
    b365_ret  = np.where(pred_b == true_, odds_mat[rows, pred_b] * stake - stake, -stake)

    # Cumulative profit
    model_cum = np.round(np.cumsum(model_ret), round_decimals)
    b365_cum  = np.round(np.cumsum(b365_ret ), round_decimals)

    # Meta: dates, teams and matchday (when available). Date is coerced so that
    # string dates work as well as datetimes.
    dates   = pd.to_datetime(df["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
    home    = _pick_first_col(df, ["HomeTeam_norm"], default="").astype("string")
    away    = _pick_first_col(df, ["AwayTeam_norm"], default="").astype("string")
    jornada = _pick_first_col(df, ["jornada_model","jornada","jornada_b365"], default=pd.NA)
    jornada = pd.to_numeric(jornada, errors="coerce").astype("Int64")

    outcome_txt = {0:"Away",1:"Draw",2:"Home"}
    series_df = pd.DataFrame({
        "match_num": np.arange(1, len(model_cum)+1, dtype=int),
        "date": dates,
        "jornada": jornada,
        "model_cum": model_cum,
        "bet365_cum": b365_cum,
        "model_ret": np.round(model_ret, round_decimals),
        "bet365_ret": np.round(b365_ret, round_decimals),
        "home": home,
        "away": away,
        "true_txt": pd.Series(true_).map(outcome_txt).astype("string"),
        "model_txt": pd.Series(pred_m).map(outcome_txt).astype("string"),
        "bet365_txt": pd.Series(pred_b).map(outcome_txt).astype("string"),
    })
    return series_df

def _list_matchlog_seasons(logs_dir: Path) -> set:
    """Collect the season numbers encoded in the ``matchlog_*.csv`` files of ``logs_dir``.

    The season is read from the last ``_``-separated token of each file stem.
    Files whose last token is not an integer are reported and skipped, so a
    stray file cannot abort the whole export.
    """
    seasons = set()
    for path in logs_dir.glob("matchlog_*.csv"):
        token = path.stem.split("_")[-1]
        try:
            seasons.add(int(token))
        except ValueError:
            print(f"⚠️ Ignorando fichero sin temporada numérica en el nombre: {path}")
    return seasons


def export_cumprofit_curves_from_saved_matchlogs(
    out_dir: Path,
    model_tag: str = "base",
    round_decimals: int = 3,
    stake: float = 1.0
):
    """Export per-season cumulative-profit curves (model vs. Bet365) as CSV and JSON.

    Seasons are discovered from the ``matchlog_*.csv`` file names found in
    ``out_dir / f"matchlogs_{model_tag}"`` and ``out_dir / "bet365_matchlogs"``;
    only seasons present in both directories are processed. For each of them the
    paired matchlogs are loaded with ``_load_matchlogs_pair`` and the per-match
    series is rebuilt with ``_recalc_returns_from_merged`` (both defined
    elsewhere in this notebook).

    Files written:
      * ``out_dir / f"cumprofit_curves_{model_tag}" / f"cumprofit_{season}.csv"``
        and the matching ``.json`` (compact per-match series plus final totals).
      * ``out_dir / f"cumprofit_index_{model_tag}.csv"`` and ``.json`` with one
        summary row per exported season.

    Parameters
    ----------
    out_dir : Path or str
        Base output directory containing the matchlog folders.
    model_tag : str
        Suffix identifying the model variant (used in folder and file names).
    round_decimals : int
        Forwarded to ``_recalc_returns_from_merged``.
    stake : float
        Amount staked per match; forwarded to ``_recalc_returns_from_merged``
        and used as the ROI denominator (``n_matches * stake``).

    Returns
    -------
    None
        Results are written to disk and progress is printed.
    """
    out_dir = Path(out_dir)  # tolerate plain-string paths
    curves_dir = out_dir / f"cumprofit_curves_{model_tag}"
    curves_dir.mkdir(parents=True, exist_ok=True)

    seasons_model = _list_matchlog_seasons(out_dir / f"matchlogs_{model_tag}")
    seasons_b365 = _list_matchlog_seasons(out_dir / "bet365_matchlogs")
    seasons = sorted(seasons_model & seasons_b365)
    if not seasons:
        print(f"⚠️ No hay temporadas coincidentes entre matchlogs_{model_tag} y bet365_matchlogs.")
        return

    idx_rows = []
    for season in seasons:
        merged = _load_matchlogs_pair(out_dir, model_tag, season)
        if merged.empty:
            continue

        series_df = _recalc_returns_from_merged(merged, stake=stake, round_decimals=round_decimals)
        if series_df.empty:
            print(f"[CURVA {model_tag}] Season {season}: sin filas válidas tras filtrado de cuotas.")
            continue

        # Summary: series_df is non-empty here, so the last cumulative value is the season total.
        n = int(len(series_df))
        final_model = float(series_df["model_cum"].iloc[-1])
        final_b365 = float(series_df["bet365_cum"].iloc[-1])
        # ROI = profit / total staked; a zero stake yields 0.0 instead of ZeroDivisionError.
        total_staked = n * stake
        roi_model = float(final_model / total_staked) if total_staked else 0.0
        roi_b365 = float(final_b365 / total_staked) if total_staked else 0.0

        # CSV with the full per-match series
        csv_path = curves_dir / f"cumprofit_{season}.csv"
        series_df.to_csv(csv_path, index=False)

        # Compact JSON (the matchday/"jornada" is exported as 'j', null when missing)
        payload = {
            "train_until": int(season - 1),
            "test_season": int(season),
            "n_matches": n,
            "series": [
                {
                    "i": int(row.match_num),
                    "d": str(row.date),
                    "j": (None if pd.isna(row.jornada) else int(row.jornada)),
                    "m": float(row.model_cum),
                    "b": float(row.bet365_cum),
                    "hm": str(row.home),
                    "aw": str(row.away),
                    "t":  str(row.true_txt),
                    "pm": str(row.model_txt),
                    "pb": str(row.bet365_txt),
                }
                # itertuples avoids building one Series per row as iterrows does
                for row in series_df.itertuples(index=False)
            ],
            "final": {
                "model": float(final_model),
                "bet365": float(final_b365),
                "roi_model": float(roi_model),
                "roi_bet365": float(roi_b365),
            }
        }
        (curves_dir / f"cumprofit_{season}.json").write_text(
            json.dumps(payload, ensure_ascii=False), encoding="utf-8"
        )

        idx_rows.append({
            "test_season": int(season),
            "train_until": int(season - 1),
            "n_matches": n,
            "profit_model": float(final_model),
            "profit_bet365": float(final_b365),
            "roi_model": float(roi_model),
            "roi_bet365": float(roi_b365),
            "csv_file": f"cumprofit_{season}.csv",
            "json_file": f"cumprofit_{season}.json",
        })
        print(f"[CURVA {model_tag}] Season {season}: {n} puntos → guardado CSV/JSON.")

    if idx_rows:
        # Index of all exported seasons, written both as CSV and as JSON
        idx_df = pd.DataFrame(idx_rows).sort_values("test_season")
        idx_df.to_csv(out_dir / f"cumprofit_index_{model_tag}.csv", index=False)
        (out_dir / f"cumprofit_index_{model_tag}.json").write_text(
            json.dumps(idx_rows, ensure_ascii=False, indent=2), encoding="utf-8"
        )
        print("Guardados:")
        print(f"- {out_dir / f'cumprofit_index_{model_tag}.csv'}")
        print(f"- {out_dir / f'cumprofit_index_{model_tag}.json'}")
        print(f"- {curves_dir}/cumprofit_<SEASON>.csv / .json")
    else:
        print(f"No se generaron curvas para {model_tag} (matchlogs no alineables).")

# =========================
# RUN: export the cumulative-profit curves
# =========================
# The BASE matchlogs are processed first, then the SMOTE ones, with identical settings.
for _model_tag in ("base", "smote"):
    export_cumprofit_curves_from_saved_matchlogs(OUT, model_tag=_model_tag, round_decimals=3, stake=1.0)

[CURVA base] Season 2007: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2008: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2009: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2010: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2011: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2012: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2013: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2014: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2015: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2016: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2017: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2018: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2019: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2020: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2021: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2022: 380 puntos → guardado CSV/JSON.
[CURVA base] Season 2023: 380 puntos → guardado CSV/JSON.
[CURVA base] S