In [None]:
# --- PARÁMETROS INYECTADOS POR EL RUNNER (papermill) ---
from pathlib import Path
import os, sys
from datetime import date

# Runner-injected parameters: keep any value already present in the kernel
# namespace, otherwise fall back to the defaults below.
MODE = globals().get("MODE", "auto")  # values: "make_template" | "consume" | "auto"

RUN_DATE = globals().get("RUN_DATE")
if RUN_DATE is None:
    # No date injected (or injected as None): use today's date as YYYY-MM-DD.
    RUN_DATE = date.today().isoformat()

# Folder for the manually edited CSV files, created next to the working directory.
ROOT = Path.cwd()
MANUAL_DIR = ROOT / "manual"
MANUAL_DIR.mkdir(parents=True, exist_ok=True)

print(f"MODE = {MODE} | RUN_DATE = {RUN_DATE} | MANUAL_DIR = {MANUAL_DIR}")

MODE = auto | RUN_DATE = 2025-10-09 | MANUAL_DIR = /content/manual


In [None]:
# --- Parameters (CI may override them) ---
# IMPORTANT: do not reassign RUN_DATE here. It is already set by the previous cell (or by papermill/CI).

# Default SEASON derived from RUN_DATE (format 2025_26)
import pandas as pd
SEASON = globals().get("SEASON", None)
if not SEASON:
    _dt = pd.to_datetime(RUN_DATE)
    # Dates before July belong to the season that started the previous calendar year.
    _y = int(_dt.year) if _dt.month >= 7 else int(_dt.year) - 1
    # Start year plus the two-digit following year, e.g. 2025 -> "2025_26".
    SEASON = f"{_y}_{(_y+1) % 100:02d}"

# Optional overrides; MATCHDAY stays None unless it already exists in the namespace.
MATCHDAY = globals().get("MATCHDAY", None)
MODEL_VERSION = globals().get("MODEL_VERSION", "xgb-local")

# --- Paths shared by local and CI runs ---
from pathlib import Path
ROOT   = Path.cwd()
DATA   = ROOT / "data"
RAW    = DATA / "01_raw"
PROC   = DATA / "02_processed"
FEAT   = DATA / "03_features"
MODELS = DATA / "04_models"
OUT    = ROOT / "outputs"

# Create every working directory up front so later reads/writes do not fail on a missing folder.
for p in [RAW, PROC, FEAT, MODELS, OUT]:
    p.mkdir(parents=True, exist_ok=True)

In [None]:
import pandas as pd

def load_raw(name: str):
    """Read the parquet file called ``name`` from the RAW directory."""
    source = RAW / name
    return pd.read_parquet(source)


def save_raw(df, name: str):
    """Write ``df`` (without its index) as parquet ``name`` in the RAW directory."""
    target = RAW / name
    df.to_parquet(target, index=False)


def load_proc(name: str):
    """Read the parquet file called ``name`` from the PROC directory."""
    source = PROC / name
    return pd.read_parquet(source)


def save_proc(df, name: str):
    """Write ``df`` (without its index) as parquet ``name`` in the PROC directory."""
    target = PROC / name
    df.to_parquet(target, index=False)


def load_feat(name: str):
    """Read the parquet file called ``name`` from the FEAT directory."""
    source = FEAT / name
    return pd.read_parquet(source)


def save_feat(df, name: str):
    """Write ``df`` (without its index) as parquet ``name`` in the FEAT directory."""
    target = FEAT / name
    df.to_parquet(target, index=False)

# **LIMPIEZA DE VARIABLES Y CREACIÓN DE NUEVAS**

In [None]:
!pip install soccerdata

Collecting soccerdata
  Downloading soccerdata-1.8.7-py3-none-any.whl.metadata (5.6 kB)
Collecting Unidecode<2.0.0,>=1.2.0 (from soccerdata)
  Downloading Unidecode-1.4.0-py3-none-any.whl.metadata (13 kB)
Collecting cloudscraper<2.0.0,>=1.2.71 (from soccerdata)
  Downloading cloudscraper-1.2.71-py2.py3-none-any.whl.metadata (19 kB)
Collecting lxml<5.0.0,>=4.9.3 (from soccerdata)
  Downloading lxml-4.9.4-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.7 kB)
Collecting packaging<25.0,>=24.1 (from soccerdata)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting undetected-chromedriver<4.0.0,>=3.5.0 (from soccerdata)
  Downloading undetected-chromedriver-3.5.5.tar.gz (65 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.4/65.4 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting unicode<3.0,>=2.7 (from soccerdata)
  Downloading unicode-2.9-py2.py3-none-any.whl.metadata (1.1 kB)
Colle

In [None]:
from datetime import datetime, date, time, timedelta
from dateutil import parser
from collections import defaultdict
from pathlib import Path

import pandas as pd
import numpy as np
import soccerdata as sd
import os, re, unicodedata, io, time as _time, random, requests
import pytz

# Limpieza de variables

In [None]:
# Load the merged match dataset from the processed-data directory and display it.
IN_PATH = PROC / "fd_xg_elo_transfermarkt_wk_2005_2025.parquet"
df = pd.read_parquet(IN_PATH)
df

Unnamed: 0,1XBA,1XBCA,1XBCD,1XBCH,1XBD,1XBH,AC,AF,AHCh,AHh,...,h_value_mio,h_value_avg_mio,h_squad_size,h_pct_foreigners,a_avg_age,a_value_mio,a_value_avg_mio,a_squad_size,a_pct_foreigners,Matchweek
0,,,,,,,7,19,,,...,34.830,1.120,31.0,54.84,25.4,327.50,9.63,34.0,47.06,1.0
1,,,,,,,4,19,,,...,47.230,1.150,41.0,2.44,25.9,53.83,1.74,31.0,22.58,1.0
2,,,,,,,5,14,,,...,213.550,6.280,34.0,41.18,26.2,85.95,2.60,33.0,24.24,1.0
3,,,,,,,4,22,,,...,134.150,4.330,31.0,25.81,27.7,66.55,2.66,25.0,28.00,1.0
4,,,,,,,8,25,,,...,2.215,0.791,28.0,46.43,25.4,281.60,7.82,36.0,36.11,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7675,,,,,,,4,9,-0.25,-0.25,...,,,,,,,,,,8.0
7676,,,,,,,6,9,1.25,1.25,...,,,,,,,,,,8.0
7677,,,,,,,5,4,0.00,0.00,...,,,,,,,,,,8.0
7678,,,,,,,5,21,-0.25,-0.50,...,,,,,,,,,,8.0


In [None]:
# Columns that must survive the NaN-based pruning below even if they contain missing values.
protect_explicit = [
    'h_xg','a_xg', 'home_team_slug', 'away_team_slug',
    'h_avg_age', 'h_value_mio', 'h_value_avg_mio', 'h_squad_size', 'h_pct_foreigners',
    'a_avg_age', 'a_value_mio', 'a_value_avg_mio', 'a_squad_size', 'a_pct_foreigners'
]

# Only protect the columns that actually exist in this frame.
protected = [c for c in protect_explicit if c in df.columns]

# Every column with at least one NaN is a drop candidate...
cols_with_na = df.columns[df.isna().any()].tolist()

# ...unless it is on the protected list.
cols_to_drop_na = [c for c in cols_with_na if c not in protected]

df = df.drop(columns=cols_to_drop_na)
# Identifier columns are removed unconditionally; note the two slug columns were
# protected from the NaN pruning above but are dropped here anyway.
df = df.drop(columns=['Div', 'HomeTeam', 'AwayTeam', 'home_team_slug', 'away_team_slug'], errors='ignore')

print(f"Eliminadas por NaN (excepto xG): {len(cols_to_drop_na)}")

Eliminadas por NaN (excepto xG): 166


In [None]:
# Display the frame after the column clean-up above.
df

Unnamed: 0,AC,AF,AR,AS,AST,AY,B365A,B365D,B365H,Date,...,h_value_mio,h_value_avg_mio,h_squad_size,h_pct_foreigners,a_avg_age,a_value_mio,a_value_avg_mio,a_squad_size,a_pct_foreigners,Matchweek
0,7,19,0,17,10,1,1.50,3.75,7.00,2005-08-27,...,34.830,1.120,31.0,54.84,25.4,327.50,9.63,34.0,47.06,1.0
1,4,19,0,9,2,1,3.25,3.25,2.00,2005-08-27,...,47.230,1.150,41.0,2.44,25.9,53.83,1.74,31.0,22.58,1.0
2,5,14,0,14,3,3,3.25,3.25,2.00,2005-08-27,...,213.550,6.280,34.0,41.18,26.2,85.95,2.60,33.0,24.24,1.0
3,4,22,0,9,2,7,4.00,3.40,1.72,2005-08-28,...,134.150,4.330,31.0,25.81,27.7,66.55,2.66,25.0,28.00,1.0
4,8,25,0,17,6,2,1.44,4.00,7.50,2005-08-28,...,2.215,0.791,28.0,46.43,25.4,281.60,7.82,36.0,36.11,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7675,4,9,1,6,3,1,3.25,3.10,2.35,2025-10-05,...,,,,,,,,,,8.0
7676,6,9,0,17,8,5,1.45,5.25,5.50,2025-10-05,...,,,,,,,,,,8.0
7677,5,4,0,12,3,6,2.70,3.30,2.55,2025-10-05,...,,,,,,,,,,8.0
7678,5,21,0,7,2,2,3.50,3.60,2.00,2025-10-05,...,,,,,,,,,,8.0


In [None]:
# Persist the cleaned frame (index not written) so the next section can reload it.
PROC.mkdir(parents=True, exist_ok=True)
OUT_PATH = PROC / "df_clean_vars.parquet"
df.to_parquet(OUT_PATH, index=False)
print(f"Guardado: {OUT_PATH}")

Guardado: /content/data/02_processed/df_clean_vars.parquet


# Jornada a predecir

Aquí vamos a añadir los nuevos partidos de la próxima jornada a predecir, junto con los datos de Elo y las cuotas de Bet365:

In [None]:
# Reload the cleaned frame written by the previous section and display it.
IN_PATH = PROC / "df_clean_vars.parquet"
df = pd.read_parquet(IN_PATH)
df

Unnamed: 0,AC,AF,AR,AS,AST,AY,B365A,B365D,B365H,Date,...,h_value_mio,h_value_avg_mio,h_squad_size,h_pct_foreigners,a_avg_age,a_value_mio,a_value_avg_mio,a_squad_size,a_pct_foreigners,Matchweek
0,7,19,0,17,10,1,1.50,3.75,7.00,2005-08-27,...,34.830,1.120,31.0,54.84,25.4,327.50,9.63,34.0,47.06,1.0
1,4,19,0,9,2,1,3.25,3.25,2.00,2005-08-27,...,47.230,1.150,41.0,2.44,25.9,53.83,1.74,31.0,22.58,1.0
2,5,14,0,14,3,3,3.25,3.25,2.00,2005-08-27,...,213.550,6.280,34.0,41.18,26.2,85.95,2.60,33.0,24.24,1.0
3,4,22,0,9,2,7,4.00,3.40,1.72,2005-08-28,...,134.150,4.330,31.0,25.81,27.7,66.55,2.66,25.0,28.00,1.0
4,8,25,0,17,6,2,1.44,4.00,7.50,2005-08-28,...,2.215,0.791,28.0,46.43,25.4,281.60,7.82,36.0,36.11,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7675,4,9,1,6,3,1,3.25,3.10,2.35,2025-10-05,...,,,,,,,,,,8.0
7676,6,9,0,17,8,5,1.45,5.25,5.50,2025-10-05,...,,,,,,,,,,8.0
7677,5,4,0,12,3,6,2.70,3.30,2.55,2025-10-05,...,,,,,,,,,,8.0
7678,5,21,0,7,2,2,3.50,3.60,2.00,2025-10-05,...,,,,,,,,,,8.0


In [None]:
# ================== ADD NEXT MATCHWEEK (10/10) + PRE-MATCH ELO ==================

# --- Time zone
TZ = pytz.timezone("Europe/Madrid")

# --- Paths: prefer the updated parquet if it exists ---
# Reuse PROC when an earlier cell defined it; otherwise fall back to the relative default.
PROC = Path(PROC) if "PROC" in globals() else Path("./data/02_processed")
WK_CANDIDATES = [
    PROC / "wk_actualizado_2005_2025.parquet",
    PROC / "wk_2005_2025.parquet"
]
# First candidate that exists on disk; if none exists, the last candidate is used as-is.
WEEK_PARQUET = next((p for p in WK_CANDIDATES if p.exists()), WK_CANDIDATES[-1])
print(f"[WK] Usando: {WEEK_PARQUET.name}")

# ------------------- Normalización de nombres -------------------
def _strip_accents(s: str) -> str:
    t = unicodedata.normalize("NFKD", s or "")
    return "".join(c for c in t if not unicodedata.combining(c))

def _canon(s: str) -> str:
    """Canonical form of a name: accent-free, lowercase, non-alphanumeric runs collapsed to one space."""
    lowered = _strip_accents(str(s)).lower()
    return re.sub(r"[^a-z0-9]+"," ", lowered).strip()

# From "official" names (parquet/FD) → the project's 'norm' team names.
# Keys are looked up with the output of `_canon`, so they must be accent-free,
# lowercase and space-separated.
MAP_WK_TO_NORM = {
    "real madrid cf":"real madrid","real madrid":"real madrid",
    "fc barcelona":"barcelona","barcelona":"barcelona",
    "club atletico de madrid":"ath madrid","atletico de madrid":"ath madrid","atletico madrid":"ath madrid",
    "athletic club":"ath bilbao","athletic bilbao":"ath bilbao",
    "sevilla fc":"sevilla","sevilla":"sevilla",
    "valencia cf":"valencia","valencia":"valencia",
    "villarreal cf":"villarreal","villarreal":"villarreal",
    "real sociedad de futbol":"sociedad","real sociedad":"sociedad",
    "real betis balompie":"betis","real betis":"betis","betis":"betis",
    "ca osasuna":"osasuna","osasuna":"osasuna",
    "rcd espanyol de barcelona":"espanol","rcd espanyol":"espanol","espanyol":"espanol",
    "getafe cf":"getafe","getafe":"getafe",
    "rc celta de vigo":"celta","celta vigo":"celta","celta":"celta",
    "rcd mallorca":"mallorca","mallorca":"mallorca",
    "ud las palmas":"las palmas","las palmas":"las palmas",
    "cadiz cf":"cadiz","cadiz":"cadiz",
    "granada cf":"granada","granada":"granada",
    "deportivo alaves":"alaves","alaves":"alaves",
    "levante ud":"levante","levante":"levante",
    "rayo vallecano de madrid":"vallecano","rayo vallecano":"vallecano","vallecano":"vallecano",
    "girona fc":"girona","girona":"girona",
    "cd leganes":"leganes","leganes":"leganes",
    "sd eibar":"eibar","eibar":"eibar",
    "real valladolid":"valladolid","valladolid":"valladolid",
    "elche cf":"elche","elche":"elche",
    "malaga cf":"malaga","malaga":"malaga",
    "real oviedo":"real oviedo","oviedo":"real oviedo",
    "real zaragoza":"zaragoza","zaragoza":"zaragoza",
}
def wk_name_to_norm(name: str) -> str:
    """Map a team name to its 'norm' form; unknown names fall back to their canonical form."""
    key = _canon(name)
    return MAP_WK_TO_NORM.get(key, key)

# Mapping from 'norm' team names to the team names passed to ClubElo's
# `read_team_history` (see `_build_clubelo_table`).
NORM_TO_CLUBELO = {
    "real madrid":"Real Madrid","barcelona":"Barcelona","ath madrid":"Atletico","ath bilbao":"Bilbao",
    "sevilla":"Sevilla","valencia":"Valencia","villarreal":"Villarreal","sociedad":"Sociedad",
    "betis":"Betis","osasuna":"Osasuna","espanol":"Espanyol","getafe":"Getafe","celta":"Celta",
    "mallorca":"Mallorca","las palmas":"Las Palmas","cadiz":"Cadiz","granada":"Granada",
    "alaves":"Alaves","levante":"Levante","vallecano":"Rayo Vallecano","girona":"Girona",
    "leganes":"Leganes","eibar":"Eibar","valladolid":"Valladolid","elche":"Elche","malaga":"Malaga",
    "real oviedo":"Oviedo","zaragoza":"Zaragoza"
}

# ------------------- Utilidades de temporada / wk -------------------
def season_from_run_date(run_date_str: str) -> int:
    """Return the season start year for a date: July onwards is that year, earlier months the previous one."""
    stamp = pd.to_datetime(run_date_str)
    year = int(stamp.year)
    if stamp.month < 7:
        return year - 1
    return year

def load_wk_table(path: Path) -> pd.DataFrame:
    """
    Load the matchweek (WK) parquet and normalise its key columns.

    Parameters
    ----------
    path : location of the parquet file.

    Returns
    -------
    DataFrame with `Season` and `Wk` as plain ints and `Date` as datetime
    (unparseable dates become NaT).

    Raises
    ------
    ValueError if any of Season / Wk / Date / Home / Away is missing.

    Rows whose Season or Wk cannot be read as a number are dropped (with a
    printed notice) instead of making the int conversion fail; such rows could
    never match a Season/Wk filter anyway.
    """
    wk = pd.read_parquet(path)
    req = {"Season","Wk","Date","Home","Away"}
    miss = req - set(wk.columns)
    if miss:
        raise ValueError(f"Parquet WK sin columnas requeridas: {sorted(miss)}")
    wk = wk.copy()

    for col in ("Season", "Wk"):
        wk[col] = pd.to_numeric(wk[col], errors="coerce")

    # Casting NA to int raises, so remove rows without a usable Season/Wk first.
    unusable = wk[["Season", "Wk"]].isna().any(axis=1)
    if unusable.any():
        print(f"[WK] Aviso: se descartan {int(unusable.sum())} fila(s) sin Season/Wk numérico.")
        wk = wk.loc[~unusable].copy()

    wk["Season"] = wk["Season"].astype("Int64").astype(int)
    wk["Wk"]     = wk["Wk"].astype("Int64").astype(int)
    wk["Date"]   = pd.to_datetime(wk["Date"], errors="coerce")
    return wk

def choose_next_wk_strict(wk_raw: pd.DataFrame, run_date_str: str, expected: int = 10):
    """
    Pick the (season, matchweek) to predict using ONLY the WK table.

    The season is derived from `run_date_str`; matchweeks of that season are
    then tested against these rules, and the first rule that matches wins:
      1) FULL FUTURE: n_dated == expected and n_future == expected → every
         fixture is in the future (earliest dmin wins).
      2) Otherwise: n_past == 0 and n_future > 0 (no fixture of that Wk is in
         the past; earliest dmin_future wins).
      3) Otherwise: dmin >= RUN_DATE.
      4) Otherwise: last Wk with `expected` fixtures in the past, plus 1.
      5) Otherwise: the smallest Wk of the season.

    Returns a tuple (season, wk_no). Raises ValueError when the table has no
    rows for the derived season.
    """
    run_dt = pd.to_datetime(run_date_str).normalize()
    season = season_from_run_date(run_date_str)
    wk_s = wk_raw[wk_raw["Season"] == season].copy()
    if wk_s.empty:
        raise ValueError(f"No hay Season={season} en WK.")

    # One diagnostic row per matchweek: fixture counts relative to run_dt plus
    # the earliest date overall (dmin) and the earliest date on/after run_dt.
    diag = (
        wk_s.groupby("Wk").apply(
            lambda g: pd.Series({
                "n_total": len(g),
                "n_dated": g["Date"].notna().sum(),
                "n_future": (g["Date"] >= run_dt).sum(),
                "n_past":   (g["Date"] <  run_dt).sum(),
                "dmin":      g["Date"].min(),
                "dmin_future": g.loc[g["Date"] >= run_dt, "Date"].min()
            })
        ).reset_index()
    )

    # Rule 1: matchweeks whose `expected` fixtures are all dated and all in the future.
    full = diag[(diag["n_dated"] == expected) & (diag["n_future"] == expected)]
    if len(full):
        wk_no = int(full.sort_values(["dmin","Wk"]).iloc[0]["Wk"])
        return season, wk_no

    # Rule 2: matchweeks with nothing in the past and at least one future fixture.
    cand = diag[(diag["n_past"] == 0) & (diag["n_future"] > 0)].copy()
    if len(cand):
        wk_no = int(cand.sort_values(["dmin_future","Wk"]).iloc[0]["Wk"])
        return season, wk_no

    # Rule 3: matchweeks whose earliest date is on/after run_dt.
    # NOTE(review): this branch looks unreachable — a Wk with dmin >= run_dt has
    # n_past == 0 and n_future > 0, so rule 2 would already have returned. Confirm.
    future_any = diag[diag["dmin"] >= run_dt]
    if len(future_any):
        wk_no = int(future_any.sort_values(["dmin","Wk"]).iloc[0]["Wk"])
        return season, wk_no

    # Rule 4: the week after the last fully played one. The returned number is
    # not checked against the table, so it may be a Wk with no rows.
    past_full = diag[diag["n_past"] == expected]
    if len(past_full):
        wk_no = int(past_full["Wk"].max()) + 1
        return season, wk_no

    # Rule 5: nothing matched; fall back to the first matchweek of the season.
    return season, int(diag["Wk"].min())

def build_fixtures_for_wk(wk_raw: pd.DataFrame, season: int, wk_no: int) -> pd.DataFrame:
    """
    Extract the fixtures of one (season, matchweek) from the WK table.

    Returns a frame with Date (YYYY-MM-DD string), Season, Wk, the normalised
    home/away team names and Date_dt (datetime). An empty frame with the same
    columns is returned when the matchweek has no rows.
    """
    out_cols = ["Date","Season","Wk","HomeTeam_norm","AwayTeam_norm","Date_dt"]

    in_week = (wk_raw["Season"] == season) & (wk_raw["Wk"] == wk_no)
    selected = wk_raw[in_week].copy()
    if selected.empty:
        return pd.DataFrame(columns=out_cols)

    parsed = pd.to_datetime(selected["Date"], errors="coerce")
    selected["HomeTeam_norm"] = selected["Home"].map(wk_name_to_norm)
    selected["AwayTeam_norm"] = selected["Away"].map(wk_name_to_norm)
    selected["Date_dt"] = parsed
    selected["Date"] = parsed.dt.strftime("%Y-%m-%d")
    return selected[out_cols]

# ------------------- ClubElo (pre-partido) -------------------
def _build_clubelo_table(teams_norm: list) -> pd.DataFrame:
    """
    Download the ClubElo rating history of each team and stack it in one table.

    Parameters
    ----------
    teams_norm : 'norm' team names; each is translated through NORM_TO_CLUBELO
        (falling back to the title-cased name without spaces).

    Returns
    -------
    DataFrame with columns team_norm, Date (normalised to midnight) and Elo,
    sorted by team and date. Empty (same columns) if no team could be loaded.

    The download is best-effort: a team whose history is missing, empty, lacks
    a rating column or raises is skipped. Skipped teams are reported with the
    reason so a missing rating downstream can be traced back.
    """
    ce = sd.ClubElo()
    frames = []
    skipped = []  # (team, reason) pairs for the report below
    for tnorm in teams_norm:
        ce_name = NORM_TO_CLUBELO.get(tnorm) or tnorm.title().replace(" ", "")
        try:
            hist = ce.read_team_history(ce_name)
            if hist is None or hist.empty:
                skipped.append((tnorm, "historial vacío"))
                continue
            # The date lives in the index; expose it as a regular "Date" column.
            hist = hist.reset_index().rename(columns={hist.index.name or "index":"Date"})
            rating_col = next((c for c in ("elo", "Elo") if c in hist.columns), None)
            if rating_col is None:
                skipped.append((tnorm, "sin columna de rating"))
                continue
            df_ = hist[["Date", rating_col]].rename(columns={rating_col:"Elo"})
            df_["team_norm"] = tnorm
            frames.append(df_)
        except Exception as exc:
            # Keep going with the remaining teams, but leave a trace of the failure.
            skipped.append((tnorm, f"{type(exc).__name__}: {exc}"))
            continue

    if skipped:
        detail = "; ".join(f"{team} ({reason})" for team, reason in skipped)
        print(f"[ELO] Aviso: sin historial Elo para {len(skipped)} equipo(s): {detail}")

    if not frames:
        return pd.DataFrame(columns=["team_norm","Date","Elo"])
    elo = pd.concat(frames, ignore_index=True)
    elo["Date"] = pd.to_datetime(elo["Date"]).dt.normalize()
    elo = elo.sort_values(["team_norm","Date"]).reset_index(drop=True)
    return elo

def _merge_asof_by_team(left, right, by_col, left_time_col, right_time_col, right_val_col, out_col):
    out_parts = []
    for team, subL in left.groupby(by_col, sort=False):
        subL = subL.sort_values(left_time_col).copy()
        subR = right[right[by_col] == team].sort_values(right_time_col)
        if subR.empty:
            subL[out_col] = np.nan
        else:
            tmp = pd.merge_asof(
                subL,
                subR[[right_time_col, right_val_col]].rename(columns={right_time_col: "_rtime", right_val_col: out_col}),
                left_on=left_time_col, right_on="_rtime",
                direction="backward", allow_exact_matches=False  # ← PRE-PARTIDO
            ).drop(columns=["_rtime"])
            subL = tmp
        out_parts.append(subL)
    return pd.concat(out_parts, axis=0).sort_index()

def attach_elo(fixt: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of `fixt` with pre-match ratings in `h_elo` / `a_elo`.

    A `Date_dt` column is created from `Date` when absent; missing dates are
    replaced by RUN_DATE (or the current Europe/Madrid time if RUN_DATE is not
    defined). When no rating could be downloaded both columns are NaN.
    """
    out = fixt.copy()

    # Make sure there is a datetime column to match ratings against.
    if "Date_dt" not in out.columns:
        out["Date_dt"] = pd.to_datetime(out["Date"], errors="coerce") if "Date" in out.columns else pd.NaT
    if out["Date_dt"].isna().any():
        fallback_ts = pd.to_datetime(globals().get("RUN_DATE", pd.Timestamp.now(TZ))).tz_localize(None)
        out.loc[out["Date_dt"].isna(), "Date_dt"] = fallback_ts

    home_teams = set(out["HomeTeam_norm"])
    away_teams = set(out["AwayTeam_norm"])
    elo = _build_clubelo_table(sorted(home_teams | away_teams))
    if elo.empty:
        out["h_elo"] = np.nan
        out["a_elo"] = np.nan
        return out

    # Same lookup twice: first keyed by the home team, then by the away team.
    for team_col, rating_col in (("HomeTeam_norm", "h_elo"), ("AwayTeam_norm", "a_elo")):
        ratings = elo.rename(columns={"team_norm": team_col})
        out = _merge_asof_by_team(out, ratings, team_col, "Date_dt", "Date", "Elo", rating_col)
    return out

# ------------------- Consolidación de jornada -------------------
JORNADA_COL = "Matchweek"   # Canonical matchweek column (keeps the existing 'Matchweek' name)

def consolidate_jornada(df_in: pd.DataFrame, jornada_col: str = JORNADA_COL) -> pd.DataFrame:
    """
    Return a copy of `df_in` with the matchweek held in a single column.

    - `jornada_col` is created if absent and typed as nullable Int64.
    - If a 'Wk' column exists, its values fill the gaps of `jornada_col`
      and 'Wk' is then removed.
    """
    def _as_int64(values):
        return pd.to_numeric(values, errors="coerce").astype("Int64")

    result = df_in.copy()
    if jornada_col not in result.columns:
        result[jornada_col] = pd.Series(dtype="Int64")
    result[jornada_col] = _as_int64(result[jornada_col])

    if "Wk" in result.columns:
        from_wk = _as_int64(result["Wk"])
        result[jornada_col] = result[jornada_col].fillna(from_wk)
        result = result.drop(columns=["Wk"])

    result[jornada_col] = _as_int64(result[jornada_col])
    return result

# ------------------- API de alto nivel -------------------
def append_next_matchday_with_elo(
    df: pd.DataFrame,
    run_date: str,
    wk_path: Path = WEEK_PARQUET,
    expected_matches: int = 10
) -> tuple[pd.DataFrame, pd.DataFrame, dict]:
    """
    Append the fixtures of the next matchweek, with pre-match Elo, to `df`.

    Steps:
      - Select the next matchweek by date using ONLY the WK table.
      - Build its fixtures and attach the pre-match Elo.
      - Append only the fixtures not already present, keyed by
        (Season, HomeTeam_norm, AwayTeam_norm).
      - Consolidate the matchweek into the single canonical column
        (JORNADA_COL='Matchweek').

    Parameters
    ----------
    df : historical match frame. NOTE: columns missing from it are added to the
        passed object itself (it is not copied first).
    run_date : reference date used to choose the matchweek.
    wk_path : WK parquet to read.
    expected_matches : number of fixtures that make a matchweek complete.

    Returns
    -------
    (df_out, added_rows, info) where `info` has the keys "season", "wk" and
    "added". `added_rows` is an empty DataFrame when nothing was appended.
    """
    wk_raw = load_wk_table(wk_path)
    season, wk_no = choose_next_wk_strict(wk_raw, run_date, expected=expected_matches)
    fixt = build_fixtures_for_wk(wk_raw, season, wk_no)
    if fixt.empty:
        print(f"[MD] Season={season} Wk={wk_no} sin fixtures en WK.")
        df_out = consolidate_jornada(df, JORNADA_COL)
        return df_out, pd.DataFrame(), {"season":season,"wk":wk_no,"added":0}

    # Attach the pre-match Elo, then rebuild the string date from Date_dt
    # (attach_elo may have filled missing dates).
    fixt = attach_elo(fixt)
    fixt["Date"] = pd.to_datetime(fixt["Date_dt"], errors="coerce").dt.strftime("%Y-%m-%d")

    # Rename Wk -> canonical matchweek column in the fixtures
    if "Wk" in fixt.columns and JORNADA_COL != "Wk":
        fixt = fixt.rename(columns={"Wk": JORNADA_COL})
    # Enforce dtypes
    fixt["Season"] = pd.to_numeric(fixt["Season"], errors="coerce").astype("Int64")
    if JORNADA_COL in fixt.columns:
        fixt[JORNADA_COL] = pd.to_numeric(fixt[JORNADA_COL], errors="coerce").astype("Int64")

    # Prepare df (create missing columns). This writes into the caller's frame.
    need_cols = ["Season", "HomeTeam_norm", "AwayTeam_norm", "h_elo", "a_elo", JORNADA_COL]
    for c in need_cols:
        if c not in df.columns:
            if c in ["h_elo","a_elo"]:
                df[c] = np.nan
            else:
                df[c] = pd.Series(dtype="object")
    # Type the canonical matchweek column in df
    df[JORNADA_COL] = pd.to_numeric(df[JORNADA_COL], errors="coerce").astype("Int64")

    # De-duplication by the key Season+Home+Away
    # NOTE(review): errors="ignore" in pd.to_numeric is, to my knowledge, deprecated
    # in recent pandas releases — confirm against the pandas version in use.
    have = set(zip(
        pd.to_numeric(df["Season"], errors="ignore").astype("Int64").astype(int, errors="ignore"),
        df["HomeTeam_norm"].astype(str),
        df["AwayTeam_norm"].astype(str)
    ))
    fixt["key"] = list(zip(
        pd.to_numeric(fixt["Season"], errors="ignore").astype("Int64").astype(int, errors="ignore"),
        fixt["HomeTeam_norm"].astype(str),
        fixt["AwayTeam_norm"].astype(str)
    ))
    to_add = fixt[~fixt["key"].isin(have)].copy()

    if to_add.empty:
        print(f"[MD] Season={season} Wk={wk_no} Added=0 (ya presentes).")
        df_out = consolidate_jornada(df, JORNADA_COL)
        return df_out, pd.DataFrame(), {"season":season,"wk":wk_no,"added":0}

    # Build the rows to append using ONLY the canonical column
    add_cols = ["Date","Season",JORNADA_COL,"HomeTeam_norm","AwayTeam_norm","h_elo","a_elo"]
    add_rows = to_add[add_cols].copy()

    # Concatenate and consolidate (drops 'Wk' if it was still present in df)
    out = pd.concat([df, add_rows], ignore_index=True, sort=False)
    out = consolidate_jornada(out, JORNADA_COL)

    # Output formats: Date is returned as a YYYY-MM-DD string
    out["Date"] = pd.to_datetime(out["Date"], errors="coerce").dt.strftime("%Y-%m-%d")

    info = {"season": season, "wk": int(wk_no), "added": int(len(add_rows))}
    print(f"[MD] Season={season}  Wk={wk_no}  Added={len(add_rows)}")

    # Also return the appended rows, with the canonical column typed
    add_rows = add_rows.copy()
    add_rows[JORNADA_COL] = pd.to_numeric(add_rows[JORNADA_COL], errors="coerce").astype("Int64")
    return out, add_rows.reset_index(drop=True), info

# ------------------- EXECUTION -------------------
# Rebinds `df` to the frame that includes the next matchweek's fixtures.
df, added_rows_md, md_info = append_next_matchday_with_elo(
    df=df,
    run_date=RUN_DATE,           # Must exist in the environment; otherwise define RUN_DATE = '2025-10-02'
    wk_path=WEEK_PARQUET,
    expected_matches=10
)
print(f"[RESULT] Season={md_info['season']}  Wk={md_info['wk']}  Filas añadidas={md_info['added']}")

# --- Consolidation sanity checks ---
assert "Wk" not in df.columns, "Wk no debería existir tras consolidar"
assert "Matchweek" in df.columns, "Falta la columna canónica 'Matchweek'"

[WK] Usando: wk_actualizado_2005_2025.parquet


[MD] Season=2025  Wk=9  Added=10
[RESULT] Season=2025  Wk=9  Filas añadidas=10


In [None]:
# Show the fixtures that were just appended, with their pre-match Elo.
display(added_rows_md)

Unnamed: 0,Date,Season,Matchweek,HomeTeam_norm,AwayTeam_norm,h_elo,a_elo
0,2025-10-17,2025,9,real oviedo,espanol,1570.905396,1650.074707
1,2025-10-18,2025,9,ath madrid,osasuna,1854.951904,1689.319946
2,2025-10-18,2025,9,barcelona,girona,1930.029053,1600.169434
3,2025-10-18,2025,9,sevilla,mallorca,1671.639648,1626.763062
4,2025-10-18,2025,9,villarreal,betis,1788.739136,1757.166382
5,2025-10-19,2025,9,elche,ath bilbao,1613.940063,1753.952026
6,2025-10-19,2025,9,getafe,real madrid,1634.252441,1942.6073
7,2025-10-19,2025,9,levante,vallecano,1624.64917,1659.343384
8,2025-10-19,2025,9,celta,sociedad,1667.994751,1641.45459
9,2025-10-20,2025,9,alaves,valencia,1652.426514,1668.082886


Solo faltaría actualizar manualmente las cuotas de Bet365 con el siguiente código:

In [None]:
def _b365_target_idx(
    df: pd.DataFrame,
    run_date_str: str,
    n_max: int = 12,
    window_days: int = 3,
) -> list[int]:
    """
    Índices (del df original) de los partidos objetivo para la plantilla B365:

    - Partidos con Date >= RUN_DATE
    - B365H/B365D/B365A son NaN
    - Ordenados por fecha ascendente
    - SOLO aquellos cuya fecha está dentro de una ventana temporal corta
      respecto al primer partido futuro (window_days días)
    - Limitados a n_max
    """
    run_dt = pd.to_datetime(run_date_str)
    df_sorted = df.sort_values("Date", kind="mergesort").copy()

    # Todos los futuros sin cuota
    mask_future_nan = (
        (df_sorted["Date"] >= run_dt)
        & df_sorted[["B365H", "B365D", "B365A"]].isna().all(axis=1)
    )
    future_nan = df_sorted.loc[mask_future_nan].copy()

    if future_nan.empty:
        return []

    # Primer partido futuro sin cuotas
    first_date = future_nan["Date"].min()

    # Ventana temporal: p.ej. 3 días a partir del primero
    window_end = first_date + pd.Timedelta(days=window_days)

    # Nos quedamos solo con los partidos dentro de esa ventana
    in_window = future_nan[future_nan["Date"] <= window_end]

    idx = in_window.index.tolist()
    return idx[:n_max]

def make_b365_template(df: pd.DataFrame, n_tail: int = 10, out_csv: str | None = None) -> pd.DataFrame:
    """
    Build the CSV template in which the Bet365 odds are typed in by hand.

    Parameters
    ----------
    df : match frame; must contain Date, HomeTeam_norm and AwayTeam_norm.
        It is not modified (missing odds columns are added on a copy).
    n_tail : maximum number of matches to include.
    out_csv : if given, the template is also written to this path.

    Returns
    -------
    DataFrame with row_id (the index label of the match in `df`), Date
    (YYYY-MM-DD), both team names and empty B365H / B365D / B365A columns.

    Raises
    ------
    ValueError if a required column is missing.
    """
    odds_cols = ["B365H","B365D","B365A"]
    for c in ["Date","HomeTeam_norm","AwayTeam_norm"]:
        if c not in df.columns:
            raise ValueError(f"Falta columna requerida en df: {c}")

    missing_odds = [c for c in odds_cols if c not in df.columns]
    if missing_odds:
        # Work on a copy so the caller's frame does not gain columns as a side effect.
        df = df.copy()
        for c in missing_odds:
            df[c] = np.nan

    # Deterministic selection driven by RUN_DATE (instead of a generic "tail")
    target_idx = _b365_target_idx(df, RUN_DATE, n_max=n_tail)

    # No future match without odds (e.g. already filled in): fall back to the
    # latest `n_tail` matches by date that still have no odds.
    if not target_idx:
        dates = pd.to_datetime(df["Date"], errors="coerce")
        # Unparseable dates sort first so they never displace real matches from the tail.
        tail_idx = dates.sort_values(kind="mergesort", na_position="first").tail(n_tail).index
        na_mask = df.loc[tail_idx, odds_cols].isna().all(axis=1)
        target_idx = tail_idx[na_mask.to_numpy()].tolist()

    target = df.loc[target_idx, ["Date","HomeTeam_norm","AwayTeam_norm"]].copy()
    target["Date"] = pd.to_datetime(target["Date"], errors="coerce").dt.strftime("%Y-%m-%d")

    # row_id = REAL index label in df (the key used later to apply the odds)
    target.insert(0, "row_id", target.index.astype(int))

    # Empty columns to be filled in by hand
    for c in odds_cols:
        target[c] = np.nan

    if out_csv:
        target.to_csv(out_csv, index=False)
        print(f"Plantilla guardada en: {out_csv}\nNo cambies la columna 'row_id'. Solo rellena B365H/B365D/B365A.")
    return target

def apply_b365_from_template(df: pd.DataFrame, manual_template: pd.DataFrame | str, n_tail: int = 10) -> pd.DataFrame:
    """
    Copy the hand-filled Bet365 odds from a template into `df`.

    ONLY B365H / B365D / B365A of the target rows are updated. A template row
    is applied when its 'row_id' (index label in `df`) belongs to the current
    target set AND its team names match the row it points to; this prevents a
    stale template from writing odds onto a different match.

    Parameters
    ----------
    df : match frame; it is updated in place and also returned.
    manual_template : filled template, as a DataFrame or a CSV path.
    n_tail : maximum size of the target set.

    Returns
    -------
    `df` with the odds applied. When at least one template row is valid, Date
    is also converted to a YYYY-MM-DD string.

    Raises
    ------
    ValueError if the template lacks a required column.
    """
    if isinstance(manual_template, str):
        upd = pd.read_csv(manual_template)
    else:
        upd = manual_template.copy()

    must = {"row_id","Date","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"}
    missing = must - set(upd.columns)
    if missing:
        raise ValueError(f"Faltan columnas en la plantilla: {sorted(missing)}")

    odds_cols = ["B365H","B365D","B365A"]
    for c in odds_cols:
        upd[c] = pd.to_numeric(upd[c], errors="coerce")

    # Recompute the target set on the current df (same criterion as the template)
    target_idx_now = set(_b365_target_idx(df, RUN_DATE, n_max=n_tail))

    upd["row_id"] = pd.to_numeric(upd["row_id"], errors="coerce").astype("Int64")
    upd_valid = upd[upd["row_id"].isin(target_idx_now)].dropna(subset=["row_id"]).copy()

    if upd_valid.empty:
        print("No hay filas válidas para actualizar (¿cambiaste 'row_id' o esas filas ya no son futuras/NaN?).")
        return df

    # Team columns available on both sides, used to double-check each row_id.
    team_cols = [c for c in ("HomeTeam_norm", "AwayTeam_norm") if c in df.columns]

    applied = 0
    mismatched = []
    for _, r in upd_valid.iterrows():
        ridx = int(r["row_id"])
        same_match = all(str(df.loc[ridx, c]).strip() == str(r[c]).strip() for c in team_cols)
        if not same_match:
            mismatched.append(ridx)
            continue
        df.loc[ridx, odds_cols] = [r["B365H"], r["B365D"], r["B365A"]]
        applied += 1

    if mismatched:
        print(f"Aviso: {len(mismatched)} fila(s) ignoradas porque los equipos no coinciden con el df (row_id: {sorted(mismatched)}).")

    print(f"Actualizadas {applied} fila(s) por 'row_id'.")
    still_nan = df.loc[list(target_idx_now), odds_cols].isna().all(axis=1).sum()
    print(f"Quedan {still_nan} partidos con B365* = NaN entre las {min(n_tail, len(target_idx_now))} filas objetivo.")
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
    return df

In [None]:
# --- CONTROL DEL FLUJO B365: CREAR PLANTILLA o CONSUMIR CSV RELLENADO ---

# Asegura índice limpio y formatos
df = df.reset_index(drop=True)
df["Date"] = pd.to_datetime(df["Date"], errors="coerce")

# Cálculo de n_tail (cuántos partidos recientes sin cuotas abarcar)
mask_future = df["Date"] >= pd.to_datetime(RUN_DATE)
has_b365_cols = {"B365H","B365D","B365A"}.issubset(df.columns)
if not has_b365_cols:
    for c in ["B365H","B365D","B365A"]:
        if c not in df.columns:
            df[c] = np.nan

mask_nan = df[["B365H","B365D","B365A"]].isna().all(axis=1)
n_tail = int(df[mask_future & mask_nan].shape[0])

if n_tail == 0:
    recent = df.sort_values("Date").tail(30)
    n_tail = int(recent[["B365H","B365D","B365A"]].isna().all(axis=1).sum())

n_tail = max(1, min(n_tail, 12))  # límite sano

# Rutas de plantilla y rellenado según RUN_DATE (y compatibilidad con tu nombre antiguo)
tpl_path    = MANUAL_DIR / f"b365_template_{RUN_DATE}.csv"
filled_path = MANUAL_DIR / f"b365_filled_{RUN_DATE}.csv"
fallbacks   = [MANUAL_DIR / "plantilla_bet365.csv"]  # compatibilidad

def _do_make_template():
    _ = make_b365_template(df.copy(), n_tail=n_tail, out_csv=str(tpl_path))
    print(f"[B365] Plantilla generada ({n_tail} filas): {tpl_path}")
    print(f"      Rellena solo B365H/B365D/B365A y guarda como: {filled_path.name}")

def _do_consume(fpath: Path):
    """Apply the filled-in odds CSV at `fpath` to the global `df`.

    Rebinds the module-level `df` to the frame returned by
    `apply_b365_from_template` and stores `Date` back as 'YYYY-MM-DD' strings
    (unparseable dates become NaN).
    """
    global df
    df = apply_b365_from_template(df, str(fpath), n_tail=n_tail)
    parsed_dates = pd.to_datetime(df["Date"], errors="coerce")
    df["Date"] = parsed_dates.dt.strftime("%Y-%m-%d")
    print(f"[B365] Cuotas aplicadas desde: {fpath.name}. Continúo con el pipeline.")

# --- Execution modes ---
if MODE == "make_template":
    _do_make_template()
    # STAGE 1 stops here: the notebook ends once the template has been written.
    sys.exit(0)

# "consume" and "auto" (any other value) both look for a filled-in CSV first:
# the dated file takes precedence, then the legacy file name(s).
filled_candidates = [filled_path, *fallbacks]
csv_to_apply = next((cand for cand in filled_candidates if cand.exists()), None)

if csv_to_apply is not None:
    _do_consume(csv_to_apply)
elif MODE == "consume":
    # Explicit consume mode with nothing to consume is an error.
    raise FileNotFoundError(
        f"No existe el CSV rellenado: {filled_path.name} "
        f"(ni el legado {fallbacks[0].name}). Sube el CSV antes de continuar."
    )
else:
    # Auto mode without a filled-in CSV: write the template and stop here.
    _do_make_template()
    sys.exit(0)

Actualizadas 10 fila(s) por 'row_id'.
Quedan 0 partidos con B365* = NaN entre las 10 filas objetivo.
[B365] Cuotas aplicadas desde: b365_filled_2025-10-09.csv. Continúo con el pipeline.


# Creación de variables

Uno de los aspectos fundamentales a considerar en la construcción de modelos predictivos es la prevención del data leakage, es decir, la **incorporación en el conjunto de entrenamiento de variables que contienen información que no estaría disponible en el momento real de la predicción**. En el dataset empleado para este trabajo, que recoge información detallada de partidos de la Primera División española, se identificaron diversas variables que incurren en esta problemática. Concretamente, variables como los goles totales (`FTHG`, `FTAG`), el resultado final (`FTR`), las estadísticas de mitad de partido (`HTHG`, `HTAG`, `HTR`) y otras métricas post-partido como tiros, faltas, córners, tarjetas o disparos a puerta representan información generada una vez disputado el encuentro. La inclusión de estos campos en el modelo supondría una fuga de información desde el futuro hacia el presente, lo que comprometería gravemente la validez del proceso de entrenamiento y evaluación. Por ello, dichas variables han de ser excluidas del conjunto de entrenamiento.

Dado que múltiples variables del dataset original recogen estadísticas generadas durante el transcurso del propio partido (goles, tiros, tarjetas, etc.), y por tanto no pueden utilizarse como predictoras sin incurrir en data leakage, se optó por sustituirlas por métricas históricas calculadas exclusivamente con datos previos al encuentro. En concreto, se construyeron variables agregadas como la media de goles anotados, disparos realizados o córners obtenidos por cada equipo en sus últimos encuentros disputados antes del partido en cuestión. Estas variables permiten capturar la dinámica reciente de los equipos de forma legítima y temporalmente coherente, manteniendo la validez del modelo predictivo.

In [None]:
# Rolling pre-match averages of in-game statistics.
#
# Each match is unpivoted into two long-format rows (one per team); for every
# statistic the mean over the team's previous `window_size` appearances is
# computed (shift(1) keeps the current match out, so nothing leaks), and the
# result is merged back as home_avg_* / away_avg_* columns.
#
# The long table is built column-wise rather than with an iterrows() loop, which
# also avoids rebinding the global name `date` (imported from `datetime` in the
# first cell of the notebook).
window_size = 7

# Long-format statistic -> (home column, away column) in the match table.
stat_sources = {
    'Shots': ('HS', 'AS'),
    'ShotsOnTarget': ('HST', 'AST'),
    'Fouls': ('HF', 'AF'),
    'Corners': ('HC', 'AC'),
    'Yellows': ('HY', 'AY'),
    'Reds': ('HR', 'AR'),
    'xG': ('h_xg', 'a_xg'),
}
stats = list(stat_sources)


def _side_stats(matches, team_col, side_pos):
    """Return one long-format row per match for one side (0 = home, 1 = away)."""
    side = matches[[team_col, 'Date', 'Season']].rename(columns={team_col: 'Team'})
    for stat_name, source_cols in stat_sources.items():
        source = source_cols[side_pos]
        # xG is optional: fall back to NaN when the column is absent.
        if stat_name == 'xG' and source not in matches.columns:
            side[stat_name] = np.nan
        else:
            side[stat_name] = matches[source]
    return side


team_stats = pd.concat(
    [_side_stats(df, 'HomeTeam_norm', 0), _side_stats(df, 'AwayTeam_norm', 1)],
    ignore_index=True,
)
team_stats[stats] = team_stats[stats].apply(pd.to_numeric, errors='coerce')

avg_cols = [f'{stat}_avg_last{window_size}' for stat in stats]

# Sort once (stable) and reuse the grouping; results come back aligned on the
# original row index, so they can be assigned straight into `team_stats`.
by_team_in_time = team_stats.sort_values('Date', kind='mergesort').groupby('Team')
for stat, avg_col in zip(stats, avg_cols):
    team_stats[avg_col] = by_team_in_time[stat].transform(
        lambda x: x.shift(1).rolling(window=window_size, min_periods=1).mean()
    )

# Attach the averages twice: once keyed on the home team, once on the away team.
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    side_names = {
        f'{stat}_avg_last{window_size}': f'{side}_avg_{stat.lower()}_last{window_size}'
        for stat in stats
    }
    side_avgs = team_stats[['Team', 'Date'] + avg_cols].rename(
        columns={'Team': team_col, **side_names}
    )
    df = df.merge(side_avgs, on=[team_col, 'Date'], how='left')

* **PUNTOS O GOLES**

La diferencia de goles es un mejor predictor del rendimiento futuro que los puntos.

* `home_points_cum` / `away_points_cum`: Puntos acumulados por el equipo local/visitante hasta antes del partido actual

* `home_gd_cum` / `away_gd_cum`: Diferencia de goles acumulada (FTHG - FTAG) hasta antes del partido actual

In [None]:
# Points and goal difference of each match, seen from each side.
_points_home = {'H': 3, 'D': 1, 'A': 0}
_points_away = {'H': 0, 'D': 1, 'A': 3}
df['home_points'] = df['FTR'].map(_points_home)
df['away_points'] = df['FTR'].map(_points_away)

df['home_gd'] = df['FTHG'].sub(df['FTAG'])
df['away_gd'] = df['FTAG'].sub(df['FTHG'])

# Chronological order within each season, so cumulative sums run forward in time.
df = df.sort_values(['Season', 'Date']).reset_index(drop=True)


def _total_before_match(values):
    """Running total of the earlier rows only (NaN counted as 0)."""
    return values.fillna(0).cumsum().shift(1)


# Venue-specific totals: home columns accumulate over the team's home games in
# the season, away columns over its away games. The first game of each group
# has no history and is set to 0.
for target, team_col, source in (
    ('home_points_cum', 'HomeTeam_norm', 'home_points'),
    ('away_points_cum', 'AwayTeam_norm', 'away_points'),
    ('home_gd_cum', 'HomeTeam_norm', 'home_gd'),
    ('away_gd_cum', 'AwayTeam_norm', 'away_gd'),
):
    per_team_season = df.groupby(['Season', team_col])[source]
    df[target] = per_team_season.transform(_total_before_match).fillna(0)

Además, también se crean:

* `home_total_points_cum`: puntos totales acumulados por el equipo local (jugando en casa o fuera) hasta antes de ese partido

* `away_total_points_cum`: puntos totales acumulados por el equipo visitante hasta antes de ese partido

* `home_total_gd_cum`: diferencia de goles acumulada por el equipo local

* `away_total_gd_cum`: diferencia de goles acumulada por el visitante

In [None]:
# Long format: one row per (team, match) regardless of venue.
home_df = df[['Season', 'Date', 'HomeTeam_norm', 'home_points', 'home_gd']].copy()
home_df.columns = ['Season', 'Date', 'Team', 'Points', 'GD']
away_df = df[['Season', 'Date', 'AwayTeam_norm', 'away_points', 'away_gd']].copy()
away_df.columns = ['Season', 'Date', 'Team', 'Points', 'GD']

team_perf = (
    pd.concat([home_df, away_df], axis=0, ignore_index=True)
      .sort_values(['Season', 'Team', 'Date'])
)

# Season-to-date totals before each match (NaN counted as 0; first match gets 0).
for source, target in (('Points', 'team_points_cum'), ('GD', 'team_gd_cum')):
    earlier_total = (
        team_perf.groupby(['Season', 'Team'])[source]
                 .transform(lambda s: s.fillna(0).cumsum().shift(1))
    )
    team_perf[target] = earlier_total.fillna(0)

# Bring the totals back to the match table, once per side.
cum_lookup = team_perf[['Season', 'Date', 'Team', 'team_points_cum', 'team_gd_cum']]
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    merged = df.merge(
        cum_lookup,
        left_on=['Season', 'Date', team_col],
        right_on=['Season', 'Date', 'Team'],
        how='left',
    )
    merged = merged.rename(columns={
        'team_points_cum': f'{side}_total_points_cum',
        'team_gd_cum': f'{side}_total_gd_cum',
    })
    df = merged.drop(columns='Team')

Partidos acumulados:

In [None]:
# Midnight-normalised datetime plus a per-row id used for ordering and tie-breaks.
df = df.assign(
    Date=pd.to_datetime(df['Date']).dt.normalize(),
    row_id=np.arange(len(df)),  # one id per match row
)

# Long format: one row per (Season, Team, match).
home_long, away_long = (
    df[['row_id', 'Season', 'Date', team_col]].rename(columns={team_col: 'Team'})
    for team_col in ('HomeTeam_norm', 'AwayTeam_norm')
)

# Chronological order per team; row_id breaks same-day ties.
long_matches = (
    pd.concat([home_long, away_long], ignore_index=True)
      .sort_values(['Season', 'Team', 'Date', 'row_id'])
)

# Number of earlier matches of the team in the season (current one excluded).
long_matches['matches_prev'] = long_matches.groupby(['Season', 'Team']).cumcount()

# Back to wide format: one lookup per side, joined on row_id + team.
home_prev = long_matches[['row_id', 'Team', 'matches_prev']].rename(
    columns={'Team': 'HomeTeam_norm', 'matches_prev': 'home_total_matches_prev'}
)
away_prev = long_matches[['row_id', 'Team', 'matches_prev']].rename(
    columns={'Team': 'AwayTeam_norm', 'matches_prev': 'away_total_matches_prev'}
)

df = df.merge(home_prev, on=['row_id', 'HomeTeam_norm'], how='left')
df = df.merge(away_prev, on=['row_id', 'AwayTeam_norm'], how='left')
df = df.drop(columns=['row_id'])

* **DEFENSA O ATAQUE**

¿Qué estilo de juego predice mejor el rendimiento en fútbol: el defensivo o el ofensivo?

Vamos a crear la variable `playstyle` que clasifique a cada equipo como "ofensivo", "defensivo" o "equilibrado" usando la métrica diferencia de goles media en los últimos 6 partidos.

De esta forma tendríamos una media móvil de diferencia de goles (goal_diff) en los últimos 6 partidos del equipo (como local y visitante).

In [None]:
window = 6                   # maximum number of previous matches considered
prev_weight = 0.7            # weight given to matches from a season other than the current one
min_total_periods = 3        # minimum previous matches needed when it is NOT the team's debut season
min_periods_promoted = 1     # minimum previous matches needed in the team's debut season
thr_off = 0.75               # threshold for the offensive style
thr_def = -0.75              # threshold for the defensive style
fill_neutral_on_nan = False  # if True, missing styles are filled with 'equilibrado' (cold start)

# Goal difference of every match from each side, in long format.
home = df[['Season', 'Date', 'HomeTeam_norm', 'home_gd']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_gd': 'gd'}
)
away = df[['Season', 'Date', 'AwayTeam_norm', 'away_gd']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_gd': 'gd'}
)

perf = (
    pd.concat([home, away], ignore_index=True)
      .sort_values(['Team', 'Date'])
      .reset_index(drop=True)
)

# A row is flagged when it belongs to the first season in which the team
# appears in this dataset.
first_season_by_team = perf.groupby('Team')['Season'].transform('min')
perf['is_promoted_season'] = perf['Season'].eq(first_season_by_team)

def weighted_hybrid_gd_mean_with_promoted(group: pd.DataFrame) -> pd.Series:
    """Pre-match goal-difference mean for every row of one team's match history.

    `group` must hold the columns 'Season', 'gd' and 'is_promoted_season', with
    rows already in chronological order. For row i only the up-to-`window` rows
    immediately before it are considered:

      - flagged rows (`is_promoted_season` true): plain mean of the previous
        rows that belong to the same season; needs at least
        `min_periods_promoted` of them.
      - other rows: weighted mean of all previous rows in the window, weight 1.0
        for the same season and `prev_weight` for any other season; needs at
        least `min_total_periods` rows in the window.

    Rows that do not meet the requirement (including the first one) are NaN.
    Reads the module-level `window`, `prev_weight`, `min_total_periods` and
    `min_periods_promoted`. Returns a Series indexed like `group`.
    """
    season_arr = group['Season'].to_numpy()
    gd_arr = group['gd'].to_numpy()
    debut_arr = group['is_promoted_season'].to_numpy()
    total = len(group)
    result = np.full(total, np.nan, dtype=float)

    # Position 0 has no history, so the scan starts at 1.
    for pos in range(1, total):
        lo = max(0, pos - window)
        hist_seasons = season_arr[lo:pos]
        hist_gd = gd_arr[lo:pos]
        in_current_season = hist_seasons == season_arr[pos]

        if debut_arr[pos]:
            same_season_gd = hist_gd[in_current_season]
            if same_season_gd.size >= min_periods_promoted:
                result[pos] = same_season_gd.mean()
        elif (pos - lo) >= min_total_periods:
            row_weights = np.where(in_current_season, 1.0, prev_weight)
            weight_total = row_weights.sum()
            if weight_total > 0:
                result[pos] = np.dot(hist_gd, row_weights) / weight_total

    return pd.Series(result, index=group.index)

# Run the per-team calculation group by group and stitch the pieces together.
# DataFrameGroupBy.apply with a Series-returning function is avoided on purpose:
# its output shape depends on the groups (a single group yields a wide
# DataFrame) and recent pandas deprecates passing the grouping column to the
# function. Each piece keeps its rows' index, so the assignment aligns by index.
_gd_mean_by_team = [
    weighted_hybrid_gd_mean_with_promoted(team_rows)
    for _, team_rows in perf.groupby('Team')
]
perf['gd_mean_hybrid'] = pd.concat(_gd_mean_by_team) if _gd_mean_by_team else np.nan

def clasificar_estilo(gd_mean, thr_off=thr_off, thr_def=thr_def):
    """Map a mean goal difference to a playing-style label.

    Returns NaN for a missing value, 'ofensivo' when the mean reaches `thr_off`,
    'defensivo' when it is at or below `thr_def`, and 'equilibrado' otherwise.
    The defaults are the module-level thresholds as they were when this
    function was defined.
    """
    if pd.isna(gd_mean):
        return np.nan
    if gd_mean >= thr_off:
        return 'ofensivo'
    return 'defensivo' if gd_mean <= thr_def else 'equilibrado'

# Label every (team, match) row with the style implied by its pre-match GD mean.
perf['estilo_de_juego'] = perf['gd_mean_hybrid'].apply(clasificar_estilo)

# Optional cold-start handling: treat an unknown style as neutral.
if fill_neutral_on_nan:
    perf['estilo_de_juego'] = perf['estilo_de_juego'].fillna('equilibrado')

# Attach the label to the match table for each side.
style_lookup = perf[['Season', 'Date', 'Team', 'estilo_de_juego']]
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    merged = df.merge(
        style_lookup,
        left_on=['Season', 'Date', team_col],
        right_on=['Season', 'Date', 'Team'],
        how='left',
    )
    df = (
        merged.rename(columns={'estilo_de_juego': f'{side}_playstyle'})
              .drop(columns='Team')
    )

* **TENDENCIA O REVERSIÓN A LA MEDIA**

Es más probable que un equipo que mejora vuelva a empeorar (reversión a la media).

Para medir este tipo de situaciones vamos a crear tres variables nuevas:

* `dynamic_pos_change_prev_season`: Cambio de posición respecto a la temporada anterior (puede ser positivo o negativo). Es la mejor proxy del “efecto mejora o empeoramiento”. La versión "estática" solo compara posiciones al final de temporadas anteriores; aquí se intenta hacerla dinámica, comparando la posición final de la temporada anterior con la posición del equipo antes de cada partido, lo que da una idea clara de cómo está rindiendo el equipo en un momento determinado respecto a su posición histórica.

* `form_points_6` / `form_gd_6`: Suma de puntos / dg en los últimos 6 partidos. Detecta tendencias a corto plazo, útil como feature directa y para medir si hay sobre-rendimiento.

In [None]:
# League position of each team BEFORE each of its matches.
#
# For every season and every team-matchday j >= 2, a standings table is built
# from all rows with Matchday < j (points, then goal difference, both
# descending) and each team playing its j-th match receives its rank in that
# table. A team's first match of a season has no table yet and stays NaN.
df = df.sort_values("Date").copy()

home_df = df[['Season', 'Date', 'HomeTeam_norm', 'home_points', 'home_gd']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_points': 'Points', 'home_gd': 'GD'}
)
away_df = df[['Season', 'Date', 'AwayTeam_norm', 'away_points', 'away_gd']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_points': 'Points', 'away_gd': 'GD'}
)

# ignore_index gives every (team, match) row its own label, which the
# label-based assignment below relies on.
team_perf = (
    pd.concat([home_df, away_df], axis=0, ignore_index=True)
      .sort_values(['Season', 'Date'])
)

# Matchday = ordinal of the match within the team's own season.
team_perf['Matchday'] = team_perf.groupby(['Season', 'Team']).cumcount() + 1

team_perf['prev_position'] = np.nan

for season in team_perf['Season'].unique():
    df_season = team_perf[team_perf['Season'] == season]
    last_matchday = int(df_season['Matchday'].max())
    for jornada in range(2, last_matchday + 1):
        tabla_prev = (
            df_season[df_season['Matchday'] < jornada]
            .groupby('Team')[['Points', 'GD']]
            .sum()
            .sort_values(['Points', 'GD'], ascending=[False, False])
        )
        tabla_prev['Position'] = range(1, len(tabla_prev) + 1)

        # One lookup for all teams playing this matchday, instead of a
        # full-frame boolean mask per team; teams absent from the table map to NaN.
        rows_jornada = df_season.index[df_season['Matchday'] == jornada]
        team_perf.loc[rows_jornada, 'prev_position'] = (
            df_season.loc[rows_jornada, 'Team'].map(tabla_prev['Position']).astype(float)
        )

team_perf['prev_position'] = team_perf['prev_position'].astype(float)

prev_pos = team_perf[['Season', 'Date', 'Team', 'prev_position']].copy()

# Attach the pre-match position for each side of the fixture.
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    df = (
        df.merge(
            prev_pos,
            how='left',
            left_on=['Season', 'Date', team_col],
            right_on=['Season', 'Date', 'Team'],
        )
        .rename(columns={'prev_position': f'{side}_prev_position'})
        .drop(columns='Team')
    )

In [None]:
# Season totals per team, built from both the home and the away rows.
home = df[['Season', 'Date', 'HomeTeam_norm', 'home_points', 'home_gd']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_points': 'Points', 'home_gd': 'GD'}
)
away = df[['Season', 'Date', 'AwayTeam_norm', 'away_points', 'away_gd']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_points': 'Points', 'away_gd': 'GD'}
)
team_season = pd.concat([home, away], axis=0)

team_season_total = (
    team_season.groupby(['Season', 'Team'], as_index=False)[['Points', 'GD']].sum()
               .sort_values(['Season', 'Points', 'GD'], ascending=[True, False, False])
)

# Rank within each season (1 = most points, ties broken by goal difference) ...
team_season_total['prev_season_final_position'] = (
    team_season_total.groupby('Season').cumcount() + 1
)
# ... then shift the season key forward by one, so the rank joins onto the
# FOLLOWING season's matches.
team_season_total['Season'] = team_season_total['Season'] + 1

final_pos_lookup = team_season_total[['Season', 'Team', 'prev_season_final_position']]
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    merged = df.merge(
        final_pos_lookup,
        left_on=['Season', team_col],
        right_on=['Season', 'Team'],
        how='left',
    )
    df = (
        merged.rename(columns={'prev_season_final_position': f'{side}_final_position_prev_season'})
              .drop(columns='Team')
    )

# Previous season's final rank minus the current pre-match rank.
for side in ('home', 'away'):
    df[f'{side}_dynamic_pos_change_prev_season'] = (
        df[f'{side}_final_position_prev_season'] - df[f'{side}_prev_position']
    )

In [None]:
# Points per (team, match) in long format.
home_form = df[['Season', 'Date', 'HomeTeam_norm', 'home_points']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_points': 'Points'}
)
away_form = df[['Season', 'Date', 'AwayTeam_norm', 'away_points']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_points': 'Points'}
)

team_form = (
    pd.concat([home_form, away_form], axis=0)
      .sort_values(['Season', 'Team', 'Date'])
)


def _points_previous_six(points):
    """Sum over the up-to-6 rows before each row (the row itself is excluded)."""
    return points.shift().rolling(window=6, min_periods=1).sum()


# Computed within each (Season, Team), so form restarts every season.
team_form['form_points_6'] = (
    team_form.groupby(['Season', 'Team'])['Points'].transform(_points_previous_six)
)

form_points_lookup = team_form[['Season', 'Date', 'Team', 'form_points_6']]
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    merged = df.merge(
        form_points_lookup,
        left_on=['Season', 'Date', team_col],
        right_on=['Season', 'Date', 'Team'],
        how='left',
    )
    df = (
        merged.rename(columns={'form_points_6': f'{side}_form_points_6'})
              .drop(columns='Team')
    )

In [None]:
# Goal difference per (team, match) in long format.
home_gd_form = df[['Season', 'Date', 'HomeTeam_norm', 'home_gd']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_gd': 'GD'}
)
away_gd_form = df[['Season', 'Date', 'AwayTeam_norm', 'away_gd']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_gd': 'GD'}
)

team_gd_form = (
    pd.concat([home_gd_form, away_gd_form], axis=0)
      .sort_values(['Season', 'Team', 'Date'])
)


def _gd_previous_six(goal_diffs):
    """Sum over the up-to-6 rows before each row (the row itself is excluded)."""
    return goal_diffs.shift().rolling(window=6, min_periods=1).sum()


# Computed within each (Season, Team), so form restarts every season.
team_gd_form['form_gd_6'] = (
    team_gd_form.groupby(['Season', 'Team'])['GD'].transform(_gd_previous_six)
)

form_gd_lookup = team_gd_form[['Season', 'Date', 'Team', 'form_gd_6']]
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    merged = df.merge(
        form_gd_lookup,
        left_on=['Season', 'Date', team_col],
        right_on=['Season', 'Date', 'Team'],
        how='left',
    )
    df = (
        merged.rename(columns={'form_gd_6': f'{side}_form_gd_6'})
              .drop(columns='Team')
    )

* **AL BORDE DEL ABISMO**

Los equipos que están en zona de descenso o cerca, mejoran notablemente su rendimiento en las últimas jornadas. A la vez, los equipos en “zona de nadie” empeoran.

Para medir esa presión contextual podemos crear variables como:

* `position_zone`: variable categórica que indica en qué zona de la tabla se encuentra el equipo antes del partido que representa el registro ('champions', 'europa', 'mid_table', 'descenso').

Otra posible variable es indicar si el equipo ya está matemáticamente salvado o no.

In [None]:
def classify_zone(pos):
    """Map a league position to its table zone.

    Parameters
    ----------
    pos : number or missing
        League position (1 = leader). May be NaN/None when no standings exist
        yet, e.g. a team's first match of a season.

    Returns
    -------
    str or float
        'champions' (pos <= 4), 'europa' (<= 6), 'mid_table' (<= 17),
        'descenso' (anything above), or NaN when `pos` is missing.
    """
    # A missing position fails every `<=` comparison and would otherwise fall
    # through to 'descenso'; keep it missing instead.
    if pd.isna(pos):
        return np.nan
    if pos <= 4:
        return 'champions'
    elif pos <= 6:
        return 'europa'
    elif pos <= 17:
        return 'mid_table'
    else:
        return 'descenso'

# Zone of the table each side occupies before the match.
for side in ('home', 'away'):
    df[f'{side}_position_zone'] = df[f'{side}_prev_position'].apply(classify_zone)

* **CASA O FUERA**

¿Es más útil para predecir un partido fijarse en el rendimiento general de un equipo, o específicamente en su comportamiento como local/visitante?

Ya tenemos variables que miden el comportamiento local y visitante de los equipos como son `home_points_cum` / `away_points_cum` y `home_gd_cum` / `away_gd_cum`.

* **LA IMPORTANCIA DEL PASADO**

Supuestamente la mejor ventana predictiva es la que incluye la temporada actual y las 2 anteriores. Más allá de eso, el valor predictivo se estanca o incluso se reduce.



* **MÁS ALLÁ DE LOS GOLES**

Este apartado induce a la creación de variables basadas en las diferencias de puntos, tiros y goles entre los dos equipos del partido.

Una de las variables más interesantes que nos deja este capítulo es la de `effectiveness` pero calculada de forma dinámica acumulada, es decir, se trata de la división entre los puntos acumulados hasta antes del partido y los tiros a puerta realizados hasta antes del partido.

In [None]:
# Long format: points earned and shots on target per (team, match).
home_eff = df[['Season', 'Date', 'HomeTeam_norm', 'home_points', 'HST']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_points': 'Points', 'HST': 'ShotsOnTarget'}
)
away_eff = df[['Season', 'Date', 'AwayTeam_norm', 'away_points', 'AST']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_points': 'Points', 'AST': 'ShotsOnTarget'}
)

eff = (
    pd.concat([home_eff, away_eff], ignore_index=True)
      .sort_values(['Season', 'Team', 'Date'], kind='mergesort')
)

# Season-to-date totals before each match (NaN counted as 0; first match gets 0).
for source, target in (('Points', 'cum_points_pre'), ('ShotsOnTarget', 'cum_sot_pre')):
    earlier_total = (
        eff.groupby(['Season', 'Team'])[source]
           .transform(lambda s: s.fillna(0).cumsum().shift(1))
    )
    eff[target] = earlier_total.fillna(0)

# Points per shot on target so far; a zero denominator yields NaN instead of inf.
eff['effectiveness'] = eff['cum_points_pre'].div(eff['cum_sot_pre'].replace(0, np.nan))

eff_lookup = eff[['Season', 'Date', 'Team', 'effectiveness']]
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    merged = df.merge(
        eff_lookup,
        left_on=['Season', 'Date', team_col],
        right_on=['Season', 'Date', 'Team'],
        how='left',
    )
    df = (
        merged.rename(columns={'effectiveness': f'{side}_effectiveness'})
              .drop(columns='Team')
    )

* **EL DULCE SABOR DE LA VENGANZA**

$$
\text{rivalidad}_{ij} = \frac{\# \text{derrotas de } i \text{ vs } j}{\# \text{enfrentamientos entre } i \text{ y } j}
$$

Esta variable mide cuánto ha perdido históricamente un equipo frente a otro. Cuanto más alto el valor, más traumática puede ser la serie de enfrentamientos. Esto puede tener impacto psicológico y afectar el rendimiento.

In [None]:
# Head-to-head (h2h) features: for each fixture, how each side has fared in its
# PREVIOUS meetings with this opponent (shift(1) keeps the current match out).

# ------------ parameters ------------
HALFLIFE = 6        # EWM half-life, counted in h2h meetings; lower => more reactive
ROLL_N   = 8        # optional window over the last N h2h meetings (on top of the EWM)
MINP     = 2        # minimum prior meetings for the rolling mean (avoids pure noise)
FILL_NEUTRAL = True # with no prior history, fill the rates with 1/3-1/3-1/3

# ------------ long base (two rows per match: one per team) ------------
base = df[['Date','HomeTeam_norm','AwayTeam_norm','FTR','FTHG','FTAG']].copy()
base['Date'] = pd.to_datetime(base['Date'])

home_side = base.rename(columns={'HomeTeam_norm':'team','AwayTeam_norm':'opp'})
away_side = base.rename(columns={'AwayTeam_norm':'team','HomeTeam_norm':'opp'})

# Result indicators from the perspective of "team"
home_side['win']  = (home_side['FTR'] == 'H').astype(int)
home_side['draw'] = (home_side['FTR'] == 'D').astype(int)
home_side['loss'] = (home_side['FTR'] == 'A').astype(int)

away_side['win']  = (away_side['FTR'] == 'A').astype(int)
away_side['draw'] = (away_side['FTR'] == 'D').astype(int)
away_side['loss'] = (away_side['FTR'] == 'H').astype(int)

# Goal difference from the perspective of "team" (continuous companion signal)
home_side['gd_team'] = base['FTHG'] - base['FTAG']
away_side['gd_team'] = base['FTAG'] - base['FTHG']

long = pd.concat([home_side[['Date','team','opp','win','draw','loss','gd_team']],
                  away_side[['Date','team','opp','win','draw','loss','gd_team']]],
                 ignore_index=True)

long = long.sort_values(['team','opp','Date'], kind='mergesort')

# ------------ prior rates with exponential decay (recency) ------------
for col in ['win','draw','loss']:
    long[f'{col}_rate_ewm'] = (
        long.groupby(['team','opp'])[col]
            .transform(lambda s: s.shift(1).ewm(halflife=HALFLIFE, adjust=False, min_periods=1).mean())
    )

# (optional) plain mean over the last N meetings
for col in ['win','draw','loss']:
    long[f'{col}_rate_roll{ROLL_N}'] = (
        long.groupby(['team','opp'])[col]
            .transform(lambda s: s.shift(1).rolling(ROLL_N, min_periods=MINP).mean())
    )

# (optional) EWM mean of the h2h goal difference
long['gd_h2h_ewm'] = (
    long.groupby(['team','opp'])['gd_team']
        .transform(lambda s: s.shift(1).ewm(halflife=HALFLIFE, adjust=False, min_periods=1).mean())
)

# Neutral fill when there is no prior history (rates only; gd_h2h_ewm stays NaN)
if FILL_NEUTRAL:
    for k in ['win_rate_ewm','draw_rate_ewm','loss_rate_ewm',
              f'win_rate_roll{ROLL_N}', f'draw_rate_roll{ROLL_N}', f'loss_rate_roll{ROLL_N}']:
        if k in long.columns:
            long[k] = long[k].fillna(1/3)

# ------------ merge into df (home / away) ------------
keep_cols = ['Date','team','opp',
             'win_rate_ewm','draw_rate_ewm','loss_rate_ewm',
             f'win_rate_roll{ROLL_N}', f'draw_rate_roll{ROLL_N}', f'loss_rate_roll{ROLL_N}',
             'gd_h2h_ewm']
keep_cols = [c for c in keep_cols if c in long.columns]  # in case the rolling block is removed
feature_cols = [c for c in keep_cols if c not in ('Date','team','opp')]

# The features get their home_/away_ prefix BEFORE merging, so the merged
# columns already carry their final names and no post-merge renaming is needed.
# Only the long rows whose team really is the home (resp. away) side match the
# (Date, HomeTeam_norm, AwayTeam_norm) key.
home_feat = long[keep_cols].rename(columns={
    'team': 'HomeTeam_norm', 'opp': 'AwayTeam_norm',
    **{c: f'home_h2h_{c}' for c in feature_cols},
})
away_feat = long[keep_cols].rename(columns={
    'team': 'AwayTeam_norm', 'opp': 'HomeTeam_norm',
    **{c: f'away_h2h_{c}' for c in feature_cols},
})

match_keys = ['Date','HomeTeam_norm','AwayTeam_norm']
df = df.merge(home_feat, on=match_keys, how='left')
df = df.merge(away_feat, on=match_keys, how='left')

# (optional) home-minus-away differences, handy for linear models.
# The loop variable is deliberately not called `base`, which is the DataFrame above.
for feat in ['win_rate_ewm','draw_rate_ewm','loss_rate_ewm','gd_h2h_ewm',
             f'win_rate_roll{ROLL_N}', f'draw_rate_roll{ROLL_N}', f'loss_rate_roll{ROLL_N}']:
    h, a = f'home_h2h_{feat}', f'away_h2h_{feat}'
    if h in df.columns and a in df.columns:
        df[f'h2h_{feat}_diff'] = df[h] - df[a]

* **EL PRECIO DE UNA GRAN VICTORIA**

Los jugadores de un equipo de mitad de tabla se motivan especialmente al enfrentar a un gigante como el Madrid o el Barça. Ganan visibilidad, se juegan el prestigio personal y colectivo. Si ganan, el nivel emocional y motivacional alcanza un pico muy alto. Ese nivel de exigencia genera un bajón posterior, tanto físico como psicológico (hipótesis).

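Para reflejar este bajón tras una gran victoria he creado la siguiente variable binaria: `post_big_odds_win_flag`, que en el código se materializa en las columnas `home_prev_big_odds_win_any` y `away_prev_big_odds_win_any`.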

Esta variable indica si el equipo (local o visitante) viene de una gran victoria inesperada, cuantificando "inesperada" como aquellas que superan la cuota 4 para la victoria de ese equipo, lo que equivale más o menos a una probabilidad implícita de victoria del 25%.



In [None]:
BIG_WIN_THRESHOLD = 4.0

df = df.sort_values('Date').copy()
df['_Date_dt'] = pd.to_datetime(df['Date'], errors='coerce')

# A "big win" is a win by the side whose Bet365 win odds were above the threshold.
home_odds = pd.to_numeric(df['B365H'], errors='coerce')
away_odds = pd.to_numeric(df['B365A'], errors='coerce')
home_big_win = df['FTR'].eq('H') & (home_odds > BIG_WIN_THRESHOLD)
away_big_win = df['FTR'].eq('A') & (away_odds > BIG_WIN_THRESHOLD)

# Long format: one row per (team, match) carrying that team's big-win flag.
home_long = df[['_Date_dt', 'HomeTeam_norm']].rename(columns={'HomeTeam_norm': 'Team'})
home_long['big_win'] = home_big_win.values
away_long = df[['_Date_dt', 'AwayTeam_norm']].rename(columns={'AwayTeam_norm': 'Team'})
away_long['big_win'] = away_big_win.values

team_long = (
    pd.concat([home_long, away_long], ignore_index=True)
      .sort_values(['Team', '_Date_dt'])
      .reset_index(drop=True)
)

# Flag of the team's immediately preceding match (0 when there is none).
previous_flag = team_long.groupby('Team', group_keys=False)['big_win'].shift(1)
team_long['prev_big_win_any'] = previous_flag.fillna(False).astype(int)

key_prev = team_long[['Team', '_Date_dt', 'prev_big_win_any']].copy()

# Back to the match table, one column per side.
home_prev = key_prev.rename(
    columns={'Team': 'HomeTeam_norm', 'prev_big_win_any': 'home_prev_big_odds_win_any'}
)
away_prev = key_prev.rename(
    columns={'Team': 'AwayTeam_norm', 'prev_big_win_any': 'away_prev_big_odds_win_any'}
)
df = df.merge(home_prev, how='left', on=['HomeTeam_norm', '_Date_dt'])
df = df.merge(away_prev, how='left', on=['AwayTeam_norm', '_Date_dt'])

for flag_col in ('home_prev_big_odds_win_any', 'away_prev_big_odds_win_any'):
    df[flag_col] = df[flag_col].fillna(0).astype(int)

# Drop the temporary datetime key.
df = df.drop(columns=['_Date_dt'])

In [None]:
SMALL_ODDS_FAV_THRESHOLD = 1.60

df = df.sort_values('Date').copy()
df['_Date_dt'] = pd.to_datetime(df['Date'], errors='coerce')

# Numeric odds
oddsH = pd.to_numeric(df['B365H'], errors='coerce')
oddsA = pd.to_numeric(df['B365A'], errors='coerce')

# "Heavy favourite that lost": the side's win odds were below the threshold
# and the other side won.
home_big_fav_loss = df['FTR'].eq('A') & (oddsH < SMALL_ODDS_FAV_THRESHOLD)
away_big_fav_loss = df['FTR'].eq('H') & (oddsA < SMALL_ODDS_FAV_THRESHOLD)

# Long format: one row per (team, match) carrying that team's flag.
home_long = df[['_Date_dt', 'HomeTeam_norm']].rename(columns={'HomeTeam_norm': 'Team'})
home_long['big_fav_loss'] = home_big_fav_loss.values
away_long = df[['_Date_dt', 'AwayTeam_norm']].rename(columns={'AwayTeam_norm': 'Team'})
away_long['big_fav_loss'] = away_big_fav_loss.values

team_long = (
    pd.concat([home_long, away_long], ignore_index=True)
      .sort_values(['Team', '_Date_dt'])
      .reset_index(drop=True)
)

# Flag of the team's immediately preceding match (no leakage; 0 when there is none).
previous_flag = team_long.groupby('Team', group_keys=False)['big_fav_loss'].shift(1)
team_long['prev_big_fav_loss_any'] = previous_flag.fillna(False).astype(int)

# Back to the match table, one column per side.
key_prev = team_long[['Team', '_Date_dt', 'prev_big_fav_loss_any']].copy()

home_prev = key_prev.rename(
    columns={'Team': 'HomeTeam_norm', 'prev_big_fav_loss_any': 'home_prev_big_odds_loss_any'}
)
away_prev = key_prev.rename(
    columns={'Team': 'AwayTeam_norm', 'prev_big_fav_loss_any': 'away_prev_big_odds_loss_any'}
)
df = df.merge(home_prev, how='left', on=['HomeTeam_norm', '_Date_dt'])
df = df.merge(away_prev, how='left', on=['AwayTeam_norm', '_Date_dt'])

for flag_col in ('home_prev_big_odds_loss_any', 'away_prev_big_odds_loss_any'):
    df[flag_col] = df[flag_col].fillna(0).astype(int)

# Drop the temporary datetime key.
df = df.drop(columns=['_Date_dt'])

* **¿QUÉ ESPERABAS?**

¿Cómo saber si un equipo está jugando realmente bien o mal… o simplemente se ha enfrentado a rivales difíciles o fáciles?

Se propone una forma cuantitativa de contextualizar el rendimiento reciente de un equipo en función de las cuotas de apuestas previas, usadas como indicador de dificultad.

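La idea original es sumar todas las probabilidades implícitas de los últimos 14 partidos para estimar cuántas victorias "debería" haber tenido el equipo según las cuotas, y compararlo con el número real de victorias obtenidas.
En la implementación de abajo, en lugar de una ventana fija de 14 partidos, se usan medias con decaimiento exponencial (EWM, `halflife` de 6 partidos) de ambas series, calculadas solo con partidos anteriores y exigiendo un mínimo de 3 partidos previos.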

$$
\text{relative\_performance} = \frac{\text{suma victorias reales}}{\text{suma victorias esperadas (según cuotas implícitas)}}
$$

In [None]:
# --- helper: invert odds (first step of de-vigging) ---
def inv(s):
    """Return the raw implied probability ``1 / odds``.

    Parameters
    ----------
    s : Series, array-like or scalar
        Decimal odds. Values that cannot be parsed as numbers are coerced
        to NaN.

    Returns
    -------
    Same shape as the input, float. For a Series input, odds that are zero
    or negative are treated as missing (NaN) instead of yielding ``inf`` or
    a negative "probability", which would corrupt the row-wise
    normalisation done by callers.
    """
    odds = pd.to_numeric(s, errors='coerce')
    if isinstance(odds, pd.Series):
        # Decimal odds must be strictly positive; anything else is bad data.
        odds = odds.where(odds > 0)
    return 1.0 / odds

# 1) Margin-adjusted probabilities: invert the Bet365 odds and rescale each
#    row so the available outcomes sum to 1 (removes the bookmaker margin).
p = pd.DataFrame({
    'H': inv(df['B365H']),
    'D': inv(df['B365D']) if 'B365D' in df.columns else np.nan,
    'A': inv(df['B365A']),
})
s = p.sum(axis=1, skipna=True)
p_adj = p.div(s, axis=0)

df['pH_adj'] = p_adj['H']
df['pA_adj'] = p_adj['A']

# 2) Long format: one row per (team, match).
#    RealWin stays NaN when FTR is missing, so a match without a known result
#    is not silently counted as "did not win".
home = df[['Date','HomeTeam_norm','FTR','pH_adj']].rename(
    columns={'HomeTeam_norm':'Team','pH_adj':'p_win_adj'}
)
home['RealWin'] = (home['FTR'] == 'H').astype(float).where(home['FTR'].notna())

away = df[['Date','AwayTeam_norm','FTR','pA_adj']].rename(
    columns={'AwayTeam_norm':'Team','pA_adj':'p_win_adj'}
)
away['RealWin'] = (away['FTR'] == 'A').astype(float).where(away['FTR'].notna())

perf = pd.concat([home[['Date','Team','RealWin','p_win_adj']],
                  away[['Date','Team','RealWin','p_win_adj']]], ignore_index=True)
perf['Date'] = pd.to_datetime(perf['Date'])
perf = perf.sort_values(['Team','Date'])

# Real and expected wins must be averaged over the same matches: blank out
# both values on any row where either the result or the odds are missing.
_usable = perf['RealWin'].notna() & perf['p_win_adj'].notna()
perf['RealWin'] = perf['RealWin'].where(_usable)
perf['p_win_adj'] = perf['p_win_adj'].where(_usable)

# 3) Relative performance as an exponentially weighted mean (no leakage):
#    shift(1) excludes the current match, so each row only uses earlier ones.
HALFLIFE = 6
g = perf.groupby('Team', group_keys=False)
perf['real_ewm'] = g['RealWin'].transform(lambda s: s.shift(1).ewm(halflife=HALFLIFE, adjust=False, min_periods=3).mean())
perf['exp_ewm']  = g['p_win_adj'].transform(lambda s: s.shift(1).ewm(halflife=HALFLIFE, adjust=False, min_periods=3).mean())
# A zero expected mean would give +/-inf; treat that as "not available".
perf['relative_perf_ewm'] = (perf['real_ewm'] / perf['exp_ewm']).replace([np.inf, -np.inf], np.nan)

# 4) Merge back into df (merge the perf[keep] frame, not the list itself).
keep = ['Date','Team','relative_perf_ewm']

df['Date'] = pd.to_datetime(df['Date'])

# Make the cell safe to re-run: without this, a second execution would merge
# in a second column with the same name.
df = df.drop(columns=['home_relative_perf', 'away_relative_perf'], errors='ignore')

# One row per (Date, Team) so the left joins cannot multiply match rows.
# keep='first' retains the value that excludes any same-date match.
_perf_keyed = perf[keep].drop_duplicates(subset=['Date','Team'], keep='first')

df = df.merge(_perf_keyed, left_on=['Date','HomeTeam_norm'], right_on=['Date','Team'], how='left') \
       .rename(columns={'relative_perf_ewm':'home_relative_perf'}) \
       .drop(columns='Team')

df = df.merge(_perf_keyed, left_on=['Date','AwayTeam_norm'], right_on=['Date','Team'], how='left') \
       .rename(columns={'relative_perf_ewm':'away_relative_perf'}) \
       .drop(columns='Team')

# (optional) Difference feature for a logistic model:
# df['relative_perf_diff'] = df['home_relative_perf'] - df['away_relative_perf']

* **CUOTAS Y PROBABILIDADES**

Por último vamos a derivar alguna variable a partir de las cuotas que ofrecen las casas de apuestas. En este caso tenemos únicamente las de Bet365.

Las más interesantes serían las probabilidades implícitas reales: `pimp1`, `pimpx`, `pimp2`. Reflejan la “opinión agregada del mercado” corregida por el margen de la casa de apuestas.

In [None]:
# Raw implied probabilities (1 / decimal odds) for home win, draw and away win.
# The odds are coerced to numeric, as the relative-performance cell does, so a
# stray non-numeric entry becomes NaN instead of raising. They are kept as
# local Series rather than scratch columns on df, so nothing has to be dropped.
_inv_q1 = 1.0 / pd.to_numeric(df['B365H'], errors='coerce')
_inv_qx = 1.0 / pd.to_numeric(df['B365D'], errors='coerce')
_inv_q2 = 1.0 / pd.to_numeric(df['B365A'], errors='coerce')

# Overround: sum of the raw implied probabilities; the excess over 1 is the
# bookmaker margin. NaN whenever any of the three odds is missing.
df['overround'] = _inv_q1 + _inv_qx + _inv_q2

# Margin-corrected implied probabilities; they sum to 1 across the outcomes.
df['pimp1'] = _inv_q1 / df['overround']
df['pimpx'] = _inv_qx / df['overround']
df['pimp2'] = _inv_q2 / df['overround']

In [None]:
# Persist the feature-enriched frame in the processed-data directory.
OUT_PATH = PROC / "df_new_features.parquet"

# The parent of OUT_PATH is PROC; create it if a fresh checkout lacks it.
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(OUT_PATH, index=False)

# Short confirmation with the destination and the frame's dimensions.
print(f"Guardado: {OUT_PATH} · filas={len(df):,} · cols={df.shape[1]}")

Guardado: /content/data/02_processed/df_new_features.parquet · filas=7,690 · cols=117
