In [1]:
# --- Parameters (CI may override them) ---
RUN_DATE = "2025-09-15"  # reference date used later to pick the next matchday and name the odds template
SEASON   = "2025_26"
MATCHDAY = None
MODEL_VERSION = "xgb-local"

# --- Paths kept consistent between local runs and CI ---
from pathlib import Path
ROOT   = Path.cwd()  # everything is resolved relative to the current working directory
DATA   = ROOT / "data"
RAW    = DATA / "01_raw"
PROC   = DATA / "02_processed"
FEAT   = DATA / "03_features"
MODELS = DATA / "04_models"
OUT    = ROOT / "outputs"

# Create every working directory up front so later reads/writes do not fail on a fresh checkout.
for p in [RAW, PROC, FEAT, MODELS, OUT]:
    p.mkdir(parents=True, exist_ok=True)

In [2]:
import pandas as pd

def _read_parquet_from(folder, name: str):
    """Read the parquet file called `name` located inside `folder`."""
    return pd.read_parquet(folder / name)


def _write_parquet_to(df, folder, name: str):
    """Write `df` to `folder / name` as parquet, leaving the index out."""
    df.to_parquet(folder / name, index=False)


def load_raw(name: str):
    """Load a parquet file from the RAW directory."""
    return _read_parquet_from(RAW, name)


def save_raw(df, name: str):
    """Save `df` as parquet in the RAW directory (index dropped)."""
    _write_parquet_to(df, RAW, name)


def load_proc(name: str):
    """Load a parquet file from the PROC directory."""
    return _read_parquet_from(PROC, name)


def save_proc(df, name: str):
    """Save `df` as parquet in the PROC directory (index dropped)."""
    _write_parquet_to(df, PROC, name)


def load_feat(name: str):
    """Load a parquet file from the FEAT directory."""
    return _read_parquet_from(FEAT, name)


def save_feat(df, name: str):
    """Save `df` as parquet in the FEAT directory (index dropped)."""
    _write_parquet_to(df, FEAT, name)

# **LIMPIEZA DE VARIABLES Y CREACIÓN DE NUEVAS**

In [None]:
from datetime import datetime, date, time, timedelta
from dateutil import parser
from collections import defaultdict
from pathlib import Path

import pandas as pd
import numpy as np
import soccerdata as sd
import os, re, unicodedata, requests
import pytz

# Limpieza de variables

In [6]:
# Load the merged match-level dataset from the processed-data folder and preview it.
IN_PATH = PROC / "fd_xg_elo_transfermarkt_2005_2025.parquet"
df = pd.read_parquet(IN_PATH)
df

Unnamed: 0,1XBA,1XBCA,1XBCD,1XBCH,1XBD,1XBH,AC,AF,AHCh,AHh,...,h_avg_age,h_value_mio,h_value_avg_mio,h_squad_size,h_pct_foreigners,a_avg_age,a_value_mio,a_value_avg_mio,a_squad_size,a_pct_foreigners
0,,,,,,,7,19,,,...,28.2,34.830,1.120,31.0,54.84,25.4,327.50,9.63,34.0,47.06
1,,,,,,,4,19,,,...,25.2,47.230,1.150,41.0,2.44,25.9,53.83,1.74,31.0,22.58
2,,,,,,,5,14,,,...,27.3,213.550,6.280,34.0,41.18,26.2,85.95,2.60,33.0,24.24
3,,,,,,,4,22,,,...,24.2,134.150,4.330,31.0,25.81,27.7,66.55,2.66,25.0,28.00
4,,,,,,,8,25,,,...,28.8,2.215,0.791,28.0,46.43,25.4,281.60,7.82,36.0,36.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7636,,,,,,,3,9,-0.75,-0.75,...,,,,,,,,,,
7637,,,,,,,12,11,0.50,0.25,...,,,,,,,,,,
7638,,,,,,,6,7,0.00,0.00,...,,,,,,,,,,
7639,,,,,,,4,8,-1.50,-2.00,...,,,,,,,,,,


In [7]:
# Columns that must survive the NaN-based pruning even though they contain gaps.
protect_explicit = [
    'h_xg','a_xg', 'home_team_slug', 'away_team_slug',
    'h_avg_age', 'h_value_mio', 'h_value_avg_mio', 'h_squad_size', 'h_pct_foreigners',
    'a_avg_age', 'a_value_mio', 'a_value_avg_mio', 'a_squad_size', 'a_pct_foreigners'
]

# Only protect the names that are actually present in the frame.
protected = [col for col in protect_explicit if col in df.columns]

# Every column holding at least one NaN is a drop candidate unless it is protected.
has_na = df.isna().any()
cols_with_na = has_na[has_na].index.tolist()
cols_to_drop_na = [col for col in cols_with_na if col not in protected]

# Drop the NaN-bearing columns, then the raw identifier columns (ignored when absent).
df = (
    df.drop(columns=cols_to_drop_na)
      .drop(columns=['Div', 'HomeTeam', 'AwayTeam', 'home_team_slug', 'away_team_slug'], errors='ignore')
)

print(f"Eliminadas por NaN (excepto xG): {len(cols_to_drop_na)}")

Eliminadas por NaN (excepto xG): 166


In [8]:
# Preview the frame after the column pruning above.
df

Unnamed: 0,AC,AF,AR,AS,AST,AY,B365A,B365D,B365H,Date,...,h_avg_age,h_value_mio,h_value_avg_mio,h_squad_size,h_pct_foreigners,a_avg_age,a_value_mio,a_value_avg_mio,a_squad_size,a_pct_foreigners
0,7,19,0,17,10,1,1.50,3.75,7.00,2005-08-27,...,28.2,34.830,1.120,31.0,54.84,25.4,327.50,9.63,34.0,47.06
1,4,19,0,9,2,1,3.25,3.25,2.00,2005-08-27,...,25.2,47.230,1.150,41.0,2.44,25.9,53.83,1.74,31.0,22.58
2,5,14,0,14,3,3,3.25,3.25,2.00,2005-08-27,...,27.3,213.550,6.280,34.0,41.18,26.2,85.95,2.60,33.0,24.24
3,4,22,0,9,2,7,4.00,3.40,1.72,2005-08-28,...,24.2,134.150,4.330,31.0,25.81,27.7,66.55,2.66,25.0,28.00
4,8,25,0,17,6,2,1.44,4.00,7.50,2005-08-28,...,28.8,2.215,0.791,28.0,46.43,25.4,281.60,7.82,36.0,36.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7636,3,9,0,13,8,3,4.75,3.90,1.73,2025-09-14,...,,,,,,,,,,
7637,12,11,0,28,9,2,2.30,3.25,3.25,2025-09-14,...,,,,,,,,,,
7638,6,7,0,19,6,2,2.90,3.20,2.55,2025-09-14,...,,,,,,,,,,
7639,4,8,0,2,1,1,11.00,7.00,1.22,2025-09-14,...,,,,,,,,,,


In [9]:
# Persist the pruned frame so the next section can start from this checkpoint.
PROC.mkdir(parents=True, exist_ok=True)
OUT_PATH = PROC / "df_clean_vars.parquet"
df.to_parquet(OUT_PATH, index=False)
print(f"Guardado: {OUT_PATH}")

Guardado: /content/data/02_processed/df_clean_vars.parquet


# Jornada a predecir

Aquí vamos a añadir los nuevos partidos de la próxima jornada a predecir, junto con los datos de Elo y las cuotas de Bet365:

In [10]:
# Reload the checkpoint written by the previous section (note: IN_PATH is rebound here).
IN_PATH = PROC / "df_clean_vars.parquet"
df = pd.read_parquet(IN_PATH)
df

Unnamed: 0,AC,AF,AR,AS,AST,AY,B365A,B365D,B365H,Date,...,h_avg_age,h_value_mio,h_value_avg_mio,h_squad_size,h_pct_foreigners,a_avg_age,a_value_mio,a_value_avg_mio,a_squad_size,a_pct_foreigners
0,7,19,0,17,10,1,1.50,3.75,7.00,2005-08-27,...,28.2,34.830,1.120,31.0,54.84,25.4,327.50,9.63,34.0,47.06
1,4,19,0,9,2,1,3.25,3.25,2.00,2005-08-27,...,25.2,47.230,1.150,41.0,2.44,25.9,53.83,1.74,31.0,22.58
2,5,14,0,14,3,3,3.25,3.25,2.00,2005-08-27,...,27.3,213.550,6.280,34.0,41.18,26.2,85.95,2.60,33.0,24.24
3,4,22,0,9,2,7,4.00,3.40,1.72,2005-08-28,...,24.2,134.150,4.330,31.0,25.81,27.7,66.55,2.66,25.0,28.00
4,8,25,0,17,6,2,1.44,4.00,7.50,2005-08-28,...,28.8,2.215,0.791,28.0,46.43,25.4,281.60,7.82,36.0,36.11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7636,3,9,0,13,8,3,4.75,3.90,1.73,2025-09-14,...,,,,,,,,,,
7637,12,11,0,28,9,2,2.30,3.25,3.25,2025-09-14,...,,,,,,,,,,
7638,6,7,0,19,6,2,2.90,3.20,2.55,2025-09-14,...,,,,,,,,,,
7639,4,8,0,2,1,1,11.00,7.00,1.22,2025-09-14,...,,,,,,,,,,


In [11]:
# Query settings sent to The Odds API by _fetch_raw_events.
SPORT_KEY = "soccer_spain_la_liga"
REGIONS = "uk,eu"
MARKETS = "h2h"
ODDS_FORMAT = "decimal"
# Timezone used to convert event kick-off times to local time.
TZ = pytz.timezone("Europe/Madrid")
# Columns copied from the new fixtures when they are appended to the match frame.
EXPECTED_COLS = ["Date","Season","HomeTeam_norm","AwayTeam_norm","h_elo","a_elo"]

def _norm_text(s) -> str:
    s = "" if s is None else str(s)
    t = unicodedata.normalize("NFKD", s)
    t = "".join(c for c in t if not unicodedata.combining(c))
    return re.sub(r"[^A-Za-z0-9]+"," ", t).strip().lower()

# Normalised feed name (see _norm_text) -> team name used in the dataset's
# HomeTeam_norm / AwayTeam_norm columns.
# "real oviedo" maps to "oviedo" because that is the name the historical rows use;
# mapping it to "real oviedo" would break de-duplication and per-team history.
NAME_MAP = {
    "real madrid":"real madrid","barcelona":"barcelona","fc barcelona":"barcelona",
    "atletico madrid":"ath madrid","athletic bilbao":"ath bilbao","athletic club":"ath bilbao",
    "sevilla":"sevilla","valencia":"valencia","villarreal":"villarreal","real sociedad":"sociedad",
    "betis":"betis","real betis":"betis","ca osasuna":"osasuna","espanyol":"espanol",
    "rayo vallecano":"vallecano","deportivo alaves":"alaves","alaves":"alaves","levante":"levante",
    "getafe":"getafe","girona":"girona","ud las palmas":"las palmas","las palmas":"las palmas",
    "cadiz":"cadiz","cadiz cf":"cadiz","mallorca":"mallorca","granada":"granada",
    "leganes":"leganes","eibar":"eibar","real valladolid":"valladolid","valladolid":"valladolid",
    "elche cf":"elche","malaga":"malaga","real oviedo":"oviedo","oviedo":"oviedo",
    "celta vigo":"celta","rc celta de vigo":"celta","celta":"celta",
    "rcd espanyol":"espanol","real zaragoza":"zaragoza"
}
def _map_team_oddsapi_to_norm(name: str) -> str:
    """Translate a team name from the odds feed into the dataset's normalised name.

    The name is normalised with `_norm_text` and looked up in `NAME_MAP`;
    names without an entry fall back to their normalised form.
    """
    key = _norm_text(name)
    return NAME_MAP.get(key, key)

# Dataset team name -> name passed to ClubElo when requesting a team's rating history.
NORM_TO_CLUBELO = {
    "real madrid": "Real Madrid",
    "barcelona": "Barcelona",
    "ath madrid": "Atletico",
    "ath bilbao": "Bilbao",
    "sevilla": "Sevilla",
    "valencia": "Valencia",
    "villarreal": "Villarreal",
    "sociedad": "Sociedad",
    "betis": "Betis",
    "osasuna": "Osasuna",
    "espanol": "Espanyol",
    "getafe": "Getafe",
    "celta": "Celta",
    "mallorca": "Mallorca",
    "las palmas": "Las Palmas",
    "cadiz": "Cadiz",
    "granada": "Granada",
    "alaves": "Alaves",
    "levante": "Levante",
    "vallecano": "Rayo Vallecano",
    "girona": "Girona",
    "leganes": "Leganes",
    "eibar": "Eibar",
    "valladolid": "Valladolid",
    "elche": "Elche",
    "malaga": "Malaga",
    "real oviedo": "Oviedo",
}

def _season_from_local_date(dt_local: datetime) -> int:
    return dt_local.year if dt_local.month >= 7 else dt_local.year - 1

def _weekend_window_from_friday(friday_str: str):
    """Return the TZ-localized (start, end) window for a matchday weekend.

    The window opens at 00:00 on the given date and closes at the last
    microsecond of the day three days later. The date is not checked to be a Friday.
    """
    friday = pd.to_datetime(friday_str).date()
    window_open = datetime.combine(friday, time.min)
    window_close = datetime.combine(friday + timedelta(days=3), time.max)
    return TZ.localize(window_open), TZ.localize(window_close)

def _fetch_raw_events(api_key: str) -> list:
    """Download the odds events for SPORT_KEY from The Odds API.

    Raises the `requests` HTTP error on a non-success status code.
    Returns the decoded JSON payload, or an empty list when the payload is falsy.
    """
    endpoint = f"https://api.the-odds-api.com/v4/sports/{SPORT_KEY}/odds"
    query = {
        "apiKey": api_key,
        "regions": REGIONS,
        "markets": MARKETS,
        "oddsFormat": ODDS_FORMAT,
        "dateFormat": "iso",
    }
    response = requests.get(endpoint, params=query, timeout=20)
    response.raise_for_status()
    payload = response.json()
    return payload if payload else []

def _fixtures_in_window(events: list, start_local: datetime, end_local: datetime) -> pd.DataFrame:
    """Turn raw odds events into a fixture table limited to [start_local, end_local].

    Events whose kick-off time cannot be parsed, that fall outside the window,
    or that lack a home or away team are skipped. The result has the columns
    Date (YYYY-MM-DD string), Season, HomeTeam_norm, AwayTeam_norm and
    Date_dt (naive local kick-off time), sorted by kick-off and home team.
    """
    out_cols = ["Date","Season","HomeTeam_norm","AwayTeam_norm","Date_dt"]
    records = []
    for event in events:
        try:
            kickoff = parser.isoparse(event["commence_time"]).astimezone(TZ)
        except Exception:
            # Missing or malformed kick-off time: ignore the event.
            continue
        if kickoff < start_local or kickoff > end_local:
            continue

        home = event.get("home_team")
        # Prefer the "teams" list (whatever is not the home side); otherwise use "away_team".
        others = [name for name in (event.get("teams", []) or []) if name != home]
        away = others[0] if others else event.get("away_team")
        if not (home and away):  # incomplete event
            continue

        records.append({
            "Date_dt": kickoff.replace(tzinfo=None),
            "Season": _season_from_local_date(kickoff),
            "HomeTeam_norm": _map_team_oddsapi_to_norm(home),
            "AwayTeam_norm": _map_team_oddsapi_to_norm(away),
        })

    if not records:
        return pd.DataFrame(columns=out_cols)

    fx = pd.DataFrame(records).sort_values(["Date_dt","HomeTeam_norm"]).reset_index(drop=True)
    fx["Date"] = pd.to_datetime(fx["Date_dt"]).dt.strftime("%Y-%m-%d")
    return fx[out_cols]

def _build_clubelo_table(teams_norm: list) -> pd.DataFrame:
    """Collect the ClubElo rating history of every team in `teams_norm`.

    Each team name is translated with `NORM_TO_CLUBELO`; names without an entry
    fall back to their title-cased form with spaces removed. The lookup is
    best-effort: a team whose history cannot be read (error, empty result, or
    no "elo"/"Elo" column) is skipped, and the skipped teams are printed so
    that missing Elo values downstream are not silent.

    Returns a frame with columns team_norm, Date (normalized to midnight) and
    Elo, sorted by team and date; it is empty when nothing could be loaded.
    """
    ce = sd.ClubElo()
    frames = []
    skipped = []
    for tnorm in teams_norm:
        ce_name = NORM_TO_CLUBELO.get(tnorm) or tnorm.title().replace(" ", "")
        try:
            hist = ce.read_team_history(ce_name)
            if hist is None or hist.empty:
                skipped.append(f"{tnorm} ({ce_name}: vacío)")
                continue
            # The history is indexed by date; expose that index as a "Date" column.
            hist = hist.reset_index().rename(columns={hist.index.name or "index":"Date"})
            rating_col = "elo" if "elo" in hist.columns else ("Elo" if "Elo" in hist.columns else None)
            if rating_col is None:
                skipped.append(f"{tnorm} ({ce_name}: sin columna elo)")
                continue
            team_elo = hist[["Date", rating_col]].rename(columns={rating_col:"Elo"})
            team_elo["team_norm"] = tnorm
            frames.append(team_elo)
        except Exception as exc:
            # Keep going with the remaining teams, but record why this one failed.
            skipped.append(f"{tnorm} ({ce_name}: {type(exc).__name__})")
            continue
    if skipped:
        detalle = ", ".join(skipped)
        print(f"[ClubElo] Sin historial Elo para: {detalle}")
    if not frames:
        return pd.DataFrame(columns=["team_norm","Date","Elo"])
    elo = pd.concat(frames, ignore_index=True)
    elo["Date"] = pd.to_datetime(elo["Date"]).dt.normalize()
    elo = elo.sort_values(["team_norm","Date"], kind="mergesort").reset_index(drop=True)
    return elo

def _merge_asof_by_team(left: pd.DataFrame, right: pd.DataFrame, by_col: str,
                        left_time_col: str, right_time_col: str,
                        right_val_col: str, out_col: str) -> pd.DataFrame:
    """merge_asof por equipo, garantizando orden por grupo y evitando 'keys must be sorted'."""
    out_parts = []
    for team, subL in left.groupby(by_col, sort=False):
        subL = subL.sort_values(left_time_col, kind="mergesort").copy()
        subR = right[right[by_col] == team].sort_values(right_time_col, kind="mergesort")
        if subR.empty:
            subL[out_col] = np.nan
        else:
            tmp = pd.merge_asof(
                subL,
                subR[[right_time_col, right_val_col]].rename(columns={right_time_col: "_rtime", right_val_col: out_col}),
                left_on=left_time_col, right_on="_rtime",
                direction="backward", allow_exact_matches=False
            ).drop(columns=["_rtime"])
            subL = tmp
        out_parts.append(subL)
    out = pd.concat(out_parts, axis=0).sort_index(kind="mergesort")
    return out

def _attach_elo(fixt: pd.DataFrame) -> pd.DataFrame:
    """Add `h_elo` / `a_elo` to the fixtures and drop the helper `Date_dt` column.

    Ratings come from `_build_clubelo_table` and are joined per team with
    `_merge_asof_by_team` on the kick-off time. When no rating history is
    available both columns are filled with NaN (written onto `fixt` itself).
    """
    teams = sorted(set(fixt["HomeTeam_norm"]) | set(fixt["AwayTeam_norm"]))
    elo = _build_clubelo_table(teams)

    if elo.empty:
        for elo_col in ("h_elo", "a_elo"):
            fixt[elo_col] = np.nan
        return fixt.drop(columns=["Date_dt"])

    merged = fixt
    # Home side first, then away side, each joined against the same Elo table.
    for team_col, elo_col in (("HomeTeam_norm", "h_elo"), ("AwayTeam_norm", "a_elo")):
        merged = _merge_asof_by_team(
            merged, elo.rename(columns={"team_norm": team_col}), by_col=team_col,
            left_time_col="Date_dt", right_time_col="Date",
            right_val_col="Elo", out_col=elo_col
        )
    return merged.drop(columns=["Date_dt"])

def _auto_fixtures_window_from_events(events, ref_date_str: str, tz=TZ, max_gap_hours=36, min_matches=5):
    """
    Detect the next 'matchday window' by clustering events that are close in time.

    - ref_date_str: reference date; events starting before its local midnight are ignored.
    - max_gap_hours: largest gap between consecutive kick-offs inside one matchday.
    - min_matches: minimum cluster size for a cluster to count as a matchday.

    The earliest cluster with at least `min_matches` events is chosen; when no
    cluster is large enough the earliest cluster is used. Returns
    (start_local, end_local), timezone-aware and widened to whole days, or
    (None, None) when there are no usable events.
    """
    ref_local = tz.localize(pd.to_datetime(ref_date_str).normalize())

    kickoffs = []
    for event in events:
        try:
            kickoff = parser.isoparse(event["commence_time"]).astimezone(tz)
            if kickoff >= ref_local:
                kickoffs.append(kickoff)
        except Exception:
            # Unparseable or incomparable kick-off time: skip the event.
            continue

    if not kickoffs:
        return None, None

    df_t = pd.DataFrame({"t": sorted(kickoffs)})
    hours_since_prev = df_t["t"].diff().dt.total_seconds().div(3600).fillna(0)
    # A new cluster starts whenever the gap to the previous kick-off is too large.
    df_t["cluster"] = (hours_since_prev > max_gap_hours).cumsum()

    groups = [grp for _, grp in df_t.groupby("cluster")]
    chosen = next((grp for grp in groups if len(grp) >= min_matches), groups[0])

    start_local = chosen["t"].min().replace(hour=0, minute=0, second=0, microsecond=0)
    end_local = chosen["t"].max().replace(hour=23, minute=59, second=59, microsecond=999999)
    return start_local, end_local

def append_next_weekend_fixtures_with_elo(
    df: pd.DataFrame,
    weekend_friday: str | None = None,
    ref_date: str | None = None,
    max_gap_hours: int = 36,
    min_matches: int = 5
) -> pd.DataFrame:
    """
    Append the fixtures of the next matchday (with Elo ratings) to `df`.

    - If `weekend_friday` is None, the next matchday window is detected
      automatically from the events (this also covers midweek rounds), using
      `ref_date` (default: RUN_DATE), `max_gap_hours` and `min_matches`.
    - If `weekend_friday` is given, the fixed window from that date to three
      days later is used.

    Fixtures already present in `df` (same date, home team and away team) are
    not added again. Only the EXPECTED_COLS of the new fixtures are filled;
    every other column is NaN for them. `Date` is returned as a
    'YYYY-MM-DD' string when rows are appended.

    The caller's frame is not modified: when expected columns are missing they
    are added to a copy, and that copy is what gets returned.

    Raises EnvironmentError when the ODDS_API_KEY environment variable is unset.
    """
    missing_cols = [c for c in EXPECTED_COLS if c not in df.columns]
    if missing_cols:
        # Work on a copy so the caller's DataFrame is left untouched.
        df = df.copy()
        for c in missing_cols:
            df[c] = pd.Series(dtype="object")

    api_key = os.getenv("ODDS_API_KEY")
    if not api_key:
        raise EnvironmentError("Define ODDS_API_KEY con tu API key de The Odds API.")

    events = _fetch_raw_events(api_key)

    if weekend_friday:
        start_local, end_local = _weekend_window_from_friday(weekend_friday)
    else:
        ref_date = ref_date or RUN_DATE
        start_local, end_local = _auto_fixtures_window_from_events(
            events, ref_date_str=ref_date, tz=TZ, max_gap_hours=max_gap_hours, min_matches=min_matches
        )
        if start_local is None:
            print("No hay eventos futuros según The Odds API. No se añaden fixtures.")
            return df

    fixt = _fixtures_in_window(events, start_local, end_local)
    if fixt.empty:
        print("No hay partidos en la ventana seleccionada.")
        return df

    fixt = _attach_elo(fixt)

    # Build a "date|home|away" key on both sides to detect fixtures already in df.
    key_df = df.copy()
    key_df["Date_key"] = pd.to_datetime(key_df["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
    fixt["Date_key"] = fixt["Date"]

    key_df["match_key"] = key_df["Date_key"].astype(str)+"|"+key_df["HomeTeam_norm"].astype(str)+"|"+key_df["AwayTeam_norm"].astype(str)
    fixt["match_key"]   = fixt["Date_key"].astype(str)  +"|"+fixt["HomeTeam_norm"].astype(str)  +"|"+fixt["AwayTeam_norm"].astype(str)

    add_rows = fixt.loc[~fixt["match_key"].isin(key_df["match_key"]), EXPECTED_COLS]
    if add_rows.empty:
        print("Sin nuevos partidos (ya estaban en df).")
        return df

    out = pd.concat([df, add_rows], ignore_index=True, sort=False)
    out["Date"] = pd.to_datetime(out["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
    return out

In [12]:
# Copy ODDS_API_KEY from `userdata` into the environment, falling back to whatever
# is already in the environment; any failure in this step is ignored.
# NOTE(review): `userdata` is not imported anywhere in the visible source. If it is
# undefined, the resulting NameError is swallowed by the `except` below and only a
# pre-existing environment variable can satisfy the assert — confirm where
# `userdata` is expected to come from.
try:
    os.environ["ODDS_API_KEY"] = userdata.get("ODDS_API_KEY") or os.environ.get("ODDS_API_KEY", "")
except Exception:
    pass

# Fail early when no key is available; the fixture download needs it.
assert os.environ.get("ODDS_API_KEY"), "Falta ODDS_API_KEY"

In [13]:
# Append the fixtures of the next matchday detected from RUN_DATE onwards.
df = append_next_weekend_fixtures_with_elo(df, ref_date=RUN_DATE)

Solo faltaría actualizar manualmente las cuotas de Bet365 con el siguiente código:

In [15]:
def make_b365_template(df: pd.DataFrame, n_tail: int = 10, out_csv: str | None = None) -> pd.DataFrame:
    need = ["Date","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"]
    for c in ["Date","HomeTeam_norm","AwayTeam_norm"]:
        if c not in df.columns:
            raise ValueError(f"Falta columna requerida en df: {c}")
    for c in ["B365H","B365D","B365A"]:
        if c not in df.columns:
            df[c] = np.nan

    order_idx = pd.to_datetime(df["Date"], errors="coerce").argsort(kind="mergesort")
    tail_idx = df.iloc[order_idx].tail(n_tail).index

    na_mask = df.loc[tail_idx, ["B365H","B365D","B365A"]].isna().all(axis=1)
    target = df.loc[tail_idx[na_mask], ["Date","HomeTeam_norm","AwayTeam_norm"]].copy()

    target["Date"] = pd.to_datetime(target["Date"], errors="coerce").dt.strftime("%Y-%m-%d")

    target.insert(0, "row_id", target.index.astype(int))

    target["B365H"] = np.nan
    target["B365D"] = np.nan
    target["B365A"] = np.nan

    if out_csv:
        target.to_csv(out_csv, index=False)
        print(f"Plantilla guardada en: {out_csv}\nNo cambies la columna 'row_id'. Solo rellena B365H/B365D/B365A.")
    return target

def apply_b365_from_template(df: pd.DataFrame, manual_template: pd.DataFrame | str, n_tail: int = 10) -> pd.DataFrame:
    """
    Actualiza EXCLUSIVAMENTE B365H/B365D/B365A de las filas objetivo,
    identificadas por 'row_id' (el índice original del df).
    - manual_template: DataFrame o ruta CSV de la plantilla ya rellenada.
    - No crea filas nuevas. Ignora cualquier row_id que no exista.
    - Verifica que los row_id siguen estando entre los últimos n con NaN (para evitar errores).
    """
    if isinstance(manual_template, str):
        upd = pd.read_csv(manual_template)
    else:
        upd = manual_template.copy()

    must = {"row_id","Date","HomeTeam_norm","AwayTeam_norm","B365H","B365D","B365A"}
    missing = must - set(upd.columns)
    if missing:
        raise ValueError(f"Faltan columnas en la plantilla: {sorted(missing)}")

    for c in ["B365H","B365D","B365A"]:
        upd[c] = pd.to_numeric(upd[c], errors="coerce")

    order_idx = pd.to_datetime(df["Date"], errors="coerce").argsort(kind="mergesort")
    tail_idx = df.iloc[order_idx].tail(n_tail).index
    na_mask = df.loc[tail_idx, ["B365H","B365D","B365A"]].isna().all(axis=1)
    target_idx_now = set(tail_idx[na_mask].astype(int))

    upd["row_id"] = pd.to_numeric(upd["row_id"], errors="coerce").astype("Int64")
    upd_valid = upd[upd["row_id"].isin(target_idx_now)].dropna(subset=["row_id"]).copy()

    if upd_valid.empty:
        print("No hay filas válidas para actualizar (¿cambiaste 'row_id' o ya no están entre las últimas n con NaN?).")
        return df

    for _, r in upd_valid.iterrows():
        ridx = int(r["row_id"])
        df.loc[ridx, ["B365H","B365D","B365A"]] = [r["B365H"], r["B365D"], r["B365A"]]

    print(f"Actualizadas {len(upd_valid)} fila(s) por 'row_id'.")
    still_nan = df.loc[list(target_idx_now), ["B365H","B365D","B365A"]].isna().all(axis=1).sum()
    print(f"Quedan {still_nan} partidos con B365* = NaN en las últimas {n_tail} filas.")
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce").dt.strftime("%Y-%m-%d")
    return df

In [18]:
# Use a clean 0..n-1 index: the template's row_id is the index label of each row.
df = df.reset_index(drop=True)

MANUAL_DIR = ROOT / "manual"
MANUAL_DIR.mkdir(parents=True, exist_ok=True)

# Count the rows dated on/after RUN_DATE that still have no Bet365 odds at all.
d = pd.to_datetime(df["Date"], errors="coerce")
mask_future = d >= pd.to_datetime(RUN_DATE)
mask_nan = df[["B365H","B365D","B365A"]].isna().all(axis=1)
n_tail = int(df[mask_future & mask_nan].shape[0])

# Fallback: when nothing qualifies, count the odds-less rows among the 30 latest by Date.
if n_tail == 0:
    recent = df.sort_values("Date").tail(30)
    n_tail = int(recent[["B365H","B365D","B365A"]].isna().all(axis=1).sum())

# Keep the template size between 1 and 12 rows.
n_tail = max(1, min(n_tail, 12))

# Always (re)generate the empty template for this run date.
tpl_path = MANUAL_DIR / f"b365_template_{RUN_DATE}.csv"
_ = make_b365_template(df, n_tail=n_tail, out_csv=str(tpl_path))
print(f"[B365] Plantilla generada ({n_tail} filas): {tpl_path}")

# Accepted file names for the filled-in template, checked in this order.
candidatos = [
    MANUAL_DIR / "plantilla_bet365.csv",
    MANUAL_DIR / f"b365_filled_{RUN_DATE}.csv",
]
filled = next((p for p in candidatos if p.exists()), None)

# Apply the first filled template found; otherwise explain how to provide one.
if filled is not None:
    df = apply_b365_from_template(df, str(filled), n_tail=n_tail)
    print(f"[B365] Aplicadas cuotas desde: {filled.name}")
else:
    print(f"[B365] No se encontró CSV rellenado en {MANUAL_DIR}. "
          f"Rellena {tpl_path.name} y súbelo como '{candidatos[0].name}' "
          f"o '{candidatos[1].name}'. (La próxima ejecución lo aplicará automáticamente.)")

Plantilla guardada en: /content/manual/b365_template_2025-09-15.csv
No cambies la columna 'row_id'. Solo rellena B365H/B365D/B365A.
[B365] Plantilla generada (10 filas): /content/manual/b365_template_2025-09-15.csv
Actualizadas 10 fila(s) por 'row_id'.
Quedan 0 partidos con B365* = NaN en las últimas 10 filas.
[B365] Aplicadas cuotas desde: b365_filled_2025-09-15.csv


In [19]:
# Temporarily lift pandas' display limits so every column of the last rows is visible.
# These are global options; they stay lifted until they are reset again.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

df.tail(20)

Unnamed: 0,AC,AF,AR,AS,AST,AY,B365A,B365D,B365H,Date,FTAG,FTHG,FTR,HC,HF,HR,HS,HST,HTAG,HTHG,HTR,HY,HomeTeam_norm,AwayTeam_norm,h_xg,a_xg,h_elo,a_elo,Season,h_avg_age,h_value_mio,h_value_avg_mio,h_squad_size,h_pct_foreigners,a_avg_age,a_value_mio,a_value_avg_mio,a_squad_size,a_pct_foreigners
7631,2.0,19.0,0.0,5.0,2.0,3.0,4.5,3.4,1.85,2025-09-12,2.0,2.0,D,2.0,23.0,0.0,7.0,3.0,0.0,1.0,H,4.0,sevilla,elche,0.346397,0.781151,1637.952881,1604.363525,2025,,,,,,,,,,
7632,3.0,18.0,1.0,5.0,1.0,5.0,4.5,3.1,1.95,2025-09-13,0.0,2.0,H,4.0,12.0,0.0,10.0,3.0,0.0,2.0,H,2.0,getafe,oviedo,0.954087,0.369812,1641.694824,1589.334351,2025,,,,,,,,,,
7633,4.0,7.0,1.0,16.0,6.0,1.0,1.6,4.33,5.0,2025-09-13,2.0,1.0,A,12.0,17.0,0.0,23.0,4.0,2.0,0.0,A,2.0,sociedad,real madrid,3.14772,2.54171,1658.704468,1943.715454,2025,,,,,,,,,,
7634,6.0,21.0,0.0,4.0,2.0,3.0,5.75,3.9,1.6,2025-09-13,1.0,0.0,A,6.0,17.0,0.0,12.0,3.0,0.0,0.0,D,4.0,ath bilbao,alaves,0.866662,0.090756,1805.195068,1649.077393,2025,,,,,,,,,,
7635,3.0,15.0,0.0,8.0,1.0,4.0,4.33,3.7,1.8,2025-09-13,0.0,2.0,H,8.0,12.0,0.0,8.0,3.0,0.0,1.0,H,2.0,ath madrid,villarreal,1.5106,0.668177,1833.052002,1794.064209,2025,,,,,,,,,,
7636,3.0,9.0,0.0,13.0,8.0,3.0,4.75,3.9,1.73,2025-09-14,1.0,1.0,D,10.0,7.0,0.0,21.0,7.0,1.0,0.0,A,0.0,celta,girona,1.80278,1.41213,1675.804688,1604.329102,2025,,,,,,,,,,
7637,12.0,11.0,0.0,28.0,9.0,2.0,2.3,3.25,3.25,2025-09-14,2.0,2.0,D,4.0,16.0,0.0,5.0,2.0,1.0,2.0,H,3.0,levante,betis,1.02425,2.19546,1593.387085,1735.634399,2025,,,,,,,,,,
7638,6.0,7.0,0.0,19.0,6.0,2.0,2.9,3.2,2.55,2025-09-14,0.0,2.0,H,3.0,13.0,0.0,9.0,7.0,0.0,1.0,H,3.0,osasuna,vallecano,1.4012,0.979149,1693.844727,1674.801025,2025,,,,,,,,,,
7639,4.0,8.0,0.0,2.0,1.0,1.0,11.0,7.0,1.22,2025-09-14,0.0,6.0,H,5.0,9.0,0.0,24.0,10.0,0.0,1.0,H,0.0,barcelona,valencia,3.46022,0.12495,1947.615723,1681.898926,2025,,,,,,,,,,
7640,9.0,16.0,0.0,24.0,10.0,1.0,3.5,3.4,2.1,2025-09-15,2.0,3.0,H,3.0,9.0,1.0,9.0,3.0,1.0,2.0,H,2.0,espanol,mallorca,2.67405,2.03185,1657.138062,1634.056396,2025,,,,,,,,,,


In [20]:
# Restore the pandas display options that were lifted to show the wide preview.
pd.reset_option('display.max_rows')
pd.reset_option('display.max_columns')
pd.reset_option('display.width')
pd.reset_option('display.max_colwidth')

# Creación de variables

Uno de los aspectos fundamentales a considerar en la construcción de modelos predictivos es la prevención del data leakage, es decir, la **incorporación en el conjunto de entrenamiento de variables que contienen información que no estaría disponible en el momento real de la predicción**. En el dataset empleado para este trabajo, que recoge información detallada de partidos de la Primera División española, se identificaron diversas variables que incurren en esta problemática. Concretamente, variables como los goles totales (`FTHG`, `FTAG`), el resultado final (`FTR`), las estadísticas de mitad de partido (`HTHG`, `HTAG`, `HTR`) y otras métricas post-partido como tiros, faltas, córners, tarjetas o disparos a puerta representan información generada una vez disputado el encuentro. La inclusión de estos campos en el modelo supondría una fuga de información desde el futuro hacia el presente, lo que comprometería gravemente la validez del proceso de entrenamiento y evaluación. Por ello, dichas variables han de ser excluidas del conjunto de entrenamiento.

Dado que múltiples variables del dataset original recogen estadísticas generadas durante el transcurso del propio partido (goles, tiros, tarjetas, etc.), y por tanto no pueden utilizarse como predictoras sin incurrir en data leakage, se optó por sustituirlas por métricas históricas calculadas exclusivamente con datos previos al encuentro. En concreto, se construyeron variables agregadas como la media de goles anotados, disparos realizados o córners obtenidos por cada equipo en sus últimos encuentros disputados antes del partido en cuestión. Estas variables permiten capturar la dinámica reciente de los equipos de forma legítima y temporalmente coherente, manteniendo la validez del modelo predictivo.

In [21]:
window_size = 10

# Long-format stat name -> (home column, away column) in the match-level frame.
stat_sources = {
    'Shots': ('HS', 'AS'),
    'ShotsOnTarget': ('HST', 'AST'),
    'Fouls': ('HF', 'AF'),
    'Corners': ('HC', 'AC'),
    'Yellows': ('HY', 'AY'),
    'Reds': ('HR', 'AR'),
    'xG': ('h_xg', 'a_xg'),
}
stats = list(stat_sources)
rolling_cols = [f'{stat}_avg_last{window_size}' for stat in stats]


def _side_long_stats(matches, team_col, side):
    """Return one row per match with the stats of one side (0 = home, 1 = away).

    The xG column is optional and becomes NaN when absent; every other source
    column is required. Values are coerced to numeric (invalid -> NaN).
    """
    long_side = pd.DataFrame({
        'Team': matches[team_col],
        'Date': matches['Date'],
        'Season': matches['Season'],
    })
    for stat, sources in stat_sources.items():
        src = sources[side]
        if stat == 'xG' and src not in matches.columns:
            long_side[stat] = np.nan
        else:
            long_side[stat] = pd.to_numeric(matches[src], errors='coerce')
    return long_side


def _attach_rolling(matches, team_col, prefix):
    """Left-join the rolling averages of the team in `team_col` as `<prefix>_avg_*` columns.

    Feature columns left over from a previous run of this cell are dropped first,
    so re-running the cell does not create duplicated columns.
    """
    renamed = {
        col: f'{prefix}_avg_{stat.lower()}_last{window_size}'
        for stat, col in zip(stats, rolling_cols)
    }
    side = team_stats[['Team', 'Date'] + rolling_cols].rename(columns=renamed)
    matches = matches.drop(columns=list(renamed.values()), errors='ignore')
    return (
        matches.merge(side, left_on=[team_col, 'Date'], right_on=['Team', 'Date'], how='left')
               .drop(columns='Team')
    )


# Long format: two rows per match (home side and away side). Built without a
# row-by-row loop, which also avoids rebinding the imported `date` name.
team_stats = pd.concat(
    [_side_long_stats(df, 'HomeTeam_norm', 0), _side_long_stats(df, 'AwayTeam_norm', 1)],
    ignore_index=True,
)

# Per team, in date order: mean of the previous `window_size` matches.
# shift(1) excludes the current match, so no post-match information leaks in.
ordered = team_stats.sort_values('Date', kind='mergesort')
for stat, col in zip(stats, rolling_cols):
    # The result is aligned back to team_stats by index.
    team_stats[col] = ordered.groupby('Team')[stat].transform(
        lambda s: s.shift(1).rolling(window=window_size, min_periods=1).mean()
    )

df = _attach_rolling(df, 'HomeTeam_norm', 'home')
df = _attach_rolling(df, 'AwayTeam_norm', 'away')

* **PUNTOS O GOLES**

La diferencia de goles es un mejor predictor del rendimiento futuro que los puntos.

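* `home_points_cum` / `away_points_cum`: puntos acumulados en la temporada por el equipo local en sus partidos como local (y por el visitante en sus partidos como visitante) antes del partido actual

* `home_gd_cum` / `away_gd_cum`: diferencia de goles acumulada en la temporada (goles a favor menos goles en contra) en esos mismos partidos, antes del partido actual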

In [22]:
# Points earned by each side according to the full-time result.
df['home_points'] = df['FTR'].map({'H': 3, 'D': 1, 'A': 0})
df['away_points'] = df['FTR'].map({'H': 0, 'D': 1, 'A': 3})

# Goal difference from each side's point of view.
df['home_gd'] = df['FTHG'] - df['FTAG']
df['away_gd'] = df['FTAG'] - df['FTHG']

# Chronological order within each season is required by the running totals below.
df = df.sort_values(['Season', 'Date']).reset_index(drop=True)


def _running_total_before(values):
    """Running total (missing values counted as 0) shifted so the current match is excluded."""
    return values.fillna(0).cumsum().shift(1)


# home_* totals use only the team's home matches, away_* only its away matches.
for metric in ('points', 'gd'):
    for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
        per_team = df.groupby(['Season', team_col])[f'{side}_{metric}']
        # The first match of each group has no history: start at 0.
        df[f'{side}_{metric}_cum'] = per_team.transform(_running_total_before).fillna(0)

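Además, también se crean:

* `home_total_points_cum`: puntos totales acumulados en la temporada por el equipo local (sumando sus partidos como local y como visitante) antes de ese partido

* `away_total_points_cum`: puntos totales acumulados en la temporada por el equipo visitante antes de ese partido

* `home_total_gd_cum`: diferencia de goles acumulada en la temporada por el equipo local antes de ese partido

* `away_total_gd_cum`: diferencia de goles acumulada en la temporada por el equipo visitante antes de ese partido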

In [23]:
# One row per team per match: home and away appearances stacked in a common layout.
home_df = df[['Season','Date','HomeTeam_norm','home_points','home_gd']].rename(
    columns={'HomeTeam_norm':'Team','home_points':'Points','home_gd':'GD'}
)
away_df = df[['Season','Date','AwayTeam_norm','away_points','away_gd']].rename(
    columns={'AwayTeam_norm':'Team','away_points':'Points','away_gd':'GD'}
)

team_perf = (
    pd.concat([home_df, away_df], axis=0, ignore_index=True)
      .sort_values(['Season','Team','Date'])
)

# Season-to-date totals per team, excluding the current match (shift) and starting at 0.
for source_col, total_col in (('Points', 'team_points_cum'), ('GD', 'team_gd_cum')):
    season_team = team_perf.groupby(['Season','Team'])[source_col]
    team_perf[total_col] = season_team.transform(
        lambda s: s.fillna(0).cumsum().shift(1)
    ).fillna(0)


def _merge_team_totals(matches, team_col, new_names):
    """Left-join the season-to-date totals of the team in `team_col`, renamed via `new_names`."""
    totals = team_perf[['Season','Date','Team','team_points_cum','team_gd_cum']]
    merged = matches.merge(
        totals,
        left_on=['Season','Date',team_col],
        right_on=['Season','Date','Team'],
        how='left'
    )
    return merged.rename(columns=new_names).drop(columns='Team')


df = _merge_team_totals(df, 'HomeTeam_norm', {
    'team_points_cum':'home_total_points_cum',
    'team_gd_cum':'home_total_gd_cum'
})
df = _merge_team_totals(df, 'AwayTeam_norm', {
    'team_points_cum':'away_total_points_cum',
    'team_gd_cum':'away_total_gd_cum'
})

* **DEFENSA O ATAQUE**

¿Qué estilo de juego predice mejor el rendimiento en fútbol: el defensivo o el ofensivo?

Vamos a crear la variable `playstyle` que clasifique a cada equipo como "ofensivo", "defensivo" o "equilibrado" usando la métrica diferencia de goles media en los últimos 6 partidos.

De esta forma tendríamos una media móvil de diferencia de goles (goal_diff) en los últimos 6 partidos del equipo (como local y visitante).

In [24]:
window = 6                   # maximum number of previous matches to consider
prev_weight = 0.7            # weight given to matches from a season other than the current one
min_total_periods = 3        # minimum previous matches required when it is NOT the team's debut season
min_periods_promoted = 1     # minimum previous matches required when it IS the team's debut season (current season only)
thr_off = 0.75               # threshold for the offensive style
thr_def = -0.75              # threshold for the defensive style
fill_neutral_on_nan = False  # if True, fill missing styles with 'equilibrado' (cold start)

# Home appearances in a common (Team, gd) layout.
home = df[['Season', 'Date', 'HomeTeam_norm', 'home_gd']].copy()
home.rename(columns={'HomeTeam_norm': 'Team', 'home_gd': 'gd'}, inplace=True)

# Away appearances in the same layout.
away = df[['Season', 'Date', 'AwayTeam_norm', 'away_gd']].copy()
away.rename(columns={'AwayTeam_norm': 'Team', 'away_gd': 'gd'}, inplace=True)

# One row per team per match.
perf = pd.concat([home, away], ignore_index=True)

# Chronological order per team; the windowed mean below relies on row position.
perf = perf.sort_values(['Team', 'Date']).reset_index(drop=True)

# A season counts as the team's "debut" when it is its earliest season in this dataset.
first_season_by_team = perf.groupby('Team')['Season'].transform('min')
perf['is_promoted_season'] = perf['Season'] == first_season_by_team

def weighted_hybrid_gd_mean_with_promoted(group: pd.DataFrame) -> pd.Series:
    """
    Pre-match mean goal difference for every row of one team's matches.

    Rows are processed in the order given. For the row at position p only the
    rows at positions [p - window, p) are considered (at most `window` earlier
    rows); the row's own value is never used.

    - Row flagged `is_promoted_season` (the team's debut season in the data):
        plain mean of the earlier rows in the span that belong to the same
        season; at least `min_periods_promoted` such rows are required.
    - Any other row:
        weighted mean of all earlier rows in the span, with weight 1.0 for rows
        of the same season and `prev_weight` for rows of other seasons; at
        least `min_total_periods` earlier rows are required.

    Rows that do not meet the requirement (and always the first row) stay NaN.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of a single team with columns 'Season', 'gd' and
        'is_promoted_season'.

    Returns
    -------
    pd.Series
        Float values indexed like `group`.

    Notes
    -----
    Reads the module-level settings `window`, `prev_weight`,
    `min_total_periods` and `min_periods_promoted`.
    """
    season_arr = group['Season'].to_numpy()
    gd_arr = group['gd'].to_numpy()
    debut_arr = group['is_promoted_season'].to_numpy()
    n_rows = len(group)
    result = np.full(n_rows, np.nan, dtype=float)

    # Position 0 has no history, so the scan starts at 1.
    for pos in range(1, n_rows):
        lo = max(0, pos - window)
        recent_gd = gd_arr[lo:pos]
        in_current_season = season_arr[lo:pos] == season_arr[pos]

        if debut_arr[pos]:
            # Debut season: only this season's earlier matches count.
            own_season_gd = recent_gd[in_current_season]
            if own_season_gd.size >= min_periods_promoted:
                result[pos] = own_season_gd.mean()
        elif (pos - lo) >= min_total_periods:
            # Other seasons are down-weighted instead of discarded.
            w = np.where(in_current_season, 1.0, prev_weight)
            w_total = w.sum()
            if not (w_total <= 0):
                result[pos] = np.dot(recent_gd, w) / w_total

    return pd.Series(result, index=group.index)

# Run the per-team computation. With group_keys=False the result keeps the row
# labels of `perf`, so it can be assigned straight back as a column.
team_groups = perf.groupby('Team', group_keys=False)
perf['gd_mean_hybrid'] = team_groups.apply(weighted_hybrid_gd_mean_with_promoted)

def clasificar_estilo(gd_mean, thr_off=thr_off, thr_def=thr_def):
    """
    Map a mean goal difference to a playing-style label.

    Parameters
    ----------
    gd_mean : float
        Mean goal difference; may be missing.
    thr_off : float
        Values greater than or equal to this are labelled 'ofensivo'.
    thr_def : float
        Values less than or equal to this are labelled 'defensivo'.

    Returns
    -------
    str or float
        'ofensivo', 'defensivo' or 'equilibrado'; NaN when `gd_mean` is missing.
    """
    if pd.isna(gd_mean):
        estilo = np.nan
    elif gd_mean >= thr_off:
        estilo = 'ofensivo'
    elif gd_mean <= thr_def:
        estilo = 'defensivo'
    else:
        estilo = 'equilibrado'
    return estilo

# Label every (team, match) row with its playing style.
perf['estilo_de_juego'] = perf['gd_mean_hybrid'].apply(clasificar_estilo)

if fill_neutral_on_nan:
    # Optional cold-start handling: treat unknown style as neutral.
    perf['estilo_de_juego'] = perf['estilo_de_juego'].fillna('equilibrado')

# Attach the label to the match table once per side; 'Team' is only the join key.
style_lookup = perf[['Season', 'Date', 'Team', 'estilo_de_juego']]
for team_col, style_col in (
    ('HomeTeam_norm', 'home_playstyle'),
    ('AwayTeam_norm', 'away_playstyle'),
):
    df = (
        df.merge(
            style_lookup,
            how='left',
            left_on=['Season', 'Date', team_col],
            right_on=['Season', 'Date', 'Team'],
        )
        .rename(columns={'estilo_de_juego': style_col})
        .drop(columns='Team')
    )

* **TENDENCIA O REVERSIÓN A LA MEDIA**

Es más probable que un equipo que mejora vuelva a empeorar (reversión a la media).

Para medir este tipo de situaciones vamos a crear tres variables nuevas:

* `dynamic_pos_change_prev_season`: Cambio de posición respecto a la temporada anterior (puede ser positivo o negativo). Es la mejor proxy del “efecto mejora o empeoramiento”. La versión "estática" solo compara posiciones al final de temporadas anteriores; nosotros la calculamos de forma dinámica, como la posición final de la temporada anterior menos la posición del equipo en la tabla antes de cada partido, lo que da una idea clara de cómo está rindiendo el equipo en un momento determinado respecto a su posición histórica.

* `form_points_6` / `form_gd_6`: Suma de puntos / dg en los últimos 6 partidos. Detecta tendencias a corto plazo, útil como feature directa y para medir si hay sobre-rendimiento.

In [25]:
df = df.sort_values("Date").copy()

# Long format: one row per (team, match) with that side's points and goal difference.
home_df = df[['Season', 'Date', 'HomeTeam_norm', 'home_points', 'home_gd']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_points': 'Points', 'home_gd': 'GD'}
)
away_df = df[['Season', 'Date', 'AwayTeam_norm', 'away_points', 'away_gd']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_points': 'Points', 'away_gd': 'GD'}
)

# Note: this rebinds `team_perf`; any frame previously held under that name is replaced.
team_perf = pd.concat([home_df, away_df], axis=0)
team_perf = team_perf.sort_values(['Season', 'Date'])

# Per-team match counter within each season (1 = that team's first match of the season).
team_perf['Matchday'] = team_perf.groupby(['Season', 'Team']).cumcount().add(1)

# Placeholder column; it is filled in by the standings loop and cast to float afterwards.
team_perf['prev_position'] = None

# Fill `prev_position`: for each team's k-th match of a season (k >= 2), the
# team's rank in a table built from every team's matches with Matchday < k.
for season in team_perf['Season'].unique():
    df_season = team_perf[team_perf['Season'] == season].copy()
    # Starts at 2, so rows with Matchday == 1 are never assigned and keep None.
    for jornada in range(2, df_season['Matchday'].max() + 1):
        # Table from earlier matchdays: total Points and GD per team, ranked by
        # Points and then GD, both descending.
        # NOTE(review): the filter uses each team's own match counter, not the
        # match date. If teams have played different numbers of matches at a
        # given date, this table may include results dated after the match being
        # annotated -- confirm this is acceptable for a pre-match feature.
        tabla_prev = df_season[df_season['Matchday'] < jornada].groupby('Team')[['Points', 'GD']].sum()
        tabla_prev = tabla_prev.sort_values(['Points', 'GD'], ascending=[False, False])
        tabla_prev['Position'] = range(1, len(tabla_prev) + 1)

        # Teams that have a row with this matchday number.
        equipos_jornada = df_season[df_season['Matchday'] == jornada]['Team']
        for equipo in equipos_jornada:
            # None when the team is absent from the table built above.
            pos = tabla_prev.loc[equipo, 'Position'] if equipo in tabla_prev.index else None
            # Select the row(s) of this team for this season and matchday in the full frame.
            mask = (
                (team_perf['Season'] == season) &
                (team_perf['Team'] == equipo) &
                (team_perf['Matchday'] == jornada)
            )
            team_perf.loc[mask, 'prev_position'] = pos

# The column was created holding None; cast to float so unassigned rows become NaN.
team_perf['prev_position'] = team_perf['prev_position'].astype(float)

# Lookup of the pre-match table position per (season, date, team).
prev_pos = team_perf[['Season', 'Date', 'Team', 'prev_position']].copy()

# Attach it to the match table once per side; 'Team' is only the join key.
for team_col, pos_col in (
    ('HomeTeam_norm', 'home_prev_position'),
    ('AwayTeam_norm', 'away_prev_position'),
):
    df = (
        df.merge(
            prev_pos,
            how='left',
            left_on=['Season', 'Date', team_col],
            right_on=['Season', 'Date', 'Team'],
        )
        .rename(columns={'prev_position': pos_col})
        .drop(columns='Team')
    )

In [26]:
# Long format: one row per (team, match) with that side's points and goal difference.
home = df[['Season', 'Date', 'HomeTeam_norm', 'home_points', 'home_gd']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_points': 'Points', 'home_gd': 'GD'}
)
away = df[['Season', 'Date', 'AwayTeam_norm', 'away_points', 'away_gd']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_points': 'Points', 'away_gd': 'GD'}
)
team_season = pd.concat([home, away], axis=0)

# Season totals per team, ordered as a final table inside each season
# (Points descending, then GD descending).
team_season_total = (
    team_season
    .groupby(['Season', 'Team'])[['Points', 'GD']]
    .sum()
    .reset_index()
    .sort_values(['Season', 'Points', 'GD'], ascending=[True, False, False])
)
team_season_total['FinalPosition'] = team_season_total.groupby('Season').cumcount() + 1

# Shift the season key forward by one so that a join on 'Season' returns the
# PREVIOUS season's final position.
team_season_total['Season'] += 1
team_season_total = team_season_total.rename(
    columns={'FinalPosition': 'prev_season_final_position'}
)

# Attach the previous season's final position once per side. Teams with no row
# for the previous season get NaN from the left join.
prev_final = team_season_total[['Season', 'Team', 'prev_season_final_position']]
for side, team_col in (('home', 'HomeTeam_norm'), ('away', 'AwayTeam_norm')):
    df = (
        df.merge(
            prev_final,
            how='left',
            left_on=['Season', team_col],
            right_on=['Season', 'Team'],
        )
        .rename(columns={'prev_season_final_position': f'{side}_final_position_prev_season'})
        .drop(columns='Team')
    )

# Dynamic change: last season's final position minus the current pre-match
# position (positive = currently placed higher in the table than last season's finish).
df['home_dynamic_pos_change_prev_season'] = df['home_final_position_prev_season'].sub(
    df['home_prev_position']
)
df['away_dynamic_pos_change_prev_season'] = df['away_final_position_prev_season'].sub(
    df['away_prev_position']
)

In [27]:
# Long format: one row per (team, match) with the points that side obtained.
home_form = df[['Season', 'Date', 'HomeTeam_norm', 'home_points']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_points': 'Points'}
)
away_form = df[['Season', 'Date', 'AwayTeam_norm', 'away_points']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_points': 'Points'}
)
team_form = pd.concat([home_form, away_form], axis=0)
team_form = team_form.sort_values(['Season', 'Team', 'Date'])


def _points_before(points):
    """Sum of up to six values preceding each row; the row's own value is excluded."""
    return points.shift().rolling(window=6, min_periods=1).sum()


# Computed per (season, team), so form does not carry over between seasons.
team_form['form_points_6'] = (
    team_form.groupby(['Season', 'Team'])['Points'].transform(_points_before)
)

# Attach to the match table once per side; 'Team' is only the join key.
points_form_lookup = team_form[['Season', 'Date', 'Team', 'form_points_6']]
for team_col, form_col in (
    ('HomeTeam_norm', 'home_form_points_6'),
    ('AwayTeam_norm', 'away_form_points_6'),
):
    df = (
        df.merge(
            points_form_lookup,
            how='left',
            left_on=['Season', 'Date', team_col],
            right_on=['Season', 'Date', 'Team'],
        )
        .rename(columns={'form_points_6': form_col})
        .drop(columns='Team')
    )

In [28]:
# Long format: one row per (team, match) with that side's goal difference.
home_gd_form = df[['Season', 'Date', 'HomeTeam_norm', 'home_gd']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_gd': 'GD'}
)
away_gd_form = df[['Season', 'Date', 'AwayTeam_norm', 'away_gd']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_gd': 'GD'}
)
team_gd_form = pd.concat([home_gd_form, away_gd_form], axis=0)
team_gd_form = team_gd_form.sort_values(['Season', 'Team', 'Date'])


def _gd_before(gd):
    """Sum of up to six values preceding each row; the row's own value is excluded."""
    return gd.shift().rolling(window=6, min_periods=1).sum()


# Computed per (season, team), so form does not carry over between seasons.
team_gd_form['form_gd_6'] = (
    team_gd_form.groupby(['Season', 'Team'])['GD'].transform(_gd_before)
)

# Attach to the match table once per side; 'Team' is only the join key.
gd_form_lookup = team_gd_form[['Season', 'Date', 'Team', 'form_gd_6']]
for team_col, form_col in (
    ('HomeTeam_norm', 'home_form_gd_6'),
    ('AwayTeam_norm', 'away_form_gd_6'),
):
    df = (
        df.merge(
            gd_form_lookup,
            how='left',
            left_on=['Season', 'Date', team_col],
            right_on=['Season', 'Date', 'Team'],
        )
        .rename(columns={'form_gd_6': form_col})
        .drop(columns='Team')
    )

* **AL BORDE DEL ABISMO**

Los equipos que están en zona de descenso o cerca, mejoran notablemente su rendimiento en las últimas jornadas. A la vez, los equipos en “zona de nadie” empeoran.

Para medir esa presión contextual podemos crear variables como:

* `position_zone`: variable categórica que indica en qué zona de la tabla se encuentra el equipo antes del partido que representa el registro ('champions', 'europa', 'mid_table', 'descenso').

Otra posible variable es indicar si el equipo ya está matemáticamente salvado o no.

In [29]:
def classify_zone(pos):
    """
    Map a league-table position to a zone label.

    Parameters
    ----------
    pos : float or int
        Table position (1 = top); may be missing.

    Returns
    -------
    str or float
        'champions' for positions up to 4, 'europa' up to 6, 'mid_table' up to
        17 and 'descenso' for anything lower in the table. NaN when `pos` is
        missing.
    """
    # A missing position fails every `<=` comparison below and would fall through
    # to 'descenso'; keep it missing instead so unknown is not read as relegation.
    if pd.isna(pos):
        return np.nan
    if pos <= 4:
        return 'champions'
    elif pos <= 6:
        return 'europa'
    elif pos <= 17:
        return 'mid_table'
    else:
        return 'descenso'

# Label each side's pre-match table position with its zone.
for side in ('home', 'away'):
    df[f'{side}_position_zone'] = df[f'{side}_prev_position'].apply(classify_zone)

* **CASA O FUERA**

¿Es más útil para predecir un partido fijarse en el rendimiento general de un equipo, o específicamente en su comportamiento como local/visitante?

Ya tenemos variables que miden el comportamiento local y visitante de los equipos como son `home_points_cum` / `away_points_cum` y `home_gd_cum` / `away_gd_cum`.

* **LA IMPORTANCIA DEL PASADO**

Supuestamente la mejor ventana predictiva es la que incluye la temporada actual y las 2 anteriores. Más allá de eso, el valor predictivo se estanca o incluso se reduce.



* **MÁS ALLÁ DE LOS GOLES**

Este apartado induce a la creación de variables basadas en las diferencias de puntos, tiros y goles entre los dos equipos del partido.

Una de las variables más interesantes que nos deja este capítulo es la de `effectiveness` pero calculada de forma dinámica acumulada, es decir, se trata de la división entre los puntos acumulados hasta antes del partido y los tiros a puerta realizados hasta antes del partido.

In [30]:
# Long format: one row per (team, match) with points and shots on target
# (HST for the home side, AST for the away side).
home_eff = df[['Season', 'Date', 'HomeTeam_norm', 'home_points', 'HST']].rename(
    columns={'HomeTeam_norm': 'Team', 'home_points': 'Points', 'HST': 'ShotsOnTarget'}
)
away_eff = df[['Season', 'Date', 'AwayTeam_norm', 'away_points', 'AST']].rename(
    columns={'AwayTeam_norm': 'Team', 'away_points': 'Points', 'AST': 'ShotsOnTarget'}
)

eff = pd.concat([home_eff, away_eff], ignore_index=True)
eff = eff.sort_values(['Season', 'Team', 'Date'], kind='mergesort')


def _running_total_before(values):
    """Running total of the rows before each row (missing values counted as 0)."""
    return values.fillna(0).cumsum().shift(1)


# Season-to-date totals BEFORE each match; the first match of a (season, team)
# has no history, so its shifted value is filled with 0.
points_before = eff.groupby(['Season', 'Team'])['Points'].transform(_running_total_before)
shots_before = eff.groupby(['Season', 'Team'])['ShotsOnTarget'].transform(_running_total_before)
eff['cum_points_pre'] = points_before.fillna(0)
eff['cum_sot_pre'] = shots_before.fillna(0)

# Points per shot on target; a zero denominator yields NaN rather than inf.
eff['effectiveness'] = eff['cum_points_pre'] / eff['cum_sot_pre'].replace(0, np.nan)

# Attach the pre-match effectiveness once per side; 'Team' is only the join key.
eff_lookup = eff[['Season', 'Date', 'Team', 'effectiveness']]
for team_col, eff_col in (
    ('HomeTeam_norm', 'home_effectiveness'),
    ('AwayTeam_norm', 'away_effectiveness'),
):
    merged = df.merge(
        eff_lookup,
        how='left',
        left_on=['Season', 'Date', team_col],
        right_on=['Season', 'Date', 'Team'],
    )
    merged = merged.rename(columns={'effectiveness': eff_col})
    df = merged.drop(columns='Team')

* **EL DULCE SABOR DE LA VENGANZA**

$$
\text{rivalidad}_{ij} = \frac{\# \text{derrotas de } i \text{ vs } j}{\# \text{enfrentamientos entre } i \text{ y } j}
$$

Esta variable mide cuánto ha perdido históricamente un equipo frente a otro. Cuanto más alto el valor, más traumática puede ser la serie de enfrentamientos. Esto puede tener impacto psicológico y afectar el rendimiento.

In [31]:
# Running head-to-head record per ordered pair (team, opponent): number of
# meetings so far and how many of them `team` lost.
h2h_data = defaultdict(lambda: {"losses": 0, "matches": 0})

# Row positions of `df` in chronological order (stable sort, missing dates last).
# Ratios are written back BY POSITION, so each value lands on the row it was
# computed for regardless of how `df` is currently ordered or indexed. Appending
# to a list in sorted order and assigning it to `df` is not safe here: several
# matches share a date and an unstable sort may order them differently from `df`.
date_order = (
    df['Date'].reset_index(drop=True).sort_values(kind='mergesort').index.to_numpy()
)
home_teams = df['HomeTeam_norm'].to_numpy()
away_teams = df['AwayTeam_norm'].to_numpy()
results = df['FTR'].to_numpy()

# 0.0 is the value used when the pair has no earlier meeting.
home_rivalry = np.zeros(len(df), dtype=float)
away_rivalry = np.zeros(len(df), dtype=float)

for row_pos in date_order:
    home = home_teams[row_pos]
    away = away_teams[row_pos]
    ftr = results[row_pos]

    home_record = h2h_data[(home, away)]
    away_record = h2h_data[(away, home)]

    # Ratios use only meetings BEFORE this one.
    if home_record["matches"] > 0:
        home_rivalry[row_pos] = home_record["losses"] / home_record["matches"]
    if away_record["matches"] > 0:
        away_rivalry[row_pos] = away_record["losses"] / away_record["matches"]

    # Then fold the current result into both records.
    home_record["matches"] += 1
    away_record["matches"] += 1
    if ftr == 'A':
        home_record["losses"] += 1
    elif ftr == 'H':
        away_record["losses"] += 1

df['home_rivalry_ratio'] = home_rivalry
df['away_rivalry_ratio'] = away_rivalry

* **EL PRECIO DE UNA GRAN VICTORIA**

Los jugadores de un equipo de mitad de tabla se motivan especialmente al enfrentar a un gigante como el Madrid o el Barça. Ganan visibilidad, se juegan el prestigio personal y colectivo. Si ganan, el nivel emocional y motivacional alcanza un pico muy alto. Ese nivel de exigencia genera un bajón posterior, tanto físico como psicológico (hipótesis).

Para reflejar este bajón tras una gran victoria he creado las siguientes variables binarias: `home_prev_big_odds_win_any` y `away_prev_big_odds_win_any`.

Esta variable indica si el equipo (local o visitante) viene de una gran victoria inesperada, cuantificando "inesperada" como aquellas que superan la cuota 4 para la victoria de ese equipo, lo que equivale más o menos a una probabilidad implícita de victoria del 25%.



In [32]:
# A win counts as "big" when the Bet365 odds for that side were above this value.
BIG_WIN_THRESHOLD = 4.0

df = df.sort_values('Date').copy()
# Temporary datetime key used for ordering and joining; removed at the end.
df['_Date_dt'] = pd.to_datetime(df['Date'], errors='coerce')

# Odds that cannot be parsed become NaN and therefore never exceed the threshold.
odds_home = pd.to_numeric(df['B365H'], errors='coerce')
odds_away = pd.to_numeric(df['B365A'], errors='coerce')
home_big_win = df['FTR'].eq('H') & (odds_home > BIG_WIN_THRESHOLD)
away_big_win = df['FTR'].eq('A') & (odds_away > BIG_WIN_THRESHOLD)

# Long format: one row per (team, match) with that side's big-win flag.
home_long = df[['_Date_dt', 'HomeTeam_norm']].rename(columns={'HomeTeam_norm': 'Team'})
home_long['big_win'] = home_big_win.values
away_long = df[['_Date_dt', 'AwayTeam_norm']].rename(columns={'AwayTeam_norm': 'Team'})
away_long['big_win'] = away_big_win.values

team_long = (
    pd.concat([home_long, away_long], ignore_index=True)
    .sort_values(['Team', '_Date_dt'])
    .reset_index(drop=True)
)

# Flag of the team's immediately preceding match (home or away). Grouping is by
# team only, so the preceding match may belong to an earlier season.
prev_flag = team_long.groupby('Team', group_keys=False)['big_win'].shift(1)
team_long['prev_big_win_any'] = prev_flag.fillna(False).astype(int)

key_prev = team_long[['Team', '_Date_dt', 'prev_big_win_any']].copy()

# Attach the flag once per side, joining on team and match date.
flag_columns = (
    ('HomeTeam_norm', 'home_prev_big_odds_win_any'),
    ('AwayTeam_norm', 'away_prev_big_odds_win_any'),
)
for team_col, flag_col in flag_columns:
    side_prev = key_prev.rename(columns={'Team': team_col, 'prev_big_win_any': flag_col})
    df = df.merge(side_prev, how='left', on=[team_col, '_Date_dt'])

# Rows without a match in the lookup are treated as "no previous big win".
for _, flag_col in flag_columns:
    df[flag_col] = df[flag_col].fillna(0).astype(int)

df = df.drop(columns=['_Date_dt'])

* **¿QUÉ ESPERABAS?**

¿Cómo saber si un equipo está jugando realmente bien o mal… o simplemente se ha enfrentado a rivales difíciles o fáciles?

Se propone una forma cuantitativa de contextualizar el rendimiento reciente de un equipo en función de las cuotas de apuestas previas, usadas como indicador de dificultad.

Sumamos todas las probabilidades implícitas de los últimos 14 partidos para estimar cuántas victorias "debería" haber tenido el equipo según las cuotas.
Lo comparamos con el número real de victorias obtenidas.

$$
\text{relative\_performance} = \frac{\text{suma victorias reales}}{\text{suma victorias esperadas (según cuotas implícitas)}}
$$

In [33]:
# Home side: actual win flag and the raw inverse of the Bet365 home odds.
home_df = df[['Date', 'Season', 'HomeTeam_norm', 'FTR', 'B365H']].assign(
    Team=lambda d: d['HomeTeam_norm'],
    RealWin=lambda d: (d['FTR'] == 'H').astype(int),
    ExpWin=lambda d: 1 / d['B365H'],
)[['Date', 'Season', 'Team', 'RealWin', 'ExpWin']]

# Away side: same, using the away result code and the away odds.
away_df = df[['Date', 'Season', 'AwayTeam_norm', 'FTR', 'B365A']].assign(
    Team=lambda d: d['AwayTeam_norm'],
    RealWin=lambda d: (d['FTR'] == 'A').astype(int),
    ExpWin=lambda d: 1 / d['B365A'],
)[['Date', 'Season', 'Team', 'RealWin', 'ExpWin']]

# One row per (team, match), ordered by date within each team.
perf_df = pd.concat([home_df, away_df]).sort_values(['Team', 'Date'])

def rolling_relative_perf(group, window=14):
    """
    Ratio of actual wins to odds-implied wins over a team's preceding matches.

    For each row, `RealWin` and `ExpWin` are summed over a rolling window of
    `window` rows (at least one observation required) and both sums are shifted
    down one row, so the value on a row reflects only the rows before it. The
    first row is therefore NaN.

    Parameters
    ----------
    group : pd.DataFrame
        Rows of one team, already in the order to roll over, with numeric
        columns 'RealWin' and 'ExpWin'.
    window : int, default 14
        Number of preceding rows to aggregate.

    Returns
    -------
    pd.Series
        Sum of real wins divided by sum of expected wins, indexed like `group`.
    """
    preceding = (
        group[['RealWin', 'ExpWin']]
        .rolling(window=window, min_periods=1)
        .sum()
        .shift(1)
    )
    return preceding['RealWin'] / preceding['ExpWin']

# Per-team rolling ratio of real to expected wins.
# NOTE(review): `perf_df` appears to carry repeated row labels (it is stacked
# without resetting the index), so this assignment relies on the apply result
# coming back in the same row order as `perf_df` -- confirm.
per_team = perf_df.groupby('Team', group_keys=False)
perf_df['relative_performance_14'] = per_team.apply(rolling_relative_perf)

# Attach to the match table once per side; 'Team' is only the join key.
rel_perf_lookup = perf_df[['Date', 'Team', 'relative_performance_14']]
for team_col, rel_col in (
    ('HomeTeam_norm', 'home_relative_perf_14'),
    ('AwayTeam_norm', 'away_relative_perf_14'),
):
    df = (
        df.merge(
            rel_perf_lookup,
            how='left',
            left_on=['Date', team_col],
            right_on=['Date', 'Team'],
        )
        .rename(columns={'relative_performance_14': rel_col})
        .drop(columns=['Team'])
    )

* **CUOTAS Y PROBABILIDADES**

Por último vamos a derivar alguna variable a partir de las cuotas que ofrecen las casas de apuestas. En este caso tenemos únicamente las de Bet365.

Las más interesantes serían las probabilidades implícitas reales: `pimp1`, `pimpx`, `pimp2`. Reflejan la “opinión agregada del mercado” corregida por margen.

In [34]:
# Inverse Bet365 odds for home win / draw / away win.
inv_q1 = 1 / df['B365H']
inv_qx = 1 / df['B365D']
inv_q2 = 1 / df['B365A']

# Their sum is the overround; dividing each inverse by it makes the three
# probabilities of a match add up to 1.
df['overround'] = inv_q1 + inv_qx + inv_q2

df['pimp1'] = inv_q1 / df['overround']
df['pimpx'] = inv_qx / df['overround']
df['pimp2'] = inv_q2 / df['overround']

In [35]:
# Persist the enriched match table; the target folder is created if missing.
OUT_PATH = PROC / "df_new_features.parquet"
OUT_PATH.parent.mkdir(parents=True, exist_ok=True)

df.to_parquet(OUT_PATH, index=False)

print(f"Guardado: {OUT_PATH} · filas={len(df):,} · cols={df.shape[1]}")

Guardado: /content/data/02_processed/df_new_features.parquet · filas=7,651 · cols=91
