
# PerformancePredictor v4 — Representación de quintetos como conjunto (agregado)

En esta versión, cada alineación se representa como **un conjunto de 5 jugadores**, independientemente del orden en el que aparezcan.  
Para ello, en vez de tener columnas por jugador (`STAT_1, STAT_2, ...`), se generan **features agregadas**:

- `STAT_sum` = suma de los valores de los 5 jugadores
- `STAT_mean` = media de los valores de los 5 jugadores


In [None]:

import os
import json
import numpy as np
import pandas as pd

from pathlib import Path
from typing import List

from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.inspection import permutation_importance
import joblib

from google.colab import drive
drive.mount('/content/drive')

RANDOM_STATE = 42

DATA_DIR = Path('/content/drive/MyDrive/Máster Basket Data Analytics/12. TFM/data')
LINEUPS_2324 = DATA_DIR / '23-24_Lineups_GbG.csv'
LINEUPS_2425 = DATA_DIR / '24-25_Lineups_GbG.csv'
PLAYERS_2324 = DATA_DIR / '23-24_Players.csv'
PLAYERS_2425 = DATA_DIR / '24-25_Players.csv'

MIN_MINUTES = 10.0
MIN_POSSESSIONS = 10.0
WEIGHT_BY = 'poss'

MODEL_PATH = Path('best_model.pkl')
FEATURES_PATH = Path('feature_columns.json')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:

def read_csv_safely(path: Path, sep=';') -> pd.DataFrame:
    return pd.read_csv(path, sep=sep)

def compute_net_rating(df: pd.DataFrame) -> pd.Series:
    poss = (df.get("POSS", 0).fillna(0) + df.get("OPP POSS", 0).fillna(0)) / 2.0
    net = (df.get("PTS", 0).fillna(0) - df.get("OPP PTS", 0).fillna(0))
    return np.where(poss > 0, 100.0 * net / poss, np.nan)

def filter_lineups(df: pd.DataFrame, min_min: float, min_poss: float) -> pd.DataFrame:
    poss = (df.get("POSS", 0).fillna(0) + df.get("OPP POSS", 0).fillna(0))
    cond = (df.get("MIN", 0).fillna(0) >= min_min) & (poss >= min_poss)
    return df.loc[cond].copy()

def promedio_stats(df_players: pd.DataFrame) -> pd.DataFrame:
    num_cols = df_players.select_dtypes(include='number').columns.tolist()
    non_num = [c for c in df_players.columns if c not in num_cols]
    if 'NAME' in non_num:
        non_num.remove('NAME')
    agg = {c: 'mean' for c in num_cols}
    agg.update({c: 'first' for c in non_num})
    return df_players.groupby('NAME', as_index=False).agg(agg)

def sanitize_commas_to_dots(df: pd.DataFrame, columns: List[str]) -> pd.DataFrame:
    for c in columns:
        if df[c].dtype == object:
            df[c] = (df[c].astype(str)
                            .str.replace(',', '.', regex=True)
                            .replace(['-', 'nan', 'None', ''], np.nan))
        df[c] = pd.to_numeric(df[c], errors='coerce')
    return df

def merge_lineups_players(df_lineups: pd.DataFrame, df_players: pd.DataFrame) -> pd.DataFrame:
    keep_cols = [c for c in df_lineups.columns if c in ['PL 1','PL 2','PL 3','PL 4','PL 5','PTS','OPP PTS','POSS','OPP POSS','MIN']]
    if 'NET_RTG' in df_lineups.columns:
        keep_cols.append('NET_RTG')
    base = df_lineups[keep_cols].copy()

    stat_cols = [c for c in df_players.columns if c != 'NAME']
    out_rows = []

    for idx, row in base.iterrows():
        players = [row[f'PL {i}'] for i in range(1,6)]
        df_sub = df_players[df_players['NAME'].isin(players)]

        agg = {}
        for stat in stat_cols:
            vals = df_sub[stat].values
            agg[f"{stat}_sum"] = np.nansum(vals)
            agg[f"{stat}_mean"] = np.nanmean(vals)

        agg_row = row.to_dict()
        agg_row.update(agg)
        out_rows.append(agg_row)

    return pd.DataFrame(out_rows)


In [4]:

lineups_2324 = read_csv_safely(LINEUPS_2324)
lineups_2425 = read_csv_safely(LINEUPS_2425)
players_2324 = read_csv_safely(PLAYERS_2324)
players_2425 = read_csv_safely(PLAYERS_2425)

print('Lineups 23/24:', lineups_2324.shape, '| Players 23/24:', players_2324.shape)
print('Lineups 24/25:', lineups_2425.shape, '| Players 24/25:', players_2425.shape)

for df in [lineups_2324, lineups_2425]:
    for col in ["MIN", "POSS", "OPP POSS"]:
        df[col] = df[col].astype(str).str.replace(",", ".")
        df[col] = pd.to_numeric(df[col], errors="coerce")

if 'NET_RTG' not in lineups_2324.columns:
    lineups_2324['NET_RTG'] = compute_net_rating(lineups_2324)
if 'NET_RTG' not in lineups_2425.columns:
    lineups_2425['NET_RTG'] = compute_net_rating(lineups_2425)

lineups_2324_f = filter_lineups(lineups_2324, MIN_MINUTES, MIN_POSSESSIONS)
lineups_2425_f = filter_lineups(lineups_2425, MIN_MINUTES, MIN_POSSESSIONS)

print('Filtrados -> 23/24:', lineups_2324_f.shape, ' | 24/25:', lineups_2425_f.shape)


  return pd.read_csv(path, sep=sep)


Lineups 23/24: (11486, 180) | Players 23/24: (306, 128)
Lineups 24/25: (11971, 180) | Players 24/25: (312, 128)
Filtrados -> 23/24: (285, 181)  | 24/25: (243, 181)


In [5]:

def guess_numeric_like_columns(df: pd.DataFrame) -> List[str]:
    cand = []
    for c in df.columns:
        if c == 'NAME':
            continue
        if pd.api.types.is_numeric_dtype(df[c]):
            cand.append(c)
        elif pd.api.types.is_object_dtype(df[c]):
            sample = df[c].astype(str).head(50).str.contains(r'^[\d,\.\-]+$', regex=True).mean()
            if sample > 0.6:
                cand.append(c)
    return cand

player_stat_cols_2324 = guess_numeric_like_columns(players_2324)
player_stat_cols_2425 = guess_numeric_like_columns(players_2425)
common_player_stats = sorted(list(set(player_stat_cols_2324).intersection(set(player_stat_cols_2425))))

players_2324_keep = ['NAME'] + common_player_stats
players_2425_keep = ['NAME'] + common_player_stats
players_2324_f = sanitize_commas_to_dots(players_2324[players_2324_keep].copy(), common_player_stats)
players_2425_f = sanitize_commas_to_dots(players_2425[players_2425_keep].copy(), common_player_stats)

players_2324_f = promedio_stats(players_2324_f)
players_2425_f = promedio_stats(players_2425_f)

print('Player stat columns (common):', len(common_player_stats))


Player stat columns (common): 124


In [6]:

df_2324 = merge_lineups_players(lineups_2324_f, players_2324_f)
df_2324['SEASON'] = '23-24'
df_2425 = merge_lineups_players(lineups_2425_f, players_2425_f)
df_2425['SEASON'] = '24-25'

df_all = pd.concat([df_2324, df_2425], axis=0, ignore_index=True)
num_cols_all = df_all.select_dtypes(include='number').columns.tolist()
df_all[num_cols_all] = df_all[num_cols_all].replace([np.inf, -np.inf], np.nan)

print('Tabla final:', df_all.shape)
df_all.head(2)


Tabla final: (528, 260)


Unnamed: 0,PL 1,PL 2,PL 3,PL 4,PL 5,MIN,PTS,POSS,OPP PTS,OPP POSS,...,W_mean,W%_sum,W%_mean,WIN SHARE_sum,WIN SHARE_mean,WIN Share per 40_sum,WIN Share per 40_mean,eFG%_sum,eFG%_mean,SEASON
0,Joffrey Lauvergne,John Egbunu,Nando De Colo,Paris Lee,Timothe Luwawu Cabarrot,12.2,21,20,34,20,...,6.6,1.224401,0.24488,5.857722,1.171544,0.333154,0.066631,2.65932,0.531864,23-24
1,Johannes Thiemann,Louis Olinde,Matt Thomas,Sterling Brown,Ziga Samar,13.333333,19,22,27,22,...,3.0,0.57393,0.114786,4.831683,0.966337,0.296325,0.059265,2.666011,0.533202,23-24


In [7]:

target_col = 'NET_RTG'
feature_cols = [c for c in df_all.columns if c.endswith('_sum') or c.endswith('_mean')]

def compute_sample_weights(df: pd.DataFrame, weight_by: str = 'poss') -> np.ndarray:
    if weight_by == 'poss' and all(col in df.columns for col in ['POSS','OPP POSS']):
        poss = (df['POSS'].fillna(0) + df['OPP POSS'].fillna(0)) / 2.0
        w = poss.clip(lower=1.0)
    elif weight_by == 'min' and 'MIN' in df.columns:
        w = df['MIN'].fillna(0).clip(lower=1.0)
    else:
        w = pd.Series(1.0, index=df.index)
    return w.to_numpy(dtype=float)

train_idx = df_all['SEASON'] == '23-24'
test_idx  = df_all['SEASON'] == '24-25'

X_train = df_all.loc[train_idx, feature_cols].copy()
y_train = df_all.loc[train_idx, target_col].copy()
w_train = compute_sample_weights(df_all.loc[train_idx], WEIGHT_BY)

X_test  = df_all.loc[test_idx, feature_cols].copy()
y_test  = df_all.loc[test_idx, target_col].copy()
w_test  = compute_sample_weights(df_all.loc[test_idx], WEIGHT_BY)

print('X_train:', X_train.shape, '| X_test:', X_test.shape)


X_train: (285, 248) | X_test: (243, 248)


In [9]:

def evaluate(y_true, y_pred, sample_weight=None, label=''):
    mae = mean_absolute_error(y_true, y_pred, sample_weight=sample_weight)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred, sample_weight=sample_weight))
    r2 = r2_score(y_true, y_pred, sample_weight=sample_weight)
    print(f'{label} -> MAE: {mae:.3f} | RMSE: {rmse:.3f} | R²: {r2:.3f}')
    return {'mae': mae, 'rmse': rmse, 'r2': r2}

baseline_value = np.average(y_train.to_numpy(), weights=w_train) if len(y_train) > 0 else float(y_train.mean())
baseline_pred = np.full_like(y_test, baseline_value, dtype=float)
baseline_metrics = evaluate(y_test, baseline_pred, sample_weight=w_test, label='Baseline (media ponderada)')

ridge_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', Ridge(random_state=RANDOM_STATE, alpha=10.0))
])

hgb_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('model', HistGradientBoostingRegressor(
        random_state=RANDOM_STATE,
        max_iter=500,
        learning_rate=0.05,
        early_stopping=True,
        validation_fraction=0.15,
        min_samples_leaf=20
    ))
])

n_splits = 5 if len(X_train) >= 200 else 3
cv = KFold(n_splits=n_splits)

def cv_rmse(pipe, X, y):
    scores = cross_val_score(pipe, X, y, cv=cv,
                             scoring='neg_root_mean_squared_error')
    return -scores.mean()

models = [('Ridge', ridge_pipe), ('HGBR', hgb_pipe)]
cv_results = {}
for name, pipe in models:
    mean_rmse = cv_rmse(pipe, X_train, y_train)
    cv_results[name] = mean_rmse
    print(f'CV RMSE ({name}): {mean_rmse:.3f}')

best_name = min(cv_results, key=cv_results.get)
best_pipe = dict(models)[best_name]

best_pipe.fit(X_train, y_train, model__sample_weight=w_train if 'model__sample_weight' in best_pipe.get_params() else None)

yhat_test = best_pipe.predict(X_test)
test_metrics = evaluate(y_test, yhat_test, sample_weight=w_test, label=f'Test ({best_name})')


Baseline (media ponderada) -> MAE: 29.866 | RMSE: 37.265 | R²: -0.002
CV RMSE (Ridge): 41.908
CV RMSE (HGBR): 37.429
Test (HGBR) -> MAE: 28.844 | RMSE: 36.437 | R²: 0.042


In [10]:

try:
    perm = permutation_importance(best_pipe, X_test, y_test, n_repeats=10, random_state=RANDOM_STATE, scoring='neg_root_mean_squared_error')
    importances = pd.DataFrame({'feature': feature_cols, 'importance': perm.importances_mean}).sort_values('importance', ascending=False)
    display(importances.head(20))
except Exception as e:
    print('Permutation importance no disponible:', e)

joblib.dump(best_pipe, MODEL_PATH)
with open(FEATURES_PATH, 'w', encoding='utf-8') as f:
    json.dump({'feature_columns': feature_cols, 'best_model': best_name}, f, ensure_ascii=False, indent=2)

print(f'Modelo guardado en: {MODEL_PATH.resolve()}')
print(f'Columnas guardadas en: {FEATURES_PATH.resolve()}')


Unnamed: 0,feature,importance
186,TM NET RTG (NET)_sum,0.994932
240,W%_sum,0.289559
106,NET RTG (ON)_sum,0.233542
238,W_sum,0.196868
0,+ / -_sum,0.177943
132,OPP TS% (NET)_sum,0.170456
164,RIM FREQ_sum,0.069443
28,AST/TO_sum,0.044587
236,VORP_sum,0.040889
218,TM TS% (OFF)_sum,0.039786


Modelo guardado en: /content/best_model.pkl
Columnas guardadas en: /content/feature_columns.json
