# Tennis Pipeline — Extras
Aquest notebook afegeix LightGBM, calibració isotònica, rolling CV i backtest de profit amb odds si estan disponibles.


## 0) Dependències

In [1]:

# !pip install lightgbm -q
import os, pandas as pd, numpy as np, json, matplotlib.pyplot as plt
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss
from sklearn.preprocessing import StandardScaler
from sklearn.isotonic import IsotonicRegression
try:
    import lightgbm as lgb
except Exception as e:
    print("⚠️ lightgbm no disponible. Instal·la-ho amb pip i reinicia el kernel.")


## 1) Carrega dataset i columnes

In [6]:
import os, pandas as pd

DATASET_PATH = "outputs/dataset_match_level.csv"
COLS_PATH = "outputs/model_columns.txt"

# Sanity check: both paths are relative, so show where the kernel is running
# and whether the dataset resolves from there.
print("CWD:", os.getcwd())
print("Exists dataset?", os.path.exists(DATASET_PATH))

dataset = pd.read_csv(DATASET_PATH)
# Unparseable dates become NaT instead of raising.
dataset['date'] = pd.to_datetime(dataset['date'], errors='coerce')

try:
    model_cols = pd.read_csv(COLS_PATH, header=None)[0].tolist()
    print("✔️ model_columns:", len(model_cols))
except (FileNotFoundError, pd.errors.EmptyDataError):
    # No usable feature list on disk: derive one from the dataset and persist it.
    # Only numeric/bool columns qualify, so a stray text column can never end up
    # in the feature matrix.
    no_feat = {'match_id','date','surface','y_home_win'}
    numeric_cols = set(dataset.select_dtypes(include=['number', 'bool']).columns)
    model_cols = [c for c in dataset.columns if c not in no_feat and c in numeric_cols]
    # Create the directory COLS_PATH actually points to, not a hard-coded one.
    os.makedirs(os.path.dirname(COLS_PATH) or ".", exist_ok=True)
    pd.Series(model_cols).to_csv(COLS_PATH, index=False, header=False)
    print("⚠️ creat model_columns.txt amb", len(model_cols), "features")

# Fail fast with a readable message rather than a KeyError in a later cell.
missing_cols = [c for c in model_cols if c not in dataset.columns]
if missing_cols:
    raise KeyError(f"Columnes de {COLS_PATH} absents al dataset: {missing_cols}")

print("Dataset shape (raw):", dataset.shape)
display(dataset.head(3))


CWD: /Users/adriaparcerisas/Downloads/data
Exists dataset? True
✔️ model_columns: 22
Dataset shape (raw): (24934, 26)


Unnamed: 0,match_id,date,surface,elo_global_pre_diff,elo_surface_pre_diff,winrate10_pre_diff,winrate25_pre_diff,sos_elo_recent_pre_diff,hold_pre_diff,break_pre_diff,...,first_in_pre_diff,first_pts_pre_diff,second_pts_pre_diff,is_indoor,is_best_of_5,surface_hard,surface_clay,surface_grass,surface_indoor-hard,y_home_win
0,2018-339_1.0_2018-01-01,2018-01-01,hard,0.0,0.0,,,,,,...,,,,0,0,1,0,0,0,1
1,2018-339_2.0_2018-01-01,2018-01-01,hard,0.0,0.0,,,,,,...,,,,0,0,1,0,0,0,0
2,2018-339_3.0_2018-01-01,2018-01-01,hard,0.0,0.0,,,,,,...,,,,0,0,1,0,0,0,0


## 2) Splits temporals automàtics (80/10/10)

In [7]:
# Split temporal robust (80/10/10) amb fallback
import pandas as pd
# Work on a copy so the loaded `dataset` frame is left untouched.
df = dataset.copy()

# Make sure dates are parsed and count how many rows have a usable one.
df['date'] = pd.to_datetime(df['date'], errors='coerce')
n_total = len(df)
n_dates = df['date'].notna().sum()
print(f"Rows totals={n_total}, amb data vàlida={n_dates}")


def _split_by_position(frame, train_frac=0.80, valid_frac=0.90):
    """Cut `frame` into train/valid/test by row position (80/10/10 by default)."""
    cut_train = int(train_frac * len(frame))
    cut_valid = int(valid_frac * len(frame))
    return (
        frame.iloc[:cut_train].copy(),
        frame.iloc[cut_train:cut_valid].copy(),
        frame.iloc[cut_valid:].copy(),
    )


if n_dates == 0:
    # No valid dates at all: fall back to the row order of the file.
    train, valid, test = _split_by_position(df)
else:
    # Drop NaT rows, order chronologically and cut at the 80% / 90% date quantiles.
    df = df.dropna(subset=['date']).sort_values('date').reset_index(drop=True)
    q80 = df['date'].quantile(0.80)
    q90 = df['date'].quantile(0.90)

    # normalize() floors the cut points to midnight so a whole day stays together.
    TRAIN_END = pd.Timestamp(q80).normalize()
    VALID_END = pd.Timestamp(q90).normalize()

    train = df[df['date'] <= TRAIN_END].copy()
    valid = df[(df['date'] > TRAIN_END) & (df['date'] <= VALID_END)].copy()
    test  = df[df['date'] > VALID_END].copy()

    # With heavily concentrated dates either VALID or TEST can come out empty
    # (e.g. both quantiles fall on the same day). Downstream cells need both,
    # so fall back to a positional cut of the date-sorted frame.
    if len(valid) == 0 or len(test) == 0:
        train, valid, test = _split_by_position(df)

for name, d in [('train',train),('valid',valid),('test',test)]:
    print(name, f"n={len(d)} | dates: {d['date'].min()} → {d['date'].max()}")


Rows totals=24934, amb data vàlida=24934
train n=19971 | dates: 2018-01-01 00:00:00 → 2025-04-14 00:00:00
valid n=3173 | dates: 2025-04-21 00:00:00 → 2025-08-24 00:00:00
test n=1790 | dates: 2025-09-12 00:00:00 → 2025-10-13 00:00:00


## 3) LightGBM — entrenament amb early stopping

In [11]:
import lightgbm as lgb, numpy as np, json
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss

# Feature matrices and targets for the TRAIN and VALID splits.
# Missing feature values are replaced by 0.0 before being handed to LightGBM.
X_tr, X_va = (part[model_cols].fillna(0.0).values for part in (train, valid))
y_tr, y_va = (part['y_home_win'].values for part in (train, valid))

# A generous tree budget: early stopping on VALID decides how many are kept.
# NOTE(review): per the LightGBM docs, `subsample` only takes effect when
# `subsample_freq` > 0 — confirm whether row bagging was actually intended.
clf = LGBMClassifier(
    objective='binary',
    n_estimators=5000,
    learning_rate=0.05,
    num_leaves=63,
    min_child_samples=50,
    subsample=0.9,
    colsample_bytree=0.9,
    random_state=2025,
)
clf.fit(
    X_tr,
    y_tr,
    eval_set=[(X_va, y_va)],
    eval_metric='logloss',
    callbacks=[lgb.early_stopping(100), lgb.log_evaluation(200)],
)

# Positive-class probabilities on both splits.
p_tr = clf.predict_proba(X_tr)[:, 1]
p_va = clf.predict_proba(X_va)[:, 1]

train_report = dict(
    train_logloss=float(log_loss(y_tr, p_tr)),
    valid_logloss=float(log_loss(y_va, p_va)),
    valid_auc=float(roc_auc_score(y_va, p_va)),
    valid_brier=float(brier_score_loss(y_va, p_va)),
    best_iter=int(getattr(clf, "best_iteration_", clf.n_estimators)),
)
print(json.dumps(train_report, indent=2))

# Alias used by the following cells, which refer to the model as `gbm`.
gbm = clf


[LightGBM] [Info] Number of positive: 10014, number of negative: 9957
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 719
[LightGBM] [Info] Number of data points in the train set: 19971, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501427 -> initscore=0.005708
[LightGBM] [Info] Start training from score 0.005708
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[60]	valid_0's binary_logloss: 0.600318
{
  "train_logloss": 0.6175858793492155,
  "valid_logloss": 0.6003175941888154,
  "valid_auc": 0.7356281786395422,
  "valid_brier": 0.20688993483674545,
  "best_iter": 60
}




## 4) Calibració Isotònica i Test

In [13]:
import numpy as np
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss

def predict_proba_robust(model, X):
    """Return P(class=1) for a LightGBM Booster or an sklearn-style classifier.

    Dispatch order:
      1. Booster-like objects (have ``predict`` and ``best_iteration`` but no
         ``predict_proba``): ``predict`` is called at the best iteration.
      2. sklearn-style estimators (have ``predict_proba``): column 1 of the
         probability matrix, evaluated at ``best_iteration_`` when the model
         records one.
      3. Anything else: ``predict`` is called and, if the result is 2-D,
         column 1 is returned.

    In cases 1 and 2 a ``TypeError`` from passing ``num_iteration`` triggers a
    retry without that keyword, so estimators that do not accept it still work.

    Parameters
    ----------
    model : object exposing ``predict`` and/or ``predict_proba``.
    X : feature matrix accepted by the model.

    Returns
    -------
    numpy.ndarray of positive-class scores, one per row of ``X``.
    """
    has_proba = hasattr(model, "predict_proba")

    # Core API (Booster): predict() already yields probabilities.
    if not has_proba and hasattr(model, "predict") and hasattr(model, "best_iteration"):
        try:
            return model.predict(X, num_iteration=model.best_iteration)
        except TypeError:
            return model.predict(X)

    # sklearn API
    if has_proba:
        best = getattr(model, "best_iteration_", None)
        if best is not None:
            try:
                return model.predict_proba(X, num_iteration=best)[:, 1]
            except TypeError:
                # Estimator does not accept `num_iteration`; use its default.
                pass
        return model.predict_proba(X)[:, 1]

    # Last resort: coerce to an array first so list outputs are handled too.
    p = np.asarray(model.predict(X))
    return p if p.ndim == 1 else p[:, 1]

def _calibration_report(prefix, y_true, p_raw, p_cal):
    """Raw log-loss/Brier/AUC plus calibrated log-loss/Brier, keyed by `prefix`."""
    eps = 1e-6  # isotonic output can be exactly 0 or 1; clip before taking logs
    return {
        f"{prefix}_logloss": float(log_loss(y_true, np.clip(p_raw, eps, 1 - eps))),
        f"{prefix}_brier": float(brier_score_loss(y_true, p_raw)),
        f"{prefix}_auc": float(roc_auc_score(y_true, p_raw)),
        f"{prefix}_logloss_cal": float(log_loss(y_true, np.clip(p_cal, eps, 1 - eps))),
        f"{prefix}_brier_cal": float(brier_score_loss(y_true, p_cal)),
    }


# --- VALID: always re-score with the current model instead of reusing whatever
# `p_va` happens to be left in the kernel; a stale array could silently be
# misaligned with the `valid` frame read here.
X_va = valid[model_cols].fillna(0.0).values
y_va = valid['y_home_win'].values
p_va = predict_proba_robust(gbm, X_va)

# Isotonic calibration fitted on VALID.
iso = IsotonicRegression(out_of_bounds='clip').fit(p_va, y_va)
p_va_cal = iso.transform(p_va)

# The calibrated VALID figures are in-sample (the calibrator was fitted on this
# very data), so they are optimistic; the TEST figures below are out-of-sample
# for the calibrator.
print(_calibration_report("valid", y_va, p_va, p_va_cal))

# --- TEST
if len(test):
    X_te = test[model_cols].fillna(0.0).values
    y_te = test['y_home_win'].values
    p_te = predict_proba_robust(gbm, X_te)
    p_te_cal = iso.transform(p_te)

    print(_calibration_report("test", y_te, p_te, p_te_cal))
else:
    print("⚠️ TEST buit; reporto només VALID.")


{'valid_logloss': 0.6003175941888154, 'valid_brier': 0.20688993483674545, 'valid_auc': 0.7356281786395422, 'valid_logloss_cal': 0.5632387915475144, 'valid_brier_cal': 0.19345896591941117}
{'test_logloss': 0.6576110131813113, 'test_brier': 0.23260203472545973, 'test_auc': 0.6462303010975577, 'test_logloss_cal': 0.7225148939750938, 'test_brier_cal': 0.23667086508865018}




## 5) Rolling Time CV (3 folds)

In [15]:
# Rolling Time CV (LightGBM sklearn + callbacks), robust a NA i splits petits
import pandas as pd, numpy as np
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from sklearn.metrics import log_loss, roc_auc_score, brier_score_loss

def rolling_time_cv(df, model_cols, n_folds=3, min_train_frac=0.5, random_state=2025):
    """Expanding-window time-series cross-validation with LightGBM.

    Rows are ordered by ``date``. The first ``min_train_frac`` of them form the
    initial training window and the remaining rows are cut into ``n_folds``
    consecutive validation blocks (the last block absorbs any remainder).
    Fold ``k`` trains on every row that precedes its validation block.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date``, ``y_home_win`` and every column in ``model_cols``.
        Rows whose date cannot be parsed are dropped. The input is not modified.
    model_cols : list of str
        Feature columns; missing values are filled with 0.0.
    n_folds : int
        Number of validation blocks.
    min_train_frac : float
        Fraction of rows reserved for the first training window.
    random_state : int
        Seed passed to ``LGBMClassifier``.

    Returns
    -------
    pandas.DataFrame
        One row per evaluated fold with columns ``fold``, ``n_train``,
        ``n_valid``, ``logloss``, ``brier``, ``auc``, ``best_iter``,
        ``train_end``, ``valid_start`` and ``valid_end``. Empty when no fold
        could be evaluated.

    Notes
    -----
    * The validation block of each fold is also the early-stopping set, so the
      reported metrics are selected on the data they are measured on.
    * Cuts are positional, so rows sharing a date can fall on both sides of a
      train/validation boundary.
    """
    df = df.copy()
    # Parse dates and impose chronological order.
    df['date'] = pd.to_datetime(df['date'], errors='coerce')
    df = df.dropna(subset=['date']).sort_values('date').reset_index(drop=True)
    n = len(df)
    if n == 0:
        print("⚠️ Dataset buit després de netejar dates.")
        return pd.DataFrame()

    start = max(1, int(n * float(min_train_frac)))
    fold_size = max(1, (n - start) // max(1, n_folds))
    rows = []

    for k in range(n_folds):
        val_start = start + k * fold_size
        # The last fold runs to the end so no rows are left unused.
        val_end = start + (k + 1) * fold_size if k < n_folds - 1 else n
        if val_start >= n:
            break

        tr = df.iloc[:val_start]
        va = df.iloc[val_start:val_end]
        if len(va) == 0 or len(tr) == 0 or len(np.unique(tr['y_home_win'])) < 2:
            # Not enough data, or a single-class training window: skip the fold.
            continue

        X_tr = tr[model_cols].fillna(0.0).values
        y_tr = tr['y_home_win'].values
        X_va = va[model_cols].fillna(0.0).values
        y_va = va['y_home_win'].values

        clf = LGBMClassifier(
            n_estimators=5000,
            learning_rate=0.05,
            num_leaves=63,
            subsample=0.9,
            colsample_bytree=0.9,
            min_child_samples=50,
            objective='binary',
            random_state=random_state,
        )

        clf.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric='logloss',
            callbacks=[early_stopping(100), log_evaluation(0)],
        )

        # Predict at the best iteration when one was recorded; None means
        # "use the model's default".
        best = getattr(clf, "best_iteration_", None)
        p_va = clf.predict_proba(X_va, num_iteration=best)[:, 1]
        best_iter = int(best) if best else int(clf.n_estimators)

        # AUC is undefined for a single-class validation block.
        try:
            auc = roc_auc_score(y_va, p_va)
        except ValueError:
            auc = float("nan")

        # Explicit labels keep log_loss defined for a single-class block too,
        # matching the tolerance already applied to AUC.
        fold_logloss = log_loss(y_va, np.clip(p_va, 1e-6, 1-1e-6), labels=[0, 1])

        rows.append(dict(
            fold=k+1,
            n_train=len(tr), n_valid=len(va),
            logloss=float(fold_logloss),
            brier=float(brier_score_loss(y_va, p_va)),
            auc=float(auc),
            best_iter=best_iter,
            train_end=str(tr['date'].max().date()),
            valid_start=str(va['date'].min().date()),
            valid_end=str(va['date'].max().date()),
        ))

    return pd.DataFrame(rows)

# Run the rolling CV on just the columns it needs: date, target and features.
cv = rolling_time_cv(
    dataset.loc[:, ['date', 'y_home_win'] + model_cols].copy(),
    model_cols,
    n_folds=3,
    min_train_frac=0.5,
)
display(cv)
# Report the fold averages only when at least one fold was evaluated.
if not cv.empty:
    print("Mitjanes:", cv.loc[:, ['logloss','auc','brier']].mean().to_dict())


[LightGBM] [Info] Number of positive: 6203, number of negative: 6264
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000502 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 715
[LightGBM] [Info] Number of data points in the train set: 12467, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.497554 -> initscore=-0.009786
[LightGBM] [Info] Start training from score -0.009786
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[46]	valid_0's binary_logloss: 0.645412
[LightGBM] [Info] Number of positive: 8318, number of negative: 8304
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 715
[LightGBM] [Info] Number of data points in t



Early stopping, best iteration is:
[37]	valid_0's binary_logloss: 0.65012
[LightGBM] [Info] Number of positive: 10421, number of negative: 10356
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000102 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 719
[LightGBM] [Info] Number of data points in the train set: 20777, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501564 -> initscore=0.006257
[LightGBM] [Info] Start training from score 0.006257
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[77]	valid_0's binary_logloss: 0.612301




Unnamed: 0,fold,n_train,n_valid,logloss,brier,auc,best_iter,train_end,valid_start,valid_end
0,1,12467,4155,0.645412,0.227165,0.67229,46,2022-09-26,2022-09-26,2024-03-04
1,2,16622,4155,0.65012,0.229298,0.664993,37,2024-03-04,2024-03-04,2025-07-21
2,3,20777,4157,0.612301,0.212286,0.714252,77,2025-07-21,2025-07-21,2025-10-13


Mitjanes: {'logloss': 0.6359445164603192, 'auc': 0.6838448978852844, 'brier': 0.22291628463895066}


## 6) Backtest de profit (si hi ha odds)

In [17]:
# --- Prediccions per al TEST (sense scaler) + opcionalment calibrades ---
import numpy as np

def predict_proba_robust(model, X):
    """Return P(class=1) for a LightGBM Booster or an sklearn-style classifier.

    NOTE: an earlier cell defines a function with this same name; running this
    cell shadows that definition.

    Resolution order:
      * Booster-like (``predict`` + ``best_iteration``, no ``predict_proba``):
        ``predict`` at the best iteration.
      * sklearn-style (``predict_proba``): column 1, at ``best_iteration_``
        when the model records one.
      * otherwise: ``predict``; column 1 is taken when the result is 2-D.

    Whenever passing ``num_iteration`` raises ``TypeError`` the call is
    retried without it.

    Parameters
    ----------
    model : object exposing ``predict`` and/or ``predict_proba``.
    X : feature matrix accepted by the model.

    Returns
    -------
    numpy.ndarray of positive-class scores, one per row of ``X``.
    """
    if hasattr(model, "predict_proba"):
        # sklearn wrapper
        best = getattr(model, "best_iteration_", None)
        if best is None:
            return model.predict_proba(X)[:, 1]
        try:
            return model.predict_proba(X, num_iteration=best)[:, 1]
        except TypeError:
            return model.predict_proba(X)[:, 1]

    if hasattr(model, "predict") and hasattr(model, "best_iteration"):
        # Core Booster
        try:
            return model.predict(X, num_iteration=model.best_iteration)
        except TypeError:
            return model.predict(X)

    # Fallback: coerce first so plain lists (no `.ndim`) are accepted as well.
    scores = np.asarray(model.predict(X))
    if scores.ndim == 1:
        return scores
    return scores[:, 1]

# Copy so that adding the prediction column does not write into a slice of `df`.
test = test.copy()
X_te = test[model_cols].fillna(0.0).values

if len(test):
    # Raw model scores.
    p_raw = predict_proba_robust(gbm, X_te)
    # Apply the isotonic calibrator when an earlier cell created `iso`;
    # otherwise keep the raw scores.
    p_cal = iso.transform(p_raw) if 'iso' in globals() else p_raw
else:
    # An empty TEST split cannot be scored; keep the cell runnable and
    # produce an empty prediction column instead of failing inside the model.
    print("⚠️ TEST buit; no hi ha prediccions per calcular.")
    p_raw = np.array([], dtype=float)
    p_cal = p_raw

# Keep probabilities strictly inside (0, 1).
test['p'] = np.clip(p_cal, 1e-6, 1-1e-6)




## 7) Guarda model i scaler

In [18]:

import joblib, os
# Persist whatever artefacts this session actually produced. The guard checks
# for the fitted model itself (`gbm`), which is the object being saved.
if 'gbm' in globals():
    os.makedirs('outputs', exist_ok=True)
    joblib.dump(gbm, 'outputs/model_lightgbm.pkl')
    # No cell in this notebook creates `scaler`; save it only if another
    # notebook/session left one in the kernel, instead of raising NameError.
    if 'scaler' in globals():
        joblib.dump(scaler, 'outputs/scaler.pkl')
    # The calibrated probabilities depend on the isotonic calibrator, so store
    # it next to the model when it exists.
    if 'iso' in globals():
        joblib.dump(iso, 'outputs/isotonic_calibrator.pkl')
    print("Desat a 'outputs/'")
else:
    print("Sense LightGBM; res per desar.")


Desat a 'outputs/'
