### Imports & Setup

Load required libraries for data handling, modeling, and evaluation.

In [8]:
import pandas as pd
import numpy as np

from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score

### Load & Prepare Data

Read dataset, parse dates, and enforce strict chronological ordering.

In [9]:
df = pd.read_csv("../merged_tennis_files/tennis_ml_player_ab_with_elo.csv")

df['tourney_date'] = pd.to_datetime(df['tourney_date'])

# IMPORTANT: time-aware ordering
df = df.sort_values('tourney_date').reset_index(drop=True)

df.head()

Unnamed: 0,tourney_date,surface,round,best_of,minutes,player_a_name,player_a_age,player_a_rank,player_a_height,player_a_hand,...,player_a_win,rank_diff,age_diff,height_diff,player_a_elo,player_b_elo,elo_diff,player_a_elo_surface,player_b_elo_surface,elo_surface_diff
0,1975-02-24,CARPET,F,3,135.0,Roger Taylor,33.3,41.0,183.0,L,...,1,-14.0,10.5,5.0,1500.0,1500.0,0.0,1500.0,1500.0,0.0
1,1987-04-27,CLAY,F,5,136.0,Miloslav Mecir,22.9,5.0,190.0,R,...,0,4.0,-4.2,2.0,1500.0,1500.0,0.0,1500.0,1500.0,0.0
2,1990-12-31,HARD,R32,3,147.0,Andrei Cherkasov,20.4,21.0,180.0,R,...,0,-59.0,-3.0,0.0,1500.0,1501.370257,-1.370257,1500.0,1501.370257,-1.370257
3,1990-12-31,HARD,SF,3,80.0,Magnus Larsson,20.7,56.0,193.0,R,...,0,5.0,1.1,3.0,1516.0,1516.0,0.0,1516.0,1516.0,0.0
4,1990-12-31,HARD,QF,3,80.0,Jim Courier,20.3,25.0,185.0,R,...,1,-86.0,-2.5,5.0,1500.0,1500.0,0.0,1500.0,1500.0,0.0


### Define Target Variable

Binary outcome: did Player A win the match?

In [10]:
y = df['player_a_win']

### Select Model Features

Numerical + categorical features used for prediction.

In [11]:
model_features = [
    'rank_diff',
    'age_diff',
    'height_diff',

    'elo_diff',
    'elo_surface_diff',

    'best_of',
    'minutes',

    'surface',
    'round',
    'player_a_hand',
    'player_b_hand'
]

X = df[model_features].copy()

### Encode Categorical Features

Label encoding for tree-based models (safe for XGBoost).

In [12]:
categorical_cols = ['surface', 'round', 'player_a_hand', 'player_b_hand']

for col in categorical_cols:
    X[col] = LabelEncoder().fit_transform(X[col])

### Time-Aware Train/Test Split

Train on past matches, test on future matches (no leakage).

In [13]:
split_idx = int(len(df) * 0.8)

X_train = X.iloc[:split_idx]
X_test  = X.iloc[split_idx:]

y_train = y.iloc[:split_idx]
y_test  = y.iloc[split_idx:]

### Initialize XGBoost Model

Gradient-boosted trees tuned for structured sports data.

In [14]:
model = XGBClassifier(
    n_estimators=800,
    max_depth=6,
    learning_rate=0.03,
    subsample=0.85,
    colsample_bytree=0.85,
    min_child_weight=5,
    objective='binary:logistic',
    eval_metric='logloss',
    tree_method='hist',
    random_state=42
)

### Train Model

Fit using historical match data only.

In [15]:
model.fit(X_train, y_train)

0,1,2
,"objective  objective: typing.Union[str, xgboost.sklearn._SklObjWProto, typing.Callable[[typing.Any, typing.Any], typing.Tuple[numpy.ndarray, numpy.ndarray]], NoneType] Specify the learning task and the corresponding learning objective or a custom objective function to be used. For custom objective, see :doc:`/tutorials/custom_metric_obj` and :ref:`custom-obj-metric` for more information, along with the end note for function signatures.",'binary:logistic'
,"base_score  base_score: typing.Union[float, typing.List[float], NoneType] The initial prediction score of all instances, global bias.",
,booster,
,"callbacks  callbacks: typing.Optional[typing.List[xgboost.callback.TrainingCallback]] List of callback functions that are applied at end of each iteration. It is possible to use predefined callbacks by using :ref:`Callback API `. .. note::  States in callback are not preserved during training, which means callback  objects can not be reused for multiple training sessions without  reinitialization or deepcopy. .. code-block:: python  for params in parameters_grid:  # be sure to (re)initialize the callbacks before each run  callbacks = [xgb.callback.LearningRateScheduler(custom_rates)]  reg = xgboost.XGBRegressor(**params, callbacks=callbacks)  reg.fit(X, y)",
,colsample_bylevel  colsample_bylevel: typing.Optional[float] Subsample ratio of columns for each level.,
,colsample_bynode  colsample_bynode: typing.Optional[float] Subsample ratio of columns for each split.,
,colsample_bytree  colsample_bytree: typing.Optional[float] Subsample ratio of columns when constructing each tree.,0.85
,"device  device: typing.Optional[str] .. versionadded:: 2.0.0 Device ordinal, available options are `cpu`, `cuda`, and `gpu`.",
,"early_stopping_rounds  early_stopping_rounds: typing.Optional[int] .. versionadded:: 1.6.0 - Activates early stopping. Validation metric needs to improve at least once in  every **early_stopping_rounds** round(s) to continue training. Requires at  least one item in **eval_set** in :py:meth:`fit`. - If early stopping occurs, the model will have two additional attributes:  :py:attr:`best_score` and :py:attr:`best_iteration`. These are used by the  :py:meth:`predict` and :py:meth:`apply` methods to determine the optimal  number of trees during inference. If users want to access the full model  (including trees built after early stopping), they can specify the  `iteration_range` in these inference methods. In addition, other utilities  like model plotting can also use the entire model. - If you prefer to discard the trees after `best_iteration`, consider using the  callback function :py:class:`xgboost.callback.EarlyStopping`. - If there's more than one item in **eval_set**, the last entry will be used for  early stopping. If there's more than one metric in **eval_metric**, the last  metric will be used for early stopping.",
,enable_categorical  enable_categorical: bool See the same parameter of :py:class:`DMatrix` for details.,False


### Evaluate Performance

Accuracy + ROC AUC on unseen future matches.

In [16]:
preds = model.predict(X_test)
probs = model.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, preds)
auc = roc_auc_score(y_test, probs)

print(f"âœ… Accuracy: {acc:.4f}")
print(f"ðŸ“ˆ ROC AUC: {auc:.4f}")

âœ… Accuracy: 0.6506
ðŸ“ˆ ROC AUC: 0.7223
