In [4]:
import pandas as pd
import numpy as np
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score

# =========================
# 1. LOAD DATA
# =========================

# Read the merged match table, convert the tournament date column to
# datetimes, and order the rows by that date with a fresh 0..n-1 index.
# The chronological order is MANDATORY: the train/test split further down
# slices by row position.
df = (
    pd.read_csv("../merged_tennis_files/tennis_ml_player_ab_with_elo.csv")
    .assign(tourney_date=lambda frame: pd.to_datetime(frame['tourney_date']))
    .sort_values(by='tourney_date', ignore_index=True)
)

# =========================
# 2. TARGET
# =========================

# Label column used as the classification target.
# NOTE(review): presumably 1 means "player A won" — confirm against the dataset.
y = df['player_a_win']

# =========================
# 3. SELECT MODEL FEATURES
# =========================

# The order of this list is the column order of X.
model_features = [
    # player-vs-player differences
    'rank_diff', 'age_diff', 'height_diff',
    # rating differences
    'elo_diff', 'elo_surface_diff',
    # match format
    'best_of',
    # NOTE(review): 'minutes' looks like the match duration, which would only
    # be known after the match is played. If so, it leaks post-match
    # information into a pre-match predictor — confirm and consider dropping.
    'minutes',
    # string-valued columns (integer-encoded in the next section)
    'surface', 'round', 'player_a_hand', 'player_b_hand',
]

# Independent copy so that later in-place encoding does not touch df.
X = df.loc[:, model_features].copy()

# =========================
# 4. ENCODE CATEGORICALS
# =========================

# Replace each listed column of X with integer codes from a LabelEncoder.
#
# The fitted encoders are kept in `label_encoders` (keyed by column name) so
# the code <-> label mapping can be inspected, inverted, or re-applied to new
# rows; previously each encoder was thrown away right after fitting.
#
# NOTE(review): the encoders are fitted on all rows of X, i.e. before the
# train/test split below. The model is built without categorical support
# enabled, so these codes are consumed as plain numbers.
categorical_cols = ['surface', 'round', 'player_a_hand', 'player_b_hand']
label_encoders = {}
for col in categorical_cols:
    encoder = LabelEncoder()
    X[col] = encoder.fit_transform(X[col])
    label_encoders[col] = encoder

# =========================
# 5. TIME-AWARE SPLIT
# =========================

# Row position where the test set starts: the first 80% of rows (the earliest
# matches, since df was sorted by tourney_date) are used for training and the
# remaining rows are held out. X and y both follow df's row order.
split_idx = int(len(df) * 0.8)

X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# =========================
# 6. XGBOOST MODEL
# =========================

# Hyperparameters collected in one place, then unpacked into the classifier.
xgb_params = {
    'n_estimators': 800,
    'max_depth': 6,
    'learning_rate': 0.03,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'min_child_weight': 5,
    'objective': 'binary:logistic',
    'eval_metric': 'logloss',
    'tree_method': 'hist',
    'random_state': 42,  # fixed seed for repeatable runs
}
model = XGBClassifier(**xgb_params)

# =========================
# 7. TRAIN
# =========================

# Fit on the training slice only. No eval_set is passed, so there is no
# validation monitoring during training.
model.fit(X_train, y_train)

# =========================
# 8. EVALUATE
# =========================

# Score the held-out rows two ways: hard class predictions for accuracy, and
# the probability in column 1 of predict_proba for ROC AUC.
preds = model.predict(X_test)
probs = model.predict_proba(X_test)[:, 1]

acc = accuracy_score(y_test, preds)
auc = roc_auc_score(y_test, probs)

# The emoji in these messages had been corrupted (UTF-8 bytes decoded as
# cp1252); the intended characters are restored here.
print(f"✅ Accuracy: {acc:.4f}")
print(f"📈 ROC AUC: {auc:.4f}")


✅ Accuracy: 0.6506
📈 ROC AUC: 0.7223
