In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the saved CSV file back into a DataFrame named 'atp_df'
atp_df = pd.read_csv('/content/drive/MyDrive/Stats/Code/atp_tennis.csv')
# Display the first 5 rows of the DataFrame
atp_df.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score,Comment
0,Brisbane International,2024-12-29,ATP250,Outdoor,Hard,1st Round,3,Vukic A.,Goffin D.,Vukic A.,68,52,778,1037,2.0,1.8,6-2 6-3,Completed
1,Brisbane International,2024-12-30,ATP250,Outdoor,Hard,1st Round,3,O Connell C.,Michelsen A.,Michelsen A.,64,41,795,1245,2.75,1.44,4-6 6-4 6-7,Completed
2,Brisbane International,2024-12-30,ATP250,Outdoor,Hard,1st Round,3,Bonzi B.,Tabilo A.,Bonzi B.,75,23,730,1943,1.67,2.2,6-7 7-6 6-4,Completed
3,Brisbane International,2024-12-30,ATP250,Outdoor,Hard,1st Round,3,Rinderknech A.,Nishioka Y.,Nishioka Y.,59,69,927,776,2.5,1.53,3-6 4-6,Completed
4,Brisbane International,2024-12-30,ATP250,Outdoor,Hard,1st Round,3,Thompson J.,Berrettini M.,Thompson J.,26,34,1745,1380,2.63,1.5,3-6 6-3 6-4,Completed


In [3]:
# Set the time format for date conversion
timefmt = "%Y-%m-%d"
# Convert the 'Date' column to datetime objects using the specified format
atp_df['Date'] = pd.to_datetime(atp_df['Date'], format=timefmt)
# Filter the DataFrame based on 'Surface' being 'Hard', no missing values, and excluding 'Draper J.'
df = atp_df.loc[(atp_df.notna().all(axis=1))
].copy()

In [4]:
# Assign number to the focus players
focus_players = [
    'Sinner J.', 'Alcaraz C.', 'Zverev A.', 'Fritz T.',
    'Shelton B.', 'De Minaur A.', 'Auger-Aliassime F.', 'Musetti L.'
]
p_map = {name: idx for idx, name in enumerate(focus_players)}

# Assign Match ID
df['match_id'] = range(1, len(df) + 1)

In [5]:
from pandas.core.frame import DataFrame
# Randomly assign Player_1 / Player_2 (removes P1 bias)
np.random.seed(42)
df['flip'] = np.random.randint(0, 2, size=len(df))

# Helper: swap if flip == 1
def assign_players(row):
    if row['Winner'] == row['Player_1']:
        loser = row['Player_2']
        w_rank, l_rank = row['Rank_1'], row['Rank_2']
        w_pts,  l_pts  = row['Pts_1'],  row['Pts_2']
        w_odd,  l_odd  = row['Odd_1'],  row['Odd_2']
    else:
        loser = row['Player_1']
        w_rank, l_rank = row['Rank_2'], row['Rank_1']
        w_pts,  l_pts  = row['Pts_2'],  row['Pts_1']
        w_odd,  l_odd  = row['Odd_2'],  row['Odd_1']

    if row['flip'] == 0:
        return pd.Series({
            'Player_1': row['Winner'], 'Player_2': loser,
            'Rank_1': w_rank, 'Rank_2': l_rank,
            'Pts_1': w_pts,   'Pts_2': l_pts,
            'Odd_1': w_odd,   'Odd_2': l_odd
        })
    else:
        return pd.Series({
            'Player_1': loser, 'Player_2': row['Winner'],
            'Rank_1': l_rank, 'Rank_2': w_rank,
            'Pts_1': l_pts,   'Pts_2': w_pts,
            'Odd_1': l_odd,   'Odd_2': w_odd
        })

tmp = df.apply(assign_players, axis=1)
df[['Player_1','Player_2','Rank_1','Rank_2','Pts_1','Pts_2','Odd_1','Odd_2']] = tmp

# Map to IDs (0–3 for focus, 999 for others)
df['Player1ID'] = df['Player_1'].map(p_map).fillna(999).astype(int)
df['Player2ID'] = df['Player_2'].map(p_map).fillna(999).astype(int)
df['WinnerID']  = df['Winner'].map(p_map).fillna(999).astype(int)

# Drop helper + deduplicate
dff = df.drop(columns=['flip'], errors='ignore')
dff = dff.drop_duplicates(
    subset=['Tournament', 'Date', 'Player_1', 'Player_2']
).reset_index(drop=True)

# Add Player IDs and Winner ID to playerdf
dff['Player1ID'] = dff['Player_1'].map(p_map).fillna(999).astype(int)
dff['Player2ID'] = dff['Player_2'].map(p_map).fillna(999).astype(int)
dff['WinnerID']  = dff['Winner'].map(p_map).fillna(999).astype(int)


print(f"Final dataset: {len(dff)} unique matches")
display(dff[[
    'Tournament','Date','Player_1','Player_2','Winner',
    'Player1ID','Player2ID','WinnerID','Rank_1','Rank_2','Odd_1','Odd_2'
]].head())

Final dataset: 2619 unique matches


Unnamed: 0,Tournament,Date,Player_1,Player_2,Winner,Player1ID,Player2ID,WinnerID,Rank_1,Rank_2,Odd_1,Odd_2
0,Brisbane International,2024-12-29,Vukic A.,Goffin D.,Vukic A.,999,999,999,68,52,2.0,1.8
1,Brisbane International,2024-12-30,O Connell C.,Michelsen A.,Michelsen A.,999,999,999,64,41,2.75,1.44
2,Brisbane International,2024-12-30,Bonzi B.,Tabilo A.,Bonzi B.,999,999,999,75,23,1.67,2.2
3,Brisbane International,2024-12-30,Nishioka Y.,Rinderknech A.,Nishioka Y.,999,999,999,69,59,1.53,2.5
4,Brisbane International,2024-12-30,Thompson J.,Berrettini M.,Thompson J.,999,999,999,26,34,2.63,1.5


In [6]:
dff['target'] = (dff['Player_1'] == dff['Winner']).astype(int)
df = dff.copy()

In [7]:
# One row per player-per-match (for rolling stats)
p1 = df[['match_id','Date','Player_1','Player_2','target','Score','Odd_1','Odd_2']].copy()
p1 = p1.rename(columns={'Player_1':'Player','Player_2':'Opponent'})
p1['is_player1'] = 1
p1['won']        = df['target']
p1['odd']        = df['Odd_1']
p1['opp_odd']    = df['Odd_2']

p2 = df[['match_id','Date','Player_2','Player_1','target','Score','Odd_1','Odd_2']].copy()
p2 = p2.rename(columns={'Player_2':'Player','Player_1':'Opponent'})
p2['is_player1'] = 0
p2['won']        = 1 - df['target']
p2['odd']        = df['Odd_2']
p2['opp_odd']    = df['Odd_1']

matches = pd.concat([p1, p2], ignore_index=True)
matches = matches.sort_values('Date').reset_index(drop=True)

In [8]:
df.head(5)

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,Pts_2,Odd_1,Odd_2,Score,Comment,match_id,Player1ID,Player2ID,WinnerID,target
0,Brisbane International,2024-12-29,ATP250,Outdoor,Hard,1st Round,3,Vukic A.,Goffin D.,Vukic A.,...,1037,2.0,1.8,6-2 6-3,Completed,1,999,999,999,1
1,Brisbane International,2024-12-30,ATP250,Outdoor,Hard,1st Round,3,O Connell C.,Michelsen A.,Michelsen A.,...,1245,2.75,1.44,4-6 6-4 6-7,Completed,2,999,999,999,0
2,Brisbane International,2024-12-30,ATP250,Outdoor,Hard,1st Round,3,Bonzi B.,Tabilo A.,Bonzi B.,...,1943,1.67,2.2,6-7 7-6 6-4,Completed,3,999,999,999,1
3,Brisbane International,2024-12-30,ATP250,Outdoor,Hard,1st Round,3,Nishioka Y.,Rinderknech A.,Nishioka Y.,...,927,1.53,2.5,3-6 4-6,Completed,4,999,999,999,1
4,Brisbane International,2024-12-30,ATP250,Outdoor,Hard,1st Round,3,Thompson J.,Berrettini M.,Thompson J.,...,1380,2.63,1.5,3-6 6-3 6-4,Completed,5,999,999,999,1


In [9]:
import re
# Parse the score string → sets, bagels, breadsticks, tie-breaks
def parse_score(row):
    raw = str(row['Score']).strip()
    if raw in ('', 'nan', 'NaN', ' '):
        return pd.Series([0,0,0,0,0,0],
                         index=['sets_won','sets_lost','bagels','breadsticks','tiebreak_won','tiebreak_played'])

    sets = re.findall(r'(\d+)-(\d+)', raw)      # only X-Y patterns
    if not sets:
        return pd.Series([0,0,0,0,0,0],
                         index=['sets_won','sets_lost','bagels','breadsticks','tiebreak_won','tiebreak_played'])

    won = lost = bagel = bread = tb_w = tb_p = 0
    for a_str, b_str in sets:
        a, b = int(a_str), int(b_str)
        player_set, opp_set = (a, b) if row['is_player1'] else (b, a)

        if player_set > opp_set:
            won += 1
            if player_set == 6 and opp_set == 0: bagel += 1
            if player_set == 6 and opp_set == 1: bread += 1
        else:
            lost += 1

        if (player_set == 7 and opp_set == 6) or (player_set == 6 and opp_set == 7):
            tb_p += 1
            if player_set > opp_set: tb_w += 1

    return pd.Series([won, lost, bagel, bread, tb_w, tb_p],
                     index=['sets_won','sets_lost','bagels','breadsticks','tiebreak_won','tiebreak_played'])

tmp = matches.apply(parse_score, axis=1)
matches = pd.concat([matches, tmp], axis=1)

In [10]:
matches.head(5)

Unnamed: 0,match_id,Date,Player,Opponent,target,Score,Odd_1,Odd_2,is_player1,won,odd,opp_odd,sets_won,sets_lost,bagels,breadsticks,tiebreak_won,tiebreak_played
0,1,2024-12-29,Vukic A.,Goffin D.,1,6-2 6-3,2.0,1.8,1,1,2.0,1.8,2,0,0,0,0,0
1,1,2024-12-29,Goffin D.,Vukic A.,1,6-2 6-3,2.0,1.8,0,0,1.8,2.0,0,2,0,0,0,0
2,8,2024-12-30,Rune H.,Lehecka J.,1,5-7 3-6,2.5,1.53,0,0,1.53,2.5,2,0,0,0,0,0
3,31,2024-12-30,Darderi L.,Kecmanovic M.,1,6-3 6-3,1.3,3.5,0,0,3.5,1.3,0,2,0,0,0,0
4,32,2024-12-30,Safiullin R.,Marozsan F.,1,1-6 6-7,2.63,1.5,0,0,1.5,2.63,2,0,0,1,1,1


In [11]:
# ROLLING STATS (including odds-based win-rates)
player_stats = []

for (player, role), grp in matches.groupby(['Player', 'is_player1']):
    grp = grp.sort_values('Date').reset_index(drop=True)

    # basic
    matches_played = np.arange(1, len(grp) + 1)
    wins_cum       = grp['won'].cumsum()
    win_rate_raw   = wins_cum / matches_played

    # sets
    sets_won  = grp['sets_won'].cumsum()
    sets_lost = grp['sets_lost'].cumsum()
    set_win_rate_raw = sets_won / (sets_won + sets_lost).replace(0, np.nan)

    # bagels / breadsticks
    bagels_rate_raw  = grp['bagels'].cumsum()  / matches_played
    bread_rate_raw   = grp['breadsticks'].cumsum() / matches_played

    # tie‑breaks
    tb_won    = grp['tiebreak_won'].cumsum()
    tb_played = grp['tiebreak_played'].cumsum()
    tb_rate_raw = np.where(tb_played == 0, 0.5, tb_won / tb_played)

    # favorite / underdog
    grp['was_fav'] = grp['odd'] < grp['opp_odd']
    fav_mask  = grp['was_fav']
    udog_mask = ~fav_mask

    fav_wins = grp.loc[fav_mask, 'won'].cumsum().reindex(grp.index, fill_value=0)
    fav_cnt  = fav_mask.cumsum()
    win_as_fav_raw = np.where(fav_cnt == 0, 0.5, fav_wins / fav_cnt)

    udog_wins = grp.loc[udog_mask, 'won'].cumsum().reindex(grp.index, fill_value=0)
    udog_cnt  = udog_mask.cumsum()
    win_as_udog_raw = np.where(udog_cnt == 0, 0.0, udog_wins / udog_cnt)

    upset_rate_raw = win_as_udog_raw.copy()
    being_upset_raw = np.where(
        fav_cnt == 0,
        0.2,
        (fav_mask & (grp['won'] == 0)).cumsum() / fav_cnt
    )

    df_p = pd.DataFrame({
        'match_id'              : grp['match_id'],
        'Player'                : player,
        'is_player1'            : role,
        'Date'                  : grp['Date'],
        'win_rate'              : win_rate_raw,
        'set_win_rate'          : set_win_rate_raw,
        'bagels_delivered'      : bagels_rate_raw,
        'breadsticks_delivered' : bread_rate_raw,
        'tiebreaks_won_rate'    : tb_rate_raw,
        'win_as_fav'            : win_as_fav_raw,
        'win_as_underdog'       : win_as_udog_raw,
        'upset_rate'            : upset_rate_raw,
        'being_upset_rate'      : being_upset_raw
    })
    player_stats.append(df_p)

player_stats_df = pd.concat(player_stats, ignore_index=True)

In [12]:
player_stats_df.head(5)

Unnamed: 0,match_id,Player,is_player1,Date,win_rate,set_win_rate,bagels_delivered,breadsticks_delivered,tiebreaks_won_rate,win_as_fav,win_as_underdog,upset_rate,being_upset_rate
0,2592,Added D.,1,2025-11-05,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.2
1,747,Albot R.,0,2025-04-01,0.0,0.0,0.0,0.0,0.5,0.5,0.0,0.0,0.2
2,143,Alcaraz C.,0,2025-01-13,1.0,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.0
3,175,Alcaraz C.,0,2025-01-15,1.0,0.0,0.0,0.0,0.5,1.0,0.0,0.0,0.0
4,230,Alcaraz C.,0,2025-01-21,0.666667,0.3,0.0,0.0,0.5,0.666667,0.0,0.0,0.333333


In [13]:
player_stats_df.to_excel('playerstat.xlsx', index=False)

In [14]:
from datetime import datetime

# Shift → past only
shifted_list = []

for (player, role), grp in player_stats_df.groupby(['Player', 'is_player1']):
    grp_shifted = grp.copy()

    # Only shift the STAT columns (NOT match_id, Player, is_player1, Date)
    stat_cols = [
        'win_rate', 'set_win_rate', 'bagels_delivered', 'breadsticks_delivered',
        'tiebreaks_won_rate', 'win_as_fav', 'win_as_underdog',
        'upset_rate', 'being_upset_rate'
    ]

    # Shift only stats
    grp_shifted[stat_cols] = grp_shifted[stat_cols].shift(1)

    shifted_list.append(grp_shifted)

# Combine
player_stats_shift = pd.concat(shifted_list).reset_index(drop=True)

In [15]:
player_stats_shift.to_excel('player_stats_shift.xlsx', index=False)

In [16]:
# Convert BOTH Date columns to Excel‑serial (float)
def to_excel_serial(dt):
    if pd.isna(dt):
        return np.nan
    epoch = pd.Timestamp('1899-12-30')
    delta = dt - epoch
    return delta.days + (delta.seconds / 86400.0)


if not pd.api.types.is_float_dtype(df['Date']):
    df['Date'] = df['Date'].apply(to_excel_serial)

# SHIFTED STATS: convert Date → Excel serial
player_stats_shift['Date_excel'] = player_stats_shift['Date'].apply(to_excel_serial)

In [17]:
# Merge Player 1
p1_stats = player_stats_shift[player_stats_shift['is_player1'] == 1].copy()
p1_stats = p1_stats.rename(columns=lambda x: x + '_p1' if x != 'match_id' else x)

df = df.merge(p1_stats, on='match_id', how='left')

# Merge Player 2
p2_stats = player_stats_shift[player_stats_shift['is_player1'] == 0].copy()
p2_stats = p2_stats.rename(columns=lambda x: x + '_p2' if x != 'match_id' else x)

df = df.merge(p2_stats, on='match_id', how='left')

In [18]:
df.head((5))

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,win_rate_p2,set_win_rate_p2,bagels_delivered_p2,breadsticks_delivered_p2,tiebreaks_won_rate_p2,win_as_fav_p2,win_as_underdog_p2,upset_rate_p2,being_upset_rate_p2,Date_excel_p2
0,Brisbane International,45655.0,ATP250,Outdoor,Hard,1st Round,3,Vukic A.,Goffin D.,Vukic A.,...,,,,,,,,,,45655.0
1,Brisbane International,45656.0,ATP250,Outdoor,Hard,1st Round,3,O Connell C.,Michelsen A.,Michelsen A.,...,,,,,,,,,,45656.0
2,Brisbane International,45656.0,ATP250,Outdoor,Hard,1st Round,3,Bonzi B.,Tabilo A.,Bonzi B.,...,,,,,,,,,,45656.0
3,Brisbane International,45656.0,ATP250,Outdoor,Hard,1st Round,3,Nishioka Y.,Rinderknech A.,Nishioka Y.,...,,,,,,,,,,45656.0
4,Brisbane International,45656.0,ATP250,Outdoor,Hard,1st Round,3,Thompson J.,Berrettini M.,Thompson J.,...,,,,,,,,,,45656.0


In [19]:
# Implied probabilities (current match) – margin-adjusted
df['implied_prob_p1'] = 1 / df['Odd_1']
df['implied_prob_p2'] = 1 / df['Odd_2']
margin = df['implied_prob_p1'] + df['implied_prob_p2']
df['implied_prob_norm_p1'] = df['implied_prob_p1'] / margin
df['implied_prob_norm_p2'] = df['implied_prob_p2'] / margin

In [20]:
neutral = {
    # rates (denominator exists)
    'win_rate_p1':0.5, 'set_win_rate_p1':0.5,
    'tiebreaks_won_rate_p1':0.5,
    'win_as_fav_p1':0.5, 'win_as_underdog_p1':0.0,
    'upset_rate_p1':0.0, 'being_upset_rate_p1':0.2,

    'win_rate_p2':0.5, 'set_win_rate_p2':0.5,
    'tiebreaks_won_rate_p2':0.5,
    'win_as_fav_p2':0.5, 'win_as_underdog_p2':0.0,
    'upset_rate_p2':0.0, 'being_upset_rate_p2':0.2,

    # pure counts
    'bagels_delivered_p1':0.0, 'breadsticks_delivered_p1':0.0,
    'bagels_delivered_p2':0.0, 'breadsticks_delivered_p2':0.0
}
df = df.fillna(neutral)

In [21]:
# Define a list of the current ATP top 8 players
current_rank = ['Sinner J.', 'Alcaraz C.', 'Zverev A.', 'Fritz T.', 'Shelton B.', 'De Minaur A.', 'Auger-Aliassime F.', 'Musetti L.']
# Filter the DataFrame to include only matches where at least one of the top 4 players is participating
newdf = df.loc[
    (df["Player_1"].isin(current_rank))
    | (df["Player_2"].isin(current_rank))
].copy()

In [22]:
newdf.to_excel('newdf.xlsx', index=False)

In [23]:
# Final column order & export
final_cols = [
    'WinnerID','Tournament','Date','Series','Court','Surface','Round','Best of','Winner',
    'Player_1','Player_2','Rank_1','Rank_2','Pts_1','Pts_2',
    'Odd_1','Odd_2','target',
    # implied probs
    'implied_prob_norm_p1','implied_prob_norm_p2',
    # Player 1 rolling stats
    'win_rate_p1','set_win_rate_p1','bagels_delivered_p1',
    'breadsticks_delivered_p1','tiebreaks_won_rate_p1',
    'win_as_fav_p1','win_as_underdog_p1','upset_rate_p1','being_upset_rate_p1',
    # Player 2 rolling stats
    'win_rate_p2','set_win_rate_p2','bagels_delivered_p2',
    'breadsticks_delivered_p2','tiebreaks_won_rate_p2',
    'win_as_fav_p2','win_as_underdog_p2','upset_rate_p2','being_upset_rate_p2'
]

final = newdf[final_cols].copy()
final.to_excel('playerdf_final.xlsx', index=False)

In [24]:
series_tier = {
    'Grand Slam'   : 10,
    'Masters 1000' : 9,
    'ATP 500'      : 5
}
round_map = {
    '1st Round'    : 1,
    '2nd Round'    : 2,
    '3rd Round'    : 3,
    '4th Round'    : 4,
    'Quarterfinals': 5,
    'Semifinals'   : 6,
    'Final'        : 7,
    'The Final'    : 7
}

final['Tournament_Tier'] = final['Series'].map(series_tier).fillna(1).astype(int)
final['Round_v']         = final['Round'].map(round_map).fillna(0).astype(int)
final['Importance']      = final['Tournament_Tier'] * final['Round_v']

Calculate Probability of Winning ($P$)

Historical Win Probability ($P_{old}$):
    \begin{equation}
        P_{old} = \frac{263}{343} \approx 0.7667 \quad (76.7\%)
    \end{equation}

Post-Ban Win Probability ($P_{new}$):
    \begin{equation}
        P_{new} = \frac{51}{57} \approx 0.8947 \quad (89.5\%)
    \end{equation}
Calculate the Statistical Odds
  \begin{equation}
    Odds_{old} = \frac{P_{old}}{1 - P_{old}} = \frac{0.7667}{1 - 0.7667} = \frac{0.7667}{0.2333} \approx 3.29
\end{equation}

  \begin{equation}
    Odds_{new} = \frac{P_{new}}{1 - P_{new}} = \frac{0.8947}{1 - 0.8947} = \frac{0.8947}{0.1053} \approx 8.50
\end{equation}

  Determination of Weighting Factor ($\lambda$)
  \begin{equation}
    \lambda = \frac{Odds_{new}}{Odds_{old}} = \frac{8.50}{3.29} \approx 2.58
\end{equation}

In [25]:
# Adding weight on Sinner's match data
sample_weight = np.ones(len(final))

sinner_rows = (
    (final['Player_1'] == 'Sinner J.') |
    (final['Player_2'] == 'Sinner J.')
)
sample_weight[sinner_rows] *= 2.5

print(f"Sinner matches: {sinner_rows.sum()} → boosted ×2.5")

Sinner matches: 64 → boosted ×2.5


In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# Filter out non-focus players
min_matches_per_class = 3
winner_counts = final['WinnerID'].value_counts()
valid_ids = winner_counts[winner_counts >= min_matches_per_class].index

filtered_final = final[
    final['WinnerID'].isin(valid_ids)
].copy().reset_index(drop=True)

print(f"After dropping players with < {min_matches_per_class} wins: {len(filtered_final)} rows")
print("Remaining player IDs:", sorted(filtered_final['WinnerID'].unique()))


After dropping players with < 3 wins: 502 rows
Remaining player IDs: [np.int64(0), np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(999)]


In [27]:
filtered_final['target'] = (
    filtered_final['Winner'] == filtered_final['Player_1']
).astype(int)
filtered_final.to_csv('filtered_final.csv', index=False)

In [28]:
baseline_features = [
    'Round_v', 'Best of', 'Importance',
    'tiebreaks_won_rate_p1', 'upset_rate_p1', 'tiebreaks_won_rate_p2', 'upset_rate_p2'
]
X = filtered_final[baseline_features].copy()
y = filtered_final['target'].copy()

In [29]:
X_train, X_temp, y_train, y_temp, w_train, w_temp = train_test_split(
    X, y, sample_weight,
    test_size=0.30,
    stratify=y,
    random_state=42,
    shuffle=True
)

X_val, X_test, y_val, y_test, w_val, w_test = train_test_split(
    X_temp, y_temp, w_temp,
    test_size=0.50,
    stratify=y_temp,
    random_state=42,
    shuffle=True
)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

Train: 351, Val: 75, Test: 76


In [30]:
base_model = xgb.XGBClassifier(
    n_estimators=1000,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

base_model.fit(
    X_train, y_train,
    sample_weight=w_train,
    eval_set=[(X_val, y_val)],
    sample_weight_eval_set=[w_val],
    verbose=False
)

# MANUAL EARLY STOPPING
evals_result = base_model.evals_result()
val_logloss = evals_result['validation_0']['logloss']
best_iter = np.argmin(val_logloss)

print(f"Best iteration: {best_iter} (val_logloss: {val_logloss[best_iter]:.4f})")

Best iteration: 11 (val_logloss: 0.6714)


In [31]:
final_model = xgb.XGBClassifier(
    n_estimators=best_iter + 1,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

final_model.fit(X_train, y_train, sample_weight=w_train)

In [32]:
y_pred_prob = final_model.predict_proba(X_test)[:, 1]
y_pred = (y_pred_prob >= 0.5).astype(int)

print(f"\nTest Accuracy : {accuracy_score(y_test, y_pred):.4f}")
print(f"Test ROC-AUC  : {roc_auc_score(y_test, y_pred_prob):.4f}")

print("\nClassification Report:")
print(classification_report(y_test, y_pred,
                            target_names=['Player_2 wins', 'Player_1 wins'],
                            digits=4))


Test Accuracy : 0.6184
Test ROC-AUC  : 0.6903

Classification Report:
               precision    recall  f1-score   support

Player_2 wins     0.6842    0.3611    0.4727        36
Player_1 wins     0.5965    0.8500    0.7010        40

     accuracy                         0.6184        76
    macro avg     0.6404    0.6056    0.5869        76
 weighted avg     0.6380    0.6184    0.5929        76



In [33]:
importances = pd.Series(base_model.feature_importances_, index=baseline_features)
importances = importances.sort_values(ascending=False)
print(importances)

upset_rate_p1            0.197320
upset_rate_p2            0.157739
Best of                  0.149286
tiebreaks_won_rate_p1    0.134044
tiebreaks_won_rate_p2    0.128026
Round_v                  0.119141
Importance               0.114445
dtype: float32


In [34]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
full_features = [
    # Match context
    'Round_v', 'Best of',

    # Rankings & Points
    'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2',

    # Odds
    'Odd_1', 'Odd_2',

    # Implied probabilities (correct names)
    'implied_prob_norm_p1', 'implied_prob_norm_p2',

    # Player 1 Rolling Stats
    'win_rate_p1', 'set_win_rate_p1',
    'bagels_delivered_p1', 'breadsticks_delivered_p1', 'tiebreaks_won_rate_p1',
    'win_as_fav_p1', 'win_as_underdog_p1', 'upset_rate_p1', 'being_upset_rate_p1',

    # Player 2 Rolling Stats
    'win_rate_p2', 'set_win_rate_p2',
    'bagels_delivered_p2', 'breadsticks_delivered_p2', 'tiebreaks_won_rate_p2',
    'win_as_fav_p2', 'win_as_underdog_p2', 'upset_rate_p2', 'being_upset_rate_p2'
]

X_full = filtered_final[full_features].copy()
y_full = filtered_final['target'].copy()

X_train, X_temp, y_train, y_temp, w_train, w_temp = train_test_split(
    X_full, y_full, sample_weight,
    test_size=0.30, stratify=y_full, random_state=42, shuffle=True
)
X_val, X_test, y_val, y_test, w_val, w_test = train_test_split(
    X_temp, y_temp, w_temp,
    test_size=0.50, stratify=y_temp, random_state=42, shuffle=True
)

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

Train: 351, Val: 75, Test: 76


In [35]:
param_dist = {
    'n_estimators'     : [200, 300, 500, 800, 1000],
    'max_depth'        : [4, 5, 6, 7],
    'learning_rate'    : [0.01, 0.05, 0.1],
    'subsample'        : [0.8, 0.9, 1.0],
    'colsample_bytree' : [0.8, 0.9, 1.0],
    'min_child_weight' : [1, 3, 5],
    'gamma'            : [0, 0.1, 0.2],
    'reg_alpha'        : [0, 0.1],
    'reg_lambda'       : [1.0, 1.5]
}

# Base estimator
base_estimator = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    random_state=42,
    n_jobs=-1,
    verbosity=0
)

random_search = RandomizedSearchCV(
    estimator=base_estimator,
    param_distributions=param_dist,
    n_iter=100,
    scoring='roc_auc',
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    n_jobs=-1,
    verbose=1,
    random_state=42
)

# Fit WITHOUT early_stopping_rounds
random_search.fit(X_train, y_train, sample_weight=w_train)

print(f"Best CV ROC-AUC: {random_search.best_score_:.4f}")
print("Best parameters:", random_search.best_params_)

best_model = random_search.best_estimator_

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best CV ROC-AUC: 0.8652
Best parameters: {'subsample': 0.9, 'reg_lambda': 1.5, 'reg_alpha': 0, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.9}


In [36]:
# Best model from RandomizedSearchCV
best_model = random_search.best_estimator_

# Predict on held-out test set
y_pred      = best_model.predict(X_test)
y_pred_prob = best_model.predict_proba(X_test)[:, 1]

In [37]:
# Results
print("\n=== FINAL MODEL PERFORMANCE (Held-Out Test Set) ===")
print(f"Best hyperparameters : {random_search.best_params_}")
print(f"Best CV ROC-AUC       : {random_search.best_score_:.4f}")
print(f"Best iteration (est.) : {best_model.n_estimators}")

print(f"\nTest Accuracy         : {accuracy_score(y_test, y_pred):.4f}")
print(f"Test ROC-AUC          : {roc_auc_score(y_test, y_pred_prob):.4f}")

print("\nClassification Report:")
print(classification_report(
    y_test, y_pred,
    target_names=['Player_2 wins', 'Player_1 wins'],
    digits=4
))


=== FINAL MODEL PERFORMANCE (Held-Out Test Set) ===
Best hyperparameters : {'subsample': 0.9, 'reg_lambda': 1.5, 'reg_alpha': 0, 'n_estimators': 200, 'min_child_weight': 5, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.2, 'colsample_bytree': 0.9}
Best CV ROC-AUC       : 0.8652
Best iteration (est.) : 200

Test Accuracy         : 0.7632
Test ROC-AUC          : 0.8319

Classification Report:
               precision    recall  f1-score   support

Player_2 wins     0.8214    0.6389    0.7188        36
Player_1 wins     0.7292    0.8750    0.7955        40

     accuracy                         0.7632        76
    macro avg     0.7753    0.7569    0.7571        76
 weighted avg     0.7729    0.7632    0.7591        76



In [38]:
best_importances = pd.Series(best_model.feature_importances_, index=full_features)
best_importances = best_importances.sort_values(ascending=False)
print(best_importances)

implied_prob_norm_p2        0.270240
implied_prob_norm_p1        0.233587
Odd_1                       0.143283
Odd_2                       0.030232
tiebreaks_won_rate_p1       0.020327
upset_rate_p2               0.020134
Pts_2                       0.019697
Rank_2                      0.019647
Pts_1                       0.019350
bagels_delivered_p1         0.019207
Round_v                     0.018765
win_as_fav_p2               0.018621
Rank_1                      0.017205
win_rate_p2                 0.016646
being_upset_rate_p1         0.016334
breadsticks_delivered_p1    0.014498
set_win_rate_p1             0.013811
win_as_underdog_p2          0.013730
win_as_fav_p1               0.011658
being_upset_rate_p2         0.010254
tiebreaks_won_rate_p2       0.009825
set_win_rate_p2             0.009364
win_as_underdog_p1          0.008908
win_rate_p1                 0.008670
breadsticks_delivered_p2    0.008416
bagels_delivered_p2         0.007590
Best of                     0.000000
u

In [None]:
import joblib

# Best model
joblib.dump({
    'best_model'        : best_model,
    'full_features'     : full_features,
    'focus_players': focus_players
}, 'best_predictor.pkl')

print("Model saved → 'best_predictor.pkl'")

Model saved → 'best_predictor.pkl'


In [None]:
# Base model
joblib.dump({
    'model'        : final_model,
    'features'     : baseline_features,
    'focus_players': focus_players
}, 'base_predictor.pkl')

print("Model saved → 'base_predictor.pkl'")

Model saved → 'base_predictor.pkl'


In [None]:
import random
import itertools
from collections import Counter
from sklearn.preprocessing import LabelEncoder

# Best model
best_saved = joblib.load('best_predictor.pkl')
best_model = best_saved['best_model']
best_features = best_saved['full_features']
focus_players = best_saved.get('focus_players', [])

In [None]:
# Baseline model
base_saved = joblib.load('base_predictor.pkl')
base_model = base_saved['model']
base_features = base_saved['features']
focus_players = base_saved.get('focus_players', [])

In [None]:
# 2. PLAYERS WE WILL SIMULATE
all_players = [
    'Alcaraz C.', 'Fritz T.', 'De Minaur A.', 'Musetti L.',
    'Sinner J.', 'Zverev A.', 'Shelton B.', 'Auger-Aliassime F.'
]

print("Players in Finals:")
for i, p in enumerate(all_players, 1):
    print(f"  {i}: {p}")

Players in Finals:
  1: Alcaraz C.
  2: Fritz T.
  3: De Minaur A.
  4: Musetti L.
  5: Sinner J.
  6: Zverev A.
  7: Shelton B.
  8: Auger-Aliassime F.


### Baseline Model Simulation

In [None]:
# Extract P1/P2 columns from training features
p1_cols = [c for c in base_features if c.endswith('_p1')]
p2_cols = [c for c in base_features if c.endswith('_p2')]
base_cols = [c[:-3] for c in p1_cols]

print(f"\nTraining features: {len(base_features)} total")
print(f"P1 columns: {len(p1_cols)} | P2 columns: {len(p2_cols)}")


Training features: 7 total
P1 columns: 2 | P2 columns: 2


In [None]:
# Prepare numeric dataframe
playerdf_numeric = filtered_final.copy()

def ensure_numeric(df, cols):
    return df[cols].apply(pd.to_numeric, errors='coerce').fillna(0.0)

playerdf_numeric[p1_cols] = ensure_numeric(playerdf_numeric, p1_cols)
playerdf_numeric[p2_cols] = ensure_numeric(playerdf_numeric, p2_cols)

In [None]:
# Build Symmetric stats
# All players that appear in the data
players_in_data = pd.unique(
    pd.concat([playerdf_numeric['Player_1'], playerdf_numeric['Player_2']])
)

In [None]:
# Rename P2 → P1 naming convention
def rename_p2_to_p1(df):
    rename_dict = {col: col.replace('_p2', '_p1') for col in df.columns if col.endswith('_p2')}
    return df.rename(columns=rename_dict)

In [None]:
# Compute stats for each player
player_stats_map = {}

for name in players_in_data:
    p1_mask = playerdf_numeric['Player_1'] == name
    p2_mask = playerdf_numeric['Player_2'] == name

    # P1 stats (already correct naming)
    p1_stats = playerdf_numeric.loc[p1_mask, p1_cols].mean()

    # P2 stats → rename to P1 naming
    p2_stats = playerdf_numeric.loc[p2_mask, p2_cols]
    if not p2_stats.empty:
        p2_stats = rename_p2_to_p1(p2_stats).mean()
    else:
        p2_stats = pd.Series(0.0, index=p1_cols)

    # Combine & fill missing
    combined = pd.concat([p1_stats, p2_stats]).groupby(level=0).mean()
    for col in p1_cols:
        if col not in combined:
            combined[col] = 0.0

    player_stats_map[name] = combined[p1_cols]

# Fill players that never played (neutral zeros)
zero_series = pd.Series(0.0, index=p1_cols)
for name in all_players:
    if name not in player_stats_map:
        player_stats_map[name] = zero_series

print(f"\nSymmetric stats ready for {len(player_stats_map)} players.")


Symmetric stats ready for 139 players.


In [None]:
# CURRENT RANK / POINTS (Nov 11, 2025)
rank_pts = {
    'Alcaraz C.'        : (1, 11050),
    'Sinner J.'         : (2, 10000),
    'Zverev A.'         : (3, 4960),
    'Fritz T.'          : (6, 3935),
    'Shelton B.'        : (5, 3970),
    'De Minaur A.'      : (7, 3935),
    'Auger-Aliassime F.': (8, 3845),
    'Musetti L.'        : (9, 3940),
}

In [None]:
# Building Match Row for Baseline Model
def build_match_row(p1, p2, round_v, best_of, importance):
    row = {
        'Round_v'    : round_v,
        'Best of'    : best_of,
        'Importance' : importance,
        'Rank_1'     : rank_pts.get(p1, (0, 0))[0],
        'Rank_2'     : rank_pts.get(p2, (0, 0))[0],
        'Pts_1'      : rank_pts.get(p1, (0, 0))[1],
        'Pts_2'      : rank_pts.get(p2, (0, 0))[1],
        'Odd_1'      : 1.0,
        'Odd_2'      : 1.0,
        'p1_implied_prob_norm': 0.5,
        'p2_implied_prob_norm': 0.5,
    }

    s1 = player_stats_map.get(p1, pd.Series(0.0, index=base_cols))
    s2 = player_stats_map.get(p2, pd.Series(0.0, index=base_cols))

    for base in base_cols:
        row[f'{base}_p1'] = float(s1.get(base, 0.0))
        row[f'{base}_p2'] = float(s2.get(base, 0.0))

    return pd.DataFrame([row])[base_features]

In [None]:
# PRE-COMPUTE ALL MATCH PROBABILITIES (24 matches)
match_probs = {}

for group in (all_players[:4], all_players[4:]):
    for p1, p2 in itertools.combinations(group, 2):
        for order in ((p1, p2), (p2, p1)):
            X = build_match_row(*order, round_v=5.0, best_of=3, importance=45.0)
            prob = base_model.predict_proba(X)[0, 1]   # P(Player_1 wins)
            match_probs[order] = prob

print(f"\nPre-computed {len(match_probs)} match probabilities.")


Pre-computed 24 match probabilities.


In [None]:
connors_group = ['Alcaraz C.', 'Fritz T.', 'De Minaur A.', 'Musetti L.']
borg_group    = ['Sinner J.', 'Zverev A.', 'Shelton B.', 'Auger-Aliassime F.']

# MONTE-CARLO + PER-ROUND STATS
N_SIMS = 10_000
champion_counter = Counter()
rr_stats = {p: {'matches': 0, 'wins': 0} for p in all_players}
sf_stats = {p: {'matches': 0, 'wins': 0} for p in all_players}
final_stats = {p: {'matches': 0, 'wins': 0} for p in all_players}

rng = np.random.default_rng(42)
champion_counter = Counter()
group_wins = {p: np.zeros(N_SIMS) for p in all_players}

# Vectorised simulation
rng = np.random.default_rng(42)
for sim in range(N_SIMS):
    rr_wins = Counter()
    # Round-Robin
    for (p1, p2), prob in match_probs.items():
        rr_stats[p1]['matches'] += 1
        rr_stats[p2]['matches'] += 1
        if rng.random() < prob:
            rr_wins[p1] += 1
            rr_stats[p1]['wins'] += 1
        else:
            rr_stats[p2]['wins'] += 1

    # Group top-2
    c_top2 = sorted(all_players[:4], key=lambda p: rr_wins[p], reverse=True)[:2]
    b_top2 = sorted(all_players[4:], key=lambda p: rr_wins[p], reverse=True)[:2]

    # Semifinals
    sf1 = (c_top2[0], b_top2[1])
    sf2 = (b_top2[0], c_top2[1])
    sf1_prob = base_model.predict_proba(build_match_row(*sf1, 6.0, 3, 54.0))[:, 1][0]
    sf2_prob = base_model.predict_proba(build_match_row(*sf2, 6.0, 3, 54.0))[:, 1][0]
    sf1_w = sf1[0] if rng.random() < sf1_prob else sf1[1]
    sf2_w = sf2[0] if rng.random() < sf2_prob else sf2[1]

    sf_stats[sf1[0]]['matches'] += 1; sf_stats[sf1[1]]['matches'] += 1
    sf_stats[sf2[0]]['matches'] += 1; sf_stats[sf2[1]]['matches'] += 1
    sf_stats[sf1_w]['wins'] += 1
    sf_stats[sf2_w]['wins'] += 1

    # Final
    final = (sf1_w, sf2_w)
    final_prob = base_model.predict_proba(build_match_row(*final, 7.0, 3, 50.0))[:, 1][0]
    champ = final[0] if rng.random() < final_prob else final[1]
    champion_counter[champ] += 1

    final_stats[final[0]]['matches'] += 1
    final_stats[final[1]]['matches'] += 1
    final_stats[champ]['wins'] += 1

In [None]:
# PER-ROUND STATS TABLES
def make_stats_df(stats_dict, stage):
    df = pd.DataFrame(stats_dict).T
    df['Win %'] = (df['wins'] / df['matches'].replace(0, np.nan) * 100).fillna(0).round(1).astype(str) + '%'
    df['Matches'] = df['matches']
    df['Wins'] = df['wins']
    df = df[['Matches', 'Wins', 'Win %']].sort_values('Win %', ascending=False)
    df.index.name = 'Player'
    df.columns = [f'{stage} Matches', f'{stage} Wins', f'{stage} Win %']
    return df

In [None]:
rr_df   = make_stats_df(rr_stats,   'RR')
sf_df   = make_stats_df(sf_stats,   'SF')
final_df = make_stats_df(final_stats, 'Final')

In [None]:
# DISPLAY & EXPORT
print("\n=== 2025 ATP FINALS – 10,000 SIMULATIONS – WITH BASELINE MODEL===")
print("\nRound-Robin Stats:")
display(rr_df)


=== 2025 ATP FINALS – 10,000 SIMULATIONS – WITH BASELINE MODEL===

Round-Robin Stats:


Unnamed: 0_level_0,RR Matches,RR Wins,RR Win %
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
De Minaur A.,60000,30039,50.1%
Sinner J.,60000,30037,50.1%
Shelton B.,60000,30046,50.1%
Alcaraz C.,60000,30021,50.0%
Musetti L.,60000,30000,50.0%
Auger-Aliassime F.,60000,29987,50.0%
Fritz T.,60000,29940,49.9%
Zverev A.,60000,29930,49.9%


In [None]:
print("\nSemi-Final Stats – WITH BASELINE MODEL:")
display(sf_df)


Semi-Final Stats – WITH BASELINE MODEL:


Unnamed: 0_level_0,SF Matches,SF Wins,SF Win %
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alcaraz C.,6809,3609,53.0%
Sinner J.,6798,3559,52.4%
Fritz T.,5765,2828,49.1%
Zverev A.,5751,2826,49.1%
Shelton B.,4325,2125,49.1%
Musetti L.,3185,1537,48.3%
Auger-Aliassime F.,3126,1493,47.8%
De Minaur A.,4241,2023,47.7%


In [None]:
print("\nFinal Stats – WITH BASELINE MODEL:")
display(final_df)


Final Stats – WITH BASELINE MODEL:


Unnamed: 0_level_0,Final Matches,Final Wins,Final Win %
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alcaraz C.,3609,1944,53.9%
Shelton B.,2125,1089,51.2%
Auger-Aliassime F.,1493,753,50.4%
Musetti L.,1537,771,50.2%
Fritz T.,2828,1404,49.6%
Zverev A.,2826,1400,49.5%
De Minaur A.,2023,984,48.6%
Sinner J.,3559,1655,46.5%


In [None]:
# Champion probability
champ_df = pd.DataFrame([
    {'Player': p, 'Champion %': f"{(c/N_SIMS)*100:.2f}%"}
    for p, c in champion_counter.most_common()
]).set_index('Player')

print("\nChampion Probability – WITH BASELINE MODEL:")
display(champ_df)


Champion Probability – WITH BASELINE MODEL:


Unnamed: 0_level_0,Champion %
Player,Unnamed: 1_level_1
Alcaraz C.,19.44%
Sinner J.,16.55%
Fritz T.,14.04%
Zverev A.,14.00%
Shelton B.,10.89%
De Minaur A.,9.84%
Musetti L.,7.71%
Auger-Aliassime F.,7.53%


### Best Model

In [None]:
# Extract P1/P2 columns from training features
best_p1_cols = [c for c in best_features if c.endswith('_p1')]
best_p2_cols = [c for c in best_features if c.endswith('_p2')]
base_cols = [c[:-3] for c in best_p1_cols]

print(f"\nTraining features: {len(best_features)} total")
print(f"P1 columns: {len(best_p1_cols)} | P2 columns: {len(best_p2_cols)}")


Training features: 28 total
P1 columns: 10 | P2 columns: 10


In [None]:
# Building Match Row for Baseline Model
def best_build_match_row(p1, p2, round_v, best_of, importance):
    row = {
        'Round_v'    : round_v,
        'Best of'    : best_of,
        'Importance' : importance,
        'Rank_1'     : rank_pts.get(p1, (0, 0))[0],
        'Rank_2'     : rank_pts.get(p2, (0, 0))[0],
        'Pts_1'      : rank_pts.get(p1, (0, 0))[1],
        'Pts_2'      : rank_pts.get(p2, (0, 0))[1],
        'Odd_1'      : 1.0,
        'Odd_2'      : 1.0,
        'p1_implied_prob_norm': 0.5,
        'p2_implied_prob_norm': 0.5,
    }

    best_s1 = player_stats_map.get(best_p1, pd.Series(0.0, index=base_cols))
    best_s2 = player_stats_map.get(best_p2, pd.Series(0.0, index=base_cols))

    for base in base_cols:
        row[f'{base}_p1'] = float(best_s1.get(base, 0.0))
        row[f'{base}_p2'] = float(best_s2.get(base, 0.0))

    return pd.DataFrame([row])[best_features]

In [None]:
# PRE-COMPUTE ALL MATCH PROBABILITIES (24 matches)
best_match_probs = {}

for group in (all_players[:4], all_players[4:]):
    for best_p1, best_p2 in itertools.combinations(group, 2):
        for order in ((best_p1, best_p2), (best_p2, best_p1)):
            X = best_build_match_row(*order, round_v=5.0, best_of=3, importance=45.0)
            best_prob = best_model.predict_proba(X)[0, 1]   # P(Player_1 wins)
            best_match_probs[order] = best_prob

print(f"\nPre-computed {len(best_match_probs)} match probabilities.")


Pre-computed 24 match probabilities.


In [None]:
connors_group = ['Alcaraz C.', 'Fritz T.', 'De Minaur A.', 'Musetti L.']
borg_group    = ['Sinner J.', 'Zverev A.', 'Shelton B.', 'Auger-Aliassime F.']

# MONTE-CARLO + PER-ROUND STATS
N_SIMS = 10_000
best_champion_counter = Counter()
best_rr_stats = {p: {'matches': 0, 'wins': 0} for p in all_players}
best_sf_stats = {p: {'matches': 0, 'wins': 0} for p in all_players}
best_final_stats = {p: {'matches': 0, 'wins': 0} for p in all_players}

best_rng = np.random.default_rng(42)
best_champion_counter = Counter()
best_group_wins = {p: np.zeros(N_SIMS) for p in all_players}

# Vectorised simulation
best_rng = np.random.default_rng(42)
for sim in range(N_SIMS):
    best_rr_wins = Counter()
    # Round-Robin
    for (best_p1, best_p2), best_prob in best_match_probs.items():
        best_rr_stats[best_p1]['matches'] += 1
        best_rr_stats[best_p2]['matches'] += 1
        if best_rng.random() < best_prob:
            best_rr_wins[best_p1] += 1
            best_rr_stats[best_p1]['wins'] += 1
        else:
            best_rr_stats[best_p2]['wins'] += 1

    # Group top-2
    best_c_top2 = sorted(all_players[:4], key=lambda p: best_rr_wins[p], reverse=True)[:2]
    best_b_top2 = sorted(all_players[4:], key=lambda p: best_rr_wins[p], reverse=True)[:2]

    # Semifinals
    best_sf1 = (best_c_top2[0], best_b_top2[1])
    best_sf2 = (best_b_top2[0], best_c_top2[1])
    best_sf1_prob = best_model.predict_proba(best_build_match_row(*best_sf1, 6.0, 3, 54.0))[:, 1][0]
    best_sf2_prob = best_model.predict_proba(best_build_match_row(*best_sf2, 6.0, 3, 54.0))[:, 1][0]
    best_sf1_w = best_sf1[0] if best_rng.random() < best_sf1_prob else best_sf1[1]
    best_sf2_w = best_sf2[0] if best_rng.random() < best_sf2_prob else best_sf1[1]

    best_sf_stats[best_sf1[0]]['matches'] += 1; best_sf_stats[best_sf1[1]]['matches'] += 1
    best_sf_stats[best_sf2[0]]['matches'] += 1; best_sf_stats[best_sf2[1]]['matches'] += 1
    best_sf_stats[best_sf1_w]['wins'] += 1
    best_sf_stats[best_sf2_w]['wins'] += 1

    # Final
    best_final = (best_sf1_w, best_sf2_w)
    best_final_prob = best_model.predict_proba(best_build_match_row(*best_final, 7.0, 3, 50.0))[:, 1][0]
    best_champ = best_final[0] if best_rng.random() < best_final_prob else best_final[1]
    best_champion_counter[best_champ] += 1

    best_final_stats[best_final[0]]['matches'] += 1
    best_final_stats[best_final[1]]['matches'] += 1
    best_final_stats[best_champ]['wins'] += 1

In [None]:
best_rr_df   = make_stats_df(best_rr_stats,   'RR')
best_sf_df   = make_stats_df(best_sf_stats,   'SF')
best_final_df = make_stats_df(best_final_stats, 'Final')

In [None]:
# DISPLAY & EXPORT
print("\n=== 2025 ATP FINALS – 10,000 SIMULATIONS – WITH BEST MODEL===")
print("\nRound-Robin Stats:")
display(best_rr_df)


=== 2025 ATP FINALS – 10,000 SIMULATIONS – WITH BEST MODEL===

Round-Robin Stats:


Unnamed: 0_level_0,RR Matches,RR Wins,RR Win %
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sinner J.,60000,33726,56.2%
Alcaraz C.,60000,33540,55.9%
Auger-Aliassime F.,60000,29222,48.7%
Musetti L.,60000,29106,48.5%
De Minaur A.,60000,29010,48.4%
Shelton B.,60000,28610,47.7%
Zverev A.,60000,28442,47.4%
Fritz T.,60000,28344,47.2%


In [None]:
print("\nSemi-Final Stats – WITH BEST MODEL:")
display(best_sf_df)


Semi-Final Stats – WITH BEST MODEL:


Unnamed: 0_level_0,SF Matches,SF Wins,SF Win %
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Sinner J.,8226,6106,74.2%
Auger-Aliassime F.,2669,1574,59.0%
Alcaraz C.,8166,4726,57.9%
Shelton B.,3674,2108,57.4%
Zverev A.,5431,3016,55.5%
De Minaur A.,3741,805,21.5%
Fritz T.,5339,1125,21.1%
Musetti L.,2754,540,19.6%


In [None]:
print("\nFinal Stats – WITH BEST MODEL:")
display(best_final_df)


Final Stats – WITH BEST MODEL:


Unnamed: 0_level_0,Final Matches,Final Wins,Final Win %
Player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alcaraz C.,4726,3651,77.3%
Musetti L.,540,350,64.8%
Fritz T.,1125,726,64.5%
De Minaur A.,805,517,64.2%
Zverev A.,3016,1255,41.6%
Shelton B.,2108,857,40.7%
Auger-Aliassime F.,1574,627,39.8%
Sinner J.,6106,2017,33.0%


In [None]:
# Champion probability
best_champ_df = pd.DataFrame([
    {'Player': p, 'Champion %': f"{(c/N_SIMS)*100:.2f}%"}
    for p, c in best_champion_counter.most_common()
]).set_index('Player')

print("\nChampion Probability – WITH BEST MODEL:")
display(best_champ_df)


Champion Probability – WITH BEST MODEL:


Unnamed: 0_level_0,Champion %
Player,Unnamed: 1_level_1
Alcaraz C.,36.51%
Sinner J.,20.17%
Zverev A.,12.55%
Shelton B.,8.57%
Fritz T.,7.26%
Auger-Aliassime F.,6.27%
De Minaur A.,5.17%
Musetti L.,3.50%


### **Summary of Simulation Results**

#### **Baseline Model Results**

**Champion Probability:**

| Player             | Champion % |
| :----------------- | :--------- |
| Alcaraz C.         | 19.44%     |
| Sinner J.          | 16.55%     |
| Fritz T.           | 14.04%     |
| Zverev A.          | 14.00%     |
| Shelton B.         | 10.89%     |
| De Minaur A.       | 9.84%      |
| Musetti L.         | 7.71%      |
| Auger-Aliassime F. | 7.53%      |

**Round-Robin Stats:**

| Player             | RR Matches | RR Wins | RR Win % |
| :----------------- | :--------- | :------ | :------- |
| De Minaur A.       | 60000      | 30039   | 50.1%    |
| Sinner J.          | 60000      | 30037   | 50.1%    |
| Shelton B.         | 60000      | 30046   | 50.1%    |
| Alcaraz C.         | 60000      | 30021   | 50.0%    |
| Musetti L.         | 60000      | 30000   | 50.0%    |
| Auger-Aliassime F. | 60000      | 29987   | 50.0%    |
| Fritz T.           | 60000      | 29940   | 49.9%    |
| Zverev A.          | 60000      | 29930   | 49.9%    |

**Semi-Final Stats:**

| Player             | SF Matches | SF Wins | SF Win % |
| :----------------- | :--------- | :------ | :------- |
| Alcaraz C.         | 6809       | 3609    | 53.0%    |
| Sinner J.          | 6798       | 3559    | 52.4%    |
| Fritz T.           | 5765       | 2828    | 49.1%    |
| Zverev A.          | 5751       | 2826    | 49.1%    |
| Shelton B.         | 4325       | 2125    | 49.1%    |
| Musetti L.         | 3185       | 1537    | 48.3%    |
| Auger-Aliassime F. | 3126       | 1493    | 47.8%    |
| De Minaur A.       | 4241       | 2023    | 47.7%    |

**Final Stats:**

| Player             | Final Matches | Final Wins | Final Win % |
| :----------------- | :------------ | :--------- | :---------- |
| Alcaraz C.         | 3609          | 1944       | 53.9%       |
| Shelton B.         | 2125          | 1089       | 51.2%       |
| Auger-Aliassime F. | 1493          | 753        | 50.4%       |
| Musetti L.         | 1537          | 771        | 50.2%       |
| Fritz T.           | 2828          | 1404       | 49.6%       |
| Zverev A.          | 2826          | 1400       | 49.5%       |
| De Minaur A.       | 2023          | 984        | 48.6%       |
| Sinner J.          | 3559          | 1655       | 46.5%       |

#### **Best Model Results**

**Champion Probability:**

| Player             | Champion % |
| :----------------- | :--------- |
| Alcaraz C.         | 36.51%     |
| Sinner J.          | 20.17%     |
| Zverev A.          | 12.55%     |
| Shelton B.         | 8.57%      |
| Fritz T.           | 7.26%      |
| Auger-Aliassime F. | 6.27%      |
| De Minaur A.       | 5.17%      |
| Musetti L.         | 3.50%      |

**Round-Robin Stats:**

| Player             | RR Matches | RR Wins | RR Win % |
| :----------------- | :--------- | :------ | :------- |
| Sinner J.          | 60000      | 33726   | 56.2%    |
| Alcaraz C.         | 60000      | 33540   | 55.9%    |
| Auger-Aliassime F. | 60000      | 29222   | 48.7%    |
| Musetti L.         | 60000      | 29106   | 48.5%    |
| De Minaur A.       | 60000      | 29010   | 48.4%    |
| Shelton B.         | 60000      | 28610   | 47.7%    |
| Zverev A.          | 60000      | 28442   | 47.4%    |
| Fritz T.           | 60000      | 28344   | 47.2%    |

**Semi-Final Stats:**

| Player             | SF Matches | SF Wins | SF Win % |
| :----------------- | :--------- | :------ | :------- |
| Sinner J.          | 8226       | 6106    | 74.2%    |
| Auger-Aliassime F. | 2669       | 1574    | 59.0%    |
| Alcaraz C.         | 8166       | 4726    | 57.9%    |
| Shelton B.         | 3674       | 2108    | 57.4%    |
| Zverev A.          | 5431       | 3016    | 55.5%    |
| De Minaur A.       | 3741       | 805     | 21.5%    |
| Fritz T.           | 5339       | 1125    | 21.1%    |
| Musetti L.         | 2754       | 540     | 19.6%    |

**Final Stats:**

| Player             | Final Matches | Final Wins | Final Win % |
| :----------------- | :------------ | :--------- | :---------- |
| Alcaraz C.         | 4726          | 3651       | 77.3%       |
| Musetti L.         | 540           | 350        | 64.8%       |
| Fritz T.           | 1125          | 726        | 64.5%       |
| De Minaur A.       | 805           | 517        | 64.2%       |
| Zverev A.          | 3016          | 1255       | 41.6%       |
| Shelton B.         | 2108          | 857        | 40.7%       |
| Auger-Aliassime F. | 1574          | 627        | 39.8%       |
| Sinner J.          | 6106          | 2017       | 33.0%       |