In [6]:
import requests
import pandas as pd
from data_processor import TennisDataProcessor, to_player, to_average,get_fatigue_stats

# Tournament round labels.
# NOTE(review): presumably listed from earliest stage to the final; confirm
# before relying on this order for sorting.
ROUND_ORDER = ["RR", "R128", "R64", "R32", "R16", "QF", "SF", "F"]

# Do not truncate wide frames when they are displayed.
pd.options.display.max_columns = None

# One CSV per season in the tennis_atp repository; "{}" is filled with the year.
url = "https://raw.githubusercontent.com/JeffSackmann/tennis_atp/refs/heads/master/atp_matches_{}.csv"

# Download seasons 2000 through 2024 and stack them into one frame.
# Each season keeps its own row index (ignore_index is not used).
data_20s = pd.concat(
    [
        pd.read_csv(url.format(season))
        for season in range(2000, 2025)
    ]
)


In [7]:
# The processor is given a deep copy of the downloaded matches rather than
# data_20s itself.
tdp = TennisDataProcessor(data_20s.copy(deep=True))

# derive_match_data() comes from data_processor; its result is the input to
# the per-player helpers (to_player / to_average / get_fatigue_stats).
derived_data = tdp.derive_match_data()

In [8]:
def _player_frame(name, matches, lookback=10):
    """Build the per-match frame for one player.

    Chains the data_processor helpers: ``to_player(name, matches)``, then
    ``to_average(..., lookback=lookback)``, then ``get_fatigue_stats(...)``.

    Args:
        name: Player name passed to ``to_player``.
        matches: Derived match data passed to ``to_player``.
        lookback: Forwarded to ``to_average`` as ``lookback``.

    Returns:
        Whatever ``get_fatigue_stats`` returns for that player.
    """
    return get_fatigue_stats(
        to_average(to_player(name, matches), lookback=lookback)
    )


player_1_name = "Novak Djokovic"
player_2_name = "Roger Federer"

player_1_full = _player_frame(player_1_name, derived_data)
player_2_full = _player_frame(player_2_name, derived_data)

# Head-to-head slices: the first 15 rows of each player's frame whose opponent
# is the other player. Rows keep the order produced by get_fatigue_stats; no
# re-sorting is applied before head().
# The filters use the name variables so that changing a player above cannot
# leave a stale hard-coded opponent name here.
novvsfed = player_1_full.loc[player_1_full["opponent_name"] == player_2_name].head(15)
fedvsnov = player_2_full.loc[player_2_full["opponent_name"] == player_1_name].head(15)

In [None]:
# Match-level columns used as the join key between the two per-player frames.
match_keys = ['surface', 'tourney_level', 'tourney_date', 'tourney_name', 'round', 'best_of']

# Inner join: only matches present in both head-to-head slices survive.
# Every other column name shared by the two frames is suffixed with
# '_djokovic' (left frame) or '_federer' (right frame).
df = novvsfed.merge(
    fedvsnov,
    how='inner',
    on=match_keys,
    suffixes=('_djokovic', '_federer'),
)
def reorder_players(row, keys=None, suffixes=('_djokovic', '_federer')):
    """Re-label one merged head-to-head row from per-player to winner/loser columns.

    The row is expected to hold the match key columns plus, for each player,
    a set of columns ending in that player's merge suffix. The player whose
    ``result<suffix>`` value equals ``'win'`` (checked on the first suffix)
    becomes the ``winner_*`` side; the other player becomes ``loser_*``.

    Args:
        row: One row (``pd.Series``) of the merged frame; its index holds the
            column names.
        keys: Match-level column names copied through unchanged. Defaults to
            the notebook-level ``match_keys`` list.
        suffixes: ``(first, second)`` suffixes that were passed to the merge.
            The defaults match the merge in this notebook.

    Returns:
        pd.Series: the key columns, followed by ``winner_<col>`` entries, then
        ``loser_<col>`` entries, where ``<col>`` is the original column name
        with the player suffix removed.
    """
    if keys is None:
        keys = match_keys
    first, second = suffixes

    # Any value other than 'win' for the first player puts the second player
    # on the winner side.
    if row[f'result{first}'] == 'win':
        winner_suffix, loser_suffix = first, second
    else:
        winner_suffix, loser_suffix = second, first

    def _relabel(prefix, suffix):
        # Match on the trailing suffix only, so a column that merely contains
        # the other player's suffix somewhere in its name is not picked up.
        return {
            f'{prefix}_{col.removesuffix(suffix)}': row[col]
            for col in row.index
            if col.endswith(suffix)
        }

    relabelled = {key: row[key] for key in keys}
    relabelled.update(_relabel('winner', winner_suffix))
    relabelled.update(_relabel('loser', loser_suffix))
    return pd.Series(relabelled)

# Turn every merged row into winner_/loser_ columns.
df_final = df.apply(reorder_players, axis=1)

# Per-match rate metrics; each is selected together with its "_avg" companion.
_rate_stats = ['ace_rate', 'df_rate', '1stWon_pct', '2ndWon_pct', '1stsv_acc', 'rt_won_pct']

# Columns selected once for the winner and once for the loser, in display order.
_per_player = (
    ['player_name', 'player_rank_points']
    + [f'player_{stat}{tail}' for stat in _rate_stats for tail in ('', '_avg')]
    + ['games_played_tournament', 'games_played_last_30_days']
)

# Columns noted as duplicated across winner_/loser_ ("keep dupe"): only the
# winner_ copy is selected, and it is renamed to the bare column name.
_match_level = ['result', 'set_count', 'tiebreak_count', 'games_in_sets', 'total_games', 'gps']

_selected = (
    list(match_keys)
    + [f'winner_{col}' for col in _per_player]
    + [f'loser_{col}' for col in _per_player]
    + [f'winner_{col}' for col in _match_level]
)

# Strip "player_" everywhere, then drop the "winner_" prefix from the
# match-level columns. 'winner_result' is included so the frame matches the
# recorded output, which shows a plain 'result' column.
_renamed = (
    {col: col.replace("player_", "") for col in _selected}
    | {f'winner_{col}': col for col in _match_level}
)

df_final = df_final[_selected].rename(columns=_renamed)
df_final

Unnamed: 0,surface,tourney_level,tourney_date,tourney_name,round,best_of,winner_name,winner_rank_points,winner_ace_rate,winner_ace_rate_avg,winner_df_rate,winner_df_rate_avg,winner_1stWon_pct,winner_1stWon_pct_avg,winner_2ndWon_pct,winner_2ndWon_pct_avg,winner_1stsv_acc,winner_1stsv_acc_avg,winner_rt_won_pct,winner_rt_won_pct_avg,winner_games_played_tournament,winner_games_played_last_30_days,loser_name,loser_rank_points,loser_ace_rate,loser_ace_rate_avg,loser_df_rate,loser_df_rate_avg,loser_1stWon_pct,loser_1stWon_pct_avg,loser_2ndWon_pct,loser_2ndWon_pct_avg,loser_1stsv_acc,loser_1stsv_acc_avg,loser_rt_won_pct,loser_rt_won_pct_avg,loser_games_played_tournament,loser_games_played_last_30_days,result,set_count,tiebreak_count,games_in_sets,total_games,gps
0,Clay,M,2006-04-17,Monte Carlo Masters,R64,3,Roger Federer,6810.0,0.013699,0.068755,0.0,0.021276,0.717949,0.780393,0.617647,0.586911,0.534247,0.652617,0.414894,0.436405,0,140,Novak Djokovic,606.0,0.010638,0.084735,0.042553,0.033113,0.590164,0.732654,0.575758,0.521551,0.648936,0.609041,0.328767,0.384991,0,35,win,3,0,"[9, 8, 9]",26,8.666667
1,Hard,G,2007-01-15,Australian Open,R16,5,Roger Federer,8120.0,0.15,0.111937,0.0125,0.018004,0.8,0.787162,0.666667,0.57897,0.625,0.651684,0.445545,0.442666,84,84,Novak Djokovic,1530.0,0.069307,0.101327,0.019802,0.029425,0.612903,0.765971,0.461538,0.542358,0.613861,0.668232,0.25,0.461745,85,184,win,3,0,"[8, 12, 9]",29,9.666667
2,Hard,A,2007-02-26,Dubai,QF,3,Roger Federer,8120.0,0.08,0.103524,0.01,0.011573,0.741935,0.769825,0.578947,0.656706,0.62,0.619629,0.397959,0.459374,52,52,Novak Djokovic,1665.0,0.061224,0.08177,0.040816,0.02197,0.666667,0.728248,0.485714,0.556308,0.642857,0.676204,0.32,0.403913,51,168,win,3,1,"[9, 13, 9]",31,10.333333
3,Hard,M,2007-08-05,Canada Masters,F,3,Novak Djokovic,3200.0,0.0625,0.095101,0.03125,0.026786,0.698413,0.752009,0.575758,0.544715,0.65625,0.688683,0.343137,0.397218,79,125,Roger Federer,7290.0,0.147059,0.13526,0.009804,0.013841,0.768116,0.828005,0.424242,0.599295,0.676471,0.660158,0.34375,0.398917,80,80,win,3,2,"[13, 8, 13]",34,11.333333
4,Hard,G,2007-08-27,US Open,F,5,Roger Federer,7605.0,0.103774,0.124915,0.037736,0.017098,0.787879,0.775983,0.625,0.625903,0.622642,0.640259,0.362069,0.403724,194,429,Novak Djokovic,3670.0,0.043103,0.107708,0.060345,0.037627,0.703125,0.771229,0.557692,0.55423,0.551724,0.631155,0.273585,0.392816,217,347,win,3,2,"[13, 13, 10]",36,12.0
5,Hard,G,2008-01-14,Australian Open,SF,5,Novak Djokovic,4315.0,0.115044,0.093239,0.053097,0.036987,0.779221,0.702973,0.416667,0.519855,0.681416,0.613504,0.394495,0.39501,132,132,Roger Federer,7180.0,0.091743,0.13593,0.018349,0.010334,0.705882,0.775532,0.439024,0.650178,0.623853,0.708202,0.336283,0.441733,172,172,win,3,1,"[12, 9, 13]",34,11.333333
6,Clay,M,2008-04-20,Monte Carlo Masters,SF,3,Roger Federer,6425.0,0.02381,0.105804,0.0,0.021864,0.818182,0.760669,0.5,0.586891,0.52381,0.626152,0.483871,0.426944,76,265,Novak Djokovic,4725.0,0.032258,0.073152,0.048387,0.019048,0.594595,0.741511,0.4,0.588215,0.596774,0.628737,0.333333,0.458486,50,82,win,3,0,"[9, 5]",14,4.666667
7,Hard,G,2008-08-25,US Open,SF,5,Roger Federer,5930.0,0.16,0.120028,0.008,0.01751,0.7625,0.798103,0.6,0.604897,0.64,0.642216,0.379032,0.385792,170,319,Novak Djokovic,5105.0,0.048387,0.100528,0.040323,0.02257,0.7125,0.766555,0.454545,0.570763,0.645161,0.690167,0.296,0.398553,196,415,win,4,0,"[9, 12, 12, 8]",41,10.25
8,Hard,M,2009-03-25,Miami Masters,SF,3,Novak Djokovic,8420.0,0.054795,0.059743,0.054795,0.031362,0.686275,0.718581,0.545455,0.599566,0.69863,0.621117,0.4875,0.411906,70,274,Roger Federer,10910.0,0.05,0.076305,0.05,0.034363,0.583333,0.786383,0.40625,0.562919,0.6,0.608319,0.356164,0.390279,80,202,win,3,0,"[9, 8, 9]",26,8.666667
9,Clay,M,2009-04-27,Rome Masters,SF,3,Novak Djokovic,9160.0,0.03,0.045851,0.06,0.05043,0.64,0.697226,0.56,0.478696,0.5,0.618539,0.432099,0.484291,52,170,Roger Federer,10060.0,0.074074,0.063531,0.037037,0.029836,0.725,0.763436,0.414634,0.592105,0.493827,0.587782,0.4,0.401785,58,100,win,3,0,"[10, 9, 9]",28,9.333333


In [None]:
# Replace player_ with ''
# Scratch list of the winner_/loser_ column names, annotated with which of the
# duplicated match-level columns to keep and which to drop. Every entry needs
# its own trailing comma: adjacent string literals without one are silently
# joined into a single string.
[
 'surface',
 'tourney_level',
 'tourney_date',
 'tourney_name',
 'round',
 'best_of',
 'winner_player_name',
 'winner_player_rank_points',
 'winner_opponent_name',
 'winner_player_ace_rate',
 'winner_player_ace_rate_avg',
 'winner_player_df_rate',
 'winner_player_df_rate_avg',
 'winner_player_1stWon_pct',
 'winner_player_1stWon_pct_avg',
 'winner_player_2ndWon_pct',
 'winner_player_2ndWon_pct_avg',
 'winner_player_1stsv_acc',
 'winner_player_1stsv_acc_avg',
 'winner_player_rt_won_pct',
 'winner_player_rt_won_pct_avg',
 'winner_result', # keep dupe
 'winner_set_count', # keep dupe
 'winner_tiebreak_count', # keep dupe
 'winner_games_in_sets', # keep dupe
 'winner_total_games', # keep dupe
 'winner_gps', # keep dupe
 'winner_games_played_tournament',
 'winner_games_played_last_30_days',
 'loser_player_name',
 'loser_player_rank_points',
 'loser_opponent_name',
 'loser_player_ace_rate',
 'loser_player_ace_rate_avg',
 'loser_player_df_rate',
 'loser_player_df_rate_avg',
 'loser_player_1stWon_pct',
 'loser_player_1stWon_pct_avg',
 'loser_player_2ndWon_pct',
 'loser_player_2ndWon_pct_avg',
 'loser_player_1stsv_acc',
 'loser_player_1stsv_acc_avg',
 'loser_player_rt_won_pct',
 'loser_player_rt_won_pct_avg',
 'loser_games_played_tournament',
 'loser_games_played_last_30_days',
 'loser_result', # drop dupe
 'loser_set_count', # drop dupe
 'loser_tiebreak_count', # drop dupe
 'loser_games_in_sets', # drop dupe
 'loser_total_games', # drop dupe
 'loser_gps', # drop dupe
 ]

['surface',
 'tourney_level',
 'tourney_date',
 'tourney_name',
 'round',
 'best_of',
 'winner_player_name',
 'winner_player_rank_points',
 'winner_opponent_name',
 'winner_player_ace_rate',
 'winner_player_ace_rate_avg',
 'winner_player_df_rate',
 'winner_player_df_rate_avg',
 'winner_player_1stWon_pct',
 'winner_player_1stWon_pct_avg',
 'winner_player_2ndWon_pct',
 'winner_player_2ndWon_pct_avg',
 'winner_player_1stsv_acc',
 'winner_player_1stsv_acc_avg',
 'winner_player_rt_won_pct',
 'winner_player_rt_won_pct_avg',
 'winner_result',
 'winner_set_count',
 'winner_tiebreak_count',
 'winner_games_in_sets',
 'winner_total_games',
 'winner_gps',
 'winner_games_played_tournament',
 'winner_games_played_last_30_days',
 'loser_player_name',
 'loser_player_rank_points',
 'loser_opponent_name',
 'loser_player_ace_rate',
 'loser_player_ace_rate_avg',
 'loser_player_df_rate',
 'loser_player_df_rate_avg',
 'loser_player_1stWon_pct',
 'loser_player_1stWon_pct_avg',
 'loser_player_2ndWon_pct',
 'l

In [11]:
# Next steps: 
# Change columns on df_final
# games per set
# Build model