In [1]:
from collections import Counter
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import re

RANDOM_SEED = 42
# Silence every warning for the rest of the notebook run.
warnings.simplefilter("ignore")
# Seed NumPy's global generator so the np.random.* draws further down are
# repeatable. Assigning to `np.seed` would only create an unused attribute on
# the numpy module and leave the generator unseeded.
np.random.seed(RANDOM_SEED)

# Load Dataset

In [2]:
# Each entry: (function building the CSV path for a year, first year, last year).
pathes_details = [(lambda year: fr'Final_Data/atp_matches_{year}.csv', 1968, 2023),
                  (lambda year: fr'Final_Data/atp_matches_futures_{year}.csv', 2000, 2023),
                  (lambda year: fr'Final_Data/atp_matches_qual_chall_{year}.csv', 1978, 2023)]

# Read every yearly file first and concatenate once: growing a frame with
# pd.concat inside the loop re-copies all previously loaded rows each time.
yearly_frames = [pd.read_csv(path_func(year))
                 for path_func, first_year, last_year in pathes_details
                 for year in range(first_year, last_year + 1)]

# ignore_index=True replaces the per-file indexes with one unique 0..n-1 index,
# so the combined frame carries no duplicate index labels.
df = pd.concat(yearly_frames, ignore_index=True)

df.head(3)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,1968-2029,Dublin,Grass,32.0,A,19680708,270,112411,,,...,,,,,,,,,,
1,1968-2029,Dublin,Grass,32.0,A,19680708,271,126914,,,...,,,,,,,,,,
2,1968-2029,Dublin,Grass,32.0,A,19680708,272,209523,,,...,,,,,,,,,,


# Calculate the number of matches per player

In [3]:
def number_of_matches_per_player(group):
    """Attach to every match row how many matches each of its players has in `group`.

    Parameters
    ----------
    group : pandas.DataFrame
        Match rows with `winner_id` and `loser_id` columns.

    Returns
    -------
    pandas.DataFrame
        A copy of `group` with two extra columns, `winner_matches` and
        `loser_matches`: the number of rows of `group` in which the row's
        winner / loser appears, counting wins and losses together.
    """
    # Work on a copy so the frame passed in by groupby.apply is never mutated.
    group = group.copy()

    # Stacking both id columns counts wins and losses in a single pass.
    match_counts = pd.concat([group['winner_id'], group['loser_id']]).value_counts()

    # Look the totals up for the winner and the loser of every row.
    group['winner_matches'] = group['winner_id'].map(match_counts)
    group['loser_matches'] = group['loser_id'].map(match_counts)

    return group

# Count matches separately inside every tourney_id group, then throw away the
# index produced by the groupby in favour of a fresh 0..n-1 one.
df = (
    df.groupby('tourney_id')
      .apply(number_of_matches_per_player)
      .reset_index(drop=True)
)
df.head(2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,winner_matches,loser_matches
0,1968-2029,Dublin,Grass,32.0,A,19680708,270,112411,,,...,,,,,,,,,2,1
1,1969-2029,Dublin,Grass,32.0,A,19690707,270,100011,,,...,,,,,,,,,3,1


# Transform round to int

In [4]:
# Ordinal position of every round code that is kept.
map_round = {'Q1': 1, 'Q2': 2, 'Q3': 3, 'R128': 4, 'R64': 5,
             'R32': 6, 'R16': 7, 'QF': 8, 'SF': 9, 'F': 10}

# Round codes removed up front; none of them has an entry in map_round.
excluded_rounds = ['RR', 'BR', 'ER', 'Q4']

# Filter and add the column in one expression: .assign builds a new frame, so
# nothing is written onto a boolean-filtered slice of the previous df.
# Any remaining code missing from map_round maps to NaN.
df = (
    df[~df['round'].isin(excluded_rounds)]
    .assign(Round=lambda matches: matches['round'].map(map_round))
)

# Function to create a continuous mapping for rounds within each group
def map_rounds(group):
    """Renumber the `Round` values of `group` as consecutive integers from 1.

    The distinct non-missing `Round` values are ranked in ascending order
    (smallest -> 1, next -> 2, ...) and each row receives the rank of its value.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a numeric `Round` column.

    Returns
    -------
    pandas.DataFrame
        A copy of `group` with `Round` replaced by its rank; rows whose
        `Round` is missing stay missing.
    """
    # Work on a copy so the frame passed in by groupby.apply is never mutated.
    group = group.copy()

    # NaN is excluded before sorting: it compares False against everything, so
    # sorting it together with real values gives an unreliable order and would
    # also hand NaN a rank of its own.
    present_rounds = sorted(group['Round'].dropna().unique())
    round_mapping = {value: position for position, value in enumerate(present_rounds, start=1)}

    group['Round'] = group['Round'].map(round_mapping)
    return group

# Renumber the rounds separately inside every tourney_id group, then throw away
# the index produced by the groupby in favour of a fresh 0..n-1 one.
df = (
    df.groupby('tourney_id')
      .apply(map_rounds)
      .reset_index(drop=True)
)
df.head(2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,winner_matches,loser_matches,Round
0,1968-2029,Dublin,Grass,32.0,A,19680708,270,112411,,,...,,,,,,,,2,1,1
1,1969-2029,Dublin,Grass,32.0,A,19690707,270,100011,,,...,,,,,,,,3,1,1


# Filter Data

In [None]:
# Columns kept for the analysis. Rows with a missing value in any of them are
# dropped, and exact duplicate rows are removed.
COLUMNS = ['tourney_name', 'surface', 'draw_size', 'tourney_date', 'winner_name', 'winner_matches',
           'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'loser_name', 'loser_hand', 'loser_ht',
           'loser_matches', 'loser_ioc', 'loser_age', 'score', 'best_of', 'Round', 'winner_rank', 'loser_rank']
df = df[COLUMNS].dropna().drop_duplicates()

df['best_of'] = df['best_of'].astype(str)


def _strip_parenthesised_digits(text):
    """Remove every "(digits)" group from a score string."""
    return re.sub(r'\(\d+\)', '', text)


def _first_token_has_7_6(score):
    """Tell whether the first whitespace-separated token contains '7-6' or '6-7'."""
    opening_set = score.split()[0]
    return '7-6' in opening_set or '6-7' in opening_set


df['score'] = df['score'].apply(_strip_parenthesised_digits)

# Keep only matches whose first set reads 7-6 or 6-7.
df = df[df['score'].apply(_first_token_has_7_6)]

# Drop rows whose score text matches any of these markers.
excluded_score_markers = 'Walkover|DEF|RET|W/O|&nbsp;|Def|Played and abandoned|UNK|UNP'
df = df[~df['score'].str.contains(excluded_score_markers, na=False)]

# From here on `score` is a list of per-set strings.
df['score'] = df['score'].str.split()

# Both players must have hand 'R' or 'L'.
hands_are_r_or_l = df['loser_hand'].isin(['R', 'L']) & df['winner_hand'].isin(['R', 'L'])
df = df[hands_are_r_or_l]

# At least two sets are required.
df = df[df['score'].apply(len) > 1]

# Every set must be exactly "<digits>-<digits>".
pattern = r'^\d+-\d+( (?:\d+-\d+))*$'
rejoined_score = df['score'].apply(" ".join)
df = df[rejoined_score.str.match(pattern)]

df.head(3)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_hand,loser_ht,loser_matches,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank
24,Guaruja CH,Clay,32.0,19850114,Gustavo Tiberti,2,R,175.0,ARG,25.0,...,L,178.0,1,USA,24.1,"[7-6, 6-0]",3,1,342.0,313.0
35,Adelaide,Hard,32.0,19900101,Sergi Bruguera,4,R,188.0,ESP,18.9,...,R,190.0,1,SWE,20.3,"[7-6, 6-2]",3,1,25.0,282.0
44,Adelaide,Hard,32.0,19950102,Yevgeny Kafelnikov,3,R,190.0,RUS,20.8,...,R,183.0,1,RSA,22.0,"[7-6, 6-1]",3,1,11.0,98.0


# Choose Random Player

In [None]:
def random_players_func(row):
    """Return 'winner' or 'loser' with equal probability; `row` is not used.

    Retained so the name stays callable row by row; the column below is
    filled with a single vectorised draw instead.
    """
    return np.random.choice(['winner', 'loser'], size=1)[0]


# One draw per match in a single call: the choice does not depend on the row,
# so a Python-level df.apply over every row is unnecessary. This form also
# works when df has no rows.
df['Choosen Player'] = np.random.choice(['winner', 'loser'], size=len(df))

df.head(3)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_ht,loser_matches,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank,Choosen Player
24,Guaruja CH,Clay,32.0,19850114,Gustavo Tiberti,2,R,175.0,ARG,25.0,...,178.0,1,USA,24.1,"[7-6, 6-0]",3,1,342.0,313.0,winner
35,Adelaide,Hard,32.0,19900101,Sergi Bruguera,4,R,188.0,ESP,18.9,...,190.0,1,SWE,20.3,"[7-6, 6-2]",3,1,25.0,282.0,loser
44,Adelaide,Hard,32.0,19950102,Yevgeny Kafelnikov,3,R,190.0,RUS,20.8,...,183.0,1,RSA,22.0,"[7-6, 6-1]",3,1,11.0,98.0,loser


# Extract Treatment and Outcome

In [None]:
def _chosen_player_won_set(row, set_index):
    """Return 1 if the chosen player won set number `set_index`, else 0.

    Parameters
    ----------
    row : pandas.Series
        A match row with `score` (list of "<games>-<games>" strings) and
        `Choosen Player` ('winner' or 'loser').
    set_index : int
        Zero-based position of the set inside `row['score']`.

    The first number of a set is treated as the match winner's games and the
    second as the match loser's games.
    """
    # Split on '-' rather than reading the first/last character, so game
    # counts with more than one digit (e.g. "10-8") are read in full.
    winner_games, loser_games = (int(games) for games in row['score'][set_index].split('-'))
    if row['Choosen Player'] == 'winner':
        return int(winner_games > loser_games)
    return int(winner_games < loser_games)


# T: the chosen player won the first set. Y: the chosen player won the second set.
df['T'] = df.apply(_chosen_player_won_set, axis=1, args=(0,))
df['Y'] = df.apply(_chosen_player_won_set, axis=1, args=(1,))

df.head(5)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank,Choosen Player,T,Y
24,Guaruja CH,Clay,32.0,19850114,Gustavo Tiberti,2,R,175.0,ARG,25.0,...,USA,24.1,"[7-6, 6-0]",3,1,342.0,313.0,winner,1,1
35,Adelaide,Hard,32.0,19900101,Sergi Bruguera,4,R,188.0,ESP,18.9,...,SWE,20.3,"[7-6, 6-2]",3,1,25.0,282.0,loser,0,0
44,Adelaide,Hard,32.0,19950102,Yevgeny Kafelnikov,3,R,190.0,RUS,20.8,...,RSA,22.0,"[7-6, 6-1]",3,1,11.0,98.0,loser,0,0
58,Auckland,Hard,32.0,20010108,Stefan Koubek,3,L,175.0,AUT,24.0,...,ARG,25.3,"[7-6, 6-2]",3,1,52.0,14.0,winner,1,1
64,Madrid Masters,Hard,48.0,20031013,Wayne Ferreira,2,R,185.0,RSA,32.0,...,CRO,24.5,"[7-6, 7-6]",3,1,28.0,42.0,loser,0,0


# Feature Engineering

In [None]:
# log2(rank of the chosen player) - log2(rank of the other player), computed
# on whole columns instead of one Python call per row.
log_winner_rank = np.log2(df['winner_rank'])
log_loser_rank = np.log2(df['loser_rank'])
df['log_rank_difference'] = np.where(df['Choosen Player'] == 'winner',
                                     log_winner_rank - log_loser_rank,
                                     log_loser_rank - log_winner_rank)

# Save Clean Dataset

In [None]:
# Columns written to the clean dataset file.
dataset_columns = [
    'surface', 'best_of', 'Round',
    'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'winner_rank',
    'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'loser_rank',
    'log_rank_difference', 'Choosen Player', 'T', 'Y',
]

# Write the selection without the index, then load it back from disk so that
# `dataset` holds exactly what the file contains.
df[dataset_columns].to_csv('Final_Data/dataset.csv', index=False)
dataset = pd.read_csv('Final_Data/dataset.csv')

dataset.head(5)

Unnamed: 0,surface,best_of,Round,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,log_rank_difference,Choosen Player,T,Y
0,Clay,3,1,Gustavo Tiberti,R,175.0,ARG,25.0,342.0,Billy Nealon,L,178.0,USA,24.1,313.0,0.127834,winner,1,1
1,Hard,3,1,Sergi Bruguera,R,188.0,ESP,18.9,25.0,Per Henricsson,R,190.0,SWE,20.3,282.0,3.495695,loser,0,0
2,Hard,3,1,Yevgeny Kafelnikov,R,190.0,RUS,20.8,11.0,Marcos Ondruska,R,183.0,RSA,22.0,98.0,3.155278,loser,0,0
3,Hard,3,1,Stefan Koubek,L,175.0,AUT,24.0,52.0,Franco Squillari,L,183.0,ARG,25.3,14.0,1.893085,winner,1,1
4,Hard,3,1,Wayne Ferreira,R,185.0,RSA,32.0,28.0,Ivan Ljubicic,R,193.0,CRO,24.5,42.0,0.584963,loser,0,0


# Save Feature Dataset

In [None]:
# Model inputs plus treatment/outcome; names and nationalities are left out.
feature_columns = [
    'surface', 'best_of', 'Round',
    'winner_hand', 'winner_ht', 'winner_age', 'winner_rank',
    'loser_hand', 'loser_ht', 'loser_age', 'loser_rank',
    'log_rank_difference', 'Choosen Player', 'T', 'Y',
]
feature_df = df[feature_columns]

def switch_columns(row):
    """Swap the winner_* and loser_* values of `row` when the loser was chosen.

    For rows whose `Choosen Player` is 'loser', the hand, ht, age and rank
    values of winner and loser are exchanged in place. Other rows are left
    unchanged. The same `row` object is returned.
    """
    if row['Choosen Player'] != 'loser':
        return row

    for attribute in ('hand', 'ht', 'age', 'rank'):
        winner_key = f'winner_{attribute}'
        loser_key = f'loser_{attribute}'
        row[winner_key], row[loser_key] = row[loser_key], row[winner_key]
    return row

# Row by row, move the chosen player's values into the winner_* columns.
feature_df = feature_df.apply(switch_columns, axis=1)

# After the swap winner_* holds the chosen player and loser_* the other player,
# so the columns are renamed accordingly and the helper column is dropped.
player_opponent_names = {
    'winner_hand': 'player_hand', 'winner_ht': 'player_ht',
    'winner_age': 'player_age', 'winner_rank': 'player_rank',
    'loser_hand': 'opponent_hand', 'loser_ht': 'opponent_ht',
    'loser_age': 'opponent_age', 'loser_rank': 'opponent_rank',
}
feature_df = (
    feature_df
    .rename(columns=player_opponent_names)
    .drop(columns=['Choosen Player'])
)

# Train/eval split, currently disabled:
# train_df = feature_df.sample(frac=0.5, random_state=RANDOM_SEED)
# eval_df = feature_df.drop(train_df.index)

# The index is written to the file as well.
feature_df.to_csv('Final_Data/feature_df.csv')
# train_df.to_csv('Final_Data/train_feature_df.csv')
# eval_df.to_csv('Final_Data/eval_feature_df.csv')

In [None]:
# Display the final feature table as the cell output.
feature_df

Unnamed: 0,surface,best_of,Round,player_hand,player_ht,player_age,player_rank,opponent_hand,opponent_ht,opponent_age,opponent_rank,log_rank_difference,T,Y
44,Hard,3,1,R,183.0,22.0,98.0,R,190.0,20.8,11.0,3.155278,0,0
58,Hard,3,1,L,175.0,24.0,52.0,L,183.0,25.3,14.0,1.893085,1,1
74,Carpet,3,1,R,185.0,17.8,186.0,R,196.0,18.4,701.0,-1.914112,1,0
123,Hard,3,2,R,180.0,25.0,83.0,R,191.0,29.9,92.0,-0.148523,1,0
124,Hard,3,2,R,196.0,26.7,16.0,R,193.0,25.1,9.0,0.830075,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749170,Hard,3,2,R,183.0,33.0,464.0,R,165.0,20.8,725.0,-0.643856,1,1
750339,Hard,3,4,R,178.0,23.0,332.0,R,180.0,24.7,420.0,-0.339206,1,1
750408,Hard,3,5,R,188.0,20.0,224.0,R,185.0,22.7,192.0,0.222392,0,0
751236,Clay,3,3,R,175.0,28.6,295.0,R,188.0,23.8,718.0,-1.283269,0,0
