In [1]:
from collections import Counter
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import re

# Silence all warnings for the rest of the notebook (this also hides pandas
# deprecation / SettingWithCopy warnings, so re-enable when debugging).
warnings.simplefilter("ignore")

# Seed NumPy's global random generator so the random player choice made
# further down is reproducible. Assigning to `np.seed` only creates an unused
# attribute on the numpy module and does not seed anything.
SEED = 42
np.random.seed(SEED)

# Load Dataset

In [2]:
# Each entry is (path builder, first year, last year); both years are inclusive.
# Forward slashes are used so the paths resolve on Windows as well as on
# POSIX systems, matching the 'Final_Data/...' paths used when saving below.
pathes_details = [(lambda year: f'Final_Data/atp_matches_{year}.csv', 1968, 2023),
                  (lambda year: f'Final_Data/atp_matches_futures_{year}.csv', 2000, 2023),
                  (lambda year: f'Final_Data/atp_matches_qual_chall_{year}.csv', 1978, 2023)]

# Read every yearly file first and concatenate once: calling pd.concat inside
# the loop re-copies all previously loaded rows on every iteration.
yearly_frames = [
    pd.read_csv(path_func(year))
    for path_func, first_year, last_year in pathes_details
    for year in range(first_year, last_year + 1)
]
# Row labels of the individual files are kept as-is (they repeat across files).
df = pd.concat(yearly_frames)

df.head(3)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,1968-2029,Dublin,Grass,32.0,A,19680708,270,112411,,,...,,,,,,,,,,
1,1968-2029,Dublin,Grass,32.0,A,19680708,271,126914,,,...,,,,,,,,,,
2,1968-2029,Dublin,Grass,32.0,A,19680708,272,209523,,,...,,,,,,,,,,


# Calculate each player's number of matches per tournament

In [3]:
def number_of_matches_per_player(group):
    """Add per-player match counts to a group of matches.

    A player's count is the number of rows in `group` in which their id
    appears as either `winner_id` or `loser_id`.

    Parameters
    ----------
    group : pandas.DataFrame
        Matches with `winner_id` and `loser_id` columns (the notebook passes
        one tournament at a time via `groupby('tourney_id')`).

    Returns
    -------
    pandas.DataFrame
        A new frame equal to `group` plus `winner_matches` and
        `loser_matches`. The input frame is not modified, because mutating
        the object handed to `groupby.apply` is not supported by pandas.
    """
    # Stack both id columns so that one value_counts() covers wins and losses.
    match_counts = pd.concat([group['winner_id'], group['loser_id']]).value_counts()

    return group.assign(
        winner_matches=group['winner_id'].map(match_counts),
        loser_matches=group['loser_id'].map(match_counts),
    )

# Add the match-count columns tournament by tournament, then rebuild a plain
# 0..n-1 index. Note: the displayed result is ordered by tourney_id, i.e. the
# original file order is NOT preserved by this step.
df = (
    df.groupby('tourney_id')
      .apply(number_of_matches_per_player)
      .reset_index(drop=True)
)
df.head(2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,winner_matches,loser_matches
0,1968-2016,London 3,Hard,8.0,A,19681121,300,100126,,,...,,,,,,,,,3,3
1,1968-2016,London 3,Hard,8.0,A,19681121,299,100073,,,...,,,,,,,,,3,2


# Transform round to int

In [4]:
# Ordinal position of every round label that is kept
map_round = {
    'Q1': 1, 'Q2': 2, 'Q3': 3,
    'R128': 4, 'R64': 5, 'R32': 6, 'R16': 7,
    'QF': 8, 'SF': 9, 'F': 10,
}

# Round labels that have no entry in the ordinal mapping are dropped
excluded_rounds = ['RR', 'BR', 'ER', 'Q4']
df = df[~df['round'].isin(excluded_rounds)]

# Numeric round; any label missing from `map_round` becomes NaN
df = df.assign(Round=df['round'].map(map_round))

# Function to create a continuous mapping for rounds within each group
def map_rounds(group):
    """Renumber the `Round` values of a group as consecutive integers 1..k.

    The distinct non-missing `Round` values are sorted ascending and replaced
    by their 1-based position, so a group containing rounds {5, 9, 10}
    ends up with {1, 2, 3}.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows with a numeric `Round` column (the notebook passes one
        tournament at a time via `groupby('tourney_id')`).

    Returns
    -------
    pandas.DataFrame
        A new frame with `Round` renumbered. Missing rounds stay missing and
        the input frame is not modified.
    """
    # NaN is excluded: it cannot be ordered reliably by sorted() and must not
    # consume a position in the numbering.
    distinct_rounds = sorted(group['Round'].dropna().unique())
    round_mapping = {value: position
                     for position, value in enumerate(distinct_rounds, start=1)}
    return group.assign(Round=group['Round'].map(round_mapping))

# Renumber the rounds tournament by tournament, then rebuild a plain 0..n-1
# index. Rows come back grouped by tourney_id rather than in their previous
# order.
df = (
    df.groupby('tourney_id')
      .apply(map_rounds)
      .reset_index(drop=True)
)
df.head(2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,winner_matches,loser_matches,Round
0,1968-2016,London 3,Hard,8.0,A,19681121,300,100126,,,...,,,,,,,,3,3,3
1,1968-2016,London 3,Hard,8.0,A,19681121,299,100073,,,...,,,,,,,,3,2,2


# Filtered Data

In [5]:
COLUMNS = ['tourney_name', 'surface', 'draw_size', 'tourney_date', 'winner_name', 'winner_matches',
           'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'loser_name', 'loser_hand', 'loser_ht',
           'loser_matches', 'loser_ioc', 'loser_age', 'score', 'best_of', 'Round', 'winner_rank', 'loser_rank']

# Keep only the needed columns; drop rows with any missing value and exact duplicates.
df = df[COLUMNS].dropna().drop_duplicates()

df['best_of'] = df['best_of'].astype(str)

# Remove the parenthesised number that follows a set score, e.g. '7-6(5)' -> '7-6'.
df['score'] = df['score'].str.replace(r'\(\d+\)', '', regex=True)

# Row filters that only depend on the raw columns:
#  - the score text must not contain any of the markers below,
#  - both players' hands must be recorded as 'R' or 'L'.
has_marker = df['score'].str.contains(
    'Walkover|DEF|RET|W/O|&nbsp;|Def|Played and abandoned|UNK|UNP', na=False)
hands_known = df['loser_hand'].isin(['R', 'L']) & df['winner_hand'].isin(['R', 'L'])
# .copy() so the column assignment below does not write into a slice of the old frame.
df = df[~has_marker & hands_known].copy()

# One list entry per set, e.g. '6-7 6-1 7-5' -> ['6-7', '6-1', '7-5'].
df['score'] = df['score'].str.split()

# At least two sets are required (the first and second sets are both used later).
df = df[df['score'].str.len() > 1]

# Every set must look like '<digits>-<digits>'.
pattern = r'^\d+-\d+( (?:\d+-\d+))*$'
df = df[df['score'].str.join(" ").str.match(pattern)].copy()

df.head(3)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_hand,loser_ht,loser_matches,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank
16669,Tehran,Clay,64.0,19731028,Raul Ramirez,5,R,183.0,MEX,20.3,...,R,183.0,5,AUS,29.4,"[6-7, 6-1, 7-5, 6-3]",5,6,32.0,2.0
16670,Tehran,Clay,64.0,19731028,John Newcombe,5,R,183.0,AUS,29.4,...,L,173.0,4,AUS,35.2,"[7-6, 6-3]",3,5,2.0,10.0
16673,Tehran,Clay,64.0,19731028,Rod Laver,4,L,173.0,AUS,35.2,...,R,178.0,3,GER,26.4,"[6-0, 6-3]",3,4,10.0,114.0


# Choose Random Player

In [6]:
def random_players_func(row):
    """Return 'winner' or 'loser' uniformly at random; `row` is ignored.

    Kept for callers that still apply it row by row.
    """
    return np.random.choice(['winner', 'loser'], size=1)[0]


# The draw does not depend on the row contents, so draw all labels in one
# vectorised call instead of invoking a Python function once per row.
# Uses NumPy's global random state, so results are reproducible only if that
# state has been seeded beforehand.
df['Choosen Player'] = np.random.choice(['winner', 'loser'], size=len(df))

df.head(3)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_ht,loser_matches,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank,Choosen Player
16669,Tehran,Clay,64.0,19731028,Raul Ramirez,5,R,183.0,MEX,20.3,...,183.0,5,AUS,29.4,"[6-7, 6-1, 7-5, 6-3]",5,6,32.0,2.0,loser
16670,Tehran,Clay,64.0,19731028,John Newcombe,5,R,183.0,AUS,29.4,...,173.0,4,AUS,35.2,"[7-6, 6-3]",3,5,2.0,10.0,winner
16673,Tehran,Clay,64.0,19731028,Rod Laver,4,L,173.0,AUS,35.2,...,178.0,3,GER,26.4,"[6-0, 6-3]",3,4,10.0,114.0,loser


# Extract Treatment and Outcome

In [7]:
def _chosen_player_won_set(scores, chosen_player, set_index):
    """Return a 0/1 array telling whether the chosen player took a given set.

    Parameters
    ----------
    scores : pandas.Series
        Each element is a list of set strings '<a>-<b>', where <a> is the
        match winner's game count and <b> the match loser's.
    chosen_player : pandas.Series
        'winner' or 'loser' for every match; any other value is treated like
        'loser'.
    set_index : int
        Position of the set inside each score list (0 = first set).

    Returns
    -------
    numpy.ndarray
        1 where the chosen player won strictly more games in that set, else 0.
    """
    # Parse the full numbers on both sides of '-'. Comparing only the first
    # and last characters would misread multi-digit counts such as '10-8'.
    set_games = scores.str[set_index].str.split('-')
    winner_games = set_games.str[0].astype(int)
    loser_games = set_games.str[1].astype(int)
    return np.where(chosen_player == 'winner',
                    winner_games > loser_games,
                    winner_games < loser_games).astype('int64')


# T: chosen player won the first set; Y: chosen player won the second set.
df['T'] = _chosen_player_won_set(df['score'], df['Choosen Player'], 0)
df['Y'] = _chosen_player_won_set(df['score'], df['Choosen Player'], 1)

df.head(5)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank,Choosen Player,T,Y
16669,Tehran,Clay,64.0,19731028,Raul Ramirez,5,R,183.0,MEX,20.3,...,AUS,29.4,"[6-7, 6-1, 7-5, 6-3]",5,6,32.0,2.0,loser,1,0
16670,Tehran,Clay,64.0,19731028,John Newcombe,5,R,183.0,AUS,29.4,...,AUS,35.2,"[7-6, 6-3]",3,5,2.0,10.0,winner,1,1
16673,Tehran,Clay,64.0,19731028,Rod Laver,4,L,173.0,AUS,35.2,...,GER,26.4,"[6-0, 6-3]",3,4,10.0,114.0,loser,0,0
16675,Tehran,Clay,64.0,19731028,Raul Ramirez,5,R,183.0,MEX,20.3,...,ROU,27.2,"[7-6, 6-7, 9-7]",3,4,32.0,1.0,loser,0,1
16681,Tehran,Clay,64.0,19731028,John Newcombe,5,R,183.0,AUS,29.4,...,RSA,27.1,"[7-6, 4-6, 8-6]",3,3,2.0,39.0,loser,0,1


# Feature Engineering

In [8]:
# log2(rank of the chosen player) - log2(rank of the opponent), computed on
# whole columns instead of one Python call per row.
log_winner_rank = np.log2(df['winner_rank'])
log_loser_rank = np.log2(df['loser_rank'])
df['log_rank_difference'] = np.where(df['Choosen Player'] == 'winner',
                                     log_winner_rank - log_loser_rank,
                                     log_loser_rank - log_winner_rank)

# Save Clean Dataset

In [9]:
# Columns written to the clean dataset (the *_matches columns are left out)
clean_columns = [
    'surface', 'best_of', 'Round',
    'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'winner_rank',
    'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'loser_rank',
    'log_rank_difference', 'Choosen Player', 'T', 'Y',
]
df[clean_columns].to_csv('Final_Data/dataset.csv', index=False)

# Read the file back so `dataset` holds exactly what was written to disk
# (the index was not saved, so it comes back as 0..n-1).
dataset = pd.read_csv('Final_Data/dataset.csv')

dataset.head(5)

Unnamed: 0,surface,best_of,Round,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,log_rank_difference,Choosen Player,T,Y
0,Clay,5,6,Raul Ramirez,R,183.0,MEX,20.3,32.0,John Newcombe,R,183.0,AUS,29.4,2.0,-4.0,loser,1,0
1,Clay,3,5,John Newcombe,R,183.0,AUS,29.4,2.0,Rod Laver,L,173.0,AUS,35.2,10.0,-2.321928,winner,1,1
2,Clay,3,4,Rod Laver,L,173.0,AUS,35.2,10.0,Hans Jurgen Pohmann,R,178.0,GER,26.4,114.0,3.510962,loser,0,0
3,Clay,3,4,Raul Ramirez,R,183.0,MEX,20.3,32.0,Ilie Nastase,R,183.0,ROU,27.2,1.0,-5.0,loser,0,1
4,Clay,3,3,John Newcombe,R,183.0,AUS,29.4,2.0,Raymond Moore,R,183.0,RSA,27.1,39.0,4.285402,loser,0,1


# Save Feature Dataset

In [10]:
# Columns that feed the feature table: names and countries are omitted, and
# the *_matches columns are left out as well.
feature_source_columns = [
    'surface', 'best_of', 'Round',
    'winner_hand', 'winner_ht', 'winner_age', 'winner_rank',
    'loser_hand', 'loser_ht', 'loser_age', 'loser_rank',
    'log_rank_difference', 'Choosen Player', 'T', 'Y',
]
feature_df = df[feature_source_columns]

def switch_columns(row):
    """Swap the winner_* and loser_* attributes when the loser was chosen.

    After the call the `winner_*` fields hold the chosen player's hand,
    height, age and rank and the `loser_*` fields hold the opponent's.
    Rows whose 'Choosen Player' is anything other than 'loser' are returned
    untouched. The row is modified in place and also returned.
    """
    if row['Choosen Player'] != 'loser':
        return row

    swapped_pairs = (
        ('winner_hand', 'loser_hand'),
        ('winner_ht', 'loser_ht'),
        ('winner_age', 'loser_age'),
        ('winner_rank', 'loser_rank'),
    )
    for winner_col, loser_col in swapped_pairs:
        winner_value = row[winner_col]
        row[winner_col] = row[loser_col]
        row[loser_col] = winner_value
    return row

# Move the chosen player's attributes into the winner_* slots row by row,
# relabel winner_* / loser_* as player_* / opponent_*, and drop the helper
# column that recorded which side was chosen.
feature_df = (
    feature_df
    .apply(switch_columns, axis=1)
    .rename(columns={
        'winner_hand': 'player_hand', 'winner_ht': 'player_ht',
        'winner_age': 'player_age', 'winner_rank': 'player_rank',
        'loser_hand': 'opponent_hand', 'loser_ht': 'opponent_ht',
        'loser_age': 'opponent_age', 'loser_rank': 'opponent_rank',
    })
    .drop(columns=['Choosen Player'])
)

# The row index is written as the first (unnamed) CSV column.
feature_df.to_csv('Final_Data/feature_df.csv')
feature_df

Unnamed: 0,surface,best_of,Round,player_hand,player_ht,player_age,player_rank,opponent_hand,opponent_ht,opponent_age,opponent_rank,log_rank_difference,T,Y
16669,Clay,5,6,R,183.0,29.4,2.0,R,183.0,20.3,32.0,-4.000000,1,0
16670,Clay,3,5,R,183.0,29.4,2.0,L,173.0,35.2,10.0,-2.321928,1,1
16673,Clay,3,4,R,178.0,26.4,114.0,L,173.0,35.2,10.0,3.510962,0,0
16675,Clay,3,4,R,183.0,27.2,1.0,R,183.0,20.3,32.0,-5.000000,0,1
16681,Clay,3,3,R,183.0,27.1,39.0,R,183.0,29.4,2.0,4.285402,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751054,Clay,3,1,R,196.0,20.6,1053.0,R,188.0,31.7,244.0,2.109552,0,1
751078,Clay,3,3,R,193.0,29.6,658.0,R,196.0,20.6,1053.0,-0.678346,0,1
751082,Clay,3,4,R,193.0,29.6,658.0,R,191.0,26.5,419.0,0.651137,0,0
751092,Clay,3,1,R,196.0,20.6,1081.0,R,183.0,31.5,697.0,0.633136,1,0
