In [1]:
from collections import Counter
import matplotlib.pyplot as plt
from datetime import datetime
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
import re

RANDOM_SEED = 42
warnings.simplefilter("ignore")
np.seed = RANDOM_SEED

# Load Dataset

In [2]:
pathes_details = [(lambda year: fr'Final_Data/atp_matches_{year}.csv', 1968, 2023),
               (lambda year: fr'Final_Data/atp_matches_futures_{year}.csv', 2000, 2023),
               (lambda year: fr'Final_Data/atp_matches_qual_chall_{year}.csv', 1978, 2023)]

df = pd.DataFrame()
for path_func, first_year, last_year in pathes_details:
    for year in range(first_year, last_year + 1):
        current_df = pd.read_csv(path_func(year))
        df = pd.concat([df, current_df])

df.head(3)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_1stIn,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points
0,1968-2029,Dublin,Grass,32.0,A,19680708,270,112411,,,...,,,,,,,,,,
1,1968-2029,Dublin,Grass,32.0,A,19680708,271,126914,,,...,,,,,,,,,,
2,1968-2029,Dublin,Grass,32.0,A,19680708,272,209523,,,...,,,,,,,,,,


# Calculate number of matches each player

In [3]:
def number_of_matches_per_player(group):
    # Count matches for winners and losers
    winner_match_counts = group['winner_id'].value_counts().to_dict()
    loser_match_count = group['loser_id'].value_counts().to_dict()
    # Combine using Counter
    combined_dict = Counter(winner_match_counts) + Counter(loser_match_count)
    match_counts = dict(combined_dict)
    
    # Get winner and loser match counts using a direct lookup
    group['winner_matches'] = group['winner_id'].map(match_counts)
    group['loser_matches'] = group['loser_id'].map(match_counts)
    
    return group

# Apply the mapping and preserve original order
df = df.groupby('tourney_id').apply(number_of_matches_per_player).reset_index(drop=True)
df.head(2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,winner_matches,loser_matches
0,1968-2029,Dublin,Grass,32.0,A,19680708,270,112411,,,...,,,,,,,,,2,1
1,1969-2029,Dublin,Grass,32.0,A,19690707,270,100011,,,...,,,,,,,,,3,1


# Transform round to int

In [4]:
df = df[~df['round'].isin(['RR', 'BR', 'ER', 'Q4'])]

# Map initial rounds to integers
map_round = {'Q1': 1, 'Q2': 2, 'Q3': 3, 'R128': 4, 'R64': 5,
             'R32': 6, 'R16': 7, 'QF': 8, 'SF': 9, 'F': 10}

df['Round'] = df['round'].map(map_round)

# Function to create a continuous mapping for rounds within each group
def map_rounds(group):
    unique_rounds = group['Round'].unique()
    round_mapping = {value: idx + 1 for idx, value in enumerate(sorted(unique_rounds))}
    group['Round'] = group['Round'].map(round_mapping)
    return group

# Apply the mapping and preserve original order
df = df.groupby('tourney_id').apply(map_rounds).reset_index(drop=True)
df.head(2)

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,winner_id,winner_seed,winner_entry,...,l_SvGms,l_bpSaved,l_bpFaced,winner_rank,winner_rank_points,loser_rank,loser_rank_points,winner_matches,loser_matches,Round
0,1968-2029,Dublin,Grass,32.0,A,19680708,270,112411,,,...,,,,,,,,2,1,1
1,1969-2029,Dublin,Grass,32.0,A,19690707,270,100011,,,...,,,,,,,,3,1,1


# Filtered Data

In [5]:
COLUMNS = ['tourney_name', 'surface', 'draw_size', 'tourney_date', 'winner_name', 'winner_matches',
           'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'loser_name', 'loser_hand', 'loser_ht',
           'loser_matches', 'loser_ioc', 'loser_age', 'score', 'best_of', 'Round', 'winner_rank', 'loser_rank']
df = df[COLUMNS].dropna().drop_duplicates()

df['best_of'] = df['best_of'].astype(str)
df['score'] = df['score'].apply(lambda text: re.sub(r'\(\d+\)', '', text))
# df = df[df['score'].apply(lambda score: ('7-6' in score.split()[0] or '6-7' in score.split()[0]))]
df = df[~df['score'].str.contains('Walkover|DEF|RET|W/O|&nbsp;|Def|Played and abandoned|UNK|UNP', na=False)]
df['score'] = df['score'].str.split()
df = df[(df['loser_hand'].isin(['R','L'])) & (df['winner_hand'].isin(['R','L']))]
df = df[df['score'].apply(len)>1]
pattern = r'^\d+-\d+( (?:\d+-\d+))*$'
df = df[df['score'].apply(lambda score_list: " ".join(score_list)).str.match(pattern)]

df.head(3)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_hand,loser_ht,loser_matches,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank
7,San Antonio WCT,Hard,32.0,19750223,Marty Riessen,3,R,185.0,USA,33.2,...,R,185.0,1,IND,22.9,"[6-3, 6-2]",3,1,12.0,85.0
9,Birmingham WCT,Carpet,16.0,19770112,Jimmy Connors,4,L,178.0,USA,24.3,...,R,193.0,1,GBR,21.7,"[6-3, 6-2]",3,1,1.0,43.0
10,Birmingham WCT,Carpet,32.0,19780109,Bjorn Borg,5,R,180.0,SWE,21.5,...,R,168.0,1,USA,25.3,"[6-2, 6-4]",3,1,3.0,14.0


# Choose Random Player

In [6]:
random_players_func = lambda row: np.random.choice(['winner', 'loser'], size=1)[0]
df['Choosen Player'] = df.apply(random_players_func, axis=1)

df.head(3)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_ht,loser_matches,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank,Choosen Player
7,San Antonio WCT,Hard,32.0,19750223,Marty Riessen,3,R,185.0,USA,33.2,...,185.0,1,IND,22.9,"[6-3, 6-2]",3,1,12.0,85.0,winner
9,Birmingham WCT,Carpet,16.0,19770112,Jimmy Connors,4,L,178.0,USA,24.3,...,193.0,1,GBR,21.7,"[6-3, 6-2]",3,1,1.0,43.0,loser
10,Birmingham WCT,Carpet,32.0,19780109,Bjorn Borg,5,R,180.0,SWE,21.5,...,168.0,1,USA,25.3,"[6-2, 6-4]",3,1,3.0,14.0,winner


# Extract Treatment and Outcome

In [7]:
# df['T'] = df.apply(lambda row: int(row['score'][0][0 if row['Choosen Player']=='winner' else -1]=='7'), axis=1)
df['T'] = df.apply(lambda row: int(int(row['score'][0][0]) > int(row['score'][0][-1])) 
                                    if row['Choosen Player']=='winner'
                                    else int(int(row['score'][0][0]) < int(row['score'][0][-1])), axis=1)

df['Y'] = df.apply(lambda row: int(int(row['score'][1][0]) > int(row['score'][1][-1])) 
                                    if row['Choosen Player']=='winner'
                                    else int(int(row['score'][1][0]) < int(row['score'][1][-1])), axis=1)

df.head(5)

Unnamed: 0,tourney_name,surface,draw_size,tourney_date,winner_name,winner_matches,winner_hand,winner_ht,winner_ioc,winner_age,...,loser_ioc,loser_age,score,best_of,Round,winner_rank,loser_rank,Choosen Player,T,Y
7,San Antonio WCT,Hard,32.0,19750223,Marty Riessen,3,R,185.0,USA,33.2,...,IND,22.9,"[6-3, 6-2]",3,1,12.0,85.0,winner,1,1
9,Birmingham WCT,Carpet,16.0,19770112,Jimmy Connors,4,L,178.0,USA,24.3,...,GBR,21.7,"[6-3, 6-2]",3,1,1.0,43.0,loser,0,0
10,Birmingham WCT,Carpet,32.0,19780109,Bjorn Borg,5,R,180.0,SWE,21.5,...,USA,25.3,"[6-2, 6-4]",3,1,3.0,14.0,winner,1,1
12,Pepsi Grand Slam,Clay,4.0,19790210,Jimmy Connors,2,L,178.0,USA,26.4,...,USA,19.9,"[6-3, 6-4]",3,1,1.0,4.0,winner,1,1
14,Pepsi Grand Slam,Clay,4.0,19800208,Bjorn Borg,2,R,180.0,SWE,23.6,...,ARG,27.4,"[6-2, 6-3]",3,1,1.0,6.0,loser,0,0


# Feature Engineering

In [8]:
df['log_rank_difference'] = df.apply(lambda row: (np.log2(row['winner_rank']) - np.log2(row['loser_rank'])) 
                                            if row['Choosen Player'] == 'winner'
                                            else (np.log2(row['loser_rank']) - np.log2(row['winner_rank']))
                                            ,axis=1)

# Save Clean Dataset

In [9]:
dataset = df[['surface', 'best_of', 'Round',
            'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'winner_rank', #'winner_matches',
            'loser_name', 'loser_hand', 'loser_ht', 'loser_ioc', 'loser_age', 'loser_rank', #'loser_matches',
            'log_rank_difference', 'Choosen Player', 'T', 'Y']].to_csv('Final_Data/dataset.csv', index=False)

dataset = pd.read_csv('Final_Data/full_scores_dataset.csv')

dataset.head(5)

Unnamed: 0,surface,best_of,Round,winner_name,winner_hand,winner_ht,winner_ioc,winner_age,winner_rank,loser_name,loser_hand,loser_ht,loser_ioc,loser_age,loser_rank,log_rank_difference,Choosen Player,T,Y
0,Hard,3,1,Marty Riessen,R,185.0,USA,33.2,12.0,Anand Amritraj,R,185.0,IND,22.9,85.0,2.824428,loser,0,0
1,Carpet,3,1,Jimmy Connors,L,178.0,USA,24.3,1.0,Buster C Mottram,R,193.0,GBR,21.7,43.0,-5.426265,winner,1,1
2,Carpet,3,1,Bjorn Borg,R,180.0,SWE,21.5,3.0,Harold Solomon,R,168.0,USA,25.3,14.0,-2.222392,winner,1,1
3,Clay,3,1,Jimmy Connors,L,178.0,USA,26.4,1.0,John McEnroe,L,180.0,USA,19.9,4.0,2.0,loser,0,0
4,Clay,3,1,Bjorn Borg,R,180.0,SWE,23.6,1.0,Guillermo Vilas,L,180.0,ARG,27.4,6.0,-2.584963,winner,1,1


# Save Feature Dataset

In [10]:
feature_df = df[['surface', 'best_of', 'Round',
                  'winner_hand', 'winner_ht', 'winner_age', 'winner_rank', #'winner_matches',
                  'loser_hand', 'loser_ht', 'loser_age', 'loser_rank', #'loser_matches', 
                  'log_rank_difference', 'Choosen Player', 'T', 'Y']]

def switch_columns(row):
    if row['Choosen Player'] == 'loser':
        row['winner_hand'], row['loser_hand'] = row['loser_hand'], row['winner_hand']
        row['winner_ht'], row['loser_ht'] = row['loser_ht'], row['winner_ht']
        row['winner_age'], row['loser_age'] = row['loser_age'], row['winner_age']
        row['winner_rank'], row['loser_rank'] = row['loser_rank'], row['winner_rank']
    return row

feature_df = feature_df.apply(switch_columns, axis=1)
feature_df.columns = ['surface', 'best_of', 'Round', 
                      'player_hand', 'player_ht', 'player_age', 'player_rank', #'player_matches',
                      'opponent_hand', 'opponent_ht', 'opponent_age', 'opponent_rank', #'opponent_matches',
                      'log_rank_difference', 'Choosen Player', 'T', 'Y']

# cols = ['surface', 'best_of', 'player_rank', 'player_ht', 'player_age', 'T', 'Y']
feature_df = feature_df.drop(columns=['Choosen Player'])

# train_df = feature_df.sample(frac=0.5, random_state=RANDOM_SEED)
# eval_df = feature_df.drop(train_df.index)

feature_df.to_csv('Final_Data/full_scores_feature_df.csv')
# train_df.to_csv('Final_Data/train_feature_df.csv')
# eval_df.to_csv('Final_Data/eval_feature_df.csv')

In [11]:
feature_df

Unnamed: 0,surface,best_of,Round,player_hand,player_ht,player_age,player_rank,opponent_hand,opponent_ht,opponent_age,opponent_rank,log_rank_difference,T,Y
7,Hard,3,1,R,185.0,33.2,12.0,R,185.0,22.9,85.0,-2.824428,1,1
9,Carpet,3,1,R,193.0,21.7,43.0,L,178.0,24.3,1.0,5.426265,0,0
10,Carpet,3,1,R,180.0,21.5,3.0,R,168.0,25.3,14.0,-2.222392,1,1
12,Clay,3,1,L,178.0,26.4,1.0,L,180.0,19.9,4.0,-2.000000,1,1
14,Clay,3,1,L,180.0,27.4,6.0,R,180.0,23.6,1.0,2.584963,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
751548,Hard,3,4,R,193.0,18.7,407.0,R,185.0,28.2,414.0,-0.024602,0,0
751554,Hard,3,1,R,191.0,21.5,622.0,R,191.0,22.6,482.0,0.367881,0,0
751597,Hard,3,2,R,188.0,20.4,280.0,R,191.0,17.2,885.0,-1.660251,1,1
751630,Hard,3,2,R,180.0,23.4,450.0,R,185.0,18.3,640.0,-0.508147,0,1
