In [1]:
import pandas as pd          
import numpy as np

In [2]:
df = pd.concat(map(pd.read_csv, ['tennis_atp/atp_matches_2018.csv','tennis_atp/atp_matches_2019.csv', 'tennis_atp/atp_matches_2020.csv','tennis_atp/atp_matches_2021.csv','tennis_atp/atp_matches_2022.csv','tennis_atp/atp_matches_2023.csv','tennis_atp/atp_matches_2024.csv']))

In [3]:
# Remove identifying data
df = df.drop(columns=['winner_id', 'winner_name', 'winner_ioc', 
                      'loser_id', 'loser_name', 'loser_ioc', 
                      'tourney_id', 'tourney_name', 'tourney_level', 
                      'tourney_date', 'match_num', 'winner_seed', 
                      'winner_entry', 'loser_seed', 'loser_entry'])

In [4]:
# Check for missing values.
print(df.isnull().sum())

# There are zero duplicates
print(df.duplicated().sum())

# Drop any rows with nulls in them
df = df.dropna()

# Check 
row_count = len(df)
print(row_count)

surface                 53
draw_size                0
winner_hand              0
winner_ht               79
winner_age               3
loser_hand               1
loser_ht               227
loser_age                3
score                    0
best_of                  0
round                    0
minutes               1033
w_ace                  694
w_df                   694
w_svpt                 694
w_1stIn                694
w_1stWon               694
w_2ndWon               694
w_SvGms                694
w_bpSaved              694
w_bpFaced              694
l_ace                  694
l_df                   694
l_svpt                 694
l_1stIn                694
l_1stWon               694
l_2ndWon               694
l_SvGms                694
l_bpSaved              694
l_bpFaced              694
winner_rank             79
winner_rank_points      79
loser_rank             222
loser_rank_points      222
dtype: int64
2
17547


In [5]:
# Convert features to not be associated with winning or losing
df = df.rename(columns={
    'winner_age': 'p1_age',
    'winner_seed': 'p1_seed',
    'winner_entry': 'p1_entry',
    'winner_ht': 'p1_ht',
    'winner_hand': 'p1_hand',
    'winner_rank': 'p1_rank',
    'winner_rank_points': 'p1_rank_points',
    'loser_age': 'p2_age',
    'loser_ht': 'p2_ht',
    'loser_hand': 'p2_hand',
    'loser_rank': 'p2_rank',
    'loser_rank_points': 'p2_rank_points'
})

# Ensure that player 1 is not always winning by swapping values
swap_mask = np.random.rand(len(df)) > 0.5
for col in ['age', 'ht', 'hand', 'rank', 'rank_points']:
    df.loc[swap_mask, f'p1_{col}'], df.loc[swap_mask, f'p2_{col}'] = df.loc[swap_mask, f'p2_{col}'], df.loc[swap_mask, f'p1_{col}']

# If a swap occured player 2 has won otherwise player 1 has won
# Using labels of 0 and 1 for binary encoding
df['winner'] = np.where(swap_mask, 1, 0)
df['winner'].value_counts()

winner
1    8820
0    8727
Name: count, dtype: int64

In [6]:
# Display the first and last few rows of the dataset.
display(df.head())
display(df.tail())

# Display the info of the dataset.
display(df.info())

# Display summary statistics of the dataset.
display(df.describe())

Unnamed: 0,surface,draw_size,p1_hand,p1_ht,p1_age,p2_hand,p2_ht,p2_age,score,best_of,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,p1_rank,p1_rank_points,p2_rank,p2_rank_points,winner
0,Hard,32,R,188.0,30.6,R,185.0,25.6,6-4 3-6 6-2,3,...,33.0,19.0,14.0,1.0,4.0,52.0,909.0,47.0,1010.0,1
1,Hard,32,R,188.0,21.2,R,183.0,23.7,6-2 6-4,3,...,25.0,7.0,9.0,7.0,11.0,54.0,890.0,94.0,593.0,0
2,Hard,32,R,175.0,25.6,R,185.0,31.3,6-7(4) 6-3 6-2,3,...,37.0,29.0,15.0,10.0,16.0,30.0,1391.0,63.0,809.0,1
3,Hard,32,R,188.0,28.0,R,183.0,18.8,7-6(7) 6-4,3,...,33.0,17.0,11.0,4.0,6.0,44.0,1055.0,208.0,245.0,1
4,Hard,32,R,188.0,19.9,L,193.0,27.2,6-3 6-4,3,...,28.0,5.0,9.0,0.0,2.0,175.0,299.0,68.0,755.0,0


Unnamed: 0,surface,draw_size,p1_hand,p1_ht,p1_age,p2_hand,p2_ht,p2_age,score,best_of,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,p1_rank,p1_rank_points,p2_rank,p2_rank_points,winner
3064,Hard,4,R,175.0,25.0,R,183.0,28.2,6-2 6-3,3,...,19.0,5.0,8.0,5.0,9.0,569.0,64.0,819.0,24.0,0
3066,Hard,4,R,178.0,31.0,R,188.0,28.8,6-2 6-3,3,...,17.0,9.0,8.0,2.0,5.0,279.0,205.0,900.0,18.0,0
3068,Clay,4,R,188.0,21.8,L,185.0,23.9,4-6 6-1 6-1,3,...,24.0,18.0,12.0,8.0,16.0,740.0,34.0,616.0,55.0,1
3070,Clay,4,R,175.0,34.2,L,185.0,23.9,6-3 6-1,3,...,33.0,6.0,8.0,6.0,11.0,136.0,489.0,616.0,55.0,0
3074,Hard,4,R,175.0,26.9,R,185.0,27.3,6-4 3-6 6-3,3,...,32.0,17.0,14.0,5.0,9.0,554.0,67.0,416.0,109.0,0


<class 'pandas.core.frame.DataFrame'>
Index: 17547 entries, 0 to 3074
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   surface         17547 non-null  object 
 1   draw_size       17547 non-null  int64  
 2   p1_hand         17547 non-null  object 
 3   p1_ht           17547 non-null  float64
 4   p1_age          17547 non-null  float64
 5   p2_hand         17547 non-null  object 
 6   p2_ht           17547 non-null  float64
 7   p2_age          17547 non-null  float64
 8   score           17547 non-null  object 
 9   best_of         17547 non-null  int64  
 10  round           17547 non-null  object 
 11  minutes         17547 non-null  float64
 12  w_ace           17547 non-null  float64
 13  w_df            17547 non-null  float64
 14  w_svpt          17547 non-null  float64
 15  w_1stIn         17547 non-null  float64
 16  w_1stWon        17547 non-null  float64
 17  w_2ndWon        17547 non-null  float

None

Unnamed: 0,draw_size,p1_ht,p1_age,p2_ht,p2_age,best_of,minutes,w_ace,w_df,w_svpt,...,l_1stWon,l_2ndWon,l_SvGms,l_bpSaved,l_bpFaced,p1_rank,p1_rank_points,p2_rank,p2_rank_points,winner
count,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,...,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0,17547.0
mean,62.260785,187.264832,27.129766,187.192626,27.145318,3.371232,115.207215,7.207899,2.621873,79.191828,...,34.013735,14.723713,12.503733,4.770217,8.453012,81.349461,1518.469881,77.681826,1538.865504,0.50265
std,42.604316,7.165403,4.573055,7.026553,4.571686,0.777615,42.268962,5.652881,2.31895,29.010744,...,14.401328,6.902797,4.20348,3.282107,4.148517,111.545993,1724.675915,104.515831,1723.603037,0.500007
min,2.0,15.0,16.4,15.0,16.7,3.0,5.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0
25%,32.0,183.0,23.6,183.0,23.6,3.0,83.0,3.0,1.0,58.0,...,24.0,10.0,9.0,2.0,5.0,26.0,639.0,25.0,651.0,0.0
50%,32.0,188.0,26.8,188.0,26.8,3.0,108.0,6.0,2.0,74.0,...,32.0,14.0,12.0,4.0,8.0,56.0,939.0,55.0,957.0,1.0
75%,128.0,193.0,30.3,193.0,30.3,3.0,139.0,10.0,4.0,95.0,...,42.0,19.0,15.0,7.0,11.0,95.0,1628.0,93.0,1640.5,1.0
max,128.0,211.0,42.5,211.0,41.8,5.0,396.0,64.0,26.0,278.0,...,171.0,52.0,50.0,27.0,38.0,2146.0,12415.0,1921.0,12415.0,1.0


In [7]:
# Check for missing values.
print(df.isnull().sum())

# There are zero duplicates
print(df.duplicated().sum())

surface           0
draw_size         0
p1_hand           0
p1_ht             0
p1_age            0
p2_hand           0
p2_ht             0
p2_age            0
score             0
best_of           0
round             0
minutes           0
w_ace             0
w_df              0
w_svpt            0
w_1stIn           0
w_1stWon          0
w_2ndWon          0
w_SvGms           0
w_bpSaved         0
w_bpFaced         0
l_ace             0
l_df              0
l_svpt            0
l_1stIn           0
l_1stWon          0
l_2ndWon          0
l_SvGms           0
l_bpSaved         0
l_bpFaced         0
p1_rank           0
p1_rank_points    0
p2_rank           0
p2_rank_points    0
winner            0
dtype: int64
0


In [8]:
# Drop any rows with nulls in them
df = df.dropna()

# Check 
row_count = len(df)
print(row_count)

17547
