In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from dateutil.relativedelta import relativedelta

np.seed = 42

# Load Dataset

In [2]:
matches = pd.read_csv(r'Data\atp_tennis.csv')
players = pd.read_csv(r'Data\atp_players.csv')
tournament = pd.read_csv(r'Data\all_tournaments.csv')

# Helper Function

In [3]:
def merge_columns(row, player_columns):
    merged_data = {}
    columns = ['hand', 'dob', 'ioc', 'height', 'name_first', 'name_last']
    
    for col, player_col in zip(columns, player_columns):
        val1 = row[col]
        val2 = row[player_col]
        
        if pd.isna(val1) and pd.isna(val2):
            merged_data[player_col] = np.nan  # Both are NaN
        elif pd.isna(val1):
            merged_data[player_col] = val2  # Take player value if main is NaN
        elif pd.isna(val2):
            merged_data[player_col] = val1  # Take main value if player is NaN
        else:
            if val1 == val2:
                merged_data[player_col] = val1  # Take the value if both are the same
            else:
                return None  # Conflict detected, drop the row
    
    return pd.Series(merged_data)


def convert_to_date(date_str):
    try:
        # Try to convert from YYYYMMDD format
        return datetime.strptime(str(date_str), "%Y%m%d").date()
    except ValueError:
        # If it fails, try the YYYY-MM-DD format
        return datetime.strptime(str(date_str), "%Y-%m-%d").date()


def calculate_age(tournament_date, dob):
    delta = relativedelta(tournament_date, dob)
    return delta.years

# Filtered Data

In [4]:
matches = matches[(matches['Odd_1']!=-1)&(matches['Odd_2']!=-1)]
matches = matches[matches['Score'].apply(lambda x: x.split()[0] in ['6-7', '7-6'])]
matches = matches.drop_duplicates()
matches

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
2892,AAPT Championships,2001-01-01,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,7,208,-1,-1,1.130000,5.500000,6-7 6-4 6-0
2899,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Johansson T.,Henman T.,Henman T.,39,10,-1,-1,2.662500,1.367500,6-7 7-6 3-6
2900,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Massu N.,Ljubicic I.,Massu N.,87,91,-1,-1,1.916667,1.653333,6-7 7-6 6-2
2912,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Robredo T.,Paes L.,Robredo T.,131,188,-1,-1,2.000000,1.450000,7-6 6-0
2917,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Kratochvil M.,Vinciguerra A.,Vinciguerra A.,93,52,-1,-1,2.333333,1.410000,6-7 3-6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63889,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Griekspoor T.,Zverev A.,Zverev A.,40,3,1305,6705,4.330000,1.220000,6-7 6-2 6-7
63891,Shanghai Masters,2024-10-09,ATP1000,Outdoor,Hard,4th Round,3,Tsitsipas S.,Medvedev D.,Medvedev D.,12,5,3390,5375,3.000000,1.400000,6-7 3-6
63899,Shanghai Masters,2024-10-10,ATP1000,Outdoor,Hard,Quarterfinals,3,Alcaraz C.,Machac T.,Machac T.,2,33,7010,1449,1.080000,8.000000,6-7 5-7
63901,Shanghai Masters,2024-10-11,ATP1000,Outdoor,Hard,Quarterfinals,3,Mensik J.,Djokovic N.,Djokovic N.,65,4,822,5560,7.000000,1.100000,7-6 1-6 4-6


In [5]:
players = players.dropna(subset=['name_first', 'name_last', 'hand', 'ioc', 'height']).drop_duplicates().drop(columns=['wikidata_id', 'player_id'])
players['name_last'] = players['name_last'].apply(lambda x: "-".join(x.strip().split()))
players['player_name'] = players['name_last'].str.strip() + ' ' + players['name_first'].apply(
    lambda x: ''.join([name.strip()[0] + '.' for name in x.split()])
)
# players = players.drop_duplicates('player_name')
players['player_name2'] = players['player_name'].str.replace('-',' ')
players

Unnamed: 0,name_first,name_last,hand,dob,ioc,height,player_name,player_name2
0,Gardnar,Mulloy,R,19131122.0,USA,185.0,Mulloy G.,Mulloy G.
1,Pancho,Segura,R,19210620.0,ECU,168.0,Segura P.,Segura P.
2,Frank,Sedgman,R,19271002.0,AUS,180.0,Sedgman F.,Sedgman F.
4,Richard,Gonzalez,R,19280509.0,USA,188.0,Gonzalez R.,Gonzalez R.
5,Grant,Golden,R,19290821.0,USA,175.0,Golden G.,Golden G.
...,...,...,...,...,...,...,...,...
64749,Theodore,Winegar,R,20010710.0,USA,191.0,Winegar T.,Winegar T.
64753,Leonardo,Vega,R,19970904.0,USA,191.0,Vega L.,Vega L.
64761,Patrick Edward,Toman,R,20040629.0,USA,183.0,Toman P.E.,Toman P.E.
64775,Liam,Puttergill,R,20000830.0,AUS,183.0,Puttergill L.,Puttergill L.


# Merge dfs

In [6]:
# add data on player 1
player_columns = ['hand_player1', 'dob_player1', 'ioc_player1', 'height_player1', 'name_first_player1', 'name_last_player1']
player_matches_df = matches.merge(players, left_on='Player_1', right_on='player_name', how='left', suffixes=('', '_player1')).drop(columns=['player_name'])
player_matches_df = player_matches_df.merge(players, left_on='Player_1', right_on='player_name2', how='left', suffixes=('', '_player1')).drop(columns=['player_name2'])
merged_columns = player_matches_df.apply(lambda x: merge_columns(x, player_columns), axis=1)
player_matches_df[player_columns] = merged_columns
player_matches_df.drop(columns=['hand', 'dob', 'ioc', 'height', 'name_first', 'name_last', 'player_name2_player1', 'player_name'], inplace=True)
player_matches_df = player_matches_df.dropna(subset=player_columns)

# add data on player 2
player_columns = ['hand_player2', 'dob_player2', 'ioc_player2', 'height_player2', 'name_first_player2', 'name_last_player2']
player_matches_df = player_matches_df.merge(players, left_on='Player_2', right_on='player_name', how='left', suffixes=('', '_player2')).drop(columns=['player_name'])
player_matches_df = player_matches_df.merge(players, left_on='Player_2', right_on='player_name2', how='left', suffixes=('', '_player2')).drop(columns=['player_name2'])
merged_columns = player_matches_df.apply(lambda x: merge_columns(x, player_columns), axis=1)
player_matches_df[player_columns] = merged_columns
player_matches_df.drop(columns=['hand', 'dob', 'ioc', 'height', 'name_first', 'name_last', 'player_name2_player2', 'player_name'], inplace=True)
player_matches_df = player_matches_df.dropna(subset=player_columns)

player_matches_df

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,hand_player1,dob_player1,ioc_player1,height_player1,name_first_player2,name_last_player2,hand_player2,dob_player2,ioc_player2,height_player2
0,AAPT Championships,2001-01-01,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,...,R,19810224.0,AUS,180.0,Bjorn,Phau,R,19791004.0,GER,175.0
1,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Johansson T.,Henman T.,Henman T.,...,R,19750324.0,SWE,180.0,Tim,Henman,R,19740906.0,GBR,185.0
2,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Massu N.,Ljubicic I.,Massu N.,...,R,19791010.0,CHI,183.0,Ivan,Ljubicic,R,19790319.0,CRO,193.0
3,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Robredo T.,Paes L.,Robredo T.,...,R,19820501.0,ESP,180.0,Leander,Paes,R,19730617.0,IND,178.0
4,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Kratochvil M.,Vinciguerra A.,Vinciguerra A.,...,R,19790407.0,SUI,185.0,Andreas,Vinciguerra,L,19810219.0,SWE,180.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12034,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Dimitrov G.,Popyrin A.,Dimitrov G.,...,R,19910516.0,BUL,188.0,Alexei,Popyrin,R,19990805.0,AUS,196.0
12035,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Griekspoor T.,Zverev A.,Zverev A.,...,R,19960702.0,NED,188.0,Alexander,Zverev,R,19970420.0,GER,198.0
12036,Shanghai Masters,2024-10-09,ATP1000,Outdoor,Hard,4th Round,3,Tsitsipas S.,Medvedev D.,Medvedev D.,...,R,19980812.0,GRE,193.0,Daniil,Medvedev,R,19960211.0,RUS,198.0
12037,Shanghai Masters,2024-10-10,ATP1000,Outdoor,Hard,Quarterfinals,3,Alcaraz C.,Machac T.,Machac T.,...,R,20030505.0,ESP,185.0,Tomas,Machac,R,20001013.0,CZE,183.0


# Choose Random Player

In [7]:
random_players = np.random.choice([1, 2], size=len(player_matches_df))
player_matches_df['Relevant Player'] = random_players
player_matches_df

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,dob_player1,ioc_player1,height_player1,name_first_player2,name_last_player2,hand_player2,dob_player2,ioc_player2,height_player2,Relevant Player
0,AAPT Championships,2001-01-01,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,...,19810224.0,AUS,180.0,Bjorn,Phau,R,19791004.0,GER,175.0,1
1,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Johansson T.,Henman T.,Henman T.,...,19750324.0,SWE,180.0,Tim,Henman,R,19740906.0,GBR,185.0,1
2,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Massu N.,Ljubicic I.,Massu N.,...,19791010.0,CHI,183.0,Ivan,Ljubicic,R,19790319.0,CRO,193.0,1
3,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Robredo T.,Paes L.,Robredo T.,...,19820501.0,ESP,180.0,Leander,Paes,R,19730617.0,IND,178.0,2
4,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Kratochvil M.,Vinciguerra A.,Vinciguerra A.,...,19790407.0,SUI,185.0,Andreas,Vinciguerra,L,19810219.0,SWE,180.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12034,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Dimitrov G.,Popyrin A.,Dimitrov G.,...,19910516.0,BUL,188.0,Alexei,Popyrin,R,19990805.0,AUS,196.0,2
12035,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Griekspoor T.,Zverev A.,Zverev A.,...,19960702.0,NED,188.0,Alexander,Zverev,R,19970420.0,GER,198.0,2
12036,Shanghai Masters,2024-10-09,ATP1000,Outdoor,Hard,4th Round,3,Tsitsipas S.,Medvedev D.,Medvedev D.,...,19980812.0,GRE,193.0,Daniil,Medvedev,R,19960211.0,RUS,198.0,1
12037,Shanghai Masters,2024-10-10,ATP1000,Outdoor,Hard,Quarterfinals,3,Alcaraz C.,Machac T.,Machac T.,...,20030505.0,ESP,185.0,Tomas,Machac,R,20001013.0,CZE,183.0,2


# Extract Confounder

In [8]:
# age of each player
player_matches_df['Date'] = player_matches_df['Date'].apply(convert_to_date)

for player in ['1', '2']:
    player_matches_df[f'dob_player{player}'] = player_matches_df[f'dob_player{player}'].astype(int)
    player_matches_df[f'dob_player{player}'] = player_matches_df[f'dob_player{player}'].apply(convert_to_date)
    player_matches_df[f'age_player{player}'] = player_matches_df.apply(lambda row: calculate_age(row['Date'], row[f'dob_player{player}']), axis=1)

player_matches_df

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,height_player1,name_first_player2,name_last_player2,hand_player2,dob_player2,ioc_player2,height_player2,Relevant Player,age_player1,age_player2
0,AAPT Championships,2001-01-01,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,...,180.0,Bjorn,Phau,R,1979-10-04,GER,175.0,1,19,21
1,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Johansson T.,Henman T.,Henman T.,...,180.0,Tim,Henman,R,1974-09-06,GBR,185.0,1,25,26
2,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Massu N.,Ljubicic I.,Massu N.,...,183.0,Ivan,Ljubicic,R,1979-03-19,CRO,193.0,1,21,21
3,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Robredo T.,Paes L.,Robredo T.,...,180.0,Leander,Paes,R,1973-06-17,IND,178.0,2,18,27
4,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Kratochvil M.,Vinciguerra A.,Vinciguerra A.,...,185.0,Andreas,Vinciguerra,L,1981-02-19,SWE,180.0,1,21,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12034,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Dimitrov G.,Popyrin A.,Dimitrov G.,...,188.0,Alexei,Popyrin,R,1999-08-05,AUS,196.0,2,33,25
12035,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Griekspoor T.,Zverev A.,Zverev A.,...,188.0,Alexander,Zverev,R,1997-04-20,GER,198.0,2,28,27
12036,Shanghai Masters,2024-10-09,ATP1000,Outdoor,Hard,4th Round,3,Tsitsipas S.,Medvedev D.,Medvedev D.,...,193.0,Daniil,Medvedev,R,1996-02-11,RUS,198.0,1,26,28
12037,Shanghai Masters,2024-10-10,ATP1000,Outdoor,Hard,Quarterfinals,3,Alcaraz C.,Machac T.,Machac T.,...,185.0,Tomas,Machac,R,2000-10-13,CZE,183.0,2,21,23


In [9]:
player_matches_df = player_matches_df[(player_matches_df['age_player1']<40)&((player_matches_df['age_player2']<40))]
player_matches_df

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,height_player1,name_first_player2,name_last_player2,hand_player2,dob_player2,ioc_player2,height_player2,Relevant Player,age_player1,age_player2
0,AAPT Championships,2001-01-01,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,...,180.0,Bjorn,Phau,R,1979-10-04,GER,175.0,1,19,21
1,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Johansson T.,Henman T.,Henman T.,...,180.0,Tim,Henman,R,1974-09-06,GBR,185.0,1,25,26
2,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Massu N.,Ljubicic I.,Massu N.,...,183.0,Ivan,Ljubicic,R,1979-03-19,CRO,193.0,1,21,21
3,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Robredo T.,Paes L.,Robredo T.,...,180.0,Leander,Paes,R,1973-06-17,IND,178.0,2,18,27
4,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Kratochvil M.,Vinciguerra A.,Vinciguerra A.,...,185.0,Andreas,Vinciguerra,L,1981-02-19,SWE,180.0,1,21,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12034,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Dimitrov G.,Popyrin A.,Dimitrov G.,...,188.0,Alexei,Popyrin,R,1999-08-05,AUS,196.0,2,33,25
12035,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Griekspoor T.,Zverev A.,Zverev A.,...,188.0,Alexander,Zverev,R,1997-04-20,GER,198.0,2,28,27
12036,Shanghai Masters,2024-10-09,ATP1000,Outdoor,Hard,4th Round,3,Tsitsipas S.,Medvedev D.,Medvedev D.,...,193.0,Daniil,Medvedev,R,1996-02-11,RUS,198.0,1,26,28
12037,Shanghai Masters,2024-10-10,ATP1000,Outdoor,Hard,Quarterfinals,3,Alcaraz C.,Machac T.,Machac T.,...,185.0,Tomas,Machac,R,2000-10-13,CZE,183.0,2,21,23


In [10]:
# Country of the tournament

In [11]:
# Country of each player

In [12]:
# Transform round to int
map_round = {'1st Round': 1, 'Round Robin':1, '2nd Round': 2, '3rd Round': 3, '4th Round': 4}

round = []
for i in range(len(player_matches_df)):
    # print(df['Round'].iloc[i])
    if player_matches_df['Round'].iloc[i] in map_round.keys():
        round.append(map_round[player_matches_df['Round'].iloc[i]])
    else: 
        # Check previous rounds
        t = 1
        while player_matches_df['Round'].iloc[i - t] ==  player_matches_df['Round'].iloc[i]:
            t += 1
        round.append(round[i-t] + 1)  # Increment the last known round

player_matches_df['round'] = round
player_matches_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_matches_df['round'] = round


Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,name_first_player2,name_last_player2,hand_player2,dob_player2,ioc_player2,height_player2,Relevant Player,age_player1,age_player2,round
0,AAPT Championships,2001-01-01,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,...,Bjorn,Phau,R,1979-10-04,GER,175.0,1,19,21,2
1,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Johansson T.,Henman T.,Henman T.,...,Tim,Henman,R,1974-09-06,GBR,185.0,1,25,26,3
2,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Massu N.,Ljubicic I.,Massu N.,...,Ivan,Ljubicic,R,1979-03-19,CRO,193.0,1,21,21,3
3,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Robredo T.,Paes L.,Robredo T.,...,Leander,Paes,R,1973-06-17,IND,178.0,2,18,27,1
4,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Kratochvil M.,Vinciguerra A.,Vinciguerra A.,...,Andreas,Vinciguerra,L,1981-02-19,SWE,180.0,1,21,19,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12034,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Dimitrov G.,Popyrin A.,Dimitrov G.,...,Alexei,Popyrin,R,1999-08-05,AUS,196.0,2,33,25,3
12035,Shanghai Masters,2024-10-08,ATP1000,Outdoor,Hard,3rd Round,3,Griekspoor T.,Zverev A.,Zverev A.,...,Alexander,Zverev,R,1997-04-20,GER,198.0,2,28,27,3
12036,Shanghai Masters,2024-10-09,ATP1000,Outdoor,Hard,4th Round,3,Tsitsipas S.,Medvedev D.,Medvedev D.,...,Daniil,Medvedev,R,1996-02-11,RUS,198.0,1,26,28,4
12037,Shanghai Masters,2024-10-10,ATP1000,Outdoor,Hard,Quarterfinals,3,Alcaraz C.,Machac T.,Machac T.,...,Tomas,Machac,R,2000-10-13,CZE,183.0,2,21,23,5


In [13]:
player_matches_df['Score'] = player_matches_df['Score'].apply(lambda score: score.split())
player_matches_df['T'] = player_matches_df.apply(lambda row: int(row['Score'][0][0 if row['Relevant Player']=='1' else -1]=='7'), axis=1)
player_matches_df['Y'] = player_matches_df.apply(lambda row: int(int(row['Score'][1][0]) > int(row['Score'][1][-1])) 
                                                if row['Relevant Player']==1
                                                else int(int(row['Score'][1][0]) < int(row['Score'][1][-1])), axis=1)

player_matches_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_matches_df['Score'] = player_matches_df['Score'].apply(lambda score: score.split())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  player_matches_df['T'] = player_matches_df.apply(lambda row: int(row['Score'][0][0 if row['Relevant Player']=='1' else -1]=='7'), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,...,hand_player2,dob_player2,ioc_player2,height_player2,Relevant Player,age_player1,age_player2,round,T,Y
0,AAPT Championships,2001-01-01,International,Outdoor,Hard,2nd Round,3,Hewitt L.,Phau B.,Hewitt L.,...,R,1979-10-04,GER,175.0,1,19,21,2,1,1
1,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Johansson T.,Henman T.,Henman T.,...,R,1974-09-06,GBR,185.0,1,25,26,3,1,1
2,AAPT Championships,2001-01-01,International,Outdoor,Hard,Quarterfinals,3,Massu N.,Ljubicic I.,Massu N.,...,R,1979-03-19,CRO,193.0,1,21,21,3,1,1
3,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Robredo T.,Paes L.,Robredo T.,...,R,1973-06-17,IND,178.0,2,18,27,1,0,0
4,TATA Open,2001-01-01,International,Outdoor,Hard,1st Round,3,Kratochvil M.,Vinciguerra A.,Vinciguerra A.,...,L,1981-02-19,SWE,180.0,1,21,19,1,1,0


In [14]:
numeric_df = player_matches_df.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_df.corr()
correlation_matrix

Unnamed: 0,Best of,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,height_player1,height_player2,age_player1,age_player2,round,T,Y
Best of,1.0,-0.024747,-0.025052,0.065434,0.071263,0.140025,0.120639,0.002261,-0.007278,0.010466,0.005056,0.022738,-0.008073,-0.012215
Rank_1,-0.024747,1.0,0.085636,-0.327855,-0.095469,0.221131,-0.238978,-0.041193,-0.000537,-0.043336,0.004269,-0.208517,0.059289,0.018059
Rank_2,-0.025052,0.085636,1.0,-0.078493,-0.317652,-0.240764,0.23461,-0.005958,-0.056561,0.010045,-0.023299,-0.214678,-0.044062,-0.006694
Pts_1,0.065434,-0.327855,-0.078493,1.0,0.294826,-0.161841,0.522602,0.107804,0.079742,0.071327,0.063439,0.299842,-0.086708,-0.016471
Pts_2,0.071263,-0.095469,-0.317652,0.294826,1.0,0.52702,-0.163784,0.072629,0.103018,0.063502,0.071688,0.296656,0.064565,-0.002688
Odd_1,0.140025,0.221131,-0.240764,-0.161841,0.52702,1.0,-0.348455,-0.051928,0.046591,0.008299,0.003974,0.032005,0.130377,-0.009224
Odd_2,0.120639,-0.238978,0.23461,0.522602,-0.163784,-0.348455,1.0,0.050916,-0.059036,0.012663,-0.006978,0.02014,-0.139256,-0.017456
height_player1,0.002261,-0.041193,-0.005958,0.107804,0.072629,-0.051928,0.050916,1.0,0.032842,-0.00397,0.025003,0.078868,-0.014344,0.009472
height_player2,-0.007278,-0.000537,-0.056561,0.079742,0.103018,0.046591,-0.059036,0.032842,1.0,0.048065,-0.009485,0.087926,0.029753,-0.001101
age_player1,0.010466,-0.043336,0.010045,0.071327,0.063502,0.008299,0.012663,-0.00397,0.048065,1.0,0.044275,-0.018683,-0.005247,-0.016014


In [15]:
feature_df = player_matches_df[['Court', 'Surface', 'round', 'Best of',
                                'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2',
                                'Odd_1', 'Odd_2',
                                'hand_player1', 'height_player1',
                                'hand_player2',
                                'height_player2', 'Relevant Player',
                                'age_player1', 'age_player2', 'T', 'Y']]

def switch_columns(row):
    if row['Relevant Player'] == '2':
        row['hand_player1'], row['hand_player2'] = row['hand_player2'], row['hand_player1']
        row['height_player1'], row['height_player2'] = row['height_player2'], row['height_player1']
        row['age_player1'], row['age_player2'] = row['age_player2'], row['age_player1']
        row['Rank_1'], row['Rank_2'] = row['Rank_2'], row['Rank_1']
        row['Pts_1'], row['Pts_2'] = row['Pts_2'], row['Pts_1']
        row['Odd_1'], row['Odd_2'] = row['Odd_2'], row['Odd_1']
    return row

feature_df = feature_df.apply(switch_columns, axis=1)
# feature_df.columns = ['Court', 'Surface', 'round', 'Best of',
#                     'Rank_1', 'Rank_2', 'Pts_1', 'Pts_2',
#                     'Odd_1', 'Odd_2',
#                     'hand_player1', 'height_player1',
#                     'hand_player2',
#                     'height_player2', 'Relevant Player',
#                     'age_player1', 'age_player2', 'T', 'Y']
feature_df = feature_df.drop(columns=['Relevant Player'])

feature_df.to_csv('Final_Data/feature_df_second_pre_process.csv')
feature_df

Unnamed: 0,Court,Surface,round,Best of,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,hand_player1,height_player1,hand_player2,height_player2,age_player1,age_player2,T,Y
0,Outdoor,Hard,2,3,7,208,-1,-1,1.130000,5.500000,R,180.0,R,175.0,19,21,1,1
1,Outdoor,Hard,3,3,39,10,-1,-1,2.662500,1.367500,R,180.0,R,185.0,25,26,1,1
2,Outdoor,Hard,3,3,87,91,-1,-1,1.916667,1.653333,R,183.0,R,193.0,21,21,1,1
3,Outdoor,Hard,1,3,131,188,-1,-1,2.000000,1.450000,R,180.0,R,178.0,18,27,0,0
4,Outdoor,Hard,1,3,93,52,-1,-1,2.333333,1.410000,R,185.0,L,180.0,21,19,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12034,Outdoor,Hard,3,3,10,24,3840,1825,1.670000,2.200000,R,188.0,R,196.0,33,25,0,0
12035,Outdoor,Hard,3,3,40,3,1305,6705,4.330000,1.220000,R,188.0,R,198.0,28,27,1,0
12036,Outdoor,Hard,4,3,12,5,3390,5375,3.000000,1.400000,R,193.0,R,198.0,26,28,1,0
12037,Outdoor,Hard,5,3,2,33,7010,1449,1.080000,8.000000,R,185.0,R,183.0,21,23,1,1
