## Import Stuff


In [1]:
import dotenv
import kagglehub
from pathlib import Path
import os
import pandas as pd
import numpy as np

## Load Environment Variables


In [2]:
dotenv.load_dotenv()

True

## Load and Read Data


In [3]:
path = kagglehub.dataset_download("dissfya/atp-tennis-2000-2023daily-pull")
print(f"Data Path -> {path}")
dataset_dir = Path(path)
data_file = dataset_dir / "atp_tennis.csv"
df = pd.read_csv(data_file)
print(
    f"data file name -> {data_file.name}\nnumber of matches from 2000-present-> {len(df)}"
)

Data Path -> /Users/pranavrajan/.cache/kagglehub/datasets/dissfya/atp-tennis-2000-2023daily-pull/versions/958
data file name -> atp_tennis.csv
number of matches from 2000-present-> 66681


## Exploratory Data Analysis


In [4]:
df.head()

Unnamed: 0,Tournament,Date,Series,Court,Surface,Round,Best of,Player_1,Player_2,Winner,Rank_1,Rank_2,Pts_1,Pts_2,Odd_1,Odd_2,Score
0,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Dosedel S.,Ljubicic I.,Dosedel S.,63,77,-1,-1,-1.0,-1.0,6-4 6-2
1,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Clement A.,Enqvist T.,Enqvist T.,56,5,-1,-1,-1.0,-1.0,3-6 3-6
2,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Escude N.,Baccanello P.,Escude N.,40,655,-1,-1,-1.0,-1.0,6-7 7-5 6-3
3,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Knippschild J.,Federer R.,Federer R.,87,65,-1,-1,-1.0,-1.0,1-6 4-6
4,Australian Hardcourt Championships,2000-01-03,International,Outdoor,Hard,1st Round,3,Fromberg R.,Woodbridge T.,Fromberg R.,81,198,-1,-1,-1.0,-1.0,7-6 5-7 6-4


### Column Types


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66681 entries, 0 to 66680
Data columns (total 17 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Tournament  66681 non-null  object 
 1   Date        66681 non-null  object 
 2   Series      66681 non-null  object 
 3   Court       66681 non-null  object 
 4   Surface     66681 non-null  object 
 5   Round       66681 non-null  object 
 6   Best of     66681 non-null  int64  
 7   Player_1    66681 non-null  object 
 8   Player_2    66681 non-null  object 
 9   Winner      66681 non-null  object 
 10  Rank_1      66681 non-null  int64  
 11  Rank_2      66681 non-null  int64  
 12  Pts_1       66681 non-null  int64  
 13  Pts_2       66681 non-null  int64  
 14  Odd_1       66681 non-null  float64
 15  Odd_2       66681 non-null  float64
 16  Score       66681 non-null  object 
dtypes: float64(2), int64(5), object(10)
memory usage: 8.6+ MB


### Min-Max Values - Numerical Feature Columns


In [6]:
numeric_min_max = df.select_dtypes(include=[np.number]).agg(["min", "max"])
print(numeric_min_max)

     Best of  Rank_1  Rank_2  Pts_1  Pts_2  Odd_1  Odd_2
min        3      -1      -1     -1     -1   -1.0   -1.0
max        5    3390    4915  16950  16950   67.0   51.0


### Unique Values in Columns


In [7]:
# For each column, show unique values
for col in df.columns:
    unique_vals = df[col].unique()
    n_unique = len(unique_vals)
    print(f"\n{col} -> {n_unique} unique values")

    # Only show actual values if there are < 20 unique values
    if n_unique < 20:
        print(f"  Values: {unique_vals}")
    else:
        print("(Too many to display - showing first 10)")
        print(f"  Sample: {unique_vals[:10]}")


Tournament -> 268 unique values
(Too many to display - showing first 10)
  Sample: ['Australian Hardcourt Championships' 'Gold Flake Open' 'Qatar Open'
 'Heineken Open' 'Sydney International' 'Australian Open' 'Dubai Open'
 'Marseille Open' 'Sybase Open' 'Kroger St. Jude']

Date -> 6518 unique values
(Too many to display - showing first 10)
  Sample: ['2000-01-03' '2000-01-10' '2000-01-17' '2000-02-07' '2000-02-14'
 '2000-02-21' '2000-02-28' '2000-03-06' '2000-03-13' '2000-03-23']

Series -> 8 unique values
  Values: ['International' 'Grand Slam' 'International Gold' 'Masters' 'Masters Cup'
 'ATP250' 'ATP500' 'Masters 1000']

Court -> 2 unique values
  Values: ['Outdoor' 'Indoor']

Surface -> 4 unique values
  Values: ['Hard' 'Clay' 'Grass' 'Carpet']

Round -> 8 unique values
  Values: ['1st Round' '2nd Round' 'Quarterfinals' 'Semifinals' 'The Final'
 '3rd Round' '4th Round' 'Round Robin']

Best of -> 2 unique values
  Values: [3 5]

Player_1 -> 1509 unique values
(Too many to display

### Number of Unique Players

In [8]:
total_unique_players = pd.concat([df["Player_1"], df["Player_2"]]).nunique()
print(f"Total unique players in the dataset -> {total_unique_players}")

Total unique players in the dataset -> 1779


### Total Match Stats

In [9]:
def match_stats(df):
    wins = df["Winner"].value_counts()
    p1_counts = df["Player_1"].value_counts()
    p2_counts = df["Player_2"].value_counts()
    total_matches = p1_counts.add(p2_counts, fill_value=0)
    wins = wins.reindex(total_matches.index, fill_value=0)
    losses = total_matches - wins
    player_stats = pd.DataFrame(
        {"Wins": wins, "Losses": losses, "Total Matches": total_matches}
    ).sort_values(by="Wins", ascending=False)
    print(f"len of list -> {len(player_stats)}")
    print(player_stats.head())

In [10]:
match_stats(df)

len of list -> 1779
             Wins  Losses  Total Matches
Federer R.   1151   242.0         1393.0
Djokovic N.  1059   201.0         1260.0
Nadal R.     1007   207.0         1214.0
Ferrer D.     677   358.0         1035.0
Murray A.     670   254.0          924.0


### Series Stats

In [11]:
def series_stats(df):
    # 1. Overall Stats: Wins, Total Matches, and Losses
    wins = df["Winner"].value_counts()
    p1_counts = df["Player_1"].value_counts()
    p2_counts = df["Player_2"].value_counts()
    total_matches = p1_counts.add(p2_counts, fill_value=0)

    # Reindex to include players with 0 wins
    wins = wins.reindex(total_matches.index, fill_value=0)
    losses = total_matches - wins

    # 2. Wins by Series
    # Unique series include 'Grand Slam', 'Masters 1000', 'ATP500', etc.
    series_wins = df.groupby(["Winner", "Series"]).size().unstack(fill_value=0)
    series_wins.columns = [f"Wins_{col}" for col in series_wins.columns]

    # 3. Create and Merge DataFrames
    player_stats = pd.DataFrame(
        {"Total Matches": total_matches, "Wins": wins, "Losses": losses}
    )

    # Join series wins to the main stats
    player_stats = player_stats.join(series_wins, how="left").fillna(0)

    # 4. Percentage Logic: Calculate series wins as a % of TOTAL wins
    win_cols = [c for c in player_stats.columns if c.startswith("Wins_")]

    # Vectorized division by total wins
    series_pcts = player_stats[win_cols].div(player_stats["Wins"], axis=0) * 100

    # Rename columns to reflect they are percentages of total wins
    series_pcts.columns = [
        c.replace("Wins_", "WinPct_Total_") for c in series_pcts.columns
    ]

    # Concatenate and handle division by zero
    player_stats = pd.concat([player_stats, series_pcts], axis=1).fillna(0)

    # Final sorting and display
    player_stats = player_stats.sort_values(by="Wins", ascending=False)

    print(f"Number of players analyzed -> {len(player_stats)}")
    print(player_stats.head())


In [12]:
series_stats(df)

Number of players analyzed -> 1779
             Total Matches  Wins  Losses  Wins_ATP250  Wins_ATP500  \
Federer R.          1393.0  1151   242.0         55.0        105.0   
Djokovic N.         1260.0  1059   201.0         68.0         94.0   
Nadal R.            1214.0  1007   207.0         55.0        120.0   
Ferrer D.           1035.0   677   358.0        117.0        106.0   
Murray A.            924.0   670   254.0         86.0         81.0   

             Wins_Grand Slam  Wins_International  Wins_International Gold  \
Federer R.             362.0               138.0                     62.0   
Djokovic N.            384.0                40.0                     16.0   
Nadal R.               303.0                64.0                     43.0   
Ferrer D.              139.0                80.0                     41.0   
Murray A.              195.0                57.0                     12.0   

             Wins_Masters  Wins_Masters 1000  Wins_Masters Cup  \
Federer R.     

### Court Stats

In [13]:
def court_stats(df):
    # 1. Existing Logic: Overall Wins, Total Matches, and Losses
    wins = df["Winner"].value_counts()
    p1_counts = df["Player_1"].value_counts()
    p2_counts = df["Player_2"].value_counts()
    total_matches = p1_counts.add(p2_counts, fill_value=0)

    # Reindex to include players with 0 wins
    wins = wins.reindex(total_matches.index, fill_value=0)
    losses = total_matches - wins

    # 2. New Logic: Wins by Court (Replacing Series/Surface)
    # Group by Winner and Court to get counts for 'Indoor' and 'Outdoor'
    court_wins = df.groupby(["Winner", "Court"]).size().unstack(fill_value=0)
    court_wins.columns = [f"Wins_{col}" for col in court_wins.columns]

    # 3. Create and Merge DataFrames
    player_stats = pd.DataFrame(
        {"Total Matches": total_matches, "Wins": wins, "Losses": losses}
    )

    # Join court wins to the main stats
    player_stats = player_stats.join(court_wins, how="left").fillna(0)

    # 4. Percentage Logic: Calculate court wins as a % of TOTAL wins
    win_cols = [c for c in player_stats.columns if c.startswith("Wins_")]

    # Vectorized division: Divide each court win column by the "Wins" column
    court_pcts = player_stats[win_cols].div(player_stats["Wins"], axis=0) * 100

    # Rename columns to reflect they are percentages of total career wins
    court_pcts.columns = [
        c.replace("Wins_", "WinPct_Total_") for c in court_pcts.columns
    ]

    # Concatenate and handle potential division by zero (for players with 0 wins)
    player_stats = pd.concat([player_stats, court_pcts], axis=1).fillna(0)

    # Final sorting and display
    player_stats = player_stats.sort_values(by="Wins", ascending=False)

    print(f"Number of players analyzed -> {len(player_stats)}")
    print(player_stats.head())

    # return player_stats


In [14]:
court_stats(df)

Number of players analyzed -> 1779
             Total Matches  Wins  Losses  Wins_Indoor  Wins_Outdoor  \
Federer R.          1393.0  1151   242.0        239.0         912.0   
Djokovic N.         1260.0  1059   201.0        150.0         909.0   
Nadal R.            1214.0  1007   207.0         76.0         931.0   
Ferrer D.           1035.0   677   358.0         90.0         587.0   
Murray A.            924.0   670   254.0        129.0         541.0   

             WinPct_Total_Indoor  WinPct_Total_Outdoor  
Federer R.             20.764553             79.235447  
Djokovic N.            14.164306             85.835694  
Nadal R.                7.547170             92.452830  
Ferrer D.              13.293944             86.706056  
Murray A.              19.253731             80.746269  


### Surface Stats

In [15]:
def surface_stats(df):
    # 1. Existing Logic: Overall Wins, Total Matches, and Losses
    wins = df["Winner"].value_counts()
    p1_counts = df["Player_1"].value_counts()
    p2_counts = df["Player_2"].value_counts()
    total_matches = p1_counts.add(p2_counts, fill_value=0)

    # Reindex to include players with 0 wins
    wins = wins.reindex(total_matches.index, fill_value=0)
    losses = total_matches - wins

    # 2. New Logic: Wins by Surface
    # Group by Winner and Surface, then count the occurrences
    surface_wins = df.groupby(["Winner", "Surface"]).size().unstack(fill_value=0)
    surface_wins.columns = [f"Wins_{col}" for col in surface_wins.columns]

    # 3. Create and Merge DataFrames
    player_stats = pd.DataFrame(
        {"Total Matches": total_matches, "Wins": wins, "Losses": losses}
    )

    # Join surface wins first so we have the columns needed for calculation
    player_stats = player_stats.join(surface_wins, how="left").fillna(0)

    # 4. NEW LOGIC: Calculate surface wins as a % of TOTAL wins
    # We select only the columns that start with "Wins_"
    win_cols = [c for c in player_stats.columns if c.startswith("Wins_")]

    # Vectorized division: Divide each surface win column by the "Wins" column
    # .div(..., axis=0) ensures it divides row-by-row
    surface_pcts = player_stats[win_cols].div(player_stats["Wins"], axis=0) * 100

    # Rename columns from "Wins_Clay" to "WinPct_Total_Clay" to clarify the denominator
    surface_pcts.columns = [
        c.replace("Wins_", "WinPct_Total_") for c in surface_pcts.columns
    ]

    # Concatenate the percentage columns and fill NaN (from 0/0 division) with 0
    player_stats = pd.concat([player_stats, surface_pcts], axis=1).fillna(0)

    # Final sorting and display
    player_stats = player_stats.sort_values(by="Wins", ascending=False)

    print(f"len of list -> {len(player_stats)}")
    print(player_stats.head())

    # return player_stats


In [16]:
surface_stats(df)

len of list -> 1779
             Total Matches  Wins  Losses  Wins_Carpet  Wins_Clay  Wins_Grass  \
Federer R.          1393.0  1151   242.0         46.0      210.0       182.0   
Djokovic N.         1260.0  1059   201.0          5.0      272.0       120.0   
Nadal R.            1214.0  1007   207.0          7.0      455.0        75.0   
Ferrer D.           1035.0   677   358.0          9.0      300.0        40.0   
Murray A.            924.0   670   254.0         18.0       97.0       105.0   

             Wins_Hard  WinPct_Total_Carpet  WinPct_Total_Clay  \
Federer R.       713.0             3.996525          18.245004   
Djokovic N.      662.0             0.472144          25.684608   
Nadal R.         470.0             0.695134          45.183714   
Ferrer D.        328.0             1.329394          44.313146   
Murray A.        450.0             2.686567          14.477612   

             WinPct_Total_Grass  WinPct_Total_Hard  
Federer R.            15.812337          61.94613

### Round Stats

In [17]:
def round_stats(df):
    # 1. Existing Logic: Overall Wins, Total Matches, and Losses
    wins = df["Winner"].value_counts()
    p1_counts = df["Player_1"].value_counts()
    p2_counts = df["Player_2"].value_counts()
    total_matches = p1_counts.add(p2_counts, fill_value=0)

    # Reindex to include players with 0 wins
    wins = wins.reindex(total_matches.index, fill_value=0)
    losses = total_matches - wins

    # 2. New Logic: Wins by Round (Replacing Surface)
    # Group by Winner and Round, then count the occurrences
    # Unique rounds in your data include '1st Round', 'Quarterfinals', 'The Final', etc.
    round_wins = df.groupby(["Winner", "Round"]).size().unstack(fill_value=0)
    round_wins.columns = [f"Wins_{col}" for col in round_wins.columns]

    # 3. Create and Merge DataFrames
    player_stats = pd.DataFrame(
        {"Total Matches": total_matches, "Wins": wins, "Losses": losses}
    )

    # Join round wins to the main stats
    player_stats = player_stats.join(round_wins, how="left").fillna(0)

    # 4. Percentage Logic: Calculate round wins as a % of TOTAL wins
    # Select columns that start with "Wins_" (which are now round names)
    win_cols = [c for c in player_stats.columns if c.startswith("Wins_")]

    # Vectorized division: Divide each round win column by the "Wins" column
    # Use .div(..., axis=0) to handle row-wise division
    round_pcts = player_stats[win_cols].div(player_stats["Wins"], axis=0) * 100

    # Rename columns to clarify the denominator (Total Wins)
    round_pcts.columns = [
        c.replace("Wins_", "WinPct_Total_") for c in round_pcts.columns
    ]

    # Concatenate the percentage columns and fill NaN (from 0/0 division) with 0
    player_stats = pd.concat([player_stats, round_pcts], axis=1).fillna(0)

    # Final sorting and display
    player_stats = player_stats.sort_values(by="Wins", ascending=False)

    print(f"Number of players -> {len(player_stats)}")
    print(player_stats.head())

In [18]:
round_stats(df)

Number of players -> 1779
             Total Matches  Wins  Losses  Wins_1st Round  Wins_2nd Round  \
Federer R.          1393.0  1151   242.0           174.0           266.0   
Djokovic N.         1260.0  1059   201.0           131.0           243.0   
Nadal R.            1214.0  1007   207.0           130.0           246.0   
Ferrer D.           1035.0   677   358.0           175.0           208.0   
Murray A.            924.0   670   254.0           144.0           183.0   

             Wins_3rd Round  Wins_4th Round  Wins_Quarterfinals  \
Federer R.            157.0            78.0               184.0   
Djokovic N.           164.0            82.0               171.0   
Nadal R.              173.0            67.0               161.0   
Ferrer D.              88.0            24.0                96.0   
Murray A.              95.0            38.0                90.0   

             Wins_Round Robin  Wins_Semifinals  Wins_The Final  \
Federer R.               41.0            149.0  

### Best of Stats

In [19]:
def best_of_stats(df):
    # 1. Overall Stats: Wins, Total Matches, and Losses
    wins = df["Winner"].value_counts()
    p1_counts = df["Player_1"].value_counts()
    p2_counts = df["Player_2"].value_counts()
    total_matches = p1_counts.add(p2_counts, fill_value=0)

    # Reindex to include players with 0 wins
    wins = wins.reindex(total_matches.index, fill_value=0)
    losses = total_matches - wins

    # 2. Wins by Match Format (Best of 3 vs 5)
    # Group by Winner and the 'Best of' column
    best_of_wins = df.groupby(["Winner", "Best of"]).size().unstack(fill_value=0)

    # Rename columns to 'Wins_3' and 'Wins_5'
    best_of_wins.columns = [f"Wins_{col}" for col in best_of_wins.columns]

    # 3. Create and Merge DataFrames
    player_stats = pd.DataFrame(
        {"Total Matches": total_matches, "Wins": wins, "Losses": losses}
    )

    # Join match format wins to the main stats
    player_stats = player_stats.join(best_of_wins, how="left").fillna(0)

    # 4. Percentage Logic: Calculate format wins as a % of TOTAL wins
    win_cols = [c for c in player_stats.columns if c.startswith("Wins_")]

    # Vectorized division by total wins
    format_pcts = player_stats[win_cols].div(player_stats["Wins"], axis=0) * 100

    # Rename columns to reflect they are percentages of total career wins
    format_pcts.columns = [
        c.replace("Wins_", "WinPct_Total_") for c in format_pcts.columns
    ]

    # Concatenate and handle potential division by zero
    player_stats = pd.concat([player_stats, format_pcts], axis=1).fillna(0)

    # Final sorting and display
    player_stats = player_stats.sort_values(by="Wins", ascending=False)

    print(f"Number of players analyzed: {len(player_stats)}")
    print(player_stats.head())

In [20]:
best_of_stats(df)

Number of players analyzed: 1779
             Total Matches  Wins  Losses  Wins_3  Wins_5  WinPct_Total_3  \
Federer R.          1393.0  1151   242.0   772.0   379.0       67.072111   
Djokovic N.         1260.0  1059   201.0   676.0   383.0       63.833805   
Nadal R.            1214.0  1007   207.0   697.0   310.0       69.215492   
Ferrer D.           1035.0   677   358.0   537.0   140.0       79.320532   
Murray A.            924.0   670   254.0   478.0   192.0       71.343284   

             WinPct_Total_5  
Federer R.        32.927889  
Djokovic N.       36.166195  
Nadal R.          30.784508  
Ferrer D.         20.679468  
Murray A.         28.656716  


## Data Preprocessing


In [29]:
def preprocess_data(df):
    # 1. base statistics - wins, losses, total matches
    wins = df["Winner"].value_counts()
    p1_counts = df["Player_1"].value_counts()
    p2_counts = df["Player_2"].value_counts()
    total_matches = p1_counts.add(p2_counts, fill_value=0)
    wins = wins.reindex(total_matches.index, fill_value=0)
    losses = total_matches - wins

    player_stats = pd.DataFrame(
        {
            "Total_Matches": total_matches,
            "Wins": wins,
            "Losses": losses,
            "Win_Pct": (wins / total_matches * 100).fillna(0),
        }
    )

    # 2. surface stats
    surface_wins = df.groupby(["Winner", "Surface"]).size().unstack(fill_value=0)
    surface_wins.columns = [f"Surface_Wins_{col}" for col in surface_wins.columns]
    player_stats = player_stats.join(surface_wins, how="left").fillna(0)
    surface_win_cols = [c for c in surface_wins.columns]
    surface_pcts = (
        player_stats[surface_win_cols].div(player_stats["Wins"], axis=0) * 100
    )
    surface_pcts.columns = [
        c.replace("Wins_", "SurfaceWinPct_") for c in surface_pcts.columns
    ]
    player_stats = pd.concat([player_stats, surface_pcts], axis=1).fillna(0)

    # 3. round stats
    round_wins = df.groupby(["Winner", "Round"]).size().unstack(fill_value=0)
    round_wins.columns = [f"Round_Wins_{col}" for col in round_wins.columns]
    player_stats = player_stats.join(round_wins, how="left").fillna(0)
    round_win_cols = [c for c in round_wins.columns]
    round_pcts = player_stats[round_win_cols].div(player_stats["Wins"], axis=0) * 100
    round_pcts.columns = [
        c.replace("Round_Wins_", "RoundWinPct_") for c in round_pcts.columns
    ]
    player_stats = pd.concat([player_stats, round_pcts], axis=1).fillna(0)

    # 4. series stats
    series_wins = df.groupby(["Winner", "Series"]).size().unstack(fill_value=0)
    series_wins.columns = [f"Series_Wins_{col}" for col in series_wins.columns]
    player_stats = player_stats.join(series_wins, how="left").fillna(0)
    series_win_cols = [c for c in series_wins.columns]
    series_pcts = player_stats[series_win_cols].div(player_stats["Wins"], axis=0) * 100
    series_pcts.columns = [
        c.replace("Series_Wins_", "SeriesWinPct_") for c in series_pcts.columns
    ]
    player_stats = pd.concat([player_stats, series_pcts], axis=1).fillna(0)

    # 5. court stats
    court_wins = df.groupby(["Winner", "Court"]).size().unstack(fill_value=0)
    court_wins.columns = [f"Court_Wins_{col}" for col in court_wins.columns]
    player_stats = player_stats.join(court_wins, how="left").fillna(0)
    court_win_cols = [c for c in court_wins.columns]
    court_pcts = player_stats[court_win_cols].div(player_stats["Wins"], axis=0) * 100
    court_pcts.columns = [
        c.replace("Court_Wins_", "CourtWinPct_") for c in court_pcts.columns
    ]
    player_stats = pd.concat([player_stats, court_pcts], axis=1).fillna(0)

    # 6. best of stats
    best_of_wins = df.groupby(["Winner", "Best of"]).size().unstack(fill_value=0)
    best_of_wins.columns = [f"BestOf_Wins_{int(col)}" for col in best_of_wins.columns]
    player_stats = player_stats.join(best_of_wins, how="left").fillna(0)

    best_of_win_cols = [c for c in best_of_wins.columns]
    best_of_pcts = (
        player_stats[best_of_win_cols].div(player_stats["Wins"], axis=0) * 100
    )
    best_of_pcts.columns = [
        c.replace("BestOf_Wins_", "BestOfWinPct_") for c in best_of_pcts.columns
    ]
    player_stats = pd.concat([player_stats, best_of_pcts], axis=1).fillna(0)

    # 7. sort by total wins
    player_stats = player_stats.sort_values(by="Wins", ascending=False)

    # 8. Reset index
    player_stats = player_stats.reset_index()
    player_stats = player_stats.rename(columns={"index": "Player_Name"})

    return player_stats


In [30]:
player_stats = preprocess_data(df)

In [31]:
player_stats.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1779 entries, 0 to 1778
Data columns (total 53 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Player_Name                      1779 non-null   object 
 1   Total_Matches                    1779 non-null   float64
 2   Wins                             1779 non-null   int64  
 3   Losses                           1779 non-null   float64
 4   Win_Pct                          1779 non-null   float64
 5   Surface_Wins_Carpet              1779 non-null   float64
 6   Surface_Wins_Clay                1779 non-null   float64
 7   Surface_Wins_Grass               1779 non-null   float64
 8   Surface_Wins_Hard                1779 non-null   float64
 9   Surface_SurfaceWinPct_Carpet     1779 non-null   float64
 10  Surface_SurfaceWinPct_Clay       1779 non-null   float64
 11  Surface_SurfaceWinPct_Grass      1779 non-null   float64
 12  Surface_SurfaceWinPc

In [32]:
player_stats.head()

Unnamed: 0,Player_Name,Total_Matches,Wins,Losses,Win_Pct,Surface_Wins_Carpet,Surface_Wins_Clay,Surface_Wins_Grass,Surface_Wins_Hard,Surface_SurfaceWinPct_Carpet,...,SeriesWinPct_Masters 1000,SeriesWinPct_Masters Cup,Court_Wins_Indoor,Court_Wins_Outdoor,CourtWinPct_Indoor,CourtWinPct_Outdoor,BestOf_Wins_3,BestOf_Wins_5,BestOfWinPct_3,BestOfWinPct_5
0,Federer R.,1393.0,1151,242.0,82.627423,46.0,210.0,182.0,713.0,3.996525,...,17.376195,4.952215,239.0,912.0,20.764553,79.235447,772.0,379.0,67.072111,32.927889
1,Djokovic N.,1260.0,1059,201.0,84.047619,5.0,272.0,120.0,662.0,0.472144,...,33.711048,4.627007,150.0,909.0,14.164306,85.835694,676.0,383.0,63.833805,36.166195
2,Nadal R.,1214.0,1007,207.0,82.948929,7.0,455.0,75.0,470.0,0.695134,...,28.003972,1.886792,76.0,931.0,7.54717,92.45283,697.0,310.0,69.215492,30.784508
3,Ferrer D.,1035.0,677,358.0,65.410628,9.0,300.0,40.0,328.0,1.329394,...,18.463811,1.181684,90.0,587.0,13.293944,86.706056,537.0,140.0,79.320532,20.679468
4,Murray A.,924.0,670,254.0,72.510823,18.0,97.0,105.0,450.0,2.686567,...,26.567164,2.38806,129.0,541.0,19.253731,80.746269,478.0,192.0,71.343284,28.656716
