In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import defaultdict

In [2]:
# Directory where the CSV files are located
directory = 'C:\\Users\\Otto\\OneDrive\\ATP-number\\data' 

# Start year and end year (inclusive)
start_year = 1968
end_year = 2023

# Collect one DataFrame per yearly file and concatenate once at the end.
# Calling pd.concat inside the loop re-copies every previously loaded row on
# each iteration, which makes the load quadratic in the number of rows.
yearly_frames = []

# Loop through the files
for year in range(start_year, end_year + 1):
    file_name = f"atp_matches_{year}.csv"
    file_path = os.path.join(directory, file_name)

    # Check if the file exists; missing years are reported and skipped
    if os.path.isfile(file_path):
        # Read the data from the file
        atp_data_raw = pd.read_csv(file_path)
        yearly_frames.append(atp_data_raw)
    else:
        print(f"File {file_name} does not exist.")

# pd.concat raises on an empty list, so fall back to an empty DataFrame when
# no yearly file was found at all.
if yearly_frames:
    all_atp_data_raw = pd.concat(yearly_frames, ignore_index=True)
else:
    all_atp_data_raw = pd.DataFrame()

# all_atp_data_raw now contains the data from all the files that were found
print(all_atp_data_raw.head(2))

  tourney_id tourney_name surface  draw_size tourney_level  tourney_date  \
0  1968-2029       Dublin   Grass         32             A      19680708   
1  1968-2029       Dublin   Grass         32             A      19680708   

   match_num  winner_id winner_seed winner_entry  ... l_1stIn l_1stWon  \
0        270     112411         NaN          NaN  ...     NaN      NaN   
1        271     126914         NaN          NaN  ...     NaN      NaN   

   l_2ndWon l_SvGms  l_bpSaved  l_bpFaced winner_rank winner_rank_points  \
0       NaN     NaN        NaN        NaN         NaN                NaN   
1       NaN     NaN        NaN        NaN         NaN                NaN   

  loser_rank loser_rank_points  
0        NaN               NaN  
1        NaN               NaN  

[2 rows x 49 columns]


In [3]:
# Save the combined raw data next to the yearly input files. The target is
# built from `directory` so the data location is defined in one place only.
all_atp_data_raw.to_csv(os.path.join(directory, 'all_atp_data_raw.csv'), index=False)

In [4]:
# Display the column labels of the combined raw match data
all_atp_data_raw.columns


Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')

In [5]:
# Print per-column non-null counts and dtypes of the combined raw match data
all_atp_data_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189882 entries, 0 to 189881
Data columns (total 49 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   tourney_id          189882 non-null  object 
 1   tourney_name        189882 non-null  object 
 2   surface             187566 non-null  object 
 3   draw_size           189882 non-null  int64  
 4   tourney_level       189882 non-null  object 
 5   tourney_date        189882 non-null  int64  
 6   match_num           189882 non-null  int64  
 7   winner_id           189882 non-null  int64  
 8   winner_seed         70415 non-null   object 
 9   winner_entry        16529 non-null   object 
 10  winner_name         189882 non-null  object 
 11  winner_hand         189868 non-null  object 
 12  winner_ht           173552 non-null  float64
 13  winner_ioc          189874 non-null  object 
 14  winner_age          188674 non-null  float64
 15  loser_id            189882 non-nul

In [6]:
## Columns kept for the ranking work: tournament id, name, date and level,
## winner and loser id / name / rank, and the match score
rankings_columns = [
    'tourney_id', 'tourney_name', 'tourney_date', 'tourney_level',
    'winner_id', 'winner_name', 'winner_rank',
    'loser_id', 'loser_name', 'loser_rank',
    'score',
]

# Take an independent copy so later column changes leave the raw frame untouched
all_atp_data = all_atp_data_raw.loc[:, rankings_columns].copy()

all_atp_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189882 entries, 0 to 189881
Data columns (total 11 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tourney_id     189882 non-null  object 
 1   tourney_name   189882 non-null  object 
 2   tourney_date   189882 non-null  int64  
 3   tourney_level  189882 non-null  object 
 4   winner_id      189882 non-null  int64  
 5   winner_name    189882 non-null  object 
 6   winner_rank    154878 non-null  float64
 7   loser_id       189882 non-null  int64  
 8   loser_name     189882 non-null  object 
 9   loser_rank     146507 non-null  float64
 10  score          189876 non-null  object 
dtypes: float64(2), int64(3), object(6)
memory usage: 15.9+ MB


In [7]:
# tourney_date is loaded as an integer such as 19680708; parse it as YYYYMMDD
parsed_tourney_dates = pd.to_datetime(all_atp_data['tourney_date'], format='%Y%m%d')
all_atp_data['tourney_date'] = parsed_tourney_dates


In [8]:
# Preview the first rows of the trimmed match data
all_atp_data.head()

Unnamed: 0,tourney_id,tourney_name,tourney_date,tourney_level,winner_id,winner_name,winner_rank,loser_id,loser_name,loser_rank,score
0,1968-2029,Dublin,1968-07-08,A,112411,Douglas Smith,,110196,Peter Ledbetter,,6-1 7-5
1,1968-2029,Dublin,1968-07-08,A,126914,Louis Pretorius,,209536,Maurice Pollock,,6-1 6-1
2,1968-2029,Dublin,1968-07-08,A,209523,Cecil Pedlow,,209535,John Mulvey,,6-2 6-2
3,1968-2029,Dublin,1968-07-08,A,100084,Tom Okker,,209534,Unknown Fearmon,,6-1 6-1
4,1968-2029,Dublin,1968-07-08,A,100132,Armistead Neely,,209533,Harry Sheridan,,6-2 6-4


In [9]:
# Earliest tournament date (inclusive) kept in the ranking window.
# NOTE(review): the variable name implies a trailing 12-month window; confirm
# this cutoff against the latest tournament date in the data.
window_start = '2022-07-11'

# Copy the filtered rows so later edits do not write back into all_atp_data
all_atp_data_12_months = all_atp_data[all_atp_data['tourney_date'] >= window_start].copy()

## print to a csv file
# The target is built from `directory` so the data location is defined once.
all_atp_data_12_months.to_csv(os.path.join(directory, 'all_atp_data_12_months.csv'), index=False)

In [10]:
# Baseline ranking: number of match wins per player inside the window
wins_per_player = all_atp_data_12_months['winner_name'].value_counts()

# Turn the counts into a two-column table (player, number of wins)
baseline_ranking = wins_per_player.reset_index()
baseline_ranking.columns = ['player_name', 'score']

# Show the top 10 players in the baseline ranking
baseline_ranking.head(10)

Unnamed: 0,player_name,score
0,Carlos Alcaraz,62
1,Daniil Medvedev,60
2,Holger Rune,59
3,Andrey Rublev,53
4,Taylor Fritz,51
5,Stefanos Tsitsipas,50
6,Jannik Sinner,49
7,Frances Tiafoe,48
8,Cameron Norrie,48
9,Novak Djokovic,46


Now let's move on to a more complex ranking system - the Elo ranking. We'll implement a basic version of the Elo ranking system where we start all players with an Elo of 1500:

* When a player wins a match, they gain Elo points
* When they lose a match, they lose Elo points. 
* The amount of points gained or lost depends on the Elo rating of the opponent.

1. In the Elo system, the difference in ratings between two players serves as a predictor of the outcome of a match. 
2. Two players with equal ratings who play against each other are expected to score an equal number of wins. 
3. A player whose rating is 100 points greater than their opponent's is expected to win 64% of the time. 
4. If the difference is 200 points, then the expected win rate for the stronger player goes up to approximately 76%.

One of the key features of the Elo rating system is its self-correcting nature. Players' ratings rise and fall according to their performance against other rated players. The system is zero-sum, meaning that any points gained by one player are lost by another.

In [11]:
# K-factor: the largest rating change a single match can cause
K = 32

# Rating table keyed by player; a player seen for the first time starts at 1500
elo_ratings = defaultdict(lambda: 1500)

# Function to update Elo ratings after a match
def update_elo(winner_elo, loser_elo, k=None):
    """
    Update the Elo ratings of two players after a match.

    Parameters:
    winner_elo (float): Elo rating of the winner before the match.
    loser_elo (float): Elo rating of the loser before the match.
    k (float, optional): K-factor, i.e. the maximum rating change for this
        match. When omitted, the module-level ``K`` is read at call time, so
        existing two-argument calls behave exactly as before.

    Returns:
    new_winner_elo (float): Updated Elo rating of the winner.
    new_loser_elo (float): Updated Elo rating of the loser.
    """
    if k is None:
        # Looked up on every call (not bound at definition time) so that
        # callers which reassign the module-level K keep working.
        k = K

    # Exponential strength of each rating. These are not probabilities and
    # are not bounded to [0, 1]; only their ratio is used below.
    winner_strength = 10**(winner_elo / 400)
    loser_strength = 10**(loser_elo / 400)
    total_strength = winner_strength + loser_strength

    # Expected scores: each player's share of the combined strength
    winner_expected = winner_strength / total_strength
    loser_expected = loser_strength / total_strength

    # Actual score is 1 for the winner and 0 for the loser; each rating moves
    # by k times (actual - expected).
    new_winner_elo = winner_elo + k * (1 - winner_expected)
    new_loser_elo = loser_elo + k * (0 - loser_expected)

    return new_winner_elo, new_loser_elo

# Update Elo ratings for all matches.
# Elo updates depend on the order in which matches are replayed, and the
# concatenated yearly files are not guaranteed to be in date order. A stable
# sort on tourney_date puts tournaments in chronological order while keeping
# the original row order within each date.
matches_in_order = all_atp_data_12_months.sort_values('tourney_date', kind='stable')

# Iterate over the two name columns directly; building a Series per row with
# iterrows() is much slower and none of the other columns are needed here.
for winner_name, loser_name in zip(matches_in_order['winner_name'],
                                   matches_in_order['loser_name']):
    # Get current Elo ratings (unseen players start at the defaultdict default)
    winner_elo = elo_ratings[winner_name]
    loser_elo = elo_ratings[loser_name]

    # Update ratings
    new_winner_elo, new_loser_elo = update_elo(winner_elo, loser_elo)

    # Store updated ratings
    elo_ratings[winner_name] = new_winner_elo
    elo_ratings[loser_name] = new_loser_elo

# Convert the Elo ratings to a DataFrame
elo_ranking = pd.DataFrame(list(elo_ratings.items()), columns=['player_name', 'elo'])

# Show the top 20 players in the Elo ranking
elo_ranking.sort_values('elo', ascending=False).head(20)


Unnamed: 0,player_name,elo
56,Carlos Alcaraz,1870.929156
193,Novak Djokovic,1849.764192
25,Holger Rune,1778.413012
130,Daniil Medvedev,1772.622692
3,Andrey Rublev,1726.59365
162,Stefanos Tsitsipas,1691.917464
58,Karen Khachanov,1686.970931
0,Francisco Cerundolo,1686.740206
93,Frances Tiafoe,1684.210369
118,Jannik Sinner,1683.307227


In [12]:
# Reset Elo ratings: rebind the name to a fresh table so the adjusted run
# below starts every player from the default 1500 again
elo_ratings = defaultdict(lambda: 1500)

# Function to get the K-factor based on the tournament level
def get_k_factor(tourney_level):
    """
    Return the K-factor to use for a match at the given tournament level.

    Parameters:
    tourney_level: Tournament level code of the match.

    Returns:
    int: 60 for 'G' (Grand Slam), 50 for 'M' (Masters), 40 for 'A' (ATP Tour),
    and 32 for any other value (Davis Cup, Fed Cup, and others).
    """
    # Level code -> K-factor, checked in order with plain equality
    weighted_levels = (
        ('G', 60),  # Grand Slam
        ('M', 50),  # Masters
        ('A', 40),  # ATP Tour
    )
    for level_code, k_factor in weighted_levels:
        if tourney_level == level_code:
            return k_factor

    # Davis Cup, Fed Cup, and others
    return 32

# Update Elo ratings for all matches with adjusted K-factor.
# Elo updates depend on the order in which matches are replayed, and the
# concatenated yearly files are not guaranteed to be in date order. A stable
# sort on tourney_date puts tournaments in chronological order while keeping
# the original row order within each date.
matches_in_order = all_atp_data_12_months.sort_values('tourney_date', kind='stable')

# Iterate over the needed columns directly; building a Series per row with
# iterrows() is much slower.
for winner_name, loser_name, tourney_level in zip(matches_in_order['winner_name'],
                                                  matches_in_order['loser_name'],
                                                  matches_in_order['tourney_level']):
    # Get current Elo ratings (unseen players start at the defaultdict default)
    winner_elo = elo_ratings[winner_name]
    loser_elo = elo_ratings[loser_name]

    # Get K-factor. update_elo reads the module-level K, so it has to be
    # assigned before the call. Note that this leaves K at the last match's
    # value once the loop ends.
    K = get_k_factor(tourney_level)

    # Update ratings
    new_winner_elo, new_loser_elo = update_elo(winner_elo, loser_elo)

    # Store updated ratings
    elo_ratings[winner_name] = new_winner_elo
    elo_ratings[loser_name] = new_loser_elo

# Convert the Elo ratings to a DataFrame
elo_ranking_adjusted = pd.DataFrame(list(elo_ratings.items()), columns=['player_name', 'elo'])

# Sort from highest to lowest rating and show the top 20 players
elo_ranking_adjusted = elo_ranking_adjusted.sort_values('elo', ascending=False)

elo_ranking_adjusted.head(20)


Unnamed: 0,player_name,elo
193,Novak Djokovic,1956.476749
56,Carlos Alcaraz,1941.772643
25,Holger Rune,1821.975019
130,Daniil Medvedev,1795.031722
3,Andrey Rublev,1765.601484
8,Casper Ruud,1763.972392
58,Karen Khachanov,1757.900393
0,Francisco Cerundolo,1744.182238
162,Stefanos Tsitsipas,1726.083394
323,Alexander Zverev,1713.079323


The Elo ranking system takes into account:
* Strength of the opponent
* Importance of the matches (through the tournament-level K-factor)

From initial readings, the resulting ranking looks pretty accurate.

Assumptions
* Each match is an independent event.

It doesn't take into account:
* Performance of the player within a match (only the win/loss result is used, not the score)
* Surface of the court
* Physical condition of the players
* Recent performance

In [13]:
## Write elo_ranking_adjusted to a markdown file
# Raw string: in a normal string literal the backslashes in this Windows path
# form invalid escape sequences (\g, \o, \_, \m, \e). Python currently keeps
# them literally but warns about them, so the raw form yields the same path
# without relying on that behaviour.
elo_ranking_adjusted.to_markdown(r'C:\git-clones\ottosterner1.github.io\_scripts\markdown\elo_ranking_adjusted.md', index=False)


In [14]:
# Preview the first rows of the adjusted Elo ranking (sorted in an earlier cell)
elo_ranking_adjusted.head()

Unnamed: 0,player_name,elo
193,Novak Djokovic,1956.476749
56,Carlos Alcaraz,1941.772643
25,Holger Rune,1821.975019
130,Daniil Medvedev,1795.031722
3,Andrey Rublev,1765.601484
