# **Retrieving Tennis player data from the ATP website**


In [None]:
#Load libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import datetime
from io import StringIO
import pickle

In [None]:
# Restore the list of per-year ranking DataFrames from disk (same filename the
# save cell further down writes).
# NOTE: unpickling can run arbitrary code, so only load a file you created yourself.
filename_yearly = 'all_player_data_yearly.pkl'

with open(filename_yearly, mode='rb') as pickle_in:
    loaded_all_player_data_yearly = pickle.load(pickle_in)

# Report what was restored: source file, container type and number of entries.
print(f"all_player_data_yearly loaded from {filename_yearly}:")
print(f"Type of loaded_all_player_data_yearly: {type(loaded_all_player_data_yearly)}")
print(f"Number of DataFrames in loaded list: {len(loaded_all_player_data_yearly)}")

# Preview the first entry only when the loaded list is non-empty.
if loaded_all_player_data_yearly:
    print("First DataFrame head:")
    display(loaded_all_player_data_yearly[0].head())

all_player_data_yearly loaded from all_player_data_yearly.pkl:
Type of loaded_all_player_data_yearly: <class 'list'>
Number of DataFrames in loaded list: 14
First DataFrame head:


Unnamed: 0,rank,player_name,age,points,n_tournaments,diff_rank_three,last_name,year
0,1,Carlos Alcaraz,22,12050,19,6890,Alcaraz,2025
1,2,Jannik Sinner,24,11500,18,6340,Sinner,2025
2,3,Alexander Zverev,28,5160,24,0,Zverev,2025
3,4,Novak Djokovic,38,4830,20,330,Djokovic,2025
4,5,Felix Auger-Aliassime,25,4245,28,915,Auger-Aliassime,2025


In [None]:
# Download the ATP singles rankings page (all professional players) for the
# ranking week 2025-12-15 and parse it.
url = "https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek=2025-12-15&SortField=null&SortAscending=null"

# A request without custom headers was answered with "403 Forbidden" for this
# URL, so identify as a regular browser instead of the default requests client.
# NOTE(review): assumes the 403 is User-Agent based - confirm the request now succeeds.
request_headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# The timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, headers=request_headers, timeout=30)
response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

HTTPError: 403 Client Error: Forbidden for url: https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek=2025-12-15&SortField=null&SortAscending=null

In [None]:
# Locate every <select id="dateWeek-filter"> element in the parsed page; the
# next cell reads the ranking dates from the "value" of its <option> children.
ranking_weeks = soup.find_all('select', attrs={'id': 'dateWeek-filter'})

In [None]:
# Collect the raw "value" attribute of every <option> inside the week selector(s).
ranking_values_initial = [
    option_tag.get('value')
    for select_tag in ranking_weeks
    for option_tag in select_tag.find_all('option')
]

# Single pass over the values:
#   * filtered_ranking_values - every value that is a YYYY-MM-DD date with year > 2003
#   * last_december_dates     - {year: latest December datetime seen for that year}
# Each value is parsed exactly once.
filtered_ranking_values = []
last_december_dates = {}
for value in ranking_values_initial:
    # An <option> without a value attribute yields None, and 'Current Week' is
    # not a date; neither can contribute a ranking date.
    if not value or value == 'Current Week':
        continue
    try:
        date_obj = datetime.datetime.strptime(value, '%Y-%m-%d')
    except (TypeError, ValueError):
        # Not a string, or not in YYYY-MM-DD format: ignore it.
        continue
    if date_obj.year <= 2003:
        continue
    filtered_ranking_values.append(value)

    # Keep only the latest December date for each year.
    if date_obj.month == 12:
        latest_so_far = last_december_dates.get(date_obj.year)
        if latest_so_far is None or date_obj > latest_so_far:
            last_december_dates[date_obj.year] = date_obj

# Back to 'YYYY-MM-DD' strings, newest year first.
ranking_values = sorted(
    (december_date.strftime('%Y-%m-%d') for december_date in last_december_dates.values()),
    reverse=True,
)

print("Last December dates for each year:")
for ranking_date in ranking_values:
    print(ranking_date)

Last December dates for each year:
2025-12-22
2024-12-30
2023-12-25
2022-12-26
2021-12-27
2020-12-28
2019-12-30
2018-12-31
2017-12-25
2016-12-26
2015-12-28
2014-12-29
2013-12-30
2012-12-31
2011-12-26
2010-12-27
2009-12-28
2008-12-29
2007-12-31
2006-12-25
2005-12-26
2004-12-27


In [None]:
# Scrape the rankings table for every date in `ranking_values`, clean it, and
# collect one DataFrame per year in `all_player_data_yearly`. Years already
# present in that list are skipped, so the cell can be re-run after a failure
# without downloading everything again.

# Resume support: reuse the list if this session already holds a non-empty one.
if 'all_player_data_yearly' not in globals() or not all_player_data_yearly:
    all_player_data_yearly = []

# Years already scraped, read from the first row of each stored DataFrame.
existing_years_in_df = {
    df_item['year'].iloc[0]
    for df_item in all_player_data_yearly
    if not df_item.empty and 'year' in df_item.columns
}

# A request without custom headers was answered with "403 Forbidden" by this
# site, so identify as a regular browser instead of the default requests client.
# NOTE(review): assumes the 403 is User-Agent based - confirm the requests now succeed.
request_headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
}

# Text that identifies an advertisement row injected into the rankings table.
AD_ROW_MARKER = "googletag.cmd.push"

for date_to_process in ranking_values:
    year = int(date_to_process[:4])  # int, for consistent comparison with stored years
    print(f"Processing data for year: {year} and date: {date_to_process}")

    # Skip if data for this year already exists
    if year in existing_years_in_df:
        print(f"Skipping year {year} as data for it already exists in all_player_data_yearly.")
        continue

    # Construct the URL for the current date
    url = f"https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek={date_to_process}&SortField=null&SortAscending=null"

    # Make the request; the timeout prevents one stalled download from
    # blocking the whole loop.
    response = requests.get(url, headers=request_headers, timeout=30)
    response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
    html_content = response.text
    # `soup` is not used below; it is still rebound here, as before, because
    # other cells read a global named `soup`.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Extract the tables; StringIO because read_html expects a file-like
    # object rather than a literal HTML string.
    dfs_tables = pd.read_html(StringIO(html_content))
    # NOTE(review): assumes the rankings are always the second table on the page.
    player_data = dfs_tables[1]

    # Keep only the columns of interest and give them short snake_case names.
    player_data_clean = player_data[
        ["Hidden header", "Player", "Age", "Official Points", "Tourn Played"]
    ].rename(columns={
        "Hidden header": "rank",
        "Player": "player_name",
        "Age": "age",
        "Official Points": "points",
        "Tourn Played": "n_tournaments",
    })

    # Clean player names: strip any leading run of digits, dashes and whitespace.
    player_data_clean['player_name'] = player_data_clean['player_name'].str.replace(r'^[-\d\s]+', '', regex=True)

    # Remove advertisement rows by content rather than by a fixed position.
    # na=False treats missing names as "not an ad" instead of raising.
    is_ad_row = player_data_clean['player_name'].str.contains(AD_ROW_MARKER, regex=False, na=False)
    player_data_clean = player_data_clean.drop(index=player_data_clean.index[is_ad_row.to_numpy()])

    # Convert 'points' to numeric (unparseable entries become NaN)
    player_data_clean['points'] = pd.to_numeric(player_data_clean['points'], errors='coerce')

    # Absolute points gap between each player and the row labelled 2.
    # NOTE(review): assumes row label 2 is the rank-3 player (table sorted by rank).
    points_rank_three = player_data_clean.loc[2, 'points']
    player_data_clean["diff_rank_three"] = (player_data_clean["points"] - points_rank_three).abs()

    # Last name = everything after the first space; '' when there is no space
    # or the name is missing.
    player_data_clean['last_name'] = (
        player_data_clean['player_name'].str.split(' ', n=1).str[1].fillna('')
    )

    # Convert the remaining columns to numbers; a 'T' inside a rank is removed first.
    player_data_clean['rank'] = player_data_clean['rank'].astype(str).str.replace('T', '', regex=False)
    player_data_clean['rank'] = pd.to_numeric(player_data_clean['rank'], errors='coerce')
    player_data_clean['age'] = pd.to_numeric(player_data_clean['age'], errors='coerce')
    player_data_clean['n_tournaments'] = pd.to_numeric(player_data_clean['n_tournaments'], errors='coerce')

    # Add the 'year' column to the dataframe
    player_data_clean['year'] = year

    # Append the cleaned data to the list
    all_player_data_yearly.append(player_data_clean)

    # Add a small delay to avoid overwhelming the server
    time.sleep(1)

# Concatenate all DataFrames into a single DataFrame after the loop
final_player_data_df = pd.concat(all_player_data_yearly, ignore_index=True)
print("Combined all yearly player data into a single DataFrame.")


NameError: name 'ranking_values' is not defined

In [None]:
# Persist the list of per-year DataFrames with pickle; the load cell at the top
# of the notebook reads this same filename.
filename_yearly = 'all_player_data_yearly.pkl'

with open(filename_yearly, mode='wb') as pickle_out:
    pickle.dump(all_player_data_yearly, pickle_out)

print(f"all_player_data_yearly saved to {filename_yearly}")

all_player_data_yearly saved to all_player_data_yearly.pkl


In [None]:
# Stack the per-year DataFrames into one table; ignore_index=True renumbers
# the rows from 0 instead of keeping each year's own index.
final_player_data_df = pd.concat(objs=all_player_data_yearly, ignore_index=True)

In [None]:
# Save the yearly ranking data as CSV (the row index is not written).
final_player_data_df.to_csv(path_or_buf="atp_ranking.csv", index=False)

In [None]:
# Sum the prize_usd column for each year in df_filtered and print the totals.
prize_by_year = (
    df_filtered
    .groupby(by="year")["prize_usd"]
    .sum()
)
print(prize_by_year)

year
2018         438
2019       12212
2020       81932
2021     1617820
2022     7627612
2023    10753431
2024     9850338
2025    18803427
Name: prize_usd, dtype: int64


In [None]:
# Flatten `activity` (one block per year, each holding a list of tournaments)
# into a flat list of dicts, one per tournament.
rows = []

for year_block in activity:  # each block is one year
    event_year = year_block["EventYear"]
    rows.extend(
        {
            "year": event_year,
            "event_id": tournament["EventId"],
            "event_name": tournament["EventName"],
            "event_title": tournament["EventDisplayName"],
            "prize_raw": tournament["Prize"],        # prize in the local currency
            "currency": tournament["CurrSymbol"],    # "$", "€", "£", etc.
            "prize_usd": tournament["PrizeUsd"],     # prize converted to USD
        }
        for tournament in year_block["Tournaments"]  # each tournament within the year
    )

# rows now holds one record per event-year; show the first five.
for record in rows[:5]:
    print(record)

{'year': 2025, 'event_id': '605', 'event_name': 'Nitto ATP Finals', 'event_title': 'Nitto ATP Finals', 'prize_raw': 5071000, 'currency': '$', 'prize_usd': 5071000}
{'year': 2025, 'event_id': '352', 'event_name': 'ATP Masters 1000 Paris', 'event_title': 'Rolex Paris Masters', 'prize_raw': 946610, 'currency': '€', 'prize_usd': 1100529}
{'year': 2025, 'event_id': '337', 'event_name': 'Vienna', 'event_title': 'Erste Bank Open', 'prize_raw': 511835, 'currency': '€', 'prize_usd': 596339}
{'year': 2025, 'event_id': '5014', 'event_name': 'ATP Masters 1000 Shanghai', 'event_title': 'Rolex Shanghai Masters', 'prize_raw': 60400, 'currency': '$', 'prize_usd': 60400}
{'year': 2025, 'event_id': '747', 'event_name': 'Beijing', 'event_title': 'China Open', 'prize_raw': 751075, 'currency': '$', 'prize_usd': 751075}


In [None]:
# Build a DataFrame from the flat records and print its first 40 rows.
df = pd.DataFrame(data=rows)
print(df.head(n=40))

    year event_id                     event_name  \
0   2025      605               Nitto ATP Finals   
1   2025      352         ATP Masters 1000 Paris   
2   2025      337                         Vienna   
3   2025     5014      ATP Masters 1000 Shanghai   
4   2025      747                        Beijing   
5   2025      560                        US Open   
6   2025      422    ATP Masters 1000 Cincinnati   
7   2025      540                      Wimbledon   
8   2025      500                          Halle   
9   2025      520                  Roland Garros   
10  2025      416          ATP Masters 1000 Rome   
11  2025      580                Australian Open   
12  2024     4481                 500 Bonus Pool   
13  2024      607                1000 Bonus Pool   
14  2024      901               Davis Cup Finals   
15  2024      605               Nitto ATP Finals   
16  2024     5014      ATP Masters 1000 Shanghai   
17  2024      747                        Beijing   
18  2024    

In [None]:
# Show only the rows whose year is 2021 (the notebook renders the value of the
# cell's last expression).
df_filtered.loc[df_filtered["year"] == 2021]

Unnamed: 0,year,event_id,event_name,event_title,prize_raw,currency,prize_usd
78,2021,607,Bonus Prize Money,Bonus Prize Money,145000,$,145000
79,2021,901,Davis Cup Finals,Davis Cup Finals,0,€,0
80,2021,605,Nitto ATP Finals,Nitto ATP Finals,266000,$,266000
81,2021,429,Stockholm,Stockholm,11230,€,12989
82,2021,352,ATP Masters 1000 Paris,ATP Masters 1000 Paris,39120,€,45544
83,2021,337,Vienna,Vienna,103000,€,119913
84,2021,7485,Antwerp,Antwerp,49885,€,57832
85,2021,404,ATP Masters 1000 Indian Wells,ATP Masters 1000 Indian Wells,92000,$,92000
86,2021,7434,Sofia,Sofia,41145,€,48218
87,2021,560,US Open,US Open,265000,$,265000


In [None]:
# Event names to leave out (bonus-pool and profit-sharing entries).
exclude = [
    "500 Bonus Pool",
    "1000 Bonus Pool",
    "Profit Sharing",
    "Profit Share",
    "Bonus Prize Money",
]
# Keep every row of activity_all_df whose event_name is not in that list.
df_filtered = activity_all_df.loc[~activity_all_df["event_name"].isin(exclude)]

In [None]:
# Preview the first five rows of the filtered data.
df_filtered.head(n=5)

Unnamed: 0,player_id,year,event_id,event_name,event_title,prize_raw,currency,prize_usd
0,a0e2,2025,605,Nitto ATP Finals,Nitto ATP Finals,2704000,$,2704000
1,a0e2,2025,352,ATP Masters 1000 Paris,Rolex Paris Masters,44220,€,51410
2,a0e2,2025,329,Tokyo,Kinoshita Group Japan Open Tennis Championships,416365,$,416365
3,a0e2,2025,9210,Laver Cup,Laver Cup,0,$,0
4,a0e2,2025,560,US Open,US Open,5000000,$,5000000


In [None]:
# Sum the prize_usd column for each (player_id, year) pair, then show the last
# 20 entries of the result.
prize_by_year = (
    df_filtered
    .groupby(by=["player_id", "year"])["prize_usd"]
    .sum()
)
prize_by_year.tail(n=20)

Unnamed: 0_level_0,Unnamed: 1_level_0,prize_usd
player_id,year,Unnamed: 2_level_1
z371,2019,148968
z371,2020,24899
z371,2021,125712
z371,2022,199673
z371,2023,1068483
z371,2024,1448942
z371,2025,378405
z419,2013,624
z419,2014,2738
z419,2015,6967
