# **Retrieving Tennis player price money data from the ATP website between 2014 to 2025**


In [1]:
#Load libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import datetime
from io import StringIO

In [2]:
#Extracting the table with all ATP profesional players for the season 2025.
url = "https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek=2025-12-15&SortField=null&SortAscending=null"
response = requests.get(url)
response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

In [4]:
# Finding the dates with available ranking data
ranking_weeks = soup.find_all('select', id = "dateWeek-filter")

In [5]:
# Get only the date value
ranking_values_initial = []
for select_tag in ranking_weeks:
    for option_tag in select_tag.find_all('option'):
        ranking_values_initial.append(option_tag.get('value'))

#Keep dates with year > 2003
filtered_ranking_values = []
for value in ranking_values_initial:
    if value == 'Current Week':
        continue
    try:
        date_obj = datetime.datetime.strptime(value, '%Y-%m-%d')
        if date_obj.year > 2003:
            filtered_ranking_values.append(value)
    except ValueError:
        pass

# Keep only the last December date for each year
last_december_dates = {}
for value in filtered_ranking_values:
    date_obj = datetime.datetime.strptime(value, '%Y-%m-%d')
    if date_obj.month == 12:
        year = date_obj.year
        # Keep the latest December date for each year
        if year not in last_december_dates or date_obj > last_december_dates[year]:
            last_december_dates[year] = date_obj

# Convert the dictionary values back to string format and sort them in descending order
ranking_values = sorted([date_obj.strftime('%Y-%m-%d') for date_obj in last_december_dates.values()], reverse=True)

print("Last December dates for each year:")
for date in ranking_values:
    print(date)

Last December dates for each year:
2025-12-22
2024-12-30
2023-12-25
2022-12-26
2021-12-27
2020-12-28
2019-12-30
2018-12-31
2017-12-25
2016-12-26
2015-12-28
2014-12-29
2013-12-30
2012-12-31
2011-12-26
2010-12-27
2009-12-28
2008-12-29
2007-12-31
2006-12-25
2005-12-26
2004-12-27


In [6]:
# Filter ranking_values from 2014 or above
ranking_values = [date for date in ranking_values if datetime.datetime.strptime(date, '%Y-%m-%d').year >= 2014]
ranking_values

['2025-12-22',
 '2024-12-30',
 '2023-12-25',
 '2022-12-26',
 '2021-12-27',
 '2020-12-28',
 '2019-12-30',
 '2018-12-31',
 '2017-12-25',
 '2016-12-26',
 '2015-12-28',
 '2014-12-29']

In [3]:
#Retrieved the link with the overview of the players
player_links = soup.find_all('a', href=lambda href: href and '/en/players/' in href and '/overview' in href)
player_overview_urls = []
for link in player_links:
          full_url = "https://www.atptour.com" + link['href']
          if full_url not in player_overview_urls:
            player_overview_urls.append(full_url)
player_overview_urls[0:5]

['https://www.atptour.com/en/players/carlos-alcaraz/a0e2/overview',
 'https://www.atptour.com/en/players/jannik-sinner/s0ag/overview',
 'https://www.atptour.com/en/players/alexander-zverev/z355/overview',
 'https://www.atptour.com/en/players/novak-djokovic/d643/overview',
 'https://www.atptour.com/en/players/felix-auger-aliassime/ag37/overview']

In [7]:
player_overview_urls = []

for ranking_date in ranking_values:
    # Construct the full URL for the current ranking date
    ranking_url = f"https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek={ranking_date}&SortField=null&SortAscending=null"

    # Make an HTTP GET request
    try:
        response = requests.get(ranking_url)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
        html_content = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {ranking_date}: {e}")
        continue

    # Implement a brief delay to avoid overwhelming the server
    time.sleep(1)

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all player profile links
    player_links = soup.find_all('a', href=lambda href: href and '/en/players/' in href and '/overview' in href)

    current_date_player_urls = set()
    for link in player_links:
        href = link.get('href')
        if href:
            full_player_url = "https://www.atptour.com" + href
            current_date_player_urls.add(full_player_url)

    # Convert the set to a list and append to the main list with the ranking date
    for player_url in list(current_date_player_urls):
        player_overview_urls.append({
            'date': ranking_date,
            'player_url': player_url
        })

print(f"Total unique player URLs collected across all dates: {len(player_overview_urls)}")

Total unique player URLs collected across all dates: 23760


In [19]:
#Generate a list only with all the unique player urls
player_urls = [item['player_url'] for item in player_overview_urls]
unique_player_urls = list(set(player_urls))
len(unique_player_urls)

6268

In [37]:
#Activity URL for each player
player_activity_urls = []
activity_suffix = "player-activity?matchType=Singles&year=2025&tournament=all"

for url in unique_player_urls:
    # Replace "overview" for the string activity_suffix
    activity_url = url.replace("/overview", f"/{activity_suffix}")
    player_activity_urls.append(activity_url)
player_activity_urls[:5]

['https://www.atptour.com/en/players/logan-smith/sy30/player-activity?matchType=Singles&year=2025&tournament=all',
 'https://www.atptour.com/en/players/iker-sevilla/s0w6/player-activity?matchType=Singles&year=2025&tournament=all',
 'https://www.atptour.com/en/players/wil-spencer/si72/player-activity?matchType=Singles&year=2025&tournament=all',
 'https://www.atptour.com/en/players/ignacio-parisca-romera/p0i5/player-activity?matchType=Singles&year=2025&tournament=all',
 'https://www.atptour.com/en/players/natthayut-nithithananont/n0co/player-activity?matchType=Singles&year=2025&tournament=all']

In [38]:
#Create a table with unique_player_urls and player_activity_urls in columns
players_df = pd.DataFrame({'player_overview_url': unique_player_urls, 'player_activity_url': player_activity_urls})
players_df.tail()

Unnamed: 0,player_overview_url,player_activity_url
6263,https://www.atptour.com/en/players/dmitry-resn...,https://www.atptour.com/en/players/dmitry-resn...
6264,https://www.atptour.com/en/players/sergio-call...,https://www.atptour.com/en/players/sergio-call...
6265,https://www.atptour.com/en/players/valerio-abo...,https://www.atptour.com/en/players/valerio-abo...
6266,https://www.atptour.com/en/players/boris-fassb...,https://www.atptour.com/en/players/boris-fassb...
6267,https://www.atptour.com/en/players/steve-johns...,https://www.atptour.com/en/players/steve-johns...


In [39]:
# Function to extract the id of each player and saving in the table.
def extract_player_id(url: str) -> str:

    if not isinstance(url, str):
        return None
    parts = url.strip("/").split("/")
    # Últimos elementos: [..., player_id, 'overview']
    if len(parts) >= 2:
        return parts[-2]
    return None

players_df["player_id"] = players_df["player_overview_url"].apply(extract_player_id)
players_df.head()

Unnamed: 0,player_overview_url,player_activity_url,player_id
0,https://www.atptour.com/en/players/logan-smith...,https://www.atptour.com/en/players/logan-smith...,sy30
1,https://www.atptour.com/en/players/iker-sevill...,https://www.atptour.com/en/players/iker-sevill...,s0w6
2,https://www.atptour.com/en/players/wil-spencer...,https://www.atptour.com/en/players/wil-spencer...,si72
3,https://www.atptour.com/en/players/ignacio-par...,https://www.atptour.com/en/players/ignacio-par...,p0i5
4,https://www.atptour.com/en/players/natthayut-n...,https://www.atptour.com/en/players/natthayut-n...,n0co


In [40]:
BASE_ACTIVITY_URL = "https://www.atptour.com/en/-/www/activity/sgl/{player_id}/?v=1"

#Funtion to download the activity JSON of each player
def fetch_activity_json(player_id: str, session: requests.Session = None, timeout: int = 20):

    url = BASE_ACTIVITY_URL.format(player_id=player_id)
    sess = session or requests.Session()

    try:
        resp = sess.get(url, timeout=timeout)
        if resp.status_code != 200:
            print(f"[WARN] {player_id}: status {resp.status_code}")
            return None
        return resp.json()
    except Exception as e:
        print(f"[ERROR] {player_id}: {e}")
        return None

In [41]:
def activity_json_to_rows(player_id: str, data: dict) -> list[dict]:

    if data is None:
        return []

    activity = data.get("Activity", [])
    rows = []

    for year_block in activity:
        year = year_block.get("EventYear")

        tournaments = year_block.get("Tournaments", [])
        for t in tournaments:
            rows.append({
                "player_id": player_id,
                "year": year,
                "event_id": t.get("EventId"),
                "event_name": t.get("EventName"),
                "event_title": t.get("EventDisplayName"),
                "prize_raw": t.get("Prize"),          # prize in local money
                "currency": t.get("CurrSymbol"),      # "$", "€", "£"...
                "prize_usd": t.get("PrizeUsd"),       # prize in USD
            })

    return rows

In [42]:
# Descargar la información de los jugadores que están en la primera tabla descargada.
session = requests.Session()

all_rows = []

for i, row in players_df.iterrows():
    player_id = row["player_id"]

    # Saltar si falta el id
    if pd.isna(player_id):
        continue

    print(f"[{i+1}/{len(players_df)}] Descargando activity de {player_id}...")

    resp = fetch_activity_json(player_id=player_id, session=session)

    # Si falló la descarga, seguimos con el siguiente
    if resp is None:
        continue

    player_rows = activity_json_to_rows(player_id=player_id, data=resp)

    # Si el jugador no tiene actividad, seguimos
    if not player_rows:
        continue

    # Agregamos las filas de este jugador a la lista global
    all_rows.extend(player_rows)

    # Pequeña pausa para no pegarle tan fuerte a la web
    time.sleep(0.3)

# Crear el DataFrame final con TODOS los jugadores
activity_all_df = pd.DataFrame(all_rows)

print(activity_all_df.head())
print(activity_all_df.shape)

[1;30;43mSe truncaron las últimas líneas 5000 del resultado de transmisión.[0m
[1293/6268] Descargando activity de s0gw...
[1294/6268] Descargando activity de v0cg...
[1295/6268] Descargando activity de sn56...
[1296/6268] Descargando activity de l702...
[1297/6268] Descargando activity de mq75...
[1298/6268] Descargando activity de so25...
[1299/6268] Descargando activity de j0a9...
[1300/6268] Descargando activity de t0bd...
[1301/6268] Descargando activity de p0ge...
[1302/6268] Descargando activity de b0qx...
[1303/6268] Descargando activity de w421...
[1304/6268] Descargando activity de a0ay...
[1305/6268] Descargando activity de mp50...
[1306/6268] Descargando activity de sx91...
[1307/6268] Descargando activity de c0bb...
[1308/6268] Descargando activity de t0e3...
[1309/6268] Descargando activity de i308...
[1310/6268] Descargando activity de m0o8...
[1311/6268] Descargando activity de lh74...
[1312/6268] Descargando activity de v691...
[1313/6268] Descargando activity de af7

In [43]:
#Save the dataset
activity_all_df.to_csv('players_activity.csv', index=False)