<a href="https://colab.research.google.com/github/nicolasrojasv/nicolasrojasv.github.io/blob/main/scrape_atp_players_overview.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Extracting Tennis player overview data from the ATP website**


In [None]:
#Load libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import datetime
from io import StringIO
import re
from typing import Dict, Any, Optional
import random

In [None]:
# Download the ATP singles ranking page (ranks 0-5000, week of 2025-12-15)
# that lists the professional players of the 2025 season.
url = "https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek=2025-12-15&SortField=null&SortAscending=null"
# Without a timeout requests waits indefinitely if the server never answers
response = requests.get(url, timeout=30)
response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
html_content = response.text
# Parse the page once; later cells query this soup object
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# Locate the <select> element(s) with id "dateWeek-filter"; their <option>
# children are read in the next cell to get the available ranking weeks
ranking_weeks = soup.find_all(name='select', attrs={'id': 'dateWeek-filter'})

In [None]:
# Collect the raw `value` attribute of every <option> in the week selector(s)
ranking_values_initial = [
    option_tag.get('value')
    for select_tag in ranking_weeks
    for option_tag in select_tag.find_all('option')
]

# Parse every value exactly once. In a single pass:
#   * keep the dates with year > 2003, and
#   * remember the latest December date seen for each of those years.
filtered_ranking_values = []
last_december_dates = {}
for value in ranking_values_initial:
    if value == 'Current Week':
        continue
    try:
        date_obj = datetime.datetime.strptime(value, '%Y-%m-%d')
    except (TypeError, ValueError):
        # TypeError: the <option> had no value attribute (value is None)
        # ValueError: the text is not a YYYY-MM-DD date
        continue
    if date_obj.year <= 2003:
        continue
    filtered_ranking_values.append(value)

    if date_obj.month == 12:
        year = date_obj.year
        # Keep the latest December date for each year
        if year not in last_december_dates or date_obj > last_december_dates[year]:
            last_december_dates[year] = date_obj

# Convert the kept dates back to strings, newest first
ranking_values = sorted(
    (date_obj.strftime('%Y-%m-%d') for date_obj in last_december_dates.values()),
    reverse=True,
)

print("Last December dates for each year:")
for date in ranking_values:
    print(date)

Last December dates for each year:
2025-12-29
2024-12-30
2023-12-25
2022-12-26
2021-12-27
2020-12-28
2019-12-30
2018-12-31
2017-12-25
2016-12-26
2015-12-28
2014-12-29
2013-12-30
2012-12-31
2011-12-26
2010-12-27
2009-12-28
2008-12-29
2007-12-31
2006-12-25
2005-12-26
2004-12-27


In [None]:
# Restrict the year-end ranking weeks to 2014 and later
ranking_values = list(filter(
    lambda week: datetime.datetime.strptime(week, '%Y-%m-%d').year >= 2014,
    ranking_values,
))
ranking_values

['2025-12-29',
 '2024-12-30',
 '2023-12-25',
 '2022-12-26',
 '2021-12-27',
 '2020-12-28',
 '2019-12-30',
 '2018-12-31',
 '2017-12-25',
 '2016-12-26',
 '2015-12-28',
 '2014-12-29']

In [None]:
# Retrieve the overview-page link of every player listed on the 2025 ranking page
player_links = soup.find_all('a', href=lambda href: href and '/en/players/' in href and '/overview' in href)
# dict.fromkeys removes duplicates in one pass while keeping first-seen order
# (the previous `not in list` check rescanned the whole list for every link)
player_overview_urls = list(dict.fromkeys(
    "https://www.atptour.com" + link['href'] for link in player_links
))
player_overview_urls[0:5]

['https://www.atptour.com/en/players/carlos-alcaraz/a0e2/overview',
 'https://www.atptour.com/en/players/jannik-sinner/s0ag/overview',
 'https://www.atptour.com/en/players/alexander-zverev/z355/overview',
 'https://www.atptour.com/en/players/novak-djokovic/d643/overview',
 'https://www.atptour.com/en/players/felix-auger-aliassime/ag37/overview']

In [None]:
# Extract the player overview URLs for every year-end ranking week in ranking_values
player_overview_urls = []

for ranking_date in ranking_values:
    # Construct the full URL for the current ranking date
    ranking_url = f"https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek={ranking_date}&SortField=null&SortAscending=null"

    # Make an HTTP GET request
    try:
        # The timeout prevents one unresponsive request from stalling the whole loop
        response = requests.get(ranking_url, timeout=30)
        response.raise_for_status() # Raise an HTTPError for bad responses (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching data for {ranking_date}: {e}")
        continue
    finally:
        # Brief delay after every request, failed ones included, so that a run
        # of errors does not turn into back-to-back requests against the server
        time.sleep(1)

    html_content = response.text

    # Parse the HTML content
    soup = BeautifulSoup(html_content, 'html.parser')

    # Find all player profile links (the filter already guarantees a non-empty href)
    player_links = soup.find_all('a', href=lambda href: href and '/en/players/' in href and '/overview' in href)

    # A set removes the duplicate links that appear within a single page
    current_date_player_urls = {
        "https://www.atptour.com" + link['href'] for link in player_links
    }

    # Record every URL together with the ranking date it was found on
    player_overview_urls.extend(
        {'date': ranking_date, 'player_url': player_url}
        for player_url in current_date_player_urls
    )

In [None]:
# Flatten the scraped records to their URLs, then drop the duplicates that
# come from players appearing in several ranking weeks
player_urls = list(map(lambda record: record['player_url'], player_overview_urls))
unique_player_urls = [*set(player_urls)]
len(unique_player_urls)

6270

In [None]:
# Preview the first five unique player overview URLs
unique_player_urls[:5]

['https://www.atptour.com/en/players/bernd-kossler/ke14/overview',
 'https://www.atptour.com/en/players/antoni-fabre/f0gf/overview',
 'https://www.atptour.com/en/players/jack-sock/sm25/overview',
 'https://www.atptour.com/en/players/dhruv-sunish/s0dk/overview',
 'https://www.atptour.com/en/players/tejas-chaukulkar/cg19/overview']

In [None]:
# Derive each player's 2025 singles activity URL from the overview URL
activity_suffix = "player-activity?matchType=Singles&year=2025&tournament=all"

# Swap the "/overview" segment for the activity suffix, keeping the same
# order as unique_player_urls
player_activity_urls = [
    overview_url.replace("/overview", "/" + activity_suffix)
    for overview_url in unique_player_urls
]
player_activity_urls[:5]

['https://www.atptour.com/en/players/bernd-kossler/ke14/player-activity?matchType=Singles&year=2025&tournament=all',
 'https://www.atptour.com/en/players/antoni-fabre/f0gf/player-activity?matchType=Singles&year=2025&tournament=all',
 'https://www.atptour.com/en/players/jack-sock/sm25/player-activity?matchType=Singles&year=2025&tournament=all',
 'https://www.atptour.com/en/players/dhruv-sunish/s0dk/player-activity?matchType=Singles&year=2025&tournament=all',
 'https://www.atptour.com/en/players/tejas-chaukulkar/cg19/player-activity?matchType=Singles&year=2025&tournament=all']

In [None]:
# Put the overview URLs and their matching activity URLs side by side,
# one row per player
players_df = pd.DataFrame(
    data={
        'player_overview_url': unique_player_urls,
        'player_activity_url': player_activity_urls,
    }
)
players_df.tail()

Unnamed: 0,player_overview_url,player_activity_url
6265,https://www.atptour.com/en/players/sam-weissbo...,https://www.atptour.com/en/players/sam-weissbo...
6266,https://www.atptour.com/en/players/dimitri-vid...,https://www.atptour.com/en/players/dimitri-vid...
6267,https://www.atptour.com/en/players/kazuya-tamu...,https://www.atptour.com/en/players/kazuya-tamu...
6268,https://www.atptour.com/en/players/drew-van-or...,https://www.atptour.com/en/players/drew-van-or...
6269,https://www.atptour.com/en/players/amr-elsayed...,https://www.atptour.com/en/players/amr-elsayed...


In [None]:
# Function to extract the id of each player from its overview URL.
def extract_player_id(url: str) -> Optional[str]:
    """Return the second-to-last path segment of *url*, i.e. the player id.

    For ``https://www.atptour.com/en/players/jack-sock/sm25/overview`` the
    result is ``"sm25"``.

    Args:
        url: Player URL whose last segments are ``[..., player_id, page]``.

    Returns:
        The player id, or ``None`` when *url* is not a string or has fewer
        than two ``/``-separated parts.
    """
    if not isinstance(url, str):
        return None
    # Drop surrounding whitespace first: a trailing space or newline would
    # otherwise survive strip("/") and shift which segment is second to last
    parts = url.strip().strip("/").split("/")
    # Last elements: [..., player_id, 'overview']
    if len(parts) >= 2:
        return parts[-2]
    return None

# Add the player id column, computed element-wise from each overview URL
players_df["player_id"] = players_df["player_overview_url"].map(extract_player_id)
players_df.head()

Unnamed: 0,player_overview_url,player_activity_url,player_id
0,https://www.atptour.com/en/players/bernd-kossl...,https://www.atptour.com/en/players/bernd-kossl...,ke14
1,https://www.atptour.com/en/players/antoni-fabr...,https://www.atptour.com/en/players/antoni-fabr...,f0gf
2,https://www.atptour.com/en/players/jack-sock/s...,https://www.atptour.com/en/players/jack-sock/s...,sm25
3,https://www.atptour.com/en/players/dhruv-sunis...,https://www.atptour.com/en/players/dhruv-sunis...,s0dk
4,https://www.atptour.com/en/players/tejas-chauk...,https://www.atptour.com/en/players/tejas-chauk...,cg19


In [None]:
# Save the player URLs and ids to players_urls.csv (index column omitted)
players_df.to_csv('players_urls.csv', index=False)

In [None]:
# URL template of the ATP "hero" endpoint used to extract player
# characteristics; {player_id} is filled in for each player
BASE_URL = "https://www.atptour.com/en/-/www/players/hero/{player_id}?v=1"

# Headers sent with every request to the hero endpoint
HEADERS = {
    "User-Agent": "Mozilla/5.0",
    "Accept": "application/json",
    "Referer": "https://www.atptour.com/",
}

# Rate limiting to avoid stressing the ATP servers: bounds, in seconds, of the
# random pause (random.uniform) taken between two consecutive player requests
SLEEP_MIN = 0.01
SLEEP_MAX = 0.1

# Replace enum-like sub-dictionaries by their human-readable description
def simplify_enum_field(x: Any) -> Any:
    """Return ``x["Description"]`` for a dict that has it, otherwise *x* itself.

    Non-dict values and dicts without a "Description" key are passed through
    unchanged.
    """
    if not isinstance(x, dict):
        return x
    return x.get("Description", x)

# Keep only the leading fields of the original dictionary, up to and including "Active"
def cut_until_active(data: Dict[str, Any]) -> Dict[str, Any]:
    """Copy *data* in iteration order, stopping right after the "Active" key.

    If there is no "Active" key, every item is copied. The input is never
    modified; a new dict is returned.
    """
    ordered_keys = list(data)
    try:
        # Slice end is one past "Active" so that the key itself is kept
        stop = ordered_keys.index("Active") + 1
    except ValueError:
        stop = len(ordered_keys)
    return {name: data[name] for name in ordered_keys[:stop]}

# Download and post-process the ATP "hero" JSON of one player
def fetch_player_hero(player_id: str, session: requests.Session) -> Optional[Dict[str, Any]]:
    """Fetch one player's hero record and reduce it to a flat row.

    Args:
        player_id: Id substituted into BASE_URL.
        session: Session used to issue the GET request.

    Returns:
        The trimmed record with an added "player_id" key, or ``None`` when
        the endpoint answers 404 for this id.

    Raises:
        requests.HTTPError: for any other error status (via raise_for_status).
    """
    response = session.get(
        BASE_URL.format(player_id=player_id),
        headers=HEADERS,
        timeout=30,
    )

    # A 404 means the endpoint has nothing for this player id
    if response.status_code == 404:
        return None
    response.raise_for_status()

    # Decode the JSON body and drop every field that comes after 'Active'
    hero = cut_until_active(response.json())

    # Collapse the enum-like fields that are present to their descriptions
    enum_keys = ("PlayHand", "BackHand", "Active")
    hero.update({
        name: simplify_enum_field(hero[name])
        for name in enum_keys
        if name in hero
    })

    # Carry the id explicitly so every row can be matched back to its player
    hero["player_id"] = player_id
    return hero

# Build a player-level table by looping over a list of player_ids.
def build_players_table(player_ids, sleep: bool = True) -> pd.DataFrame:
    """Fetch the hero record of every player id and assemble one DataFrame.

    Args:
        player_ids: Iterable of player ids. Each id is converted with
            ``str()`` and stripped; missing ids (``None``, NaN, ``pd.NA``)
            and blank ids are skipped.
        sleep: When true, pause for a random time between SLEEP_MIN and
            SLEEP_MAX seconds after each player.

    Returns:
        One row per processed id, with "player_id" as the first column.
        Ids the endpoint does not know get ``error == "not_found"``; ids
        whose request raised get the exception text in ``error``.
    """
    rows = []

    with requests.Session() as session:
        for pid in player_ids:
            # Skip missing ids up front: str() would turn them into the
            # literal strings "None" / "nan" / "<NA>", which would then be
            # requested as if they were real player ids.
            if pid is None or pid is pd.NA or (isinstance(pid, float) and pid != pid):
                continue

            pid = str(pid).strip()
            if not pid:
                continue

            try:
                row = fetch_player_hero(pid, session)
                if row is not None:
                    rows.append(row)
                else:
                    # Track invalid or missing player ids
                    rows.append({"player_id": pid, "error": "not_found"})
            except Exception as e:
                # Deliberately broad: keep the pipeline running even if one
                # player fails, and record the failure in the table instead
                rows.append({"player_id": pid, "error": str(e)})

            if sleep:
                time.sleep(random.uniform(SLEEP_MIN, SLEEP_MAX))

    df = pd.DataFrame(rows)

    # Optional: normalize column names for analysis / merges
    # df.columns = (
    #     df.columns.str.lower()
    #       .str.replace(r"[^\w]+", "_", regex=True)
    #       .str.strip("_")
    # )

    # Move player_id to the first column for readability
    if "player_id" in df.columns:
        cols = ["player_id"] + [c for c in df.columns if c != "player_id"]
        df = df[cols]

    return df



In [None]:
# Pull the player ids out of the table as a plain Python list
player_ids = players_df["player_id"].to_list()
player_ids[0:10]

['ke14',
 'f0gf',
 'sm25',
 's0dk',
 'cg19',
 'cc57',
 'd0jz',
 'mo92',
 'z184',
 'r0e4']

In [None]:
# Scrape the characteristics of each player (one hero request per player id)
players_table = build_players_table(player_ids)

# Save the table to players_characteristics.csv (index column omitted)
players_table.to_csv('players_characteristics.csv', index=False)