<a href="https://colab.research.google.com/github/nicolasrojasv/nicolasrojasv.github.io/blob/main/challenge_6_atp_ranking_data_across_seasons.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Retrieving tennis player data from the ATP website across seasons
Extract, clean, and save the ATP singles ranking data published on the last available December ranking date of 2024, 2020, 2016, 2012, 2008, and 2004. The data is scraped from the ATP website and written to one JSON file per year, named `atp_ranking_YYYY.json`.

In [1]:
#Load libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import time
import datetime
from io import StringIO

In [None]:
# Download the ATP singles rankings page and parse its HTML.
url = "https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek=2025-12-15&SortField=null&SortAscending=null"
# Without a timeout, requests waits indefinitely if the server never answers,
# which would leave the cell hanging on a fresh "Run All".
response = requests.get(url, timeout=30)
response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
html_content = response.text
soup = BeautifulSoup(html_content, 'html.parser')

In [None]:
# Locate the <select> element(s) with id "dateWeek-filter", which hold the
# dates that have ranking data available.
ranking_weeks = soup.find_all('select', attrs={'id': 'dateWeek-filter'})

In [None]:
# Collect the raw `value` attribute of every <option> in the week selector(s).
# `.get('value')` yields None for an <option> that has no `value` attribute.
ranking_values_initial = [
    option_tag.get('value')
    for select_tag in ranking_weeks
    for option_tag in select_tag.find_all('option')
]

# Parse each value once, keeping:
#   - filtered_ranking_values: every date string with year > 1999
#   - last_december_dates:     the latest December date seen for each year
filtered_ranking_values = []
last_december_dates = {}
for value in ranking_values_initial:
    # Skip missing/empty values (strptime would raise TypeError on None) and
    # the non-date "Current Week" entry.
    if not value or value == 'Current Week':
        continue
    try:
        date_obj = datetime.datetime.strptime(value, '%Y-%m-%d')
    except ValueError:
        # Not a YYYY-MM-DD date: ignore it, this is a best-effort filter.
        continue
    if date_obj.year <= 1999:
        continue
    filtered_ranking_values.append(value)

    if date_obj.month == 12:
        year = date_obj.year
        # Keep the latest December date for each year
        if year not in last_december_dates or date_obj > last_december_dates[year]:
            last_december_dates[year] = date_obj

# Convert the dictionary values back to string format and sort them in descending order
ranking_values = sorted(
    (date_obj.strftime('%Y-%m-%d') for date_obj in last_december_dates.values()),
    reverse=True,
)

print("Last December dates for each year (after 1999):")
for date in ranking_values:
    print(date)

Last December dates for each year (after 1999):
2025-12-22
2024-12-30
2023-12-25
2022-12-26
2021-12-27
2020-12-28
2019-12-30
2018-12-31
2017-12-25
2016-12-26
2015-12-28
2014-12-29
2013-12-30
2012-12-31
2011-12-26
2010-12-27
2009-12-28
2008-12-29
2007-12-31
2006-12-25
2005-12-26
2004-12-27
2003-12-29
2002-12-30
2001-12-31
2000-12-25


In [None]:
# Narrow `ranking_values` down to the dates whose year is one of the target
# seasons: 2024, 2020, 2016, 2012, 2008 and 2004.
target_years = [2024, 2020, 2016, 2012, 2008, 2004]
filtered_ranking_values_for_target_years = [
    ranking_date
    for ranking_date in ranking_values
    if int(ranking_date[:4]) in target_years  # first four characters are the year
]

print("Filtered ranking values for target years:")
for selected_date in filtered_ranking_values_for_target_years:
    print(selected_date)

Filtered ranking values for target years:
2024-12-30
2020-12-28
2016-12-26
2012-12-31
2008-12-29
2004-12-27


In [None]:
# For each target date: fetch the ranking page, clean the ranking table and
# save it as a separate JSON file named atp_ranking_<year>.json.

def _clean_ranking_table(player_data):
    """Return a cleaned copy of a raw ranking table read from the ATP page.

    Parameters
    ----------
    player_data : pandas.DataFrame
        Raw table containing at least the columns "Hidden header", "Player",
        "Age", "Official Points" and "Tourn Played".

    Returns
    -------
    pandas.DataFrame
        Columns, in order: rank, player_name, age, points, n_tournaments,
        diff_rank_three, last_name. The numeric columns are converted with
        ``errors='coerce'``, so unparseable cells become NaN.

    Raises
    ------
    KeyError
        If one of the expected source columns is missing.
    ValueError
        If fewer than three rows remain after removing ad rows, so the
        rank-three reference points cannot be determined.
    """
    column_names = {
        "Hidden header": "rank",
        "Player": "player_name",
        "Age": "age",
        "Official Points": "points",
        "Tourn Played": "n_tournaments",
    }
    player_data_clean = player_data[list(column_names)].rename(columns=column_names)

    # Strip any leading run of digits, dashes and whitespace from the names
    player_data_clean['player_name'] = player_data_clean['player_name'].str.replace(r'^[-\d\s]+', '', regex=True)

    # Remove injected ad rows by content instead of by a fixed index label, so
    # the check still works if the row moves; na=False keeps rows with a
    # missing name from raising.
    is_ad_row = player_data_clean['player_name'].str.contains("googletag.cmd.push", regex=False, na=False)
    player_data_clean = player_data_clean.loc[~is_ad_row].copy()

    # Convert 'points' to numeric
    player_data_clean['points'] = pd.to_numeric(player_data_clean['points'], errors='coerce')

    # Absolute points difference between the third listed player and everyone
    # else. Positional lookup is used because index labels may have gaps once
    # ad rows are dropped.
    if len(player_data_clean) < 3:
        raise ValueError("Ranking table has fewer than 3 player rows; cannot compute diff_rank_three")
    points_rank_three = player_data_clean['points'].iloc[2]
    player_data_clean["diff_rank_three"] = (player_data_clean["points"] - points_rank_three).abs()

    # Last name = everything after the first space; '' when the name has no
    # space or is missing.
    player_data_clean['last_name'] = (
        player_data_clean['player_name'].str.split(' ', n=1).str[1].fillna('')
    )

    # Ranks may carry a 'T' marker; remove it before converting to a number
    player_data_clean['rank'] = player_data_clean['rank'].astype(str).str.replace('T', '', regex=False)
    for numeric_column in ('rank', 'age', 'n_tournaments'):
        player_data_clean[numeric_column] = pd.to_numeric(player_data_clean[numeric_column], errors='coerce')

    return player_data_clean


for date_to_process in filtered_ranking_values_for_target_years:
    year = date_to_process[:4]
    print(f"Processing data for year: {year} and date: {date_to_process}")

    # Construct the URL for the current date
    url = f"https://www.atptour.com/en/rankings/singles?rankRange=0-5000&region=all&dateWeek={date_to_process}&SortField=null&SortAscending=null"

    # Make the request; the timeout prevents the loop from hanging forever on
    # an unresponsive server.
    response = requests.get(url, timeout=30)
    response.raise_for_status()  # Raise an HTTPError for bad responses (4xx or 5xx)
    html_content = response.text
    # NOTE(review): `soup` is not read anywhere in this cell; it is kept only in
    # case a later cell relies on it. Confirm and remove if unused.
    soup = BeautifulSoup(html_content, 'html.parser')

    # Read every HTML table on the page (StringIO avoids the literal-HTML
    # deprecation in pandas) and take the second one as the ranking table.
    dfs_tables = pd.read_html(StringIO(html_content))
    player_data = dfs_tables[1]

    player_data_clean = _clean_ranking_table(player_data)

    # Save the cleaned data to a JSON file
    save_path = f'atp_ranking_{year}.json'
    player_data_clean.to_json(save_path, orient='records', indent=4)
    print(f"Saved data for {year} to {save_path}")

    # Add a small delay to avoid overwhelming the server
    time.sleep(1)

print("Data extraction and saving complete for all target years.")

Processing data for year: 2024 and date: 2024-12-30
Saved data for 2024 to atp_ranking_2024.json
Processing data for year: 2020 and date: 2020-12-28
Saved data for 2020 to atp_ranking_2020.json
Processing data for year: 2016 and date: 2016-12-26
Saved data for 2016 to atp_ranking_2016.json
Processing data for year: 2012 and date: 2012-12-31
Saved data for 2012 to atp_ranking_2012.json
Processing data for year: 2008 and date: 2008-12-29
Saved data for 2008 to atp_ranking_2008.json
Processing data for year: 2004 and date: 2004-12-27
Saved data for 2004 to atp_ranking_2004.json
Data extraction and saving complete for all target years.
