# Data Retrieval

In [4]:
import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import time

In [5]:
# Inclusive range of seasons to collect; the last season is the previous
# calendar year.
START_YEAR = 2009
END_YEAR = datetime.date.today().year - 1

# Retry budget for a single request.
MAX_RETRIES = 5

# Ranking cut-offs: players ranked below these are not collected.
MAX_SINGLES = 750
MAX_DOUBLES = 250

# COVID-19 exclusions. The player ids below appear in the 2020 / 2021
# rankings even though the players did not play that season, so they are
# skipped in the calculations for the matching year.
COVID_2020 = 2020
COVID_2020_SINGLES_NON_PLAYERS = [
    "d683", "a887", "g710", "kd46", "t742",
    "b0cd", "v762", "n555", "g676", "sc73",
    "ab17", "g717", "ba98", "sf62", "hc21",
    "va70", "j439", "sx96", "g0ab", "gb41",
    "d776", "hb55", "gc54", "le38", "pc78",
    "bc72", "j557", "dc58", "g935",
]
COVID_2020_DOUBLES_NON_PLAYERS = [
    "mc10", "me52", "e831", "a887", "t742",
    "bk40", "sc54", "g820",
]

COVID_2021 = 2021
COVID_2021_SINGLES_NON_PLAYERS = [
    "e831", "ka98", "be69", "d632", "t742",
    "ch27", "j397", "a941", "h896", "k964",
    "g725", "pj32", "co06", "sx90", "sc73",
    "q948", "ab17", "n670", "o408", "ge73",
    "gd08", "sf62", "w571", "bg72", "si36",
    "bh87",
]
COVID_2021_DOUBLES_NON_PLAYERS = [
    "b588", "b589", "p269", "h939",
]

In [3]:
def collect_singles_overview_data(year, day):
    """Scrape the ATP singles rankings for a December date and save them.

    Requests the singles rankings page (rank range 1-5000) for
    ``{year}-12-{day}``, builds one record per player row and writes the
    records to ``data/players/overview/singles/{year}.gz`` as a
    gzip-compressed CSV.

    Args:
        year: Ranking year, interpolated into the ``rankDate`` query value.
        day: Day of December, interpolated into the ``rankDate`` query value.

    Returns:
        ``True`` when ranking rows were found and the CSV was written.
        ``False`` when the page has no ranking table or the table has no
        data rows, which the caller treats as "this date is not a Monday".
    """
    url = f"https://www.atptour.com/en/rankings/singles?rankRange=1-5000&rankDate={year}-12-{day}"

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    ranking_table = soup.find("table", {"class": "mega-table desktop-table"})

    # No table at all: there are no rankings for this date (not a Monday).
    if ranking_table is None:
        return False

    rows = ranking_table.find_all("tr")

    # Nothing beyond the first row (presumably the header): no ranking data,
    # so do not write an empty file and report the date as unusable.
    if len(rows) <= 1:
        return False

    singles = []
    for row in rows[1:]:
        cells = row.find_all("td")
        if len(cells) <= 1:
            continue

        player_cell = cells[1]
        href = player_cell.find("a")["href"]

        # The rank text may carry a "T" marker (presumably for ties); drop it.
        ranking = int(cells[0].text.strip().replace("T", ""))
        name = player_cell.find("li", {"class": "name center"}).text.strip()
        player_id = href.split("/")[4]
        link = "https://www.atptour.com" + href

        # The country code is the 7th path segment of the flag image URL.
        # Fall back to an empty country when the path is too short, exactly
        # as collect_doubles_overview_data does, instead of raising
        # IndexError.
        flag_image = player_cell.find("img", {"class": "flag"})["src"].split("/")
        if len(flag_image) < 7:
            country = ""
        else:
            country = flag_image[6].replace(".svg", "")

        age_text = cells[2].text.strip()
        age = int(age_text) if age_text else None

        points = int(cells[3].text.strip().replace(",", ""))
        tournaments_played = int(cells[5].text.strip())

        singles.append({
            "ranking": ranking,
            "country": country,
            "name": name,
            "player_id": player_id,
            "link": link,
            "age": age,
            "points": points,
            "tournaments_played": tournaments_played
        })

    df = pd.DataFrame(singles)
    df.to_csv(f"data/players/overview/singles/{year}.gz", index=False, compression="gzip")
    return True

In [4]:
def collect_doubles_overview_data(year, day):
    """Scrape the ATP doubles rankings for a December date and save them.

    Requests the doubles rankings page (rank range 1-5000) for
    ``{year}-12-{day}``, builds one record per player row and writes the
    records to ``data/players/overview/doubles/{year}.gz`` as a
    gzip-compressed CSV.

    Args:
        year: Ranking year, interpolated into the ``rankDate`` query value.
        day: Day of December, interpolated into the ``rankDate`` query value.

    Raises:
        RuntimeError: If the page contains no ranking table. Previously this
            surfaced as an opaque ``AttributeError`` on ``None``.
    """
    url = f"https://www.atptour.com/en/rankings/doubles?rankRange=1-5000&rankDate={year}-12-{day}"

    response = requests.get(url)
    soup = BeautifulSoup(response.text, "html.parser")
    ranking_table = soup.find("table", {"class": "mega-table desktop-table"})

    # soup.find returns None when nothing matches; fail with a message that
    # names the date rather than crashing on None.find_all below.
    if ranking_table is None:
        raise RuntimeError(f"No doubles ranking table found for {year}-12-{day}")

    rows = ranking_table.find_all("tr")

    doubles = []
    for row in rows[1:]:
        cells = row.find_all("td")
        if len(cells) <= 1:
            continue

        player_cell = cells[1]
        href = player_cell.find("a")["href"]

        # The rank text may carry a "T" marker (presumably for ties); drop it.
        ranking = int(cells[0].text.strip().replace("T", ""))
        name = player_cell.find("li", {"class": "name center"}).text.strip()
        player_id = href.split("/")[4]
        link = "https://www.atptour.com" + href

        # The country code is the 7th path segment of the flag image URL;
        # shorter paths yield an empty country.
        flag_image = player_cell.find("img", {"class": "flag"})["src"].split("/")
        if len(flag_image) < 7:
            country = ""
        else:
            country = flag_image[6].replace(".svg", "")

        age_text = cells[2].text.strip()
        age = int(age_text) if age_text else None

        points = int(cells[3].text.strip().replace(",", ""))
        tournaments_played = int(cells[5].text.strip())

        doubles.append({
            "ranking": ranking,
            "country": country,
            "name": name,
            "player_id": player_id,
            "link": link,
            "age": age,
            "points": points,
            "tournaments_played": tournaments_played
        })

    df = pd.DataFrame(doubles)
    df.to_csv(f"data/players/overview/doubles/{year}.gz", index=False, compression="gzip")

In [66]:
# Collect the year-end overview rankings for every season.
for year in range(START_YEAR, END_YEAR + 1):
    # Walk back from 31 December to 25 December; the first date for which the
    # singles rankings can be collected (the last Monday of the year) is also
    # used for the doubles rankings.
    for day in reversed(range(25, 32)):
        if collect_singles_overview_data(year, day):
            collect_doubles_overview_data(year, day)
            break

In [6]:
def clean_activity_link(link):
    """Return ``link`` with "alex-bogomolov-jr." replaced by "alex-bogomolov-jr"."""
    return link.replace("alex-bogomolov-jr.", "alex-bogomolov-jr")

In [13]:
def collect_singles_activity_data(year):
    """Collect per-tournament singles activity for the ranked players of a year.

    Reads ``data/players/overview/singles/{year}.gz`` and, for every player
    ranked within ``MAX_SINGLES`` who is not listed as a COVID non-player for
    that year, requests the ATP singles activity endpoint. One record per
    player (overview fields, season totals and the list of tournaments) is
    written to ``data/players/activity/singles/{year}.gz`` as a
    gzip-compressed CSV.

    Args:
        year: Season whose overview file is read and whose activity is
            requested.

    Raises:
        Exception: If no usable response is obtained for a player after
            ``MAX_RETRIES + 1`` attempts.
    """
    singles = pd.read_csv(f"data/players/overview/singles/{year}.gz")

    players = []
    for _, row in singles.iterrows():
        # only collect data for players ranked within MAX_SINGLES
        if row["ranking"] > MAX_SINGLES:
            continue

        # skip this player because they have no singles data for 2020 or 2021 (due to COVID-19)
        if (row["player_id"] in COVID_2020_SINGLES_NON_PLAYERS and year == COVID_2020) or \
            (row["player_id"] in COVID_2021_SINGLES_NON_PLAYERS and year == COVID_2021):
            continue

        # The URL does not change between attempts, so build it once.
        activity_link = f"https://www.atptour.com/en/-/www/activity/sgl/{row['player_id']}//{year}?v=1"
        clean_link = clean_activity_link(activity_link)

        response = ""
        count = 0
        while response == "":
            # Linear back-off: wait 0s, 1s, 2s, ... before successive attempts.
            time.sleep(count)
            count += 1

            try:
                response = requests.get(clean_link).json()
            except ValueError:
                # Body was empty or not valid JSON; treat it like an empty
                # response so the retry loop gets to try again.
                response = ""

            # Give up only when the final attempt also failed; a successful
            # last attempt must not be discarded.
            if response == "" and count > MAX_RETRIES:
                raise Exception(f"Failed to get data for {row['name']}")

        total_wins = int(response["Won"])
        total_losses = int(response["Lost"])
        total_prize_money = int(response["Prize"])

        tournaments = response["Activity"][0]["Tournaments"]

        tournaments_played = []
        for tourney in tournaments:
            # "HiRound" may lack a "ShortName"; record "-" in that case.
            hi_round = tourney["HiRound"]
            round_reached = hi_round["ShortName"] if "ShortName" in hi_round else "-"

            tournaments_played.append({
                "tournament_type": tourney["EventType"],
                "tournament_name": tourney["EventName"],
                "wins": int(tourney["Won"]),
                "losses": int(tourney["Lost"]),
                "points": int(tourney["Points"]),
                "prize_money": int(tourney["PrizeUsd"]),
                "round_reached": round_reached
            })

        # include all other player info as well
        players.append({
            "ranking": row["ranking"],
            "country": row["country"],
            "name": row["name"],
            "player_id": row["player_id"],
            "link": row["link"],
            "age": row["age"],
            "points": row["points"],
            "wins": total_wins,
            "losses": total_losses,
            "prize_money": total_prize_money,
            "tournament_played": row["tournaments_played"],
            "tournaments": tournaments_played
        })

    df = pd.DataFrame(players)
    df.to_csv(f"data/players/activity/singles/{year}.gz", index=False, compression="gzip")

In [14]:
def collect_doubles_activity_data(year):
    """Collect per-tournament doubles activity for the ranked players of a year.

    Reads ``data/players/overview/doubles/{year}.gz`` and, for every player
    ranked within ``MAX_DOUBLES`` who is not listed as a COVID non-player for
    that year, requests the ATP doubles activity endpoint. One record per
    player (overview fields, season totals and the list of tournaments) is
    written to ``data/players/activity/doubles/{year}.gz`` as a
    gzip-compressed CSV.

    Args:
        year: Season whose overview file is read and whose activity is
            requested.

    Raises:
        Exception: If no usable response is obtained for a player after
            ``MAX_RETRIES + 1`` attempts.
    """
    doubles = pd.read_csv(f"data/players/overview/doubles/{year}.gz")

    players = []
    for _, row in doubles.iterrows():
        # only collect data for players ranked within MAX_DOUBLES
        if row["ranking"] > MAX_DOUBLES:
            continue

        # skip this player because they have no doubles data for 2020 or 2021 (due to COVID-19)
        if (row["player_id"] in COVID_2020_DOUBLES_NON_PLAYERS and year == COVID_2020) or \
            (row["player_id"] in COVID_2021_DOUBLES_NON_PLAYERS and year == COVID_2021):
            continue

        # The URL does not change between attempts, so build it once.
        activity_link = f"https://www.atptour.com/en/-/www/activity/dbl/{row['player_id']}//{year}?v=1"
        clean_link = clean_activity_link(activity_link)

        response = ""
        count = 0
        while response == "":
            # Linear back-off: wait 0s, 1s, 2s, ... before successive attempts.
            time.sleep(count)
            count += 1

            try:
                response = requests.get(clean_link).json()
            except ValueError:
                # Body was empty or not valid JSON; treat it like an empty
                # response so the retry loop gets to try again.
                response = ""

            # Give up only when the final attempt also failed; a successful
            # last attempt must not be discarded.
            if response == "" and count > MAX_RETRIES:
                raise Exception(f"Failed to get data for {row['name']}")

        total_wins = int(response["Won"])
        total_losses = int(response["Lost"])
        total_prize_money = int(response["Prize"])

        tournaments = response["Activity"][0]["Tournaments"]

        tournaments_played = []
        for tourney in tournaments:
            # "HiRound" may lack a "ShortName"; record "-" in that case.
            hi_round = tourney["HiRound"]
            round_reached = hi_round["ShortName"] if "ShortName" in hi_round else "-"

            tournaments_played.append({
                "tournament_type": tourney["EventType"],
                "tournament_name": tourney["EventName"],
                "wins": int(tourney["Won"]),
                "losses": int(tourney["Lost"]),
                "points": int(tourney["Points"]),
                "prize_money": int(tourney["PrizeUsd"]),
                "round_reached": round_reached
            })

        # include all other player info as well
        players.append({
            "ranking": row["ranking"],
            "country": row["country"],
            "name": row["name"],
            "player_id": row["player_id"],
            "link": row["link"],
            "age": row["age"],
            "points": row["points"],
            "wins": total_wins,
            "losses": total_losses,
            "prize_money": total_prize_money,
            "tournament_played": row["tournaments_played"],
            "tournaments": tournaments_played
        })

    df = pd.DataFrame(players)
    df.to_csv(f"data/players/activity/doubles/{year}.gz", index=False, compression="gzip")

In [16]:
# Collect the activity data for every season, singles first and then doubles.
for year in range(START_YEAR, END_YEAR + 1):
    for collect_activity in (collect_singles_activity_data, collect_doubles_activity_data):
        collect_activity(year)