In [21]:
import requests
from bs4 import BeautifulSoup
import datetime
import pandas as pd
import time

In [107]:
# ---------------------------------------------------------------------------
# Scrape configuration
# ---------------------------------------------------------------------------

# First and last ranking year to collect (both inclusive in the driver loops).
# END_YEAR is the calendar year before the one in which the notebook is run.
START_YEAR = 2009
END_YEAR = datetime.datetime.now().year - 1

# Number of fetch attempts the activity collectors make for one player page.
MAX_RETRIES = 5

# Ranking cut-offs: players ranked below these are skipped by the
# singles / doubles activity collectors respectively.
MAX_SINGLES = 750
MAX_DOUBLES = 250

# Player ids that the activity collectors skip for a specific year.
# Per the comments at the use sites, these ranked players have no activity
# data for that season because of COVID-19.
COVID_2020 = 2020
COVID_2020_SINGLES_NON_PLAYERS = ["d683", "a887", "g710", "kd46", "t742",
                                  "b0cd", "v762", "n555", "g676", "sc73",
                                  "ab17", "g717", "ba98", "sf62", "hc21",
                                  "va70", "j439", "sx96", "g0ab", "gb41",
                                  "d776", "hb55", "gc54", "le38", "pc78",
                                  "bc72", "j557", "dc58", "g935"]
COVID_2020_DOUBLES_NON_PLAYERS = ["mc10", "me52", "e831", "a887", "t742",
                                  "bk40", "sc54", "g820"]
COVID_2021 = 2021
COVID_2021_SINGLES_NON_PLAYERS = ["e831", "ka98", "be69", "d632", "t742",
                                  "ch27", "j397", "a941", "h896", "k964",
                                  "g725", "pj32", "co06", "sx90", "sc73",
                                  "q948", "ab17", "n670", "o408", "ge73",
                                  "gd08", "sf62", "w571", "bg72", "si36",
                                  "bh87"]
COVID_2021_DOUBLES_NON_PLAYERS = ["b588", "b589", "p269", "h939"]

In [4]:
def collect_singles_overview_data(year, day, timeout=30):
    """Scrape the ATP singles rankings for ``{year}-12-{day}`` and save them.

    The rankings page is requested for ranks 1-5000. When the page has a
    populated ``mega-table``, one record per player is written to
    ``data/players/overview/singles/{year}.gz`` (gzip-compressed CSV) with the
    columns ``ranking``, ``country``, ``name``, ``player_id``, ``link``,
    ``age``, ``points`` and ``tournaments_played``.

    Args:
        year: Ranking year, interpolated into the URL and the output file name.
        day: Day of December to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        ``True`` if rankings were found and written, ``False`` if the page
        has no ranking table or the table has no data rows (the caller treats
        this as "no rankings published on that date").

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://www.atptour.com/en/rankings/singles?rankRange=1-5000&rankDate={year}-12-{day}"

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    ranking_table = soup.find("table", {"class": "mega-table"})

    # return if there is no table meaning that the date is not a Monday
    if ranking_table is None:
        return False

    rows = ranking_table.find_all("tr")

    # return if there is no data meaning that the date is not a Monday;
    # "<= 1" also covers a table with no rows at all, which would otherwise
    # be written out as an empty file and reported as a success
    if len(rows) <= 1:
        return False

    singles = []
    # rows[0] is skipped as the header row
    for row in rows[1:]:
        cells = row.find_all("td")

        # the profile href is needed for both the id and the absolute link
        href = cells[3].find("a")["href"]
        age_text = cells[4].text.strip()

        singles.append({
            # a "T" in the rank cell is stripped (presumably marks a tie)
            "ranking": int(cells[0].text.strip().replace("T", "")),
            "country": cells[2].find("img")["alt"],
            "name": cells[3].text.strip(),
            # fifth "/"-separated component of the profile href
            "player_id": href.split("/")[4],
            "link": "https://www.atptour.com" + href,
            # the age cell can be blank
            "age": int(age_text) if age_text != "" else None,
            "points": int(cells[5].text.strip().replace(",", "")),
            "tournaments_played": int(cells[7].text.strip())
        })

    df = pd.DataFrame(singles)
    df.to_csv(f"data/players/overview/singles/{year}.gz", index=False, compression="gzip")
    return True

In [5]:
def collect_doubles_overview_data(year, day, timeout=30):
    """Scrape the ATP doubles rankings for ``{year}-12-{day}`` and save them.

    The rankings page is requested for ranks 1-5000. When the page has a
    populated ``mega-table``, one record per player is written to
    ``data/players/overview/doubles/{year}.gz`` (gzip-compressed CSV) with the
    columns ``ranking``, ``country``, ``name``, ``player_id``, ``link``,
    ``age``, ``points`` and ``tournaments_played``.

    Args:
        year: Ranking year, interpolated into the URL and the output file name.
        day: Day of December to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scrape indefinitely.

    Returns:
        ``True`` if rankings were found and written, ``False`` if the page
        has no ranking table or the table has no data rows. This mirrors
        ``collect_singles_overview_data``; a missing table is reported
        instead of failing with ``AttributeError`` on ``None``.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    url = f"https://www.atptour.com/en/rankings/doubles?rankRange=1-5000&rankDate={year}-12-{day}"

    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    ranking_table = soup.find("table", {"class": "mega-table"})

    # soup.find returns None when the page has no ranking table
    if ranking_table is None:
        return False

    rows = ranking_table.find_all("tr")

    # nothing beyond the header row: do not write an empty file
    if len(rows) <= 1:
        return False

    doubles = []
    # rows[0] is skipped as the header row
    for row in rows[1:]:
        cells = row.find_all("td")

        # the profile href is needed for both the id and the absolute link
        href = cells[3].find("a")["href"]
        age_text = cells[4].text.strip()

        doubles.append({
            # a "T" in the rank cell is stripped (presumably marks a tie)
            "ranking": int(cells[0].text.strip().replace("T", "")),
            "country": cells[2].find("img")["alt"],
            "name": cells[3].text.strip(),
            # fifth "/"-separated component of the profile href
            "player_id": href.split("/")[4],
            "link": "https://www.atptour.com" + href,
            # the age cell can be blank
            "age": int(age_text) if age_text != "" else None,
            "points": int(cells[5].text.strip().replace(",", "")),
            "tournaments_played": int(cells[7].text.strip())
        })

    df = pd.DataFrame(doubles)
    df.to_csv(f"data/players/overview/doubles/{year}.gz", index=False, compression="gzip")
    return True

In [None]:
# Collect the year-end overview rankings for every year in the configured range.
for year in range(START_YEAR, END_YEAR + 1):
    # Walk backwards from 31 December to 25 December; the first date for which
    # the singles scrape reports data is taken as the last Monday of the year.
    for day in reversed(range(25, 32)):
        success = collect_singles_overview_data(year, day)
        if success:
            # same date for doubles, then move on to the next year
            collect_doubles_overview_data(year, day)
            break

In [100]:
def clean_activity_link(link):
    """Return ``link`` with the known-bad player slug normalised.

    Every occurrence of ``alex-bogomolov-jr.`` (with a trailing dot) is
    replaced by ``alex-bogomolov-jr``; any other link is returned unchanged.
    Presumably the dotted slug does not resolve on the activity pages.
    """
    dotted_slug = "alex-bogomolov-jr."
    plain_slug = "alex-bogomolov-jr"
    return link.replace(dotted_slug, plain_slug)

In [101]:
def collect_singles_activity_data(year, timeout=30):
    """Scrape per-player singles activity for ``year`` and save it.

    Reads ``data/players/overview/singles/{year}.gz``, and for every player
    ranked within ``MAX_SINGLES`` (and not on the COVID skip list for that
    year) fetches the player-activity page for ``year``. The page is tried up
    to ``MAX_RETRIES`` times, sleeping 0, 1, 2, ... seconds before successive
    attempts. Players whose activity table never appears are skipped and
    their URL is printed.

    The result is written to ``data/players/activity/singles/{year}.gz``
    (gzip-compressed CSV), one row per player: the overview columns plus
    ``wins``, ``losses``, ``prize_money`` (season totals; ``$`` and commas
    removed) and ``tournaments`` (a list of per-tournament dicts with
    ``tournament_type``, ``tournament_name``, ``wins``, ``losses``,
    ``points`` and ``prize_money``). The per-tournament ``prize_money`` is
    the last space-separated token of the caption entry with commas removed;
    any currency symbol is kept. The output column ``tournament_played``
    carries the overview's ``tournaments_played`` value.

    Args:
        year: Ranking year to collect.
        timeout: Seconds to wait for each page before the attempt counts as
            failed. Without a timeout a stalled connection would block the
            scrape indefinitely.
    """
    singles = pd.read_csv(f"data/players/overview/singles/{str(year)}.gz")

    players = []
    for _, row in singles.iterrows():
        # only collect data for top 750 players
        if row["ranking"] > MAX_SINGLES:
            continue

        # skip this player because they have no singles activity data for
        # this year (2020 / 2021, due to COVID-19)
        if (row["player_id"] in COVID_2020_SINGLES_NON_PLAYERS and year == COVID_2020) or \
            (row["player_id"] in COVID_2021_SINGLES_NON_PLAYERS and year == COVID_2021):
            continue

        # the URL does not change between attempts, so build it once
        activity_link = row["link"].replace("overview", "player-activity") + "?year=" + str(year) + "&matchType=singles"
        clean_link = clean_activity_link(activity_link)

        activity_table = None
        soup = None
        for attempt in range(MAX_RETRIES):
            # linear back-off: no wait before the first attempt, then 1s, 2s, ...
            time.sleep(attempt)

            try:
                response = requests.get(clean_link, timeout=timeout)
            except requests.RequestException:
                # a dropped or timed-out connection is just a failed attempt;
                # aborting here would lose every player collected so far
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            activity_table = soup.find("div", {"data-filtered-module": "playerActivityTables"})
            if activity_table is not None:
                # success on any attempt, including the last one, is kept
                break

        if activity_table is None:
            # every attempt failed: report the URL and move on
            print(clean_link)
            continue

        activity_stats = soup.find("div", {"data-filtered-module": "playerActivityStats"})
        stats = activity_stats.find_all("div", {"class": "stat-value"})
        # stats[1] is a "wins-losses" record, stats[3] the prize money total
        record = stats[1].text.strip()
        total_wins = int(record.split("-")[0])
        total_losses = int(record.split("-")[1])
        total_prize_money = stats[3].text.strip().replace(",", "").replace("$", "")

        tournaments = activity_table.find_all("div", {"class": "activity-tournament-table"})

        tournaments_played = []
        for tourney in tournaments:
            tournament_info = tourney.find("table", {"class": "tourney-results-wrapper"})
            tournament_type = tournament_info.find("img")["alt"].strip()

            # the title is either a link or, failing that, a plain span
            title_tag = tournament_info.find("a", {"class": "tourney-title"})
            if title_tag is None:
                title_tag = tournament_info.find("span", {"class": "tourney-title"})
            tournament_name = title_tag.text.strip()

            match_info = tourney.find("table", {"class": "mega-table"})
            match_rows = match_info.find_all("tr")

            wins = 0
            losses = 0
            for match in match_rows:
                match_cells = match.find_all("td")
                # rows without a fourth cell (e.g. a header row) hold no result
                if len(match_cells) <= 3:
                    continue
                result = match_cells[3].text.strip()
                if result == "W":
                    wins += 1
                elif result == "L":
                    losses += 1

            other_info = tourney.find("div", {"class": "activity-tournament-caption"}).text.strip()
            other_info_split = other_info.split(", ")

            points = 0
            prize_money = ""
            for info in other_info_split:
                if "Points" in info:
                    # tolerate a thousands separator in the points value
                    points = int(info.split(" ")[-1].replace(",", ""))
                elif "Prize Money" in info:
                    prize_money = info.split(" ")[-1].replace(",", "")

            tournaments_played.append({
                "tournament_type": tournament_type,
                "tournament_name": tournament_name,
                "wins": wins,
                "losses": losses,
                "points": points,
                "prize_money": prize_money
            })

        # include all other player info as well
        players.append({
            "ranking": row["ranking"],
            "country": row["country"],
            "name": row["name"],
            "player_id": row["player_id"],
            "link": row["link"],
            "age": row["age"],
            "points": row["points"],
            "wins": total_wins,
            "losses": total_losses,
            "prize_money": total_prize_money,
            "tournament_played": row["tournaments_played"],
            "tournaments": tournaments_played
        })

    df = pd.DataFrame(players)
    df.to_csv(f"data/players/activity/singles/{str(year)}.gz", index=False, compression="gzip")

In [102]:
def collect_doubles_activity_data(year, timeout=30):
    """Scrape per-player doubles activity for ``year`` and save it.

    Reads ``data/players/overview/doubles/{year}.gz``, and for every player
    ranked within ``MAX_DOUBLES`` (and not on the COVID skip list for that
    year) fetches the player-activity page for ``year``. The page is tried up
    to ``MAX_RETRIES`` times, sleeping 0, 1, 2, ... seconds before successive
    attempts. Players whose activity table never appears are skipped and
    their URL is printed.

    The result is written to ``data/players/activity/doubles/{year}.gz``
    (gzip-compressed CSV), one row per player: the overview columns plus
    ``wins``, ``losses``, ``prize_money`` (season totals; ``$`` and commas
    removed) and ``tournaments`` (a list of per-tournament dicts with
    ``tournament_type``, ``tournament_name``, ``wins``, ``losses``,
    ``points`` and ``prize_money``). The per-tournament ``prize_money`` is
    the last space-separated token of the caption entry with commas removed;
    any currency symbol is kept. The output column ``tournament_played``
    carries the overview's ``tournaments_played`` value.

    Args:
        year: Ranking year to collect.
        timeout: Seconds to wait for each page before the attempt counts as
            failed. Without a timeout a stalled connection would block the
            scrape indefinitely.
    """
    doubles = pd.read_csv(f"data/players/overview/doubles/{str(year)}.gz")

    players = []
    for _, row in doubles.iterrows():
        # only collect data for top 250 players
        if row["ranking"] > MAX_DOUBLES:
            continue

        # skip this player because they have no doubles activity data for
        # this year (2020 / 2021, due to COVID-19)
        if (row["player_id"] in COVID_2020_DOUBLES_NON_PLAYERS and year == COVID_2020) or \
            (row["player_id"] in COVID_2021_DOUBLES_NON_PLAYERS and year == COVID_2021):
            continue

        # the URL does not change between attempts, so build it once
        activity_link = row["link"].replace("overview", "player-activity") + "?year=" + str(year) + "&matchType=doubles"
        clean_link = clean_activity_link(activity_link)

        activity_table = None
        soup = None
        for attempt in range(MAX_RETRIES):
            # linear back-off: no wait before the first attempt, then 1s, 2s, ...
            time.sleep(attempt)

            try:
                response = requests.get(clean_link, timeout=timeout)
            except requests.RequestException:
                # a dropped or timed-out connection is just a failed attempt;
                # aborting here would lose every player collected so far
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            activity_table = soup.find("div", {"data-filtered-module": "playerActivityTables"})
            if activity_table is not None:
                # success on any attempt, including the last one, is kept
                break

        if activity_table is None:
            # every attempt failed: report the URL and move on
            print(clean_link)
            continue

        activity_stats = soup.find("div", {"data-filtered-module": "playerActivityStats"})
        stats = activity_stats.find_all("div", {"class": "stat-value"})
        # stats[1] is a "wins-losses" record, stats[3] the prize money total
        record = stats[1].text.strip()
        total_wins = int(record.split("-")[0])
        total_losses = int(record.split("-")[1])
        total_prize_money = stats[3].text.strip().replace(",", "").replace("$", "")

        tournaments = activity_table.find_all("div", {"class": "activity-tournament-table"})

        tournaments_played = []
        for tourney in tournaments:
            tournament_info = tourney.find("table", {"class": "tourney-results-wrapper"})
            tournament_type = tournament_info.find("img")["alt"].strip()

            # the title is either a link or, failing that, a plain span
            title_tag = tournament_info.find("a", {"class": "tourney-title"})
            if title_tag is None:
                title_tag = tournament_info.find("span", {"class": "tourney-title"})
            tournament_name = title_tag.text.strip()

            match_info = tourney.find("table", {"class": "mega-table"})
            match_rows = match_info.find_all("tr")

            wins = 0
            losses = 0
            for match in match_rows:
                match_cells = match.find_all("td")
                # rows without a fourth cell (e.g. a header row) hold no result
                if len(match_cells) <= 3:
                    continue
                result = match_cells[3].text.strip()
                if result == "W":
                    wins += 1
                elif result == "L":
                    losses += 1

            other_info = tourney.find("div", {"class": "activity-tournament-caption"}).text.strip()
            other_info_split = other_info.split(", ")

            points = 0
            prize_money = ""
            for info in other_info_split:
                if "Points" in info:
                    # tolerate a thousands separator in the points value
                    points = int(info.split(" ")[-1].replace(",", ""))
                elif "Prize Money" in info:
                    prize_money = info.split(" ")[-1].replace(",", "")

            tournaments_played.append({
                "tournament_type": tournament_type,
                "tournament_name": tournament_name,
                "wins": wins,
                "losses": losses,
                "points": points,
                "prize_money": prize_money
            })

        # include all other player info as well
        players.append({
            "ranking": row["ranking"],
            "country": row["country"],
            "name": row["name"],
            "player_id": row["player_id"],
            "link": row["link"],
            "age": row["age"],
            "points": row["points"],
            "wins": total_wins,
            "losses": total_losses,
            "prize_money": total_prize_money,
            "tournament_played": row["tournaments_played"],
            "tournaments": tournaments_played
        })

    df = pd.DataFrame(players)
    df.to_csv(f"data/players/activity/doubles/{str(year)}.gz", index=False, compression="gzip")

In [109]:
# For each year in the configured range, collect player activity:
# singles first, then doubles.
for year in range(START_YEAR, END_YEAR + 1):
    for collect in (collect_singles_activity_data, collect_doubles_activity_data):
        collect(year)