# Data Retrieval

In [71]:
import requests
import datetime
import pandas as pd
from tqdm import tqdm

In [3]:
# Seasons to collect: from 2009 up to and including the last fully
# completed calendar year.
START_YEAR = 2009
END_YEAR = datetime.date.today().year - 1

# Retry budget for requests (not referenced by the code visible in this file).
MAX_RETRIES = 5

# Ranking cut-offs: only players ranked at or above these positions have
# their match activity collected.
MAX_SINGLES = 750
MAX_DOUBLES = 250

# COVID-19 bookkeeping. Some players appear in the 2020 or 2021 year-end
# rankings without having played that season. Their player ids are listed
# here (per season and per discipline) so that the activity collection can
# leave them out.
COVID_2020 = 2020
COVID_2021 = 2021
COVID_2020_SINGLES_NON_PLAYERS = []
COVID_2020_DOUBLES_NON_PLAYERS = []
COVID_2021_SINGLES_NON_PLAYERS = []
COVID_2021_DOUBLES_NON_PLAYERS = []

In [33]:
def collect_singles_overview_data(year, day):
    """Download the WTA singles rankings as of ``{year}-12-{day}`` and save them.

    The ranking endpoint is requested page by page (100 entries per page)
    until a page comes back without ranking entries. Every entry that carries
    a player id is flattened into one row, and the rows are written to
    ``data/players/overview/singles/{year}.gz`` as a gzip-compressed CSV.

    Args:
        year: Season to fetch; interpolated into the ``at`` query date.
        day: Day of December to fetch; interpolated into the ``at`` query
            date exactly as given (no zero padding is applied).

    Returns:
        ``True`` if the first page contained ranking entries and the CSV was
        written; ``False`` if the first page held none, in which case nothing
        is written.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
        ValueError: If a response body is not valid JSON.
        KeyError: If an entry has a player id but lacks one of the other
            fields copied into the row.
    """
    singles = []
    page_num = 0
    while True:
        url = f"https://api.wtatennis.com/tennis/players/ranked?page={page_num}&pageSize=100&type=rankSingles&name=&metric=SINGLES&at={year}-12-{day}"
        # Bound the request so a stalled connection cannot hang the run.
        response = requests.get(url, timeout=30)
        players = response.json()

        # Only a non-empty JSON array is a page of rankings. An empty array
        # marks the end of the data. Any other payload (for example an error
        # object) is treated the same way: it holds no ranking entries, and
        # continuing to page through such responses might never terminate.
        if not isinstance(players, list) or not players:
            break

        for player in players:
            try:
                player_id = player["player"]["id"]
            except (KeyError, TypeError):
                # Entry without a usable player record: nothing to store.
                continue

            singles.append({
                "ranking": player["ranking"],
                "country": player["player"]["countryCode"],
                "name": player["player"]["fullName"],
                "player_id": player_id,
                "date_of_birth": player["player"]["dateOfBirth"],
                "points": player["points"],
                "tournaments_played": player["tournamentsPlayed"]
            })

        page_num += 1

    # page_num only advances after a page with entries has been processed, so
    # zero means the very first page was empty.
    if page_num == 0:
        return False

    df = pd.DataFrame(singles)
    df.to_csv(f"data/players/overview/singles/{year}.gz", index=False, compression="gzip")
    return True

In [34]:
def collect_doubles_overview_data(year, day):
    """Download the WTA doubles rankings as of ``{year}-12-{day}`` and save them.

    The ranking endpoint is requested page by page (100 entries per page)
    until a page comes back without ranking entries. Every entry that carries
    a player id is flattened into one row, and the rows are written to
    ``data/players/overview/doubles/{year}.gz`` as a gzip-compressed CSV.
    The file is written even when no rows were collected.

    Args:
        year: Season to fetch; interpolated into the ``at`` query date.
        day: Day of December to fetch; interpolated into the ``at`` query
            date exactly as given (no zero padding is applied).

    Returns:
        ``True`` if the first page contained ranking entries, ``False``
        otherwise.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
        ValueError: If a response body is not valid JSON.
        KeyError: If an entry has a player id but lacks one of the other
            fields copied into the row.
    """
    doubles = []
    page_num = 0
    while True:
        url = f"https://api.wtatennis.com/tennis/players/ranked?page={page_num}&pageSize=100&type=Doubles&name=&metric=DOUBLES&at={year}-12-{day}"
        # Bound the request so a stalled connection cannot hang the run.
        response = requests.get(url, timeout=30)
        players = response.json()

        # Only a non-empty JSON array is a page of rankings. An empty array
        # marks the end of the data. Any other payload (for example an error
        # object) is treated the same way: it holds no ranking entries, and
        # continuing to page through such responses might never terminate.
        if not isinstance(players, list) or not players:
            break

        for player in players:
            try:
                player_id = player["player"]["id"]
            except (KeyError, TypeError):
                # Entry without a usable player record: nothing to store.
                continue

            doubles.append({
                "ranking": player["ranking"],
                "country": player["player"]["countryCode"],
                "name": player["player"]["fullName"],
                "player_id": player_id,
                "date_of_birth": player["player"]["dateOfBirth"],
                "points": player["points"],
                "tournaments_played": player["tournamentsPlayed"]
            })

        page_num += 1

    df = pd.DataFrame(doubles)
    df.to_csv(f"data/players/overview/doubles/{year}.gz", index=False, compression="gzip")

    # page_num only advances after a page with entries has been processed.
    return page_num > 0

In [35]:
# Collect the year-end ranking snapshots for every season in range.
for year in range(START_YEAR, END_YEAR + 1):
    # Probe December 31st back to the 25th; the first date for which the
    # singles rankings come back non-empty (the last Monday of the year) is
    # used for the doubles rankings as well.
    for day in reversed(range(25, 32)):
        success = collect_singles_overview_data(year, day)
        if success:
            collect_doubles_overview_data(year, day)
            break

In [72]:
def _fetch_singles_matches(player_id, year):
    """Return every singles match the API lists for ``player_id`` in ``year``.

    Pages of 100 matches are requested until an empty page is returned.

    Returns:
        The concatenated list of match records, or ``None`` if any page could
        not be read as a JSON object holding a ``"matches"`` list.
    """
    match_data = []
    page_num = 0
    while True:
        matches_link = f"https://api.wtatennis.com/tennis/players/{player_id}/matches/?page={page_num}&pageSize=100&id={player_id}&year={year}&type=S"
        # Bound the request so a stalled connection cannot hang the run.
        response = requests.get(matches_link, timeout=30)
        try:
            matches = response.json()["matches"]
        except (ValueError, KeyError, TypeError):
            # Body is not JSON, or has no "matches" entry.
            return None
        if not isinstance(matches, list):
            return None
        if not matches:
            return match_data

        match_data.extend(matches)
        page_num += 1


def collect_singles_activity_data(year):
    """Collect per-player singles match activity for ``year`` and save it.

    Reads the ranking snapshot ``data/players/overview/singles/{year}.gz``,
    downloads the season's singles matches for every player ranked within
    ``MAX_SINGLES``, aggregates them per player and per tournament, and writes
    one row per player to ``data/players/activity/singles/{year}.gz`` as a
    gzip-compressed CSV.

    Players are left out when they are listed as COVID non-players for the
    season, or when their match feed cannot be read.

    Each output row repeats the player's ranking fields and adds ``wins``,
    ``losses``, ``prize_money`` (summed once per tournament title) and
    ``tournaments``, a dict keyed by tournament title. That dict is stored in
    the CSV in its string form.

    Args:
        year: Season to collect.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
        KeyError: If a match record lacks one of the fields read below.
    """
    singles = pd.read_csv(f"data/players/overview/singles/{year}.gz")

    # Players recorded as not having played singles in this COVID season.
    if year == COVID_2020:
        non_players = COVID_2020_SINGLES_NON_PLAYERS
    elif year == COVID_2021:
        non_players = COVID_2021_SINGLES_NON_PLAYERS
    else:
        non_players = []

    players = []
    for _, row in tqdm(singles.iterrows(), total=len(singles)):
        # only collect data for players ranked within MAX_SINGLES
        if row["ranking"] > MAX_SINGLES:
            continue

        # skip this player because they have no singles data for this season
        # (due to COVID-19)
        if row["player_id"] in non_players:
            continue

        match_data = _fetch_singles_matches(row["player_id"], year)
        if match_data is None:
            # Unreadable match feed: leave the player out rather than record
            # partial or zeroed statistics.
            continue

        total_wins = 0
        total_losses = 0
        total_prize_money = 0
        tournaments_played = {}

        for match in match_data:
            won = 1 if match["winner"] == 1 else 0
            lost = 1 - won
            total_wins += won
            total_losses += lost

            title = match["tournament"]["title"]
            if title not in tournaments_played:
                # First match seen for this tournament: its prize money and
                # points are recorded once, from this match record.
                total_prize_money += match["PrizeWon"]
                tournaments_played[title] = {
                    "tournament_type": match["TournamentType"],
                    "tournament_level": match["TournamentLevel"],
                    "tournament_name": title,
                    "wins": won,
                    "losses": lost,
                    "points": match["points_1"],
                    "prize_money": match["PrizeWon"],
                    "round_reached": match["round_name"]
                }
            else:
                entry = tournaments_played[title]
                entry["wins"] += won
                entry["losses"] += lost
                # Ends up as the round of the last match listed for this
                # tournament. NOTE(review): whether that is the deepest round
                # depends on the order in which the API returns matches.
                entry["round_reached"] = match["round_name"]

        players.append({
            "ranking": row["ranking"],
            "country": row["country"],
            "name": row["name"],
            "player_id": row["player_id"],
            "date_of_birth": row["date_of_birth"],
            "points": row["points"],
            "wins": total_wins,
            "losses": total_losses,
            "prize_money": total_prize_money,
            "tournament_played": row["tournaments_played"],
            "tournaments": tournaments_played
        })

    df = pd.DataFrame(players)
    df.to_csv(f"data/players/activity/singles/{year}.gz", index=False, compression="gzip")

In [73]:
def _fetch_doubles_matches(player_id, year):
    """Return every doubles match the API lists for ``player_id`` in ``year``.

    Pages of 100 matches are requested until an empty page is returned.

    Returns:
        The concatenated list of match records, or ``None`` if any page could
        not be read as a JSON object holding a ``"matches"`` list.
    """
    match_data = []
    page_num = 0
    while True:
        matches_link = f"https://api.wtatennis.com/tennis/players/{player_id}/matches/?page={page_num}&pageSize=100&id={player_id}&year={year}&type=D"
        # Bound the request so a stalled connection cannot hang the run.
        response = requests.get(matches_link, timeout=30)
        try:
            matches = response.json()["matches"]
        except (ValueError, KeyError, TypeError):
            # Body is not JSON, or has no "matches" entry.
            return None
        if not isinstance(matches, list):
            return None
        if not matches:
            return match_data

        match_data.extend(matches)
        page_num += 1


def collect_doubles_activity_data(year):
    """Collect per-player doubles match activity for ``year`` and save it.

    Reads the ranking snapshot ``data/players/overview/doubles/{year}.gz``,
    downloads the season's doubles matches for every player ranked within
    ``MAX_DOUBLES``, aggregates them per player and per tournament, and writes
    one row per player to ``data/players/activity/doubles/{year}.gz`` as a
    gzip-compressed CSV.

    Players are left out when they are listed as COVID non-players for the
    season, or when their match feed cannot be read.

    Each output row repeats the player's ranking fields and adds ``wins``,
    ``losses``, ``prize_money`` (summed once per tournament title) and
    ``tournaments``, a dict keyed by tournament title. That dict is stored in
    the CSV in its string form.

    Args:
        year: Season to collect.

    Raises:
        requests.exceptions.RequestException: If a request fails or exceeds
            the timeout.
        KeyError: If a match record lacks one of the fields read below.
    """
    doubles = pd.read_csv(f"data/players/overview/doubles/{year}.gz")

    # Players recorded as not having played doubles in this COVID season.
    if year == COVID_2020:
        non_players = COVID_2020_DOUBLES_NON_PLAYERS
    elif year == COVID_2021:
        non_players = COVID_2021_DOUBLES_NON_PLAYERS
    else:
        non_players = []

    players = []
    for _, row in tqdm(doubles.iterrows(), total=len(doubles)):
        # only collect data for players ranked within MAX_DOUBLES
        if row["ranking"] > MAX_DOUBLES:
            continue

        # skip this player because they have no doubles data for this season
        # (due to COVID-19)
        if row["player_id"] in non_players:
            continue

        match_data = _fetch_doubles_matches(row["player_id"], year)
        if match_data is None:
            # Unreadable match feed: leave the player out rather than record
            # partial or zeroed statistics.
            continue

        total_wins = 0
        total_losses = 0
        total_prize_money = 0
        tournaments_played = {}

        for match in match_data:
            won = 1 if match["winner"] == 1 else 0
            lost = 1 - won
            total_wins += won
            total_losses += lost

            title = match["tournament"]["title"]
            if title not in tournaments_played:
                # First match seen for this tournament: its prize money and
                # points are recorded once, from this match record.
                total_prize_money += match["PrizeWon"]
                tournaments_played[title] = {
                    "tournament_type": match["TournamentType"],
                    "tournament_level": match["TournamentLevel"],
                    "tournament_name": title,
                    "wins": won,
                    "losses": lost,
                    "points": match["points_1"],
                    "prize_money": match["PrizeWon"],
                    "round_reached": match["round_name"]
                }
            else:
                entry = tournaments_played[title]
                entry["wins"] += won
                entry["losses"] += lost
                # Ends up as the round of the last match listed for this
                # tournament. NOTE(review): whether that is the deepest round
                # depends on the order in which the API returns matches.
                entry["round_reached"] = match["round_name"]

        players.append({
            "ranking": row["ranking"],
            "country": row["country"],
            "name": row["name"],
            "player_id": row["player_id"],
            "date_of_birth": row["date_of_birth"],
            "points": row["points"],
            "wins": total_wins,
            "losses": total_losses,
            "prize_money": total_prize_money,
            "tournament_played": row["tournaments_played"],
            "tournaments": tournaments_played
        })

    df = pd.DataFrame(players)
    df.to_csv(f"data/players/activity/doubles/{year}.gz", index=False, compression="gzip")

In [None]:
# Gather match activity for every season in range: singles first, then doubles.
for year in range(START_YEAR, END_YEAR + 1):
    for collect_activity in (collect_singles_activity_data, collect_doubles_activity_data):
        collect_activity(year)