# Tournament Data Extraction

In [1]:
import pandas as pd
import datetime
import json

In [2]:
# First and last season to process; the last one is the year before the
# current calendar year.
START_YEAR = 2009
END_YEAR = datetime.datetime.now().year - 1

# Tournament-type code -> human-readable name.
TOURNAMENT_TYPE_MAPPINGS = {
    "WC": "ATP Finals",
    "PZ": "Bonus Prize Money",
    "1000": "ATP Masters 1000",
    "500": "ATP 500",
    "250": "ATP 250",
    "CH": "Challenger",
    "FU": "Futures",
    "DC": "Davis Cup",
    "OL": "Olympics",
    "GS": "Grand Slam",
    "WT": "World Team Cup",
    "LVR": "Laver Cup",
    "XXI": "Next Gen ATP Finals",
    "UC": "United Cup",
    "ATPC": "ATP Cup",
}

# Codes kept when the expanded table is condensed per tournament type.
INCLUDED_TOURNAMENT_TYPES = [
    "GS",
    "1000",
    "500",
    "250",
    "CH",
]

# The remaining codes. NOTE(review): not referenced by the cells visible in
# this notebook -- confirm whether anything else relies on it.
IGNORED_TOURNAMENT_TYPES = [
    "OL",
    "DC",
    "WT",
    "LVR",
    "XXI",
    "UC",
    "PZ",
    "FU",
    "ATPC",
    "WC",
]

In [3]:
def clean_tournaments_string(tournaments_string):
    """Convert a single-quoted tournaments string to a double-quoted one.

    A fixed list of names containing an apostrophe has the apostrophe
    removed first, in the order listed below. Every single quote that is
    still present afterwards is replaced by a double quote.

    Args:
        tournaments_string: The raw text to rewrite.

    Returns:
        str: The rewritten text.
    """
    # (text to find, text to substitute) -- applied sequentially.
    apostrophe_fixes = (
        ("L'Aquila", "LAquila"),
        ("Queen's", "Queens"),
        ("'s-Hertogenbosch", "s-Hertogenbosch"),
        ("l'Orne", "lOrne"),
        ("L'Orne", "LOrne"),
        ("D'Aro", "DAro"),
    )

    cleaned = tournaments_string
    for needle, substitute in apostrophe_fixes:
        cleaned = cleaned.replace(needle, substitute)

    # Anything left is treated as a string delimiter.
    return cleaned.replace("'", '"')

In [4]:
def get_tournament_data(df):
    """Total prize money, points and player counts per tournament type and round.

    Every value of the ``tournaments`` column is passed through
    ``clean_tournaments_string``; the text from the first ``[`` onwards is
    then parsed as a JSON list of tournament records. Each record is
    expected to provide the keys ``tournament_type``, ``round_reached``,
    ``prize_money``, ``points`` and ``tournament_name``.

    Args:
        df: DataFrame with a ``tournaments`` column of strings.

    Returns:
        dict: ``{tournament_type: {round_reached: totals}}`` where ``totals``
        has the keys ``"total prize money"``, ``"total points"``,
        ``"total players"`` (number of records seen) and
        ``"tournament_name"`` (taken from the first record seen for that
        type/round pair; later records do not change it).

    Raises:
        ValueError: If a value contains no ``[`` or the remainder is not
            valid JSON (``json.JSONDecodeError`` is a ``ValueError``). The
            cleaned string is printed before the error propagates.
        KeyError: If a record lacks one of the expected keys.
    """
    all_tournaments = dict()

    # An empty frame (possibly without any columns) has nothing to aggregate.
    if df.empty:
        return all_tournaments

    # Walk the column values directly instead of looking every row up again
    # by index label: a label lookup returns a Series rather than a string
    # when index labels repeat.
    for raw_tournaments in df["tournaments"]:
        clean_tournaments = clean_tournaments_string(raw_tournaments)
        try:
            list_start = clean_tournaments.find("[")
            if list_start == -1:
                # Without this check the -1 would silently slice off the
                # last character and produce a confusing parse error.
                raise ValueError("no '[' found in tournaments string")
            tournaments = json.loads(clean_tournaments[list_start:])
        except ValueError:
            # Show the offending input to ease debugging, then propagate.
            print(clean_tournaments)
            raise

        for tournament in tournaments:
            tournament_type = tournament["tournament_type"]
            round_reached = tournament["round_reached"]
            prize_money = tournament["prize_money"]
            points = tournament["points"]

            rounds = all_tournaments.setdefault(tournament_type, {})
            totals = rounds.get(round_reached)
            if totals is None:
                # First record for this type/round: seed the bucket.
                rounds[round_reached] = {
                    "total prize money": prize_money,
                    "total points": points,
                    "total players": 1,
                    "tournament_name": tournament["tournament_name"],
                }
            else:
                totals["total prize money"] += prize_money
                totals["total points"] += points
                totals["total players"] += 1

    return all_tournaments

In [5]:
# Build one record per (draw type, season, tournament type, round reached)
# from the per-season player files, then write the full table to disk.
data = []
for t in ["singles", "doubles"]:
    for year in range(START_YEAR, END_YEAR + 1):
        players_df = pd.read_csv(f"data/players/clean/{t}/{year}.gz")
        all_tournaments = get_tournament_data(players_df)

        for tournament_type, rounds in all_tournaments.items():
            for round_reached, totals in rounds.items():
                prize_money = totals["total prize money"]
                points = totals["total points"]
                players = totals["total players"]

                # The ratio is reported as 0 when there are no points (or
                # no players) to divide by.
                average_prize_money_per_point = (
                    prize_money / points
                    if points != 0 and players != 0
                    else 0
                )

                record = {
                    "type": t,
                    "year": year,
                    "tournament_type": tournament_type,
                    "tournament_name": totals["tournament_name"],
                    "round": round_reached,
                    "prize_money": prize_money,
                    "points": points,
                    "players": players,
                    "average_prize_money_per_point": average_prize_money_per_point
                }
                data.append(record)

# `df` is the expanded table that the condensing step further down reads.
df = pd.DataFrame(data)
df.to_csv("data/tournaments/tournaments_expanded.gz", index=False, compression="gzip")

In [6]:
def synthesize_tournament_data(df, included_types=None):
    """Sum prize money, points and players per draw type, year and tournament type.

    Rows whose ``tournament_type`` is not in ``included_types`` are skipped;
    the remaining rows are accumulated into a nested dictionary.

    Args:
        df: DataFrame with the columns ``type``, ``year``,
            ``tournament_type``, ``prize_money``, ``points`` and ``players``.
        included_types: Optional collection of tournament-type codes to
            keep. Defaults to the module-level ``INCLUDED_TOURNAMENT_TYPES``.

    Returns:
        dict: ``{type: {year: {tournament_type: totals}}}`` where ``totals``
        has the keys ``"total prize money"``, ``"total points"`` and
        ``"total players"``. Empty when ``df`` has no rows.

    Raises:
        KeyError: If ``df`` has rows but lacks one of the required columns.
    """
    if included_types is None:
        included_types = INCLUDED_TOURNAMENT_TYPES

    synthesized_tournaments = dict()

    # An empty frame (possibly without any columns) has nothing to sum.
    if df.empty:
        return synthesized_tournaments

    # Iterate the needed columns in lockstep rather than looking each cell up
    # by index label: label lookups return a Series instead of a scalar when
    # index labels repeat.
    columns = ("type", "year", "tournament_type", "prize_money", "points", "players")
    rows = zip(*(df[column] for column in columns))

    for t, year, tournament_type, prize_money, points, players in rows:
        if tournament_type not in included_types:
            continue

        by_tournament_type = synthesized_tournaments.setdefault(t, {}).setdefault(year, {})
        totals = by_tournament_type.get(tournament_type)
        if totals is None:
            # First row for this type/year/tournament type: seed the totals.
            by_tournament_type[tournament_type] = {
                "total prize money": prize_money,
                "total points": points,
                "total players": players,
            }
        else:
            totals["total prize money"] += prize_money
            totals["total points"] += points
            totals["total players"] += players

    return synthesized_tournaments

In [7]:
# Condense the expanded table to one record per (draw type, season,
# tournament type) and write it to disk.
synthesized_data = []
synthesized_tournaments = synthesize_tournament_data(df)

for t, seasons in synthesized_tournaments.items():
    for year, by_tournament_type in seasons.items():
        for tournament_type, totals in by_tournament_type.items():
            prize_money = totals["total prize money"]
            points = totals["total points"]
            players = totals["total players"]

            # No ratio can be computed without points (or players), so such
            # entries are left out of the condensed table.
            if points == 0 or players == 0:
                continue
            average_prize_money_per_point = prize_money / points

            synthesized_data.append({
                "type": t,
                "year": year,
                # Report the human-readable name instead of the code.
                "tournament_type": TOURNAMENT_TYPE_MAPPINGS[tournament_type],
                "average_prize_money_per_point": round(average_prize_money_per_point),
                "total_prize_money": prize_money,
            })

synthesized_df = pd.DataFrame(synthesized_data)
synthesized_df.to_csv("data/tournaments/tournaments_condensed.gz", index=False, compression="gzip")