# Data Processing

In [8]:
import pandas as pd
import datetime
import json

In [9]:
# First season covered by the processing loops below.
START_YEAR = 2009
# Last season covered (inclusive): the calendar year before the current one.
END_YEAR = datetime.date.today().year - 1

In [10]:
# Location of the table with one inflation multiplier per year.
_INFLATION_CSV = "data/inflation/inflation.csv"
# Parsed tables keyed by path, so the CSV is read once rather than on every call.
_inflation_rates_cache = {}


def _load_inflation_rates(path=_INFLATION_CSV):
    """Return a ``{year: rate}`` mapping read from *path*, loading it only once."""
    if path not in _inflation_rates_cache:
        cpi = pd.read_csv(path)
        # Keep the first row for each year so a duplicated year resolves the
        # same way as a first-match lookup on the raw table.
        first_rows = cpi.drop_duplicates(subset="year", keep="first")
        _inflation_rates_cache[path] = dict(
            zip(first_rows["year"], first_rows["rate"])
        )
    return _inflation_rates_cache[path]


def inflate_prize_money(initial_amount, initial_year):
    """Scale *initial_amount* by the inflation rate recorded for *initial_year*.

    The rate comes from the ``rate`` column of ``data/inflation/inflation.csv``
    for the row whose ``year`` column equals *initial_year*.

    Args:
        initial_amount: Amount to convert; anything that supports ``* rate``.
        initial_year: Year whose rate should be applied.

    Returns:
        ``initial_amount * rate``.

    Raises:
        IndexError: If the table has no row for *initial_year*.
    """
    rates = _load_inflation_rates()
    if initial_year not in rates:
        raise IndexError(
            f"no inflation rate for year {initial_year!r} in {_INFLATION_CSV}"
        )
    return initial_amount * rates[initial_year]

In [11]:
def clean_tournaments_string(tournaments_string):
    """Return *tournaments_string* with every single quote turned into a double quote.

    Before the quote swap, the apostrophe is removed from a fixed list of
    known names (e.g. ``Queen's`` becomes ``Queens``) so that it is not
    converted into a stray double quote inside the name.
    """
    # (name containing an apostrophe, same name without it); applied in order.
    apostrophe_names = (
        ("L'Aquila", "LAquila"),
        ("Queen's", "Queens"),
        ("'s-Hertogenbosch", "s-Hertogenbosch"),
        ("l'Orne", "lOrne"),
        ("L'Orne", "LOrne"),
        ("D'Aro", "DAro"),
    )
    cleaned = tournaments_string
    for with_apostrophe, without_apostrophe in apostrophe_names:
        cleaned = cleaned.replace(with_apostrophe, without_apostrophe)
    return cleaned.replace("'", '"')

In [12]:
def process_singles_data(year):
    """Clean one season of singles activity and save it to the clean folder.

    Reads ``data/players/activity/singles/{year}.gz``, subtracts Davis Cup
    (``tournament_type == "DC"``) results from each player's ``wins`` and
    ``losses``, scales ``prize_money`` with ``inflate_prize_money`` and writes
    the frame to ``data/players/clean/singles/{year}.gz``.

    Args:
        year: Season to process; also the year passed to ``inflate_prize_money``.

    Raises:
        Exception: Any error raised while parsing a row's ``tournaments``
            value is re-raised after the raw value has been printed.
    """
    singles = pd.read_csv(f"data/players/activity/singles/{year}.gz")

    # Davis Cup totals per row, in row order.
    davis_cup_wins = []
    davis_cup_losses = []
    for raw_tournaments in singles["tournaments"]:
        try:
            tournaments = json.loads(clean_tournaments_string(raw_tournaments))
        except Exception:
            # Show the value that could not be parsed, then let the error propagate.
            print(raw_tournaments)
            raise

        davis_cup = [t for t in tournaments if t["tournament_type"] == "DC"]
        davis_cup_wins.append(sum(t["wins"] for t in davis_cup))
        davis_cup_losses.append(sum(t["losses"] for t in davis_cup))

    # remove wins/losses from Davis Cup
    singles["wins"] = singles["wins"] - davis_cup_wins
    singles["losses"] = singles["losses"] - davis_cup_losses

    # The rate depends only on the year, so look it up once. Replacing the
    # whole column also avoids writing floats cell by cell into a column that
    # may have been read as integers.
    inflation_factor = inflate_prize_money(1, year)
    singles["prize_money"] = singles["prize_money"] * inflation_factor

    # save to gzipped csv
    singles.to_csv(f"data/players/clean/singles/{year}.gz", index=False, compression="gzip")

In [13]:
def process_doubles_data(year):
    """Clean one season of doubles activity and save it to the clean folder.

    Reads ``data/players/activity/doubles/{year}.gz``, subtracts Davis Cup
    (``tournament_type == "DC"``) results from each player's ``wins`` and
    ``losses``, scales ``prize_money`` with ``inflate_prize_money`` and writes
    the frame to ``data/players/clean/doubles/{year}.gz``.

    Args:
        year: Season to process; also the year passed to ``inflate_prize_money``.

    Raises:
        Exception: Any error raised while parsing a row's ``tournaments``
            value is re-raised after the raw value has been printed.
    """
    doubles = pd.read_csv(f"data/players/activity/doubles/{year}.gz")

    # Davis Cup totals per row, in row order.
    davis_cup_wins = []
    davis_cup_losses = []
    for raw_tournaments in doubles["tournaments"]:
        try:
            tournaments = json.loads(clean_tournaments_string(raw_tournaments))
        except Exception:
            # Show the value that could not be parsed, then let the error propagate.
            print(raw_tournaments)
            raise

        davis_cup = [t for t in tournaments if t["tournament_type"] == "DC"]
        davis_cup_wins.append(sum(t["wins"] for t in davis_cup))
        davis_cup_losses.append(sum(t["losses"] for t in davis_cup))

    # remove wins/losses from Davis Cup
    doubles["wins"] = doubles["wins"] - davis_cup_wins
    doubles["losses"] = doubles["losses"] - davis_cup_losses

    # The rate depends only on the year, so look it up once. Replacing the
    # whole column also avoids writing floats cell by cell into a column that
    # may have been read as integers.
    inflation_factor = inflate_prize_money(1, year)
    doubles["prize_money"] = doubles["prize_money"] * inflation_factor

    # save to gzipped csv
    doubles.to_csv(f"data/players/clean/doubles/{year}.gz", index=False, compression="gzip")

In [14]:
# Clean every season from START_YEAR through END_YEAR (inclusive):
# singles first, then doubles.
for year in range(START_YEAR, END_YEAR + 1):
    process_singles_data(year=year)
    process_doubles_data(year=year)

In [15]:
def get_unique_player_ids(singles, doubles):
    """Return every distinct ``player_id`` that appears in either frame.

    Ids are returned in first-seen order (all of *singles*, then any new ids
    from *doubles*), so the result is the same on every run instead of
    depending on set iteration order.

    Args:
        singles: DataFrame with a ``player_id`` column.
        doubles: DataFrame with a ``player_id`` column.

    Returns:
        A list of unique player ids.
    """
    all_ids = pd.concat(
        [singles["player_id"], doubles["player_id"]], ignore_index=True
    )
    # drop_duplicates keeps the first occurrence, which preserves input order.
    return list(all_ids.drop_duplicates())

In [16]:
# For every season, merge the cleaned singles and doubles files into one row
# per player and write the result to data/players/combined/{year}.gz.
for year in range(START_YEAR, END_YEAR + 1):
    combined = []
    singles = pd.read_csv(f"data/players/clean/singles/{year}.gz")
    doubles = pd.read_csv(f"data/players/clean/doubles/{year}.gz")

    players = get_unique_player_ids(singles, doubles)

    for player_id in players:
        # Defaults used when the player has no usable singles/doubles row.
        total_prize_money = 0
        predominate_type = ""
        # Sentinel: any doubles ranking below 9999 beats "no singles ranking".
        highest_ranking = 9999

        player_singles = singles[singles["player_id"] == player_id]
        player_doubles = doubles[doubles["player_id"] == player_id]

        singles_tournaments = []
        # Only a player with exactly one singles row contributes singles data;
        # zero rows or more than one row leaves the defaults above untouched.
        if len(player_singles) == 1:
            total_prize_money += player_singles["prize_money"].sum()
            predominate_type = "singles"
            highest_ranking = player_singles["ranking"].min()
            singles_tournaments = json.loads(clean_tournaments_string(player_singles["tournaments"].iloc[0]))

        doubles_tournaments = []
        # Same rule for doubles: exactly one row is required.
        if len(player_doubles) == 1:
            total_prize_money += player_doubles["prize_money"].sum()
            double_ranking = player_doubles["ranking"].min()
            doubles_tournaments = json.loads(clean_tournaments_string(player_doubles["tournaments"].iloc[0]))

            # Doubles takes over when its ranking value is smaller than the
            # singles one (or than the 9999 sentinel).
            # NOTE(review): a singles ranking of 0 presumably means "unranked" -
            # confirm. A doubles ranking of 0 would also satisfy the `<`
            # comparison and mark the player as "doubles"; verify that is intended.
            if highest_ranking == 0 or double_ranking < highest_ranking:
                predominate_type = "doubles"

        # "type" stays "" when neither file has exactly one row for the player.
        combined.append({
            "player_id": player_id,
            "total_prize_money": total_prize_money,
            "type": predominate_type,
            "singles_tournaments": singles_tournaments,
            "doubles_tournaments": doubles_tournaments,
        })

    df = pd.DataFrame(combined)
    df.to_csv(f"data/players/combined/{year}.gz", index=False, compression="gzip")