# **Analyzing the financial break-even point on the ATP Tour**

In [1]:
#Load libraries
import pandas as pd
import numpy as np
import altair as alt

In [2]:
#Read the ATP players' activity table from the project's GitHub repository
players_activity_url = 'https://raw.githubusercontent.com/nicolasrojasv/nicolasrojasv.github.io/refs/heads/main/Project/data/players_activity.csv'
players_activity = pd.read_csv(players_activity_url)

In [3]:
#Preview the first five rows of the raw activity table
display(players_activity.head(n=5))

Unnamed: 0,player_id,year,event_id,event_name,event_title,prize_raw,currency,prize_usd
0,sy30,2019,8332,M25 Claremont,M25 Claremont,260,$,260
1,sy30,2019,1592,M25 Harlingen,M25 Harlingen,730,$,730
2,sy30,2019,8315,M15 Cancun,M15 Cancun,258,$,258
3,sy30,2019,8308,M15 Cancun,M15 Cancun,258,$,258
4,sy30,2019,8306,M15 Cancun,M15 Cancun,753,$,753


In [4]:
#Payouts that are not awarded in tournaments (bonus pools, profit sharing) are left out of the analysis.
#This is solely to ensure consistency with the information displayed on the website.
exclude = ["500 Bonus Pool", "1000 Bonus Pool", "Profit Sharing", "Profit Share", "Bonus Prize Money"]
is_excluded_payout = players_activity["event_name"].isin(exclude)
players_activity_filtered = players_activity.loc[~is_excluded_payout]

In [5]:
#Total prize money (USD) per player and year
players_prize_by_year = (
    players_activity_filtered
    .groupby(["player_id", "year"], as_index=False)["prize_usd"]
    .sum()
)
players_prize_by_year.tail(10)

Unnamed: 0,player_id,year,prize_usd
42995,z450,2017,1470
42996,z450,2018,2274
42997,z450,2019,1666
42998,z452,2016,812
42999,z452,2017,1704
43000,z452,2018,1804
43001,z457,2015,182
43002,z457,2016,104
43003,z460,2014,414
43004,z460,2015,516


In [6]:
#Order the rows by year and then by prize money, both descending
players_prize_by_year = players_prize_by_year.sort_values(
    by=['year', 'prize_usd'],
    ascending=[False, False],
)
players_prize_by_year.tail(10)

Unnamed: 0,player_id,year,prize_usd
10458,e121,1991,5366
34718,s535,1991,1040
23876,m343,1991,520
10457,e121,1990,1820
23875,m343,1990,1300
28153,o111,1989,212
23874,m343,1988,1194
23873,m343,1987,540
23872,m343,1985,680
3732,bf71,1956,0


In [7]:
#List the distinct years present in the table, in order of appearance
pd.unique(players_prize_by_year['year'])

array([2025, 2024, 2023, 2022, 2021, 2020, 2019, 2018, 2017, 2016, 2015,
       2014, 2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004,
       2003, 2002, 2001, 2000, 1999, 1998, 1997, 1996, 1995, 1994, 1993,
       1992, 1991, 1990, 1989, 1988, 1987, 1985, 1956])

In [8]:
#The table has activity from 1956 (possibly an extraction error, although the prize is 0) to 2025.
#For simplicity, the project starts in 2014.
#.copy() makes the filtered result an independent frame, so adding columns to it in later
#cells does not trigger pandas' SettingWithCopyWarning.
players_prize_by_year = players_prize_by_year[players_prize_by_year['year'] >= 2014].copy()

In [12]:
#Generate a ranking by year (1 = highest prize money of that year).
#method='min' gives tied players the same whole-number rank; the default 'average' method
#produces fractional ranks for ties, which astype(int) would silently truncate.
#assign() returns a new frame instead of writing a column into a filtered slice.
players_prize_by_year = players_prize_by_year.assign(
    rank=players_prize_by_year.groupby('year')['prize_usd'].rank(method='min', ascending=False).astype(int)
)

In [15]:
#First ten rows for 2014 (top earners, given the earlier descending sort by prize money)
is_2014 = players_prize_by_year['year'].eq(2014)
players_prize_by_year.loc[is_2014].head(10)

Unnamed: 0,player_id,year,prize_usd,rank
8798,d643,2014,11600528,1
11520,f324,2014,7993989,2
27217,n409,2014,6202445,3
41127,w367,2014,4912119,4
6494,c977,2014,4667359,5
27318,n552,2014,4431365,6
3143,ba47,2014,3529535,7
24362,mc10,2014,3474824,8
32503,r975,2014,3194482,9
11613,f401,2014,2574027,10


In [31]:
#Read the average annual cost of an ATP season (semicolon-separated file)
season_cost_url = 'https://raw.githubusercontent.com/nicolasrojasv/nicolasrojasv.github.io/refs/heads/main/Project/data/average_annual_cost_atp_season.csv'
season_cost = pd.read_csv(season_cost_url, sep=';')

In [33]:
#Attach the season cost data to every player-year row; the left join keeps all player rows
merged_player_prize = players_prize_by_year.merge(season_cost, how='left', on='year')

In [35]:
#Balance = prize money (USD) minus the annual cost
merged_player_prize['balance'] = merged_player_prize['prize_usd'].sub(merged_player_prize['annual_cost'])

In [39]:
#Flag the rows whose balance is strictly positive
merged_player_prize['positive_balance'] = merged_player_prize['balance'].gt(0)

In [46]:
#Write the merged table to CSV without the index column
merged_player_prize.to_csv(path_or_buf='players_prize_by_year.csv', index=False)

In [45]:
#Break-even rank per year: the largest rank number among the players with a positive balance
has_positive_balance = merged_player_prize['positive_balance']
first_positive_balance = (
    merged_player_prize.loc[has_positive_balance]
    .groupby('year', as_index=False)['rank']
    .max()
)
first_positive_balance

Unnamed: 0,year,rank
0,2014,192
1,2015,181
2,2016,197
3,2017,208
4,2018,225
5,2019,221
6,2020,181
7,2021,232
8,2022,237
9,2023,248


In [43]:
#Per year: number of players with a positive balance ('sum'), number of players ('count')
#and the share of players with a positive balance as a percentage
positive_balance_by_year = (
    merged_player_prize
    .groupby('year')['positive_balance']
    .agg(['sum', 'count'])
    .reset_index()
)
positive_balance_by_year['percentage'] = (
    positive_balance_by_year['sum'].div(positive_balance_by_year['count']).mul(100)
)
positive_balance_by_year

Unnamed: 0,year,sum,count,percentage
0,2014,192,2730,7.032967
1,2015,181,2911,6.217795
2,2016,197,2842,6.931738
3,2017,208,2715,7.661142
4,2018,225,2618,8.594347
5,2019,221,2428,9.102142
6,2020,181,1583,11.433986
7,2021,232,2139,10.84619
8,2022,237,2594,9.136469
9,2023,248,2651,9.35496


In [10]:
#Save the data.
#This cell used to write players_prize_by_year to the same file that already holds the merged
#table, so running the notebook top to bottom dropped the cost, balance and positive_balance columns.
#Writing merged_player_prize keeps the file contents the same whichever save cell runs last.
merged_player_prize.to_csv('players_prize_by_year.csv', index=False)

In [47]:
#Write the yearly positive-balance summary to CSV without the index column
positive_balance_by_year.to_csv(path_or_buf='positive_balance_by_year.csv', index=False)