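This notebook builds a pandas DataFrame with the 2025 prize money (in USD) of the top 500 ATP tennis players, aggregates it by ranking bin, and measures how unevenly it is distributed using a Lorenz curve and the Gini index.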

In [45]:
#Load libraries
import pandas as pd
import numpy as np
import altair as alt

In [46]:
# Load the ATP players activity CSV into 'df'.
# NOTE(review): '/content/...' is an absolute path — presumably a Colab upload
# location; confirm the file is available there before running elsewhere.
df = pd.read_csv('/content/atp_players_activity.csv')
# Preview the first rows to check that the columns were read as expected.
display(df.head())

Unnamed: 0,player_id,year,event_id,event_name,event_title,prize_raw,currency,prize_usd
0,a0e2,2025,605,Nitto ATP Finals,Nitto ATP Finals,2704000,$,2704000
1,a0e2,2025,352,ATP Masters 1000 Paris,Rolex Paris Masters,44220,€,51410
2,a0e2,2025,329,Tokyo,Kinoshita Group Japan Open Tennis Championships,416365,$,416365
3,a0e2,2025,9210,Laver Cup,Laver Cup,0,$,0
4,a0e2,2025,560,US Open,US Open,5000000,$,5000000


In [47]:
# Restrict the activity data to the 2025 season.
# A boolean mask on the 'year' column selects the matching rows; the selection is
# stored under a new name, 'df_2025', so the original 'df' stays untouched.
df_2025 = df.loc[df['year'].eq(2025)]

# Sanity check: preview the first 5 rows of 'df_2025' to confirm that
# only 2025 rows were kept.
display(df_2025.head(5))

Unnamed: 0,player_id,year,event_id,event_name,event_title,prize_raw,currency,prize_usd
0,a0e2,2025,605,Nitto ATP Finals,Nitto ATP Finals,2704000,$,2704000
1,a0e2,2025,352,ATP Masters 1000 Paris,Rolex Paris Masters,44220,€,51410
2,a0e2,2025,329,Tokyo,Kinoshita Group Japan Open Tennis Championships,416365,$,416365
3,a0e2,2025,9210,Laver Cup,Laver Cup,0,$,0
4,a0e2,2025,560,US Open,US Open,5000000,$,5000000


In [48]:
# Load the 'atp_ranking_2025.csv' file into a new DataFrame, 'df_ranking'.
# NOTE(review): absolute '/content/...' path — presumably a Colab upload location; confirm.
df_ranking = pd.read_csv('/content/atp_ranking_2025.csv')

# Display the first 5 rows of the resulting DataFrame to inspect its content.
display(df_ranking.head())

Unnamed: 0,rank,player_name,age,points,n_tournaments,player_overview_url,player_activity_url,player_id
0,1,Carlos Alcaraz,22,12050,19,https://www.atptour.com/en/players/carlos-alca...,https://www.atptour.com/en/players/carlos-alca...,a0e2
1,2,Jannik Sinner,24,11500,18,https://www.atptour.com/en/players/jannik-sinn...,https://www.atptour.com/en/players/jannik-sinn...,s0ag
2,3,Alexander Zverev,28,5160,24,https://www.atptour.com/en/players/alexander-z...,https://www.atptour.com/en/players/alexander-z...,z355
3,4,Novak Djokovic,38,4830,20,https://www.atptour.com/en/players/novak-djoko...,https://www.atptour.com/en/players/novak-djoko...,d643
4,5,Felix Auger-Aliassime,25,4245,28,https://www.atptour.com/en/players/felix-auger...,https://www.atptour.com/en/players/felix-auger...,ag37


In [49]:
# Total 2025 prize money per player.
# Group the 2025 activity rows ('df_2025') by 'player_id' and sum 'prize_usd'.
# The source has to be 'df_2025': 'merged_df' is only built in a later cell (from
# this very result), so grouping it here fails on a fresh kernel.
# reset_index() turns the grouped result back into a regular DataFrame with
# 'player_id' and 'prize_usd' columns, saved as 'player_prize_money'.
player_prize_money = df_2025.groupby('player_id')['prize_usd'].sum().reset_index()

# Display the first 5 rows of the resulting DataFrame to verify the total prize money per player.
display(player_prize_money.head())

Unnamed: 0,player_id,prize_usd
0,a09q,18177
1,a09t,79972
2,a09u,41291
3,a0ba,168616
4,a0cj,160331


In [50]:
# Order players by earnings, highest first.
# sort_values with ascending=False sorts 'prize_usd' in descending order; the sorted
# frame is bound back to the same name, replacing the unsorted one.
player_prize_money = player_prize_money.sort_values('prize_usd', ascending=False)

# Preview the top 5 rows: the players with the largest prize totals
# should now appear first.
display(player_prize_money.head(5))

Unnamed: 0,player_id,prize_usd
385,s0ag,19114398
5,a0e2,18803427
496,z355,5976269
156,fb98,5456536
18,ag37,5228696


In [51]:
# Attach the ranking information to each player's prize total.
# Left join on the shared 'player_id' key: every row of 'player_prize_money' is kept,
# even when 'df_ranking' has no matching player.
merged_df = player_prize_money.merge(df_ranking, how='left', on='player_id')

# Show the first 10 rows of the merged DataFrame.
display(merged_df.head(10))

Unnamed: 0,player_id,prize_usd,rank,player_name,age,points,n_tournaments,player_overview_url,player_activity_url
0,s0ag,19114398,2,Jannik Sinner,24,11500,18,https://www.atptour.com/en/players/jannik-sinn...,https://www.atptour.com/en/players/jannik-sinn...
1,a0e2,18803427,1,Carlos Alcaraz,22,12050,19,https://www.atptour.com/en/players/carlos-alca...,https://www.atptour.com/en/players/carlos-alca...
2,z355,5976269,3,Alexander Zverev,28,5160,24,https://www.atptour.com/en/players/alexander-z...,https://www.atptour.com/en/players/alexander-z...
3,fb98,5456536,6,Taylor Fritz,28,4135,23,https://www.atptour.com/en/players/taylor-frit...,https://www.atptour.com/en/players/taylor-frit...
4,ag37,5228696,5,Felix Auger-Aliassime,25,4245,28,https://www.atptour.com/en/players/felix-auger...,https://www.atptour.com/en/players/felix-auger...
5,dh58,5192228,7,Alex de Minaur,26,4135,23,https://www.atptour.com/en/players/alex-de-min...,https://www.atptour.com/en/players/alex-de-min...
6,d643,5127247,4,Novak Djokovic,38,4830,20,https://www.atptour.com/en/players/novak-djoko...,https://www.atptour.com/en/players/novak-djoko...
7,m0ej,4682629,8,Lorenzo Musetti,23,4040,23,https://www.atptour.com/en/players/lorenzo-mus...,https://www.atptour.com/en/players/lorenzo-mus...
8,s0s1,4600654,9,Ben Shelton,23,3970,23,https://www.atptour.com/en/players/ben-shelton...,https://www.atptour.com/en/players/ben-shelton...
9,d0co,3421705,10,Jack Draper,23,2990,17,https://www.atptour.com/en/players/jack-draper...,https://www.atptour.com/en/players/jack-draper...


In [54]:
# Ranking bins: groups of BIN_WIDTH consecutive ranks covering ranks 1..MAX_RANK.
BIN_WIDTH = 10
MAX_RANK = 500

# Lower edge of every bin: 1, 11, ..., 491.
bins = np.arange(1, MAX_RANK + 1, BIN_WIDTH)
# pd.cut needs one more edge than there are bins; the extra edge (501) closes the last bin.
bin_edges = np.append(bins, MAX_RANK + 1)

# One label per bin: '1-10', '11-20', ..., '491-500'.
# Labels are derived from the bin edges only, so the last one cannot be distorted by
# the data (e.g. a float or NaN maximum rank after the left merge, or ranks above 500).
labels = [f'{start}-{start + BIN_WIDTH - 1}' for start in bins]

# Assign each player to a ranking bin.
# right=False makes the intervals closed on the left and open on the right:
# [1, 11), [11, 21), ..., [491, 501), i.e. ranks 1-10, 11-20, ..., 491-500.
# Ranks outside these edges (or missing ranks) are assigned NaN by pd.cut.
merged_df['ranking_bin'] = pd.cut(
    merged_df['rank'],
    bins=bin_edges,
    labels=labels,
    right=False,
)

# Calculate the total prize_usd for each ranking bin.
# 'ranking_bin' is categorical; observed=False is passed explicitly so every bin is
# kept in the result (even if empty) and pandas does not warn about the changing default.
prize_money_by_bin = (
    merged_df.groupby('ranking_bin', observed=False)['prize_usd'].sum().reset_index()
)

# Display the result
display(prize_money_by_bin.head())

  prize_money_by_bin = merged_df.groupby('ranking_bin')['prize_usd'].sum().reset_index()


Unnamed: 0,ranking_bin,prize_usd
0,1-10,77603789
1,11-20,28055427
2,21-30,19541700
3,31-40,17007405
4,41-50,13535022


In [55]:
# Bar chart (Altair) of the total prize money per ranking bin.
#   x: 'ranking_bin' as an ordinal ('O') field, ordered by the reversed label list
#   y: 'prize_usd' as a quantitative ('Q') field
# .interactive() enables zooming and panning.
chart_bins = (
    alt.Chart(prize_money_by_bin)
    .mark_bar()
    .encode(
        x=alt.X('ranking_bin:O', title='Ranking Bin', sort=list(reversed(labels))),
        y=alt.Y('prize_usd:Q', title='Total Prize Money (USD)'),
    )
    .properties(title='Total Prize Money by Ranking Bin (Top 500)')
    .interactive()
)

# Render the chart as the cell output
chart_bins

In [56]:
# Lorenz curve function

def lorenz_curve(incomes):
    """Compute the points of the Lorenz curve for a set of incomes.

    Parameters
    ----------
    incomes : array-like of numbers
        One income value per individual, in any order.

    Returns
    -------
    p : numpy.ndarray
        Cumulative share of the population: n + 1 evenly spaced values from 0 to 1.
    L : numpy.ndarray
        Cumulative share of total income held by the poorest fraction ``p``;
        starts at 0 and ends at 1.

    Raises
    ------
    ValueError
        If ``incomes`` is empty, or if its total is not a positive finite number
        (the shares would otherwise be NaN/inf from a division by zero).
    """
    incomes = np.sort(np.array(incomes, dtype=float))
    n = incomes.size
    if n == 0:
        raise ValueError("incomes must contain at least one value")

    total = incomes.sum()
    if not (np.isfinite(total) and total > 0):
        raise ValueError("incomes must have a positive, finite total")

    # Prepend 0 so the curve starts at the origin (0% of people hold 0% of income).
    cum_income = np.insert(np.cumsum(incomes), 0, 0)
    L = cum_income / cum_income[-1]

    p = np.arange(0, n + 1) / n

    return p, L

In [57]:
# Lorenz curve of the per-player prize money.
# 'p' is the cumulative population share and 'L' the cumulative income share.
p, L = lorenz_curve(incomes=merged_df["prize_usd"])

# Put the curve points in a DataFrame (one row per point) for plotting.
df_lorenz = pd.DataFrame(
    dict(cumulative_population=p, cumulative_income=L)
)

In [58]:
def gini(p, L):
    """Calculates the Gini index from p and L.

    Parameters
    ----------
    p : array-like
        Cumulative population shares (x coordinates of the Lorenz curve).
    L : array-like
        Cumulative income shares (y coordinates of the Lorenz curve).

    Returns
    -------
    float
        ``1 - 2 * A``, where ``A`` is the area under the Lorenz curve computed
        with the trapezoidal rule.
    """
    # np.trapezoid only exists from NumPy 2.0; older releases provide the same
    # routine as np.trapz, so fall back to it instead of raising AttributeError.
    integrate = getattr(np, "trapezoid", None)
    if integrate is None:
        integrate = np.trapz

    area = integrate(L, p)     # area under the Lorenz curve
    return 1 - 2 * area

In [59]:
# Gini index of the prize-money distribution, computed from the Lorenz curve
# points (p, L); kept in 'g' and printed.
g = gini(p, L)
print("Gini:", g)

Gini: 0.7349848324862847


In [60]:

# Lorenz Curve: cumulative income share against cumulative population share
curve = alt.Chart(df_lorenz).mark_line(color='steelblue').encode(
    x=alt.X("cumulative_population:Q", title="Cumulative Proportion of Population"),
    y=alt.Y("cumulative_income:Q", title="Cumulative Proportion of Income")
)

# Line of perfect equality: the diagonal from (0, 0) to (1, 1), drawn dashed in red
equality = alt.Chart(pd.DataFrame({
    "cumulative_population": [0, 1],
    "cumulative_income": [0, 1]
})).mark_line(strokeDash=[5,5], color="red").encode(
    x="cumulative_population",
    y="cumulative_income"
)

# Create a DataFrame for the Gini text.
# Only the label itself is needed: the annotation is positioned with fixed pixel
# offsets (alt.value) below, not with data coordinates.
gini_text_df = pd.DataFrame({
    'text': [f'Gini Index: {g:.3f}'] # Format Gini to 3 decimal places
})

# Create the Gini text, pinned to the top-left corner of the plot
gini_text = alt.Chart(gini_text_df).mark_text(align='left', baseline='top', fontSize=12, color='black').encode(
    x=alt.value(5), # Fixed X coordinate in pixels from the left edge
    y=alt.value(5), # Fixed Y coordinate in pixels from the top edge
    text=alt.Text('text')
)

# Layer the curve, the equality line and the Gini text into one chart
chart = (curve + equality + gini_text).properties(
    width=400,
    height=400,
    title="Lorenz Curve"
)

chart

In [61]:
# Save the 'prize_money_by_bin' DataFrame to a CSV file.
# The path is relative, so the file lands in the current working directory;
# index=False leaves the DataFrame index out of the file.
prize_money_by_bin.to_csv('prize_money_by_ranking_bin.csv', index=False)

# Confirmation message (printed after to_csv returns).
print("Data saved to 'prize_money_by_ranking_bin.csv'")

Data saved to 'prize_money_by_ranking_bin.csv'
