In [23]:
import random
import itertools
import numpy as np
import pandas as pd
import seaborn as sns
from dataclasses import dataclass, field

In [24]:
np.random.seed(42)

# Synthetica Data 

In [43]:
# --- Parameters ---
NUM_PLAYERS = 10
N_MATCHES = 500  # Set the desired number of matches
PLAYER_NAMES = [f"Player_{i}" for i in range(NUM_PLAYERS)]
STRENGTHS = np.random.normal(loc=0.0, scale=1.0, size=NUM_PLAYERS)
K_LOGISTIC = 2.0  # Controls sensitivity of win probability to strength difference
SIMILARITY_SHARPNESS = 5.0  # Higher = more likely to match with similar strengths
MATCH_PROBABILITY_SCALE = 0.8  # Max probability of a match happening
FULL_GRAPH = True  # If True, every pair is considered. If False, only similar strength players likely to face each other.

# --- Utility Functions ---
def win_probability(s1, s2, k=K_LOGISTIC):
    return 1 / (1 + np.exp(-k * (s1 - s2)))

def match_probability(s1, s2, sharpness=SIMILARITY_SHARPNESS, scale=MATCH_PROBABILITY_SCALE):
    diff = abs(s1 - s2)
    return scale * np.exp(-sharpness * diff**2)

# --- Create Matches ---
matches = []
# Generate matches until we reach N_MATCHES
while len(matches) < N_MATCHES:
    # Randomly select two players (allowing for rematches)
    i, j = random.sample(range(NUM_PLAYERS), 2)
    s1, s2 = STRENGTHS[i], STRENGTHS[j]
    p_match = 1.0 if FULL_GRAPH else match_probability(s1, s2)
    if np.random.rand() < p_match:
        p_win = win_probability(s1, s2)
        winner = i if np.random.rand() < p_win else j
        loser = j if winner == i else i
        matches.append({
            "player_A": PLAYER_NAMES[i],
            "player_B": PLAYER_NAMES[j],
            "strength_A": s1,
            "strength_B": s2,
            "winner": PLAYER_NAMES[winner],
            "loser": PLAYER_NAMES[loser]
        })

# --- Package as DataFrame ---
df_matches = pd.DataFrame(matches)
df_players = pd.DataFrame({
    "player": PLAYER_NAMES,
    "strength": STRENGTHS
})


In [44]:
df_players.head(5)

Unnamed: 0,player,strength
0,Player_0,-1.1464
1,Player_1,0.465773
2,Player_2,1.867224
3,Player_3,1.243234
4,Player_4,0.707186


In [45]:
df_matches.to_parquet('./data_matches_synthetic.pq')

# Real World : Tennis Example

In [15]:
# List of years to process
years = [2020, 2021, 2022, 2023, 2024]

# Load and combine the CSV files
df = []
for year in years:
    url = f"https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_{year}.csv"
    df.append(pd.read_csv(url))
df = pd.concat(df)


columns_to_keep = [
    'tourney_id',       # Unique tournament identifier
    'tourney_date',     # Date of the tournament (for ordering)
    'match_num',        # Match number within tournament (for finer ordering)
    'winner_id',        # Unique ID of the winner
    'winner_name',      # Name of the winner (for readability)
    'loser_id',         # Unique ID of the loser
    'loser_name',       # Name of the loser (for readability)
    'winner_rank',      # Winner's rank (optional, for validation)
    'loser_rank'        # Loser's rank (optional, for validation)
]

df = df.query('tourney_level == "G"') 
df = df[columns_to_keep]
df.loc[:, 'tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')

# Handle missing values
df.loc[:, 'winner_rank'] = df['winner_rank'].fillna(-1).astype(int)
df.loc[:, 'loser_rank'] = df['loser_rank'].fillna(-1).astype(int)

df = df.sort_values(['tourney_date', 'match_num'])

# Rename columns for clarity
df.columns = [
    'tournament_id', 'match_date', 'match_number',
    'winner_id', 'winner_name', 'loser_id', 'loser_name',
    'winner_rank', 'loser_rank'
]

['2020-01-20 00:00:00', '2020-01-20 00:00:00', '2020-01-20 00:00:00',
 '2020-01-20 00:00:00', '2020-01-20 00:00:00', '2020-01-20 00:00:00',
 '2020-01-20 00:00:00', '2020-01-20 00:00:00', '2020-01-20 00:00:00',
 '2020-01-20 00:00:00',
 ...
 '2024-08-26 00:00:00', '2024-08-26 00:00:00', '2024-08-26 00:00:00',
 '2024-08-26 00:00:00', '2024-08-26 00:00:00', '2024-08-26 00:00:00',
 '2024-08-26 00:00:00', '2024-08-26 00:00:00', '2024-08-26 00:00:00',
 '2024-08-26 00:00:00']
Length: 2413, dtype: datetime64[ns]' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df.loc[:, 'tourney_date'] = pd.to_datetime(df['tourney_date'], format='%Y%m%d')


In [17]:
df.to_csv('atp_matches_2020_2024_preprocessed.csv', index=False)

In [18]:
df

Unnamed: 0,tournament_id,match_date,match_number,winner_id,winner_name,loser_id,loser_name,winner_rank,loser_rank
167,2020-580,2020-01-20,100,104745,Rafael Nadal,106198,Hugo Dellien,1.0,73.0
168,2020-580,2020-01-20,101,105643,Federico Delbonis,105311,Joao Sousa,76.0,59.0
169,2020-580,2020-01-20,102,105376,Peter Gojowczyk,111153,Christopher Eubanks,120.0,221.0
170,2020-580,2020-01-20,103,105807,Pablo Carreno Busta,106075,Jozef Kovalik,30.0,139.0
171,2020-580,2020-01-20,104,106401,Nick Kyrgios,132283,Lorenzo Sonego,26.0,53.0
...,...,...,...,...,...,...,...,...,...
2309,2024-560,2024-08-26,222,126203,Taylor Fritz,100644,Alexander Zverev,12.0,4.0
2310,2024-560,2024-08-26,223,126207,Frances Tiafoe,105777,Grigor Dimitrov,20.0,9.0
2311,2024-560,2024-08-26,224,206173,Jannik Sinner,207733,Jack Draper,1.0,25.0
2312,2024-560,2024-08-26,225,126203,Taylor Fritz,126207,Frances Tiafoe,12.0,20.0


In [20]:
# Description of the data
# tournament_id / match_date / winner / loser 
df_mini = df[['tournament_id', 'match_date', 'match_number', 'winner_name', 'loser_name']] 
df_mini.head()

Unnamed: 0,tournament_id,match_date,match_number,winner_name,loser_name
167,2020-580,2020-01-20,100,Rafael Nadal,Hugo Dellien
168,2020-580,2020-01-20,101,Federico Delbonis,Joao Sousa
169,2020-580,2020-01-20,102,Peter Gojowczyk,Christopher Eubanks
170,2020-580,2020-01-20,103,Pablo Carreno Busta,Jozef Kovalik
171,2020-580,2020-01-20,104,Nick Kyrgios,Lorenzo Sonego


In [22]:
df_mini.to_parquet('atp_matches_2020_2024_preprocessed_mini.pq', index=False)