In [19]:
import pandas as pd
import numpy as np

In [20]:
rng = np.random.default_rng()

In [21]:
from physicsbasedposes import generate_pose, evaluate_save, pose_to_dataframe

# Matches Generation

In [22]:
def generate_matches(opponents_array, num_games=10, start_date="2025-01-15", freq="W"):
    dates = pd.date_range(start_date, periods=num_games, freq=freq)
    selected_opponents = np.random.choice(opponents_array, size=num_games)
    
    matches = pd.DataFrame({
        "date": dates,
        "opponent": selected_opponents
    })
    
    return matches

In [23]:
opponents = ["Fábrica de Azulejos Coentrão", "Comunidade Judaica de Vila Real", "Sporting Clube de Cascos de Rolha", "Associação Desportiva de Cascos de Rolha"]
matches_df = generate_matches(opponents, num_games=10, freq="W")
matches_df.insert(0, "match_id", np.arange(len(matches_df), dtype=int))
matches_df

Unnamed: 0,match_id,date,opponent
0,0,2025-01-19,Sporting Clube de Cascos de Rolha
1,1,2025-01-26,Fábrica de Azulejos Coentrão
2,2,2025-02-02,Associação Desportiva de Cascos de Rolha
3,3,2025-02-09,Comunidade Judaica de Vila Real
4,4,2025-02-16,Associação Desportiva de Cascos de Rolha
5,5,2025-02-23,Sporting Clube de Cascos de Rolha
6,6,2025-03-02,Sporting Clube de Cascos de Rolha
7,7,2025-03-09,Fábrica de Azulejos Coentrão
8,8,2025-03-16,Comunidade Judaica de Vila Real
9,9,2025-03-23,Comunidade Judaica de Vila Real


# Appearances Generation

In [24]:
def generate_appearances(matches_df, starter_id=0, starter_weight=0.7, substitute_chance=0.15, available_players=[0, 1, 2, 3]):
    appearances = []

    for _, match in matches_df.iterrows():
        match_id = match["match_id"]
        players_in_match = []

        num_appearances = 2 if np.random.random() < substitute_chance else 1

        for appearance_num in range(num_appearances):
            available_pool = [p for p in available_players if p not in players_in_match]
            if not available_pool:
                break

            prefer_starter = (np.random.random() < starter_weight) and (starter_id in available_pool)
            if prefer_starter:
                player_id = starter_id
            else:
                non_starter_pool = [p for p in available_pool if p != starter_id]
                choice_pool = non_starter_pool if non_starter_pool else available_pool
                player_id = np.random.choice(choice_pool)

            players_in_match.append(player_id)
            appearances.append({
                "match_id": match_id,
                "player_id": player_id,
                "appearance_number": appearance_num + 1
            })

    appearances_df = pd.DataFrame(appearances)
    appearances_df.insert(0, "appearance_id", np.arange(len(appearances_df), dtype=int))
    return appearances_df

In [25]:
appearances_df = generate_appearances(matches_df, starter_id=0, starter_weight=0.7, substitute_chance=0.15)
appearances_df

Unnamed: 0,appearance_id,match_id,player_id,appearance_number
0,0,0,0,1
1,1,1,0,1
2,2,2,0,1
3,3,3,2,1
4,4,3,0,2
5,5,4,0,1
6,6,5,3,1
7,7,6,0,1
8,8,7,0,1
9,9,8,0,1


In [26]:
dup_check = appearances_df.groupby(["match_id", "player_id"]).size().reset_index(name="count")
dup_check[dup_check["count"] > 1]

Unnamed: 0,match_id,player_id,count


# Shots Generation

In [27]:
players_df = pd.read_excel("Synthetic Data.xlsx")
players_df.head()

Unnamed: 0,player_id,name,age,base_torso_x,base_torso_y,upper_arm_length,forearm_length,thigh_length,shin_length,head_vertical_offset,shoulder_horizontal_offset,shoulder_vertical_offset,hip_horizontal_offset,hip_vertical_offset,agility,presence,flexibility,reflexes
0,0,Pedro Farelo,29,13,15,3.5,3.0,6.0,4.5,2.5,2.0,1.5,1.5,3.0,90,85,85,80
1,1,Vasco Palmeirim,23,13,15,3.3,3.1,6.0,4.3,2.4,1.8,1.4,1.5,2.9,80,90,80,75
2,2,José Saramago,38,11,15,3.6,3.0,6.1,4.5,2.5,2.0,1.7,1.5,3.0,70,90,90,60
3,3,Eça de Queirós,31,13,14,3.5,2.9,6.2,4.6,2.6,2.0,1.5,1.5,3.1,75,65,70,70


In [28]:
def generate_shots(appearances_df, players_df, shots_per_appearance_range=(50, 75)):
    shots = []
    start_time=pd.Timestamp("2026-01-01 00:00")
    
    for _, appearance in appearances_df.iterrows():
        appearance_id = appearance["appearance_id"]
        player_id = appearance["player_id"]
        
        player = players_df[players_df["player_id"] == player_id].iloc[0]
        presence = player["presence"]
        
        x_min = 0 + presence * 0.01
        x_max = 27 - presence * 0.01
        y_min = 0 + presence * 0.01
        y_max = 27 - presence * 0.01
        
        num_shots = np.random.randint(shots_per_appearance_range[0], shots_per_appearance_range[1] + 1)
        
        for _ in range(num_shots):
            shots.append({
                "appearance_id": appearance_id,
                "match_id": appearance["match_id"],
                "player_id": player_id,
                "x": rng.uniform(x_min, x_max),
                "y": rng.uniform(y_min, y_max),
                "velocity": rng.lognormal(mean=4.55, sigma=0.16),
                "timestamp": start_time + pd.to_timedelta(rng.uniform(0, 3600),unit="s")
            })
    
    shots_df = pd.DataFrame(shots)
    shots_df.insert(0, "shot_id", np.arange(len(shots_df), dtype=int))
    return shots_df

In [29]:
shots_df = generate_shots(appearances_df, players_df)
shots_df.head(15)

Unnamed: 0,shot_id,appearance_id,match_id,player_id,x,y,velocity,timestamp
0,0,0,0,0,19.280025,23.706123,81.3578,2026-01-01 00:35:49.127609762
1,1,0,0,0,1.909727,19.470203,74.927952,2026-01-01 00:10:34.701092719
2,2,0,0,0,13.988766,3.769347,106.589612,2026-01-01 00:34:37.620664144
3,3,0,0,0,9.072017,8.057534,95.548089,2026-01-01 00:25:55.085034267
4,4,0,0,0,24.000144,9.398532,90.655408,2026-01-01 00:22:13.112313238
5,5,0,0,0,4.813976,15.383392,117.817193,2026-01-01 00:11:37.812199325
6,6,0,0,0,21.001072,16.107991,112.258887,2026-01-01 00:20:47.008676747
7,7,0,0,0,8.172206,4.605875,79.190505,2026-01-01 00:31:33.854622260
8,8,0,0,0,3.161815,3.938089,99.496505,2026-01-01 00:38:03.704880796
9,9,0,0,0,11.44743,15.488491,79.489058,2026-01-01 00:18:48.008798387


In [30]:
results = []
for _, shot in shots_df.iterrows():
    pose = generate_pose(shot['player_id'], [shot['x'], shot['y']], shot['velocity'])
    
    flat_pose = {}
    for joint_name, coords in pose.items():
        flat_pose[f'{joint_name}_x'] = coords['x']
        flat_pose[f'{joint_name}_y'] = coords['y']
    
    pose_df = pose_to_dataframe(pose)
    eval_result = evaluate_save(pose_df, [shot['x'], shot['y']], radius=1.0)
    
    combined = {**flat_pose, **eval_result}
    results.append(combined)

shots_df = pd.concat([shots_df, pd.DataFrame(results)], axis=1)
shots_df.head(15)

Unnamed: 0,shot_id,appearance_id,match_id,player_id,x,y,velocity,timestamp,torso_x,torso_y,...,right_hip_x,right_hip_y,right_knee_x,right_knee_y,right_foot_x,right_foot_y,nearest_node,distance,radius,saved
0,0,0,0,0,19.280025,23.706123,81.3578,2026-01-01 00:35:49.127609762,14.891967,17.626734,...,16.391967,14.626734,13.018534,9.664885,15.847687,6.165472,right_hand,0.911429,1.0,True
1,1,0,0,0,1.909727,19.470203,74.927952,2026-01-01 00:10:34.701092719,9.342972,16.460023,...,10.842972,13.460023,7.743707,8.32246,10.75855,4.981693,left_hand,1.022888,1.0,False
2,2,0,0,0,13.988766,3.769347,106.589612,2026-01-01 00:34:37.620664144,13.21857,12.393844,...,14.71857,9.393844,10.759302,4.885607,14.562778,2.480703,right_foot,1.410707,1.0,False
3,3,0,0,0,9.072017,8.057534,95.548089,2026-01-01 00:25:55.085034267,11.977049,13.199673,...,13.477049,10.199673,17.93469,6.183525,14.994646,2.776747,left_foot,0.70576,1.0,True
4,4,0,0,0,24.000144,9.398532,90.655408,2026-01-01 00:22:13.112313238,15.979822,13.46753,...,17.479822,10.46753,22.291508,14.051896,22.851413,9.586865,right_foot,1.164067,1.0,False
5,5,0,0,0,4.813976,15.383392,117.817193,2026-01-01 00:11:37.812199325,11.277995,15.070182,...,12.777995,12.070182,17.547591,8.429996,14.252555,5.365239,left_hand,0.819192,1.0,True
6,6,0,0,0,21.001072,16.107991,112.258887,2026-01-01 00:20:47.008676747,14.746177,15.233196,...,16.246177,12.233196,16.515369,18.227154,19.699534,15.047359,right_hand,1.080053,1.0,False
7,7,0,0,0,8.172206,4.605875,79.190505,2026-01-01 00:31:33.854622260,11.48784,11.755885,...,12.98784,8.755885,18.073202,5.57169,14.541358,2.783127,left_foot,1.199815,1.0,False
8,8,0,0,0,3.161815,3.938089,99.496505,2026-01-01 00:38:03.704880796,10.553604,12.250552,...,12.053604,9.250552,17.232167,6.220291,13.69522,3.438204,left_foot,1.914148,1.0,False
9,9,0,0,0,11.44743,15.488491,79.489058,2026-01-01 00:18:48.008798387,12.508735,15.141422,...,14.008735,12.141422,9.875868,7.791777,13.605348,5.273647,left_hand,0.408346,1.0,True


In [31]:
shot_stats = shots_df.groupby('appearance_id')['saved'].agg([
    ('total_shots', 'size'),
    ('saved', 'sum')
]).reset_index()
shot_stats["conceded"] = shot_stats["total_shots"] - shot_stats["saved"]

In [32]:
appearances_df = appearances_df.merge(shot_stats, on='appearance_id', how='left').fillna(0)
appearances_df

Unnamed: 0,appearance_id,match_id,player_id,appearance_number,total_shots,saved,conceded
0,0,0,0,1,73,25,48
1,1,1,0,1,55,21,34
2,2,2,0,1,59,29,30
3,3,3,2,1,52,27,25
4,4,3,0,2,63,35,28
5,5,4,0,1,61,23,38
6,6,5,3,1,61,12,49
7,7,6,0,1,62,23,39
8,8,7,0,1,73,35,38
9,9,8,0,1,59,23,36


In [33]:
appearance_stats = appearances_df.groupby('match_id')[['total_shots', 'saved', 'conceded']].sum().reset_index()

matches_df = matches_df.merge(appearance_stats, on='match_id', how='left').fillna(0)
matches_df

Unnamed: 0,match_id,date,opponent,total_shots,saved,conceded
0,0,2025-01-19,Sporting Clube de Cascos de Rolha,73,25,48
1,1,2025-01-26,Fábrica de Azulejos Coentrão,55,21,34
2,2,2025-02-02,Associação Desportiva de Cascos de Rolha,59,29,30
3,3,2025-02-09,Comunidade Judaica de Vila Real,115,62,53
4,4,2025-02-16,Associação Desportiva de Cascos de Rolha,61,23,38
5,5,2025-02-23,Sporting Clube de Cascos de Rolha,61,12,49
6,6,2025-03-02,Sporting Clube de Cascos de Rolha,62,23,39
7,7,2025-03-09,Fábrica de Azulejos Coentrão,73,35,38
8,8,2025-03-16,Comunidade Judaica de Vila Real,59,23,36
9,9,2025-03-23,Comunidade Judaica de Vila Real,60,31,29


In [None]:
matches_df['scored'] = np.maximum(0, rng.normal(loc=matches_df['conceded'].mean(), scale=1.2, size=len(matches_df))).astype(int)
matches_df
matches_df["result"] = "Draw"
matches_df.loc[matches_df["scored"] > matches_df["conceded"], "result"] = "Win"
matches_df.loc[matches_df["scored"] < matches_df["conceded"], "result"] = "Loss"
matches_df

Unnamed: 0,match_id,date,opponent,total_shots,saved,conceded,scored
0,0,2025-01-19,Sporting Clube de Cascos de Rolha,73,25,48,38
1,1,2025-01-26,Fábrica de Azulejos Coentrão,55,21,34,38
2,2,2025-02-02,Associação Desportiva de Cascos de Rolha,59,29,30,39
3,3,2025-02-09,Comunidade Judaica de Vila Real,115,62,53,40
4,4,2025-02-16,Associação Desportiva de Cascos de Rolha,61,23,38,38
5,5,2025-02-23,Sporting Clube de Cascos de Rolha,61,12,49,39
6,6,2025-03-02,Sporting Clube de Cascos de Rolha,62,23,39,41
7,7,2025-03-09,Fábrica de Azulejos Coentrão,73,35,38,39
8,8,2025-03-16,Comunidade Judaica de Vila Real,59,23,36,38
9,9,2025-03-23,Comunidade Judaica de Vila Real,60,31,29,39


In [39]:
player_shot_stats = appearances_df.groupby('player_id')[['total_shots', 'saved', 'conceded']].sum().reset_index()
player_appearances_with_results = appearances_df.merge(matches_df[['match_id', 'result']], on='match_id')
player_result_stats = player_appearances_with_results.groupby('player_id')['result'].value_counts().unstack(fill_value=0).reset_index()
players_stats_df = player_shot_stats.merge(player_result_stats, on='player_id', how='left')
players_df = players_df.merge(players_stats_df, on='player_id', how='left').fillna(0)
players_df

Unnamed: 0,player_id,name,age,base_torso_x,base_torso_y,upper_arm_length,forearm_length,thigh_length,shin_length,head_vertical_offset,...,agility,presence,flexibility,reflexes,total_shots,saved,conceded,Draw,Loss,Win
0,0,Pedro Farelo,29,13,15,3.5,3.0,6.0,4.5,2.5,...,90,85,85,80,565.0,245.0,320.0,1.0,2.0,6.0
1,1,Vasco Palmeirim,23,13,15,3.3,3.1,6.0,4.3,2.4,...,80,90,80,75,0.0,0.0,0.0,0.0,0.0,0.0
2,2,José Saramago,38,11,15,3.6,3.0,6.1,4.5,2.5,...,70,90,90,60,52.0,27.0,25.0,0.0,1.0,0.0
3,3,Eça de Queirós,31,13,14,3.5,2.9,6.2,4.6,2.6,...,75,65,70,70,61.0,12.0,49.0,0.0,1.0,0.0


In [41]:
with pd.ExcelWriter("output.xlsx") as writer:
    shots_df.to_excel(writer, sheet_name="Shots", index=False)
    appearances_df.to_excel(writer, sheet_name="Appearances", index=False)
    players_df.to_excel(writer, sheet_name="Players", index=False)
    matches_df.to_excel(writer, sheet_name="Matches", index=False)