In [21]:
import pandas as pd
import numpy as np

In [22]:
rng = np.random.default_rng()

In [23]:
from physicsbasedposes import generate_pose, evaluate_save, pose_to_dataframe

# Matches Generation

In [24]:
def generate_matches(opponents_array, num_games=10, start_date="2025-01-15", freq="W"):
    dates = pd.date_range(start_date, periods=num_games, freq=freq)
    selected_opponents = np.random.choice(opponents_array, size=num_games)
    
    matches = pd.DataFrame({
        "date": dates,
        "opponent": selected_opponents
    })
    
    return matches

In [25]:
opponents = ["Fábrica de Azulejos Coentrão", "Comunidade Judaica de Vila Real", "Sporting Clube de Cascos de Rolha", "Associação Desportiva de Cascos de Rolha"]
matches_df = generate_matches(opponents, num_games=10, freq="W")
matches_df.insert(0, "match_id", np.arange(len(matches_df), dtype=int))
matches_df

Unnamed: 0,match_id,date,opponent
0,0,2025-01-19,Fábrica de Azulejos Coentrão
1,1,2025-01-26,Sporting Clube de Cascos de Rolha
2,2,2025-02-02,Sporting Clube de Cascos de Rolha
3,3,2025-02-09,Sporting Clube de Cascos de Rolha
4,4,2025-02-16,Sporting Clube de Cascos de Rolha
5,5,2025-02-23,Comunidade Judaica de Vila Real
6,6,2025-03-02,Fábrica de Azulejos Coentrão
7,7,2025-03-09,Fábrica de Azulejos Coentrão
8,8,2025-03-16,Associação Desportiva de Cascos de Rolha
9,9,2025-03-23,Sporting Clube de Cascos de Rolha


# Appearances Generation

In [26]:
def generate_appearances(matches_df, starter_id=0, starter_weight=0.7, substitute_chance=0.15, available_players=[0, 1, 2, 3]):
    appearances = []

    for _, match in matches_df.iterrows():
        match_id = match["match_id"]
        players_in_match = []

        num_appearances = 2 if np.random.random() < substitute_chance else 1

        for appearance_num in range(num_appearances):
            available_pool = [p for p in available_players if p not in players_in_match]
            if not available_pool:
                break

            prefer_starter = (np.random.random() < starter_weight) and (starter_id in available_pool)
            if prefer_starter:
                player_id = starter_id
            else:
                non_starter_pool = [p for p in available_pool if p != starter_id]
                choice_pool = non_starter_pool if non_starter_pool else available_pool
                player_id = np.random.choice(choice_pool)

            players_in_match.append(player_id)
            appearances.append({
                "match_id": match_id,
                "player_id": player_id,
                "appearance_number": appearance_num + 1
            })

    appearances_df = pd.DataFrame(appearances)
    appearances_df.insert(0, "appearance_id", np.arange(len(appearances_df), dtype=int))
    return appearances_df

In [27]:
appearances_df = generate_appearances(matches_df, starter_id=0, starter_weight=0.7, substitute_chance=0.15)
appearances_df

Unnamed: 0,appearance_id,match_id,player_id,appearance_number
0,0,0,3,1
1,1,1,0,1
2,2,2,3,1
3,3,3,0,1
4,4,4,0,1
5,5,4,1,2
6,6,5,0,1
7,7,6,0,1
8,8,6,3,2
9,9,7,3,1


In [28]:
dup_check = appearances_df.groupby(["match_id", "player_id"]).size().reset_index(name="count")
dup_check[dup_check["count"] > 1]

Unnamed: 0,match_id,player_id,count


# Shots Generation

In [29]:
players_df = pd.read_excel("Synthetic Data.xlsx")
players_df.head()

Unnamed: 0,player_id,name,age,base_torso_x,base_torso_y,upper_arm_length,forearm_length,thigh_length,shin_length,head_vertical_offset,shoulder_horizontal_offset,shoulder_vertical_offset,hip_horizontal_offset,hip_vertical_offset,agility,presence,flexibility,reflexes
0,0,Pedro Farelo,29,13,15,3.5,3.0,6.0,4.5,2.5,2.0,1.5,1.5,3.0,90,85,85,80
1,1,Vasco Palmeirim,23,13,15,3.3,3.1,6.0,4.3,2.4,1.8,1.4,1.5,2.9,80,90,80,75
2,2,José Saramago,38,11,15,3.6,3.0,6.1,4.5,2.5,2.0,1.7,1.5,3.0,70,90,90,60
3,3,Eça de Queirós,31,13,14,3.5,2.9,6.2,4.6,2.6,2.0,1.5,1.5,3.1,75,65,70,70


In [30]:
def generate_shots(appearances_df, players_df, shots_per_appearance_range=(50, 75)):
    shots = []
    start_time=pd.Timestamp("2026-01-01 00:00")
    
    for _, appearance in appearances_df.iterrows():
        appearance_id = appearance["appearance_id"]
        player_id = appearance["player_id"]
        
        player = players_df[players_df["player_id"] == player_id].iloc[0]
        presence = player["presence"]
        
        x_min = 0 + presence * 0.01
        x_max = 27 - presence * 0.01
        y_min = 0 + presence * 0.01
        y_max = 27 - presence * 0.01
        
        num_shots = np.random.randint(shots_per_appearance_range[0], shots_per_appearance_range[1] + 1)
        
        for _ in range(num_shots):
            shots.append({
                "appearance_id": appearance_id,
                "match_id": appearance["match_id"],
                "player_id": player_id,
                "x": rng.uniform(x_min, x_max),
                "y": rng.uniform(y_min, y_max),
                "velocity": rng.lognormal(mean=4.55, sigma=0.16),
                "timestamp": start_time + pd.to_timedelta(rng.uniform(0, 3600),unit="s")
            })
    
    shots_df = pd.DataFrame(shots)
    shots_df.insert(0, "shot_id", np.arange(len(shots_df), dtype=int))
    return shots_df

In [31]:
shots_df = generate_shots(appearances_df, players_df)
shots_df.head(15)

Unnamed: 0,shot_id,appearance_id,match_id,player_id,x,y,velocity,timestamp
0,0,0,0,3,12.730742,8.376599,134.471464,2026-01-01 00:31:05.794603420
1,1,0,0,3,9.617027,12.992511,72.149458,2026-01-01 00:40:03.574544875
2,2,0,0,3,6.181509,2.028205,77.071999,2026-01-01 00:19:22.456161223
3,3,0,0,3,4.706749,20.487481,84.23582,2026-01-01 00:30:10.774404526
4,4,0,0,3,8.132882,7.840237,103.701418,2026-01-01 00:42:51.724016467
5,5,0,0,3,2.772136,21.13455,75.296091,2026-01-01 00:37:06.469312479
6,6,0,0,3,23.102599,22.698142,125.350533,2026-01-01 00:51:01.751658045
7,7,0,0,3,3.736313,3.049108,76.349362,2026-01-01 00:20:12.106476908
8,8,0,0,3,2.609305,11.876969,76.727215,2026-01-01 00:34:59.918300203
9,9,0,0,3,22.862743,15.017219,89.958608,2026-01-01 00:58:56.835843737


In [32]:
results = []
for _, shot in shots_df.iterrows():
    pose = generate_pose(shot['player_id'], [shot['x'], shot['y']], shot['velocity'])
    
    flat_pose = {}
    for joint_name, coords in pose.items():
        flat_pose[f'{joint_name}_x'] = coords['x']
        flat_pose[f'{joint_name}_y'] = coords['y']
    
    pose_df = pose_to_dataframe(pose)
    eval_result = evaluate_save(pose_df, [shot['x'], shot['y']], radius=1.0)
    
    combined = {**flat_pose, **eval_result}
    results.append(combined)

shots_df = pd.concat([shots_df, pd.DataFrame(results)], axis=1)
shots_df.head(15)

Unnamed: 0,shot_id,appearance_id,match_id,player_id,x,y,velocity,timestamp,torso_x,torso_y,...,right_hip_x,right_hip_y,right_knee_x,right_knee_y,right_foot_x,right_foot_y,nearest_node,distance,radius,saved
0,0,0,0,3,12.730742,8.376599,134.471464,2026-01-01 00:31:05.794603420,12.931829,13.073392,...,14.431829,9.973392,10.093948,5.543631,14.66976,5.072521,left_hip,2.058379,1.0,False
1,1,0,0,3,9.617027,12.992511,72.149458,2026-01-01 00:40:03.574544875,11.964086,13.673938,...,13.464086,10.573938,18.655735,7.184723,14.713916,4.813634,left_hand,0.885813,1.0,True
2,2,0,0,3,6.181509,2.028205,77.071999,2026-01-01 00:19:22.456161223,11.067606,10.626029,...,12.567606,7.526029,18.120118,4.767476,13.939535,2.848434,left_foot,2.069428,1.0,False
3,3,0,0,3,4.706749,20.487481,84.23582,2026-01-01 00:30:10.774404526,10.852358,15.635459,...,12.352358,12.535459,8.63222,7.575562,12.691631,5.411957,left_hand,2.045034,1.0,False
4,4,0,0,3,8.132882,7.840237,103.701418,2026-01-01 00:42:51.724016467,11.963104,12.694357,...,13.463104,9.594357,18.907316,6.627785,14.809856,4.537134,left_foot,1.141231,1.0,False
5,5,0,0,3,2.772136,21.13455,75.296091,2026-01-01 00:37:06.469312479,10.046392,16.01788,...,11.546392,12.91788,7.624693,8.115768,11.561216,5.735896,left_hand,2.261551,1.0,False
6,6,0,0,3,23.102599,22.698142,125.350533,2026-01-01 00:51:01.751658045,14.712623,15.47106,...,16.212623,12.37106,12.552374,7.366805,16.610114,5.200068,right_hand,4.324865,1.0,False
7,7,0,0,3,3.736313,3.049108,76.349362,2026-01-01 00:20:12.106476908,10.359063,10.88262,...,11.859063,7.78262,17.335267,4.875526,13.140924,2.986747,left_foot,2.814523,1.0,False
8,8,0,0,3,2.609305,11.876969,76.727215,2026-01-01 00:34:59.918300203,10.055262,13.378438,...,11.555262,10.278438,16.674272,6.780477,12.843179,4.234357,left_foot,2.044565,1.0,False
9,9,0,0,3,22.862743,15.017219,89.958608,2026-01-01 00:58:56.835843737,15.338762,14.218793,...,16.838762,11.118793,17.908385,17.22583,20.968554,13.791392,right_hand,2.153581,1.0,False


In [33]:
shot_stats = shots_df.groupby('appearance_id')['saved'].agg([
    ('total_shots', 'size'),
    ('saved', 'sum')
]).reset_index()
shot_stats["conceded"] = shot_stats["total_shots"] - shot_stats["saved"]

In [34]:
appearances_df = appearances_df.merge(shot_stats, on='appearance_id', how='left').fillna(0)
appearances_df

Unnamed: 0,appearance_id,match_id,player_id,appearance_number,total_shots,saved,conceded
0,0,0,3,1,75,15,60
1,1,1,0,1,71,30,41
2,2,2,3,1,58,9,49
3,3,3,0,1,58,25,33
4,4,4,0,1,63,27,36
5,5,4,1,2,74,23,51
6,6,5,0,1,59,32,27
7,7,6,0,1,53,21,32
8,8,6,3,2,55,9,46
9,9,7,3,1,55,7,48


In [35]:
appearance_stats = appearances_df.groupby('match_id')[['total_shots', 'saved', 'conceded']].sum().reset_index()

matches_df = matches_df.merge(appearance_stats, on='match_id', how='left').fillna(0)
matches_df

Unnamed: 0,match_id,date,opponent,total_shots,saved,conceded
0,0,2025-01-19,Fábrica de Azulejos Coentrão,75,15,60
1,1,2025-01-26,Sporting Clube de Cascos de Rolha,71,30,41
2,2,2025-02-02,Sporting Clube de Cascos de Rolha,58,9,49
3,3,2025-02-09,Sporting Clube de Cascos de Rolha,58,25,33
4,4,2025-02-16,Sporting Clube de Cascos de Rolha,137,50,87
5,5,2025-02-23,Comunidade Judaica de Vila Real,59,32,27
6,6,2025-03-02,Fábrica de Azulejos Coentrão,108,30,78
7,7,2025-03-09,Fábrica de Azulejos Coentrão,55,7,48
8,8,2025-03-16,Associação Desportiva de Cascos de Rolha,55,26,29
9,9,2025-03-23,Sporting Clube de Cascos de Rolha,61,23,38


In [36]:
matches_df['scored'] = np.maximum(0, rng.normal(loc=matches_df['conceded'].mean(), scale=1.2, size=len(matches_df))).astype(int)
matches_df
matches_df["result"] = "Draw"
matches_df.loc[matches_df["scored"] > matches_df["conceded"], "result"] = "Win"
matches_df.loc[matches_df["scored"] < matches_df["conceded"], "result"] = "Loss"
matches_df

Unnamed: 0,match_id,date,opponent,total_shots,saved,conceded,scored,result
0,0,2025-01-19,Fábrica de Azulejos Coentrão,75,15,60,47,Loss
1,1,2025-01-26,Sporting Clube de Cascos de Rolha,71,30,41,49,Win
2,2,2025-02-02,Sporting Clube de Cascos de Rolha,58,9,49,49,Draw
3,3,2025-02-09,Sporting Clube de Cascos de Rolha,58,25,33,49,Win
4,4,2025-02-16,Sporting Clube de Cascos de Rolha,137,50,87,49,Loss
5,5,2025-02-23,Comunidade Judaica de Vila Real,59,32,27,50,Win
6,6,2025-03-02,Fábrica de Azulejos Coentrão,108,30,78,48,Loss
7,7,2025-03-09,Fábrica de Azulejos Coentrão,55,7,48,48,Draw
8,8,2025-03-16,Associação Desportiva de Cascos de Rolha,55,26,29,49,Win
9,9,2025-03-23,Sporting Clube de Cascos de Rolha,61,23,38,49,Win


In [37]:
player_shot_stats = appearances_df.groupby('player_id')[['total_shots', 'saved', 'conceded']].sum().reset_index()
player_appearances_with_results = appearances_df.merge(matches_df[['match_id', 'result']], on='match_id')
player_result_stats = player_appearances_with_results.groupby('player_id')['result'].value_counts().unstack(fill_value=0).reset_index()
players_stats_df = player_shot_stats.merge(player_result_stats, on='player_id', how='left')
players_df = players_df.merge(players_stats_df, on='player_id', how='left').fillna(0)
players_df

Unnamed: 0,player_id,name,age,base_torso_x,base_torso_y,upper_arm_length,forearm_length,thigh_length,shin_length,head_vertical_offset,...,agility,presence,flexibility,reflexes,total_shots,saved,conceded,Draw,Loss,Win
0,0,Pedro Farelo,29,13,15,3.5,3.0,6.0,4.5,2.5,...,90,85,85,80,420.0,184.0,236.0,0.0,2.0,5.0
1,1,Vasco Palmeirim,23,13,15,3.3,3.1,6.0,4.3,2.4,...,80,90,80,75,74.0,23.0,51.0,0.0,1.0,0.0
2,2,José Saramago,38,11,15,3.6,3.0,6.1,4.5,2.5,...,70,90,90,60,0.0,0.0,0.0,0.0,0.0,0.0
3,3,Eça de Queirós,31,13,14,3.5,2.9,6.2,4.6,2.6,...,75,65,70,70,243.0,40.0,203.0,2.0,2.0,0.0


In [38]:
with pd.ExcelWriter("output.xlsx") as writer:
    shots_df.to_excel(writer, sheet_name="Shots", index=False)
    appearances_df.to_excel(writer, sheet_name="Appearances", index=False)
    players_df.to_excel(writer, sheet_name="Players", index=False)
    matches_df.to_excel(writer, sheet_name="Matches", index=False)

In [39]:
from pathlib import Path
import pandas as pd
import numpy as np

# Assumes matches_df / players_df exist from earlier cells
season_start = pd.to_datetime(matches_df["date"]).min()
season_end = pd.to_datetime(matches_df["date"]).max() + pd.Timedelta(days=7)

# Zonas principais onde podem ocorrer lesões em guarda-redes
BODY_PARTS = [
    "Head", "Shoulder", "Elbow", "Wrist/Hand",
    "Back", "Hip/Groin", "Hamstring", "Knee", "Calf", "Ankle/Foot",
]

# Tipos de lesão mais específicos, ligados a zonas prováveis
INJURY_TYPES = [
    "Concussion",
    "Shoulder sprain",
    "Elbow sprain",
    "Wrist sprain",
    "Back pain",
    "Hip/Groin strain",
    "Hamstring strain",
    "Knee ligament",
    "Calf strain",
    "Ankle sprain",
]

# Para cada tipo de lesão, limita as partes do corpo
INJURY_BODY_CONSTRAINTS = {
    "Concussion": ["Head"],
    "Shoulder sprain": ["Shoulder"],
    "Elbow sprain": ["Elbow"],
    "Wrist sprain": ["Wrist/Hand"],
    "Back pain": ["Back"],
    "Hip/Groin strain": ["Hip/Groin"],
    "Hamstring strain": ["Hamstring"],
    "Knee ligament": ["Knee"],
    "Calf strain": ["Calf"],
    "Ankle sprain": ["Ankle/Foot"],
}

def sample_duration_days(rng: np.random.Generator, severity: str) -> int:
    if severity == "short":
        # 3–14 dias
        return int(rng.integers(3, 15))
    # long: 21–180 dias
    return int(rng.integers(21, 181))

def generate_injuries(matches_df: pd.DataFrame, players_df: pd.DataFrame, rng: np.random.Generator) -> pd.DataFrame:
    player_ids = sorted(players_df["player_id"].unique().tolist())
    injuries = []
    next_id = 0

    # Taxa base (por época) — ligeiramente mais alta para povoar melhor o dataset
    for pid in player_ids:
        # número de lesões ~ Poisson (média 2.0)
        n = int(rng.poisson(lam=2.0))
        if n == 0:
            continue

        player_injuries = []
        for _ in range(n):
            # Escolher primeiro o tipo de lesão
            injury_type = str(rng.choice(INJURY_TYPES))
            allowed_parts = INJURY_BODY_CONSTRAINTS.get(injury_type, BODY_PARTS)
            body_part = str(rng.choice(allowed_parts))

            # Severidade: mais curtas que longas
            severity = "short" if rng.random() < 0.75 else "long"
            duration = sample_duration_days(rng, severity)

            # Data de início aleatória na época (garantindo espaço para duração)
            max_start = season_end - pd.Timedelta(days=duration)
            if max_start <= season_start:
                start = season_start
            else:
                start = season_start + pd.to_timedelta(rng.uniform(0, (max_start - season_start).days), unit="D")
            start = pd.to_datetime(start).normalize()
            end = start + pd.Timedelta(days=duration)

            # Recorrência: se já houve lesão na mesma zona, probabilidade extra
            recurrent = False
            recurrence_of = None
            same_part = [inj for inj in player_injuries if inj["body_part"] == body_part]
            if same_part and rng.random() < 0.35:
                recurrent = True
                # escolher uma anterior para ligar
                recurrence_of = int(rng.choice([inj["injury_id"] for inj in same_part]))
                # para ficar mais realista, puxa a data para perto da lesão anterior
                prev = next(inj for inj in player_injuries if inj["injury_id"] == recurrence_of)
                prev_end = prev["end_date"]
                gap = int(rng.integers(7, 61))  # 1–8 semanas
                start = (prev_end + pd.Timedelta(days=gap)).normalize()
                end = start + pd.Timedelta(days=duration)

            player_injuries.append({
                "injury_id": next_id,
                "player_id": int(pid),
                "injury_type": injury_type,
                "body_part": body_part,
                "severity": severity,
                "start_date": start,
                "end_date": end,
                "duration_days": int(duration),
                "recurrent": bool(recurrent),
                "recurrence_of": recurrence_of,
            })
            next_id += 1

        # Se este jogador tiver várias lesões mas nenhuma marcada como recorrente,
        # força pelo menos uma recidiva na mesma parte do corpo
        if len(player_injuries) >= 2 and not any(inj["recurrent"] for inj in player_injuries):
            base_inj = rng.choice(player_injuries)
            severity = "short" if rng.random() < 0.75 else "long"
            duration = sample_duration_days(rng, severity)
            gap = int(rng.integers(7, 61))  # 1–8 semanas depois da anterior
            start = (base_inj["end_date"] + pd.Timedelta(days=gap)).normalize()
            end = start + pd.Timedelta(days=duration)
            player_injuries.append({
                "injury_id": next_id,
                "player_id": base_inj["player_id"],
                "injury_type": base_inj["injury_type"],
                "body_part": base_inj["body_part"],
                "severity": severity,
                "start_date": start,
                "end_date": end,
                "duration_days": int(duration),
                "recurrent": True,
                "recurrence_of": int(base_inj["injury_id"]),
            })
            next_id += 1

        # Evitar sobreposições (empurra para a frente quando necessário)
        player_injuries = sorted(player_injuries, key=lambda x: x["start_date"])
        for i in range(1, len(player_injuries)):
            prev = player_injuries[i - 1]
            cur = player_injuries[i]
            if cur["start_date"] <= prev["end_date"]:
                shift = (prev["end_date"] - cur["start_date"]) + pd.Timedelta(days=1)
                cur["start_date"] = (cur["start_date"] + shift).normalize()
                cur["end_date"] = cur["start_date"] + pd.Timedelta(days=cur["duration_days"])

        injuries.extend(player_injuries)

    injuries_df = pd.DataFrame(injuries)
    if not injuries_df.empty:
        injuries_df = injuries_df.sort_values(["player_id", "start_date"]).reset_index(drop=True)
    return injuries_df

injuries_df = generate_injuries(matches_df, players_df, rng)
injuries_df.head(20)

# Guardar em data/output.xlsx (sheet Injuries)
output_path = Path("data") / "output.xlsx"
if output_path.exists():
    with pd.ExcelWriter(output_path, engine="openpyxl", mode="a", if_sheet_exists="replace") as writer:
        injuries_df.to_excel(writer, sheet_name="Injuries", index=False)
else:
    output_path.parent.mkdir(parents=True, exist_ok=True)
    with pd.ExcelWriter(output_path, engine="openpyxl", mode="w") as writer:
        injuries_df.to_excel(writer, sheet_name="Injuries", index=False)

print(f"Injuries written to: {output_path} (rows={len(injuries_df)})")

Injuries written to: data\output.xlsx (rows=12)


In [40]:
print("oi malandro")

oi malandro
