In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

In [2]:
from dataclasses import dataclass
rng = np.random.default_rng()

# Shots Table

In [3]:
df = pd.read_excel("Synthetic Data.xlsx", sheet_name=1)
df.head()

Unnamed: 0,type,x,y,isgoal,appearance
0,Jump,0,7,False,0
1,Standing,25,25,True,0
2,Dive,18,7,True,0
3,Wing,11,7,True,0
4,Lob,9,1,False,2


In [4]:
def generate_synthetic(df, n_rows, schema):
    synthetic = pd.DataFrame(index=range(n_rows))
    start_time=pd.Timestamp("2026-01-01 00:00")

    for col in schema:
        synthetic[col.name] = rng.integers(
            size=n_rows,
            **col.params
        )
    
    for col in df.select_dtypes(exclude=np.number):
        freq = df[col].value_counts(normalize=True)
        synthetic[col] = rng.choice(
            freq.index,
            size=n_rows,
            p=freq.values
        )

    synthetic["timestamp"] = start_time + pd.to_timedelta(
        rng.uniform(0, 3600, size=n_rows),
        unit="s"
    )
    
    return synthetic

In [5]:
@dataclass
class ColumnDescription:
    name: str
    params: dict

schema = [
    ColumnDescription("appearance", {"low": 0, "high": 25}),
    ColumnDescription("x", {"low": 0, "high": 27}),
    ColumnDescription("y", {"low": 0, "high": 27}),
]


In [6]:
synth = generate_synthetic(df, n_rows=500, schema=schema)
synth["time"] = synth["timestamp"].dt.time
#synth = synth.drop(["timestamp"], axis=1)
synth = synth.sort_values(by=["appearance", "time"]).reset_index(drop=True)
synth.head(15)

Unnamed: 0,appearance,x,y,type,isgoal,timestamp,time
0,0,21,25,Underarm,True,2026-01-01 00:00:00.855832124,00:00:00.855832
1,0,6,17,Jump,False,2026-01-01 00:13:50.806186017,00:13:50.806186
2,0,14,19,Spin,False,2026-01-01 00:13:52.381313426,00:13:52.381313
3,0,0,22,Bounce,True,2026-01-01 00:19:54.970122554,00:19:54.970122
4,0,7,23,Hip,False,2026-01-01 00:20:59.330383426,00:20:59.330383
5,0,19,10,Underarm,False,2026-01-01 00:29:56.092539534,00:29:56.092539
6,0,0,13,Standing,False,2026-01-01 00:30:38.244491275,00:30:38.244491
7,0,1,21,Wing,False,2026-01-01 00:30:57.052378932,00:30:57.052378
8,0,4,2,Underarm,False,2026-01-01 00:31:48.422477469,00:31:48.422477
9,0,3,12,Standing,True,2026-01-01 00:32:30.671879412,00:32:30.671879


In [7]:
synth["appearance"].value_counts().sort_values(ascending=False)

appearance
7     27
9     25
18    24
11    23
15    23
13    23
10    22
19    22
0     21
24    21
21    21
2     21
23    21
5     21
3     20
12    20
17    20
14    18
8     18
4     18
1     17
6     14
16    14
20    13
22    13
Name: count, dtype: int64

# Appearances Table

In [8]:
app_df = synth.groupby("appearance")["isgoal"].sum().reset_index(name="total_goals")
app_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   appearance   25 non-null     int64
 1   total_goals  25 non-null     int64
dtypes: int64(2)
memory usage: 532.0 bytes


In [9]:
n_players = 4


player_ids = np.arange(0, n_players)
player_probs = np.array([0.5, 0.25, 0.15, 0.1], dtype=float)
player_probs = player_probs / player_probs.sum()
app_df["player_id"] = rng.choice(player_ids, size=len(app_df), p=player_probs)


# atribuir posição a cada jogador (0-1 GR, 2-3 avançados)
player_positions = {
    0: "GK",
    1: "GK",
    2: "ST",
    3: "ST",
}
app_df["position"] = app_df["player_id"].map(player_positions)


app_df["match_id"] = rng.integers(0, 23, size=len(app_df))

In [10]:
app_df["player_id"].value_counts()

player_id
0    13
1     5
2     4
3     3
Name: count, dtype: int64

In [11]:
app_df.head(10)

Unnamed: 0,appearance,total_goals,player_id,position,match_id
0,0,6,2,ST,15
1,1,8,0,GK,21
2,2,10,0,GK,18
3,3,10,1,GK,20
4,4,8,0,GK,15
5,5,9,3,ST,16
6,6,5,2,ST,18
7,7,9,0,GK,0
8,8,11,0,GK,10
9,9,8,2,ST,19


# Players Table

In [None]:
player_df = app_df.groupby(["player_id", "position"])["total_goals"].sum().reset_index(name="total_goals")


# gerar stats físicas e mentais específicas por posição


gk_template = {
    "reflexes": (70, 95),
    "handling": (65, 90),
    "aerial_command": (70, 95),
    "one_v_one": (70, 95),
    "communication": (60, 90),
}


st_template = {
    "finishing": (70, 95),
    "off_ball": (65, 90),
    "pace": (70, 95),
    "strength": (60, 90),
    "pressing": (60, 90),
}


for idx, row in player_df.iterrows():
    pos = row["position"]
    if pos == "GK":
        template = gk_template
    else:
        template = st_template
    for stat_name, (low, high) in template.items():
        player_df.loc[idx, stat_name] = rng.integers(low, high)


player_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   player_id      4 non-null      int64  
 1   position       4 non-null      object 
 2   total_goals    4 non-null      int64  
 3   shot_stopping  4 non-null      float64
 4   distribution   4 non-null      float64
 5   aerial         4 non-null      float64
 6   sweeper        4 non-null      float64
 7   one_v_one      4 non-null      float64
dtypes: float64(5), int64(2), object(1)
memory usage: 388.0+ bytes


In [13]:
player_df.head()

Unnamed: 0,player_id,position,total_goals,shot_stopping,distribution,aerial,sweeper,one_v_one
0,0,GK,116,83.0,77.0,66.0,70.0,71.0
1,1,GK,39,86.0,68.0,87.0,81.0,85.0
2,2,ST,27,57.0,61.0,64.0,48.0,74.0
3,3,ST,18,57.0,72.0,52.0,55.0,76.0


# Matches Table

In [14]:
match_df = app_df.groupby("match_id")["total_goals"].sum().reset_index(name="conceded")
match_df.head(10)

Unnamed: 0,match_id,conceded
0,0,9
1,1,12
2,2,4
3,3,8
4,4,9
5,10,11
6,11,9
7,12,20
8,13,11
9,15,14


In [15]:
with pd.ExcelWriter('output.xlsx') as writer:
    synth.to_excel(writer, sheet_name='Shots', index=False)
    app_df.to_excel(writer, sheet_name='Appearances', index=False)
    player_df.to_excel(writer, sheet_name='Players', index=False)
    match_df.to_excel(writer, sheet_name='Matches', index=False)