In [1]:
import pandas as pd
import numpy as np

In [45]:
from dataclasses import dataclass
# Fixed seed so that Restart & Run All reproduces the same synthetic tables.
# Change SEED (or set it to None for fresh entropy) to draw a different sample.
SEED = 42
rng = np.random.default_rng(SEED)

# Shots Table

In [3]:
# Load the worksheet at position 1 (the second sheet) of the source workbook;
# its non-numeric columns are the ones resampled by generate_synthetic.
df = pd.read_excel(io="Synthetic Data.xlsx", sheet_name=1)
df.head(n=5)

Unnamed: 0,type,x,y,isgoal,appearance
0,Jump,0,7,False,0
1,Standing,25,25,True,0
2,Dive,18,7,True,0
3,Wing,11,7,True,0
4,Lob,9,1,False,2


In [None]:
def generate_synthetic(
    df,
    n_rows,
    schema,
    *,
    start_time=pd.Timestamp("2026-01-01 00:00"),
    window_seconds=3600,
    generator=None,
):
    """Build a synthetic frame of ``n_rows`` rows modelled on ``df``.

    Three kinds of columns are produced, in this order:

    1. One integer column per ``schema`` entry, drawn with
       ``Generator.integers(size=n_rows, **entry.params)``.
    2. One column per non-numeric column of ``df`` (bool columns count as
       non-numeric here), sampled from that column's observed values with
       probabilities equal to their relative frequencies.
    3. A ``timestamp`` column: ``start_time`` plus a uniform random offset in
       ``[0, window_seconds)`` seconds.

    Parameters
    ----------
    df : pandas.DataFrame
        Template frame whose non-numeric columns are resampled.
    n_rows : int
        Number of synthetic rows to generate.
    schema : iterable
        Objects exposing ``.name`` (output column name) and ``.params``
        (keyword arguments for ``Generator.integers``).
    start_time : timestamp-like, keyword-only
        Lower bound of the generated timestamps.
    window_seconds : float, keyword-only
        Width of the timestamp window, in seconds.
    generator : numpy.random.Generator, optional, keyword-only
        Random source. When omitted, the notebook-level ``rng`` is used, which
        matches the previous behaviour.

    Returns
    -------
    pandas.DataFrame
        Frame indexed ``0..n_rows-1`` with the columns described above.
    """
    # Looking up the global only when no generator is passed keeps the function
    # usable (and testable) without relying on state from another cell.
    gen = rng if generator is None else generator

    synthetic = pd.DataFrame(index=range(n_rows))

    for col in schema:
        synthetic[col.name] = gen.integers(size=n_rows, **col.params)

    for col in df.select_dtypes(exclude=np.number):
        # value_counts drops missing values, so the frequencies sum to 1.
        freq = df[col].value_counts(normalize=True)
        if freq.empty:
            # Nothing observed to sample from: keep the column, leave it empty
            # instead of failing inside Generator.choice.
            synthetic[col] = np.nan
            continue
        synthetic[col] = gen.choice(
            freq.index.to_numpy(),
            size=n_rows,
            p=freq.to_numpy(),
        )

    offsets = pd.to_timedelta(
        gen.uniform(0, window_seconds, size=n_rows),
        unit="s",
    )
    synthetic["timestamp"] = pd.Timestamp(start_time) + offsets

    return synthetic

In [46]:
@dataclass
class ColumnDescription:
    """Describes one random integer column of the synthetic table.

    ``generate_synthetic`` creates the column ``name`` by calling
    ``rng.integers(size=n_rows, **params)``.
    """
    # Name of the column to create in the synthetic frame.
    name: str
    # Keyword arguments forwarded to ``rng.integers``; the schema in this
    # notebook passes ``low`` and ``high``.
    params: dict

# Integer columns to synthesise. Every column starts at 0; the second element
# of each pair is the `high` bound handed to rng.integers.
schema = [
    ColumnDescription(column, {"low": 0, "high": upper})
    for column, upper in (("appearance", 25), ("x", 27), ("y", 27))
]


In [48]:
# Generate 500 shots, derive a time-of-day column from the timestamp (the
# timestamp itself is kept), then order the rows by appearance and time.
synth = (
    generate_synthetic(df, n_rows=500, schema=schema)
    .assign(time=lambda shots: shots["timestamp"].dt.time)
    .sort_values(by=["appearance", "time"])
    .reset_index(drop=True)
)
synth.head(15)

TypeError: 'ColumnDescription' object is not subscriptable

In [34]:
# Shots per appearance; value_counts() already returns the counts in
# descending order, so no extra sort is needed.
synth["appearance"].value_counts()

appearance
1     33
0     28
17    28
5     26
3     25
16    24
20    24
24    22
7     22
23    21
19    20
6     20
4     18
21    18
18    18
15    18
10    18
22    17
2     17
8     16
14    16
11    15
13    14
9     13
12     9
Name: count, dtype: int64

# Appearances Table

In [35]:
# One row per appearance: the number of shots flagged as goals.
app_df = synth.groupby("appearance", as_index=False).agg(
    total_goals=("isgoal", "sum")
)
app_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25 entries, 0 to 24
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   appearance   25 non-null     int64
 1   total_goals  25 non-null     int64
dtypes: int64(2)
memory usage: 532.0 bytes


In [36]:
# Give every appearance a player, drawn with skewed weights so that player 0
# is the most likely pick.
n_players = 4
player_ids = np.arange(n_players)
player_probs = np.array([0.5, 0.25, 0.15, 0.1], dtype=float)
player_probs /= player_probs.sum()  # guard against weights that do not sum to 1
n_appearances = len(app_df)
app_df["player_id"] = rng.choice(a=player_ids, size=n_appearances, p=player_probs)

# Match ids are drawn uniformly from 0..22 (`high` is exclusive).
app_df["match_id"] = rng.integers(low=0, high=23, size=n_appearances)

In [37]:
# Sanity check: number of appearances assigned to each player.
app_df["player_id"].value_counts()

player_id
0    16
1     6
2     2
3     1
Name: count, dtype: int64

In [38]:
# Preview the appearances table with its player_id and match_id columns.
app_df.head(10)

Unnamed: 0,appearance,total_goals,player_id,match_id
0,0,10,0,10
1,1,16,0,19
2,2,9,1,21
3,3,13,1,18
4,4,6,0,7
5,5,13,0,6
6,6,9,0,5
7,7,11,0,13
8,8,7,0,7
9,9,2,0,12


# Players Table

In [39]:
# One row per player: goals summed over all of that player's appearances.
player_df = app_df.groupby("player_id", as_index=False).agg(
    total_goals=("total_goals", "sum")
)
player_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype
---  ------       --------------  -----
 0   player_id    4 non-null      int64
 1   total_goals  4 non-null      int64
dtypes: int64(2)
memory usage: 196.0 bytes


In [40]:
# Preview the per-player goal totals.
player_df.head()

Unnamed: 0,player_id,total_goals
0,0,127
1,1,54
2,2,15
3,3,10


# Matches Table

In [41]:
# One row per match id that has at least one appearance: the goals of those
# appearances summed into `conceded`.
match_df = app_df.groupby("match_id", as_index=False).agg(
    conceded=("total_goals", "sum")
)
match_df.head(10)

Unnamed: 0,match_id,conceded
0,0,16
1,1,6
2,3,13
3,5,22
4,6,23
5,7,13
6,8,6
7,10,19
8,12,11
9,13,17


In [43]:
# Sheet name -> frame, listed in the order the sheets are written.
sheets = {
    'Shots': synth,
    'Appearances': app_df,
    'Players': player_df,
    'Matches': match_df,
}

# Write all four tables into a single workbook, without the index column.
with pd.ExcelWriter('output.xlsx') as writer:
    for sheet_name, frame in sheets.items():
        frame.to_excel(writer, sheet_name=sheet_name, index=False)