In [1]:
import os, duckdb, pandas as pd

# Database location and source table are overridable through environment variables.
DB_PATH = os.getenv("FRA_DUCKDB_PATH", "../lakehouse/analytics.duckdb")
SRC_TABLE = os.getenv("FRA_TABLE", "player_dataset_predictive")

# Read-only sanity check: list the available tables and count the source rows.
# The handle is released in `finally` so that a failing query (e.g. SRC_TABLE
# does not exist) cannot leave the database file open in this kernel, which
# can block the read-write connection opened by a later cell.
con = duckdb.connect(DB_PATH, read_only=True)
try:
    print(con.execute("SHOW TABLES").fetchdf())
    # SRC_TABLE is interpolated as an identifier (identifiers cannot be bound
    # as query parameters); it must come from trusted configuration.
    print(con.execute(f"SELECT COUNT(*) n FROM {SRC_TABLE}").fetchdf())
finally:
    con.close()

                               name
0                           matches
1                       player_acwr
2                  player_acwr_true
3         player_acwr_true_seasonal
4                    player_acwr_v2
5                    player_acwr_v3
6              player_dataset_final
7         player_dataset_predictive
8              player_form_features
9              player_load_features
10        player_load_features_true
11            player_match_features
12       player_match_features_time
13  player_match_features_true_time
14         player_match_features_v2
15             player_match_minutes
16        player_match_minutes_true
17          player_match_minutes_v2
18               player_match_stats
       n
0  81158


In [2]:
import os
import numpy as np
import pandas as pd
import duckdb

# Database location and source table are overridable through environment variables.
DB_PATH = os.getenv("FRA_DUCKDB_PATH", "../lakehouse/analytics.duckdb")
SRC_TABLE = os.getenv("FRA_TABLE", "player_dataset_predictive")
DST_TABLE = "player_dataset_predictive_v2"


def _past_only_rolling(grouped, window, min_periods, stat):
    """Per-group rolling statistic over the preceding rows, excluding the current row.

    grouped      -- a SeriesGroupBy (one group per player here)
    window       -- rolling window length, forwarded to Series.rolling
    min_periods  -- minimum observations required, forwarded to Series.rolling
    stat         -- name of the rolling aggregation to call ("std", "sum", ...)

    The trailing shift(1) runs inside each group, so the value at a row only
    uses earlier rows of the same group. `transform` returns a Series aligned
    to the original index, so it can be assigned straight back to the frame.
    """
    return grouped.transform(
        lambda s: getattr(s.rolling(window, min_periods=min_periods), stat)().shift(1)
    )


# -------------------------
# Load source table
# -------------------------
# Short-lived read-only connection, always closed: the database file is not
# held open while the features are computed, even if a later step raises.
con = duckdb.connect(DB_PATH, read_only=True)
try:
    # SRC_TABLE is interpolated as an identifier (identifiers cannot be bound
    # as query parameters); it must come from trusted configuration.
    df = con.execute(f"SELECT * FROM {SRC_TABLE}").fetchdf()
finally:
    con.close()

# Basic cleaning / ordering: every shift/rolling/cumulative feature below relies
# on rows being in chronological order within each player.
df["match_date"] = pd.to_datetime(df["match_date"])
df = df.sort_values(["player_id", "match_date", "match_id"]).reset_index(drop=True)

# -------------------------
# A) Rolling std (volatility) on minutes (past-only)
# -------------------------
g = df.groupby("player_id", group_keys=False)

# Std of minutes over last N matches (excluding current match to avoid leakage)
df["minutes_std_last_5_matches"] = _past_only_rolling(g["minutes"], 5, 3, "std")
df["minutes_std_last_10_matches"] = _past_only_rolling(g["minutes"], 10, 5, "std")

# -------------------------
# B) Deltas / ratios (acceleration & shape of load)
# -------------------------
eps = 1e-6  # keeps the ratio denominators non-zero
df["delta_7d_14d"] = df["minutes_last_7d"] - df["minutes_last_14d"]
df["delta_14d_28d"] = df["minutes_last_14d"] - df["minutes_last_28d"]
df["ratio_7d_14d"] = df["minutes_last_7d"] / (df["minutes_last_14d"] + eps)
df["ratio_14d_28d"] = df["minutes_last_14d"] / (df["minutes_last_28d"] + eps)

# ACWR lag + change: previous row's ACWR for the same player, and the difference to it
df["acwr_lag1"] = g["acwr"].shift(1)
df["acwr_change"] = df["acwr"] - df["acwr_lag1"]

# -------------------------
# C) Cumulative season load (fatigue accumulation) — past-only
# -------------------------
gs = df.groupby(["player_id", "season_id"], group_keys=False)

# Minutes accumulated BEFORE current match.
# The shift must happen inside each (player, season) group: shifting the
# concatenated cumsum column would hand the first match of every group the
# previous group's final total.
df["season_minutes_cum"] = gs["minutes"].transform(lambda s: s.cumsum().shift(1))

# Matches played in season BEFORE current match (0 for the first one)
df["season_matches_played"] = gs.cumcount()

# Average minutes so far in season; NaN (not filled below) when no match has been played yet
df["season_avg_minutes"] = df["season_minutes_cum"] / df["season_matches_played"].replace(0, np.nan)

# Optional: short-term season momentum (minutes last 3 matches vs season avg)
df["minutes_last_3_matches"] = _past_only_rolling(g["minutes"], 3, 2, "sum")
df["season_momentum_3v_season_avg"] = df["minutes_last_3_matches"] / (df["season_avg_minutes"] * 3 + eps)

# -------------------------
# Handle NaNs (keep conservative; do NOT leak)
# -------------------------
# For early matches, std/cum features are NaN. Fill with 0 for std/deltas, keep ratios bounded.
fill_zero = [
    "minutes_std_last_5_matches", "minutes_std_last_10_matches",
    "delta_7d_14d", "delta_14d_28d", "acwr_change", "minutes_last_3_matches",
    "season_minutes_cum", "season_matches_played",
]
df[fill_zero] = df[fill_zero].fillna(0)

# Ratios: fill NaN with 1 (neutral) and clip extremes to avoid numerical explosions
ratio_cols = ["ratio_7d_14d", "ratio_14d_28d", "season_momentum_3v_season_avg"]
df[ratio_cols] = (
    df[ratio_cols]
    .replace([np.inf, -np.inf], np.nan)
    .fillna(1.0)
    .clip(0, 5)
)

# ACWR lag: a player's first observation has no previous value; encode it as 0
df["acwr_lag1"] = df["acwr_lag1"].fillna(0)

# Invariant: nothing can have been accumulated before the first match of a player-season.
first_of_season = df["season_matches_played"].eq(0)
assert df.loc[first_of_season, "season_minutes_cum"].eq(0).all(), \
    "season_minutes_cum leaked across player/season groups"

# -------------------------
# Write back to DuckDB as a new table
# -------------------------
# CREATE OR REPLACE swaps the table in a single statement, so a failed write
# does not leave the database without the previous version of DST_TABLE.
con = duckdb.connect(DB_PATH)
try:
    con.register("dfv2", df)
    con.execute(f"CREATE OR REPLACE TABLE {DST_TABLE} AS SELECT * FROM dfv2")
finally:
    con.close()

print("Created:", DST_TABLE, "| rows:", len(df), "| cols:", df.shape[1])

Created: player_dataset_predictive_v2 | rows: 81158 | cols: 36
