In [208]:
import os, glob
import numpy as np
import pandas as pd

# --- Run configuration -------------------------------------------------------
SEED = 42            # seeds NumPy's global RNG below; also passed to the model params later
ROLL_WIN = 3         # rolling-window length (in gameweeks) for the rolling features
FETCH_UP_TO_GW = 9   # we have GWs 1..9; we'll predict GW 10

# --- Data locations (relative to the notebook's working directory) -----------
RAW_DIR = "../data/raw"
PROC_DIR = "../data/processed"

np.random.seed(SEED)

# Create both output folders up front; exist_ok makes re-running the cell harmless.
for _data_dir in (RAW_DIR, PROC_DIR):
    os.makedirs(_data_dir, exist_ok=True)
del _data_dir  # keep the loop variable out of the notebook namespace

print("✅ Setup done")


✅ Setup done


In [209]:
import time, requests

# Download the per-player "live" stats of every gameweek 1..FETCH_UP_TO_GW,
# one HTTP request per gameweek, and stack them into a single long table.
all_gws = []
for gw in range(1, FETCH_UP_TO_GW + 1):
    resp = requests.get(
        f"https://fantasy.premierleague.com/api/event/{gw}/live/",
        timeout=30,
    )
    resp.raise_for_status()  # stop on HTTP errors instead of parsing an error body

    # json_normalize flattens the nested "stats" dict into "stats.<name>" columns.
    gw_df = pd.json_normalize(resp.json()["elements"]).assign(gameweek=gw)
    all_gws.append(gw_df)
    print(f"GW{gw}: rows={len(gw_df)}")
    time.sleep(0.3)  # short pause between requests

df_live = pd.concat(all_gws, ignore_index=True)

# Keep only the identifiers and the flattened stats, drop the "stats." prefix,
# and expose the element id under the name the later cells merge on.
stat_cols = [col for col in df_live.columns if col.startswith("stats.")]
df_live = (
    df_live[["id", "gameweek"] + stat_cols]
    .rename(columns=lambda col: col.replace("stats.", ""))
    .rename(columns={"id": "player_id"})
)
df_live.to_csv(f"{RAW_DIR}/fpl_player_gameweeks.csv", index=False)

print("✅ Saved raw:", df_live.shape)


GW1: rows=690
GW2: rows=705
GW3: rows=712
GW4: rows=740
GW5: rows=741
GW6: rows=742
GW7: rows=743
GW8: rows=745
GW9: rows=746
✅ Saved raw: (6564, 30)


In [210]:
# Player / team metadata from the bootstrap endpoint.
# Use the same timeout and HTTP status check as the per-gameweek fetch: without
# them a stalled connection blocks the notebook indefinitely and an error
# response only shows up later as a JSON/KeyError while parsing.
boot_resp = requests.get("https://fantasy.premierleague.com/api/bootstrap-static/", timeout=30)
boot_resp.raise_for_status()
boot = boot_resp.json()

# One lookup table per entity, renamed to the column names used downstream.
ele = pd.json_normalize(boot["elements"])[["id","web_name","team","element_type","now_cost"]]
ele = ele.rename(columns={"id":"player_id","web_name":"player_name","team":"team_id","element_type":"position_id"})
teams = pd.json_normalize(boot["teams"])[["id","name"]].rename(columns={"id":"team_id","name":"team_name"})
pos_map = {1:"GKP", 2:"DEF", 3:"MID", 4:"FWD"}

# Attach metadata to the per-gameweek rows. Left joins keep every stats row even
# when a player or team id has no match (the metadata columns are then NaN).
# NOTE(review): the join key is player_id only, so now_cost/team are the values
# from this bootstrap snapshot on every gameweek row — presumably not the
# historical price/team; confirm this is intended.
df = pd.read_csv(f"{RAW_DIR}/fpl_player_gameweeks.csv")
df = df.merge(ele, on="player_id", how="left").merge(teams, on="team_id", how="left")
df["position_name"] = df["position_id"].map(pos_map)

# Coerce numeric: values that cannot be parsed become NaN (errors="coerce").
# Columns missing from this dataset are skipped rather than raising.
NUMS = [
    "minutes","goals_scored","assists","clean_sheets","goals_conceded","own_goals",
    "penalties_saved","penalties_missed","yellow_cards","red_cards","saves","bonus",
    "bps","total_points","starts","clearances_blocks_interceptions","recoveries","tackles",
    "defensive_contribution","expected_goals","expected_assists","expected_goal_involvements",
    "expected_goals_conceded","influence","creativity","threat","ict_index","now_cost"
]
for c in NUMS:
    if c in df.columns:
        df[c] = pd.to_numeric(df[c], errors="coerce")

# Chronological order within each player; the shift/rolling/diff features
# computed afterwards depend on this row order.
df = df.sort_values(["player_id","gameweek"]).reset_index(drop=True)
print("✅ Meta merged:", df.shape)


✅ Meta merged: (6564, 36)


In [211]:
# Next GW points label (for evaluation only; not available for GW9 rows)
# shift(-1) returns the player's next *row*. Keep it only when that row really
# is gameweek + 1, so a player with a missing gameweek is never labelled with a
# later week's points; such rows get NaN and drop out of `train` below.
by_player = df.groupby("player_id")
next_gw = by_player["gameweek"].shift(-1)
df["event_points_next"] = by_player["total_points"].shift(-1).where(next_gw == df["gameweek"] + 1)

g = df.groupby("player_id", group_keys=False)

# Rolling-window features over the last ROLL_WIN gameweeks *including* the
# current row. min_periods=1 means early gameweeks use whatever history exists.
# NOTE: the "r3_" prefix is hard-coded in the column names and in `priority`
# below, so the names are only accurate while ROLL_WIN == 3.
roll_mean = ["minutes","expected_goals","expected_assists","expected_goal_involvements",
             "ict_index","threat","creativity","influence"]
roll_sum  = ["total_points","goals_scored","assists","saves"]

for c in roll_mean:
    if c in df.columns:
        # The grouped rolling result is indexed by (player_id, original index);
        # dropping level 0 realigns it with df's own index for assignment.
        df[f"r3_{c}_mean"] = g[c].rolling(ROLL_WIN, min_periods=1).mean().reset_index(level=0, drop=True)
for c in roll_sum:
    if c in df.columns:
        df[f"r3_{c}_sum"] = g[c].rolling(ROLL_WIN, min_periods=1).sum().reset_index(level=0, drop=True)

# Gameweek-over-gameweek change; NaN on each player's first row.
for c in ["minutes","total_points","expected_goals","expected_assists",
          "expected_goal_involvements","ict_index","threat","creativity","influence"]:
    if c in df.columns:
        df[f"delta_{c}"] = g[c].diff(1)

# Flag: averaged at least 45 minutes over the window.
# The previous `df.get("r3_minutes_mean", 0) >= 45` produced a plain Python bool
# when the column was absent, and `.astype` then raised AttributeError; fall
# back to an all-zero flag instead.
if "r3_minutes_mean" in df.columns:
    df["r3_minutes_mean_ge_45"] = (df["r3_minutes_mean"] >= 45).astype(int)
else:
    df["r3_minutes_mean_ge_45"] = 0

ID_COLS = ["player_id","player_name","position_name","team_name","gameweek","now_cost"]
ID_COLS = [c for c in ID_COLS if c in df.columns]

# Candidate features
feat_cols_all = [c for c in df.columns if c.startswith("r3_") or c.startswith("delta_") or c=="r3_minutes_mean_ge_45"]

# Choose a stable, high-signal subset (and lock order)
priority = [
    "r3_minutes_mean","r3_total_points_sum",
    "r3_expected_goal_involvements_mean","r3_expected_goals_mean","r3_expected_assists_mean",
    "r3_threat_mean","r3_creativity_mean","r3_influence_mean",
    "r3_goals_scored_sum","r3_assists_sum","r3_saves_sum",
    "delta_expected_goal_involvements","delta_expected_goals","delta_expected_assists",
    "delta_threat","delta_creativity","delta_influence",
    "delta_minutes","delta_total_points",
    "r3_minutes_mean_ge_45"
]
FEAT_COLS = [c for c in priority if c in df.columns]
assert len(FEAT_COLS) > 0, "No features found — check earlier cells."

# Save locked feature order for reuse
pd.Series(FEAT_COLS).to_csv(f"{PROC_DIR}/feature_list.csv", index=False, header=False)

full = df[ID_COLS + ["event_points_next"] + FEAT_COLS].copy()
full.to_csv(f"{PROC_DIR}/features_full.csv", index=False)

# Rows with a label form the training table; the latest gameweek (which has no
# label yet) is the prediction table.
CURRENT_GW = int(full["gameweek"].max())            # 9
train = full.dropna(subset=["event_points_next"]).copy()   # 1..8 with labels
pred  = full[full["gameweek"] == CURRENT_GW].copy()       # 9 -> predict GW10

train.to_csv(f"{PROC_DIR}/features_train.csv", index=False)
pred.to_csv(f"{PROC_DIR}/features_predict_gw{CURRENT_GW}.csv", index=False)

print("✅ Built features:",
      "\n  FEAT_COLS:", FEAT_COLS,
      "\n  train:", train.shape, "predict:", pred.shape)


✅ Built features: 
  FEAT_COLS: ['r3_minutes_mean', 'r3_total_points_sum', 'r3_expected_goal_involvements_mean', 'r3_expected_goals_mean', 'r3_expected_assists_mean', 'r3_threat_mean', 'r3_creativity_mean', 'r3_influence_mean', 'r3_goals_scored_sum', 'r3_assists_sum', 'r3_saves_sum', 'delta_expected_goal_involvements', 'delta_expected_goals', 'delta_expected_assists', 'delta_threat', 'delta_creativity', 'delta_influence', 'delta_minutes', 'delta_total_points', 'r3_minutes_mean_ge_45'] 
  train: (5818, 27) predict: (746, 27)


In [212]:
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error

def _as_float(values):
    """Coerce a DataFrame or Series to float; unparseable or missing cells become 0."""
    if isinstance(values, pd.DataFrame):
        numeric = values.apply(pd.to_numeric, errors="coerce")
    else:
        numeric = pd.to_numeric(values, errors="coerce")
    return numeric.fillna(0).astype(float)


# Reload the training table and the locked feature order from disk.
train = pd.read_csv(f"{PROC_DIR}/features_train.csv")
feat_list = pd.read_csv(f"{PROC_DIR}/feature_list.csv", header=None)[0].tolist()

# Time-based split: the latest labelled gameweek is held out for validation.
max_lab_gw = int(train["gameweek"].max())  # 8
valid_mask = train["gameweek"].eq(max_lab_gw)  # 8
train_mask = train["gameweek"].lt(max_lab_gw)  # 1..7

# minutes filter (use recent minutes mean if present; else keep all)
# NOTE(review): the filter is applied to the training rows only; the validation
# rows below are unfiltered, so the two sets cover different player populations.
train_filt = train.loc[train_mask].copy()
if "r3_minutes_mean" in train_filt.columns:
    enough_minutes = train_filt["r3_minutes_mean"] >= 30
    train_filt = train_filt.loc[enough_minutes].copy()

valid_rows = train.loc[valid_mask]

X_train = _as_float(train_filt[feat_list])
y_train = _as_float(train_filt["event_points_next"])

X_valid = _as_float(valid_rows[feat_list])
y_valid = _as_float(valid_rows["event_points_next"])

print(f"Train={X_train.shape} Valid={X_valid.shape}",
      f"| y_train mean={y_train.mean():.2f} nonzero={((y_train>0).mean()):.3f}",
      f"| y_valid mean={y_valid.mean():.2f} nonzero={((y_valid>0).mean()):.3f}")

dtrain = lgb.Dataset(X_train, label=y_train, feature_name=feat_list)
dvalid = lgb.Dataset(X_valid, label=y_valid, feature_name=feat_list)

# Gradient-boosted regression on next-gameweek points.
params = {
    "objective": "regression",
    "metric": ["rmse","l1"],
    "learning_rate": 0.05,
    "num_leaves": 64,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.9,
    "bagging_freq": 5,
    "min_data_in_leaf": 20,
    "seed": SEED,
    "verbose": -1
}

# Stop after 100 rounds without improvement; log the metrics every 100 rounds.
callbacks = [early_stopping(100), log_evaluation(100)]

model = lgb.train(
    params=params,
    train_set=dtrain,
    num_boost_round=3000,
    valid_sets=[dtrain, dvalid],
    valid_names=["train","valid"],
    callbacks=callbacks,
)

# Score the held-out gameweek at the best iteration found by early stopping.
y_pred = model.predict(X_valid, num_iteration=model.best_iteration)
rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
mae = mean_absolute_error(y_valid, y_pred)
print(f"\n✅ Holdout GW {max_lab_gw}→{max_lab_gw+1}  MAE={mae:.3f} | RMSE={rmse:.3f} | mean(pred)={y_pred.mean():.2f}")

# Basic importances to ensure model actually used features
gain_by_feature = {
    "feature": model.feature_name(),
    "gain": model.feature_importance(importance_type="gain"),
}
imp = pd.DataFrame(gain_by_feature).sort_values("gain", ascending=False)
print("\nTop importances:")
print(imp.head(10).to_string(index=False))


Train=(1807, 20) Valid=(745, 20) | y_train mean=2.85 nonzero=0.824 | y_valid mean=1.22 nonzero=0.403
Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 1.65396	train's l1: 1.157	valid's rmse: 2.50577	valid's l1: 2.02415
Early stopping, best iteration is:
[66]	train's rmse: 1.96057	train's l1: 1.39126	valid's rmse: 2.40928	valid's l1: 1.89197

✅ Holdout GW 8→9  MAE=1.892 | RMSE=2.409 | mean(pred)=2.13

Top importances:
                           feature         gain
                   r3_minutes_mean 11911.006323
                 r3_influence_mean  9753.437868
r3_expected_goal_involvements_mean  9178.509554
                r3_creativity_mean  9175.843236
                   delta_influence  8588.947729
               r3_total_points_sum  8040.493716
                  delta_creativity  7601.053276
            r3_expected_goals_mean  6173.411293
                    r3_threat_mean  6052.626688
          r3_expected_assists_mean  4893.367346


In [213]:
# Reload the locked feature order and the rows of the latest fetched gameweek.
feat_list = pd.read_csv(f"{PROC_DIR}/feature_list.csv", header=None)[0].tolist()
pred = pd.read_csv(f"{PROC_DIR}/features_predict_gw{FETCH_UP_TO_GW}.csv")

# Numeric coercion with missing values filled by 0, then score with the
# trained model at its best early-stopping iteration.
X_pred = pred[feat_list].apply(pd.to_numeric, errors="coerce").fillna(0).astype(float)
pred_pts = model.predict(X_pred, num_iteration=model.best_iteration)

pred["expected_points"] = np.clip(pred_pts, 0, None)  # no negatives
pred["target_gw"] = FETCH_UP_TO_GW + 1  # 10

# starters-only view
has_minutes = "r3_minutes_mean" in pred.columns
starters = (pred[pred["r3_minutes_mean"] >= 30] if has_minutes else pred).copy()

# Write both views, each sorted from highest to lowest expected points.
all_csv = f"{PROC_DIR}/predictions_gw{FETCH_UP_TO_GW+1}_regression_all.csv"
starters_csv = f"{PROC_DIR}/predictions_gw{FETCH_UP_TO_GW+1}_regression_starters.csv"
for frame, out_path in ((pred, all_csv), (starters, starters_csv)):
    frame.sort_values("expected_points", ascending=False).to_csv(out_path, index=False)

print("Saved:",
      f"\n  {all_csv}",
      f"\n  {starters_csv}")

print("\nPred stats (all):", pred["expected_points"].describe())
print("Pred stats (starters):", starters["expected_points"].describe())

# Preview: first 15 starters in the table's existing row order (not sorted).
preview_cols = ["player_name","position_name","team_name","expected_points","target_gw"]
starters[preview_cols].head(15)


Saved: 
  ../data/processed/predictions_gw10_regression_all.csv 
  ../data/processed/predictions_gw10_regression_starters.csv

Pred stats (all): count    746.000000
mean       2.176182
std        0.978233
min        0.262757
25%        1.706957
50%        1.706957
75%        2.496304
max        7.024952
Name: expected_points, dtype: float64
Pred stats (starters): count    263.000000
mean       2.927126
std        1.265562
min        0.290043
25%        2.018624
50%        2.867551
75%        3.732963
max        7.024952
Name: expected_points, dtype: float64


Unnamed: 0,player_name,position_name,team_name,expected_points,target_gw
0,Raya,GKP,Arsenal,3.974694,10
4,Gabriel,DEF,Arsenal,4.450677,10
5,Saliba,DEF,Arsenal,2.495377,10
6,Calafiori,DEF,Arsenal,2.086481,10
7,J.Timber,DEF,Arsenal,5.066033,10
15,Saka,MID,Arsenal,4.563934,10
19,Trossard,MID,Arsenal,2.813535,10
20,Rice,MID,Arsenal,3.731109,10
25,Zubimendi,MID,Arsenal,4.194575,10
31,Martinez,GKP,Aston Villa,2.109788,10


In [214]:
# Reload the starters-only predictions from disk.
pred_out = pd.read_csv(f"{PROC_DIR}/predictions_gw{FETCH_UP_TO_GW+1}_regression_starters.csv")

# The header takes its gameweek number from the first row's target_gw.
header_gw = int(pred_out['target_gw'].iloc[0])
overall_cols = ["player_name","position_name","team_name","expected_points","target_gw"]
top20_overall = pred_out.sort_values("expected_points", ascending=False).head(20)

print("\n=== GW {} — Top 20 Overall (Starters only) ===".format(header_gw))
print(top20_overall[overall_cols].to_string(index=False))

def top_by_pos(df, pos, k=10):
    """Return the `k` rows with the highest expected points for one position.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns position_name, player_name, team_name,
        expected_points and target_gw. It is not modified.
    pos : str
        Value compared for equality against the position_name column.
    k : int, default 10
        Maximum number of rows to return.

    Returns
    -------
    pandas.DataFrame
        Columns player_name, team_name, expected_points, target_gw, ordered by
        expected_points descending. Empty when no row matches `pos`.
    """
    is_pos = df["position_name"] == pos
    ranked = df.loc[is_pos].sort_values("expected_points", ascending=False)
    out_cols = ["player_name","team_name","expected_points","target_gw"]
    return ranked.head(k)[out_cols]

# Print a top-10 table for each position.
for pos in ["GKP","DEF","MID","FWD"]:
    t = top_by_pos(pred_out, pos, 10)
    if t.empty:
        # With no rows for this position, t['target_gw'].iloc[0] below would
        # raise IndexError and abort the remaining positions; report and move on.
        print(f"\n=== Top {pos} (Starters only) — no players ===")
        continue
    print(f"\n=== GW {int(t['target_gw'].iloc[0])} — Top {pos} (Starters only) ===")
    print(t.to_string(index=False))



=== GW 10 — Top 20 Overall (Starters only) ===
  player_name position_name      team_name  expected_points  target_gw
       Mateta           FWD Crystal Palace         7.024952         10
      Haaland           FWD       Man City         6.780602         10
      Collins           DEF      Brentford         6.106642         10
      Semenyo           MID    Bournemouth         6.103397         10
       Thiago           FWD      Brentford         5.983706         10
        Gakpo           MID      Liverpool         5.957383         10
   J.Palhinha           MID          Spurs         5.635133         10
   Van de Ven           DEF          Spurs         5.538637         10
       Kamada           MID Crystal Palace         5.500001         10
      Wharton           MID Crystal Palace         5.389962         10
      De Ligt           DEF        Man Utd         5.326436         10
       Mbeumo           MID        Man Utd         5.261337         10
      Mukiele           DEF  