In [None]:
# Run the companion notebook fpl_analysis.ipynb before anything else.
# NOTE(review): the cells below use names that are never defined in this
# notebook (N_VAL_GWS, RANDOM_STATE, RandomForestRegressor,
# mean_absolute_error, r2_score) — presumably they are provided by this run;
# confirm, and consider importing/defining them explicitly here.
%run fpl_analysis.ipynb

In [None]:
import pandas as pd

# Read the master table from disk, then report its dimensions and column index.
master = pd.read_csv('data/master_data.csv')
print(master.shape, master.columns, sep="\n")

In [None]:
# Column the model is trained to predict.
target_col = "total_points"

# Goalkeeper feature set.
opt_gk_feats = [
    "form_custom", "clean_sheets", "influence",
    "ict_index", "minutes", "pen_saved",
    "saves", "starts", "goals_conceded",
]

# Alias of opt_gk_feats (the same list object, not a copy).
numeric_features = opt_gk_feats

# Keep only the rows labelled 'GK'; copy so later steps never touch `master`.
model_df = master.loc[master['position_label'] == 'GK'].copy()
display(model_df.head(90))

# Order rows by player and gameweek, then discard rows without a target value.
model_df = (
    model_df
    .sort_values(["player_id", "gameweek"])
    .dropna(subset=[target_col])
)

cat_features = ["position_label"]

# Identifier columns (plus the target) are kept only when the table has them.
core_cols = [
    c
    for c in ["player_id", "player_name", "team_short_name", "gameweek", target_col]
    if c in model_df.columns
]

# Identifiers, then the numeric features that exist in the table, then the
# categorical columns; dict.fromkeys removes duplicates while keeping order.
cols_needed = list(
    dict.fromkeys(
        [
            *core_cols,
            *(c for c in numeric_features if c in model_df.columns),
            *cat_features,
        ]
    )
)
model_df = model_df[cols_needed].copy()

print("model_df shape:", model_df.shape)
display(model_df.head())

# Sorted distinct gameweeks present in the modelling table.
all_gws = sorted(model_df["gameweek"].unique())

# Validation takes the last N_VAL_GWS gameweeks; when there are not more than
# N_VAL_GWS gameweeks in total, it takes the second half of them instead.
val_gws = (
    all_gws[int(len(all_gws) / 2):]
    if len(all_gws) <= N_VAL_GWS
    else all_gws[-N_VAL_GWS:]
)
# Every gameweek that is not used for validation is used for training.
train_gws = [gw for gw in all_gws if gw not in val_gws]

# Row-level boolean masks for the two splits.
train_mask = model_df["gameweek"].isin(train_gws)
val_mask = model_df["gameweek"].isin(val_gws)

print("Train GWs:", train_gws)
print("Val GWs:", val_gws)

# Select only the numeric features that are actually columns of model_df.
# model_df was built from the feature columns that exist in the source table,
# so indexing it with the unfiltered `numeric_features` list raises a KeyError
# as soon as one feature is absent. Skipped features are reported, not hidden.
numeric_cols_present = [c for c in numeric_features if c in model_df.columns]
numeric_cols_missing = [c for c in numeric_features if c not in model_df.columns]
if numeric_cols_missing:
    print("Warning: numeric features missing from model_df, skipped:", numeric_cols_missing)

# Training matrix: numeric features (NaN -> 0.0) plus one-hot encoded categoricals.
X_num_train = model_df.loc[train_mask, numeric_cols_present].fillna(0.0)
X_cat_train = pd.get_dummies(model_df.loc[train_mask, cat_features], drop_first=False)
X_train = pd.concat([X_num_train, X_cat_train], axis=1)
y_train = model_df.loc[train_mask, target_col].values

# Validation matrix: built the same way, then the dummy columns are aligned to
# the training dummies so both matrices share an identical column layout
# (categories unseen in validation are filled with 0).
X_num_val = model_df.loc[val_mask, numeric_cols_present].fillna(0.0)
X_cat_val = pd.get_dummies(model_df.loc[val_mask, cat_features], drop_first=False)
X_cat_val = X_cat_val.reindex(columns=X_cat_train.columns, fill_value=0)
X_val = pd.concat([X_num_val, X_cat_val], axis=1)
y_val = model_df.loc[val_mask, target_col].values

print("Train X shape:", X_train.shape, "Val X shape:", X_val.shape)


In [None]:
# Random forest on the full feature matrix; fit() returns the estimator itself,
# so construction and fitting are chained.
rf_full = RandomForestRegressor(
    n_estimators=300, random_state=RANDOM_STATE, n_jobs=-1
).fit(X_train, y_train)
y_val_pred_rf = rf_full.predict(X_val)

# Baseline prediction: each validation row's own form_custom value, with
# missing values replaced by 0.0.
baseline_val_pred = model_df.loc[val_mask, "form_custom"].fillna(0.0).values

def print_metrics(name, y_true, y_pred):
    """Print the MAE and R² of one set of predictions on a single line.

    Args:
        name: Label printed at the start of the line (padded to 20 characters).
        y_true: Ground-truth target values.
        y_pred: Predicted values, aligned with ``y_true``.

    Returns:
        None. The metrics are only written to stdout.
    """
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    # The label used to contain a mis-encoded character sequence in place of
    # the superscript two; it now prints a proper "R²".
    print(f"{name:20s} MAE={mae:6.3f} | R²={r2:6.3f}")

# Compare the baseline with the random forest on the validation split.
print("Validation performance (target = total_points):")
print_metrics(name="Baseline (form)", y_true=y_val, y_pred=baseline_val_pred)
print_metrics(name="RF full", y_true=y_val, y_pred=y_val_pred_rf)

# Pair every training column with its importance in the fitted forest,
# then rank from most to least important.
fi = pd.DataFrame(
    {"feature": X_train.columns, "importance": rf_full.feature_importances_}
)
fi = fi.sort_values(by="importance", ascending=False)

print("\nTop RF feature importances:")
display(fi.head(20))
