In [None]:
# SANDBOX RUN: safe FE that writes only to sandbox (no overwrite)
import re
import shutil
from datetime import datetime
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Project layout: the master training table is the only input of this cell.
PROJ = Path("/content/drive/MyDrive/player_value_prediction_project")
MASTER_DIR = PROJ / "data" / "processed" / "master"
MASTER_IN = MASTER_DIR / "final_training_master.csv"

# Sandbox outputs (no overwrite of master). The timestamp is embedded in the
# backup and scaler file names so separate runs do not clobber each other.
ts = datetime.now().strftime("%Y%m%d_%H%M%S")
BACKUP_OUT = MASTER_DIR / f"backup_before_fe_{ts}.csv"
SANDBOX_DIR = MASTER_DIR / "sandbox"
SANDBOX_DIR.mkdir(parents=True, exist_ok=True)
MASTER_OUT_SANDBOX = SANDBOX_DIR / "final_training_master_fe.csv"
SCALER_PATH_SANDBOX = SANDBOX_DIR / f"scaler_final_training_master_fe_{ts}.joblib"

print("Reading:", MASTER_IN)
df = pd.read_csv(MASTER_IN, low_memory=False)
print("Loaded shape:", df.shape)

# Report how much of the age column is missing and which median will replace it.
# The fill itself is intentionally NOT performed here: 'age_at_season_start' is
# part of the step-2 numeric feature list, which first records the
# `age_at_season_start_missing` flag and then median-imputes. Filling the column
# at this point would make that flag 0 for every row.
if 'age_at_season_start' in df.columns:
    age_med = df['age_at_season_start'].median()
    missing_age_pct = df['age_at_season_start'].isna().mean() * 100
    print(f"Age missing: {missing_age_pct:.3f}% — imputing with median = {age_med}")

# proceed with the same safe FE pipeline (abbreviated here; full logic preserved)
# 1) season_start_year parsing (already safe)
# First four-digit year in the range 1900-2099 found anywhere in the text.
# Compiled once because the function is applied row by row.
_SEASON_YEAR_RE = re.compile(r'(?:19|20)\d{2}')


def extract_season_start_year(s):
    """Return the starting year of a season label as an int, or NaN.

    Accepted shapes, tried in this order:
      * ``"2012/13"`` or ``"2012-2013"``: the text before the first ``/``
        (or, if there is no ``/``, before the first ``-``) when it is exactly
        four digits;
      * anything else containing a four-digit 19xx/20xx year, e.g. ``2014``,
        ``"1999"`` or ``"Season 2018"``: the first such year.

    Missing values and labels without a recognisable year give ``np.nan``.
    Parsing is best-effort: values that cannot be inspected or converted
    (``TypeError`` / ``ValueError``) also give ``np.nan`` instead of raising.
    """
    try:
        if pd.isna(s):
            return np.nan
        s = str(s)
        # '/' takes precedence over '-' when both occur in the label.
        for sep in ('/', '-'):
            if sep in s:
                part = s.split(sep)[0]
                if part.isdigit() and len(part) == 4:
                    return int(part)
                break
        m = _SEASON_YEAR_RE.search(s)
        if m:
            return int(m.group(0))
    except (TypeError, ValueError):
        return np.nan
    return np.nan

if 'season' in df.columns:
    # Numeric start year of each season label; NaN where no year can be parsed.
    df['season_start_year'] = df['season'].apply(extract_season_start_year)

# 2) Missingness flags + numeric impute (median)
numeric_candidates = [
 'age_at_season_start','appearances_count','total_minutes','minutes_per_appearance','mean_minutes','starts_estimate',
 'goals','assists','goals_per90','assists_per90','yellow_cards','red_cards','distinct_competitions_count',
 'sb_shots','sb_xg','sb_goals','sb_passes','sb_passes_completed',
 'reddit_mean_compound','reddit_num_posts','num_injuries','avg_days_out',
 'transfers_count','transfers_sum_fee','transfers_mean_fee','days_since_last_transfer'
]
# Only the candidates that actually exist in the loaded table are processed.
numeric_feats = [c for c in numeric_candidates if c in df.columns]
for c in numeric_feats:
    values = df[c]
    if pd.api.types.is_numeric_dtype(values):
        # Already numeric: fill gaps with the column median.
        fill_value = values.median(skipna=True)
    else:
        # Non-numeric storage: coerce first so that unparseable entries become
        # NaN *before* the flag is taken; otherwise they would be imputed
        # without being marked as missing. These columns are filled with 0.
        values = pd.to_numeric(values, errors='coerce')
        fill_value = 0
    # The flag must be recorded before the gaps are filled.
    df[f"{c}_missing"] = values.isna().astype(int)
    df[c] = values.fillna(fill_value)

# 3) Categorical safe handling (OHE for position & foot only)
# Values are cast to str before encoding, so a missing value is encoded under
# its string form rather than being dropped.
for cat_col in ('position', 'foot'):
    if cat_col not in df.columns:
        continue
    dummies = pd.get_dummies(df[cat_col].astype(str), prefix=cat_col, dummy_na=False)
    df = pd.concat([df, dummies], axis=1)


def _has_columns(*names):
    """Return True when every named column is present in the working frame."""
    return all(name in df.columns for name in names)


# 4) Interaction features
# Each feature is built only when all of its input columns are available.
if _has_columns('age_at_season_start'):
    df['age_sq'] = df['age_at_season_start'] ** 2
if _has_columns('age_at_season_start', 'minutes_per_appearance'):
    df['age_x_mins'] = df['age_at_season_start'] * df['minutes_per_appearance']
if _has_columns('goals', 'total_minutes'):
    # Zero minutes would divide by zero: treat them as NaN, then report 0.0.
    playable_minutes = df['total_minutes'].replace({0:np.nan})
    df['goals_per_min'] = df['goals'].div(playable_minutes)
    df['goals_per_min'] = df['goals_per_min'].fillna(0.0)
if _has_columns('num_injuries', 'transfers_count'):
    df['injuries_x_transfers'] = df['num_injuries'] * df['transfers_count']
if _has_columns('reddit_mean_compound', 'goals_per90'):
    df['sent_x_goals90'] = df['reddit_mean_compound'] * df['goals_per90']

# 5) Lag features (keep NaN + missing flags)
# For every available base column this adds, per player:
#   <col>_lag1          value from the player's previous row in sort order
#   <col>_lag1_missing  1 when there is no previous row / value, else 0
#   <col>_lag1_diff     current value minus the lagged value
# NOTE(review): `target_log1p_lag1_diff` is computed from the current
# `target_log1p`, so it carries the target itself; confirm it is excluded
# from the model's feature set.
lag_cols = ['target_log1p','goals','assists','total_minutes','appearances_count']
lag_exists = [c for c in lag_cols if c in df.columns]

# Order rows chronologically within each player; fall back to the raw season
# label when no parsed start year is available.
if 'season_start_year' in df.columns:
    sort_key = ['player_id', 'season_start_year']
else:
    sort_key = ['player_id', 'season']
df = df.sort_values(sort_key)

for c in lag_exists:
    lag_name = f"{c}_lag1"
    previous = df.groupby('player_id')[c].shift(1)
    df[lag_name] = previous
    df[f"{lag_name}_missing"] = previous.isna().astype(int)
    df[f"{lag_name}_diff"] = df[c] - previous

# 6) Scaling selected numeric features & save scaler to sandbox
# The unscaled columns are kept; standardised copies are appended with a
# "_scaled" suffix and the fitted scaler is persisted next to the output.
engineered_numeric = [
    'age_at_season_start','age_sq','age_x_mins','minutes_per_appearance','total_minutes','appearances_count','mean_minutes','goals','assists',
    'goals_per90','assists_per90','goals_per_min','sb_xg','sb_shots','sb_passes',
    'reddit_mean_compound','num_injuries','transfers_sum_fee','days_since_last_transfer'
]
scale_cols = [c for c in engineered_numeric if c in df.columns]
if scale_cols:
    to_scale = df[scale_cols]
    # Per-column median as the fill value; a column with no median falls back to 0.0.
    col_medians = to_scale.median().fillna(0.0)
    df_scale = to_scale.fillna(col_medians)

    scaler = StandardScaler()
    df_scaled_arr = scaler.fit_transform(df_scale)
    df_scaled = pd.DataFrame(df_scaled_arr, columns=scale_cols, index=df.index).add_suffix("_scaled")
    df = pd.concat([df, df_scaled], axis=1)

    joblib.dump(scaler, SCALER_PATH_SANDBOX)
    print("Saved scaler to sandbox:", SCALER_PATH_SANDBOX)

# 7) Save sandbox output (no overwrite of master)
df.to_csv(MASTER_OUT_SANDBOX, index=False)
print("Wrote sandbox FE file to:", MASTER_OUT_SANDBOX)
print("Sandbox shape:", df.shape)

# Non-destructive backup of the original master. The file is copied as-is
# (no parse / re-serialise round trip), so the backup is identical to the
# input, and success is reported only after the copy has finished.
shutil.copyfile(MASTER_IN, BACKUP_OUT)
print("Backup of master created at:", BACKUP_OUT)


Reading: /content/drive/MyDrive/player_value_prediction_project/data/processed/master/final_training_master.csv
Loaded shape: (87223, 109)
Age missing: 0.066% — imputing with median = 25.0
Saved scaler to sandbox: /content/drive/MyDrive/player_value_prediction_project/data/processed/master/sandbox/scaler_final_training_master_fe_20250906_154823.joblib
Wrote sandbox FE file to: /content/drive/MyDrive/player_value_prediction_project/data/processed/master/sandbox/final_training_master_fe.csv
Sandbox shape: (87223, 185)
Backup of master created at: /content/drive/MyDrive/player_value_prediction_project/data/processed/master/backup_before_fe_20250906_154823.csv


In [None]:
import pandas as pd
from pathlib import Path

# Sanity checks on the feature-engineered sandbox file written by the FE cell.
SANDBOX = Path("/content/drive/MyDrive/player_value_prediction_project/data/processed/master/sandbox")
fe = pd.read_csv(SANDBOX / "final_training_master_fe.csv", low_memory=False)

# 1) Basic info
print("shape:", fe.shape)
print("columns count:", len(fe.columns))

# 2) Top rows & dtypes
display(fe.head(5))
print(fe.dtypes.value_counts())

# 3) Confirm season ordering & example lags for a sample player
# choose a player_id present in your preview, e.g. 10
pid = 10
sample = fe[fe['player_id']==pid].sort_values('season_start_year')
# Only show the columns that exist: several of them are created conditionally.
lag_preview_cols = ['player_id','season','season_start_year','target_log1p','target_log1p_lag1','target_log1p_lag1_missing','target_log1p_lag1_diff']
display(sample[[c for c in lag_preview_cols if c in sample.columns]].head(10))

# 4) Check % missing for newly created lag flags and key engineered cols
cols_to_check = ['target_log1p_lag1_missing','goals_lag1_missing','goals_per_min','age_sq']
for c in cols_to_check:
    if c in fe.columns:
        # A `*_missing` column is its own flag; for any other column look for
        # the companion `<col>_missing` flag. The share of rows flagged is the
        # mean of the 0/1 flag.
        flag_col = c if c.endswith('_missing') else f"{c}_missing"
        if flag_col in fe.columns:
            flag_pct = fe[flag_col].mean() * 100
        else:
            flag_pct = "n/a"
        print(c, "null%:", fe[c].isna().mean()*100, " / missing_flag% (if exists):", flag_pct)
# 5) Quick distribution checks (optional)
dist_cols = [c for c in ['target_log1p','target_log1p_lag1'] if c in fe.columns]
if dist_cols:
    display(fe[dist_cols].describe().T)


shape: (87223, 185)
columns count: 185


Unnamed: 0,player_id,season,name,age_at_season_start,position,sub_position,current_club_id,current_club_domestic_competition_id,played_any,appearances_count,...,goals_per90_scaled,assists_per90_scaled,goals_per_min_scaled,sb_xg_scaled,sb_shots_scaled,sb_passes_scaled,reddit_mean_compound_scaled,num_injuries_scaled,transfers_sum_fee_scaled,days_since_last_transfer_scaled
0,10,2012,Miroslav Klose,34.0,Attack,Centre-Forward,398.0,IT1,1,36,...,0.768854,0.02028,0.768854,-0.032374,-0.040526,-0.049753,-0.018729,-0.059164,-0.146907,-0.361545
1,10,2013,Miroslav Klose,35.0,Attack,Centre-Forward,398.0,IT1,1,29,...,0.35662,0.220335,0.35662,-0.032374,-0.040526,-0.049753,-0.018729,-0.059164,-0.146907,-0.361545
2,10,2014,Miroslav Klose,36.0,Attack,Centre-Forward,398.0,IT1,1,40,...,0.896447,0.528119,0.896447,-0.032374,-0.040526,-0.049753,-0.018729,-0.059164,-0.146907,-0.361545
3,10,2015,Miroslav Klose,37.0,Attack,Centre-Forward,398.0,IT1,1,31,...,0.52621,0.662916,0.52621,-0.032374,-0.040526,-0.049753,-0.018729,-0.059164,-0.146907,-0.361545
4,26,2012,Roman Weidenfeller,32.0,Goalkeeper,Goalkeeper,16.0,L1,1,49,...,-0.217841,-0.192388,-0.217841,-0.032374,-0.040526,-0.049753,-0.018729,-0.059164,-0.146907,-0.361545


float64    109
int64       47
object      16
bool        13
Name: count, dtype: int64


Unnamed: 0,player_id,season,season_start_year,target_log1p,target_log1p_lag1,target_log1p_lag1_missing,target_log1p_lag1_diff
0,10,2012,2012,14.508658,,1,
1,10,2013,2013,13.815512,14.508658,0,-0.693147
2,10,2014,2014,13.815512,13.815512,0,0.0
3,10,2015,2015,13.815512,13.815512,0,0.0


target_log1p_lag1_missing null%: 0.0  / missing_flag% (if exists): 0
goals_lag1_missing null%: 0.0  / missing_flag% (if exists): 0
goals_per_min null%: 0.0  / missing_flag% (if exists): 0
age_sq null%: 0.0  / missing_flag% (if exists): 0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
target_log1p,87223.0,13.55428,2.976131,0.0,12.866953,13.815512,15.068274,19.113828
target_log1p_lag1,61534.0,13.946493,2.726663,0.0,13.122365,14.220976,15.319588,19.113828
