In [85]:
import sys
import os
import pandas as pd
sys.path.append(os.path.abspath("../src"))

from feature_engineering import prepare_training_data

# Load the data
df = pd.read_csv("../data/player_stats.csv")

# 10 target stats
target_columns = ['PTS', 'AST', 'REB', 'ST'
'L', 'BLK', 'TOV', 'FG_PCT', 'FT_PCT', 'FG3_PCT', 'FG3M']

#prepare dataset
model_df = prepare_training_data(df, target_columns)

model_df.head()

Unnamed: 0,PLAYER_ID,PLAYER_NAME,NICKNAME,TEAM_ID,TEAM_ABBREVIATION,AGE,GP,W,L,W_PCT,...,NEXT_PTS,NEXT_AST,NEXT_REB,NEXT_STL,NEXT_BLK,NEXT_TOV,NEXT_FG_PCT,NEXT_FT_PCT,NEXT_FG3_PCT,NEXT_FG3M
0,1713,Vince Carter,Vince,1610612758,SAC,41.0,58,20,38,0.345,...,5.4,1.2,2.6,0.7,0.4,0.6,0.403,0.757,0.345,1.0
1,1713,Vince Carter,Vince,1610612737,ATL,42.0,76,25,51,0.329,...,7.4,1.1,2.6,0.6,0.4,0.6,0.419,0.712,0.389,1.6
2,1713,Vince Carter,Vince,1610612737,ATL,43.0,60,19,41,0.317,...,5.0,0.8,2.1,0.4,0.4,0.6,0.352,0.793,0.302,1.0
3,1717,Dirk Nowitzki,Dirk,1610612742,DAL,40.0,77,24,53,0.312,...,12.0,1.6,5.7,0.6,0.6,0.7,0.456,0.898,0.409,1.8
4,1717,Dirk Nowitzki,Dirk,1610612742,DAL,41.0,51,17,34,0.333,...,7.3,0.7,3.1,0.2,0.4,0.4,0.359,0.78,0.312,1.3


In [86]:
# drop any rows where NEXT stat columns are missing
model_df_clean = model_df.dropna(subset=[f'NEXT_{col}' for col in target_columns])

# check how many rows are left
print("Rows before cleaning:", len(model_df))
print("Rows after cleaning:", len(model_df_clean))

model_df_clean = model_df_clean.drop_duplicates(subset=['PLAYER_ID', 'SEASON'])

#feature engineering: smarter nba-aware features
# 1. age squared
model_df_clean['AGE_SQUARED'] = model_df_clean['AGE'] ** 2

#2. Stats per minute (only us if MIN > 0 to avoid dividing by 0)
minutes = model_df_clean['MIN'] if 'MIN' in model_df_clean.columns else model_df_clean['GP'] * model_df_clean['MINUTES']
for stat in ['PTS', 'AST', 'REB', 'STL', 'BLK', 'TOV']:
    model_df_clean[f'{stat}_PER_MIN'] = model_df_clean[stat] / minutes.clip(lower=1)

#3 Season as a numeric feature (2018, 2019, etc)
model_df_clean['SEASON_NUM'] = model_df_clean['SEASON'].str[:4].astype(int)

#4 aged-based growth signal, young players improve more
model_df_clean['AGE_IMPROVEMENT_SIGNAL'] = (30 - model_df_clean['AGE']).clip(lower=0)

#5 Usage rate Proxy 
model_df_clean['USG_PROXY'] = (
    model_df_clean[['PTS', 'AST', 'TOV']].sum(axis=1) / 
    model_df_clean['GP'].clip(lower=1)
)

#5 Teammate influence: average team usage / assists / rebounds / scoring
team_avg_stats = model_df_clean.groupby(['TEAM_ABBREVIATION', 'SEASON_NUM']).agg({
    'USG_PROXY': 'mean',
    'AST': 'mean',
    'REB': 'mean',
    'PTS': 'mean'
}).rename(columns={
    'USG_PROXY': 'TEAM_AVG_USG',
    'AST': 'TEAM_AVG_AST',
    'REB': 'TEAM_AVG_REB',
    'PTS': 'TEAM_AVG_PTS'
}).reset_index()

model_df_clean = model_df_clean.merge(team_avg_stats, on=['TEAM_ABBREVIATION', 'SEASON_NUM'], how='left')

#define X features and y targets

leakage_cols = ['PTS', 'PTS_RANK', 'NBA_FANTASY_PTS', 'NBA_FANTASY_PTS_RANK']

X = model_df_clean.drop(columns=[
    'PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'SEASON'
] + leakage_cols + list(f'NEXT_{col}' for col in target_columns))

y = model_df_clean[[f'NEXT_{col}' for col in target_columns]]

X.head(), y.head()

Rows before cleaning: 3855
Rows after cleaning: 3855


(    AGE  GP   W   L  W_PCT   MIN  FGM  FGA  FG_PCT  FG3M  ...  STL_PER_MIN  \
 0  41.0  58  20  38  0.345  17.7  2.0  4.9   0.403   1.0  ...     0.039548   
 1  42.0  76  25  51  0.329  17.5  2.6  6.2   0.419   1.6  ...     0.034286   
 2  43.0  60  19  41  0.317  14.6  1.8  5.1   0.352   1.0  ...     0.027397   
 3  40.0  77  24  53  0.312  24.7  4.5  9.8   0.456   1.8  ...     0.024291   
 4  41.0  51  17  34  0.333  15.6  2.6  7.4   0.359   1.3  ...     0.012821   
 
    BLK_PER_MIN  TOV_PER_MIN  SEASON_NUM  AGE_IMPROVEMENT_SIGNAL  USG_PROXY  \
 0     0.022599     0.033898        2017                     0.0   0.124138   
 1     0.022857     0.034286        2018                     0.0   0.119737   
 2     0.027397     0.041096        2019                     0.0   0.106667   
 3     0.024291     0.028340        2017                     0.0   0.185714   
 4     0.025641     0.025641        2018                     0.0   0.164706   
 
    TEAM_AVG_USG  TEAM_AVG_AST  TEAM_AVG_REB  TE

In [87]:
import numpy as np

# Make a copy to avoid modifying the original directly
fe_df = model_df_clean.copy()

# Add age squared to model peak age behavior
fe_df["AGE_SQUARED"] = fe_df["AGE"] ** 2

# Add per-minute stats for major box score categories
for stat in ['PTS', 'AST', 'REB', 'STL', 'BLK', 'TOV']:
    fe_df[f"{stat}_PER_MIN"] = fe_df[stat] / fe_df['GP'].replace(0, np.nan)

# Turn SEASON like "2018-19" → numeric value (e.g. 2018)
fe_df["SEASON_NUM"] = fe_df["SEASON"].apply(lambda x: int(x.split("-")[0]))

# Check a sample
fe_df[["AGE", "AGE_SQUARED", "PTS_PER_MIN", "SEASON", "SEASON_NUM"]].head()


Unnamed: 0,AGE,AGE_SQUARED,PTS_PER_MIN,SEASON,SEASON_NUM
0,41.0,1681.0,0.093103,2017-18,2017
1,42.0,1764.0,0.097368,2018-19,2018
2,43.0,1849.0,0.083333,2019-20,2019
3,40.0,1600.0,0.155844,2017-18,2017
4,41.0,1681.0,0.143137,2018-19,2018


In [88]:
# Columns to exclude (identifiers, targets, ranks, fantasy)
excluded_cols = [
    'PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID',
    'TEAM_ABBREVIATION', 'SEASON'
] + list(f'NEXT_{col}' for col in target_columns) + [
    col for col in fe_df.columns if 'RANK' in col or 'FANTASY' in col
]

# Final X and y
X = fe_df.drop(columns=excluded_cols)
y = fe_df[[f'NEXT_{col}' for col in target_columns]]

X.head()


Unnamed: 0,AGE,GP,W,L,W_PCT,MIN,FGM,FGA,FG_PCT,FG3M,...,STL_PER_MIN,BLK_PER_MIN,TOV_PER_MIN,SEASON_NUM,AGE_IMPROVEMENT_SIGNAL,USG_PROXY,TEAM_AVG_USG,TEAM_AVG_AST,TEAM_AVG_REB,TEAM_AVG_PTS
0,41.0,58,20,38,0.345,17.7,2.0,4.9,0.403,1.0,...,0.012069,0.006897,0.010345,2017,0.0,0.124138,0.286319,1.69375,3.73125,8.0
1,42.0,76,25,51,0.329,17.5,2.6,6.2,0.419,1.6,...,0.007895,0.005263,0.007895,2018,0.0,0.119737,0.424882,1.688889,3.7,8.133333
2,43.0,60,19,41,0.317,14.6,1.8,5.1,0.352,1.0,...,0.006667,0.006667,0.01,2019,0.0,0.106667,0.308767,1.905556,4.15,8.972222
3,40.0,77,24,53,0.312,24.7,4.5,9.8,0.456,1.8,...,0.007792,0.007792,0.009091,2017,0.0,0.185714,0.502996,1.47619,3.195238,7.161905
4,41.0,51,17,34,0.333,15.6,2.6,7.4,0.359,1.3,...,0.003922,0.007843,0.007843,2018,0.0,0.164706,0.271348,1.925,3.025,8.2875


In [89]:
selected_features = [
    'AGE', 'AGE_SQUARED', 'MIN', 'GP', 'W_PCT', 'PLUS_MINUS',
    'FG_PCT', 'FT_PCT', 'FG3_PCT', 'SEASON_NUM'
]

X = fe_df[selected_features]



In [92]:
df_2023 = fetch_multiple_seasons(2023, 2023)

df_2023_fe = prepare_training_data(df_2023, target_columns)

df_2023_fe["AGE_SQUARED"] = df_2023_fe["AGE"] ** 2
df_2023_fe["SEASON_NUM"] = df_2023_fe["SEASON"].apply(lambda x: int(x.split("-")[0]))

players_to_predict = [
    "Victor Wembanyama", "LeBron James", "Nikola Jokic", 
    "Tyrese Haliburton", "Shai Gilgeous-Alexander"
]

player_df = df_2023_fe[df_2023_fe["PLAYER_NAME"].isin(players_to_predict)].copy()

X_predict = player_df[selected_features]

predictions = {}

for stat in target_columns:
    target = f"NEXT_{stat}"
    model = models[target]  # your trained RandomForestRegressor
    preds = model.predict(X_predict)
    predictions[target] = preds

import pandas as pd

results_df = pd.DataFrame(predictions)
results_df.insert(0, "PLAYER_NAME", player_df["PLAYER_NAME"].values)
results_df


Fetching season 2023-24...


Unnamed: 0,PLAYER_NAME,NEXT_PTS,NEXT_AST,NEXT_REB,NEXT_STL,NEXT_BLK,NEXT_TOV,NEXT_FG_PCT,NEXT_FT_PCT,NEXT_FG3_PCT,NEXT_FG3M
0,LeBron James,23.846,6.014,7.811,1.18,1.227,2.769,0.53999,0.75,0.41002,1.799
1,Shai Gilgeous-Alexander,28.187,5.907,5.963,1.66,0.991,2.385,0.5351,0.874,0.353,1.28
2,Tyrese Haliburton,19.552,8.781,4.215,1.162,0.67,2.218,0.477,0.85502,0.364,2.596
3,Victor Wembanyama,18.913,3.86,8.836,1.089,2.57,3.081,0.465,0.79605,0.325,1.579


In [93]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

from data_loader import fetch_multiple_seasons

# create updated features and labels
df = fetch_multiple_seasons(2017, 2023)
df_fe = prepare_training_data(df, target_columns)

selected_features = [col for col in df_fe.columns if col not in ['PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'SEASON'] + [f"NEXT_{col}" for col in target_columns]]
X = df_fe[selected_features]

models = {}

#train a model for each stat
for stat in target_columns:
    y = df_fe[f"NEXT_{stat}"]

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
    
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)

    print(f"{stat}: MSE={mean_squared_error(y_test, y_pred):.4f}, R2={r2_score(y_test, y_pred):.4f}")

    models[f"NEXT_{stat}"] = model

Fetching season 2017-18...
Fetching season 2018-19...
Fetching season 2019-20...
Fetching season 2020-21...
Fetching season 2021-22...
Fetching season 2022-23...
Fetching season 2023-24...
PTS: MSE=0.0014, R2=1.0000
AST: MSE=0.0005, R2=0.9999
REB: MSE=0.0007, R2=0.9999
STL: MSE=0.0000, R2=1.0000
BLK: MSE=0.0000, R2=0.9997
TOV: MSE=0.0002, R2=0.9997
FG_PCT: MSE=0.0000, R2=0.9999
FT_PCT: MSE=0.0000, R2=1.0000
FG3_PCT: MSE=0.0000, R2=1.0000
FG3M: MSE=0.0001, R2=0.9999


In [99]:
df_train = model_df_clean.copy()

df_future = model_df_clean[model_df_clean['SEASON_NUM'] == 2023].copy()

import unicodedata

def remove_accents(text):
    return ''.join(
        c for c in unicodedata.normalize('NFKD', text) if not unicodedata.combining(c)
    )
df_future['PLAYER_NAME_CLEAN'] = df_future['PLAYER_NAME'].apply(remove_accents)

for stat in target_columns:
    df_future.drop(columns=[f'NEXT_{stat}'], inplace=True)

# Use the same features you trained on
X_train = df_train.drop(columns=[
    'PLAYER_ID', 'PLAYER_NAME', 'NICKNAME', 'TEAM_ID', 'TEAM_ABBREVIATION', 'SEASON',
    'PTS', 'PTS_RANK', 'NBA_FANTASY_PTS', 'NBA_FANTASY_PTS_RANK'
] + [f'NEXT_{col}' for col in target_columns])

X_future = df_future[X_train.columns]

from sklearn.ensemble import RandomForestRegressor

models = {}
for stat in target_columns:
    y_train = df_train[f'NEXT_{stat}']

    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    models[f'NEXT_{stat}'] = model

    # Predict 2024–25 stats
    df_future[f'PRED_{stat}'] = model.predict(X_future)

df_future[['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE'] + [f'PRED_{stat}' for stat in target_columns]].head(10)




Unnamed: 0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,PRED_PTS,PRED_AST,PRED_REB,PRED_STL,PRED_BLK,PRED_TOV,PRED_FG_PCT,PRED_FT_PCT,PRED_FG3_PCT,PRED_FG3M
29,LeBron James,LAL,39.0,25.753,8.309,7.3,1.3,0.5,3.501,0.53996,0.75001,0.41,2.1
94,Chris Paul,GSW,39.0,9.296,6.794,3.9,1.2,0.1,1.3,0.441,0.827,0.371,1.3
153,Kyle Lowry,PHI,38.0,8.086,4.2,3.2,1.0,0.4,1.4,0.432,0.84,0.392,1.6
160,P.J. Tucker,LAC,39.0,1.757,0.5,2.7,0.5,0.2,0.3,0.36003,1.0,0.371,0.4
174,Kevin Durant,PHX,35.0,27.072,5.0,6.6,0.9,1.2,3.3,0.52302,0.85601,0.413,2.2
181,Al Horford,BOS,38.0,8.859,2.6,6.4,0.6,1.0,0.7,0.51109,0.867,0.41905,1.7
188,Mike Conley,MIN,36.0,11.335,5.9,2.9,1.2,0.2,1.3,0.457,0.91097,0.44165,2.4
195,Jeff Green,HOU,37.0,6.522,0.9,2.3,0.2,0.4,0.6,0.456,0.819,0.331,0.7
209,Thaddeus Young,PHX,36.0,4.253,1.7,3.1,0.7,0.2,0.5,0.60127,0.39978,0.143,0.0
246,Derrick Rose,MEM,35.0,8.123,3.3,1.9,0.3,0.1,1.3,0.461,0.889,0.366,0.6


In [100]:
# Filter predictions for selected players
players = [
    "Nikola Jokic",
    "Shai Gilgeous-Alexander",
    "Giannis Antetokounmpo",
    "Luka Doncic",
    "Stephen Curry",
    "LeBron James",
    "Victor Wembanyama"
]

# Define columns to show
predicted_stats = [f'PRED_{stat}' for stat in target_columns]

# Display
df_future[df_future['PLAYER_NAME_CLEAN'].isin(players)][
    ['PLAYER_NAME', 'TEAM_ABBREVIATION', 'AGE'] + predicted_stats
]


Unnamed: 0,PLAYER_NAME,TEAM_ABBREVIATION,AGE,PRED_PTS,PRED_AST,PRED_REB,PRED_STL,PRED_BLK,PRED_TOV,PRED_FG_PCT,PRED_FT_PCT,PRED_FG3_PCT,PRED_FG3M
29,LeBron James,LAL,39.0,25.753,8.309,7.3,1.3,0.5,3.501,0.53996,0.75001,0.41,2.1
377,Stephen Curry,GSW,36.0,26.135,5.1,4.5,0.7,0.4,2.8,0.45,0.92304,0.408,4.802
1041,Giannis Antetokounmpo,MIL,29.0,30.398,6.493,11.526,1.2,1.1,3.4,0.61101,0.65694,0.27396,0.5
1304,Nikola Jokić,DEN,29.0,26.35,8.953,12.402,1.4,0.9,3.0,0.58299,0.81699,0.359,1.1
2417,Shai Gilgeous-Alexander,OKC,25.0,30.133,6.2,5.5,2.0,0.9,2.2,0.53508,0.87396,0.353,1.3
2600,Luka Dončić,DAL,25.0,33.343,9.818,9.197,1.4,0.5,4.007,0.487,0.786,0.382,4.146
3794,Victor Wembanyama,SAS,20.0,21.818,3.9,10.599,1.2,3.361,3.704,0.465,0.796,0.325,1.8
