## Prepare Data 

### If only using fantasy points as target, remove other rookie stats from df

In [26]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [78]:
# df = pd.read_csv("df_master_2014_2024.csv")
df = pd.read_csv("df_master.csv")

In [97]:
columns_to_keep = [col for col in df.columns if not col.startswith('R_') or col == 'R_fantasy_points_halfppr_tep']
print(columns_to_keep)
df_filtered = df[columns_to_keep]

columns_to_remove = ['player_name', 'player_id_x', 'player_id_y']
df_filtered.drop(columns=columns_to_remove, inplace=True) 

['player_name', 'position', 'team', 'draft_year', 'draft_round', 'draft_pick_overall', 'age_on_draft_day', 'player_id_x', 'R_fantasy_points_halfppr_tep', 'C_season', 'player_id_y', 'C_team', 'C_conference', 'C_passing_TD', 'C_passing_YDS', 'C_passing_INT', 'C_rushing_TD', 'C_rushing_YDS', 'C_receiving_REC', 'C_receiving_TD', 'C_receiving_YDS', 'C_fumbles_LOST', 'C_passing_ATT', 'C_passing_COMPLETIONS', 'C_passing_PCT', 'C_passing_YPA', 'C_rushing_CAR', 'C_rushing_YPC', 'C_rushing_LONG', 'C_receiving_YPR', 'C_receiving_LONG', 'C_fumbles_FUM', 'C_conference_strength']


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered.drop(columns=columns_to_remove, inplace=True)


In [98]:
print(df_filtered.columns)

Index(['position', 'team', 'draft_year', 'draft_round', 'draft_pick_overall',
       'age_on_draft_day', 'R_fantasy_points_halfppr_tep', 'C_season',
       'C_team', 'C_conference', 'C_passing_TD', 'C_passing_YDS',
       'C_passing_INT', 'C_rushing_TD', 'C_rushing_YDS', 'C_receiving_REC',
       'C_receiving_TD', 'C_receiving_YDS', 'C_fumbles_LOST', 'C_passing_ATT',
       'C_passing_COMPLETIONS', 'C_passing_PCT', 'C_passing_YPA',
       'C_rushing_CAR', 'C_rushing_YPC', 'C_rushing_LONG', 'C_receiving_YPR',
       'C_receiving_LONG', 'C_fumbles_FUM', 'C_conference_strength'],
      dtype='object')


In [99]:
target = 'R_fantasy_points_halfppr_tep'  # column to predict
X = df_filtered.drop(columns=[target])
y = df_filtered[target]

# Identify categorical columns
categorical_cols = ['position', 'team', 'C_conference', 'C_team'] 
numeric_cols = [col for col in X.columns if col not in categorical_cols]

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [100]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_cols),
        ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
    ]
)

# Random Forest

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.pipeline import Pipeline

In [101]:
model_RF = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=200, random_state=42))
])

# Train the model
model_RF.fit(X_train, y_train)

# Predict
y_pred = model_RF.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MSE: {mse:.2f}, R2: {r2:.2f}')


MSE: 2468.86, R2: 0.29




This model shows poor performance, likely due to the small dataset (n=228). The MSE indicates that the fantasy point estimations are off by the square root of 2468.86 (~ 50 points). And the R squared value indicates that the model onyl explains ~30% of the variance in the data. 

Edit: even after increasing sample to 569, the results did not improve... they worsened :/ 
MSE: 3867.75, R2: 0.25


# XGBoost

In [24]:
from xgboost import XGBRegressor

In [104]:
# XGBoost regressor
xgb_model = XGBRegressor(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

# Full pipeline
pipeline_xgb = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', xgb_model)
])

# Train the model
pipeline_xgb.fit(X_train, y_train)

0,1,2
,steps,"[('preprocessor', ...), ('regressor', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,categories,'auto'
,drop,'first'
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False


In [105]:
# Predict
y_pred = pipeline_xgb.predict(X_test)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse:.2f}, R2: {r2:.2f}")

# Optional: cross-validation
cv_scores = cross_val_score(pipeline_xgb, X, y, cv=5, scoring='r2')
print(f"5-fold CV R2: {cv_scores.mean():.2f} ± {cv_scores.std():.2f}")



MSE: 2909.64, R2: 0.16




5-fold CV R2: 0.05 ± 0.45




# CatBoost

In [85]:
from catboost import CatBoostRegressor, Pool

# Convert data into CatBoost Pool (efficient handling of categoricals)
train_pool = Pool(X_train, y_train, cat_features=categorical_cols)
test_pool = Pool(X_test, y_test, cat_features=categorical_cols)

# CatBoost model
cat_model = CatBoostRegressor(
    iterations=500,
    learning_rate=0.05,
    depth=6,
    random_seed=42,
    verbose=0
)

cat_model.fit(train_pool)
y_pred = cat_model.predict(test_pool)

# Evaluate
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"CatBoost -> MSE: {mse:.2f}, R2: {r2:.2f}")


CatBoost -> MSE: 2616.79, R2: 0.24


# Linear Regression (Ridge)

In [106]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

results = {}

for pos in X['position'].unique():
    # Filter dataset for just this position
    mask = X['position'] == pos
    X_pos = X[mask].drop(columns=['position'])  # drop 'position'
    y_pos = y[mask]

    # Separate categorical and numeric for this subset
    categorical_cols = ['team', 'C_conference', 'C_team']
    numeric_cols = [col for col in X_pos.columns if col not in categorical_cols]

    # Preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numeric_cols),
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
        ]
    )

    # Pipeline
    model = Pipeline([
        ('preprocessor', preprocessor),
        ('ridge', Ridge(alpha=1.0))
    ])

    # Train/test split per position
    X_train, X_test, y_train, y_test = train_test_split(
        X_pos, y_pos, test_size=0.2, random_state=42
    )

    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results[pos] = (mse, r2)

print("Per-position linear regression results:")
for pos, (mse, r2) in results.items():
    print(f"{pos} -> MSE: {mse:.2f}, R2: {r2:.2f}")


Per-position linear regression results:
QB -> MSE: 5107.94, R2: 0.47
WR -> MSE: 2225.32, R2: 0.21
RB -> MSE: 3264.00, R2: -0.35
TE -> MSE: 57577.92, R2: -61.77




# Results


## Model results for data 2014-2024:
- CatBootst: MSE: 3664.83, R2: 0.29
- Linear: 
  - QB -> MSE: 13757.22, R2: -0.51
  - WR -> MSE: 3909.36, R2: -0.05
  - TE -> MSE: 20680.57, R2: -16.27
  - RB -> MSE: 3505.07, R2: 0.06
- XGBoost: MSE: 4066.48, R2: 0.21
  - 5-fold CV R2: 0.30 ± 0.09
- Random Forest: MSE: 3867.75, R2: 0.25

## Model Results for 2019-2024:
- XGBoost: MSE: 2909.64, R2: 0.16
  - 5-fold CV R2: 0.05 ± 0.45
- CatBoost: MSE: 2587.57, R2: 0.25
- Random Forest: MSE: 2468.86, R2: 0.29
- Linear:
  - QB -> MSE: 5107.94, R2: 0.47
  - WR -> MSE: 2225.32, R2: 0.21
  - RB -> MSE: 3264.00, R2: -0.35
  - TE -> MSE: 57577.92, R2: -61.77

# Predict Fantasy Points


In [109]:
rookies = pd.read_csv('df_master_rookies_2025.csv')
print(rookies.columns)

Index(['player_name', 'position', 'team', 'draft_year', 'draft_round',
       'draft_pick_overall', 'age_on_draft_day', 'C_season', 'player_id',
       'C_team', 'C_conference', 'C_passing_TD', 'C_passing_YDS',
       'C_passing_INT', 'C_rushing_TD', 'C_rushing_YDS', 'C_receiving_REC',
       'C_receiving_TD', 'C_receiving_YDS', 'C_fumbles_LOST', 'C_passing_ATT',
       'C_passing_COMPLETIONS', 'C_passing_PCT', 'C_passing_YPA',
       'C_rushing_CAR', 'C_rushing_YPC', 'C_rushing_LONG', 'C_receiving_YPR',
       'C_receiving_LONG', 'C_fumbles_FUM', 'C_conference_strength',
       'height_in', 'weight_lb'],
      dtype='object')


In [110]:
# Predict fantasy points
rookies_predictions_xgb = pipeline_xgb.predict(rookies)
rookies_predictions_rf = model_RF.predict(rookies)

# Add predictions to the dataframe
rookies['predicted_fp_xgb'] = rookies_predictions_xgb
rookies['predicted_fp_rf'] = rookies_predictions_rf

print(rookies[['player_name', 'position', 'predicted_fp_rf']])



ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [89]:
# Save to csv
rookies.to_csv('df_rookie_predictions_2025_xgb.csv', index=False)