# 

# Model Selection

## Choosing Model and Hyper-Parameter Tuning

In [27]:

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cross_decomposition import PLSRegression


### Preparing Training Data

In [2]:
training = pd.read_csv('~/Desktop/projects/Fantasy-Football-Predictor-2025/data/training.csv')

In [3]:
training = training.drop(['Unnamed: 0', 'Unnamed: 0_last1.1'], axis = 1)
training

Unnamed: 0,player_name,position,team,week,season,passing_yards_avg,passing_tds_avg,interceptions_avg,passing_epa_avg,carries_avg,...,opp_first_downs_last3,opp_yards_per_play_last3,opp_turnovers_last3,opp_pts_allowed_avg,opp_pts_allowed_last1,opp_pts_allowed_last3,is_home,spread_line,total_line,implied_team_total
0,Chase Edmonds,RB,ARI,1,2020,0.000000,0.000000,0.000000,0.000000,6.000000,...,23.333333,3.858866,0.333333,19.526316,31.0,20.333333,0,7.0,48.5,20.75
1,Christian Kirk,WR,ARI,1,2020,0.000000,0.000000,0.000000,0.000000,0.769231,...,23.333333,3.858866,0.333333,19.526316,31.0,20.333333,0,7.0,48.5,20.75
2,Dan Arnold,TE,ARI,1,2020,0.000000,0.000000,0.000000,0.000000,0.000000,...,23.333333,3.858866,0.333333,19.526316,31.0,20.333333,0,7.0,48.5,20.75
3,DeAndre Hopkins,WR,ARI,1,2020,0.352941,0.058824,0.058824,-2.789241,0.117647,...,23.333333,3.858866,0.333333,19.526316,31.0,20.333333,0,7.0,48.5,20.75
4,Kenyan Drake,RB,ARI,1,2020,0.000000,0.000000,0.000000,0.000000,12.000000,...,23.333333,3.858866,0.333333,19.526316,31.0,20.333333,0,7.0,48.5,20.75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23409,Jeremy McNichols,RB,WAS,17,2024,0.000000,0.000000,0.000000,0.000000,4.153846,...,17.666667,3.852186,2.000000,23.266667,7.0,19.333333,1,3.5,46.5,25.00
23410,John Bates,TE,WAS,17,2024,0.000000,0.000000,0.000000,0.000000,0.000000,...,17.666667,3.852186,2.000000,23.266667,7.0,19.333333,1,3.5,46.5,25.00
23411,Olamide Zaccheaus,WR,WAS,17,2024,0.000000,0.000000,0.000000,0.000000,0.000000,...,17.666667,3.852186,2.000000,23.266667,7.0,19.333333,1,3.5,46.5,25.00
23412,Terry McLaurin,WR,WAS,17,2024,0.000000,0.000000,0.000000,0.000000,0.133333,...,17.666667,3.852186,2.000000,23.266667,7.0,19.333333,1,3.5,46.5,25.00


In [4]:
training = pd.get_dummies(training, columns= ['position'])

training

Unnamed: 0,player_name,team,week,season,passing_yards_avg,passing_tds_avg,interceptions_avg,passing_epa_avg,carries_avg,rushing_yards_avg,...,opp_pts_allowed_last3,is_home,spread_line,total_line,implied_team_total,position_FB,position_QB,position_RB,position_TE,position_WR
0,Chase Edmonds,ARI,1,2020,0.000000,0.000000,0.000000,0.000000,6.000000,30.300000,...,20.333333,0,7.0,48.5,20.75,0,0,1,0,0
1,Christian Kirk,ARI,1,2020,0.000000,0.000000,0.000000,0.000000,0.769231,7.153846,...,20.333333,0,7.0,48.5,20.75,0,0,0,0,1
2,Dan Arnold,ARI,1,2020,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,20.333333,0,7.0,48.5,20.75,0,0,0,1,0
3,DeAndre Hopkins,ARI,1,2020,0.352941,0.058824,0.058824,-2.789241,0.117647,1.058824,...,20.333333,0,7.0,48.5,20.75,0,0,0,0,1
4,Kenyan Drake,ARI,1,2020,0.000000,0.000000,0.000000,0.000000,12.000000,56.071429,...,20.333333,0,7.0,48.5,20.75,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23409,Jeremy McNichols,WAS,17,2024,0.000000,0.000000,0.000000,0.000000,4.153846,19.692308,...,19.333333,1,3.5,46.5,25.00,0,0,1,0,0
23410,John Bates,WAS,17,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,19.333333,1,3.5,46.5,25.00,0,0,0,1,0
23411,Olamide Zaccheaus,WAS,17,2024,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,19.333333,1,3.5,46.5,25.00,0,0,0,0,1
23412,Terry McLaurin,WAS,17,2024,0.000000,0.000000,0.000000,0.000000,0.133333,0.133333,...,19.333333,1,3.5,46.5,25.00,0,0,0,0,1


In [5]:
# create target column
weekly = pd.read_csv('~/Desktop/projects/Fantasy-Football-Predictor-2025/data/weekly.csv')

training['fantasy_points_ppr'] = weekly['fantasy_points_ppr']

In [6]:
training.set_index(['player_name', 'team', 'season', 'week'], inplace = True)

training

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,passing_yards_avg,passing_tds_avg,interceptions_avg,passing_epa_avg,carries_avg,rushing_yards_avg,rushing_tds_avg,fumbles_avg,fumbles_lost_avg,rushing_epa_avg,...,is_home,spread_line,total_line,implied_team_total,position_FB,position_QB,position_RB,position_TE,position_WR,fantasy_points_ppr
player_name,team,season,week,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Chase Edmonds,ARI,2020,1,0.000000,0.000000,0.000000,0.000000,6.000000,30.300000,0.400000,0.000000,0.000000,0.265610,...,0,7.0,48.5,20.75,0,0,1,0,0,25.64
Christian Kirk,ARI,2020,1,0.000000,0.000000,0.000000,0.000000,0.769231,7.153846,0.000000,0.000000,0.000000,0.492844,...,0,7.0,48.5,20.75,0,0,0,0,1,24.66
Dan Arnold,ARI,2020,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0,7.0,48.5,20.75,0,0,0,1,0,20.14
DeAndre Hopkins,ARI,2020,1,0.352941,0.058824,0.058824,-2.789241,0.117647,1.058824,0.000000,0.117647,0.058824,0.988049,...,0,7.0,48.5,20.75,0,0,0,0,1,3.70
Kenyan Drake,ARI,2020,1,0.000000,0.000000,0.000000,0.000000,12.000000,56.071429,0.571429,0.142857,0.071429,-0.044920,...,0,7.0,48.5,20.75,0,0,1,0,0,23.92
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jeremy McNichols,WAS,2024,17,0.000000,0.000000,0.000000,0.000000,4.153846,19.692308,0.307692,0.000000,0.000000,0.630134,...,1,3.5,46.5,25.00,0,0,1,0,0,9.20
John Bates,WAS,2024,17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.125000,0.000000,...,1,3.5,46.5,25.00,0,0,0,1,0,6.70
Olamide Zaccheaus,WAS,2024,17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1,3.5,46.5,25.00,0,0,0,0,1,2.20
Terry McLaurin,WAS,2024,17,0.000000,0.000000,0.000000,0.000000,0.133333,0.133333,0.000000,0.066667,0.066667,-3.091181,...,1,3.5,46.5,25.00,0,0,0,0,1,5.80


### Creating cross-validation split

In [23]:
# define folds
cv = KFold(n_splits = 3, shuffle = True, random_state = 88) # DezCaughtIt

### Creating X, Y


In [8]:
Y = training['fantasy_points_ppr']
X = training.drop(columns = ['fantasy_points_ppr'])

In [9]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,passing_yards_avg,passing_tds_avg,interceptions_avg,passing_epa_avg,carries_avg,rushing_yards_avg,rushing_tds_avg,fumbles_avg,fumbles_lost_avg,rushing_epa_avg,...,opp_pts_allowed_last3,is_home,spread_line,total_line,implied_team_total,position_FB,position_QB,position_RB,position_TE,position_WR
player_name,team,season,week,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
Chase Edmonds,ARI,2020,1,0.000000,0.000000,0.000000,0.000000,6.000000,30.300000,0.400000,0.000000,0.000000,0.265610,...,20.333333,0,7.0,48.5,20.75,0,0,1,0,0
Christian Kirk,ARI,2020,1,0.000000,0.000000,0.000000,0.000000,0.769231,7.153846,0.000000,0.000000,0.000000,0.492844,...,20.333333,0,7.0,48.5,20.75,0,0,0,0,1
Dan Arnold,ARI,2020,1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,20.333333,0,7.0,48.5,20.75,0,0,0,1,0
DeAndre Hopkins,ARI,2020,1,0.352941,0.058824,0.058824,-2.789241,0.117647,1.058824,0.000000,0.117647,0.058824,0.988049,...,20.333333,0,7.0,48.5,20.75,0,0,0,0,1
Kenyan Drake,ARI,2020,1,0.000000,0.000000,0.000000,0.000000,12.000000,56.071429,0.571429,0.142857,0.071429,-0.044920,...,20.333333,0,7.0,48.5,20.75,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Jeremy McNichols,WAS,2024,17,0.000000,0.000000,0.000000,0.000000,4.153846,19.692308,0.307692,0.000000,0.000000,0.630134,...,19.333333,1,3.5,46.5,25.00,0,0,1,0,0
John Bates,WAS,2024,17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.125000,0.125000,0.000000,...,19.333333,1,3.5,46.5,25.00,0,0,0,1,0
Olamide Zaccheaus,WAS,2024,17,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,19.333333,1,3.5,46.5,25.00,0,0,0,0,1
Terry McLaurin,WAS,2024,17,0.000000,0.000000,0.000000,0.000000,0.133333,0.133333,0.000000,0.066667,0.066667,-3.091181,...,19.333333,1,3.5,46.5,25.00,0,0,0,0,1


In [10]:
Y

player_name        team  season  week
Chase Edmonds      ARI   2020    1       25.64
Christian Kirk     ARI   2020    1       24.66
Dan Arnold         ARI   2020    1       20.14
DeAndre Hopkins    ARI   2020    1        3.70
Kenyan Drake       ARI   2020    1       23.92
                                         ...  
Jeremy McNichols   WAS   2024    17       9.20
John Bates         WAS   2024    17       6.70
Olamide Zaccheaus  WAS   2024    17       2.20
Terry McLaurin     WAS   2024    17       5.80
Zach Ertz          WAS   2024    17      10.00
Name: fantasy_points_ppr, Length: 23414, dtype: float64

# Model 1: Linear Regression

In [65]:
lr = LinearRegression()
scores = cross_val_score(lr, X, Y, cv=cv, scoring='neg_root_mean_squared_error')

print(-np.mean(scores))

8.138754124664604


RMSE: 8.138754124664604

# Model 2: Ridge Regression

In [79]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge())
])

# define parameter grid
param_grid = {'ridge__alpha': np.logspace(-3, 5, 20)}

# define grid search
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv)

# run the grid search
grid.fit(X, Y)

# return best parameter and RMSE
print("Best alpha:", grid.best_params_['ridge__alpha'])
print("Best RMSE:", -grid.best_score_)

Best alpha: 5455.594781168515
Best RMSE: 8.130371474542098


Best alpha: 5455.594781168515
Best RMSE: 8.130371474542098

# Model 3: LASSO Regression

In [80]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('lasso', Lasso(max_iter=10000))
])
# define parameter grid
param_grid = {'lasso__alpha': np.logspace(-3, 3, 20)}

# define grid search
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv)

# run the grid search
grid.fit(X, Y)

# return best parameter and RMSE
print("Best alpha:", grid.best_params_['lasso__alpha'])
print("Best RMSE:", -grid.best_score_)

Best alpha: 0.0379269019073225
Best RMSE: 8.129782526704671


Best alpha: 0.0379269019073225
Best RMSE: 8.129782526704671

# Model 4: Elastic Net (L1 & L2 Penalty)

In [None]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('elasticnet', ElasticNet(max_iter=10000))
])

# Parameter grid:
# - alpha: overall regularization strength
# - l1_ratio: 0 = Ridge, 1 = Lasso, values in between mix both
param_grid = {
    'elasticnet__alpha': np.logspace(-3, 3, 10),   
    'elasticnet__l1_ratio': np.linspace(0, 1, 6)
}

# define grid search
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv)

# run the grid search
grid.fit(X, Y)




Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sklearn.linear_model.Ridge/RidgeCV instead.
  model = cd_fast.enet_coordinate_descent(
Linear regression models with a zero l1 penalization strength are more efficiently fitted using one of the solvers implemented in sk

KeyError: 'lasso__alpha'

In [84]:
# return best parameter and RMSE
print("Best alpha:", grid.best_params_['elasticnet__alpha'])
print("Best RMSE:", -grid.best_score_)

Best alpha: 0.021544346900318832
Best RMSE: 8.129390209207472


# Model 5: Partial Least Squares (PLS)

In [11]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pls', PLSRegression())
])

param_grid = {
    'pls__n_components': list(range(1, 30))
}

grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv
)

grid.fit(X, Y)

0,1,2
,estimator,Pipeline(step...egression())])
,param_grid,"{'pls__n_components': [1, 2, ...]}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_components,7
,scale,True
,max_iter,500
,tol,1e-06
,copy,True


In [12]:

print("Best n_components:", grid.best_params_['pls__n_components'])
print("Best RMSE:", -grid.best_score_)

Best n_components: 7
Best RMSE: 8.134002644477196


# Model 6: Random Forest

In [45]:
rf = RandomForestRegressor(random_state = 4) #D4K

param_grid = {
    'n_estimators': [200],
    'max_depth': [None, 20, 50],
    'min_samples_split': [2, 10, 20, 50],
    'min_samples_leaf': [2, 4, 6, 10],
    'max_features': [0.5, 'sqrt', 0.3, 0.8, 0.75]
}

grid = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs = -1,
    verbose = 2
)

grid.fit(X, Y)

Fitting 3 folds for each of 240 candidates, totalling 720 fits


0,1,2
,estimator,RandomForestR...andom_state=4)
,param_grid,"{'max_depth': [None, 20, ...], 'max_features': [0.5, 'sqrt', ...], 'min_samples_leaf': [2, 4, ...], 'min_samples_split': [2, 10, ...], ...}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_estimators,200
,criterion,'squared_error'
,max_depth,50
,min_samples_split,20
,min_samples_leaf,6
,min_weight_fraction_leaf,0.0
,max_features,0.8
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [46]:
print("Best RMSE:", -grid.best_score_)
print("Best parameters found:", grid.best_params_)


Best RMSE: 7.245198768018202
Best parameters found: {'max_depth': 50, 'max_features': 0.8, 'min_samples_leaf': 6, 'min_samples_split': 20, 'n_estimators': 200}


Best RMSE: 7.245198768018202

Best parameters found: 
- 'max_depth': 50
- 'max_features': 0.8
- 'min_samples_leaf': 6
- 'min_samples_split': 20
- 'n_estimators': 200

# Model 7: Histogram-Based Gradient Boosting Regressor

In [40]:
hgb = HistGradientBoostingRegressor(random_state = 4) # D4K

param_grid = {
    'max_iter': [1000, 1500, 2000],
    'learning_rate': [0.1, 0.2, 0.3],
    'max_depth': [15, 20, 25],
    'min_samples_leaf': [5, 10, 15],
    'l2_regularization': [0.0, 0.05, 0.1]
}


grid = GridSearchCV(
    estimator=hgb,
    param_grid=param_grid,
    scoring='neg_root_mean_squared_error',
    cv=cv,
    n_jobs=-1,
    verbose=1
)

grid.fit(X, Y)

Fitting 3 folds for each of 243 candidates, totalling 729 fits


0,1,2
,estimator,HistGradientB...andom_state=4)
,param_grid,"{'l2_regularization': [0.0, 0.05, ...], 'learning_rate': [0.1, 0.2, ...], 'max_depth': [15, 20, ...], 'max_iter': [1000, 1500, ...], ...}"
,scoring,'neg_root_mean_squared_error'
,n_jobs,-1
,refit,True
,cv,KFold(n_split... shuffle=True)
,verbose,1
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,loss,'squared_error'
,quantile,
,learning_rate,0.1
,max_iter,1000
,max_leaf_nodes,31
,max_depth,15
,min_samples_leaf,15
,l2_regularization,0.1
,max_features,1.0
,max_bins,255


In [39]:
print("Best RMSE:", -grid.best_score_)
print("Best params:", grid.best_params_)

Best RMSE: 7.474910242250448
Best params: {'l2_regularization': 0.1, 'learning_rate': 0.05, 'max_depth': 15, 'max_iter': 500, 'min_samples_leaf': 20}
