# Flex model
This notebook follows the same workflow as our QB model notebook, applied to the flex positions (RB, WR, and TE).

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nfl_data_py as nfl
import sqlite3

In [2]:
#from collections import defaultdict

Let's set up our error dictionaries to keep track of the models. We'll need separate models for FanDuel and DraftKings since the scoring is a little different.

In [3]:
# error_dict_FD = defaultdict(float)
# error_dict_DK = defaultdict(float)

Let's bring in our dataset from the database.

In [4]:
# Load the full flex dataset from the local SQLite database.
query = "SELECT * FROM flex_dataset"

conn = sqlite3.connect('nfl_dfs.db')
try:
    flex_df = pd.read_sql_query(query, conn)
finally:
    # Always release the connection, even if the query raises
    # (e.g. the table is missing), so the database file is not left open.
    conn.close()

In [5]:
flex_df.head()

Unnamed: 0,season,week,team,player_id,player_display_name,position,FD_Pts,DK_Pts,DK_Pts_RB_DvP,DK_Pts_TE_DvP,...,opp_total,qb_comp,qb_att,qb_yds,qb_pass_td,qb_int,qb_comp_pct,qb_yds_per_att,qb_td_pct,qb_int_pct
0,2006,2,ARI,00-0000552,Obafemi Ayanbadejo,RB,2.7,3.7,13.3,8.1,...,27.5,23.0,37.0,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0
1,2006,2,ARI,00-0008241,Edgerrin James,RB,13.2,16.7,13.3,8.1,...,27.5,23.0,37.0,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0
2,2006,2,ARI,00-0019552,Troy Walters,WR,2.0,3.0,13.3,8.1,...,27.5,23.0,37.0,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0
3,2006,2,ARI,00-0022084,Anquan Boldin,WR,9.2,12.2,13.3,8.1,...,27.5,23.0,37.0,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0
4,2006,2,ARI,00-0022156,Bryant Johnson,WR,10.5,11.0,13.3,8.1,...,27.5,23.0,37.0,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0


In [6]:
flex_df.tail()

Unnamed: 0,season,week,team,player_id,player_display_name,position,FD_Pts,DK_Pts,DK_Pts_RB_DvP,DK_Pts_TE_DvP,...,opp_total,qb_comp,qb_att,qb_yds,qb_pass_td,qb_int,qb_comp_pct,qb_yds_per_att,qb_td_pct,qb_int_pct
80810,2023,22,SF,00-0034407,Ray-Ray McCloud,WR,2.4,2.9,19.6125,11.85,...,22.75,19.75,29.875,267.125,1.875,0.875,0.661088,8.941423,0.062762,0.029289
80811,2023,22,SF,00-0035719,Deebo Samuel,WR,5.6,7.1,19.6125,11.85,...,22.75,19.75,29.875,267.125,1.875,0.875,0.661088,8.941423,0.062762,0.029289
80812,2023,22,SF,00-0036259,Jauan Jennings,WR,17.04,19.04,19.6125,11.85,...,22.75,19.75,29.875,267.125,1.875,0.875,0.661088,8.941423,0.062762,0.029289
80813,2023,22,SF,00-0036261,Brandon Aiyuk,WR,6.4,7.9,19.6125,11.85,...,22.75,19.75,29.875,267.125,1.875,0.875,0.661088,8.941423,0.062762,0.029289
80814,2023,22,SF,00-0036567,Elijah Mitchell,RB,0.8,0.8,19.6125,11.85,...,22.75,19.75,29.875,267.125,1.875,0.875,0.661088,8.941423,0.062762,0.029289


In [7]:
flex_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80815 entries, 0 to 80814
Data columns (total 50 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season                          80815 non-null  int64  
 1   week                            80815 non-null  int64  
 2   team                            80815 non-null  object 
 3   player_id                       80815 non-null  object 
 4   player_display_name             80815 non-null  object 
 5   position                        80815 non-null  object 
 6   FD_Pts                          80815 non-null  float64
 7   DK_Pts                          80815 non-null  float64
 8   DK_Pts_RB_DvP                   80815 non-null  float64
 9   DK_Pts_TE_DvP                   80815 non-null  float64
 10  DK_Pts_WR_DvP                   80815 non-null  float64
 11  FD_Pts_RB_DvP                   80815 non-null  float64
 12  FD_Pts_TE_DvP                   

In [8]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score 
from sklearn.model_selection import train_test_split 
from sklearn import linear_model, preprocessing 

In [9]:
# Keep only rows from even-numbered seasons.
# NOTE(review): odd seasons are presumably held out for later use — confirm.
flex_model_even = flex_df.loc[flex_df['season'].mod(2).eq(0)]

In [10]:
# Working frame for the rest of the notebook; it starts as the even-season subset.
main_df = flex_model_even

In [11]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40039 entries, 0 to 75971
Data columns (total 50 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   season                          40039 non-null  int64  
 1   week                            40039 non-null  int64  
 2   team                            40039 non-null  object 
 3   player_id                       40039 non-null  object 
 4   player_display_name             40039 non-null  object 
 5   position                        40039 non-null  object 
 6   FD_Pts                          40039 non-null  float64
 7   DK_Pts                          40039 non-null  float64
 8   DK_Pts_RB_DvP                   40039 non-null  float64
 9   DK_Pts_TE_DvP                   40039 non-null  float64
 10  DK_Pts_WR_DvP                   40039 non-null  float64
 11  FD_Pts_RB_DvP                   40039 non-null  float64
 12  FD_Pts_TE_DvP                   40039

In [12]:
main_df['season'].value_counts()

season
2022    4863
2020    4681
2014    4604
2012    4549
2016    4547
2018    4545
2010    4437
2008    4099
2006    3714
Name: count, dtype: int64

In [13]:
main_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
season,40039.0,2014.327481,5.098584,2006.0,2010.0,2014.0,2018.0,2022.0
week,40039.0,9.620445,5.279488,1.0,5.0,10.0,14.0,22.0
FD_Pts,40039.0,6.897175,6.784428,-4.7,1.7,4.8,10.2,54.7
DK_Pts,40039.0,8.404293,8.07124,-2.7,2.2,6.0,12.2,60.9
DK_Pts_RB_DvP,40039.0,24.267485,4.804113,5.3,20.875,23.9125,27.3125,47.133333
DK_Pts_TE_DvP,40039.0,12.039609,3.315715,0.0,9.75,11.825,14.15,25.8
DK_Pts_WR_DvP,40039.0,34.47859,6.020137,6.4,30.5625,34.4375,38.3675,63.3125
FD_Pts_RB_DvP,40039.0,20.973706,4.317832,4.8,17.9,20.6375,23.7625,42.3
FD_Pts_TE_DvP,40039.0,9.602321,2.755943,0.0,7.65,9.4625,11.325,21.925
FD_Pts_WR_DvP,40039.0,27.530229,4.878245,4.9,24.357143,27.5025,30.8,48.8125


# One-hot encoding
We one-hot encode each player's position, because `receiving_yards_after_catch_L8` matters far more for WRs and TEs than for RBs, while `rushing_yards_L8` matters far more for RBs than for WRs and TEs.

In [14]:
# One indicator column per position value, named pos_<position> (pos_RB, pos_TE, pos_WR).
position_dummies = pd.get_dummies(main_df['position'], prefix='pos')

In [15]:
# Append the indicator columns to the right of the existing columns (column-wise concat).
main_df = pd.concat([main_df, position_dummies], axis=1)

In [16]:
main_df.head()

Unnamed: 0,season,week,team,player_id,player_display_name,position,FD_Pts,DK_Pts,DK_Pts_RB_DvP,DK_Pts_TE_DvP,...,qb_yds,qb_pass_td,qb_int,qb_comp_pct,qb_yds_per_att,qb_td_pct,qb_int_pct,pos_RB,pos_TE,pos_WR
0,2006,2,ARI,00-0000552,Obafemi Ayanbadejo,RB,2.7,3.7,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,True,False,False
1,2006,2,ARI,00-0008241,Edgerrin James,RB,13.2,16.7,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,True,False,False
2,2006,2,ARI,00-0019552,Troy Walters,WR,2.0,3.0,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,False,False,True
3,2006,2,ARI,00-0022084,Anquan Boldin,WR,9.2,12.2,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,False,False,True
4,2006,2,ARI,00-0022156,Bryant Johnson,WR,10.5,11.0,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,False,False,True


In [17]:
# get_dummies produced boolean indicators; store them as 0/1 integers instead.
for dummy_col in ('pos_RB', 'pos_TE', 'pos_WR'):
    main_df[dummy_col] = main_df[dummy_col].astype(int)

In [18]:
main_df.head()

Unnamed: 0,season,week,team,player_id,player_display_name,position,FD_Pts,DK_Pts,DK_Pts_RB_DvP,DK_Pts_TE_DvP,...,qb_yds,qb_pass_td,qb_int,qb_comp_pct,qb_yds_per_att,qb_td_pct,qb_int_pct,pos_RB,pos_TE,pos_WR
0,2006,2,ARI,00-0000552,Obafemi Ayanbadejo,RB,2.7,3.7,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,1,0,0
1,2006,2,ARI,00-0008241,Edgerrin James,RB,13.2,16.7,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,1,0,0
2,2006,2,ARI,00-0019552,Troy Walters,WR,2.0,3.0,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,0,0,1
3,2006,2,ARI,00-0022084,Anquan Boldin,WR,9.2,12.2,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,0,0,1
4,2006,2,ARI,00-0022156,Bryant Johnson,WR,10.5,11.0,13.3,8.1,...,301.0,3.0,0.0,0.621622,8.135135,0.081081,0.0,0,0,1


In [19]:
# Move the identifying columns into the index so they are not treated as
# model inputs. Rebinding the result (rather than inplace=True) avoids
# mutating the frame behind any earlier reference to it.
main_df = main_df.set_index(
    ['season', 'week', 'player_id', 'player_display_name', 'position', 'team', 'opponent']
)

In [20]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 40039 entries, (2006, 2, '00-0000552', 'Obafemi Ayanbadejo', 'RB', 'ARI', 'SEA') to (2022, 22, '00-0036919', 'Kenneth Gainwell', 'RB', 'PHI', 'KC')
Data columns (total 46 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   FD_Pts                          40039 non-null  float64
 1   DK_Pts                          40039 non-null  float64
 2   DK_Pts_RB_DvP                   40039 non-null  float64
 3   DK_Pts_TE_DvP                   40039 non-null  float64
 4   DK_Pts_WR_DvP                   40039 non-null  float64
 5   FD_Pts_RB_DvP                   40039 non-null  float64
 6   FD_Pts_TE_DvP                   40039 non-null  float64
 7   FD_Pts_WR_DvP                   40039 non-null  float64
 8   rushing_yards_L8                40039 non-null  float64
 9   rushing_tds_L8                  40039 non-null  float64
 10  rushing_fumbles_lost_L8         

# Correlations

In [21]:
# Pairwise correlation matrix (pandas default method: Pearson) over the targets and all features.
corr_df = main_df.corr()

In [22]:
corr_df['FD_Pts'].sort_values(ascending = False)

FD_Pts                            1.000000
DK_Pts                            0.991358
receiving_yards_after_catch_L8    0.385577
receptions_L8                     0.382620
target_share_L8                   0.359013
targets_L8                        0.347801
receiving_yards_L8                0.343311
in_10_share_L8                    0.315164
rushing_yards_L8                  0.276919
carry_share_L8                    0.265971
carries_L8                        0.265165
rushing_tds_L8                    0.242546
receiving_tds_L8                  0.227152
receiving_air_yards_L8            0.213579
rushing_fumbles_lost_L8           0.125841
pred_total                        0.105739
yards_per_carry_L8                0.090849
yards_per_target_L8               0.088149
spread_line                       0.079499
qb_pass_td                        0.076429
qb_yds                            0.075921
total_line                        0.069490
qb_comp                           0.068338
pos_WR     

In [23]:
corr_df['DK_Pts'].sort_values(ascending = False)

DK_Pts                            1.000000
FD_Pts                            0.991358
receptions_L8                     0.423298
receiving_yards_after_catch_L8    0.406460
target_share_L8                   0.400407
targets_L8                        0.389418
receiving_yards_L8                0.386355
in_10_share_L8                    0.294587
receiving_tds_L8                  0.257146
receiving_air_yards_L8            0.254788
rushing_yards_L8                  0.238196
carry_share_L8                    0.227062
carries_L8                        0.225546
rushing_tds_L8                    0.208494
rushing_fumbles_lost_L8           0.108424
yards_per_target_L8               0.105408
pred_total                        0.102673
pos_WR                            0.098017
yards_per_reception_L8            0.086044
yards_per_carry_L8                0.080313
qb_yds                            0.078828
qb_pass_td                        0.077890
spread_line                       0.074984
receiving_f

In [24]:
# Feature matrices: everything except the two target columns.
# Both sites currently use the same set of feature columns.
X_FD = main_df.drop(['FD_Pts', 'DK_Pts'], axis=1)
X_DK = main_df.drop(['FD_Pts', 'DK_Pts'], axis=1)

In [25]:
# Targets as 1-D Series (single brackets). Selecting with double brackets
# yields a one-column DataFrame, which scikit-learn estimators accept only
# with a DataConversionWarning ("a column-vector y was passed").
y_FD = main_df['FD_Pts']
y_DK = main_df['DK_Pts']

In [26]:
# Random 75/25 row split for the FanDuel target; random_state fixes the shuffle.
X_FD_train, X_FD_test, y_FD_train, y_FD_test = train_test_split(X_FD, y_FD, test_size = .25, random_state = 42)

In [27]:
# Random 75/25 row split for the DraftKings target, using the same test_size and
# random_state as the FanDuel split.
X_DK_train, X_DK_test, y_DK_train, y_DK_test = train_test_split(X_DK, y_DK, test_size = .25, random_state = 42)

In [28]:
from sklearn.preprocessing import StandardScaler

In [29]:
# One independent standardizer per site, each fitted on its own training split.
scaler_fd, scaler_dk = StandardScaler(), StandardScaler()

In [30]:
# Learn the per-feature mean and scale from the FanDuel training rows only,
# then standardize those same rows.
scaler_fd.fit(X_FD_train)
X_FD_train_scaled = scaler_fd.transform(X_FD_train)

In [31]:
# Use the already-fitted scaler to transform the test data for FanDuel.
# Transform only: refitting here would let test-set statistics leak into the scaling.
X_FD_test_scaled = scaler_fd.transform(X_FD_test)

In [32]:
# Learn the per-feature mean and scale from the DraftKings training rows only,
# then standardize those same rows.
scaler_dk.fit(X_DK_train)
X_DK_train_scaled = scaler_dk.transform(X_DK_train)

In [33]:
# Use the already-fitted scaler to transform the test data for DraftKings.
# Transform only: refitting here would let test-set statistics leak into the scaling.
X_DK_test_scaled = scaler_dk.transform(X_DK_test)

In [34]:
# Wrap the scaled training values in a DataFrame so the columns keep their feature names.
# No index is passed, so rows get a fresh default index rather than the training frame's index.
X_FD_scaled_df = pd.DataFrame(X_FD_train_scaled, columns = X_FD_train.columns)

In [35]:
# Sanity check on the FanDuel scaling: means should be ~0 and standard deviations ~1.
# (pandas' .std() defaults to ddof=1 while StandardScaler uses the population
# std, so the printed values sit marginally above 1.)
print("FanDuel Scaled Data - Mean:", X_FD_scaled_df.mean(axis=0), sep="\n")

print("\nFanDuel Scaled Data - Standard Deviation:", X_FD_scaled_df.std(axis=0), sep="\n")

FanDuel Scaled Data - Mean:
DK_Pts_RB_DvP                     1.166531e-16
DK_Pts_TE_DvP                     5.158291e-17
DK_Pts_WR_DvP                    -6.156822e-16
FD_Pts_RB_DvP                    -3.714916e-16
FD_Pts_TE_DvP                     2.015993e-16
FD_Pts_WR_DvP                    -3.234580e-16
rushing_yards_L8                 -1.230418e-17
rushing_tds_L8                    5.891809e-17
rushing_fumbles_lost_L8           8.045041e-18
receptions_L8                    -2.687990e-16
receiving_yards_L8               -1.325066e-17
receiving_tds_L8                  2.129570e-17
receiving_fumbles_lost_L8        -1.774641e-17
targets_L8                        7.098565e-17
carries_L8                       -1.325066e-17
receiving_yards_after_catch_L8    1.484783e-16
receiving_air_yards_L8            1.074250e-16
target_share_L8                   3.312664e-18
carry_share_L8                    1.019827e-16
in_10_share_L8                   -7.855746e-17
yards_per_carry_L8              

In [36]:
# Wrap the scaled training values in a DataFrame so the columns keep their feature names.
# No index is passed, so rows get a fresh default index rather than the training frame's index.
X_DK_scaled_df = pd.DataFrame(X_DK_train_scaled, columns = X_DK_train.columns)

In [37]:
# Sanity check on the DraftKings scaling: means should be ~0 and standard deviations ~1.
# (pandas' .std() defaults to ddof=1 while StandardScaler uses the population
# std, so the printed values sit marginally above 1.)
print("\nDraftKings Scaled Data - Mean:", X_DK_scaled_df.mean(axis=0), sep="\n")

print("\nDraftKings Scaled Data - Standard Deviation:", X_DK_scaled_df.std(axis=0), sep="\n")


DraftKings Scaled Data - Mean:
DK_Pts_RB_DvP                     1.166531e-16
DK_Pts_TE_DvP                     5.158291e-17
DK_Pts_WR_DvP                    -6.156822e-16
FD_Pts_RB_DvP                    -3.714916e-16
FD_Pts_TE_DvP                     2.015993e-16
FD_Pts_WR_DvP                    -3.234580e-16
rushing_yards_L8                 -1.230418e-17
rushing_tds_L8                    5.891809e-17
rushing_fumbles_lost_L8           8.045041e-18
receptions_L8                    -2.687990e-16
receiving_yards_L8               -1.325066e-17
receiving_tds_L8                  2.129570e-17
receiving_fumbles_lost_L8        -1.774641e-17
targets_L8                        7.098565e-17
carries_L8                       -1.325066e-17
receiving_yards_after_catch_L8    1.484783e-16
receiving_air_yards_L8            1.074250e-16
target_share_L8                   3.312664e-18
carry_share_L8                    1.019827e-16
in_10_share_L8                   -7.855746e-17
yards_per_carry_L8          

In [38]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [39]:
from sklearn.metrics import mean_absolute_error

In [40]:
from sklearn.model_selection import GridSearchCV

# K Nearest Neighbors
## FanDuel

In [41]:
# Lists to store the RMSE values and corresponding number of neighbors
# rmse_list_FD = []
# neighbors_list_FD = []

In [42]:
# param_grid = {
#     # 'n_neighbors': range(1, 101),  # Testing n_neighbors from 1 to 100
# }

In [43]:
# knn_FD = KNeighborsRegressor()

In [44]:
# grid_search = GridSearchCV(knn_FD, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

In [45]:
# grid_search.fit(X_FD_train_scaled, y_FD_train)

In [46]:
# # Best parameters found by GridSearchCV
# best_params = grid_search.best_params_
# print(f"Best parameters: {best_params}")

# # Best model
# best_knn_model = grid_search.best_estimator_

# # Evaluate on the test set
# y_pred = best_knn_model.predict(X_FD_test_scaled)

# best_rmse = mean_squared_error(y_FD_test, y_pred, squared=False)
# # Calculate and print the final errors
# print(f"Mean Absolute Error: {mean_absolute_error(y_FD_test, y_pred)}")
# print(f"Root Mean Squared Error: {best_rmse}")

In [47]:
# error_dict_FD['KNN'] = {'best_params': best_params, 'best_rmse': best_rmse}

## DraftKings

In [48]:
# knn_DK = KNeighborsRegressor()

In [49]:
# grid_search = GridSearchCV(knn_DK, param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

In [50]:
# grid_search.fit(X_DK_train_scaled, y_DK_train)

In [51]:
# # Best parameters found by GridSearchCV
# best_params = grid_search.best_params_
# print(f"Best parameters: {best_params}")

# # Best model
# best_knn_model = grid_search.best_estimator_

# # Evaluate on the test set
# y_pred = best_knn_model.predict(X_DK_test_scaled)

# best_rmse = mean_squared_error(y_DK_test, y_pred, squared=False)
# # Calculate and print the final errors
# print(f"Mean Absolute Error: {mean_absolute_error(y_DK_test, y_pred)}")
# print(f"Root Mean Squared Error: {best_rmse}")

In [52]:
# error_dict_DK['KNN'] = {'best_params': best_params, 'best_rmse': best_rmse}

# Linear Regression

## FanDuel

In [53]:
# base_LR_FD = LinearRegression()

In [54]:
# base_LR_FD.fit(X_FD_train_scaled, y_FD_train)

In [55]:
# base_LR_FD_pred = base_LR_FD.predict(X_FD_test_scaled)

In [56]:
# print(f"R_squared_train: {base_LR_FD.score(X_FD_train_scaled, y_FD_train)}")
# print(f"R_squared_test: {base_LR_FD.score(X_FD_test_scaled, y_FD_test)}")
# print(f"MAE: {mean_absolute_error(y_FD_test, base_LR_FD_pred)}")
# print(f"MSE: {mean_squared_error(y_FD_test, base_LR_FD_pred)}")
# print(f"RMSE: {mean_squared_error(y_FD_test, base_LR_FD_pred, squared = False)}")

In [57]:
# from sklearn.linear_model import Ridge

In [58]:
# alpha_values = np.linspace(0.1, 100, 50)

In [59]:
# param_grid = {'alpha': alpha_values}
# Ridge_LR_FD = Ridge()
# Ridge_CV_FD = GridSearchCV(Ridge_LR_FD, param_grid, cv = 5, scoring = 'neg_root_mean_squared_error')
# Ridge_CV_FD.fit(X_FD_train_scaled, y_FD_train)
# best_params = Ridge_CV_FD.best_params_
# print(f"Ridge_model best params: {best_params}")
# print(f"Ridge_model best score: {-Ridge_CV_FD.best_score_}")

In [60]:
# best_ridge_FD = Ridge(alpha=100, random_state=42)
# best_ridge_FD.fit(X_FD_train_scaled, y_FD_train)  # Use the correct variable name
# y_pred = best_ridge_FD.predict(X_FD_test_scaled)  # Use the correct variable name

# best_rmse = mean_squared_error(y_FD_test, y_pred, squared=False)
# print(f"best_ridge train R-squared: {best_ridge_FD.score(X_FD_train_scaled, y_FD_train)}")
# print(f"best_ridge test R-squared: {best_ridge_FD.score(X_FD_test_scaled, y_FD_test)}")
# print(f"best_ridge MAE: {mean_absolute_error(y_FD_test, y_pred)}")
# print(f"best_ridge RMSE: {best_rmse}")
# print(f"best_ridge MSE: {mean_squared_error(y_FD_test, y_pred)}")

In [61]:
# error_dict_FD['Linear_Regression'] = {'best_params': best_params, 'best_rmse': best_rmse}

## DraftKings

In [62]:
# base_LR_DK = LinearRegression()

In [63]:
# base_LR_DK.fit(X_DK_train_scaled, y_DK_train)

In [64]:
# base_LR_DK_pred = base_LR_DK.predict(X_DK_test_scaled)

In [65]:
# print(f"R_squared_train: {base_LR_DK.score(X_DK_train_scaled, y_DK_train)}")
# print(f"R_squared_test: {base_LR_DK.score(X_DK_test_scaled, y_DK_test)}")
# print(f"MAE: {mean_absolute_error(y_DK_test, base_LR_DK_pred)}")
# print(f"MSE: {mean_squared_error(y_DK_test, base_LR_DK_pred)}")
# print(f"RMSE: {mean_squared_error(y_DK_test, base_LR_DK_pred, squared = False)}")

In [66]:
# alpha_values = np.linspace(0.1, 100, 50)

In [67]:
# param_grid = {'alpha': alpha_values}
# Ridge_LR_DK = Ridge()
# Ridge_CV_DK = GridSearchCV(Ridge_LR_DK, param_grid, cv = 5, scoring = 'neg_root_mean_squared_error')
# Ridge_CV_DK.fit(X_DK_train_scaled, y_DK_train)
# best_params = Ridge_CV_DK.best_params_
# print(f"Ridge_model best params: {best_params}")
# print(f"Ridge_model best score: {-Ridge_CV_DK.best_score_}")

In [68]:
# best_ridge_DK = Ridge(alpha=100, random_state=42)
# best_ridge_DK.fit(X_DK_train_scaled, y_DK_train)  # Use the correct variable name
# y_pred = best_ridge_DK.predict(X_DK_test_scaled)  # Use the correct variable name

# best_rmse = mean_squared_error(y_DK_test, y_pred, squared=False)
# print(f"best_ridge train R-squared: {best_ridge_DK.score(X_DK_train_scaled, y_DK_train)}")
# print(f"best_ridge test R-squared: {best_ridge_DK.score(X_DK_test_scaled, y_DK_test)}")
# print(f"best_ridge MAE: {mean_absolute_error(y_DK_test, y_pred)}")
# print(f"best_ridge RMSE: {best_rmse}")
# print(f"best_ridge MSE: {mean_squared_error(y_DK_test, y_pred)}")

In [69]:
# error_dict_DK['Linear_Regression'] = {'best_params': best_params, 'best_rmse': best_rmse}

In [70]:
# error_dict_DK

In [71]:
from sklearn.metrics import make_scorer

In [72]:
def _rmse(y_true, y_pred):
    """Return the root mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Scorer for the hyperparameter searches. greater_is_better=False makes it
# return -RMSE, so "higher is better" still holds for the search.
# RMSE is computed explicitly because the `squared=False` flag that used to be
# forwarded to mean_squared_error was deprecated in scikit-learn 1.4 and
# removed in 1.6.
rmse_scorer = make_scorer(_rmse, greater_is_better=False)

In [73]:
from sklearn.ensemble import RandomForestRegressor

# Random Forest

## FanDuel

In [74]:
# base_RF_FD = RandomForestRegressor(random_state = 42)

In [75]:
# base_RF_FD.fit(X_FD_train_scaled, y_FD_train)

In [76]:
# base_RF_FD_pred = base_RF_FD.predict(X_FD_test_scaled)

In [77]:
# print(f"R_squared_train: {base_RF_FD.score(X_FD_train_scaled, y_FD_train)}")
# print(f"R_squared_test: {base_RF_FD.score(X_FD_test_scaled, y_FD_test)}")
# print(f"MAE: {mean_absolute_error(y_FD_test, base_RF_FD_pred)}")
# print(f"MSE: {mean_squared_error(y_FD_test, base_RF_FD_pred)}")
# print(f"RMSE: {mean_squared_error(y_FD_test, base_RF_FD_pred, squared = False)}")

In [78]:
from sklearn.model_selection import RandomizedSearchCV

In [79]:
# param_dist = {'max_depth': [3, 6, 9, 12],\
#               'n_estimators': [100, 250, 400],\
#              'min_samples_split': [2, 5, 10],\
#               'min_samples_leaf': [1, 2, 4]}
# y_FD_train = y_FD_train.ravel()
# RF_Random_CV_FD = RandomizedSearchCV(base_RF_FD, param_distributions=param_dist, n_iter=36, scoring=rmse_scorer,\
#                                      cv=3, n_jobs=6, verbose=2, random_state=42)
# RF_Random_CV_FD.fit(X_FD_train_scaled, y_FD_train)
# best_params = RF_Random_CV_FD.best_params_
# print(f"Tuned Random Forest Best Estimator: {RF_Random_CV_FD.best_estimator_}")
# print(f"Tuned Random Forest Best Score: {RF_Random_CV_FD.best_score_}")
# print(f"Tuned Random Forest Best Params: {best_params}")

In [80]:
# best_RF_FD = RandomForestRegressor(max_depth=9, n_estimators=400, min_samples_split = 10, min_samples_leaf = 4, random_state=42)
# best_RF_FD.fit(X_FD_train_scaled, y_FD_train)  # Correct the variable name for consistency
# y_pred = best_RF_FD.predict(X_FD_test_scaled)  # Correct the variable name for consistency

# best_rmse = mean_squared_error(y_FD_test, y_pred, squared=False)
# print(f"Best_RF Train R-squared: {best_RF_FD.score(X_FD_train_scaled, y_FD_train)}")
# print(f"Best_RF Test R-squared: {best_RF_FD.score(X_FD_test_scaled, y_FD_test)}")
# print(f"Best_RF MAE: {mean_absolute_error(y_FD_test, y_pred)}")
# print(f"Best_RF RMSE: {best_rmse}")
# print(f"Best_RF MSE: {mean_squared_error(y_FD_test, y_pred)}")

In [81]:
# error_dict_FD['Random_Forest'] = {'best_params': best_params, 'best_rmse': best_rmse}

## DraftKings
Since searching the parameter grid took a while for FanDuel, even with a randomized search, we're going to reel the grid in a little bit for DraftKings.

In [82]:
# Untuned DraftKings random forest (library defaults) as a baseline; the seed is fixed for reproducibility.
base_RF_DK = RandomForestRegressor(random_state = 42)

In [83]:
# Fit the baseline forest. np.ravel flattens the target to 1-D so scikit-learn
# does not emit its "column-vector y was passed" DataConversionWarning.
base_RF_DK.fit(X_DK_train_scaled, np.ravel(y_DK_train))

  return fit_method(estimator, *args, **kwargs)


In [84]:
# Baseline predictions on the held-out DraftKings test rows.
base_RF_DK_pred = base_RF_DK.predict(X_DK_test_scaled)

In [85]:
# Baseline DraftKings random forest: train vs. test R-squared plus test error metrics.
# RMSE is taken as sqrt(MSE) because mean_squared_error's `squared` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print(f"R_squared_train: {base_RF_DK.score(X_DK_train_scaled, y_DK_train)}")
print(f"R_squared_test: {base_RF_DK.score(X_DK_test_scaled, y_DK_test)}")
print(f"MAE: {mean_absolute_error(y_DK_test, base_RF_DK_pred)}")
print(f"MSE: {mean_squared_error(y_DK_test, base_RF_DK_pred)}")
print(f"RMSE: {np.sqrt(mean_squared_error(y_DK_test, base_RF_DK_pred))}")

R_squared_train: 0.8983720584142745
R_squared_test: 0.27525438371728617
MAE: 5.129305972836688
MSE: 46.85643231949449
RMSE: 6.84517584284688


In [86]:
# Randomized hyperparameter search for the DraftKings random forest.
param_dist = {
    'max_depth': [6, 9, 12],
    'n_estimators': [250, 400],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
}

# Flatten the target to 1-D. np.ravel accepts a DataFrame, Series or ndarray,
# so re-running this cell no longer fails with "ndarray has no attribute 'values'".
y_DK_train = np.ravel(y_DK_train)

RF_Random_CV_DK = RandomizedSearchCV(
    base_RF_DK,
    param_distributions=param_dist,
    n_iter=18,
    scoring=rmse_scorer,
    cv=3,
    n_jobs=6,
    verbose=2,
    random_state=42,
)
RF_Random_CV_DK.fit(X_DK_train_scaled, y_DK_train)
best_params = RF_Random_CV_DK.best_params_
print(f"Tuned Random Forest Best Estimator: {RF_Random_CV_DK.best_estimator_}")
# The scorer is negated RMSE, so the best score prints as a negative number.
print(f"Tuned Random Forest Best Score: {RF_Random_CV_DK.best_score_}")
print(f"Tuned Random Forest Best Params: {best_params}")

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Tuned Random Forest Best Estimator: RandomForestRegressor(max_depth=9, min_samples_leaf=4, min_samples_split=10,
                      n_estimators=250, random_state=42)
Tuned Random Forest Best Score: -6.804304122194972
Tuned Random Forest Best Params: {'n_estimators': 250, 'min_samples_split': 10, 'min_samples_leaf': 4, 'max_depth': 9}


In [87]:
# Refit a random forest with the hyperparameters reported by the randomized
# search and evaluate it on the held-out DraftKings test rows.
best_RF_DK = RandomForestRegressor(max_depth=9, n_estimators=250, min_samples_split = 10, min_samples_leaf = 4, random_state=42)
best_RF_DK.fit(X_DK_train_scaled, y_DK_train)
y_pred = best_RF_DK.predict(X_DK_test_scaled)

# sqrt(MSE) replaces the `squared=False` flag, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
best_rmse = np.sqrt(mean_squared_error(y_DK_test, y_pred))
print(f"Best_RF Train R-squared: {best_RF_DK.score(X_DK_train_scaled, y_DK_train)}")
print(f"Best_RF Test R-squared: {best_RF_DK.score(X_DK_test_scaled, y_DK_test)}")
print(f"Best_RF MAE: {mean_absolute_error(y_DK_test, y_pred)}")
print(f"Best_RF RMSE: {best_rmse}")
print(f"Best_RF MSE: {mean_squared_error(y_DK_test, y_pred)}")

Best_RF Train R-squared: 0.42427279444036625
Best_RF Test R-squared: 0.30368142880460014
Best_RF MAE: 4.974051542505229
Best_RF RMSE: 6.70958716985463
Best_RF MSE: 45.018559989877865


In [88]:
# error_dict_DK['Random_Forest'] = {'best_params': best_params, 'best_rmse': best_rmse}

In [89]:
# error_dict_DK

In [90]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient Boost

## DraftKings
For this one, we'll do DraftKings first, then if we have to compromise the grid, this time it will be FanDuel that gets the reduced grid.

In [91]:
# Untuned DraftKings gradient-boosting model (library defaults) as a baseline; the seed is fixed for reproducibility.
base_GB_DK = GradientBoostingRegressor(random_state = 42)

In [92]:
# Fit the baseline on the scaled DraftKings training rows.
# y_DK_train was already flattened to 1-D in the random-forest search cell.
base_GB_DK.fit(X_DK_train_scaled, y_DK_train)

In [93]:
# Baseline predictions on the held-out DraftKings test rows (y_pred is reused by later cells).
y_pred = base_GB_DK.predict(X_DK_test_scaled)

In [94]:
# Baseline DraftKings gradient boosting: train vs. test R-squared plus test error metrics.
# RMSE is taken as sqrt(MSE) because mean_squared_error's `squared` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print(f"Base GB R_squared_train: {base_GB_DK.score(X_DK_train_scaled, y_DK_train)}")
print(f"Base GB R_squared_test: {base_GB_DK.score(X_DK_test_scaled, y_DK_test)}")
print(f"Base GB MAE: {mean_absolute_error(y_DK_test, y_pred)}")
print(f"Base GB MSE: {mean_squared_error(y_DK_test, y_pred)}")
print(f"Base GB RMSE: {np.sqrt(mean_squared_error(y_DK_test, y_pred))}")

Base GB R_squared_train: 0.3277394606904285
Base GB R_squared_test: 0.3066739883846069
Base GB MAE: 4.960263375038516
Base GB MSE: 44.825084289891095
Base GB RMSE: 6.6951537913546915


In [95]:
# Candidate values sampled by the DraftKings gradient-boosting search.
param_dist = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],  # wide range of learning rates
    max_depth=[4, 6, 8],
)

In [96]:
# Randomized hyperparameter search for the DraftKings gradient-boosting model.
# random_state pins which 18 candidates are sampled so the search is
# reproducible across runs (matching the random-forest search).
GB_CV_DK = RandomizedSearchCV(
    base_GB_DK,
    param_dist,
    scoring=rmse_scorer,
    cv=3,
    n_iter=18,
    n_jobs=6,
    verbose=3,
    random_state=42,
)
GB_CV_DK.fit(X_DK_train_scaled, y_DK_train)
best_params = GB_CV_DK.best_params_
print(f"Tuned Gradient Boost Best Estimator: {GB_CV_DK.best_estimator_}")
# The scorer is negated RMSE, so the best score prints as a negative number.
print(f"Tuned Gradient Boost Best Score: {GB_CV_DK.best_score_}")
print(f"Tuned Gradient Boost Best Params: {best_params}")

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Tuned Gradient Boost Best Estimator: GradientBoostingRegressor(learning_rate=0.01, max_depth=4, n_estimators=500,
                          random_state=42)
Tuned Gradient Boost Best Score: -6.79868533901557
Tuned Gradient Boost Best Params: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.01}


In [97]:
# Refit gradient boosting with the hyperparameters reported by the randomized
# search and evaluate it on the held-out DraftKings test rows.
best_GB_DK = GradientBoostingRegressor(max_depth=4, n_estimators=500, learning_rate = 0.01, random_state=42)
best_GB_DK.fit(X_DK_train_scaled, y_DK_train)
y_pred = best_GB_DK.predict(X_DK_test_scaled)

# sqrt(MSE) replaces the `squared=False` flag, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
best_rmse = np.sqrt(mean_squared_error(y_DK_test, y_pred))
print(f"Best_GB Train R-squared: {best_GB_DK.score(X_DK_train_scaled, y_DK_train)}")
print(f"Best_GB Test R-squared: {best_GB_DK.score(X_DK_test_scaled, y_DK_test)}")
print(f"Best_GB MAE: {mean_absolute_error(y_DK_test, y_pred)}")
print(f"Best_GB RMSE: {best_rmse}")
print(f"Best_GB MSE: {mean_squared_error(y_DK_test, y_pred)}")

Best_GB Train R-squared: 0.3337526325017415
Best_GB Test R-squared: 0.30789600677571527
Best_GB MAE: 4.969178948086518
Best_GB RMSE: 6.6892509338114206
Best_GB MSE: 44.746078055496966


In [390]:
# error_dict_DK['Gradient_Boost'] = {'best_params': best_params, 'best_rmse': best_rmse}

In [391]:
# error_dict_DK

defaultdict(float,
            {'KNN': {'best_params': {'n_neighbors': 35},
              'best_rmse': 7.704548048681453},
             'Linear_Regression': {'best_params': {'alpha': 24.565306122448984},
              'best_rmse': 7.470141938786919},
             'Random_Forest': {'best_params': {'n_estimators': 500,
               'min_samples_split': 2,
               'min_samples_leaf': 4,
               'max_depth': 6},
              'best_rmse': 7.405278678453883},
             'Gradient_Boost': {'best_params': {'n_estimators': 100,
               'max_depth': 2,
               'learning_rate': 0.1},
              'best_rmse': 7.44117424990967}})

## FanDuel

In [98]:
# Untuned FanDuel gradient-boosting model (library defaults) as a baseline; the seed is fixed for reproducibility.
base_GB_FD = GradientBoostingRegressor(random_state = 42)

In [99]:
# Fit the baseline. np.ravel flattens the target to 1-D so scikit-learn does
# not emit its "column-vector y was passed" DataConversionWarning.
base_GB_FD.fit(X_FD_train_scaled, np.ravel(y_FD_train))

  y = column_or_1d(y, warn=True)


In [100]:
# Baseline predictions on the held-out FanDuel test rows (overwrites the shared y_pred name).
y_pred = base_GB_FD.predict(X_FD_test_scaled)

In [101]:
# Baseline FanDuel gradient boosting: train vs. test R-squared plus test error metrics.
# RMSE is taken as sqrt(MSE) because mean_squared_error's `squared` flag was
# deprecated in scikit-learn 1.4 and removed in 1.6.
print(f"Base GB R_squared_train: {base_GB_FD.score(X_FD_train_scaled, y_FD_train)}")
print(f"Base GB R_squared_test: {base_GB_FD.score(X_FD_test_scaled, y_FD_test)}")
print(f"Base GB MAE: {mean_absolute_error(y_FD_test, y_pred)}")
print(f"Base GB MSE: {mean_squared_error(y_FD_test, y_pred)}")
print(f"Base GB RMSE: {np.sqrt(mean_squared_error(y_FD_test, y_pred))}")

Base GB R_squared_train: 0.3173687856165539
Base GB R_squared_test: 0.2947525202531307
Base GB MAE: 4.231682018699661
Base GB MSE: 32.25119990555209
Base GB RMSE: 5.679013990610702


In [102]:
# Candidate values sampled by the FanDuel gradient-boosting search
# (same values as the DraftKings gradient-boosting search).
param_dist = dict(
    n_estimators=[100, 300, 500],
    learning_rate=[0.001, 0.01, 0.05, 0.1, 0.2, 0.3],  # wide range of learning rates
    max_depth=[4, 6, 8],
)

In [103]:
# Randomized hyperparameter search for the FanDuel gradient-boosting model.
# random_state pins which 18 candidates are sampled so the search is
# reproducible across runs.
GB_CV_FD = RandomizedSearchCV(
    base_GB_FD,
    param_dist,
    scoring=rmse_scorer,
    cv=3,
    n_iter=18,
    n_jobs=6,
    verbose=3,
    random_state=42,
)
# np.ravel flattens the target to 1-D, avoiding the repeated
# "column-vector y was passed" DataConversionWarning during the fits.
GB_CV_FD.fit(X_FD_train_scaled, np.ravel(y_FD_train))
best_params = GB_CV_FD.best_params_
print(f"Tuned Gradient Boost Best Estimator: {GB_CV_FD.best_estimator_}")
# The scorer is negated RMSE, so the best score prints as a negative number.
print(f"Tuned Gradient Boost Best Score: {GB_CV_FD.best_score_}")
print(f"Tuned Gradient Boost Best Params: {best_params}")

Fitting 3 folds for each of 18 candidates, totalling 54 fits


  y = column_or_1d(y, warn=True)


Tuned Gradient Boost Best Estimator: GradientBoostingRegressor(learning_rate=0.01, max_depth=4, n_estimators=500,
                          random_state=42)
Tuned Gradient Boost Best Score: -5.760182092404126
Tuned Gradient Boost Best Params: {'n_estimators': 500, 'max_depth': 4, 'learning_rate': 0.01}


In [104]:
# Refit gradient boosting with the hyperparameters reported by the randomized
# search and evaluate it on the held-out FanDuel test rows.
best_GB_FD = GradientBoostingRegressor(max_depth=4, n_estimators=500, learning_rate = 0.01, random_state=42)
# np.ravel flattens the target to 1-D, avoiding the DataConversionWarning.
best_GB_FD.fit(X_FD_train_scaled, np.ravel(y_FD_train))
y_pred = best_GB_FD.predict(X_FD_test_scaled)

# sqrt(MSE) replaces the `squared=False` flag, which was deprecated in
# scikit-learn 1.4 and removed in 1.6.
best_rmse = np.sqrt(mean_squared_error(y_FD_test, y_pred))
print(f"Best_GB Train R-squared: {best_GB_FD.score(X_FD_train_scaled, y_FD_train)}")
print(f"Best_GB Test R-squared: {best_GB_FD.score(X_FD_test_scaled, y_FD_test)}")
print(f"Best_GB MAE: {mean_absolute_error(y_FD_test, y_pred)}")
print(f"Best_GB RMSE: {best_rmse}")
print(f"Best_GB MSE: {mean_squared_error(y_FD_test, y_pred)}")

  y = column_or_1d(y, warn=True)


Best_GB Train R-squared: 0.32304697289018935
Best_GB Test R-squared: 0.2954862267672601
Best_GB MAE: 4.236497715331216
Best_GB RMSE: 5.676059131529178
Best_GB MSE: 32.21764726461577


In [400]:
# error_dict_FD['Gradient_Boost'] = {'best_params': best_params, 'best_rmse': best_rmse}

In [401]:
# error_dict_FD

defaultdict(float,
            {'KNN': {'best_params': {'n_neighbors': 35},
              'best_rmse': 7.077679118367947},
             'Linear_Regression': {'best_params': {'alpha': 36.79795918367348},
              'best_rmse': 6.851643471593359},
             'Random_Forest': {'best_params': {'n_estimators': 300,
               'min_samples_split': 5,
               'min_samples_leaf': 4,
               'max_depth': 6},
              'best_rmse': 6.794550117882793},
             'Gradient_Boost': {'best_params': {'n_estimators': 100,
               'max_depth': 2,
               'learning_rate': 0.05},
              'best_rmse': 6.837069043873724}})

# XGBoost

## FanDuel

In [105]:
import xgboost as xgb

In [106]:
# Untuned XGBoost regressor for the FanDuel target; also the estimator handed
# to the randomized search further down.
base_XGB_FD = xgb.XGBRegressor(
    random_state=42,
    objective='reg:squarederror',
)

In [107]:
# Fit the untuned baseline model (not the tuned one) on the FanDuel training split
base_XGB_FD.fit(X_FD_train_scaled, y_FD_train)

In [108]:
# Predict the FanDuel test split with the untuned baseline.
# `y_pred` is a shared name that each later model section overwrites.
y_pred = base_XGB_FD.predict(X_FD_test_scaled)

In [109]:
# Baseline XGBoost metrics on the FanDuel split.
# RMSE is derived as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` is deprecated in scikit-learn 1.4 and removed in 1.6.
base_mse = mean_squared_error(y_FD_test, y_pred)
print(f"Base XGB R_squared_train: {base_XGB_FD.score(X_FD_train_scaled, y_FD_train)}")
print(f"Base XGB R_squared_test: {base_XGB_FD.score(X_FD_test_scaled, y_FD_test)}")
print(f"Base XGB MAE: {mean_absolute_error(y_FD_test, y_pred)}")
print(f"Base XGB MSE: {base_mse}")
print(f"Base XGB RMSE: {np.sqrt(base_mse)}")

Base XGB R_squared_train: 0.6083049984671678
Base XGB R_squared_test: 0.23894014367911398
Base XGB MAE: 4.3696285401426564
Base XGB MSE: 34.80351829843545
Base XGB RMSE: 5.899450677684784


In [110]:
# Candidate values for the XGBoost hyper-parameter search on the FanDuel target.
param_dist = dict(
    n_estimators=[300, 500, 700],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=[2, 4, 6],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.3],
    reg_alpha=[0, 0.01, 0.1],
    reg_lambda=[1, 1.5, 2],
)

In [111]:
# Randomized (not exhaustive grid) hyper-parameter search for XGBoost on the
# FanDuel target: 100 sampled candidates, 3-fold CV, scored with `rmse_scorer`.
# NOTE(review): the recorded best score is negative, so `rmse_scorer` presumably
# negates RMSE (greater_is_better=False) — confirm where it is defined.
XGB_CV_FD = RandomizedSearchCV(
    base_XGB_FD,
    param_dist,
    scoring=rmse_scorer,
    n_iter=100,
    cv=3,
    n_jobs=6,
    verbose=3,
    # Seeded so the sampled candidates, and therefore `best_params_`, are
    # repeatable across re-runs; the notebook uses 42 everywhere else.
    random_state=42,
)
XGB_CV_FD.fit(X_FD_train_scaled, y_FD_train)
best_params = XGB_CV_FD.best_params_
print(f"Tuned XGBoost Best Estimator: {XGB_CV_FD.best_estimator_}")
print(f"Tuned XGBoost Best Score: {XGB_CV_FD.best_score_}")
print(f"Tuned XGBoost Best Params: {best_params}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Tuned XGBoost Best Estimator: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=1.0, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=4, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=500, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)
Tuned XGBoost Best Score: -5.754595860304164
Tuned XGBoost Best Params: {'subsample': 0.8, 'reg_lambda': 1.5, 'reg_alpha': 0.1, 'n_estimators': 500, 'max_depth': 4, 'lear

In [112]:
# Tuned FanDuel XGBoost model, rebuilt from hard-coded copies of the
# hyper-parameters printed by the randomized search above.
best_XGB_FD = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=1.0,
    gamma=0,
    reg_alpha=0.1,
    reg_lambda=1.5,
)

In [113]:
# Train the tuned FanDuel model, then predict the test split in one chain
# (`fit` returns the fitted estimator itself).
y_pred = best_XGB_FD.fit(X_FD_train_scaled, y_FD_train).predict(X_FD_test_scaled)

In [114]:
# Evaluation metrics for the tuned FanDuel XGBoost model.
# RMSE is derived as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` is deprecated in scikit-learn 1.4 and removed in 1.6.
best_mse = mean_squared_error(y_FD_test, y_pred)
best_rmse = np.sqrt(best_mse)
print(f"Best_XGB Train R-squared: {best_XGB_FD.score(X_FD_train_scaled, y_FD_train)}")
print(f"Best_XGB Test R-squared: {best_XGB_FD.score(X_FD_test_scaled, y_FD_test)}")
print(f"Best_XGB MAE: {mean_absolute_error(y_FD_test, y_pred)}")
print(f"Best_XGB RMSE: {best_rmse}")
print(f"Best_XGB MSE: {best_mse}")

Best_XGB Train R-squared: 0.3213694002326647
Best_XGB Test R-squared: 0.29848365212156247
Best_XGB MAE: 4.22626695069495
Best_XGB RMSE: 5.663971576052591
Best_XGB MSE: 32.08057401433167


In [412]:
# error_dict_FD['XGBoost'] = {'best_params': best_params, 'best_rmse': best_rmse}

In [413]:
# error_dict_FD

defaultdict(float,
            {'KNN': {'best_params': {'n_neighbors': 35},
              'best_rmse': 7.077679118367947},
             'Linear_Regression': {'best_params': {'alpha': 36.79795918367348},
              'best_rmse': 6.851643471593359},
             'Random_Forest': {'best_params': {'n_estimators': 300,
               'min_samples_split': 5,
               'min_samples_leaf': 4,
               'max_depth': 6},
              'best_rmse': 6.794550117882793},
             'Gradient_Boost': {'best_params': {'n_estimators': 100,
               'max_depth': 2,
               'learning_rate': 0.05},
              'best_rmse': 6.837069043873724},
             'XGBoost': {'best_params': {'subsample': 0.6,
               'reg_lambda': 1.5,
               'reg_alpha': 0.01,
               'n_estimators': 100,
               'max_depth': 2,
               'learning_rate': 0.05,
               'gamma': 0.3,
               'colsample_bytree': 1.0},
              'best_rmse': 6.825956219

In [414]:
# errors_FD = pd.DataFrame(error_dict_FD)

In [415]:
# errors_FD

Unnamed: 0,KNN,Linear_Regression,Random_Forest,Gradient_Boost,XGBoost
best_params,{'n_neighbors': 35},{'alpha': 36.79795918367348},"{'n_estimators': 300, 'min_samples_split': 5, ...","{'n_estimators': 100, 'max_depth': 2, 'learnin...","{'subsample': 0.6, 'reg_lambda': 1.5, 'reg_alp..."
best_rmse,7.077679,6.851643,6.79455,6.837069,6.825956


In [416]:
# errors_FD.to_csv('errors_FLEX_FD.csv', index = False)

In [425]:
# errors_FD.loc['best_params', 'XGBoost']

{'subsample': 0.6,
 'reg_lambda': 1.5,
 'reg_alpha': 0.01,
 'n_estimators': 100,
 'max_depth': 2,
 'learning_rate': 0.05,
 'gamma': 0.3,
 'colsample_bytree': 1.0}

## DraftKings

In [115]:
# Untuned XGBoost regressor for the DraftKings target; also the estimator handed
# to the randomized search further down.
base_XGB_DK = xgb.XGBRegressor(
    random_state=42,
    objective='reg:squarederror',
)

In [116]:
# Fit the untuned baseline model (not the tuned one) on the DraftKings training split
base_XGB_DK.fit(X_DK_train_scaled, y_DK_train)

In [117]:
# Predict the DraftKings test split with the untuned baseline.
# This overwrites the shared `y_pred` left over from the FanDuel section.
y_pred = base_XGB_DK.predict(X_DK_test_scaled)

In [119]:
# Baseline XGBoost metrics on the DraftKings split.
# RMSE is derived as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` is deprecated in scikit-learn 1.4 and removed in 1.6.
base_mse = mean_squared_error(y_DK_test, y_pred)
print(f"Base XGB R_squared_train: {base_XGB_DK.score(X_DK_train_scaled, y_DK_train)}")
print(f"Base XGB R_squared_test: {base_XGB_DK.score(X_DK_test_scaled, y_DK_test)}")
print(f"Base XGB MAE: {mean_absolute_error(y_DK_test, y_pred)}")
print(f"Base XGB MSE: {base_mse}")
print(f"Base XGB RMSE: {np.sqrt(base_mse)}")

Base XGB R_squared_train: 0.6100629336667105
Base XGB R_squared_test: 0.24806973153403866
Base XGB MAE: 5.146628449958908
Base XGB MSE: 48.61398115668049
Base XGB RMSE: 6.972372706380554


In [118]:
# Candidate values for the XGBoost hyper-parameter search on the DraftKings target.
# Rebinds `param_dist` from the FanDuel section; only `max_depth` differs
# ([4, 6, 8] here versus [2, 4, 6] there).
param_dist = dict(
    n_estimators=[300, 500, 700],
    learning_rate=[0.01, 0.05, 0.1, 0.2, 0.3],
    max_depth=[4, 6, 8],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.3],
    reg_alpha=[0, 0.01, 0.1],
    reg_lambda=[1, 1.5, 2],
)

In [120]:
# Randomized (not exhaustive grid) hyper-parameter search for XGBoost on the
# DraftKings target: 100 sampled candidates, 3-fold CV, scored with `rmse_scorer`.
# NOTE(review): the recorded best score is negative, so `rmse_scorer` presumably
# negates RMSE (greater_is_better=False) — confirm where it is defined.
XGB_CV_DK = RandomizedSearchCV(
    base_XGB_DK,
    param_dist,
    scoring=rmse_scorer,
    n_iter=100,
    cv=3,
    n_jobs=6,
    verbose=3,
    # Seeded so the sampled candidates, and therefore `best_params_`, are
    # repeatable across re-runs; the notebook uses 42 everywhere else.
    random_state=42,
)
XGB_CV_DK.fit(X_DK_train_scaled, y_DK_train)
best_params = XGB_CV_DK.best_params_
print(f"Tuned XGBoost Best Estimator: {XGB_CV_DK.best_estimator_}")
print(f"Tuned XGBoost Best Score: {XGB_CV_DK.best_score_}")
print(f"Tuned XGBoost Best Params: {best_params}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Tuned XGBoost Best Estimator: XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=0.6, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.1, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.01, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=4, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=500, n_jobs=None,
             num_parallel_tree=None, random_state=42, ...)
Tuned XGBoost Best Score: -6.787525458885561
Tuned XGBoost Best Params: {'subsample': 0.8, 'reg_lambda': 1.5, 'reg_alpha': 0, 'n_estimators': 500, 'max_depth': 4, 'lear

In [122]:
# Tuned DraftKings XGBoost model, rebuilt from hard-coded copies of the
# hyper-parameters printed by the randomized search above.
best_XGB_DK = xgb.XGBRegressor(
    objective='reg:squarederror',
    random_state=42,
    n_estimators=500,
    learning_rate=0.01,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.6,
    gamma=0.1,
    reg_alpha=0,
    reg_lambda=1.5,
)

In [123]:
# Train the tuned DraftKings model, then predict the test split in one chain
# (`fit` returns the fitted estimator itself).
y_pred = best_XGB_DK.fit(X_DK_train_scaled, y_DK_train).predict(X_DK_test_scaled)

In [124]:
# Evaluation metrics for the tuned DraftKings XGBoost model.
# RMSE is derived as sqrt(MSE): the `squared=False` argument of
# `mean_squared_error` is deprecated in scikit-learn 1.4 and removed in 1.6.
best_mse = mean_squared_error(y_DK_test, y_pred)
best_rmse = np.sqrt(best_mse)
print(f"Best_XGB Train R-squared: {best_XGB_DK.score(X_DK_train_scaled, y_DK_train)}")
print(f"Best_XGB Test R-squared: {best_XGB_DK.score(X_DK_test_scaled, y_DK_test)}")
print(f"Best_XGB MAE: {mean_absolute_error(y_DK_test, y_pred)}")
print(f"Best_XGB RMSE: {best_rmse}")
print(f"Best_XGB MSE: {best_mse}")

Best_XGB Train R-squared: 0.3300371592098784
Best_XGB Test R-squared: 0.311512329670687
Best_XGB MAE: 4.9545485543366325
Best_XGB RMSE: 6.671751992980524
Best_XGB MSE: 44.51227465583959


In [429]:
# Record the tuned DraftKings XGBoost search result (`best_params`) and test RMSE (`best_rmse`).
# NOTE(review): the `error_dict_DK = defaultdict(float)` initialisation near the top of the
# notebook is commented out, and the matching FanDuel bookkeeping cells are commented out too.
# On a fresh kernel this line presumably raises NameError — either re-enable the
# initialisation or comment this cell out to match the FanDuel section.
error_dict_DK['XGBoost'] = {'best_params': best_params, 'best_rmse': best_rmse}

In [430]:
# Display the accumulated DraftKings results per model.
# NOTE(review): the saved output under this cell has a much higher execution count than the
# modelling cells above and different numbers — it looks stale; re-run or clear it.
error_dict_DK

defaultdict(float,
            {'KNN': {'best_params': {'n_neighbors': 35},
              'best_rmse': 7.704548048681453},
             'Linear_Regression': {'best_params': {'alpha': 24.565306122448984},
              'best_rmse': 7.470141938786919},
             'Random_Forest': {'best_params': {'n_estimators': 500,
               'min_samples_split': 2,
               'min_samples_leaf': 4,
               'max_depth': 6},
              'best_rmse': 7.405278678453883},
             'Gradient_Boost': {'best_params': {'n_estimators': 100,
               'max_depth': 2,
               'learning_rate': 0.1},
              'best_rmse': 7.44117424990967},
             'XGBoost': {'best_params': {'subsample': 0.8,
               'reg_lambda': 1,
               'reg_alpha': 0.01,
               'n_estimators': 100,
               'max_depth': 2,
               'learning_rate': 0.05,
               'gamma': 0.1,
               'colsample_bytree': 1.0},
              'best_rmse': 7.449694639196

In [431]:
# Tabulate the DraftKings results: one column per model, with rows taken from the
# inner dict keys ('best_params', 'best_rmse').
errors_DK = pd.DataFrame(data=error_dict_DK)

In [432]:
# Display the DraftKings model-comparison table.
errors_DK

Unnamed: 0,KNN,Linear_Regression,Random_Forest,Gradient_Boost,XGBoost
best_params,{'n_neighbors': 35},{'alpha': 24.565306122448984},"{'n_estimators': 500, 'min_samples_split': 2, ...","{'n_estimators': 100, 'max_depth': 2, 'learnin...","{'subsample': 0.8, 'reg_lambda': 1, 'reg_alpha..."
best_rmse,7.704548,7.470142,7.405279,7.441174,7.449695


In [433]:
# Persist the DraftKings model-comparison table next to the notebook.
# NOTE(review): index=False drops the 'best_params' / 'best_rmse' row labels from the
# file — confirm that whatever reads this CSV relies on row order rather than labels.
errors_DK.to_csv(path_or_buf='errors_FLEX_DK.csv', index=False)