In [1]:
#Imports
import pandas as pd
import datetime
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split 
from sklearn import linear_model, preprocessing 

In [2]:
#These first few cells will be the same as the EDA notebook

In [3]:
main_df = pd.read_csv('Pitching_Data_Through_2022_08_07.csv')

In [4]:
main_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Team,W,ER,SO,H,BB,CG,HBP,...,FB%_Opp,HR/FB_Opp,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind
0,0,Tyler Anderson,LAD,1,0,3,2,1,0,0,...,35.9,8.8,19.9,50.4,29.6,101,125,105,92.0,4.0
1,1,Zach Davies,ARI,0,0,3,4,2,0,0,...,33.4,10.7,18.3,52.2,29.5,101,81,96,72.0,3.0
2,2,Kevin Gausman,TOR,0,0,5,6,0,0,0,...,37.4,13.4,13.8,52.0,34.2,97,94,100,72.0,3.0
3,3,Triston McKenzie,CLE,1,0,8,2,1,0,0,...,38.9,13.5,16.2,53.8,30.1,100,98,102,86.0,3.0
4,4,Jesus Luzardo,MIA,1,0,6,1,1,0,0,...,33.0,14.8,18.2,50.0,31.8,99,99,105,78.0,4.0


In [5]:
main_df.drop(columns = ['Unnamed: 0'], inplace = True)

In [6]:
main_df.shape

(1194, 90)

In [7]:
#Dropping rows that contribute to fantasy points
#Also dropping K% and BB% since we're more interested in a pitcher's K/9 and BB/9
#Also dropping CG_Season and ShO_Season since they're so rare
#Dropping OBP and SLG since we have OPS (on-base plus slugging)
#Dropping xFIP since we have FIP and wRC+ since we have wRC
#Dropping GB% and FB% since and GB_Opp% and FB_Opp% since we have GB/FB for both
#Dropping Outs_Season since we have Avg_Outs
main_df = main_df.drop(columns = ['W', 'ER', 'SO', 'H', 'BB', 'CG', 'HBP', 'ShO', 'Outs', 'QS', 'K%', 'BB%',\
                                 'CG_Season', 'ShO_Season', 'OBP', 'SLG', 'xFIP', 'wRC+', 'GB%', 'FB%',\
                                 'GB%_Opp', 'FB%_Opp', 'CGS', 'NH', 'Outs_Season', 'Proj_Runs', 'Park', 'Hand', 'Opp_Team',\
                                 'Team_Season', 'Position'])

In [8]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 59 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            1194 non-null   object 
 1   Team            1194 non-null   object 
 2   Date            1194 non-null   object 
 3   GS              1194 non-null   float64
 4   Avg_Outs        1194 non-null   float64
 5   W_Season        1194 non-null   float64
 6   ERA             1194 non-null   float64
 7   K/9             1194 non-null   float64
 8   BB/9            1194 non-null   float64
 9   K/BB            1194 non-null   float64
 10  HR/9            1194 non-null   float64
 11  AVG             1194 non-null   float64
 12  BABIP           1194 non-null   float64
 13  FIP             1194 non-null   float64
 14  SIERA           1194 non-null   float64
 15  WHIP            1194 non-null   float64
 16  GB/FB           1194 non-null   float64
 17  LD%             1194 non-null   f

In [9]:
main_df_num = main_df.select_dtypes(include = ['int64', 'float64'])

In [10]:
main_df_corr = main_df_num.corr()

In [11]:
main_df_corr

Unnamed: 0,GS,Avg_Outs,W_Season,ERA,K/9,BB/9,K/BB,HR/9,AVG,BABIP,...,LD%_Opp,HR/FB_Opp,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind
GS,1.0,0.612585,0.716807,-0.222785,0.082452,-0.154689,0.141815,-0.20211,-0.176407,-0.083836,...,0.011113,0.007079,-0.019751,0.021313,-0.005395,-0.021005,-0.010725,0.018261,0.025442,0.010607
Avg_Outs,0.612585,1.0,0.582005,-0.391464,0.06045,-0.311532,0.300621,-0.234053,-0.303442,-0.222724,...,0.014143,0.023228,-0.018813,0.023027,-0.005956,0.013489,0.050774,0.032654,0.055346,-0.004621
W_Season,0.716807,0.582005,1.0,-0.423023,0.197056,-0.302728,0.351296,-0.302528,-0.360772,-0.188225,...,0.026564,-0.005161,0.022192,0.003851,-0.017926,-0.03592,0.021304,0.107933,-0.000965,0.047409
ERA,-0.222785,-0.391464,-0.423023,1.0,-0.163047,0.36706,-0.346828,0.63228,0.752855,0.531139,...,0.009036,-0.021734,-0.015422,0.038625,-0.020989,0.054796,-0.030562,-0.103201,0.005245,-0.019806
K/9,0.082452,0.06045,0.197056,-0.163047,1.0,0.040437,0.513107,-0.010659,-0.35294,0.11858,...,0.032903,0.009352,0.050097,-0.017321,-0.019565,0.069992,0.135581,0.168725,-0.025198,0.055241
BB/9,-0.154689,-0.311532,-0.302728,0.36706,0.040437,1.0,-0.690995,0.049966,0.138826,0.163656,...,0.011743,0.037851,-0.029699,0.027619,-0.002588,0.0486,0.010919,-0.06267,0.008414,-0.055206
K/BB,0.141815,0.300621,0.351296,-0.346828,0.513107,-0.690995,1.0,-0.054138,-0.280536,-0.038354,...,0.006248,-0.038204,0.072061,-0.033798,-0.020344,-0.018748,0.066127,0.150583,-0.018713,0.05282
HR/9,-0.20211,-0.234053,-0.302528,0.63228,-0.010659,0.049966,-0.054138,1.0,0.377519,0.036288,...,-0.00081,-0.038797,-0.019784,0.055322,-0.031595,0.020114,-0.001135,-0.063773,0.016985,0.002358
AVG,-0.176407,-0.303442,-0.360772,0.752855,-0.35294,0.138826,-0.280536,0.377519,1.0,0.818218,...,-0.010107,0.00015,-0.010453,-0.006542,0.011446,0.044101,-0.037793,-0.13634,0.016293,-0.023211
BABIP,-0.083836,-0.222724,-0.188225,0.531139,0.11858,0.163656,-0.038354,0.036288,0.818218,1.0,...,0.004481,0.018027,0.017904,-0.037065,0.017074,0.079386,0.029058,-0.043563,0.001444,0.000576


In [12]:
FD_corr = main_df_corr['FD_Pts'].sort_values(ascending = False)

In [13]:
FD_corr

FD_Pts            1.000000
DK_Pts            0.990194
Avg_Outs          0.310977
W_Season          0.271873
Proj_Run_Diff     0.265782
GS                0.263977
SwStr%            0.238162
Events            0.221390
K/9               0.219623
K/BB              0.187983
HardHit           0.170964
Barrels           0.151638
GB/FB_Opp         0.105314
K%_Opp            0.104575
Pk_Fct_SO         0.096819
maxEV             0.079142
RS/9              0.067998
Soft%             0.064819
Med%              0.046900
Med%_Opp          0.036619
LD%_Opp           0.033855
Soft%_Opp         0.029593
GB/FB             0.016655
Pk_Fct_HR         0.010309
CStr%            -0.007549
LA               -0.010598
BABIP_Opp        -0.011653
Wind             -0.020225
LD%              -0.033284
HR/FB_Opp        -0.042729
BB%_Opp          -0.047056
Pk_Fct_Overall   -0.047811
BABIP            -0.048123
Hard%_Opp        -0.049808
Temp             -0.059901
wRC              -0.073388
Barrel%          -0.079324
H

In [14]:
DK_corr = main_df_corr['DK_Pts'].sort_values(ascending = False)

In [15]:
DK_corr

DK_Pts            1.000000
FD_Pts            0.990194
Avg_Outs          0.262932
Proj_Run_Diff     0.260713
W_Season          0.247340
SwStr%            0.246125
GS                0.228733
K/9               0.226703
K/BB              0.189685
Events            0.186286
HardHit           0.135630
Barrels           0.127026
K%_Opp            0.104590
GB/FB_Opp         0.102034
Pk_Fct_SO         0.098082
maxEV             0.066868
RS/9              0.066591
Soft%             0.061712
Med%              0.043415
Med%_Opp          0.034140
Soft%_Opp         0.031128
LD%_Opp           0.030516
LA                0.005791
Pk_Fct_HR         0.004262
GB/FB            -0.001131
CStr%            -0.005523
BABIP_Opp        -0.012602
Wind             -0.019611
LD%              -0.034801
HR/FB_Opp        -0.044038
Hard%_Opp        -0.048947
BABIP            -0.050063
Pk_Fct_Overall   -0.054810
BB%_Opp          -0.055075
Temp             -0.065397
Barrel%          -0.065435
HR/FB            -0.072994
w

In [16]:
#FD_first_tier = list(FD_corr[FD_corr >= .15].index) + list(FD_corr[FD_corr <= -.15].index)

In [17]:
#FD_first_tier

In [18]:
#DK_first_tier = list(DK_corr[DK_corr >= .15].index) + list(DK_corr[DK_corr <= -.15].index)

In [19]:
#DK_first_tier

In [20]:
#first_tier_features = set(FD_first_tier).union(set(DK_first_tier))

In [21]:
#first_tier_features = list(first_tier_features)

In [22]:
#first_tier_features

In [23]:
df1 = main_df.pop('FD_Pts')
df2 = main_df.pop('DK_Pts')

In [24]:
main_df['FD_Pts'] = df1
main_df['DK_Pts'] = df2

In [25]:
main_df_corr = main_df.corr()

In [26]:
main_df_corr

Unnamed: 0,GS,Avg_Outs,W_Season,ERA,K/9,BB/9,K/BB,HR/9,AVG,BABIP,...,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind,FD_Pts,DK_Pts
GS,1.0,0.612585,0.716807,-0.222785,0.082452,-0.154689,0.141815,-0.20211,-0.176407,-0.083836,...,-0.019751,0.021313,-0.005395,-0.021005,-0.010725,0.018261,0.025442,0.010607,0.263977,0.228733
Avg_Outs,0.612585,1.0,0.582005,-0.391464,0.06045,-0.311532,0.300621,-0.234053,-0.303442,-0.222724,...,-0.018813,0.023027,-0.005956,0.013489,0.050774,0.032654,0.055346,-0.004621,0.310977,0.262932
W_Season,0.716807,0.582005,1.0,-0.423023,0.197056,-0.302728,0.351296,-0.302528,-0.360772,-0.188225,...,0.022192,0.003851,-0.017926,-0.03592,0.021304,0.107933,-0.000965,0.047409,0.271873,0.24734
ERA,-0.222785,-0.391464,-0.423023,1.0,-0.163047,0.36706,-0.346828,0.63228,0.752855,0.531139,...,-0.015422,0.038625,-0.020989,0.054796,-0.030562,-0.103201,0.005245,-0.019806,-0.16446,-0.158013
K/9,0.082452,0.06045,0.197056,-0.163047,1.0,0.040437,0.513107,-0.010659,-0.35294,0.11858,...,0.050097,-0.017321,-0.019565,0.069992,0.135581,0.168725,-0.025198,0.055241,0.219623,0.226703
BB/9,-0.154689,-0.311532,-0.302728,0.36706,0.040437,1.0,-0.690995,0.049966,0.138826,0.163656,...,-0.029699,0.027619,-0.002588,0.0486,0.010919,-0.06267,0.008414,-0.055206,-0.105597,-0.10466
K/BB,0.141815,0.300621,0.351296,-0.346828,0.513107,-0.690995,1.0,-0.054138,-0.280536,-0.038354,...,0.072061,-0.033798,-0.020344,-0.018748,0.066127,0.150583,-0.018713,0.05282,0.187983,0.189685
HR/9,-0.20211,-0.234053,-0.302528,0.63228,-0.010659,0.049966,-0.054138,1.0,0.377519,0.036288,...,-0.019784,0.055322,-0.031595,0.020114,-0.001135,-0.063773,0.016985,0.002358,-0.131109,-0.118066
AVG,-0.176407,-0.303442,-0.360772,0.752855,-0.35294,0.138826,-0.280536,0.377519,1.0,0.818218,...,-0.010453,-0.006542,0.011446,0.044101,-0.037793,-0.13634,0.016293,-0.023211,-0.188413,-0.188274
BABIP,-0.083836,-0.222724,-0.188225,0.531139,0.11858,0.163656,-0.038354,0.036288,0.818218,1.0,...,0.017904,-0.037065,0.017074,0.079386,0.029058,-0.043563,0.001444,0.000576,-0.048123,-0.050063


In [27]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 59 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            1194 non-null   object 
 1   Team            1194 non-null   object 
 2   Date            1194 non-null   object 
 3   GS              1194 non-null   float64
 4   Avg_Outs        1194 non-null   float64
 5   W_Season        1194 non-null   float64
 6   ERA             1194 non-null   float64
 7   K/9             1194 non-null   float64
 8   BB/9            1194 non-null   float64
 9   K/BB            1194 non-null   float64
 10  HR/9            1194 non-null   float64
 11  AVG             1194 non-null   float64
 12  BABIP           1194 non-null   float64
 13  FIP             1194 non-null   float64
 14  SIERA           1194 non-null   float64
 15  WHIP            1194 non-null   float64
 16  GB/FB           1194 non-null   float64
 17  LD%             1194 non-null   f

In [28]:
main_df.head()

Unnamed: 0,Name,Team,Date,GS,Avg_Outs,W_Season,ERA,K/9,BB/9,K/BB,...,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind,FD_Pts,DK_Pts
0,Tyler Anderson,LAD,2022-08-07,18.0,17.888889,12.0,2.89,7.34,1.79,4.09,...,19.9,50.4,29.6,101,125,105,92.0,4.0,40,23.95
1,Zach Davies,ARI,2022-08-07,16.0,15.375,2.0,4.28,6.91,3.07,2.25,...,18.3,52.2,29.5,101,81,96,72.0,3.0,24,13.65
2,Kevin Gausman,TOR,2022-08-07,20.0,16.75,8.0,3.06,10.64,1.61,6.6,...,13.8,52.0,34.2,97,94,100,72.0,3.0,37,19.9
3,Triston McKenzie,CLE,2022-08-07,19.0,18.473684,7.0,3.38,8.48,2.4,3.53,...,16.2,53.8,30.1,100,98,102,86.0,3.0,58,36.2
4,Jesus Luzardo,MIA,2022-08-07,7.0,14.571429,2.0,3.97,12.18,4.24,2.88,...,18.2,50.0,31.8,99,99,105,78.0,4.0,49,30.55


In [29]:
main_df_corr = main_df.select_dtypes(include = ['int64', 'float64'])

In [31]:
main_df_corr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 56 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   GS              1194 non-null   float64
 1   Avg_Outs        1194 non-null   float64
 2   W_Season        1194 non-null   float64
 3   ERA             1194 non-null   float64
 4   K/9             1194 non-null   float64
 5   BB/9            1194 non-null   float64
 6   K/BB            1194 non-null   float64
 7   HR/9            1194 non-null   float64
 8   AVG             1194 non-null   float64
 9   BABIP           1194 non-null   float64
 10  FIP             1194 non-null   float64
 11  SIERA           1194 non-null   float64
 12  WHIP            1194 non-null   float64
 13  GB/FB           1194 non-null   float64
 14  LD%             1194 non-null   float64
 15  HR/FB           1194 non-null   float64
 16  RS/9            1194 non-null   float64
 17  Soft%           1194 non-null   f

In [32]:
main_df_corr = main_df_corr.corr()

In [33]:
main_df_corr

Unnamed: 0,GS,Avg_Outs,W_Season,ERA,K/9,BB/9,K/BB,HR/9,AVG,BABIP,...,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind,FD_Pts,DK_Pts
GS,1.0,0.612585,0.716807,-0.222785,0.082452,-0.154689,0.141815,-0.20211,-0.176407,-0.083836,...,-0.019751,0.021313,-0.005395,-0.021005,-0.010725,0.018261,0.025442,0.010607,0.263977,0.228733
Avg_Outs,0.612585,1.0,0.582005,-0.391464,0.06045,-0.311532,0.300621,-0.234053,-0.303442,-0.222724,...,-0.018813,0.023027,-0.005956,0.013489,0.050774,0.032654,0.055346,-0.004621,0.310977,0.262932
W_Season,0.716807,0.582005,1.0,-0.423023,0.197056,-0.302728,0.351296,-0.302528,-0.360772,-0.188225,...,0.022192,0.003851,-0.017926,-0.03592,0.021304,0.107933,-0.000965,0.047409,0.271873,0.24734
ERA,-0.222785,-0.391464,-0.423023,1.0,-0.163047,0.36706,-0.346828,0.63228,0.752855,0.531139,...,-0.015422,0.038625,-0.020989,0.054796,-0.030562,-0.103201,0.005245,-0.019806,-0.16446,-0.158013
K/9,0.082452,0.06045,0.197056,-0.163047,1.0,0.040437,0.513107,-0.010659,-0.35294,0.11858,...,0.050097,-0.017321,-0.019565,0.069992,0.135581,0.168725,-0.025198,0.055241,0.219623,0.226703
BB/9,-0.154689,-0.311532,-0.302728,0.36706,0.040437,1.0,-0.690995,0.049966,0.138826,0.163656,...,-0.029699,0.027619,-0.002588,0.0486,0.010919,-0.06267,0.008414,-0.055206,-0.105597,-0.10466
K/BB,0.141815,0.300621,0.351296,-0.346828,0.513107,-0.690995,1.0,-0.054138,-0.280536,-0.038354,...,0.072061,-0.033798,-0.020344,-0.018748,0.066127,0.150583,-0.018713,0.05282,0.187983,0.189685
HR/9,-0.20211,-0.234053,-0.302528,0.63228,-0.010659,0.049966,-0.054138,1.0,0.377519,0.036288,...,-0.019784,0.055322,-0.031595,0.020114,-0.001135,-0.063773,0.016985,0.002358,-0.131109,-0.118066
AVG,-0.176407,-0.303442,-0.360772,0.752855,-0.35294,0.138826,-0.280536,0.377519,1.0,0.818218,...,-0.010453,-0.006542,0.011446,0.044101,-0.037793,-0.13634,0.016293,-0.023211,-0.188413,-0.188274
BABIP,-0.083836,-0.222724,-0.188225,0.531139,0.11858,0.163656,-0.038354,0.036288,0.818218,1.0,...,0.017904,-0.037065,0.017074,0.079386,0.029058,-0.043563,0.001444,0.000576,-0.048123,-0.050063


In [34]:
# sns.set(font_scale = 1.5)
# plt.figure(figsize = (30, 20))
# sns.heatmap(main_df_corr, annot = True)

In [35]:
main_df = main_df.set_index(['Date', 'Name', 'Team'])

In [36]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1194 entries, ('2022-08-07', 'Tyler Anderson', 'LAD') to ('2022-06-20', 'Caleb Kilian', 'CHC')
Data columns (total 56 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   GS              1194 non-null   float64
 1   Avg_Outs        1194 non-null   float64
 2   W_Season        1194 non-null   float64
 3   ERA             1194 non-null   float64
 4   K/9             1194 non-null   float64
 5   BB/9            1194 non-null   float64
 6   K/BB            1194 non-null   float64
 7   HR/9            1194 non-null   float64
 8   AVG             1194 non-null   float64
 9   BABIP           1194 non-null   float64
 10  FIP             1194 non-null   float64
 11  SIERA           1194 non-null   float64
 12  WHIP            1194 non-null   float64
 13  GB/FB           1194 non-null   float64
 14  LD%             1194 non-null   float64
 15  HR/FB           1194 non-null   float64
 16  RS/9   

In [37]:
X = main_df.drop(columns = ['FD_Pts', 'DK_Pts'])

In [38]:
X.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1194 entries, ('2022-08-07', 'Tyler Anderson', 'LAD') to ('2022-06-20', 'Caleb Kilian', 'CHC')
Data columns (total 54 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   GS              1194 non-null   float64
 1   Avg_Outs        1194 non-null   float64
 2   W_Season        1194 non-null   float64
 3   ERA             1194 non-null   float64
 4   K/9             1194 non-null   float64
 5   BB/9            1194 non-null   float64
 6   K/BB            1194 non-null   float64
 7   HR/9            1194 non-null   float64
 8   AVG             1194 non-null   float64
 9   BABIP           1194 non-null   float64
 10  FIP             1194 non-null   float64
 11  SIERA           1194 non-null   float64
 12  WHIP            1194 non-null   float64
 13  GB/FB           1194 non-null   float64
 14  LD%             1194 non-null   float64
 15  HR/FB           1194 non-null   float64
 16  RS/9   

In [39]:
y_FD = main_df['FD_Pts']
y_DK = main_df['DK_Pts']

In [40]:
y_FD

Date        Name              Team
2022-08-07  Tyler Anderson    LAD     40
            Zach Davies       ARI     24
            Kevin Gausman     TOR     37
            Triston McKenzie  CLE     58
            Jesus Luzardo     MIA     49
                                      ..
2022-06-20  Zach Davies       ARI     15
            Noah Syndergaard  LAA     22
            Alex Faedo        DET      7
            Jose Berrios      TOR     -3
            Caleb Kilian      CHC      1
Name: FD_Pts, Length: 1194, dtype: int64

In [41]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,GS,Avg_Outs,W_Season,ERA,K/9,BB/9,K/BB,HR/9,AVG,BABIP,...,LD%_Opp,HR/FB_Opp,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind
Date,Name,Team,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2022-08-07,Tyler Anderson,LAD,18.0,17.888889,12.0,2.89,7.34,1.79,4.09,0.78,0.223,0.264,...,19.5,8.8,19.9,50.4,29.6,101,125,105,92.0,4.0
2022-08-07,Zach Davies,ARI,16.0,15.375000,2.0,4.28,6.91,3.07,2.25,1.32,0.230,0.252,...,19.8,10.7,18.3,52.2,29.5,101,81,96,72.0,3.0
2022-08-07,Kevin Gausman,TOR,20.0,16.750000,8.0,3.06,10.64,1.61,6.60,0.48,0.269,0.370,...,20.3,13.4,13.8,52.0,34.2,97,94,100,72.0,3.0
2022-08-07,Triston McKenzie,CLE,19.0,18.473684,7.0,3.38,8.48,2.40,3.53,1.35,0.203,0.232,...,20.3,13.5,16.2,53.8,30.1,100,98,102,86.0,3.0
2022-08-07,Jesus Luzardo,MIA,7.0,14.571429,2.0,3.97,12.18,4.24,2.88,1.06,0.172,0.236,...,18.3,14.8,18.2,50.0,31.8,99,99,105,78.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-20,Zach Davies,ARI,13.0,15.923077,2.0,3.78,7.43,2.87,2.59,1.04,0.234,0.270,...,19.8,8.2,18.1,52.8,29.1,94,93,107,75.0,3.0
2022-06-20,Noah Syndergaard,LAA,10.0,15.300000,4.0,3.53,6.18,2.29,2.69,0.71,0.237,0.270,...,19.3,7.1,15.1,53.6,31.3,105,119,98,87.0,4.0
2022-06-20,Alex Faedo,DET,8.0,15.000000,1.0,4.28,7.43,2.70,2.75,1.35,0.278,0.319,...,21.3,9.1,16.6,52.3,31.1,109,101,94,68.0,2.0
2022-06-20,Jose Berrios,TOR,13.0,16.538462,5.0,4.65,7.79,2.13,3.65,1.51,0.256,0.290,...,18.1,7.4,16.7,57.4,25.9,101,125,99,89.0,3.0


In [62]:
X.shape

(1194, 54)

In [42]:
X_FD_train, X_FD_test, y_FD_train, y_FD_test = train_test_split(X, y_FD, test_size = 0.3, random_state = 43)
X_DK_train, X_DK_test, y_DK_train, y_DK_test = train_test_split(X, y_DK, test_size = 0.3, random_state = 43)

In [43]:
from sklearn.preprocessing import StandardScaler

In [44]:
scaler = StandardScaler()

In [45]:
X_FD_train = scaler.fit_transform(X_FD_train)
X_FD_test = scaler.transform(X_FD_test)
X_DK_train = scaler.fit_transform(X_DK_train)
X_DK_test = scaler.transform(X_DK_test)

In [46]:
X_FD_scaled_df = pd.DataFrame(X_FD_train, columns = X.columns)
X_DK_scaled_df = pd.DataFrame(X_DK_train, columns = X.columns)

In [47]:
X_FD_scaled_df.mean()

GS                8.376533e-17
Avg_Outs         -1.874748e-17
W_Season          3.331999e-16
ERA               1.239195e-16
K/9              -4.423940e-16
BB/9             -2.947409e-16
K/BB             -1.013162e-16
HR/9             -1.861452e-16
AVG              -3.842568e-16
BABIP            -2.665200e-16
FIP               4.294635e-16
SIERA            -2.645256e-16
WHIP              5.694713e-16
GB/FB             1.207284e-16
LD%              -5.241316e-16
HR/FB            -3.191060e-16
RS/9             -3.297429e-17
Soft%            -1.493483e-16
Med%             -1.208681e-15
Hard%            -3.790714e-16
Contact%          1.434847e-15
SwStr%            1.535033e-16
CStr%             5.422143e-16
EV                1.676370e-15
LA                2.590077e-16
Barrels           1.728491e-18
Barrel%          -2.369362e-16
maxEV             6.945874e-16
HardHit           2.161943e-16
HardHit%          5.172842e-16
Events            4.307931e-17
OpenOU           -2.257010e-16
Opp_Proj

In [48]:
X_FD_scaled_df.std()

GS                1.000599
Avg_Outs          1.000599
W_Season          1.000599
ERA               1.000599
K/9               1.000599
BB/9              1.000599
K/BB              1.000599
HR/9              1.000599
AVG               1.000599
BABIP             1.000599
FIP               1.000599
SIERA             1.000599
WHIP              1.000599
GB/FB             1.000599
LD%               1.000599
HR/FB             1.000599
RS/9              1.000599
Soft%             1.000599
Med%              1.000599
Hard%             1.000599
Contact%          1.000599
SwStr%            1.000599
CStr%             1.000599
EV                1.000599
LA                1.000599
Barrels           1.000599
Barrel%           1.000599
maxEV             1.000599
HardHit           1.000599
HardHit%          1.000599
Events            1.000599
OpenOU            1.000599
Opp_Proj_Runs     1.000599
Proj_Run_Diff     1.000599
BB%_Opp           1.000599
K%_Opp            1.000599
BB/K              1.000599
O

In [49]:
X_DK_scaled_df.mean()

GS                8.376533e-17
Avg_Outs         -1.874748e-17
W_Season          3.331999e-16
ERA               1.239195e-16
K/9              -4.423940e-16
BB/9             -2.947409e-16
K/BB             -1.013162e-16
HR/9             -1.861452e-16
AVG              -3.842568e-16
BABIP            -2.665200e-16
FIP               4.294635e-16
SIERA            -2.645256e-16
WHIP              5.694713e-16
GB/FB             1.207284e-16
LD%              -5.241316e-16
HR/FB            -3.191060e-16
RS/9             -3.297429e-17
Soft%            -1.493483e-16
Med%             -1.208681e-15
Hard%            -3.790714e-16
Contact%          1.434847e-15
SwStr%            1.535033e-16
CStr%             5.422143e-16
EV                1.676370e-15
LA                2.590077e-16
Barrels           1.728491e-18
Barrel%          -2.369362e-16
maxEV             6.945874e-16
HardHit           2.161943e-16
HardHit%          5.172842e-16
Events            4.307931e-17
OpenOU           -2.257010e-16
Opp_Proj

In [50]:
X_DK_scaled_df.std()

GS                1.000599
Avg_Outs          1.000599
W_Season          1.000599
ERA               1.000599
K/9               1.000599
BB/9              1.000599
K/BB              1.000599
HR/9              1.000599
AVG               1.000599
BABIP             1.000599
FIP               1.000599
SIERA             1.000599
WHIP              1.000599
GB/FB             1.000599
LD%               1.000599
HR/FB             1.000599
RS/9              1.000599
Soft%             1.000599
Med%              1.000599
Hard%             1.000599
Contact%          1.000599
SwStr%            1.000599
CStr%             1.000599
EV                1.000599
LA                1.000599
Barrels           1.000599
Barrel%           1.000599
maxEV             1.000599
HardHit           1.000599
HardHit%          1.000599
Events            1.000599
OpenOU            1.000599
Opp_Proj_Runs     1.000599
Proj_Run_Diff     1.000599
BB%_Opp           1.000599
K%_Opp            1.000599
BB/K              1.000599
O

In [51]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

In [52]:
from sklearn.metrics import mean_absolute_error

In [53]:
base_LR_FD = LinearRegression()
base_LR_DK = LinearRegression()
base_RF_FD = RandomForestRegressor()
base_RF_DK = RandomForestRegressor()
base_GB_FD = GradientBoostingRegressor()
base_GB_DK = GradientBoostingRegressor()
base_AB_FD = AdaBoostRegressor()
base_AB_DK = AdaBoostRegressor()
base_XGB_FD = XGBRegressor()
base_XGB_DK = XGBRegressor()

In [54]:
model_list_FD = [base_LR_FD, base_RF_FD, base_GB_FD, base_AB_FD, base_XGB_FD]
model_list_DK = [base_LR_DK, base_RF_DK, base_GB_DK, base_AB_DK, base_XGB_DK]

In [55]:
for model in model_list_FD:
    model_name = str(model)
    model.fit(X_FD_train, y_FD_train)
    y_FD_pred = model.predict(X_FD_test)
    print(f"{model_name} R_squared_train: {model.score(X_FD_train, y_FD_train)}")
    print(f"{model_name} R_squared_test: {model.score(X_FD_test, y_FD_test)}")
    print(f"{model_name} MAE: {mean_absolute_error(y_FD_test, y_FD_pred)}")
    print(f"{model_name} MSE: {mean_squared_error(y_FD_test, y_FD_pred)}")
    print(f"{model_name} RMSE: {mean_squared_error(y_FD_test, y_FD_pred, squared = False)}")
    print()
    

LinearRegression() R_squared_train: 0.25421879522823454
LinearRegression() R_squared_test: 0.1551196691915976
LinearRegression() MAE: 12.459108543434887
LinearRegression() MSE: 239.44400631484848
LinearRegression() RMSE: 15.473978360940295

RandomForestRegressor() R_squared_train: 0.8839849103108183
RandomForestRegressor() R_squared_test: 0.1938612833138027
RandomForestRegressor() MAE: 12.069136490250695
RandomForestRegressor() MSE: 228.46440724233983
RandomForestRegressor() RMSE: 15.115039108197498

GradientBoostingRegressor() R_squared_train: 0.6412615619247499
GradientBoostingRegressor() R_squared_test: 0.15110207048269753
GradientBoostingRegressor() MAE: 12.280841376873187
GradientBoostingRegressor() MSE: 240.58261718735386
GradientBoostingRegressor() RMSE: 15.510725875578933

AdaBoostRegressor() R_squared_train: 0.34894464585023766
AdaBoostRegressor() R_squared_test: 0.17298828414216738
AdaBoostRegressor() MAE: 12.447127778526966
AdaBoostRegressor() MSE: 234.37993677145167
AdaBoos

In [56]:
for model in model_list_DK:
    model_name = str(model)
    model.fit(X_DK_train, y_DK_train)
    y_DK_pred = model.predict(X_DK_test)
    print(f"{model_name} R_squared_train: {model.score(X_DK_train, y_DK_train)}")
    print(f"{model_name} R_squared_test: {model.score(X_DK_test, y_DK_test)}")
    print(f"{model_name} MAE: {mean_absolute_error(y_DK_test, y_DK_pred)}")
    print(f"{model_name} MSE: {mean_squared_error(y_DK_test, y_DK_pred)}")
    print(f"{model_name} RMSE: {mean_squared_error(y_DK_test, y_DK_pred, squared = False)}")
    print()

LinearRegression() R_squared_train: 0.23893988142295874
LinearRegression() R_squared_test: 0.12505718393778775
LinearRegression() MAE: 8.36709505869342
LinearRegression() MSE: 110.31915563275155
LinearRegression() RMSE: 10.50329260911794

RandomForestRegressor() R_squared_train: 0.8803343425791456
RandomForestRegressor() R_squared_test: 0.1610188638382103
RandomForestRegressor() MAE: 8.03559610027855
RandomForestRegressor() MSE: 105.78484540250696
RandomForestRegressor() RMSE: 10.285176002505109

GradientBoostingRegressor() R_squared_train: 0.6424634569512707
GradientBoostingRegressor() R_squared_test: 0.09392621506398036
GradientBoostingRegressor() MAE: 8.278175771718203
GradientBoostingRegressor() MSE: 114.24437467237357
GradientBoostingRegressor() RMSE: 10.688516018249379

AdaBoostRegressor() R_squared_train: 0.36385365074534726
AdaBoostRegressor() R_squared_test: 0.13731570715546548
AdaBoostRegressor() MAE: 8.25966322866518
AdaBoostRegressor() MSE: 108.7735118422635
AdaBoostRegress

In [64]:
# base_LR_FD = LinearRegression()
# base_LR_FD.fit(X_train, y_FD_train)
# y_FD_pred = base_LR_FD.predict(X_test)

# print(f"R_squared_train: {base_LR_FD.score(X_train, y_FD_train)}")
# print(f"R_squared_test: {base_LR_FD.score(X_test, y_FD_test)}")
# print(f"Mean Absolute Error: {mean_absolute_error(y_FD_test, y_FD_pred)}")

**Baseline Linear Regression with 13 features:**<br>
R_squared_train: 0.18071066533848312<br>
R_squared_test: 0.19072120487638278<br>
Mean Absolute Error: 12.24811229330219<br>

**Baseline Linear Regression with 18 features:**<br>
R_squared_train: 0.20180995373804955<br>
R_squared_test: 0.1971620695168933<br>
Mean Absolute Error: 12.192276418555357<br>

**Baseline Linear Regression with 29 features:**<br>
R_squared_train: 0.21998751553333518<br>
R_squared_test: 0.18935419781678475<br>
Mean Absolute Error: 12.328237485325364<br>

In [57]:
from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV

In [58]:
param_grid = {'alpha': np.arange(0, 1, .01)}
Lasso_model_FD = Lasso(random_state = 43)
Lasso_CV = GridSearchCV(Lasso_model_FD, param_grid, cv = 5, scoring = 'neg_mean_absolute_error')
Lasso_CV.fit(X_FD_train, y_FD_train)
print(f"Lasso_model_FD best params: {Lasso_CV.best_params_}")
print(f"Lasso_model_FD best score: {Lasso_CV.best_score_}")

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso_model_FD best params: {'alpha': 0.52}
Lasso_model_FD best score: -11.938403660218913


In [59]:
best_lasso_FD = Lasso(alpha = 0.52, random_state = 43)
best_lasso_FD.fit(X_FD_train, y_FD_train)
y_FD_pred = best_lasso_FD.predict(X_FD_test)
print(f"best_lasso_FD Train R-squared: {best_lasso_FD.score(X_FD_train, y_FD_train)}")
print(f"best_lasso_FD Test R-squared: {best_lasso_FD.score(X_FD_test, y_FD_test)}")
print(f"best_lasso_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred)}")
print(f"best_lasso_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred, squared = False)}")
print(f"best_lasso_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred)}")

best_lasso_FD Train R-squared: 0.1976542181188099
best_lasso_FD Test R-squared: 0.18878441049252792
best_lasso_FD MAE: 12.295144826590782
best_lasso_FD RMSE: 15.162559894934644
best_lasso_FD MSE: 229.90322256748047


In [60]:
param_grid = {'alpha': np.arange(0, 1, .01)}
Lasso_model_DK = Lasso(random_state = 43)
Lasso_CV = GridSearchCV(Lasso_model_DK, param_grid, cv = 5, scoring = 'neg_mean_absolute_error')
Lasso_CV.fit(X_DK_train, y_DK_train)
print(f"Lasso_model_DK best params: {Lasso_CV.best_params_}")
print(f"Lasso_model_DK best score: {Lasso_CV.best_score_}")

  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  estimator.fit(X_train, y_train, **fit_params)
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


Lasso_model_DK best params: {'alpha': 0.15}
Lasso_model_DK best score: -8.06786155800821


In [61]:
best_lasso_DK = Lasso(alpha = 0.15, random_state = 43)
best_lasso_DK.fit(X_DK_train, y_DK_train)
y_DK_pred = best_lasso_DK.predict(X_DK_test)
print(f"best_lasso_DK Train R-squared: {best_lasso_DK.score(X_DK_train, y_DK_train)}")
print(f"best_lasso_DK Test R-squared: {best_lasso_DK.score(X_DK_test, y_DK_test)}")
print(f"best_lasso_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred)}")
print(f"best_lasso_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred, squared = False)}")
print(f"best_lasso_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred)}")

best_lasso_DK Train R-squared: 0.19978782437568432
best_lasso_DK Test R-squared: 0.16070127391123046
best_lasso_DK MAE: 8.194736204515
best_lasso_DK RMSE: 10.287122505882326
best_lasso_DK MSE: 105.82488945103069


In [64]:
#Tuned Random Forest
param_grid = {'max_depth': [3, 5, 7, 9], 'n_estimators': [100, 200, 300, 400, 500]}

In [65]:
RF_FD_cv = GridSearchCV(base_RF_FD, param_grid, n_jobs = 2, verbose = 3)
RF_FD_cv.fit(X_FD_train, y_FD_train)
print(f"Tuned Random Forest FD Best Estimator: {RF_FD_cv.best_estimator_}")
print(f"Tuned Random Forest FD Best Score: {RF_FD_cv.best_score_}")
print(f"Tuned Random Forest FD Best Params: {RF_FD_cv.best_params_}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   27.4s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  3.0min finished


Tuned Random Forest FD Best Estimator: RandomForestRegressor(max_depth=7, n_estimators=300)
Tuned Random Forest FD Best Score: 0.16246155549395913
Tuned Random Forest FD Best Params: {'max_depth': 7, 'n_estimators': 300}


In [66]:
best_RF_FD = RandomForestRegressor(max_depth = 7, n_estimators = 300, random_state = 43)
best_RF_FD.fit(X_FD_train, y_FD_train)
y_FD_pred = best_RF_FD.predict(X_FD_test)
print(f"Best_RF_FD Train R-squared: {best_RF_FD.score(X_FD_train, y_FD_train)}")
print(f"Best_RF_FD Test R-squared: {best_RF_FD.score(X_FD_test, y_FD_test)}")
print(f"Best_RF_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred)}")
print(f"Best_RF_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred, squared = False)}")
print(f"Best_RF_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred)}")

Best_RF_FD Train R-squared: 0.6402535286135557
Best_RF_FD Test R-squared: 0.19489601429812775
Best_RF_FD MAE: 12.109395120347523
Best_RF_FD RMSE: 15.105335430076769
Best_RF_FD MSE: 228.1711584551325


In [67]:
RF_DK_cv = GridSearchCV(base_RF_DK, param_grid, n_jobs = 2, verbose = 3)
RF_DK_cv.fit(X_DK_train, y_DK_train)
print(f"Tuned Random Forest DK Best Estimator: {RF_DK_cv.best_estimator_}")
print(f"Tuned Random Forest DK Best Score: {RF_DK_cv.best_score_}")
print(f"Tuned Random Forest DK Best Params: {RF_DK_cv.best_params_}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   28.6s
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  3.1min finished


Tuned Random Forest DK Best Estimator: RandomForestRegressor(max_depth=9, n_estimators=300)
Tuned Random Forest DK Best Score: 0.14729927142036808
Tuned Random Forest DK Best Params: {'max_depth': 9, 'n_estimators': 300}


In [68]:
best_RF_DK = RandomForestRegressor(max_depth = 9, n_estimators = 300, random_state = 43)
best_RF_DK.fit(X_DK_train, y_DK_train)
y_DK_pred = best_RF_DK.predict(X_DK_test)
print(f"Best_RF_DK Train R-squared: {best_RF_DK.score(X_DK_train, y_DK_train)}")
print(f"Best_RF_DK Test R-squared: {best_RF_DK.score(X_DK_test, y_DK_test)}")
print(f"Best_RF_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred)}")
print(f"Best_RF_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred, squared = False)}")
print(f"Best_RF_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred)}")

Best_RF_DK Train R-squared: 0.7585401147600648
Best_RF_DK Test R-squared: 0.15536193725018133
Best_RF_DK MAE: 8.109912745373363
Best_RF_DK RMSE: 10.319792239284457
Best_RF_DK MSE: 106.49811186199571


In [69]:
param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'max_depth' : [3, 5, 7, 9], 'learning_rate' : [0.01, 0.05, 0.1]}
GB_FD_cv = GridSearchCV(base_GB_FD, param_grid, scoring = 'neg_mean_absolute_error', n_jobs = 2, verbose = 3)
GB_FD_cv.fit(X_FD_train, y_FD_train)
print(f"Tuned Gradient Boost FD Best Estimator: {GB_FD_cv.best_estimator_}")
print(f"Tuned Gradient Boost FD Best Score: {GB_FD_cv.best_score_}")
print(f"Tuned Gradient Boost FD Best Params: {GB_FD_cv.best_params_}")

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   57.4s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  7.4min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed: 30.9min
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed: 33.0min finished


Tuned Gradient Boost FD Best Estimator: GradientBoostingRegressor(learning_rate=0.01, max_depth=5, n_estimators=200)
Tuned Gradient Boost FD Best Score: -11.877483320720255
Tuned Gradient Boost FD Best Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 200}


In [70]:
best_GB_FD = GradientBoostingRegressor(max_depth = 5, n_estimators = 200, learning_rate = 0.01, random_state = 43)
best_GB_FD.fit(X_FD_train, y_FD_train)
y_FD_pred = best_GB_FD.predict(X_FD_test)
print(f"Best_GB_FD Train R-squared: {best_GB_FD.score(X_FD_train, y_FD_train)}")
print(f"Best_GB_FD Test R-squared: {best_GB_FD.score(X_FD_test, y_FD_test)}")
print(f"Best_GB_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred)}")
print(f"Best_GB_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred, squared = False)}")
print(f"Best_GB_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred)}")

Best_GB_FD Train R-squared: 0.6100013741996952
Best_GB_FD Test R-squared: 0.17922107943732224
Best_GB_FD MAE: 12.146892725312407
Best_GB_FD RMSE: 15.251672901198326
Best_GB_FD MSE: 232.61352628514737


In [72]:
GB_DK_cv = GridSearchCV(base_GB_DK, param_grid, scoring = 'neg_mean_absolute_error', n_jobs = 2, verbose = 3)
GB_DK_cv.fit(X_DK_train, y_DK_train)
print(f"Tuned Gradient Boost DK Best Estimator: {GB_DK_cv.best_estimator_}")
print(f"Tuned Gradient Boost DK Best Score: {GB_DK_cv.best_score_}")
print(f"Tuned Gradient Boost DK Best Params: {GB_DK_cv.best_params_}")

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   53.6s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  7.5min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed: 17.9min
[Parallel(n_jobs=2)]: Done 300 out of 300 | elapsed: 19.9min finished


Tuned Gradient Boost DK Best Estimator: GradientBoostingRegressor(learning_rate=0.01, max_depth=5, n_estimators=300)
Tuned Gradient Boost DK Best Score: -8.007036369749585
Tuned Gradient Boost DK Best Params: {'learning_rate': 0.01, 'max_depth': 5, 'n_estimators': 300}


In [73]:
best_GB_DK = GradientBoostingRegressor(max_depth = 5, n_estimators = 300, learning_rate = 0.01, random_state = 43)
best_GB_DK.fit(X_DK_train, y_DK_train)
y_DK_pred = best_GB_DK.predict(X_DK_test)
print(f"Best_GB_DK Train R-squared: {best_GB_DK.score(X_DK_train, y_DK_train)}")
print(f"Best_GB_DK Test R-squared: {best_GB_DK.score(X_DK_test, y_DK_test)}")
print(f"Best_GB_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred)}")
print(f"Best_GB_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred, squared = False)}")
print(f"Best_GB_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred)}")

Best_GB_DK Train R-squared: 0.696852799129325
Best_GB_DK Test R-squared: 0.13475456854803003
Best_GB_DK MAE: 8.148276970961545
Best_GB_DK RMSE: 10.444924070125332
Best_GB_DK MSE: 109.09643883068354


In [74]:
param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'learning_rate' : [0.001, 0.01, 0.05, 0.1]}
AB_FD_cv = GridSearchCV(base_AB_FD, param_grid, scoring = 'neg_mean_absolute_error', n_jobs = 2, verbose = 3)
AB_FD_cv.fit(X_FD_train, y_FD_train)
print(f"Tuned Ada Boost FD Best Estimator: {AB_FD_cv.best_estimator_}")
print(f"Tuned Ada Boost FD Best Score: {AB_FD_cv.best_score_}")
print(f"Tuned Ada Boost FD Best Params: {AB_FD_cv.best_params_}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  1.0min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  3.9min finished


Tuned Ada Boost FD Best Estimator: AdaBoostRegressor(learning_rate=0.01, n_estimators=400)
Tuned Ada Boost FD Best Score: -11.94322429703656
Tuned Ada Boost FD Best Params: {'learning_rate': 0.01, 'n_estimators': 400}


In [75]:
best_AB_FD = AdaBoostRegressor(n_estimators = 400, learning_rate = 0.01, random_state = 43)
best_AB_FD.fit(X_FD_train, y_FD_train)
y_FD_pred = best_AB_FD.predict(X_FD_test)
print(f"Best_AB_FD Train R-squared: {best_AB_FD.score(X_FD_train, y_FD_train)}")
print(f"Best_AB_FD Test R-squared: {best_AB_FD.score(X_FD_test, y_FD_test)}")
print(f"Best_AB_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred)}")
print(f"Best_AB_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred, squared = False)}")
print(f"Best_AB_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred)}")

Best_AB_FD Train R-squared: 0.2760884428588509
Best_AB_FD Test R-squared: 0.17939327780707381
Best_AB_FD MAE: 12.270171420565596
Best_AB_FD RMSE: 15.250072926523348
Best_AB_FD MSE: 232.5647242642804


In [76]:
AB_DK_cv = GridSearchCV(base_AB_DK, param_grid, scoring = 'neg_mean_absolute_error', n_jobs = 2, verbose = 3)
AB_DK_cv.fit(X_DK_train, y_DK_train)
print(f"Tuned Ada Boost DK Best Estimator: {AB_DK_cv.best_estimator_}")
print(f"Tuned Ada Boost DK Best Score: {AB_DK_cv.best_score_}")
print(f"Tuned Ada Boost DK Best Params: {AB_DK_cv.best_params_}")

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:  1.1min
[Parallel(n_jobs=2)]: Done 100 out of 100 | elapsed:  4.0min finished


Tuned Ada Boost DK Best Estimator: AdaBoostRegressor(learning_rate=0.01, n_estimators=400)
Tuned Ada Boost DK Best Score: -8.023036413551226
Tuned Ada Boost DK Best Params: {'learning_rate': 0.01, 'n_estimators': 400}


In [77]:
best_AB_DK = AdaBoostRegressor(n_estimators = 400, learning_rate = 0.01, random_state = 43)
best_AB_DK.fit(X_DK_train, y_DK_train)
y_DK_pred = best_AB_DK.predict(X_DK_test)
print(f"Best_AB_DK Train R-squared: {best_AB_DK.score(X_DK_train, y_DK_train)}")
print(f"Best_AB_DK Test R-squared: {best_AB_DK.score(X_DK_test, y_DK_test)}")
print(f"Best_AB_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred)}")
print(f"Best_AB_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred, squared = False)}")
print(f"Best_AB_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred)}")

Best_AB_DK Train R-squared: 0.2641275467646994
Best_AB_DK Test R-squared: 0.15709635563306745
Best_AB_DK MAE: 8.170237226543483
Best_AB_DK RMSE: 10.309191227930613
Best_AB_DK MSE: 106.27942377404149


In [78]:
param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'max_depth': [3, 5, 7, 9],\
              'learning_rate' : [0.001, 0.01, 0.1], 'alpha': [0, 50, 100]}

In [79]:
XGB_FD_cv = GridSearchCV(base_XGB_FD, param_grid, scoring = 'neg_mean_absolute_error', cv = 4, n_jobs = 2, verbose = 3)
XGB_FD_cv.fit(X_FD_train, y_FD_train)
print(f"Tuned XG Boost FD Best Estimator: {XGB_FD_cv.best_estimator_}")
print(f"Tuned XG Boost FD Best Score: {XGB_FD_cv.best_score_}")
print(f"Tuned XG Boost FD Best Params: {XGB_FD_cv.best_params_}")

Fitting 4 folds for each of 180 candidates, totalling 720 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   10.9s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  7.2min
[Parallel(n_jobs=2)]: Done 720 out of 720 | elapsed: 12.0min finished


Tuned XG Boost FD Best Estimator: XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=400, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
Tuned XG Boost FD Best Score: -11.946140421333013
Tuned XG Boost FD Best Params: {'alpha': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 400}


In [80]:
best_XGB_FD = XGBRegressor(n_estimators = 400, learning_rate = 0.01, max_depth = 3, alpha = 0, random_state = 43)
best_XGB_FD.fit(X_FD_train, y_FD_train)
y_FD_pred = best_XGB_FD.predict(X_FD_test)
print(f"Best_XGB_FD Train R-squared: {best_XGB_FD.score(X_FD_train, y_FD_train)}")
print(f"Best_XGB_FD Test R-squared: {best_XGB_FD.score(X_FD_test, y_FD_test)}")
print(f"Best_XGB_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred)}")
print(f"Best_XGB_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred, squared = False)}")
print(f"Best_XGB_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred)}")

Best_XGB_FD Train R-squared: 0.4620868512268762
Best_XGB_FD Test R-squared: 0.18233590684983847
Best_XGB_FD MAE: 12.097975393523745
Best_XGB_FD RMSE: 15.222705608917114
Best_XGB_FD MSE: 231.73076605575656


In [81]:
XGB_DK_cv = GridSearchCV(base_XGB_DK, param_grid, scoring = 'neg_mean_absolute_error', cv = 4, n_jobs = 2, verbose = 3)
XGB_DK_cv.fit(X_DK_train, y_DK_train)
print(f"Tuned XG Boost DK Best Estimator: {XGB_DK_cv.best_estimator_}")
print(f"Tuned XG Boost DK Best Score: {XGB_DK_cv.best_score_}")
print(f"Tuned XG Boost DK Best Params: {XGB_DK_cv.best_params_}")

Fitting 4 folds for each of 180 candidates, totalling 720 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:   11.1s
[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:  1.6min
[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:  3.9min
[Parallel(n_jobs=2)]: Done 508 tasks      | elapsed:  7.3min
[Parallel(n_jobs=2)]: Done 720 out of 720 | elapsed: 10.6min finished


Tuned XG Boost DK Best Estimator: XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=300, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=None)
Tuned XG Boost DK Best Score: -8.046038207404088
Tuned XG Boost DK Best Params: {'alpha': 0, 'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 300}


In [82]:
best_XGB_DK = XGBRegressor(n_estimators = 300, learning_rate = 0.01, max_depth = 3, alpha = 0, random_state = 43)
best_XGB_DK.fit(X_DK_train, y_DK_train)
y_DK_pred = best_XGB_DK.predict(X_DK_test)
print(f"Best_XGB_DK Train R-squared: {best_XGB_DK.score(X_DK_train, y_DK_train)}")
print(f"Best_XGB_DK Test R-squared: {best_XGB_DK.score(X_DK_test, y_DK_test)}")
print(f"Best_XGB_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred)}")
print(f"Best_XGB_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred, squared = False)}")
print(f"Best_XGB_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred)}")

Best_XGB_DK Train R-squared: 0.40301731173467636
Best_XGB_DK Test R-squared: 0.1447099848331993
Best_XGB_DK MAE: 8.162682701684638
Best_XGB_DK RMSE: 10.384661165286463
Best_XGB_DK MSE: 107.8411875178088
