In [1]:
##As of Aug 17, tuned through FanDuel Gradient Boost. Next is DraftKings Gradient Boost
#Imports
import pandas as pd
import datetime
from datetime import timedelta
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.metrics import mean_squared_error, r2_score 
from sklearn.model_selection import train_test_split 
from sklearn import linear_model, preprocessing 

In [2]:
#These first few cells will be the same as the EDA notebook

In [3]:
main_df = pd.read_csv('Pitching_Data_Through_2022_08_07.csv')

In [4]:
main_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Team,W,ER,SO,H,BB,CG,HBP,...,FB%_Opp,HR/FB_Opp,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind
0,0,Tyler Anderson,LAD,1,0,3,2,1,0,0,...,35.9,8.8,19.9,50.4,29.6,101,125,105,92.0,4.0
1,1,Zach Davies,ARI,0,0,3,4,2,0,0,...,33.4,10.7,18.3,52.2,29.5,101,81,96,72.0,3.0
2,2,Kevin Gausman,TOR,0,0,5,6,0,0,0,...,37.4,13.4,13.8,52.0,34.2,97,94,100,72.0,3.0
3,3,Triston McKenzie,CLE,1,0,8,2,1,0,0,...,38.9,13.5,16.2,53.8,30.1,100,98,102,86.0,3.0
4,4,Jesus Luzardo,MIA,1,0,6,1,1,0,0,...,33.0,14.8,18.2,50.0,31.8,99,99,105,78.0,4.0


In [5]:
main_df.drop(columns = ['Unnamed: 0'], inplace = True)

In [6]:
main_df.shape

(1194, 90)

In [7]:
#Dropping rows that contribute to fantasy points
#Also dropping K% and BB% since we're more interested in a pitcher's K/9 and BB/9
#Also dropping CG_Season and ShO_Season since they're so rare
#Dropping OBP and SLG since we have OPS (on-base plus slugging)
#Dropping xFIP since we have FIP and wRC+ since we have wRC
#Dropping GB% and FB% since and GB_Opp% and FB_Opp% since we have GB/FB for both
#Dropping Outs_Season since we have Avg_Outs
main_df = main_df.drop(columns = ['W', 'ER', 'SO', 'H', 'BB', 'CG', 'HBP', 'ShO', 'Outs', 'QS', 'K%', 'BB%',\
                                 'CG_Season', 'ShO_Season', 'OBP', 'SLG', 'xFIP', 'wRC+', 'GB%', 'FB%',\
                                 'GB%_Opp', 'FB%_Opp', 'CGS', 'NH', 'Outs_Season', 'Proj_Runs', 'Park', 'Hand', 'Opp_Team',\
                                 'Team_Season', 'Position'])

In [8]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 59 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            1194 non-null   object 
 1   Team            1194 non-null   object 
 2   Date            1194 non-null   object 
 3   GS              1194 non-null   float64
 4   Avg_Outs        1194 non-null   float64
 5   W_Season        1194 non-null   float64
 6   ERA             1194 non-null   float64
 7   K/9             1194 non-null   float64
 8   BB/9            1194 non-null   float64
 9   K/BB            1194 non-null   float64
 10  HR/9            1194 non-null   float64
 11  AVG             1194 non-null   float64
 12  BABIP           1194 non-null   float64
 13  FIP             1194 non-null   float64
 14  SIERA           1194 non-null   float64
 15  WHIP            1194 non-null   float64
 16  GB/FB           1194 non-null   float64
 17  LD%             1194 non-null   f

In [9]:
main_df_num = main_df.select_dtypes(include = ['int64', 'float64'])

In [10]:
main_df_corr = main_df_num.corr()

In [11]:
main_df_corr

Unnamed: 0,GS,Avg_Outs,W_Season,ERA,K/9,BB/9,K/BB,HR/9,AVG,BABIP,...,LD%_Opp,HR/FB_Opp,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind
GS,1.0,0.612585,0.716807,-0.222785,0.082452,-0.154689,0.141815,-0.20211,-0.176407,-0.083836,...,0.011113,0.007079,-0.019751,0.021313,-0.005395,-0.021005,-0.010725,0.018261,0.025442,0.010607
Avg_Outs,0.612585,1.0,0.582005,-0.391464,0.06045,-0.311532,0.300621,-0.234053,-0.303442,-0.222724,...,0.014143,0.023228,-0.018813,0.023027,-0.005956,0.013489,0.050774,0.032654,0.055346,-0.004621
W_Season,0.716807,0.582005,1.0,-0.423023,0.197056,-0.302728,0.351296,-0.302528,-0.360772,-0.188225,...,0.026564,-0.005161,0.022192,0.003851,-0.017926,-0.03592,0.021304,0.107933,-0.000965,0.047409
ERA,-0.222785,-0.391464,-0.423023,1.0,-0.163047,0.36706,-0.346828,0.63228,0.752855,0.531139,...,0.009036,-0.021734,-0.015422,0.038625,-0.020989,0.054796,-0.030562,-0.103201,0.005245,-0.019806
K/9,0.082452,0.06045,0.197056,-0.163047,1.0,0.040437,0.513107,-0.010659,-0.35294,0.11858,...,0.032903,0.009352,0.050097,-0.017321,-0.019565,0.069992,0.135581,0.168725,-0.025198,0.055241
BB/9,-0.154689,-0.311532,-0.302728,0.36706,0.040437,1.0,-0.690995,0.049966,0.138826,0.163656,...,0.011743,0.037851,-0.029699,0.027619,-0.002588,0.0486,0.010919,-0.06267,0.008414,-0.055206
K/BB,0.141815,0.300621,0.351296,-0.346828,0.513107,-0.690995,1.0,-0.054138,-0.280536,-0.038354,...,0.006248,-0.038204,0.072061,-0.033798,-0.020344,-0.018748,0.066127,0.150583,-0.018713,0.05282
HR/9,-0.20211,-0.234053,-0.302528,0.63228,-0.010659,0.049966,-0.054138,1.0,0.377519,0.036288,...,-0.00081,-0.038797,-0.019784,0.055322,-0.031595,0.020114,-0.001135,-0.063773,0.016985,0.002358
AVG,-0.176407,-0.303442,-0.360772,0.752855,-0.35294,0.138826,-0.280536,0.377519,1.0,0.818218,...,-0.010107,0.00015,-0.010453,-0.006542,0.011446,0.044101,-0.037793,-0.13634,0.016293,-0.023211
BABIP,-0.083836,-0.222724,-0.188225,0.531139,0.11858,0.163656,-0.038354,0.036288,0.818218,1.0,...,0.004481,0.018027,0.017904,-0.037065,0.017074,0.079386,0.029058,-0.043563,0.001444,0.000576


In [12]:
FD_corr = main_df_corr['FD_Pts'].sort_values(ascending = False)

In [13]:
FD_corr

FD_Pts            1.000000
DK_Pts            0.990194
Avg_Outs          0.310977
W_Season          0.271873
Proj_Run_Diff     0.265782
GS                0.263977
SwStr%            0.238162
Events            0.221390
K/9               0.219623
K/BB              0.187983
HardHit           0.170964
Barrels           0.151638
GB/FB_Opp         0.105314
K%_Opp            0.104575
Pk_Fct_SO         0.096819
maxEV             0.079142
RS/9              0.067998
Soft%             0.064819
Med%              0.046900
Med%_Opp          0.036619
LD%_Opp           0.033855
Soft%_Opp         0.029593
GB/FB             0.016655
Pk_Fct_HR         0.010309
CStr%            -0.007549
LA               -0.010598
BABIP_Opp        -0.011653
Wind             -0.020225
LD%              -0.033284
HR/FB_Opp        -0.042729
BB%_Opp          -0.047056
Pk_Fct_Overall   -0.047811
BABIP            -0.048123
Hard%_Opp        -0.049808
Temp             -0.059901
wRC              -0.073388
Barrel%          -0.079324
H

In [14]:
DK_corr = main_df_corr['DK_Pts'].sort_values(ascending = False)

In [15]:
DK_corr

DK_Pts            1.000000
FD_Pts            0.990194
Avg_Outs          0.262932
Proj_Run_Diff     0.260713
W_Season          0.247340
SwStr%            0.246125
GS                0.228733
K/9               0.226703
K/BB              0.189685
Events            0.186286
HardHit           0.135630
Barrels           0.127026
K%_Opp            0.104590
GB/FB_Opp         0.102034
Pk_Fct_SO         0.098082
maxEV             0.066868
RS/9              0.066591
Soft%             0.061712
Med%              0.043415
Med%_Opp          0.034140
Soft%_Opp         0.031128
LD%_Opp           0.030516
LA                0.005791
Pk_Fct_HR         0.004262
GB/FB            -0.001131
CStr%            -0.005523
BABIP_Opp        -0.012602
Wind             -0.019611
LD%              -0.034801
HR/FB_Opp        -0.044038
Hard%_Opp        -0.048947
BABIP            -0.050063
Pk_Fct_Overall   -0.054810
BB%_Opp          -0.055075
Temp             -0.065397
Barrel%          -0.065435
HR/FB            -0.072994
w

In [16]:
#FD_first_tier = list(FD_corr[FD_corr >= .15].index) + list(FD_corr[FD_corr <= -.15].index)

In [17]:
#FD_first_tier

In [18]:
#DK_first_tier = list(DK_corr[DK_corr >= .15].index) + list(DK_corr[DK_corr <= -.15].index)

In [19]:
#DK_first_tier

In [20]:
#first_tier_features = set(FD_first_tier).union(set(DK_first_tier))

In [21]:
#first_tier_features = list(first_tier_features)

In [22]:
#first_tier_features

In [23]:
df1 = main_df.pop('FD_Pts')
df2 = main_df.pop('DK_Pts')

In [24]:
main_df['FD_Pts'] = df1
main_df['DK_Pts'] = df2

In [25]:
main_df_corr = main_df.corr()

In [26]:
main_df_corr

Unnamed: 0,GS,Avg_Outs,W_Season,ERA,K/9,BB/9,K/BB,HR/9,AVG,BABIP,...,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind,FD_Pts,DK_Pts
GS,1.0,0.612585,0.716807,-0.222785,0.082452,-0.154689,0.141815,-0.20211,-0.176407,-0.083836,...,-0.019751,0.021313,-0.005395,-0.021005,-0.010725,0.018261,0.025442,0.010607,0.263977,0.228733
Avg_Outs,0.612585,1.0,0.582005,-0.391464,0.06045,-0.311532,0.300621,-0.234053,-0.303442,-0.222724,...,-0.018813,0.023027,-0.005956,0.013489,0.050774,0.032654,0.055346,-0.004621,0.310977,0.262932
W_Season,0.716807,0.582005,1.0,-0.423023,0.197056,-0.302728,0.351296,-0.302528,-0.360772,-0.188225,...,0.022192,0.003851,-0.017926,-0.03592,0.021304,0.107933,-0.000965,0.047409,0.271873,0.24734
ERA,-0.222785,-0.391464,-0.423023,1.0,-0.163047,0.36706,-0.346828,0.63228,0.752855,0.531139,...,-0.015422,0.038625,-0.020989,0.054796,-0.030562,-0.103201,0.005245,-0.019806,-0.16446,-0.158013
K/9,0.082452,0.06045,0.197056,-0.163047,1.0,0.040437,0.513107,-0.010659,-0.35294,0.11858,...,0.050097,-0.017321,-0.019565,0.069992,0.135581,0.168725,-0.025198,0.055241,0.219623,0.226703
BB/9,-0.154689,-0.311532,-0.302728,0.36706,0.040437,1.0,-0.690995,0.049966,0.138826,0.163656,...,-0.029699,0.027619,-0.002588,0.0486,0.010919,-0.06267,0.008414,-0.055206,-0.105597,-0.10466
K/BB,0.141815,0.300621,0.351296,-0.346828,0.513107,-0.690995,1.0,-0.054138,-0.280536,-0.038354,...,0.072061,-0.033798,-0.020344,-0.018748,0.066127,0.150583,-0.018713,0.05282,0.187983,0.189685
HR/9,-0.20211,-0.234053,-0.302528,0.63228,-0.010659,0.049966,-0.054138,1.0,0.377519,0.036288,...,-0.019784,0.055322,-0.031595,0.020114,-0.001135,-0.063773,0.016985,0.002358,-0.131109,-0.118066
AVG,-0.176407,-0.303442,-0.360772,0.752855,-0.35294,0.138826,-0.280536,0.377519,1.0,0.818218,...,-0.010453,-0.006542,0.011446,0.044101,-0.037793,-0.13634,0.016293,-0.023211,-0.188413,-0.188274
BABIP,-0.083836,-0.222724,-0.188225,0.531139,0.11858,0.163656,-0.038354,0.036288,0.818218,1.0,...,0.017904,-0.037065,0.017074,0.079386,0.029058,-0.043563,0.001444,0.000576,-0.048123,-0.050063


In [27]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 59 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Name            1194 non-null   object 
 1   Team            1194 non-null   object 
 2   Date            1194 non-null   object 
 3   GS              1194 non-null   float64
 4   Avg_Outs        1194 non-null   float64
 5   W_Season        1194 non-null   float64
 6   ERA             1194 non-null   float64
 7   K/9             1194 non-null   float64
 8   BB/9            1194 non-null   float64
 9   K/BB            1194 non-null   float64
 10  HR/9            1194 non-null   float64
 11  AVG             1194 non-null   float64
 12  BABIP           1194 non-null   float64
 13  FIP             1194 non-null   float64
 14  SIERA           1194 non-null   float64
 15  WHIP            1194 non-null   float64
 16  GB/FB           1194 non-null   float64
 17  LD%             1194 non-null   f

In [28]:
main_df.head()

Unnamed: 0,Name,Team,Date,GS,Avg_Outs,W_Season,ERA,K/9,BB/9,K/BB,...,Soft%_Opp,Med%_Opp,Hard%_Opp,Pk_Fct_Overall,Pk_Fct_HR,Pk_Fct_SO,Temp,Wind,FD_Pts,DK_Pts
0,Tyler Anderson,LAD,2022-08-07,18.0,17.888889,12.0,2.89,7.34,1.79,4.09,...,19.9,50.4,29.6,101,125,105,92.0,4.0,40,23.95
1,Zach Davies,ARI,2022-08-07,16.0,15.375,2.0,4.28,6.91,3.07,2.25,...,18.3,52.2,29.5,101,81,96,72.0,3.0,24,13.65
2,Kevin Gausman,TOR,2022-08-07,20.0,16.75,8.0,3.06,10.64,1.61,6.6,...,13.8,52.0,34.2,97,94,100,72.0,3.0,37,19.9
3,Triston McKenzie,CLE,2022-08-07,19.0,18.473684,7.0,3.38,8.48,2.4,3.53,...,16.2,53.8,30.1,100,98,102,86.0,3.0,58,36.2
4,Jesus Luzardo,MIA,2022-08-07,7.0,14.571429,2.0,3.97,12.18,4.24,2.88,...,18.2,50.0,31.8,99,99,105,78.0,4.0,49,30.55


In [29]:
best_features = sorted(list(FD_corr[FD_corr >= .1].index) + list(FD_corr[FD_corr <= -.1].index))

In [30]:
best_features

['AVG',
 'Avg_Outs',
 'BB/9',
 'Barrels',
 'Contact%',
 'DK_Pts',
 'ERA',
 'EV',
 'Events',
 'FD_Pts',
 'FIP',
 'GB/FB_Opp',
 'GS',
 'HR/9',
 'Hard%',
 'HardHit',
 'HardHit%',
 'ISO',
 'K%_Opp',
 'K/9',
 'K/BB',
 'OPS',
 'OpenOU',
 'Opp_Proj_Runs',
 'Proj_Run_Diff',
 'SIERA',
 'SwStr%',
 'WHIP',
 'W_Season',
 'wOBA',
 'wRAA']

In [31]:
best_features.remove('FD_Pts')
best_features.remove('DK_Pts')

In [32]:
main_df_corr = pd.concat([main_df[best_features], main_df['FD_Pts'], main_df['DK_Pts']], axis = 1)

In [33]:
main_df_corr = main_df_corr.select_dtypes(include = ['int64', 'float64'])

In [34]:
main_df_corr.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 31 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AVG            1194 non-null   float64
 1   Avg_Outs       1194 non-null   float64
 2   BB/9           1194 non-null   float64
 3   Barrels        1194 non-null   float64
 4   Contact%       1194 non-null   float64
 5   ERA            1194 non-null   float64
 6   EV             1194 non-null   float64
 7   Events         1194 non-null   float64
 8   FIP            1194 non-null   float64
 9   GB/FB_Opp      1194 non-null   float64
 10  GS             1194 non-null   float64
 11  HR/9           1194 non-null   float64
 12  Hard%          1194 non-null   float64
 13  HardHit        1194 non-null   float64
 14  HardHit%       1194 non-null   float64
 15  ISO            1194 non-null   float64
 16  K%_Opp         1194 non-null   float64
 17  K/9            1194 non-null   float64
 18  K/BB    

In [35]:
main_df_corr = main_df_corr.corr()

In [36]:
main_df_corr

Unnamed: 0,AVG,Avg_Outs,BB/9,Barrels,Contact%,ERA,EV,Events,FIP,GB/FB_Opp,...,Opp_Proj_Runs,Proj_Run_Diff,SIERA,SwStr%,WHIP,W_Season,wOBA,wRAA,FD_Pts,DK_Pts
AVG,1.0,-0.303442,0.138826,0.005271,0.383572,0.752855,0.239057,-0.115392,0.502939,-0.025667,...,0.352175,-0.272878,0.416283,-0.384826,0.868081,-0.360772,-0.024585,-0.023582,-0.188413,-0.188274
Avg_Outs,-0.303442,1.0,-0.311532,0.420544,-0.054582,-0.391464,-0.069931,0.612915,-0.321236,-0.014156,...,-0.295952,0.257436,-0.210505,0.118038,-0.424446,0.582005,0.001726,-0.005434,0.310977,0.262932
BB/9,0.138826,-0.311532,1.0,-0.134571,-0.045415,0.36706,0.224107,-0.229973,0.363757,0.046314,...,0.227883,-0.191889,0.50508,-0.10999,0.598902,-0.302728,0.030532,0.045907,-0.105597,-0.10466
Barrels,0.005271,0.420544,-0.134571,1.0,0.078583,-0.025695,0.210342,0.80051,0.037648,0.00832,...,-0.053191,0.039808,0.045718,-0.023201,-0.089577,0.45915,-0.01449,-0.024967,0.151638,0.127026
Contact%,0.383572,-0.054582,-0.045415,0.078583,1.0,0.194991,0.093092,0.04424,0.381498,-0.013052,...,0.337526,-0.270092,0.629022,-0.959334,0.260911,-0.200348,-0.043407,-0.047309,-0.213668,-0.222541
ERA,0.752855,-0.391464,0.36706,-0.025695,0.194991,1.0,0.342418,-0.242516,0.72722,-0.014725,...,0.326969,-0.266334,0.434486,-0.256123,0.805671,-0.423023,-0.024335,-0.018997,-0.16446,-0.158013
EV,0.239057,-0.069931,0.224107,0.210342,0.093092,0.342418,1.0,-0.048022,0.410627,-0.064421,...,0.167865,-0.148096,0.204507,-0.089627,0.297788,-0.162736,-0.047944,-0.03796,-0.106841,-0.10742
Events,-0.115392,0.612915,-0.229973,0.80051,0.04424,-0.242516,-0.048022,1.0,-0.257441,-0.000418,...,-0.199668,0.169677,-0.088457,0.014237,-0.241282,0.717079,0.011774,0.00208,0.22139,0.186286
FIP,0.502939,-0.321236,0.363757,0.037648,0.381498,0.72722,0.410627,-0.257441,1.0,-0.011288,...,0.360442,-0.298394,0.633856,-0.434299,0.596992,-0.432529,-0.027734,-0.031795,-0.225234,-0.217198
GB/FB_Opp,-0.025667,-0.014156,0.046314,0.00832,-0.013052,-0.014725,-0.064421,-0.000418,-0.011288,1.0,...,-0.190146,0.242523,0.004352,0.016287,0.002639,-0.016803,-0.213104,-0.19713,0.105314,0.102034


In [37]:
# sns.set(font_scale = 1.5)
# plt.figure(figsize = (30, 20))
# sns.heatmap(main_df_corr, annot = True)

In [38]:
main_df = main_df.set_index(['Date', 'Name', 'Team'])

In [39]:
main_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1194 entries, ('2022-08-07', 'Tyler Anderson', 'LAD') to ('2022-06-20', 'Caleb Kilian', 'CHC')
Data columns (total 56 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   GS              1194 non-null   float64
 1   Avg_Outs        1194 non-null   float64
 2   W_Season        1194 non-null   float64
 3   ERA             1194 non-null   float64
 4   K/9             1194 non-null   float64
 5   BB/9            1194 non-null   float64
 6   K/BB            1194 non-null   float64
 7   HR/9            1194 non-null   float64
 8   AVG             1194 non-null   float64
 9   BABIP           1194 non-null   float64
 10  FIP             1194 non-null   float64
 11  SIERA           1194 non-null   float64
 12  WHIP            1194 non-null   float64
 13  GB/FB           1194 non-null   float64
 14  LD%             1194 non-null   float64
 15  HR/FB           1194 non-null   float64
 16  RS/9   

In [40]:
X = main_df[best_features]

In [41]:
X.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 1194 entries, ('2022-08-07', 'Tyler Anderson', 'LAD') to ('2022-06-20', 'Caleb Kilian', 'CHC')
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AVG            1194 non-null   float64
 1   Avg_Outs       1194 non-null   float64
 2   BB/9           1194 non-null   float64
 3   Barrels        1194 non-null   float64
 4   Contact%       1194 non-null   float64
 5   ERA            1194 non-null   float64
 6   EV             1194 non-null   float64
 7   Events         1194 non-null   float64
 8   FIP            1194 non-null   float64
 9   GB/FB_Opp      1194 non-null   float64
 10  GS             1194 non-null   float64
 11  HR/9           1194 non-null   float64
 12  Hard%          1194 non-null   float64
 13  HardHit        1194 non-null   float64
 14  HardHit%       1194 non-null   float64
 15  ISO            1194 non-null   float64
 16  K%_Opp         1194 non-n

In [42]:
y_FD = main_df['FD_Pts']
y_DK = main_df['DK_Pts']

In [43]:
y_FD

Date        Name              Team
2022-08-07  Tyler Anderson    LAD     40
            Zach Davies       ARI     24
            Kevin Gausman     TOR     37
            Triston McKenzie  CLE     58
            Jesus Luzardo     MIA     49
                                      ..
2022-06-20  Zach Davies       ARI     15
            Noah Syndergaard  LAA     22
            Alex Faedo        DET      7
            Jose Berrios      TOR     -3
            Caleb Kilian      CHC      1
Name: FD_Pts, Length: 1194, dtype: int64

In [44]:
X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AVG,Avg_Outs,BB/9,Barrels,Contact%,ERA,EV,Events,FIP,GB/FB_Opp,...,OPS,OpenOU,Opp_Proj_Runs,Proj_Run_Diff,SIERA,SwStr%,WHIP,W_Season,wOBA,wRAA
Date,Name,Team,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2022-08-07,Tyler Anderson,LAD,0.223,17.888889,1.79,17.0,76.9,2.89,85.1,336.0,3.36,1.24,...,0.699,9.0,4.275000,0.450000,3.95,12.1,1.03,12.0,0.309,-1.8
2022-08-07,Zach Davies,ARI,0.230,15.375000,3.07,19.0,79.7,4.28,86.1,250.0,4.58,1.40,...,0.710,8.0,3.525000,0.950000,4.52,8.7,1.22,2.0,0.312,3.3
2022-08-07,Kevin Gausman,TOR,0.269,16.750000,1.61,24.0,71.6,3.06,89.0,311.0,2.01,1.13,...,0.749,8.0,3.600000,0.800000,2.95,15.8,1.24,8.0,0.326,37.0
2022-08-07,Triston McKenzie,CLE,0.203,18.473684,2.40,33.0,76.4,3.38,90.2,320.0,4.10,1.05,...,0.741,8.0,4.400000,-0.800000,3.88,12.0,1.00,7.0,0.323,28.7
2022-08-07,Jesus Luzardo,MIA,0.172,14.571429,4.24,7.0,67.7,3.97,89.8,76.0,3.35,1.48,...,0.709,8.0,4.150000,-0.300000,3.31,13.9,1.09,2.0,0.310,-1.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-06-20,Zach Davies,ARI,0.234,15.923077,2.87,14.0,78.6,3.78,85.8,204.0,4.03,1.21,...,0.691,7.5,4.617188,-1.734375,4.22,9.2,1.20,2.0,0.304,-7.4
2022-06-20,Noah Syndergaard,LAA,0.237,15.300000,2.29,11.0,79.3,3.53,87.5,163.0,3.72,1.05,...,0.664,8.5,3.506250,1.487500,4.39,10.2,1.18,4.0,0.294,-21.7
2022-06-20,Alex Faedo,DET,0.278,15.000000,2.70,10.0,77.5,4.28,90.6,125.0,4.40,1.11,...,0.726,9.5,5.521875,-1.543750,4.40,11.1,1.40,1.0,0.317,11.1
2022-06-20,Jose Berrios,TOR,0.256,16.538462,2.13,26.0,80.8,4.65,90.8,219.0,4.45,1.28,...,0.642,8.5,4.090625,0.318750,4.03,9.3,1.24,5.0,0.285,-36.5


In [45]:
X_FD_train, X_FD_test, y_FD_train, y_FD_test = train_test_split(X, y_FD, test_size = 0.3, random_state = 45)
X_DK_train, X_DK_test, y_DK_train, y_DK_test = train_test_split(X, y_DK, test_size = 0.3, random_state = 45)

In [46]:
y_FD_train.head()

Date        Name              Team
2022-07-07  Tony Gonsolin     LAD     34
            Chase Silseth     LAA      6
2022-07-05  Jeffrey Springs   TBR     18
2022-06-21  Tony Gonsolin     LAD     27
2022-07-29  Brandon Woodruff  MIL     53
Name: FD_Pts, dtype: int64

In [47]:
y_FD_test.head()

Date        Name             Team
2022-07-16  Alex Cobb        SFG     41
2022-07-05  Jason Alexander  MIL      4
2022-07-08  Blake Snell      SDP     58
2022-06-23  Kyle Wright      ATL     22
2022-07-15  Sandy Alcantara  MIA     58
Name: FD_Pts, dtype: int64

In [48]:
y_DK_train.head()

Date        Name              Team
2022-07-07  Tony Gonsolin     LAD     19.35
            Chase Silseth     LAA      0.80
2022-07-05  Jeffrey Springs   TBR      8.80
2022-06-21  Tony Gonsolin     LAD     16.25
2022-07-29  Brandon Woodruff  MIL     30.65
Name: DK_Pts, dtype: float64

In [49]:
X_FD_train.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AVG,Avg_Outs,BB/9,Barrels,Contact%,ERA,EV,Events,FIP,GB/FB_Opp,...,OPS,OpenOU,Opp_Proj_Runs,Proj_Run_Diff,SIERA,SwStr%,WHIP,W_Season,wOBA,wRAA
Date,Name,Team,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2022-07-07,Tony Gonsolin,LAD,0.155,16.333333,2.53,13.0,74.1,1.54,88.2,206.0,3.29,1.3,...,0.722,8.5,2.523438,3.453125,3.69,12.7,0.82,10.0,0.318,14.3
2022-07-07,Chase Silseth,LAA,0.272,12.4,3.05,5.0,76.1,5.23,91.6,64.0,5.77,0.94,...,0.68,8.5,4.303125,-0.10625,4.19,10.6,1.4,1.0,0.298,-21.4
2022-07-05,Jeffrey Springs,TBR,0.213,14.9,1.95,12.0,72.2,2.25,87.3,156.0,3.33,1.41,...,0.783,9.5,5.10625,-0.7125,3.09,14.0,1.0,3.0,0.341,17.7
2022-06-21,Tony Gonsolin,LAD,0.147,15.833333,2.84,8.0,75.2,1.42,88.4,158.0,3.1,1.21,...,0.677,9.5,3.8,1.9,3.72,11.8,0.82,8.0,0.3,-13.3
2022-07-29,Brandon Woodruff,MIL,0.225,15.5,2.74,14.0,70.5,3.73,87.1,184.0,2.87,1.11,...,0.715,9.0,4.05,0.9,3.1,14.2,1.16,8.0,0.311,2.4


In [50]:
X_FD_test.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AVG,Avg_Outs,BB/9,Barrels,Contact%,ERA,EV,Events,FIP,GB/FB_Opp,...,OPS,OpenOU,Opp_Proj_Runs,Proj_Run_Diff,SIERA,SwStr%,WHIP,W_Season,wOBA,wRAA
Date,Name,Team,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
2022-07-16,Alex Cobb,SFG,0.267,14.538462,2.86,7.0,78.1,4.57,87.5,188.0,3.14,1.05,...,0.742,8.0,3.8,0.4,3.23,10.0,1.38,3.0,0.324,26.2
2022-07-05,Jason Alexander,MIL,0.315,16.8,3.82,8.0,87.2,3.82,88.7,109.0,4.35,1.3,...,0.721,8.5,3.878125,0.74375,5.31,5.3,1.7,2.0,0.318,14.2
2022-07-08,Blake Snell,SDP,0.232,15.125,5.13,9.0,68.8,5.13,89.6,105.0,3.7,0.91,...,0.715,7.5,3.5625,0.375,4.02,14.4,1.46,0.0,0.314,2.8
2022-06-23,Kyle Wright,ATL,0.213,18.384615,2.94,13.0,72.8,2.94,88.7,203.0,2.71,1.25,...,0.726,9.5,4.334375,0.83125,3.3,12.8,1.1,7.0,0.32,14.7
2022-07-15,Sandy Alcantara,MIA,0.191,21.722222,2.14,15.0,76.7,1.73,87.0,349.0,2.83,1.14,...,0.712,7.0,3.15,0.7,3.5,11.9,0.91,9.0,0.31,0.0


In [51]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor

In [52]:
from sklearn.metrics import mean_absolute_error

In [53]:
base_RF_FD = RandomForestRegressor()
base_RF_DK = RandomForestRegressor()
base_GB_FD = GradientBoostingRegressor()
base_GB_DK = GradientBoostingRegressor()
base_AB_FD = AdaBoostRegressor()
base_AB_DK = AdaBoostRegressor()
base_XGB_FD = XGBRegressor()
base_XGB_DK = XGBRegressor()

In [54]:
model_list_FD = [base_RF_FD, base_GB_FD, base_AB_FD, base_XGB_FD]
model_list_DK = [base_RF_DK, base_GB_DK, base_AB_DK, base_XGB_DK]

In [55]:
for model in model_list_FD:
    model_name = str(model)
    model.fit(X_FD_train, y_FD_train)
    y_FD_pred = model.predict(X_FD_test)
    print(f"{model_name} R_squared_train: {model.score(X_FD_train, y_FD_train)}")
    print(f"{model_name} R_squared_test: {model.score(X_FD_test, y_FD_test)}")
    print(f"{model_name} MAE: {mean_absolute_error(y_FD_test, y_FD_pred)}")
    print(f"{model_name} MSE: {mean_squared_error(y_FD_test, y_FD_pred)}")
    print(f"{model_name} RMSE: {mean_squared_error(y_FD_test, y_FD_pred, squared = False)}")
    print()
    

RandomForestRegressor() R_squared_train: 0.8833637734456335
RandomForestRegressor() R_squared_test: 0.18438273442964703
RandomForestRegressor() MAE: 11.853231197771589
RandomForestRegressor() MSE: 220.55579080779947
RandomForestRegressor() RMSE: 14.85112086031891

GradientBoostingRegressor() R_squared_train: 0.6104232312020995
GradientBoostingRegressor() R_squared_test: 0.19524171388340372
GradientBoostingRegressor() MAE: 11.693939614713633
GradientBoostingRegressor() MSE: 217.61935125227564
GradientBoostingRegressor() RMSE: 14.75192703521393

AdaBoostRegressor() R_squared_train: 0.30396307767331165
AdaBoostRegressor() R_squared_test: 0.14166444388308297
AdaBoostRegressor() MAE: 12.26182621398368
AdaBoostRegressor() MSE: 232.10749128199947
AdaBoostRegressor() RMSE: 15.23507437730448

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             i

In [56]:
for model in model_list_DK:
    model_name = str(model)
    model.fit(X_DK_train, y_DK_train)
    y_DK_pred = model.predict(X_DK_test)
    print(f"{model_name} R_squared_train: {model.score(X_DK_train, y_DK_train)}")
    print(f"{model_name} R_squared_test: {model.score(X_DK_test, y_DK_test)}")
    print(f"{model_name} MAE: {mean_absolute_error(y_DK_test, y_DK_pred)}")
    print(f"{model_name} MSE: {mean_squared_error(y_DK_test, y_DK_pred)}")
    print(f"{model_name} RMSE: {mean_squared_error(y_DK_test, y_DK_pred, squared = False)}")
    print()

RandomForestRegressor() R_squared_train: 0.8807542441253129
RandomForestRegressor() R_squared_test: 0.16291486691798074
RandomForestRegressor() MAE: 7.933452646239554
RandomForestRegressor() MSE: 101.62085333077994
RandomForestRegressor() RMSE: 10.080716905596542

GradientBoostingRegressor() R_squared_train: 0.6153030100084265
GradientBoostingRegressor() R_squared_test: 0.16727770998260483
GradientBoostingRegressor() MAE: 7.920454974156768
GradientBoostingRegressor() MSE: 101.09121086354006
GradientBoostingRegressor() RMSE: 10.054412507130392

AdaBoostRegressor() R_squared_train: 0.3088634485943669
AdaBoostRegressor() R_squared_test: 0.1203008648566759
AdaBoostRegressor() MAE: 8.29324632836876
AdaBoostRegressor() MSE: 106.79412792635813
AdaBoostRegressor() RMSE: 10.334124439271966

XGBRegressor(base_score=None, booster=None, colsample_bylevel=None,
             colsample_bynode=None, colsample_bytree=None,
             enable_categorical=False, gamma=None, gpu_id=None,
             imp

In [57]:
from sklearn.model_selection import GridSearchCV

In [58]:
#Tuned Random Forest
param_grid = {'max_depth': [3, 5, 7, 9], 'n_estimators': [100, 200, 300, 400, 500]}

In [59]:
# RF_FD_cv = GridSearchCV(base_RF_FD, param_grid, n_jobs = 2, verbose = 3)
# RF_FD_cv.fit(X_FD_train, y_FD_train)
# print(f"Tuned Random Forest FD Best Estimator: {RF_FD_cv.best_estimator_}")
# print(f"Tuned Random Forest FD Best Score: {RF_FD_cv.best_score_}")
# print(f"Tuned Random Forest FD Best Params: {RF_FD_cv.best_params_}")

In [60]:
best_RF_FD = RandomForestRegressor(max_depth = 7, n_estimators = 400, random_state= 45)
best_RF_FD.fit(X_FD_train, y_FD_train)
y_FD_pred_RF = best_RF_FD.predict(X_FD_test)
print(f"Best_RF_FD Train R-squared: {best_RF_FD.score(X_FD_train, y_FD_train)}")
print(f"Best_RF_FD Test R-squared: {best_RF_FD.score(X_FD_test, y_FD_test)}")
print(f"Best_RF_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred_RF)}")
print(f"Best_RF_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred_RF, squared = False)}")
print(f"Best_RF_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred_RF)}")

Best_RF_FD Train R-squared: 0.6137995194119052
Best_RF_FD Test R-squared: 0.19544276698096763
Best_RF_FD MAE: 11.862007684253589
Best_RF_FD RMSE: 14.750084180070916
Best_RF_FD MSE: 217.5649833191783


In [61]:
# RF_DK_cv = GridSearchCV(base_RF_DK, param_grid, n_jobs = 2, verbose = 3)
# RF_DK_cv.fit(X_DK_train, y_DK_train)
# print(f"Tuned Random Forest DK Best Estimator: {RF_DK_cv.best_estimator_}")
# print(f"Tuned Random Forest DK Best Score: {RF_DK_cv.best_score_}")
# print(f"Tuned Random Forest DK Best Params: {RF_DK_cv.best_params_}")

In [62]:
best_RF_DK = RandomForestRegressor(max_depth = 5, n_estimators = 200, random_state = 45)
best_RF_DK.fit(X_DK_train, y_DK_train)
y_DK_pred_RF = best_RF_DK.predict(X_DK_test)
print(f"Best_RF_DK Train R-squared: {best_RF_DK.score(X_DK_train, y_DK_train)}")
print(f"Best_RF_DK Test R-squared: {best_RF_DK.score(X_DK_test, y_DK_test)}")
print(f"Best_RF_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred_RF)}")
print(f"Best_RF_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred_RF, squared = False)}")
print(f"Best_RF_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred_RF)}")

Best_RF_DK Train R-squared: 0.4326139489537779
Best_RF_DK Test R-squared: 0.17410912885506435
Best_RF_DK MAE: 7.996768093274399
Best_RF_DK RMSE: 10.013085791770187
Best_RF_DK MSE: 100.26188707335001


In [63]:
# param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'max_depth' : [3, 5, 7, 9], 'learning_rate' : [0.01, 0.05, 0.1]}
# GB_FD_cv = GridSearchCV(base_GB_FD, param_grid, scoring = 'neg_mean_absolute_error', n_jobs = 2, verbose = 3)
# GB_FD_cv.fit(X_FD_train, y_FD_train)
# print(f"Tuned Gradient Boost FD Best Estimator: {GB_FD_cv.best_estimator_}")
# print(f"Tuned Gradient Boost FD Best Score: {GB_FD_cv.best_score_}")
# print(f"Tuned Gradient Boost FD Best Params: {GB_FD_cv.best_params_}")

In [64]:
best_GB_FD = GradientBoostingRegressor(max_depth = 3, n_estimators = 400, learning_rate = 0.01, random_state = 45)
best_GB_FD.fit(X_FD_train, y_FD_train)
y_FD_pred_GB = best_GB_FD.predict(X_FD_test)
print(f"Best_GB_FD Train R-squared: {best_GB_FD.score(X_FD_train, y_FD_train)}")
print(f"Best_GB_FD Test R-squared: {best_GB_FD.score(X_FD_test, y_FD_test)}")
print(f"Best_GB_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred_GB)}")
print(f"Best_GB_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred_GB, squared = False)}")
print(f"Best_GB_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred_GB)}")

Best_GB_FD Train R-squared: 0.4237408526504909
Best_GB_FD Test R-squared: 0.21070950593072524
Best_GB_FD MAE: 11.677006149114751
Best_GB_FD RMSE: 14.60947007861521
Best_GB_FD MSE: 213.43661597795312


In [65]:
# GB_DK_cv = GridSearchCV(base_GB_DK, param_grid, scoring = 'neg_mean_absolute_error', n_jobs = 2, verbose = 3)
# GB_DK_cv.fit(X_DK_train, y_DK_train)
# print(f"Tuned Gradient Boost DK Best Estimator: {GB_DK_cv.best_estimator_}")
# print(f"Tuned Gradient Boost DK Best Score: {GB_DK_cv.best_score_}")
# print(f"Tuned Gradient Boost DK Best Params: {GB_DK_cv.best_params_}")

In [66]:
best_GB_DK = GradientBoostingRegressor(max_depth = 3, n_estimators = 200, learning_rate = 0.01, random_state = 45)
best_GB_DK.fit(X_DK_train, y_DK_train)
y_DK_pred_GB = best_GB_DK.predict(X_DK_test)
print(f"Best_GB_DK Train R-squared: {best_GB_DK.score(X_DK_train, y_DK_train)}")
print(f"Best_GB_DK Test R-squared: {best_GB_DK.score(X_DK_test, y_DK_test)}")
print(f"Best_GB_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred_GB)}")
print(f"Best_GB_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred_GB, squared = False)}")
print(f"Best_GB_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred_GB)}")

Best_GB_DK Train R-squared: 0.32006207349661076
Best_GB_DK Test R-squared: 0.1680996110565296
Best_GB_DK MAE: 8.070778556875137
Best_GB_DK RMSE: 10.049449404407174
Best_GB_DK MSE: 100.99143333173969


In [67]:
# param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'learning_rate' : [0.001, 0.01, 0.05, 0.1]}
# AB_FD_cv = GridSearchCV(base_AB_FD, param_grid, scoring = 'neg_mean_absolute_error', n_jobs = 2, verbose = 3)
# AB_FD_cv.fit(X_FD_train, y_FD_train)
# print(f"Tuned Ada Boost FD Best Estimator: {AB_FD_cv.best_estimator_}")
# print(f"Tuned Ada Boost FD Best Score: {AB_FD_cv.best_score_}")
# print(f"Tuned Ada Boost FD Best Params: {AB_FD_cv.best_params_}")

In [68]:
best_AB_FD = AdaBoostRegressor(n_estimators = 400, learning_rate = 0.001, random_state = 45)
best_AB_FD.fit(X_FD_train, y_FD_train)
y_FD_pred_AB = best_AB_FD.predict(X_FD_test)
print(f"Best_AB_FD Train R-squared: {best_AB_FD.score(X_FD_train, y_FD_train)}")
print(f"Best_AB_FD Test R-squared: {best_AB_FD.score(X_FD_test, y_FD_test)}")
print(f"Best_AB_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred_AB)}")
print(f"Best_AB_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred_AB, squared = False)}")
print(f"Best_AB_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred_AB)}")

Best_AB_FD Train R-squared: 0.2458315039909943
Best_AB_FD Test R-squared: 0.18123749498031205
Best_AB_FD MAE: 12.00190696453063
Best_AB_FD RMSE: 14.87972826472704
Best_AB_FD MSE: 221.4063132321168


In [69]:
# AB_DK_cv = GridSearchCV(base_AB_DK, param_grid, scoring = 'neg_mean_absolute_error', n_jobs = 2, verbose = 3)
# AB_DK_cv.fit(X_DK_train, y_DK_train)
# print(f"Tuned Ada Boost DK Best Estimator: {AB_DK_cv.best_estimator_}")
# print(f"Tuned Ada Boost DK Best Score: {AB_DK_cv.best_score_}")
# print(f"Tuned Ada Boost DK Best Params: {AB_DK_cv.best_params_}")

In [70]:
best_AB_DK = AdaBoostRegressor(n_estimators = 500, learning_rate = 0.001, random_state = 45)
best_AB_DK.fit(X_DK_train, y_DK_train)
y_DK_pred_AB = best_AB_DK.predict(X_DK_test)
print(f"Best_AB_DK Train R-squared: {best_AB_DK.score(X_DK_train, y_DK_train)}")
print(f"Best_AB_DK Test R-squared: {best_AB_DK.score(X_DK_test, y_DK_test)}")
print(f"Best_AB_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred_AB)}")
print(f"Best_AB_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred_AB, squared = False)}")
print(f"Best_AB_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred_AB)}")

Best_AB_DK Train R-squared: 0.23174792189220084
Best_AB_DK Test R-squared: 0.15782851922888919
Best_AB_DK MAE: 8.11487041263312
Best_AB_DK RMSE: 10.111297055844528
Best_AB_DK MSE: 102.2383281515302


In [71]:
param_grid = {'n_estimators': [100, 200, 300, 400, 500], 'max_depth': [3, 5, 7, 9],\
              'learning_rate' : [0.001, 0.01, 0.05, 0.1], 'alpha': [0, 50, 100]}

In [72]:
# XGB_FD_cv = GridSearchCV(base_XGB_FD, param_grid, scoring = 'neg_mean_absolute_error', cv = 4, n_jobs = 2, verbose = 3)
# XGB_FD_cv.fit(X_FD_train, y_FD_train)
# print(f"Tuned XG Boost FD Best Estimator: {XGB_FD_cv.best_estimator_}")
# print(f"Tuned XG Boost FD Best Score: {XGB_FD_cv.best_score_}")
# print(f"Tuned XG Boost FD Best Params: {XGB_FD_cv.best_params_}")

In [73]:
best_XGB_FD = XGBRegressor(n_estimators = 400, learning_rate = 0.01, max_depth = 3, alpha = 0, random_state = 45)
best_XGB_FD.fit(X_FD_train, y_FD_train)
y_FD_pred_XGB = best_XGB_FD.predict(X_FD_test)
print(f"Best_XGB_FD Train R-squared: {best_XGB_FD.score(X_FD_train, y_FD_train)}")
print(f"Best_XGB_FD Test R-squared: {best_XGB_FD.score(X_FD_test, y_FD_test)}")
print(f"Best_XGB_FD MAE: {mean_absolute_error(y_FD_test, y_FD_pred_XGB)}")
print(f"Best_XGB_FD RMSE: {mean_squared_error(y_FD_test, y_FD_pred_XGB, squared = False)}")
print(f"Best_XGB_FD MSE: {mean_squared_error(y_FD_test, y_FD_pred_XGB)}")

Best_XGB_FD Train R-squared: 0.41410955342761724
Best_XGB_FD Test R-squared: 0.2025649334000077
Best_XGB_FD MAE: 11.689480996729603
Best_XGB_FD RMSE: 14.684653112314834
Best_XGB_FD MSE: 215.63903702901771


In [74]:
# XGB_DK_cv = GridSearchCV(base_XGB_DK, param_grid, scoring = 'neg_mean_absolute_error', cv = 4, n_jobs = 2, verbose = 3)
# XGB_DK_cv.fit(X_DK_train, y_DK_train)
# print(f"Tuned XG Boost DK Best Estimator: {XGB_DK_cv.best_estimator_}")
# print(f"Tuned XG Boost DK Best Score: {XGB_DK_cv.best_score_}")
# print(f"Tuned XG Boost DK Best Params: {XGB_DK_cv.best_params_}")

In [75]:
best_XGB_DK = XGBRegressor(n_estimators = 400, learning_rate = 0.01, max_depth = 3, alpha = 0, random_state = 45)
best_XGB_DK.fit(X_DK_train, y_DK_train)
y_DK_pred_XGB = best_XGB_DK.predict(X_DK_test)
print(f"Best_XGB_DK Train R-squared: {best_XGB_DK.score(X_DK_train, y_DK_train)}")
print(f"Best_XGB_DK Test R-squared: {best_XGB_DK.score(X_DK_test, y_DK_test)}")
print(f"Best_XGB_DK MAE: {mean_absolute_error(y_DK_test, y_DK_pred_XGB)}")
print(f"Best_XGB_DK RMSE: {mean_squared_error(y_DK_test, y_DK_pred_XGB, squared = False)}")
print(f"Best_XGB_DK MSE: {mean_squared_error(y_DK_test, y_DK_pred_XGB)}")

Best_XGB_DK Train R-squared: 0.4097689875162006
Best_XGB_DK Test R-squared: 0.17321853724363145
Best_XGB_DK MAE: 7.954145234540976
Best_XGB_DK RMSE: 10.018483095320285
Best_XGB_DK MSE: 100.3700035312183


In [76]:
best_XGB_FD

XGBRegressor(alpha=0, base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.01, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=400, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=45,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [77]:
#Live Predictions

In [78]:
today_pitch_df = pd.read_csv('pitch_ready4pred_2022_08_30.csv')

In [79]:
today_pitch_df.head()

Unnamed: 0.1,Unnamed: 0,Name,Team,Hand,DK_Sal,FD_Sal,Opponent,Opp_Proj_Runs,OpenOU,Proj_Run_Diff,...,Contact%,FIP,SIERA,GB/FB_Opp,K%_Opp,ISO,wRAA,wOBA,OPS,Date
0,0,Spenser Watkins,BAL,R,5800,7100,CLE,4.38,8.0,-0.76,...,81.5,4.26,4.81,1.18,17.6,0.14,6.7,0.313,0.716,2022-08-30
1,1,Cal Quantrill,CLE,R,7700,8500,BAL,3.62,8.0,0.76,...,83.5,4.32,4.62,0.97,22.5,0.155,-10.5,0.307,0.7,2022-08-30
2,2,Dakota Hudson,STL,R,6100,7300,CIN,4.2,10.5,2.1,...,83.1,4.41,5.11,1.26,24.2,0.137,-29.8,0.299,0.678,2022-08-30
3,3,Justin Dunn,CIN,R,5500,6700,STL,6.3,10.5,-2.1,...,78.4,7.32,5.34,1.06,20.4,0.164,51.9,0.328,0.748,2022-08-30
4,4,Shane McClanahan,TBR,L,11200,11200,MIA,2.67,6.5,1.16,...,67.1,2.65,2.5,1.11,28.6,0.115,-44.7,0.258,0.58,2022-08-30


In [80]:
today_pitch_df.drop(columns = ['Unnamed: 0'], inplace = True)

In [81]:
today_pitch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 36 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           30 non-null     object 
 1   Team           30 non-null     object 
 2   Hand           30 non-null     object 
 3   DK_Sal         30 non-null     int64  
 4   FD_Sal         30 non-null     int64  
 5   Opponent       30 non-null     object 
 6   Opp_Proj_Runs  30 non-null     float64
 7   OpenOU         30 non-null     float64
 8   Proj_Run_Diff  30 non-null     float64
 9   Avg_Outs       30 non-null     float64
 10  W_Season       30 non-null     int64  
 11  GS             30 non-null     int64  
 12  SwStr%         30 non-null     float64
 13  Events         30 non-null     int64  
 14  K/9            30 non-null     float64
 15  K/BB           30 non-null     float64
 16  HardHit        30 non-null     int64  
 17  Barrels        30 non-null     int64  
 18  Hard%       

In [82]:
today_pitch_df.drop(columns = ['Hand', 'Opponent'], inplace = True)

In [83]:
today_pitch_df[['Name', 'Team', 'K%_Opp']]

Unnamed: 0,Name,Team,K%_Opp
0,Spenser Watkins,BAL,17.6
1,Cal Quantrill,CLE,22.5
2,Dakota Hudson,STL,24.2
3,Justin Dunn,CIN,20.4
4,Shane McClanahan,TBR,28.6
5,Jesus Luzardo,MIA,21.4
6,Cole Irvin,OAK,19.9
7,Erick Fedde,WSN,23.6
8,Marcus Stroman,CHC,20.9
9,Kevin Gausman,TOR,23.1


In [84]:
today_pitch_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30 entries, 0 to 29
Data columns (total 34 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Name           30 non-null     object 
 1   Team           30 non-null     object 
 2   DK_Sal         30 non-null     int64  
 3   FD_Sal         30 non-null     int64  
 4   Opp_Proj_Runs  30 non-null     float64
 5   OpenOU         30 non-null     float64
 6   Proj_Run_Diff  30 non-null     float64
 7   Avg_Outs       30 non-null     float64
 8   W_Season       30 non-null     int64  
 9   GS             30 non-null     int64  
 10  SwStr%         30 non-null     float64
 11  Events         30 non-null     int64  
 12  K/9            30 non-null     float64
 13  K/BB           30 non-null     float64
 14  HardHit        30 non-null     int64  
 15  Barrels        30 non-null     int64  
 16  Hard%          30 non-null     float64
 17  BB/9           30 non-null     float64
 18  EV          

In [85]:
X_cols = list(X.columns)

In [86]:
today_pitch_df = today_pitch_df.set_index(['Name', 'Team', 'Date', 'DK_Sal', 'FD_Sal'])

In [87]:
today_pitch_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 30 entries, ('Spenser Watkins', 'BAL', '2022-08-30', 5800, 7100) to ('Logan Webb', 'SFG', '2022-08-30', 9100, 9400)
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Opp_Proj_Runs  30 non-null     float64
 1   OpenOU         30 non-null     float64
 2   Proj_Run_Diff  30 non-null     float64
 3   Avg_Outs       30 non-null     float64
 4   W_Season       30 non-null     int64  
 5   GS             30 non-null     int64  
 6   SwStr%         30 non-null     float64
 7   Events         30 non-null     int64  
 8   K/9            30 non-null     float64
 9   K/BB           30 non-null     float64
 10  HardHit        30 non-null     int64  
 11  Barrels        30 non-null     int64  
 12  Hard%          30 non-null     float64
 13  BB/9           30 non-null     float64
 14  EV             30 non-null     float64
 15  HardHit%       30 non-null     float64
 16  HR/9

In [88]:
df_cols = list(today_pitch_df.columns)

In [89]:
len(df_cols)

29

In [90]:
set(X_cols).difference(set(df_cols))

set()

In [91]:
y_FD_pred_GB.min(), y_FD_pred_GB.max()

(11.110432454933694, 44.96178476085973)

In [92]:
y_DK_pred_XGB.min(), y_DK_pred_XGB.max()

(3.1160052, 25.44302)

In [93]:
today_pitch_df = today_pitch_df[best_features]

In [94]:
today_pitch_df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 30 entries, ('Spenser Watkins', 'BAL', '2022-08-30', 5800, 7100) to ('Logan Webb', 'SFG', '2022-08-30', 9100, 9400)
Data columns (total 29 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AVG            30 non-null     float64
 1   Avg_Outs       30 non-null     float64
 2   BB/9           30 non-null     float64
 3   Barrels        30 non-null     int64  
 4   Contact%       30 non-null     float64
 5   ERA            30 non-null     float64
 6   EV             30 non-null     float64
 7   Events         30 non-null     int64  
 8   FIP            30 non-null     float64
 9   GB/FB_Opp      30 non-null     float64
 10  GS             30 non-null     int64  
 11  HR/9           30 non-null     float64
 12  Hard%          30 non-null     float64
 13  HardHit        30 non-null     int64  
 14  HardHit%       30 non-null     float64
 15  ISO            30 non-null     float64
 16  K%_O

In [95]:
y_FD_pred_GB = best_GB_FD.predict(today_pitch_df)
y_DK_pred_XGB = best_XGB_DK.predict(today_pitch_df)
# y_FD_pred_RF = best_RF_FD.predict(today_pitch_df)
# y_FD_pred_AB = best_AB_FD.predict(today_pitch_df)
# y_FD_pred_XGB = best_XGB_FD.predict(today_pitch_df)
# y_DK_pred_RF = best_RF_DK.predict(today_pitch_df)
# y_DK_pred_GB = best_GB_DK.predict(today_pitch_df)
# y_DK_pred_AB = best_AB_DK.predict(today_pitch_df)



# 'RandomForest_DK': best_RF_DK, 'GradientBoost_FD': best_GB_FD,\
#                'GradientBoost_DK': best_GB_DK, 'XGBoost_FD': best_XGB_FD, 'XGBoost_DK': best_XGB_DK}

In [96]:
y_FD_pred_GB

array([23.17531352, 28.47330844, 15.12998588, 14.02550158, 40.25156395,
       37.17041669, 20.24552429, 18.20803185, 23.63468729, 39.2856717 ,
       31.64863284, 24.2237527 , 36.55560702, 24.45794825, 16.31892134,
       32.35186508, 23.07160934, 23.18271781, 27.33944871, 24.85646587,
       29.29335211, 35.0869765 , 23.5966302 , 21.8068177 , 32.54406332,
       14.7858817 , 36.95431012, 28.25950264, 31.58539303, 22.27320147])

In [97]:
y_DK_pred_XGB

array([10.905191 , 13.6752205,  8.27438  ,  6.9913363, 21.779486 ,
       19.901232 ,  9.38379  ,  8.805973 , 11.59917  , 23.345814 ,
       16.765804 , 12.922014 , 20.53717  , 12.031065 ,  6.021964 ,
       17.178461 , 11.155189 , 11.609089 , 13.164487 , 12.565856 ,
       14.389056 , 19.132158 , 11.463988 , 10.03463  , 17.814806 ,
        6.285184 , 19.983734 , 14.075282 , 16.656454 , 11.252496 ],
      dtype=float32)

In [98]:
# model_dict = {}
# for k, v in best_models.items():
#     model_name = k
#     pred = v.predict(today_pitch_df)
#     model_dict.update({k:pred})

In [99]:
#model_dict

In [100]:
# for k, v in model_dict.items():
#     today_pitch_df[k] = v

In [101]:
#model_col_names = list(model_dict.keys())

In [102]:
today_pitch_df.reset_index(inplace=True)

In [103]:
today_pitch_df.set_index(['Name', 'Team', 'Date'], inplace = True)

In [104]:
today_pitch_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,DK_Sal,FD_Sal,AVG,Avg_Outs,BB/9,Barrels,Contact%,ERA,EV,Events,...,OPS,OpenOU,Opp_Proj_Runs,Proj_Run_Diff,SIERA,SwStr%,WHIP,W_Season,wOBA,wRAA
Name,Team,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Spenser Watkins,BAL,2022-08-30,5800,7100,0.261,14.0625,2.57,23,81.5,3.96,89.9,280,...,0.716,8.0,4.38,-0.76,4.81,8.8,1.32,4,0.313,6.7
Cal Quantrill,CLE,2022-08-30,7700,8500,0.246,17.73913,2.39,32,83.5,3.59,87.8,450,...,0.7,8.0,3.62,0.76,4.62,7.9,1.2,10,0.307,-10.5
Dakota Hudson,STL,2022-08-30,6100,7300,0.254,15.590909,4.08,24,83.1,4.23,89.5,378,...,0.678,10.5,4.2,2.1,5.11,7.3,1.39,7,0.299,-29.8
Justin Dunn,CIN,2022-08-30,5500,6700,0.26,13.333333,4.66,7,78.4,5.12,91.3,58,...,0.748,10.5,6.3,-2.1,5.34,9.3,1.5,1,0.328,51.9
Shane McClanahan,TBR,2022-08-30,11200,11200,0.184,18.434783,1.83,21,67.1,2.2,87.1,345,...,0.58,6.5,2.67,1.16,2.5,16.3,0.86,11,0.258,-44.7
Jesus Luzardo,MIA,2022-08-30,8700,9000,0.175,15.7,3.49,9,70.1,3.34,89.0,143,...,0.694,6.5,3.83,-1.16,3.46,13.8,1.01,3,0.305,-4.9
Cole Irvin,OAK,2022-08-30,7900,8600,0.226,18.454545,1.64,44,81.1,3.16,89.7,438,...,0.676,8.5,4.25,0.0,4.31,9.8,1.03,6,0.299,-13.5
Erick Fedde,WSN,2022-08-30,5700,6800,0.261,14.631579,4.42,22,82.4,4.88,89.1,304,...,0.614,8.5,4.25,0.0,4.97,7.4,1.51,5,0.272,-102.6
Marcus Stroman,CHC,2022-08-30,7400,7900,0.243,16.176471,2.51,21,80.2,4.1,90.6,290,...,0.753,7.5,4.48,-1.46,3.78,9.0,1.22,3,0.328,55.3
Kevin Gausman,TOR,2022-08-30,8700,10200,0.274,16.869565,1.47,29,72.4,3.15,89.2,382,...,0.704,7.5,3.02,1.46,3.0,15.5,1.26,9,0.31,-0.7


In [105]:
today_pitch_df = today_pitch_df[['DK_Sal', 'FD_Sal']]

In [106]:
today_pitch_df['FD_Pred'] = y_FD_pred_GB
today_pitch_df['DK_Pred'] = y_DK_pred_XGB

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  today_pitch_df['FD_Pred'] = y_FD_pred_GB
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  today_pitch_df['DK_Pred'] = y_DK_pred_XGB


In [107]:
today_pitch_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,DK_Sal,FD_Sal,FD_Pred,DK_Pred
Name,Team,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Spenser Watkins,BAL,2022-08-30,5800,7100,23.175314,10.905191
Cal Quantrill,CLE,2022-08-30,7700,8500,28.473308,13.67522
Dakota Hudson,STL,2022-08-30,6100,7300,15.129986,8.27438
Justin Dunn,CIN,2022-08-30,5500,6700,14.025502,6.991336
Shane McClanahan,TBR,2022-08-30,11200,11200,40.251564,21.779486
Jesus Luzardo,MIA,2022-08-30,8700,9000,37.170417,19.901232
Cole Irvin,OAK,2022-08-30,7900,8600,20.245524,9.38379
Erick Fedde,WSN,2022-08-30,5700,6800,18.208032,8.805973
Marcus Stroman,CHC,2022-08-30,7400,7900,23.634687,11.59917
Kevin Gausman,TOR,2022-08-30,8700,10200,39.285672,23.345814


In [108]:
today_pitch_df['FD_Val'] = today_pitch_df['FD_Sal']/today_pitch_df['FD_Pred']
today_pitch_df['DK_Val'] = today_pitch_df['DK_Sal']/today_pitch_df['DK_Pred']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  today_pitch_df['FD_Val'] = today_pitch_df['FD_Sal']/today_pitch_df['FD_Pred']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  today_pitch_df['DK_Val'] = today_pitch_df['DK_Sal']/today_pitch_df['DK_Pred']


In [109]:
today_pitch_df.sort_values(by = 'FD_Val')

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,DK_Sal,FD_Sal,FD_Pred,DK_Pred,FD_Val,DK_Val
Name,Team,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Lucas Giolito,CHW,2022-08-30,8200,8400,35.086977,19.132158,239.405068,428.59775
Jesus Luzardo,MIA,2022-08-30,8700,9000,37.170417,19.901232,242.128036,437.15887
George Kirby,SEA,2022-08-30,8500,9200,36.555607,20.53717,251.671378,413.883696
Kevin Gausman,TOR,2022-08-30,8700,10200,39.285672,23.345814,259.636645,372.657817
Jameson Taillon,NYY,2022-08-30,7700,8700,32.544063,17.814806,267.329863,432.224746
Kutter Crawford,BOS,2022-08-30,5500,6400,23.071609,11.155189,277.397207,493.044109
Shane McClanahan,TBR,2022-08-30,11200,11200,40.251564,21.779486,278.250058,514.245385
Chris Archer,MIN,2022-08-30,5900,6500,23.182718,11.609089,280.381276,508.222484
Jason Alexander,MIL,2022-08-30,5400,6200,21.806818,10.03463,284.314754,538.136443
Mitch Keller,PIT,2022-08-30,6200,7000,23.59663,11.463988,296.652528,540.823999


In [110]:
today_pitch_df.sort_values(by = 'FD_Pred', ascending = False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,DK_Sal,FD_Sal,FD_Pred,DK_Pred,FD_Val,DK_Val
Name,Team,Date,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Shane McClanahan,TBR,2022-08-30,11200,11200,40.251564,21.779486,278.250058,514.245385
Kevin Gausman,TOR,2022-08-30,8700,10200,39.285672,23.345814,259.636645,372.657817
Jesus Luzardo,MIA,2022-08-30,8700,9000,37.170417,19.901232,242.128036,437.15887
Aaron Nola,PHI,2022-08-30,10600,11000,36.95431,19.983734,297.664872,530.431396
George Kirby,SEA,2022-08-30,8500,9200,36.555607,20.53717,251.671378,413.883696
Lucas Giolito,CHW,2022-08-30,8200,8400,35.086977,19.132158,239.405068,428.59775
Jameson Taillon,NYY,2022-08-30,7700,8700,32.544063,17.814806,267.329863,432.224746
Max Fried,ATL,2022-08-30,9900,10500,32.351865,17.178461,324.55625,576.303078
Andrew Heaney,LAD,2022-08-30,9600,9700,31.648633,16.765804,306.490332,572.594063
Blake Snell,SDP,2022-08-30,9300,9500,31.585393,16.656454,300.771942,558.342127


In [111]:
filename = r'Pitching_Proj_2022_08_30'
today_pitch_df.to_csv(filename)

In [550]:
today = datetime.date.today()

In [551]:
yesterday = today - timedelta(days = 1)

In [552]:
yesterday_str = str(yesterday)

In [553]:
from bs4 import BeautifulSoup
import requests

In [554]:
#Getting yesterday's pitching points
url_string_page1 = 'https://www.fangraphs.com/leaders.aspx?pos=all&stats=sta&lg=all&qual=0&type=c,4,13,17,24,15,19,9,21,10&season=2022&month=1000&season1=2022&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate=' + yesterday_str + '&enddate=' + yesterday_str
r_page1 = requests.get(url_string_page1)
html_doc_page1 = r_page1.text
soup_obj_page1 = BeautifulSoup(html_doc_page1)
#Finding number of pages to scrape since there are only 30 rows per page
num_pages = int(soup_obj_page1.find_all('strong')[1].get_text()) + 1
print(num_pages)
col_names = []
headers = soup_obj_page1.find_all('th', class_ = 'rgHeader')
for header in headers:
    col_names.append(header.get_text())
all_data = []
data = soup_obj_page1.find_all('td', class_ = 'grid_line_regular')
for item in data:
    all_data.append(item.get_text())
print(len(all_data))
if num_pages > 2:
    for j in range(2, num_pages):
        temp_url_string = 'https://www.fangraphs.com/leaders.aspx?pos=all&stats=sta&lg=all&qual=0&type=c,4,13,17,24,15,19,9,21,10&season=2022&month=1000&season1=2022&ind=0&team=0&rost=0&age=0&filter=&players=0&startdate=' + yesterday_str + '&enddate=' + yesterday_str + '&page=' + str(j) + '_30'
        temp_r = requests.get(temp_url_string)
        temp_html_doc = temp_r.text
        temp_soup_obj = BeautifulSoup(temp_html_doc)
        temp_data = temp_soup_obj.find_all('td', class_ = 'grid_line_regular')
        for entry in temp_data:
            all_data.append(entry.get_text())
#Turning the list of data into an iterator before diving it into rows. Determining number of rows by dividing
#number of column names by length of data list
data_iter = iter(all_data)
num_rows = int(len(all_data)/len(col_names))
data_lists = []
for k in range(num_rows):
    temp_list = []
    for l in range(len(headers)):
        temp_list.append(next(data_iter))
    data_lists.append(temp_list)
print(len(data_lists))
SP_live_df = pd.DataFrame(data_lists, columns = col_names)

3
360
32


In [555]:
SP_live_df.head()

Unnamed: 0,#,Name,Team,W,IP,ER,SO,H,BB,CG,HBP,ShO
0,1,Justin Verlander,HOU,1,6.0,0,10,0,0,0,0,0
1,2,Pablo Lopez,MIA,1,6.0,0,5,4,2,0,0,0
2,3,Javier Assad,CHC,0,4.0,0,3,4,4,0,0,0
3,4,Jon Heasley,KCR,0,4.2,0,2,4,4,0,0,0
4,5,Max Fried,ATL,1,8.0,1,7,3,1,0,0,0


In [556]:
SP_live_cols = list(SP_live_df.columns)

In [557]:
SP_live_cols = SP_live_cols[3:]

In [558]:
SP_live_cols

['W', 'IP', 'ER', 'SO', 'H', 'BB', 'CG', 'HBP', 'ShO']

In [559]:
for col in SP_live_cols:
    SP_live_df.loc[:, col] = pd.to_numeric(SP_live_df.loc[:, col])

In [560]:
SP_live_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   #       32 non-null     object 
 1   Name    32 non-null     object 
 2   Team    32 non-null     object 
 3   W       32 non-null     int64  
 4   IP      32 non-null     float64
 5   ER      32 non-null     int64  
 6   SO      32 non-null     int64  
 7   H       32 non-null     int64  
 8   BB      32 non-null     int64  
 9   CG      32 non-null     int64  
 10  HBP     32 non-null     int64  
 11  ShO     32 non-null     int64  
dtypes: float64(1), int64(8), object(3)
memory usage: 3.1+ KB


In [561]:
yesterday_str = yesterday_str.replace('-', '_')

In [562]:
def split_innings(df):
    """Turns 'IP' variable into float then an int, creates 'Part_IP' variable for number after decimal point
    """
    df['IP'] = df['IP'].astype(float)
    df['Part_IP'] = round(df['IP'] % 1, 1)
    df['IP'] = df['IP'].astype(int)
    return df

In [563]:
def make_outs_col(df):
    """Completes process of turning 'IP' into 'Outs' by turning 'Part_IP' into 0, 1 or 2 and
    then multiplying IP by 3
    """
    conds = [df['Part_IP'] == 0, df['Part_IP'] == .1, df['Part_IP'] == .2]
    choices = [0, 1, 2]
    df['Part_IP'] = np.select(conds, choices)
    df['Outs'] = df['IP'] * 3 + df['Part_IP']   
    return df

In [564]:
SP_live_df = split_innings(SP_live_df)

In [565]:
SP_live_df = make_outs_col(SP_live_df)

In [566]:
SP_live_df.drop(columns = ['IP', 'Part_IP'], inplace = True)

In [567]:
SP_live_df['CGS'] = np.where(SP_live_df['CG'] + SP_live_df['ShO'] == 2, 1, 0)
SP_live_df['NH'] = np.where(((SP_live_df['CG'] == 1) & (SP_live_df['H'] == 0)), 1, 0)
SP_live_df['QS'] = np.where((SP_live_df['Outs'] >= 18) & (SP_live_df['ER'] <= 3), 1, 0)

In [568]:
SP_live_df['FD_Pts'] = (SP_live_df['W'] * 6) + (SP_live_df['QS'] * 4) - (SP_live_df['ER'] * 3) + (SP_live_df['SO'] * 3) + (SP_live_df['Outs'])
SP_live_df['DK_Pts'] = (SP_live_df['Outs'] * .75) + (SP_live_df['SO'] * 2) + (SP_live_df['W'] * 4) - (SP_live_df['ER'] * 2) -\
(SP_live_df['H'] * .6) - (SP_live_df['BB'] * .6) - (SP_live_df['HBP'] * .6) + (SP_live_df['CG'] * 2.5) + (SP_live_df['CGS'] * 2.5) +\
(SP_live_df['NH'] * 5)

In [569]:
SP_live_df = SP_live_df[['Name', 'FD_Pts', 'DK_Pts']]

In [570]:
today_pitch_df = pd.merge(today_pitch_df, SP_live_df, on = 'Name', how = 'left')

In [571]:
today_pitch_df

Unnamed: 0,Name,DK_Sal,FD_Sal,FD_Pred,DK_Pred,FD_Val,DK_Val,FD_Pts,DK_Pts
0,Nick Lodolo,9000,9400,27.028246,13.484868,347.784318,667.414762,19,10.4
1,Ranger Suarez,9200,9200,29.492905,14.900973,311.939426,617.409333,23,10.75
2,Max Fried,9400,10200,28.090427,16.988113,363.113029,553.32807,52,31.6
3,JT Brubaker,7200,8500,26.324148,13.086819,322.897445,550.171907,26,13.0
4,Dylan Cease,10000,10300,39.243883,23.716122,262.461286,421.654103,16,8.4
5,Austin Voth,6300,7600,25.448612,13.354377,298.641047,471.755447,26,12.75
6,Taijuan Walker,8300,8300,26.229989,14.021778,316.431702,591.936339,18,9.65
7,Frankie Montas,7900,8700,28.393209,14.769177,306.411299,534.897765,29,15.95
8,Carlos Rodon,11000,10800,33.394332,18.924931,323.408179,581.243876,58,34.75
9,Drew Hutchison,5500,6100,20.153359,8.681081,302.679079,633.561663,21,11.65


In [574]:
today_pitch_df['FD_Err'] = today_pitch_df['FD_Pts'] - today_pitch_df['FD_Pred']
today_pitch_df['DK_Err'] = today_pitch_df['DK_Pts'] - today_pitch_df['DK_Pred']

In [575]:
today_pitch_df

Unnamed: 0,Name,DK_Sal,FD_Sal,FD_Pred,DK_Pred,FD_Val,DK_Val,FD_Pts,DK_Pts,FD_Err,DK_Err
0,Nick Lodolo,9000,9400,27.028246,13.484868,347.784318,667.414762,19,10.4,-8.028246,-3.084868
1,Ranger Suarez,9200,9200,29.492905,14.900973,311.939426,617.409333,23,10.75,-6.492905,-4.150973
2,Max Fried,9400,10200,28.090427,16.988113,363.113029,553.32807,52,31.6,23.909573,14.611887
3,JT Brubaker,7200,8500,26.324148,13.086819,322.897445,550.171907,26,13.0,-0.324148,-0.086819
4,Dylan Cease,10000,10300,39.243883,23.716122,262.461286,421.654103,16,8.4,-23.243883,-15.316122
5,Austin Voth,6300,7600,25.448612,13.354377,298.641047,471.755447,26,12.75,0.551388,-0.604377
6,Taijuan Walker,8300,8300,26.229989,14.021778,316.431702,591.936339,18,9.65,-8.229989,-4.371778
7,Frankie Montas,7900,8700,28.393209,14.769177,306.411299,534.897765,29,15.95,0.606791,1.180823
8,Carlos Rodon,11000,10800,33.394332,18.924931,323.408179,581.243876,58,34.75,24.605668,15.825069
9,Drew Hutchison,5500,6100,20.153359,8.681081,302.679079,633.561663,21,11.65,0.846641,2.968919
