In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import pitching_stats


In [2]:
# Setting parameters for date search
START = 2015
END = 2022

In [3]:
if os.path.exists("pitching.csv"):
    pitching = pd.read_csv("pitching.csv", index_col=0)
else:
    pitching = pitching_stats(START, END)
    pitching.to_csv("pitching.csv")

In [4]:
pitching = pitching.groupby("IDfg",group_keys=False).filter(lambda x: x.shape[0]>1)

In [5]:
pitching

Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,G,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xERA
2,10954,2018,Jacob deGrom,NYM,30,10,9,9.0,1.70,32,...,11.1,20,0.039,112.9,148,0.287,515,0.164,0.315,
10,2036,2015,Clayton Kershaw,LAD,27,16,7,8.6,2.13,33,...,8.9,15,0.028,113.3,132,0.244,542,0.164,0.323,
75,10603,2017,Chris Sale,BOS,28,17,8,7.6,2.90,32,...,15.0,27,0.055,112.7,149,0.303,492,0.183,0.332,
32,13125,2019,Gerrit Cole,HOU,28,20,5,7.5,2.50,33,...,14.0,25,0.057,115.2,156,0.355,440,0.171,0.339,
36,3137,2018,Max Scherzer,WSN,33,18,7,7.5,2.53,33,...,20.6,34,0.068,111.1,159,0.316,503,0.165,0.327,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311,15467,2022,Marco Gonzales,SEA,30,10,15,0.1,4.13,32,...,14.5,45,0.072,111.2,215,0.345,623,0.151,0.235,
434,7593,2021,Jordan Lyles,TEX,30,10,13,0.1,5.15,32,...,15.3,51,0.091,115.5,233,0.416,560,0.158,0.263,
451,15474,2018,Lucas Giolito,CHW,23,10,13,0.1,6.13,32,...,13.3,39,0.072,114.4,201,0.369,545,0.171,0.254,
452,15440,2020,Matthew Boyd,DET,29,3,7,-0.1,6.71,12,...,15.4,20,0.109,110.8,68,0.370,184,0.138,0.263,


In [6]:
def next_season(player):
    player = player.sort_values("Season")
    player["Next_WAR"] = player ["WAR"].shift(-1)
    return player

pitching = pitching.groupby("IDfg",group_keys=False).apply(next_season)


In [7]:
pitching

Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,G,...,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xERA,Next_WAR
319,375,2015,Bartolo Colon,NYM,42,14,13,2.5,4.16,33,...,33,0.051,113.4,227,0.349,651,0.212,0.276,,2.4
175,375,2016,Bartolo Colon,NYM,43,15,8,2.4,3.43,34,...,29,0.046,117.1,232,0.369,628,0.210,0.265,,
408,404,2015,CC Sabathia,NYY,34,6,10,1.2,4.73,29,...,23,0.043,115.3,166,0.311,533,0.185,0.276,,2.6
267,404,2016,CC Sabathia,NYY,35,9,12,2.6,3.91,30,...,22,0.041,111.1,146,0.269,542,0.176,0.275,,
111,1118,2015,Marco Estrada,TOR,31,13,8,1.8,3.13,34,...,24,0.045,112.8,164,0.307,534,0.147,0.246,,2.7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
135,19374,2019,Walker Buehler,LAD,24,14,4,5.1,3.26,30,...,21,0.044,111.3,174,0.364,478,0.180,0.301,,5.5
29,19374,2021,Walker Buehler,LAD,26,16,4,5.5,2.47,33,...,37,0.068,114.4,201,0.369,545,0.181,0.296,,
137,19427,2019,Shane Bieber,CLE,24,15,8,5.5,3.28,34,...,41,0.074,113.5,231,0.417,554,0.185,0.325,,3.2
0,19427,2020,Shane Bieber,CLE,25,8,1,3.2,1.63,12,...,11,0.072,112.9,66,0.431,153,0.167,0.338,,4.9


In [8]:
null_count = pitching.isnull().sum()

In [9]:
complete_cols = list(pitching.columns[null_count==0])

In [10]:
pitching = pitching[complete_cols + ["Next_WAR"]].copy()

In [11]:
pitching.isnull().sum()

IDfg          0
Season        0
Name          0
Team          0
Age           0
           ... 
HardHit%      0
Events        0
CStr%         0
CSW%          0
Next_WAR    112
Length: 161, dtype: int64

In [12]:
pitching.dtypes[pitching.dtypes=="object"]

Name       object
Team       object
Dollars    object
Age Rng    object
dtype: object

In [13]:
del pitching["Age Rng"]
del pitching["Dollars"]

In [14]:
pitching["team_code"]= pitching["Team"].astype("category").cat.codes

In [15]:
pitching_full = pitching.copy()
pitching = pitching.dropna().copy()

In [16]:
pitching

Unnamed: 0,IDfg,Season,Name,Team,Age,W,L,WAR,ERA,G,...,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Next_WAR,team_code
319,375,2015,Bartolo Colon,NYM,42,14,13,2.5,4.16,33,...,33,0.051,113.4,227,0.349,651,0.212,0.276,2.4,18
408,404,2015,CC Sabathia,NYY,34,6,10,1.2,4.73,29,...,23,0.043,115.3,166,0.311,533,0.185,0.276,2.6,19
111,1118,2015,Marco Estrada,TOR,31,13,8,1.8,3.13,34,...,24,0.045,112.8,164,0.307,534,0.147,0.246,2.7,29
182,1118,2016,Marco Estrada,TOR,32,9,9,2.7,3.48,29,...,33,0.067,113.2,154,0.315,489,0.167,0.276,2.6,29
266,1245,2015,R.A. Dickey,TOR,40,11,11,1.8,3.91,33,...,26,0.038,110.9,208,0.303,686,0.163,0.254,0.8,29
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52,19291,2020,Zac Gallen,ARI,24,3,2,1.4,2.75,12,...,13,0.071,109.5,59,0.324,182,0.195,0.316,4.3,1
24,19361,2021,Corbin Burnes,MIL,26,11,5,7.5,2.43,28,...,12,0.031,114.2,117,0.305,383,0.172,0.338,4.6,16
135,19374,2019,Walker Buehler,LAD,24,14,4,5.1,3.26,30,...,21,0.044,111.3,174,0.364,478,0.180,0.301,5.5,14
137,19427,2019,Shane Bieber,CLE,24,15,8,5.5,3.28,34,...,41,0.074,113.5,231,0.417,554,0.185,0.325,3.2,8


In [17]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

sfs=SequentialFeatureSelector(rr,n_features_to_select=10,direction = "forward", cv=split, n_jobs=8)

In [18]:
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = pitching.columns[~pitching.columns.isin(removed_columns)]

In [19]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
pitching.loc[:,selected_columns]=scaler.fit_transform(pitching[selected_columns])

  pitching.loc[:,selected_columns]=scaler.fit_transform(pitching[selected_columns])


In [20]:
sfs.fit(pitching[selected_columns],pitching["Next_WAR"])

In [21]:
predictors = list(selected_columns[sfs.get_support()])
predictors

['WAR',
 'IFH',
 'HR/FB',
 'tERA',
 'FBv',
 'xFIP-',
 'BB%',
 'Zone% (sc)',
 'Hard%',
 'Pull%+']

In [22]:
def backtest(data,model,predictors,start=3,step=1):
    all_predictions=[]
    
    years=sorted(data["Season"].unique())
    
    for i in range(start, len(years),step):
        current_year=years[i]
        train=data[data["Season"]<current_year]
        test=data[data["Season"]==current_year]
        
        model.fit(train[predictors],train["Next_WAR"])
        
        preds=model.predict(test[predictors])
        preds=pd.Series(preds,index=test.index)
        combined=pd.concat([test["Next_WAR"],preds],axis=1)
        combined.columns=["actual","prediction"]
        
        all_predictions.append(combined)
        
    return pd.concat(all_predictions)

In [23]:
predictions = backtest(pitching, rr, predictors)

In [24]:
predictions

Unnamed: 0,actual,prediction
127,5.3,2.394819
324,2.1,1.798846
73,3.0,3.836649
337,1.8,2.683320
36,6.5,5.132629
...,...,...
392,6.3,3.457844
269,4.4,3.981179
122,5.7,4.027255
328,1.4,2.718562


In [25]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.4385903598074226

In [26]:
pitching["Next_WAR"].describe()

count    255.000000
mean       3.065490
std        1.693871
min       -1.000000
25%        1.900000
50%        2.800000
75%        4.150000
max        9.000000
Name: Next_WAR, dtype: float64

In [27]:
2.4385903598074217 ** 0.5

1.5615986551631702

In [28]:
def player_history(df):
    df = df.sort_values("Season")
    
    df["player_season"] = range(0,df.shape[0])
    
    df["war_corr"] = list(df[["player_season","WAR"]].expanding().corr().loc[(slice(None),"player_season"),"WAR"])
    
    df ["war_corr"].fillna(0,inplace=True)
    
    df["war_diff"]=df["WAR"]/df["WAR"].shift(1)
    df["war_diff"].fillna(1,inplace=True)
    df["war_diff"][df["war_diff"]==np.inf] = 1
    
    return df

pitching = pitching.groupby("IDfg",group_keys=False).apply(player_history)

In [29]:
def group_averages(df):
    return df["WAR"]/df["WAR"].mean()

pitching["war_season"] = pitching.groupby("Season", group_keys=False).apply(group_averages)

In [30]:
new_predictors = predictors + ["player_season", "war_season"]

In [31]:
predictions = backtest(pitching,rr, new_predictors)

In [32]:
mean_squared_error(predictions["actual"],predictions["prediction"])

2.468286370804577

I have learned that, apparently, adding player history makes the model worse for pitchers. Pitching WAR is better explained by prior season stats without adding player history and WAR information.