In [30]:
import sklearn.linear_model as model
import numpy as np
import pandas as pd
import pybaseball as pyb
from sklearn.metrics import r2_score

## Data Gathering

In [62]:
# Download the pitching stats for the 2023 and 2024 seasons.

# Minimum innings pitched, passed to pybaseball as `qual`. Change this to move
# the cutoff. The two seasons are inner-joined further down, so only pitchers
# who clear the threshold in *both* seasons end up in the analysis.
min_innings = 0

# Each season is requested on its own (start_season == end_season).
stats_2023, stats_2024 = (
    pyb.pitching_stats(start_season=season, end_season=season, qual=min_innings)
    for season in (2023, 2024)
)

In [63]:
# Columns pulled from the 2023 table. IDfg is an id used to make sure we are
# looking at the same player in 2023 as in 2024, and Name is the player's name.
# Most stats are self-explanatory, but a couple (BABIP and WPA) were thrown in
# for fun and are unlikely to show any correlation. For the full list of
# available stats, look at stats_2023.columns.
id_columns_2023 = ["IDfg", "Name"]
stat_columns_2023 = [
    "ERA", "ERA-", "FIP", "xERA", "xFIP", "SIERA",
    "CSW%", "K-BB%", "K%", "BB%",
    "WPA", "BABIP",
    "Zone%", "Contact%", "Swing%", "Soft%", "Hard%",
    "Stuff+", "Location+", "Pitching+",
]
f2023 = stats_2023.filter(items=id_columns_2023 + stat_columns_2023)

In [64]:
# From 2024 we only need the outcomes: ERA and ERA-. These should be relatively
# similar, but both are kept so either can be tried as the target. They get a
# "2024" prefix so they don't clash with the 2023 columns once the two tables
# are combined.
renamed_2024 = {"ERA": "2024ERA", "ERA-": "2024ERA-"}
f2024 = (
    stats_2024
    .filter(items=["IDfg", *renamed_2024])
    .rename(columns=renamed_2024)
)

In [65]:
# Join the 2023 stats with the 2024 outcomes into one big table.
# The join is an inner join on IDfg, so only pitchers who appear in both
# seasons' tables (i.e. met min_innings in both) are kept.
all_pitching = f2023.merge(f2024, how="inner", on="IDfg")
all_pitching

Unnamed: 0,IDfg,Name,ERA,ERA-,FIP,xERA,xFIP,SIERA,CSW%,K-BB%,K%,BB%,WPA,BABIP,Zone%,Contact%,Swing%,Soft%,Hard%,Stuff+,Location+,Pitching+,2024ERA,2024ERA-
0,10310,Zack Wheeler,3.61,82,3.15,3.18,3.54,3.53,0.275,0.220,0.269,0.050,1.94,0.292,0.421,0.740,0.511,0.214,0.313,114,107,122,2.57,62
1,27498,Spencer Strider,3.86,87,2.85,3.09,2.92,2.86,0.338,0.292,0.368,0.076,1.78,0.316,0.438,0.643,0.530,0.142,0.354,113,103,115,7.00,168
2,13125,Gerrit Cole,2.63,61,3.16,3.48,3.60,3.63,0.277,0.212,0.270,0.058,4.39,0.261,0.425,0.765,0.499,0.151,0.311,109,106,114,3.41,85
3,12768,Sonny Gray,2.79,65,2.83,3.66,3.64,3.95,0.279,0.170,0.243,0.073,3.01,0.295,0.422,0.759,0.474,0.143,0.300,96,101,100,3.84,95
4,19291,Zac Gallen,3.47,78,3.26,4.16,3.49,3.67,0.285,0.204,0.260,0.056,2.87,0.301,0.419,0.761,0.470,0.098,0.401,97,107,105,3.65,87
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,6893,Johnny Cueto,6.02,135,7.02,5.13,5.20,4.93,0.224,0.110,0.179,0.069,-0.91,0.236,0.408,0.831,0.523,0.161,0.323,93,110,103,7.15,174
607,20167,Kyle Muller,7.60,184,6.14,7.40,5.50,5.47,0.224,0.046,0.151,0.105,-2.54,0.372,0.453,0.811,0.482,0.113,0.391,93,97,91,4.01,104
608,13763,Dominic Leone,4.67,111,6.29,5.30,4.94,4.49,0.277,0.111,0.230,0.119,-0.27,0.255,0.365,0.684,0.507,0.086,0.391,100,97,98,6.63,164
609,14527,Jorge Lopez,5.95,138,5.76,5.41,4.69,4.35,0.277,0.102,0.184,0.083,-0.77,0.314,0.449,0.792,0.472,0.139,0.364,108,100,107,2.89,73


## Statistics time

In [66]:
# First, let's see how well last year's ERA predicts next year's ERA.
# Both columns are reshaped to (n_samples, 1) before fitting.
era_2023 = all_pitching["ERA"].to_numpy().reshape(-1, 1)
era_2024 = all_pitching["2024ERA"].to_numpy().reshape(-1, 1)
era2era = model.LinearRegression().fit(era_2023, era_2024)

In [67]:
# Predict 2024 ERA from the same 2023 ERAs the model was fit on (in-sample).
era_features = all_pitching["ERA"].to_numpy().reshape(-1, 1)
era_pred = era2era.predict(era_features)

In [68]:
# r2 of the ERA-only predictions against the actual 2024 ERAs.
actual_era_2024 = all_pitching["2024ERA"].to_numpy().reshape(-1, 1)
r2_score(actual_era_2024, era_pred)

0.02100884536335612

Wow, that's a small number! Let's see how every other stat does.

In [69]:
# Target shared by every single-stat fit: 2024 ERA as an (n_samples, 1) column.
y = all_pitching["2024ERA"].to_numpy().reshape(-1, 1)
stats = [
    "ERA", "ERA-", "FIP", "xERA", "xFIP", "SIERA",
    "CSW%", "K-BB%", "K%", "BB%",
    "WPA", "BABIP",
    "Zone%", "Contact%", "Swing%", "Soft%", "Hard%",
    "Stuff+", "Location+", "Pitching+",
]
# Maps each 2023 stat name to the r2 of its one-feature regression.
r2s = {}

for stat in stats:
    # One-column feature matrix holding just this stat.
    X = all_pitching[stat].to_numpy().reshape(-1, 1)
    reg = model.LinearRegression()
    reg.fit(X, y)

    # Scored on the same rows the model was fit on, so this is an in-sample r2.
    y_pred = reg.predict(X)
    r2 = r2_score(y, y_pred)

    r2s[stat] = r2
    print(f"{stat} has a r2 value of {r2}")


ERA has a r2 value of 0.02100884536335612
ERA- has a r2 value of 0.017492347058716384
FIP has a r2 value of 0.04272143364139169
xERA has a r2 value of 0.04062252803415434
xFIP has a r2 value of 0.03008001548307493
SIERA has a r2 value of 0.026818565586842036
CSW% has a r2 value of 0.024016800862309173
K-BB% has a r2 value of 0.03196087810616566
K% has a r2 value of 0.04091179418095048
BB% has a r2 value of 0.00036385916309178423
WPA has a r2 value of 0.0019764374004610508
BABIP has a r2 value of 0.0033954380388329097
Zone% has a r2 value of 0.002886407650614431
Contact% has a r2 value of 0.025411814335613125
Swing% has a r2 value of 9.728139925835322e-06
Soft% has a r2 value of 0.000773704084157778
Hard% has a r2 value of 0.01226904140248064
Stuff+ has a r2 value of 0.034521617191316745
Location+ has a r2 value of 0.015230943836014532
Pitching+ has a r2 value of 0.04660504194453208


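So the individual stats that won are SIERA and Stuff+ (note: in the output shown above, the highest r2 values actually belong to Pitching+ and FIP, so this sentence may come from a run with a different `min_innings`)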

Knowers of ball are probably not that surprised

Now let's see if using several of these stats together can lead us to a formula with a higher r2

In [51]:
# Fit a single model that uses several 2023 stats together to predict 2024 ERA.
stats = ["SIERA", "CSW%", "K%", "BB%", "Contact%", "Swing%", "Stuff+", "Location+"]

# Select the columns by direct indexing rather than .filter(items=...):
# filter silently skips any name that is not a column, which would leave X
# with fewer columns than `stats` and misalign each coefficient with its
# label. Indexing raises a KeyError instead, so a typo or missing stat fails
# loudly here.
X = all_pitching[stats].to_numpy()
# Target kept as an (n_samples, 1) column, so reg.coef_ comes back 2-D with
# one row (indexed as reg.coef_[0] when the coefficients are printed).
y = all_pitching["2024ERA"].to_numpy().reshape(-1, 1)

reg = model.LinearRegression().fit(X, y)
# Display the fitted coefficients (same order as `stats`) and the intercept.
reg.coef_, reg.intercept_

(array([[ 8.81062162e-01, -2.58419507e+00,  5.43002887e+00,
         -8.82854695e+00, -7.73513794e-01, -1.39407815e-01,
         -4.94452893e-02, -7.31716831e-03]]),
 array([6.72336924]))

In [52]:
# In-sample r2 of the multi-stat model: predictions are made on the same X
# that the model was fit on.
y_pred = reg.predict(X)
r2_score(y_true=y, y_pred=y_pred)

0.292406267013817

In [53]:
# reg.coef_ is 2-D with a single row (one target), so row 0 holds one
# coefficient per entry in `stats`, in the same order.
coefficients = reg.coef_[0]
for i, stat in enumerate(stats):
    coefficient = coefficients[i]
    print(f"the coefficient of {stat} is {coefficient}")

the coefficient of SIERA is 0.8810621619470753
the coefficient of CSW% is -2.5841950701528624
the coefficient of K% is 5.430028870655146
the coefficient of BB% is -8.828546945276168
the coefficient of Contact% is -0.7735137941706123
the coefficient of Swing% is -0.13940781547272943
the coefficient of Stuff+ is -0.04944528931724987
the coefficient of Location+ is -0.007317168311920412


In [71]:
# Check whether "David Bote" is in the merged table. DataFrame.size
# is the number of cells (rows * columns), so 0 means no row matched.
is_david_bote = all_pitching["Name"] == "David Bote"
all_pitching.loc[is_david_bote].size

0