In [30]:
import sklearn.linear_model as model
import numpy as np
import pandas as pd
import pybaseball as pyb
from sklearn.metrics import r2_score

## Data Gathering

In [18]:
# Pull the season-level pitching tables for 2023 and 2024.

# Innings-pitched threshold handed to `qual` for both downloads. Lower it to
# include more pitchers; only pitchers who clear it in *both* seasons survive
# the later inner join.
min_innings = 80

stats_2024 = pyb.pitching_stats(
    start_season=2024,
    end_season=2024,
    qual=min_innings,
)
stats_2023 = pyb.pitching_stats(
    start_season=2023,
    end_season=2023,
    qual=min_innings,
)

In [19]:
# The 2023 columns to study.
#   - IDfg is the player id used to match the same player across 2023 and 2024.
#   - Name is the player's name.
#   - Most of the stats are self-explanatory; BABIP and WPA are thrown in for
#     fun and are unlikely to show any correlation.
# For the full list of available stats, look at stats_2023.columns.
f2023 = stats_2023.filter(
    items=[
        "IDfg", "Name",
        "ERA", "ERA-", "FIP", "xERA", "xFIP", "SIERA",
        "CSW%", "K-BB%", "K%", "BB%",
        "WPA", "BABIP",
        "Zone%", "Contact%", "Swing%", "Soft%", "Hard%",
        "Stuff+", "Location+", "Pitching+",
    ]
)

In [20]:
# From 2024 we only keep the targets: ERA and ERA- (expected to behave
# relatively similarly, but both are kept to try each one). They are renamed
# with a "2024" prefix so they do not collide with the 2023 columns when the
# two tables are combined.
f2024 = (
    stats_2024
    .filter(items=["IDfg", "ERA", "ERA-"])
    .rename(
        columns={
            "ERA": "2024ERA",
            "ERA-": "2024ERA-",
        }
    )
)

In [21]:
# Join the 2023 predictors with the 2024 targets on the player id. The inner
# join keeps only pitchers present in both tables, i.e. those who reached
# min_innings in both seasons.

all_pitching = f2023.merge(f2024, how="inner", on="IDfg")
all_pitching

Unnamed: 0,IDfg,Name,ERA,ERA-,FIP,xERA,xFIP,SIERA,CSW%,K-BB%,K%,BB%,WPA,BABIP,Zone%,Contact%,Swing%,Soft%,Hard%,Stuff+,Location+,Pitching+,2024ERA,2024ERA-
0,10310,Zack Wheeler,3.61,82,3.15,3.18,3.54,3.53,0.275,0.220,0.269,0.050,1.94,0.292,0.421,0.740,0.511,0.214,0.313,114,107,122,2.57,62
1,13125,Gerrit Cole,2.63,61,3.16,3.48,3.60,3.63,0.277,0.212,0.270,0.058,4.39,0.261,0.425,0.765,0.499,0.151,0.311,109,106,114,3.41,85
2,12768,Sonny Gray,2.79,65,2.83,3.66,3.64,3.95,0.279,0.170,0.243,0.073,3.01,0.295,0.422,0.759,0.474,0.143,0.300,96,101,100,3.84,95
3,19291,Zac Gallen,3.47,78,3.26,4.16,3.49,3.67,0.285,0.204,0.260,0.056,2.87,0.301,0.419,0.761,0.470,0.098,0.401,97,107,105,3.65,87
4,14107,Kevin Gausman,3.16,75,2.97,3.85,3.22,3.34,0.296,0.239,0.311,0.072,2.21,0.321,0.399,0.737,0.492,0.165,0.369,101,104,107,3.83,96
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
97,13273,Ross Stripling,5.36,126,5.21,5.59,3.98,4.13,0.280,0.142,0.184,0.042,-1.38,0.308,0.433,0.785,0.481,0.150,0.328,92,112,104,6.01,155
98,27646,Luis L. Ortiz,4.78,107,5.57,6.86,5.38,5.62,0.245,0.028,0.148,0.120,-0.94,0.309,0.394,0.793,0.454,0.127,0.412,95,92,89,3.32,79
99,6632,Carlos Carrasco,6.80,162,5.86,6.77,4.93,5.16,0.261,0.067,0.158,0.091,-2.32,0.336,0.382,0.765,0.466,0.130,0.352,88,96,89,5.64,142
100,15890,Luis Severino,6.65,155,6.14,5.89,4.83,4.78,0.239,0.108,0.189,0.082,-1.69,0.326,0.447,0.822,0.513,0.144,0.346,102,101,101,3.91,99


## Statistics time

In [26]:
# First, let's see how well last year's ERA predicts next year's ERA.
# Both inputs are passed as (n, 1) NumPy column arrays.
era2era = model.LinearRegression().fit(
    X=all_pitching[["ERA"]].to_numpy(),
    y=all_pitching[["2024ERA"]].to_numpy(),
)

In [29]:
# Predicted 2024 ERA for every pitcher, from their 2023 ERA as an (n, 1) array.
era_pred = era2era.predict(all_pitching[["ERA"]].to_numpy())

In [31]:
# R^2 of the ERA -> 2024 ERA predictions against the actual 2024 ERA.
# era_pred was produced for the same pitchers the model was fit on, so this is
# an in-sample score.
r2_score(all_pitching[["2024ERA"]].to_numpy(), era_pred)

0.07518875484083554

Wow, that's a small number! Let's see how every other stat does.

In [33]:
# Target for every regression: 2024 ERA as an (n, 1) column array.
y = all_pitching[["2024ERA"]].to_numpy()

# 2023 stats to try, each one on its own as the single predictor.
stats = [
    "ERA", "ERA-", "FIP", "xERA", "xFIP", "SIERA",
    "CSW%", "K-BB%", "K%", "BB%",
    "WPA", "BABIP",
    "Zone%", "Contact%", "Swing%", "Soft%", "Hard%",
    "Stuff+", "Location+", "Pitching+",
]

# stat name -> R^2 of its one-feature linear fit, scored on the same rows it
# was fit on (in-sample).
r2s = {}

for stat in stats:
    # One predictor column, shaped (n, 1) as scikit-learn expects.
    X = all_pitching[[stat]].to_numpy()

    reg = model.LinearRegression()
    reg.fit(X, y)

    y_pred = reg.predict(X)
    r2 = r2_score(y, y_pred)

    r2s[stat] = r2
    print(f"{stat} has a r2 value of {r2}")


ERA has a r2 value of 0.07518875484083554
ERA- has a r2 value of 0.07003256633144583
FIP has a r2 value of 0.13205893213455322
xERA has a r2 value of 0.1377941313535591
xFIP has a r2 value of 0.18471672475041645
SIERA has a r2 value of 0.21886739274300016
CSW% has a r2 value of 0.1543128386635808
K-BB% has a r2 value of 0.19821510566062306
K% has a r2 value of 0.1897346839723294
BB% has a r2 value of 0.007876687438809093
WPA has a r2 value of 0.04032490031922642
BABIP has a r2 value of 0.00995319796175842
Zone% has a r2 value of 0.02039152797721644
Contact% has a r2 value of 0.10399179394641767
Swing% has a r2 value of 0.015056609193781156
Soft% has a r2 value of 0.02076643005479839
Hard% has a r2 value of 0.019726914918756
Stuff+ has a r2 value of 0.2411923288957616
Location+ has a r2 value of 0.0040368894799541755
Pitching+ has a r2 value of 0.18441842486546878
