# Multiple Linear Regression - Predicting Home Runs (HR)

In [None]:
import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pybaseball as pyb
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Display configuration: show every column and never truncate cell contents
# when a DataFrame is rendered.
pd.options.display.max_columns = None
pd.options.display.max_colwidth = None

In [None]:
# Training window: 2015 (start of the Statcast era) through the last completed
# season, i.e. the calendar year before today.
# `year` is kept as a notebook-level name because the test-set cell reuses it
# to fetch the current season.
today = datetime.date.today()
year = today.year

# NOTE(review): split_seasons=True presumably yields one row per player-season
# rather than one aggregated row per player -- confirm against pybaseball docs.
batterMetrics = pyb.batting_stats(
    start_season=2015,
    end_season=(year - 1),
    split_seasons=True,
)

In [None]:
# Preview the first rows only instead of rendering the whole frame; every
# column is still shown because display.max_columns is set to None.
batterMetrics.head()

In [None]:
# Starting feature set: the target (HR) plus every candidate predictor.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'K%', 'OBP', 'LD%', 'GB%', 'FB%',
    'IFFB%', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'Age', 'Pull%', 'Cent%',
    'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]

In [None]:
# Separate the response (HR) from the design matrix (all remaining columns).
y = batterStats['HR']
X = batterStats.drop(columns='HR')

In [None]:
# Baseline fit using every candidate predictor.
# NOTE(review): sm.OLS does not add an intercept by itself and X is passed
# without a constant column, so this is a regression through the origin --
# confirm that is intended before reading R-squared or the coefficients.
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit without FB%, which was removed for having the
# highest p-value.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'K%', 'OBP', 'LD%', 'GB%',
    'IFFB%', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'SwStr%', 'Age', 'Pull%', 'Cent%',
    'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: SwStr% removed next for its high p-value; refit.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'K%', 'OBP', 'LD%', 'GB%',
    'IFFB%', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'Age', 'Pull%', 'Cent%',
    'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with Pull% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'K%', 'OBP', 'LD%', 'GB%',
    'IFFB%', 'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with O-Swing% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'K%', 'OBP', 'LD%', 'GB%',
    'IFFB%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with K% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'IFFB%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Med%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with Med% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'IFFB%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with IFFB% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with Swing% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Z-Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'Zone%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with Zone% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Z-Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'EV', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with EV removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Z-Swing%', 'O-Contact%', 'Z-Contact%',
    'Contact%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with Z-Contact% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Z-Swing%', 'O-Contact%',
    'Contact%', 'F-Strike%', 'Age', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with Age removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Z-Swing%', 'O-Contact%',
    'Contact%', 'F-Strike%', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Backward elimination: refit with O-Contact% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Z-Swing%',
    'Contact%', 'F-Strike%', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Variance inflation factor for each predictor currently held in X,
# tabulated next to the column name.
# NOTE(review): X is passed without a constant column, so these VIFs come from
# auxiliary regressions with no intercept -- confirm that is intended.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
# The VIF table shows heavy multicollinearity. That is expected, because many
# of these percentage stats directly affect one another, but it makes the
# individual coefficients impossible to interpret.
# From here on predictors are pruned with VIF in view: refit without Contact%.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Z-Swing%', 'F-Strike%', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Recompute the variance inflation factors for the predictors now in X.
# NOTE(review): X has no constant column, so the auxiliary regressions behind
# these VIFs have no intercept -- confirm that is intended.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
# Refit with Z-Swing% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'F-Strike%', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%', 'maxEV',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Recompute the variance inflation factors for the predictors now in X.
# NOTE(review): X has no constant column, so the auxiliary regressions behind
# these VIFs have no intercept -- confirm that is intended.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
# Refit with maxEV removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'F-Strike%', 'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Recompute the variance inflation factors for the predictors now in X.
# NOTE(review): X has no constant column, so the auxiliary regressions behind
# these VIFs have no intercept -- confirm that is intended.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
# Refit with F-Strike% removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'OBP', 'LD%', 'GB%',
    'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Recompute the variance inflation factors for the predictors now in X.
# NOTE(review): X has no constant column, so the auxiliary regressions behind
# these VIFs have no intercept -- confirm that is intended.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
# Refit with OBP removed from the predictors.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'LD%', 'GB%',
    'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%',
    'HardHit%', 'CStr%', 'CSW%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()
print(results.summary())

In [None]:
# Recompute the variance inflation factors for the predictors now in X.
# NOTE(review): X has no constant column, so the auxiliary regressions behind
# these VIFs have no intercept -- confirm that is intended.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
# Final predictor set: refit with CSW% removed. `results` from this cell is
# the model later used for prediction.
# NOTE(review): X still has no constant column, so this is a regression
# through the origin; any change here must be mirrored wherever
# results.predict() is called.
batterStats = batterMetrics[[
    'HR', 'PA', 'BB%', 'LD%', 'GB%',
    'Cent%',
    'Oppo%', 'Soft%', 'Hard%', 'LA', 'Barrel%',
    'HardHit%', 'CStr%',
]]
y = batterStats['HR']
X = batterStats.drop(columns='HR')
model = sm.OLS(endog=y, exog=X)
results = model.fit()

# Variance inflation factors for the final predictor set.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [
        variance_inflation_factor(X.values, col_idx)
        for col_idx in range(X.shape[1])
    ],
})

print(vif_data)

In [None]:
# Regression summary for whatever model `results` currently holds; when the
# notebook is run top to bottom that is the fit with CSW% removed.
print(results.summary())

In [63]:
# Hold-out set: batting stats for the current calendar year only (`year`).
# NOTE(review): the variable name says 2024 but the season fetched follows
# `year`, so the name is only accurate when the notebook is run in 2024.
batterMetrics2024 = pyb.batting_stats(
    start_season=year,
    end_season=year,
    split_seasons=True,
)

In [64]:
# Score the current season with the fitted model and line predictions up
# against actual home runs.
# The column list must match the predictors used for the final fit.
batterStats2024 = batterMetrics2024[[
    'HR', 'PA', 'BB%', 'LD%', 'GB%', 'Cent%', 'Oppo%',
    'Soft%', 'Hard%', 'LA', 'Barrel%', 'HardHit%', 'CStr%',
]]

# Note: this rebinds the notebook-level X to the test-set predictors, so any
# earlier cell that reads X will see test data if re-run after this one.
X = batterStats2024.drop(columns='HR')
pred = results.predict(X)

# Diff > 0 means the model predicts more HR than the player has actually hit.
batterMetrics2024['predHR'] = pred
batterMetrics2024['Diff'] = batterMetrics2024['predHR'] - batterMetrics2024['HR']

# Keep identifying columns, actual/predicted HR and the model inputs only.
batterMetrics2024 = batterMetrics2024[[
    'Name', 'IDfg', 'Team', 'Age', 'HR', 'predHR', 'Diff',
    'PA', 'BB%', 'LD%', 'GB%', 'Cent%', 'Oppo%',
    'Soft%', 'Hard%', 'LA', 'Barrel%', 'HardHit%', 'CStr%',
]]

In [66]:
# Mean absolute difference between predicted and actual HR; the sign of Diff
# is discarded so over- and under-predictions do not cancel out.
abs(batterMetrics2024['Diff']).mean()

3.916573265675284

In [70]:
# The 20 hitters with the highest predicted HR totals.
(
    batterMetrics2024
    .sort_values('predHR', ascending=False)
    .head(20)
)

Unnamed: 0,Name,IDfg,Team,Age,HR,predHR,Diff,PA,BB%,LD%,GB%,Cent%,Oppo%,Soft%,Hard%,LA,Barrel%,HardHit%,CStr%
0,Aaron Judge,15640,NYY,32,32,42.307095,10.307095,396,0.159,0.194,0.33,0.361,0.256,0.07,0.485,17.7,0.276,0.614,0.152
18,Christian Walker,13419,ARI,33,22,29.919921,7.919921,388,0.093,0.147,0.365,0.353,0.214,0.135,0.401,16.9,0.167,0.484,0.135
1,Shohei Ohtani,19755,LAD,29,28,28.9827,0.9827,403,0.119,0.26,0.393,0.335,0.24,0.103,0.43,13.1,0.196,0.6,0.151
114,Cal Raleigh,21534,SEA,27,15,28.38224,13.38224,331,0.106,0.162,0.351,0.357,0.151,0.141,0.443,18.7,0.151,0.541,0.13
6,Marcell Ozuna,10324,ATL,33,23,27.719293,4.719293,371,0.113,0.243,0.356,0.331,0.205,0.117,0.406,15.5,0.18,0.548,0.125
13,Brent Rooker,19627,OAK,29,18,27.687145,9.687145,317,0.098,0.235,0.296,0.335,0.196,0.179,0.419,19.1,0.162,0.503,0.132
2,Juan Soto,20123,NYY,25,21,27.312956,6.312956,398,0.191,0.188,0.465,0.337,0.198,0.12,0.399,9.9,0.182,0.57,0.191
33,Corey Seager,13624,TEX,30,16,26.923757,10.923757,330,0.106,0.155,0.416,0.326,0.249,0.103,0.451,13.6,0.159,0.489,0.094
107,Adolis Garcia,19287,TEX,31,16,24.233558,8.233558,353,0.076,0.176,0.383,0.308,0.278,0.185,0.361,14.8,0.141,0.476,0.114
117,Shea Langeliers,25816,OAK,26,17,23.538866,6.538866,291,0.055,0.148,0.407,0.418,0.148,0.18,0.349,14.8,0.164,0.455,0.163


In [None]:
# Persist the current-season predictions to disk.
# NOTE(review): the file name says "Pitching" but this frame holds batting
# stats and HR predictions -- confirm whether downstream consumers rely on
# this path before renaming it.
batterMetrics2024.to_csv(path_or_buf='currYearPitching.csv')