In [17]:
import warnings

import numpy as np
import pandas as pd
import pybaseball
from pybaseball import batting_stats
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import ConvergenceWarning
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LassoCV
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor

pybaseball.cache.enable()

In [18]:
# Load batting stats (FanGraphs data fetched through pybaseball).
# The local CSV acts as a cache: it is read when present, otherwise the
# 2017-2023 seasons (qual=150) are downloaded and written to it first, so the
# notebook still runs from a fresh checkout.
try:
    batting = pd.read_csv("batting.csv")
except FileNotFoundError:
    batting_stats(2017, 2023, qual=150).to_csv("batting.csv")
    # Re-read instead of using the fetched frame directly so the schema is the
    # same on both paths (the CSV round-trip adds the 'Unnamed: 0' index
    # column that a later cell drops).
    batting = pd.read_csv("batting.csv")

In [19]:
# Keep only players (IDfg) that appear in more than one season row
seasons_per_player = batting.groupby("IDfg")["IDfg"].transform("size")
batting = batting[seasons_per_player > 1]
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,118.4,246,0.609,404,0.169,0.287,,,,11.6
1,4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,110.6,217,0.5,434,0.22,0.27,,,,10.4
2,6,10155,2018,Mike Trout,LAA,26,140,471,608,147,...,118.0,162,0.46,352,0.201,0.261,,,,9.5
3,18,18401,2023,Ronald Acuna Jr.,ATL,25,159,643,735,217,...,121.2,309,0.55,562,0.137,0.214,,,,8.4
4,15,15640,2017,Aaron Judge,NYY,25,155,542,678,154,...,121.1,186,0.55,338,0.157,0.29,,,,8.7


In [20]:
# Count missing values per column (the models below cannot be fit on NaNs)
null_variables = batting.isna().sum()
null_variables

Unnamed: 0       0
IDfg             0
Season           0
Name             0
Team             0
              ... 
CSW%             0
xBA           2388
xSLG          2388
xwOBA         2388
L-WAR            0
Length: 321, dtype: int64

In [21]:
# Names of the columns that contain no missing values at all
full_variables = batting.columns[(null_variables == 0).to_numpy()].tolist()
# Restrict the frame to those complete columns (copy so later edits are safe)
batting = batting.loc[:, full_variables].copy()
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR
0,2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,14.9,106,0.262,118.4,246,0.609,404,0.169,0.287,11.6
1,4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,18.5,57,0.131,110.6,217,0.5,434,0.22,0.27,10.4
2,6,10155,2018,Mike Trout,LAA,26,140,471,608,147,...,18.6,54,0.153,118.0,162,0.46,352,0.201,0.261,9.5
3,18,18401,2023,Ronald Acuna Jr.,ATL,25,159,643,735,217,...,7.4,86,0.153,121.2,309,0.55,562,0.137,0.214,8.4
4,15,15640,2017,Aaron Judge,NYY,25,155,542,678,154,...,15.8,84,0.249,121.1,186,0.55,338,0.157,0.29,8.7


In [22]:
# Drop the object-dtype columns other than Name (Dol, Team, Age Rng) together
# with the saved-index column, the player id and L-WAR
batting = batting.drop(
    columns=['Dol', 'Team', 'Age Rng', 'Unnamed: 0', 'IDfg', 'L-WAR']
)
batting.head()

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%
0,2022,Aaron Judge,30,157,570,696,177,87,28,0,...,95.8,14.9,106,0.262,118.4,246,0.609,404,0.169,0.287
1,2018,Mookie Betts,25,136,520,614,180,96,47,5,...,92.3,18.5,57,0.131,110.6,217,0.5,434,0.22,0.27
2,2018,Mike Trout,26,140,471,608,147,80,24,4,...,91.2,18.6,54,0.153,118.0,162,0.46,352,0.201,0.261
3,2023,Ronald Acuna Jr.,25,159,643,735,217,137,35,4,...,94.7,7.4,86,0.153,121.2,309,0.55,562,0.137,0.214
4,2017,Aaron Judge,25,155,542,678,154,75,24,3,...,94.9,15.8,84,0.249,121.1,186,0.55,338,0.157,0.29


In [23]:
# Confirms the removal: lists whichever columns are still object dtype
batting.dtypes.loc[lambda column_dtypes: column_dtypes == "object"]

Name    object
dtype: object

In [24]:
# Display the rows ordered by player name so each player's seasons sit together
batting.sort_values(by="Name")

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%
2092,2017,A.J. Ellis,36,51,143,163,30,19,5,0,...,86.0,11.5,4,0.035,106.4,36,0.310,116,0.211,0.286
2267,2018,A.J. Ellis,37,66,151,183,41,32,8,0,...,90.4,12.5,5,0.042,106.8,44,0.370,119,0.225,0.275
578,2018,A.J. Pollock,30,113,413,460,106,59,21,5,...,89.2,13.5,31,0.097,108.6,130,0.405,321,0.165,0.272
386,2021,A.J. Pollock,33,117,384,422,114,65,27,1,...,90.3,12.0,34,0.111,111.2,144,0.471,306,0.135,0.256
1210,2020,A.J. Pollock,32,55,196,210,54,29,9,0,...,89.6,13.0,16,0.105,110.0,66,0.431,153,0.182,0.284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2154,2021,Zach McKinstry,26,60,158,172,34,18,9,0,...,88.4,13.7,9,0.081,106.2,39,0.351,111,0.213,0.320
207,2017,Zack Cozart,31,122,438,507,130,75,24,7,...,85.8,14.8,16,0.044,108.3,105,0.289,364,0.204,0.264
2221,2018,Zack Cozart,32,58,224,253,49,29,13,2,...,85.6,17.4,4,0.022,107.0,51,0.276,185,0.203,0.277
2168,2023,Zack Short,28,112,221,253,45,29,9,0,...,87.6,21.2,14,0.088,107.0,52,0.327,159,0.190,0.286


In [25]:
#Adds an exponential decay to WAR so that more recent seasons are weighted higher (currently disabled)
#decay_factor = 0.1 #Random Choice
#batting['Weighted_WAR'] = batting['WAR'] * np.exp(-decay_factor * (2024 - batting['Season']))
#batting.head()

In [26]:
# Min-max scale every numeric feature column; Name and the WAR target are
# left unscaled.
batting_copy = batting.copy().dropna()

columns_to_scale = (
    batting_copy.drop(['Name', 'WAR'], axis=1)
    .select_dtypes(include=[np.number])
    .columns
)
scaler = MinMaxScaler()
# Assign the raw ndarray rather than wrapping it in a new DataFrame: a new
# DataFrame gets a fresh 0..n-1 index, and pandas aligns on index labels when
# assigning, so rows whose labels no longer match their position (the frame
# was filtered earlier) would receive another row's values or NaN.
batting_copy[columns_to_scale] = scaler.fit_transform(batting_copy[columns_to_scale])
# NOTE(review): the modelling cells below build X from `batting`, not from
# this scaled copy - confirm whether the scaled features were meant to be used.
batting_copy.head()

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%
0,0.833333,Aaron Judge,0.478261,0.960317,0.80036,0.905473,0.79798,0.490798,0.465517,0.0,...,0.894737,0.57958,1.0,0.952727,0.836207,0.786441,0.942408,0.657315,0.449761,0.635593
1,0.166667,Mookie Betts,0.26087,0.793651,0.710432,0.769486,0.813131,0.546012,0.793103,0.357143,...,0.690058,0.687688,0.537736,0.476364,0.5,0.688136,0.752182,0.717435,0.69378,0.563559
2,0.166667,Mike Trout,0.304348,0.825397,0.622302,0.759536,0.646465,0.447853,0.396552,0.285714,...,0.625731,0.690691,0.509434,0.556364,0.818966,0.501695,0.682373,0.553106,0.602871,0.525424
3,1.0,Ronald Acuna Jr.,0.26087,0.97619,0.931655,0.970149,1.0,0.797546,0.586207,0.285714,...,0.830409,0.354354,0.811321,0.556364,0.956897,1.0,0.839442,0.973948,0.296651,0.326271
4,0.0,Aaron Judge,0.26087,0.944444,0.75,0.875622,0.681818,0.417178,0.396552,0.214286,...,0.842105,0.606607,0.792453,0.905455,0.952586,0.583051,0.839442,0.52505,0.392344,0.648305


In [27]:
# Feature matrix: every remaining column except the player name and the target
X = batting.drop(columns=['Name', 'WAR'])
# Target: WAR
y = batting.loc[:, 'WAR']
# NOTE(review): test_size=0.75 keeps only 25% of the rows for training -
# confirm this is intended and not a swapped train/test fraction.
# NOTE(review): X still contains RAR, which looks like a rescaling of WAR
# (possible target leakage) - confirm before trusting the error metrics.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.75,
    random_state=12345,
)

In [28]:
# Cross-validated search for the Lasso penalty strength (alpha).
warnings.filterwarnings('ignore', category=ConvergenceWarning)

# Candidate penalties, one per decade from 1e-6 to 1
ALPHA_GRID = [0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1]

# TimeSeriesSplit builds its folds from row position (earlier rows train,
# later rows validate). train_test_split shuffles the rows, so they are put
# back into season order here; otherwise the folds would not be chronological.
chronological = np.argsort(X_train['Season'].to_numpy(), kind='stable')
X_train_by_season = X_train.iloc[chronological]
y_train_by_season = y_train.iloc[chronological]

split = TimeSeriesSplit(n_splits=3)

lasso_cv = LassoCV(alphas=ALPHA_GRID, cv=split, random_state=12345)
lasso_cv.fit(X_train_by_season, y_train_by_season)

print("Optimal alpha value:", lasso_cv.alpha_)

# Using the best alpha value, predictions can be made for the test set
y_pred = lasso_cv.predict(X_test)

Optimal alpha value: 0.01


In [29]:
# Fit the final Lasso with the alpha selected by the cross-validated search
# above, instead of a hard-coded copy of it that would go stale whenever the
# data or the alpha grid changes.
lasso = Lasso(alpha=lasso_cv.alpha_)
lasso.fit(X_train, y_train)

In [30]:
# Score the fitted Lasso on the held-out rows using mean squared error
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 0.0034634560297052116


In [31]:
# Pair each training feature with the coefficient the fitted Lasso gave it
feature_names = X_train.columns
coefficients = lasso.coef_
coeff_df = pd.DataFrame(dict(Feature=feature_names, Coefficient=coefficients))

# Largest positive coefficients first, most negative last
sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)
print(sorted_coeff_df)

# Persist the ranked table (row index omitted) for use outside the notebook
sorted_coeff_df.to_csv('lasso_coefficients.csv', index=False)


       Feature  Coefficient
54         RAR     0.100627
49        wRAA     0.007585
3           AB     0.003686
23          FB     0.001862
110  SI-X (sc)     0.001560
..         ...          ...
5            H    -0.001406
14          SO    -0.003300
9           HR    -0.003604
221     Events    -0.004481
50         wRC    -0.007897

[224 rows x 2 columns]
