In [114]:
import pandas as pd
import numpy as np
import pybaseball
from pybaseball import batting_stats
pybaseball.cache.enable()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit

In [115]:
#Load in batting stats from fangraphs using pybaseball
#Uncomment lines below to load in
#batting = batting_stats(2017, 2023, qual = 150)
#batting.to_csv("batting.csv")
batting = pd.read_csv("batting.csv")

In [116]:
#makes it only players with multiple seasons
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,118.4,246,0.609,404,0.169,0.287,,,,11.6
1,4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,110.6,217,0.5,434,0.22,0.27,,,,10.4
2,6,10155,2018,Mike Trout,LAA,26,140,471,608,147,...,118.0,162,0.46,352,0.201,0.261,,,,9.5
3,18,18401,2023,Ronald Acuna Jr.,ATL,25,159,643,735,217,...,121.2,309,0.55,562,0.137,0.214,,,,8.4
4,15,15640,2017,Aaron Judge,NYY,25,155,542,678,154,...,121.1,186,0.55,338,0.157,0.29,,,,8.7


In [117]:
#find nulls (cant have for ML)
null_variables = batting.isnull().sum() 
null_variables

Unnamed: 0       0
IDfg             0
Season           0
Name             0
Team             0
              ... 
CSW%             0
xBA           2388
xSLG          2388
xwOBA         2388
L-WAR            0
Length: 321, dtype: int64

In [118]:
full_variables = list(batting.columns[null_variables == 0]) #gets list of all non nulls
batting = batting[full_variables].copy() #updates to only non nulls
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR
0,2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,14.9,106,0.262,118.4,246,0.609,404,0.169,0.287,11.6
1,4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,18.5,57,0.131,110.6,217,0.5,434,0.22,0.27,10.4
2,6,10155,2018,Mike Trout,LAA,26,140,471,608,147,...,18.6,54,0.153,118.0,162,0.46,352,0.201,0.261,9.5
3,18,18401,2023,Ronald Acuna Jr.,ATL,25,159,643,735,217,...,7.4,86,0.153,121.2,309,0.55,562,0.137,0.214,8.4
4,15,15640,2017,Aaron Judge,NYY,25,155,542,678,154,...,15.8,84,0.249,121.1,186,0.55,338,0.157,0.29,8.7


In [119]:
#removes all object other than Name
batting = batting.drop('Dol', axis=1)
batting = batting.drop('Team', axis=1)
batting = batting.drop('Age Rng', axis=1)
batting = batting.drop('Unnamed: 0', axis=1)
batting = batting.drop('IDfg', axis=1)
batting = batting.drop('L-WAR', axis=1)
batting.head()

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%
0,2022,Aaron Judge,30,157,570,696,177,87,28,0,...,95.8,14.9,106,0.262,118.4,246,0.609,404,0.169,0.287
1,2018,Mookie Betts,25,136,520,614,180,96,47,5,...,92.3,18.5,57,0.131,110.6,217,0.5,434,0.22,0.27
2,2018,Mike Trout,26,140,471,608,147,80,24,4,...,91.2,18.6,54,0.153,118.0,162,0.46,352,0.201,0.261
3,2023,Ronald Acuna Jr.,25,159,643,735,217,137,35,4,...,94.7,7.4,86,0.153,121.2,309,0.55,562,0.137,0.214
4,2017,Aaron Judge,25,155,542,678,154,75,24,3,...,94.9,15.8,84,0.249,121.1,186,0.55,338,0.157,0.29


In [120]:
batting.dtypes[batting.dtypes == "object"] #confirms removal

Name    object
dtype: object

In [121]:
#Create helper function to apply to each player in the batting database
def next_season(p):
    p = p.sort_values("Season")
    p["Next_WAR"] = p["WAR"].shift(-1)
    return p

#use the higher order function apply to use the helper function on each player. Yay csci275
batting = batting.groupby("Name", group_keys=False).apply(next_season)
batting

  batting = batting.groupby("Name", group_keys=False).apply(next_season)


Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Next_WAR
2092,2017,A.J. Ellis,36,51,143,163,30,19,5,0,...,11.5,4,0.035,106.4,36,0.310,116,0.211,0.286,-0.3
2266,2018,A.J. Ellis,37,66,151,183,41,32,8,0,...,12.5,5,0.042,106.8,44,0.370,119,0.225,0.275,
776,2017,A.J. Pollock,29,112,425,466,113,60,33,6,...,8.5,19,0.054,108.7,136,0.384,354,0.210,0.280,2.5
578,2018,A.J. Pollock,30,113,413,460,106,59,21,5,...,13.5,31,0.097,108.6,130,0.405,321,0.165,0.272,0.5
1700,2019,A.J. Pollock,31,86,308,342,82,51,15,1,...,13.7,18,0.076,108.1,95,0.399,238,0.169,0.283,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392,2023,Zach McKinstry,28,148,464,518,107,73,21,4,...,17.8,27,0.076,108.3,118,0.332,356,0.165,0.250,
207,2017,Zack Cozart,31,122,438,507,130,75,24,7,...,14.8,16,0.044,108.3,105,0.289,364,0.204,0.264,-0.2
2221,2018,Zack Cozart,32,58,224,253,49,29,13,2,...,17.4,4,0.022,107.0,51,0.276,185,0.203,0.277,
2527,2021,Zack Short,26,61,156,184,22,12,4,0,...,24.3,5,0.049,108.2,34,0.330,103,0.200,0.305,-0.1


In [122]:
#Adds decay for WAR to weight more recent season higher
#decay_factor = 0.1 #Random Choice
#batting['Weighted_WAR'] = batting['WAR'] * np.exp(-decay_factor * (2024 - batting['Season']))
#batting.head()

In [123]:
from sklearn.preprocessing import MinMaxScaler

batting_copy = batting.copy()
batting_copy = batting_copy.dropna()

columns_to_scale = batting.drop(['Name', 'Next_WAR'], axis=1).select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()
batting_copy[columns_to_scale] = pd.DataFrame(scaler.fit_transform(batting_copy[columns_to_scale]))
batting_copy.head()

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Next_WAR
2092,,A.J. Ellis,,,,,,,,,...,,,,,,,,,,-0.3
776,0.0,A.J. Pollock,0.636364,0.928571,0.794964,0.829146,0.706806,0.509317,0.842105,0.214286,...,0.737179,0.245283,0.209924,0.283784,0.616279,0.534451,0.789579,0.230769,0.279661,2.5
578,0.4,A.J. Pollock,0.227273,0.547619,0.31295,0.296482,0.272251,0.273292,0.140351,0.285714,...,0.560897,0.066038,0.125954,0.324324,0.155039,0.327747,0.278557,0.663462,0.677966,0.5
1700,0.6,A.J. Pollock,0.409091,0.166667,0.115108,0.125628,0.125654,0.124224,0.157895,0.0,...,0.432692,0.132075,0.400763,0.635135,0.182171,0.716946,0.116232,0.346154,0.694915,1.2
1209,1.0,A.J. Pollock,0.545455,0.793651,0.609712,0.695142,0.361257,0.236025,0.368421,0.071429,...,0.810897,0.415094,0.51145,0.504505,0.507752,0.674115,0.507014,0.615385,0.690678,3.2


In [124]:
X = batting.drop(['Name', 'Next_WAR'], axis=1)
y = batting['Next_WAR']
bat_copy=batting.copy()
bat_copy2=batting.copy()

X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, test_size=0.75, random_state=12345)

In [125]:
batting.sort_values("Season")
X_train_temp=bat_copy[bat_copy["Season"]<2023]
X_train_temp=X_train_temp[X_train_temp["PA"]>300]
X_train_names = X_train_temp.dropna()
X_train=X_train_names.drop(['Name','Next_WAR'],axis=1)
y_train=X_train_names['Next_WAR']
y_train

776     2.5
578     0.5
1700    1.2
386     0.5
685     0.1
       ... 
847     4.0
239    -0.1
368    -0.7
2467   -0.3
207    -0.2
Name: Next_WAR, Length: 1210, dtype: float64

In [126]:
X_test_temp=bat_copy2[bat_copy2["Season"]>2022]
#X_test_temp=X_test_temp[X_test_temp["PA"]>300]
players_2023=X_test_temp.drop(['Next_WAR'],axis=1)
X_test=players_2023.drop(['Name'],axis=1)
y_test=X_test_temp['Next_WAR']
X_test

Unnamed: 0,Season,Age,G,AB,PA,H,1B,2B,3B,HR,...,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%
1358,2023,33,93,269,312,68,50,9,1,8,...,85.3,12.6,8,0.040,106.7,59,0.294,201,0.196,0.285
135,2023,31,106,367,458,98,45,16,0,37,...,97.6,20.4,66,0.275,116.9,154,0.642,240,0.160,0.298
1026,2023,34,92,320,353,79,32,24,2,21,...,88.8,28.9,28,0.130,110.0,84,0.391,215,0.183,0.320
1803,2023,31,141,412,455,99,63,21,2,13,...,85.7,15.2,11,0.031,106.7,80,0.228,351,0.178,0.264
87,2023,25,154,588,687,163,111,31,1,20,...,88.4,12.6,37,0.075,111.1,189,0.384,492,0.201,0.257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,2023,28,92,334,357,87,55,20,1,11,...,89.4,12.3,23,0.101,112.7,89,0.390,228,0.168,0.298
192,2023,26,114,410,496,120,64,24,1,31,...,93.3,17.1,58,0.180,117.7,168,0.522,322,0.163,0.251
2280,2023,39,108,298,329,73,50,16,3,4,...,88.6,16.5,5,0.019,107.9,89,0.345,258,0.194,0.262
1392,2023,28,148,464,518,107,73,21,4,9,...,87.2,17.8,27,0.076,108.3,118,0.332,356,0.165,0.250


In [127]:
#Code used to find the optimal alpha value for lasso
from sklearn.linear_model import LassoCV
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.model_selection import TimeSeriesSplit

warnings.filterwarnings('ignore', category=ConvergenceWarning)

split = TimeSeriesSplit(n_splits=3)

lasso_cv = LassoCV(alphas=[0.000001,0.00001,0.0001,0.001,0.01,0.1,1], cv=split, random_state=12345)
lasso_cv.fit(X_train, y_train)

print("Optimal alpha value:", lasso_cv.alpha_)

# Using the best alpha value, predictions can be made for the test set
y_pred = lasso_cv.predict(X_test)

Optimal alpha value: 0.1


In [128]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

In [129]:
#Evaluates lasso
y_pred = lasso.predict(X_test)
#mse = mean_squared_error(y_test, y_pred)
#print("Mean Squared Error:", mse)
#y_pred

In [130]:
# Displays coefficients and feature names
coefficients = lasso.coef_
feature_names = X_train.columns

# Create a DataFrame containing feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort the DataFrame by the coefficients in descending order
sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)

# Display the sorted DataFrame
print(sorted_coeff_df)

# Outputs to csv
sorted_coeff_df.to_csv('lasso_coefficients.csv', index=False)


     Feature  Coefficient
54       RAR     0.037844
13       IBB     0.029053
217  Barrels     0.027265
18       GDP     0.021924
53       Pos     0.020346
..       ...          ...
30        BU    -0.011362
50       wRC    -0.049270
0     Season    -0.064544
9         HR    -0.066806
1        Age    -0.098330

[225 rows x 2 columns]


In [131]:
players_2023['Prediction'] = y_pred.tolist()
diff=np.subtract(y_pred,players_2023['WAR'])
players_2023['Increase'] = diff.tolist()
players_2023

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Prediction,Increase
1358,2023,Aaron Hicks,33,93,269,312,68,50,9,1,...,8,0.040,106.7,59,0.294,201,0.196,0.285,0.030060,-0.969940
135,2023,Aaron Judge,31,106,367,458,98,45,16,0,...,66,0.275,116.9,154,0.642,240,0.160,0.298,3.137286,-1.662714
1026,2023,Adam Duvall,34,92,320,353,79,32,24,2,...,28,0.130,110.0,84,0.391,215,0.183,0.320,-0.201676,-1.701676
1803,2023,Adam Frazier,31,141,412,455,99,63,21,2,...,11,0.031,106.7,80,0.228,351,0.178,0.264,-0.258459,-0.658459
87,2023,Adley Rutschman,25,154,588,687,163,111,31,1,...,37,0.075,111.1,189,0.384,492,0.201,0.257,4.147286,-1.452714
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,2023,Yoan Moncada,28,92,334,357,87,55,20,1,...,23,0.101,112.7,89,0.390,228,0.168,0.298,0.661068,-0.438932
192,2023,Yordan Alvarez,26,114,410,496,120,64,24,1,...,58,0.180,117.7,168,0.522,322,0.163,0.251,2.941144,-1.358856
2280,2023,Yuli Gurriel,39,108,298,329,73,50,16,3,...,5,0.019,107.9,89,0.345,258,0.194,0.262,-0.437617,-0.137617
1392,2023,Zach McKinstry,28,148,464,518,107,73,21,4,...,27,0.076,108.3,118,0.332,356,0.165,0.250,1.814231,0.814231


In [132]:
players_2023.to_csv('predictions.csv', index=False)


In [133]:
players_2023.sort_values("Increase")

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Prediction,Increase
9,2023,Freddie Freeman,33,161,637,730,211,121,59,2,...,58,0.111,110.6,218,0.418,521,0.114,0.208,4.099064,-3.700936
33,2023,Matt Olson,29,162,608,720,172,88,27,3,...,73,0.164,118.6,245,0.551,445,0.131,0.249,3.183170,-3.316830
12,2023,Mookie Betts,30,152,584,693,179,99,40,1,...,60,0.125,110.1,232,0.481,482,0.205,0.262,4.412729,-3.287271
32,2023,Shohei Ohtani,28,135,497,599,151,73,26,8,...,70,0.193,118.6,193,0.533,362,0.125,0.265,3.779883,-2.720117
196,2023,Isaac Paredes,24,143,492,571,123,68,24,0,...,23,0.059,107.7,110,0.283,389,0.171,0.250,1.635039,-2.664961
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1169,2023,Vladimir Guerrero Jr.,24,156,602,682,159,103,30,0,...,56,0.111,116.7,249,0.492,506,0.128,0.236,2.755076,1.455076
2578,2023,Enrique Hernandez,31,140,465,508,110,76,23,0,...,17,0.046,109.2,125,0.334,374,0.177,0.280,0.361101,1.461101
2618,2023,Gavin Sheets,27,118,311,344,63,43,10,0,...,13,0.052,109.5,81,0.325,249,0.151,0.243,0.055188,1.555188
1178,2023,Spencer Torkelson,23,159,606,684,141,75,34,1,...,62,0.141,112.7,222,0.505,440,0.166,0.269,2.939004,1.639004


In [136]:
test=players_2023.copy()
test=test[test["Increase"]>1.25]
test=test[test["Age"]<27]
test=test[test["WAR"]<3]
test.sort_values("Increase")

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Prediction,Increase
2403,2023,Keibert Ruiz,24,136,523,562,136,94,24,0,...,27,0.058,110.1,148,0.317,467,0.163,0.229,0.674955,1.274955
1585,2023,Shea Langeliers,25,135,448,490,92,47,19,4,...,41,0.133,113.4,136,0.442,308,0.146,0.303,1.988846,1.288846
2197,2023,Bryan De La Cruz,26,153,579,626,149,98,32,0,...,39,0.088,111.0,188,0.424,443,0.158,0.292,1.165109,1.365109
1169,2023,Vladimir Guerrero Jr.,24,156,602,682,159,103,30,0,...,56,0.111,116.7,249,0.492,506,0.128,0.236,2.755076,1.455076
1178,2023,Spencer Torkelson,23,159,606,684,141,75,34,1,...,62,0.141,112.7,222,0.505,440,0.166,0.269,2.939004,1.639004
1870,2023,MJ Melendez,24,148,533,602,125,75,29,5,...,42,0.114,113.2,182,0.496,367,0.157,0.306,2.145168,1.845168
