In [103]:
import pandas as pd
import numpy as np
import pybaseball
from pybaseball import batting_stats
pybaseball.cache.enable()
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit

In [104]:
#Load in batting stats from fangraphs using pybaseball
#Uncomment lines below to load in
#batting = batting_stats(2017, 2023, qual = 150)
#batting.to_csv("batting.csv")
batting = pd.read_csv("batting.csv")

In [105]:
#makes it only players with multiple seasons
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,xBA,xSLG,xwOBA,L-WAR
0,2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,118.4,246,0.609,404,0.169,0.287,,,,11.6
1,4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,110.6,217,0.5,434,0.22,0.27,,,,10.4
2,6,10155,2018,Mike Trout,LAA,26,140,471,608,147,...,118.0,162,0.46,352,0.201,0.261,,,,9.5
3,18,18401,2023,Ronald Acuna Jr.,ATL,25,159,643,735,217,...,121.2,309,0.55,562,0.137,0.214,,,,8.4
4,15,15640,2017,Aaron Judge,NYY,25,155,542,678,154,...,121.1,186,0.55,338,0.157,0.29,,,,8.7


In [106]:
#find nulls (cant have for ML)
null_variables = batting.isnull().sum() 
null_variables

Unnamed: 0       0
IDfg             0
Season           0
Name             0
Team             0
              ... 
CSW%             0
xBA           2388
xSLG          2388
xwOBA         2388
L-WAR            0
Length: 321, dtype: int64

In [107]:
full_variables = list(batting.columns[null_variables == 0]) #gets list of all non nulls
batting = batting[full_variables].copy() #updates to only non nulls
batting.head()

Unnamed: 0.1,Unnamed: 0,IDfg,Season,Name,Team,Age,G,AB,PA,H,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,L-WAR
0,2,15640,2022,Aaron Judge,NYY,30,157,570,696,177,...,14.9,106,0.262,118.4,246,0.609,404,0.169,0.287,11.6
1,4,13611,2018,Mookie Betts,BOS,25,136,520,614,180,...,18.5,57,0.131,110.6,217,0.5,434,0.22,0.27,10.4
2,6,10155,2018,Mike Trout,LAA,26,140,471,608,147,...,18.6,54,0.153,118.0,162,0.46,352,0.201,0.261,9.5
3,18,18401,2023,Ronald Acuna Jr.,ATL,25,159,643,735,217,...,7.4,86,0.153,121.2,309,0.55,562,0.137,0.214,8.4
4,15,15640,2017,Aaron Judge,NYY,25,155,542,678,154,...,15.8,84,0.249,121.1,186,0.55,338,0.157,0.29,8.7


In [108]:
#removes all object other than Name
batting = batting.drop('Dol', axis=1)
batting = batting.drop('Team', axis=1)
batting = batting.drop('Age Rng', axis=1)
batting = batting.drop('Unnamed: 0', axis=1)
batting = batting.drop('IDfg', axis=1)
batting = batting.drop('L-WAR', axis=1)
batting.head()

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,EV,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%
0,2022,Aaron Judge,30,157,570,696,177,87,28,0,...,95.8,14.9,106,0.262,118.4,246,0.609,404,0.169,0.287
1,2018,Mookie Betts,25,136,520,614,180,96,47,5,...,92.3,18.5,57,0.131,110.6,217,0.5,434,0.22,0.27
2,2018,Mike Trout,26,140,471,608,147,80,24,4,...,91.2,18.6,54,0.153,118.0,162,0.46,352,0.201,0.261
3,2023,Ronald Acuna Jr.,25,159,643,735,217,137,35,4,...,94.7,7.4,86,0.153,121.2,309,0.55,562,0.137,0.214
4,2017,Aaron Judge,25,155,542,678,154,75,24,3,...,94.9,15.8,84,0.249,121.1,186,0.55,338,0.157,0.29


In [109]:
batting.dtypes[batting.dtypes == "object"] #confirms removal

Name    object
dtype: object

In [110]:
#Create helper function to apply to each player in the batting database
def next_season(p):
    p = p.sort_values("Season")
    p["Next_WAR"] = p["WAR"].shift(-1)
    return p

#use the higher order function apply to use the helper function on each player. Yay csci275
batting = batting.groupby("Name", group_keys=False).apply(next_season)
batting.loc[:, 'WAR':'Next_WAR']

  batting = batting.groupby("Name", group_keys=False).apply(next_season)


Unnamed: 0,WAR,Spd,wRC+,WPA,-WPA,+WPA,RE24,REW,pLI,PH,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Next_WAR
2092,0.0,1.4,82,-1.17,-3.22,2.05,-8.46,-0.90,0.95,12,...,11.5,4,0.035,106.4,36,0.310,116,0.211,0.286,-0.3
2266,-0.3,1.2,105,-0.47,-3.08,2.61,1.18,0.17,1.02,20,...,12.5,5,0.042,106.8,44,0.370,119,0.225,0.275,
776,2.0,7.5,103,0.06,-8.94,9.00,4.96,0.51,1.05,9,...,8.5,19,0.054,108.7,136,0.384,354,0.210,0.280,2.5
578,2.5,6.7,110,1.51,-8.73,10.24,9.30,0.94,1.10,5,...,13.5,31,0.097,108.6,130,0.405,321,0.165,0.272,0.5
1700,0.5,4.9,107,0.54,-5.93,6.47,0.90,0.16,1.02,8,...,13.7,18,0.076,108.1,95,0.399,238,0.169,0.283,1.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1392,1.0,6.0,81,-1.85,-9.03,7.17,-18.63,-1.86,0.95,28,...,17.8,27,0.076,108.3,118,0.332,356,0.165,0.250,
207,4.2,5.3,139,2.27,-7.66,9.93,29.39,2.90,0.92,9,...,14.8,16,0.044,108.3,105,0.289,364,0.204,0.264,-0.2
2221,-0.2,3.4,81,0.31,-4.20,4.51,-7.59,-0.77,0.96,0,...,17.4,4,0.022,107.0,51,0.276,185,0.203,0.277,
2527,-0.8,4.2,42,-1.45,-3.83,2.38,-16.48,-1.62,1.04,3,...,24.3,5,0.049,108.2,34,0.330,103,0.200,0.305,-0.1


In [111]:
#Adds decay for WAR to weight more recent season higher
#decay_factor = 0.1 #Random Choice
#batting['Weighted_WAR'] = batting['WAR'] * np.exp(-decay_factor * (2024 - batting['Season']))
#batting.head()

In [112]:
from sklearn.preprocessing import MinMaxScaler

batting_copy = batting.copy()
batting_copy = batting_copy.dropna()

columns_to_scale = batting.drop(['Name', 'Next_WAR'], axis=1).select_dtypes(include=[np.number]).columns
scaler = MinMaxScaler()
batting_copy[columns_to_scale] = pd.DataFrame(scaler.fit_transform(batting_copy[columns_to_scale]))
batting_copy.head()

Unnamed: 0,Season,Name,Age,G,AB,PA,H,1B,2B,3B,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Next_WAR
2092,,A.J. Ellis,,,,,,,,,...,,,,,,,,,,-0.3
776,0.0,A.J. Pollock,0.636364,0.928571,0.794964,0.829146,0.706806,0.509317,0.842105,0.214286,...,0.737179,0.245283,0.209924,0.283784,0.616279,0.534451,0.789579,0.230769,0.279661,2.5
578,0.4,A.J. Pollock,0.227273,0.547619,0.31295,0.296482,0.272251,0.273292,0.140351,0.285714,...,0.560897,0.066038,0.125954,0.324324,0.155039,0.327747,0.278557,0.663462,0.677966,0.5
1700,0.6,A.J. Pollock,0.409091,0.166667,0.115108,0.125628,0.125654,0.124224,0.157895,0.0,...,0.432692,0.132075,0.400763,0.635135,0.182171,0.716946,0.116232,0.346154,0.694915,1.2
1209,1.0,A.J. Pollock,0.545455,0.793651,0.609712,0.695142,0.361257,0.236025,0.368421,0.071429,...,0.810897,0.415094,0.51145,0.504505,0.507752,0.674115,0.507014,0.615385,0.690678,3.2


In [113]:
X = batting.drop(['Name', 'Next_WAR'], axis=1)
y = batting['Next_WAR']
bat_copy=batting.copy()
bat_copy2=batting.copy()




#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.75, random_state=12345)

In [122]:
batting.sort_values("Season")
X_train_temp=bat_copy[bat_copy["Season"]<2023]
X_train_names = X_train_temp.dropna() 
X_train=X_train_names.drop(['Name','Next_WAR'],axis=1)
y_train=X_train_names['Next_WAR']
y_train

2092   -0.3
776     2.5
578     0.5
1700    1.2
1209    3.2
       ... 
2467   -0.3
2218    0.8
1534    1.0
207    -0.2
2527   -0.1
Name: Next_WAR, Length: 1779, dtype: float64

In [120]:
X_test_temp=bat_copy2[bat_copy2["Season"]>2022]
X_test=X_test_temp.drop(['Name'],axis=1)
y_test=X_test_temp['Next_WAR']
X_test

Unnamed: 0,Season,Age,G,AB,PA,H,1B,2B,3B,HR,...,LA,Barrels,Barrel%,maxEV,HardHit,HardHit%,Events,CStr%,CSW%,Next_WAR
1358,2023,33,93,269,312,68,50,9,1,8,...,12.6,8,0.040,106.7,59,0.294,201,0.196,0.285,
135,2023,31,106,367,458,98,45,16,0,37,...,20.4,66,0.275,116.9,154,0.642,240,0.160,0.298,
1026,2023,34,92,320,353,79,32,24,2,21,...,28.9,28,0.130,110.0,84,0.391,215,0.183,0.320,
1803,2023,31,141,412,455,99,63,21,2,13,...,15.2,11,0.031,106.7,80,0.228,351,0.178,0.264,
87,2023,25,154,588,687,163,111,31,1,20,...,12.6,37,0.075,111.1,189,0.384,492,0.201,0.257,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1303,2023,28,92,334,357,87,55,20,1,11,...,12.3,23,0.101,112.7,89,0.390,228,0.168,0.298,
192,2023,26,114,410,496,120,64,24,1,31,...,17.1,58,0.180,117.7,168,0.522,322,0.163,0.251,
2280,2023,39,108,298,329,73,50,16,3,4,...,16.5,5,0.019,107.9,89,0.345,258,0.194,0.262,
1392,2023,28,148,464,518,107,73,21,4,9,...,17.8,27,0.076,108.3,118,0.332,356,0.165,0.250,


In [121]:
#Code used to find the optimal alpha value for lasso
from sklearn.linear_model import LassoCV
from sklearn.exceptions import ConvergenceWarning
import warnings
from sklearn.model_selection import TimeSeriesSplit

warnings.filterwarnings('ignore', category=ConvergenceWarning)

split = TimeSeriesSplit(n_splits=3)

lasso_cv = LassoCV(alphas=[0.000001,0.00001,0.0001,0.001,0.01,0.1,1], cv=split, random_state=12345)
lasso_cv.fit(X_train, y_train)

print("Optimal alpha value:", lasso_cv.alpha_)

# Using the best alpha value, predictions can be made for the test set
y_pred = lasso_cv.predict(X_test)

Optimal alpha value: 0.1


ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- Next_WAR


In [None]:
lasso = Lasso(alpha=0.1)
lasso.fit(X_train, y_train)

In [None]:
#Evaluates lasso
y_pred = lasso.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

In [47]:
# Displays coefficients and feature names
coefficients = lasso.coef_
feature_names = X_train.columns

# Create a DataFrame containing feature names and their corresponding coefficients
coeff_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort the DataFrame by the coefficients in descending order
sorted_coeff_df = coeff_df.sort_values(by='Coefficient', ascending=False)

# Display the sorted DataFrame
print(sorted_coeff_df)

# Outputs to csv
sorted_coeff_df.to_csv('lasso_coefficients.csv', index=False)


ValueError: All arrays must be of the same length