In [1]:
import os
import pandas as pd
import numpy as np
from pybaseball import batting_stats


In [2]:
# Season range passed to `batting_stats` when the data has to be downloaded.
START = 2002
END = 2022

In [3]:
# Reuse the locally cached CSV when it exists; otherwise download the season
# range and write it to disk so later runs can skip the download.
# NOTE(review): the cache file name does not depend on START/END, so editing
# those constants has no effect until batting.csv is deleted.
if os.path.exists("batting.csv"):
    batting = pd.read_csv("batting.csv", index_col=0)
else:
    # qual=200: presumably a minimum playing-time threshold; confirm in the pybaseball docs.
    batting = batting_stats(START, END, qual=200)
    batting.to_csv("batting.csv")

In [4]:
# Keep only players (grouped by IDfg) that have more than one row; the target
# built in the next cell is taken from a player's following row.
batting = batting.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0] > 1)


In [5]:
def next_season(player):
    """Return ``player`` sorted by ``Season`` with a ``Next_WAR`` column added.

    ``Next_WAR`` is the ``WAR`` value of the following row after sorting, so
    the last row of the frame receives NaN. The input frame is not modified;
    a new frame is returned.
    """
    return player.sort_values("Season").assign(
        Next_WAR=lambda seasons: seasons["WAR"].shift(-1)
    )

# Build the prediction target per player: each row gets the WAR of that
# player's following row (see next_season).
batting = batting.groupby("IDfg", group_keys=False).apply(next_season)

In [6]:
# Count of missing values in every column.
null_count = batting.isnull().sum()


In [7]:
# Keep only the columns that have no missing values. Next_WAR is re-added
# explicitly: shift(-1) leaves it NaN on each player's last row, so the
# zero-null filter excludes it.
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()

In [8]:
batting.dtypes[batting.dtypes == "object"]


Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [9]:
# Drop two of the object-typed columns listed above; Name and Team stay in the frame.
del batting["Age Rng"]
del batting["Dol"]

In [10]:
# Encode each distinct Team string as an integer category code.
batting["team_code"] = batting["Team"].astype("category").cat.codes


In [11]:
# Keep a copy that still includes rows with NaN (e.g. rows whose Next_WAR is
# unknown), then drop every row containing any NaN from the working frame.
batting_full = batting.copy()
batting = batting.dropna().copy()

In [12]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

# Ridge regression serves as the estimator for feature selection and is also
# the model passed to the backtests further down.
rr = Ridge(alpha=1)

# Cross-validation splitter handed to the feature selector.
# NOTE(review): TimeSeriesSplit splits by row position, while the frame at this
# point is the concatenation of per-player groups rather than being sorted by
# Season overall — confirm the folds are ordered in time as intended.
split = TimeSeriesSplit(n_splits=3)

# Forward selection of 20 features, scored with the splitter above.
# NOTE(review): n_jobs=8 is hard-coded; adjust to the machine if needed.
sfs = SequentialFeatureSelector(rr, 
                                n_features_to_select=20, 
                                direction="forward",
                                cv=split,
                                n_jobs=8
                               )

In [13]:
# Columns excluded from the candidate features: the target, the player
# name/id, the raw Team string (team_code stays available) and Season.
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [14]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every candidate feature with a min-max scaler.
scaler = MinMaxScaler()
# Assign whole columns instead of going through ``.loc[:, cols]``: ``.loc``
# writes into the existing column arrays, so integer-typed columns would need
# an implicit upcast to hold the scaled floats, which pandas >= 2.1 deprecates.
# Plain column assignment replaces the columns and lets the dtype change.
# NOTE(review): the scaler is fitted on all seasons at once, so the backtest
# folds below see min/max values from later seasons — confirm this is acceptable.
batting[selected_columns] = scaler.fit_transform(batting[selected_columns])

In [15]:
# Run the feature selection against the next-season WAR target.
sfs.fit(batting[selected_columns], batting["Next_WAR"])


SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=Ridge(alpha=1), n_features_to_select=20,
                          n_jobs=8)

In [16]:
# Names of the candidate columns that the selector kept.
predictors = list(selected_columns[sfs.get_support()])


In [17]:
predictors

['Age',
 'IBB',
 'SO',
 'SB',
 'BU',
 'BABIP',
 'IFH%',
 'WAR',
 'Spd',
 'PH',
 'CB%',
 'O-Contact%',
 'wGDP',
 'Oppo%',
 'OBP+',
 'SLG+',
 'Pull%+',
 'Soft%+',
 'Hard%+',
 'L-WAR']

In [19]:
def backtest(data, model, predictors, start=5, step=1):
    """Evaluate ``model`` season by season, training only on earlier seasons.

    The distinct ``Season`` values of ``data`` are sorted; for every position
    ``start, start + step, ...`` in that list the model is refit on all rows
    from strictly earlier seasons and used to predict the rows of that season.

    Parameters
    ----------
    data : DataFrame with a ``Season`` column, a ``Next_WAR`` target column
        and all columns named in ``predictors``.
    model : object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in
        place for every fold, so afterwards it holds the fit of the last fold.
    predictors : list of feature column names.
    start : position (in the sorted list of seasons) of the first test season.
    step : distance between consecutive test-season positions.

    Returns
    -------
    DataFrame indexed like the tested rows of ``data`` with two columns:
    ``actual`` (the ``Next_WAR`` value) and ``prediction``.

    Raises
    ------
    ValueError
        If ``start``/``step`` leave no season to test on. Without this check
        the failure would surface later as pandas' generic
        "No objects to concatenate".
    """
    seasons = sorted(data["Season"].unique())
    fold_positions = range(start, len(seasons), step)

    if len(fold_positions) == 0:
        raise ValueError(
            "backtest has no test season to evaluate: "
            f"start={start}, step={step}, but data contains "
            f"{len(seasons)} distinct season(s)"
        )

    all_predictions = []
    for position in fold_positions:
        current_year = seasons[position]
        # Strictly earlier seasons only, so the test season never leaks into training.
        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["Next_WAR"])

        # Re-attach the test index so predictions line up with the actual values.
        preds = pd.Series(model.predict(test[predictors]), index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)

    return pd.concat(all_predictions)

In [20]:
# Baseline backtest using only the selected features.
predictions = backtest(batting, rr, predictors)


In [21]:
predictions.shape

(4127, 2)

In [22]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the baseline backtest over all tested rows.
mean_squared_error(predictions["actual"], predictions["prediction"])

2.7972427459505527

In [23]:
def player_history(df):
    """Add per-player history features to one player's rows.

    The rows are sorted by ``Season`` and three columns are added:

    * ``player_season`` - 0-based position of the row in the sorted frame.
    * ``war_corr`` - expanding correlation between ``player_season`` and
      ``WAR`` up to and including the row; 0 where it is undefined (first
      row, or no variation in ``WAR`` so far).
    * ``war_diff`` - ``WAR`` divided by the previous row's ``WAR``; 1 where
      the ratio is undefined or infinite (first row, previous ``WAR`` of 0).

    Returns a new frame; the input is not modified.
    """
    df = df.sort_values("Season")

    df["player_season"] = range(df.shape[0])

    # Assign the finished Series in one step. Filling or masking through
    # ``df[col]`` (chained assignment) works on a temporary object and does
    # not reliably reach ``df`` under pandas Copy-on-Write.
    df["war_corr"] = df["player_season"].expanding().corr(df["WAR"]).fillna(0)

    war_ratio = df["WAR"] / df["WAR"].shift(1)
    # A previous WAR of 0 yields +inf or -inf (or NaN for 0/0); treat all of
    # them, and the first row, as "no change".
    df["war_diff"] = war_ratio.replace([np.inf, -np.inf], np.nan).fillna(1)

    return df

# Compute the history features separately for every player.
batting = batting.groupby("IDfg", group_keys=False).apply(player_history)

In [24]:
def group_averages(df):
    """Return each row's ``WAR`` divided by the mean ``WAR`` of ``df``.

    Despite the name, the result is a per-row ratio to the group mean (a
    Series indexed like ``df``), not the mean itself.
    """
    war = df["WAR"]
    group_mean = war.mean()
    return war.div(group_mean)

In [25]:
# WAR of each row relative to the mean WAR of all rows in the same season.
batting["war_season"] = batting.groupby("Season", group_keys=False).apply(group_averages)


In [26]:
# Selected features plus the four engineered history/season features.
new_predictors = predictors + ["player_season", "war_corr", "war_season", "war_diff"]


In [27]:
# Repeat the backtest with the extended feature list; this overwrites the
# baseline `predictions` and leaves `rr` fitted on the last fold.
predictions = backtest(batting, rr, new_predictors)


In [28]:
mean_squared_error(predictions["actual"], predictions["prediction"]) 


2.7132462617165722

In [29]:
# Coefficients of `rr` as last fitted (the final fold of the preceding
# backtest), labelled by feature name and sorted ascending.
pd.Series(rr.coef_, index=new_predictors).sort_values()


Age             -2.585902
BABIP           -1.853663
WAR             -1.853517
SLG+            -1.461184
Soft%+          -1.275252
BU              -0.953047
PH              -0.709528
SO              -0.647148
war_diff        -0.586128
wGDP            -0.443833
CB%             -0.333501
Pull%+          -0.195604
war_corr        -0.093396
player_season    0.000698
L-WAR            0.213106
O-Contact%       0.258397
IFH%             0.402026
OBP+             0.483645
Oppo%            0.697790
Spd              0.749664
SB               1.053095
IBB              1.682070
Hard%+           2.369994
war_season       3.397239
dtype: float64

In [30]:
# Signed prediction error per row.
# NOTE(review): `diff` does not appear to be used by the later visible cells,
# which recompute the difference when building merged["diff"].
diff = predictions["actual"] - predictions["prediction"]


In [31]:
# Join the predictions back onto the player rows via the shared row index.
# The feature columns of `batting` (including WAR) were min-max scaled earlier,
# so WAR appears scaled in `merged` while Next_WAR keeps its original units.
merged = predictions.merge(batting, left_index=True, right_index=True)


In [32]:
# Absolute prediction error; computed from `predictions` and aligned to
# `merged` by index label on assignment.
merged["diff"] = (predictions["actual"] - predictions["prediction"]).abs()


In [33]:
# Rows ordered from smallest to largest absolute error.
merged[["IDfg", "Season", "Name", "WAR", "Next_WAR", "diff"]].sort_values(["diff"])


Unnamed: 0,IDfg,Season,Name,WAR,Next_WAR,diff
5447,11846,2016,Leonys Martin,0.422360,1.9,0.001054
1018,7859,2018,Charlie Blackmon,0.347826,1.7,0.001550
6050,1638,2007,Miguel Olivo,0.254658,0.4,0.001826
6337,4712,2011,Ben Revere,0.304348,2.1,0.001852
4859,4712,2013,Ben Revere,0.267081,1.8,0.003036
...,...,...,...,...,...,...
3823,1875,2009,Josh Hamilton,0.291925,8.4,6.361302
3161,4810,2007,Brian McCann,0.304348,8.6,6.373673
871,9166,2010,Buster Posey,0.459627,10.1,6.581365
2516,11579,2014,Bryce Harper,0.310559,9.3,7.454659
