In [1]:
import os
import pandas as pd
import numpy as np

data = pd.read_csv("batting.csv")


In [2]:
batting = data.groupby("IDfg", group_keys=False).filter(lambda x: x.shape[0]>1)
#batting

In [3]:
def next_season(player):
    player=player.sort_values("Season")
    player["Next_WAR"] = player['WAR'].shift(-1)
    return player

batting= batting.groupby("IDfg", group_keys=False).apply(next_season)
#batting[["Name", "Season", "WAR", "Next_WAR"]]


In [4]:
null_count = batting.isnull().sum()
complete_cols = list(batting.columns[null_count == 0])
batting = batting[complete_cols + ["Next_WAR"]].copy()
#batting

In [5]:
batting.dtypes[batting.dtypes == "object"]

Name       object
Team       object
Dol        object
Age Rng    object
dtype: object

In [6]:
del batting["Dol"]
del batting["Age Rng"]

In [7]:
#assign team codes into a set of unique numbers for each team
batting["team_code"] = batting["Team"].astype("category").cat.codes

In [8]:
batting_full = batting.copy()
batting = batting.dropna().copy()

In [9]:
from sklearn.linear_model import Ridge
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import TimeSeriesSplit

#play around with aplha: higher --> reduces overfitting
rr = Ridge(alpha=1)

split = TimeSeriesSplit(n_splits=3)

#n_jobs uses multiple threads - direction selects the best 20
sfs = SequentialFeatureSelector(rr, n_features_to_select=20,
                                direction="forward", cv=split, n_jobs=4)

In [10]:
#remove columns that won't work in the machine learning model
removed_columns = ["Next_WAR", "Name", "Team", "IDfg", "Season"]
selected_columns = batting.columns[~batting.columns.isin(removed_columns)]

In [11]:
from sklearn.preprocessing import MinMaxScaler

#normalize the data? 
scaler = MinMaxScaler()
batting.loc[:, selected_columns] = scaler.fit_transform(batting[selected_columns])

In [12]:
sfs.fit(batting[selected_columns], batting["Next_WAR"])

SequentialFeatureSelector(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),
                          estimator=Ridge(alpha=1), n_features_to_select=20,
                          n_jobs=4)

In [13]:
#create a list of best features based on the ridge regression model
predictors = list(selected_columns[sfs.get_support()])
predictors

['Age',
 'IBB',
 'SO',
 'SB',
 'BU',
 'BABIP',
 'IFH%',
 'WAR',
 'Spd',
 'PH',
 'CB%',
 'O-Contact%',
 'wGDP',
 'Oppo%',
 'OBP+',
 'SLG+',
 'Pull%+',
 'Soft%+',
 'Hard%+',
 'L-WAR']

In [99]:
#function for training model and making predctions 

def backtest(data, model, predictors, start=5, step=1):
    all_predictions = []
    years = sorted(batting["Season"].unique())
    
    for i in range(start, len(years), step):
        current_year = years[i]

        train = data[data["Season"] < current_year]
        test = data[data["Season"] == current_year]

        model.fit(train[predictors], train["Next_WAR"])

        preds = model.predict(test[predictors])
        preds = pd.Series(preds, index=test.index)
        combined = pd.concat([test["Next_WAR"], preds], axis=1)
        combined.columns = ["actual", "prediction"]

        all_predictions.append(combined)

    return pd.concat(all_predictions)

In [100]:
predictions = backtest(batting, rr, predictors)

In [101]:
from sklearn.metrics import mean_squared_error

mean_squared_error(predictions["actual"], predictions["prediction"])

2.7973640439713954

In [102]:
batting["Next_WAR"].describe()

count    5575.000000
mean        1.794798
std         1.996427
min        -3.400000
25%         0.300000
50%         1.500000
75%         2.900000
max        11.900000
Name: Next_WAR, dtype: float64

In [103]:
#rule of thumb: the mean squared error should be lower than std squared
2.7973640439713954 ** 0.5

1.6725322250920593