In [1]:
## Madeline Quigley
## 2023.04.01
## Predicting each player's on-base percentage for the 2021 season

In [2]:
# Reading the Data

import csv

# open csv file
# newline='' is what the csv module asks for, so quoted fields containing line breaks
# are parsed correctly; utf-8 matches the default encoding pandas uses for this same file
with open(r'C:\Users\Madeline\Documents\Philadelphia Phillies\obp.csv',
          mode='r', newline='', encoding='utf-8') as file:
    # read every row while the file is still open -- csv.reader is lazy, so a bare
    # reader can no longer be iterated once the with-block has closed the file
    obp = list(csv.reader(file))

# obp is now a list of rows (each row a list of strings), with the header row first;
# use obp[:5] to peek at the data instead of printing the whole file

In [3]:
# Building the feature matrix (X) and the target vector (y)

import pandas as pd

# read the csv into a dataframe, using its first column as the row index
df = pd.read_csv(r'C:\Users\Madeline\Documents\Philadelphia Phillies\obp.csv', index_col=0)

# OBP_21 is the value to predict; playerid and birth_date (noted in the original as
# string values) are left out of the features along with the target itself
target_col = 'OBP_21'
y = df[target_col]
X = df.drop(columns=['playerid', 'birth_date', target_col])


In [4]:
# Missing values -- every player needs a prediction, so rows with gaps cannot simply be
# dropped. Each gap is filled with the mean of its column instead. That is a rough
# estimate; regression-based or kNN imputation would probably be more accurate.

from sklearn.impute import SimpleImputer

# learn each column's mean, then use it to fill that column's missing entries
imputer = SimpleImputer(strategy='mean')
imputer.fit(X)
X_imputed = imputer.transform(X)

# wrap the imputed values back into a DataFrame so the column names and row index are kept
X = pd.DataFrame(data=X_imputed, index=X.index, columns=X.columns)
print(X.dtypes)


# Source: https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html 

PA_21     float64
PA_20     float64
OBP_20    float64
PA_19     float64
OBP_19    float64
PA_18     float64
OBP_18    float64
PA_17     float64
OBP_17    float64
PA_16     float64
OBP_16    float64
dtype: object


In [5]:
## Random Forest Model
# Fit the model and make predictions

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# split into training and test sets (same for all models)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# X was mean-imputed in full before this split, so X_test has no missing values left and
# running imputer.transform() on it again would change nothing -- that step is omitted.
# NOTE(review): because the column means were computed from all rows, including the ones
# held out here, the test scores may be slightly optimistic; fitting the imputer on the
# training rows only would avoid that.

# fit random forest model (seeded, like the other models, so results are reproducible)
regr = RandomForestRegressor(random_state=42)
regr.fit(X_train, y_train)

# make prediction on test set
y_pred = regr.predict(X_test)

# array of on-base percentage predictions
print(y_pred)


# Source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html 

[0.27762    0.3020175  0.34144    0.33648    0.26136    0.31318
 0.30945    0.3072     0.29138567 0.33403    0.29985    0.3246
 0.24548    0.2945325  0.25247    0.30891    0.35926    0.27733
 0.27328917 0.29975    0.317      0.34235    0.24071    0.34974
 0.2702965  0.34146    0.31935    0.24878217 0.30434    0.28391
 0.28822    0.22855    0.23674    0.23418    0.29103    0.34886
 0.34856    0.32467    0.3279     0.35645    0.29387    0.31226567
 0.27752    0.28877    0.28565    0.31371    0.33764    0.2444775
 0.34153    0.32467    0.29703    0.35707    0.275991   0.269651
 0.25931    0.27166    0.35545    0.34725    0.35737    0.29519
 0.25353    0.36489    0.25840567 0.28939    0.32306    0.29482
 0.33957    0.35677    0.32858    0.35419    0.34925    0.32113
 0.24879    0.28515    0.29738    0.28637917 0.29273    0.2719125
 0.28916    0.273128   0.33229    0.30316    0.33714    0.32138
 0.2852525  0.2807955  0.32829    0.269651   0.24761    0.26288
 0.33704    0.28826    0.32207333

In [6]:
# Model Performance (Random Forest)

from sklearn.metrics import mean_squared_error

# mean squared error between the held-out targets and the random forest predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("MSE:", mse)


MSE: 0.0022702027553666653


In [7]:
## Gradient Boosting Model
# Train the model, then predict on the held-out players

from sklearn.ensemble import GradientBoostingRegressor

# build the seeded gradient boosting regressor and train it on the training split
reg = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# predicted on-base percentages for the test split, printed for inspection
y_pred = reg.predict(X_test)
print(y_pred)


# Source: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

[0.26951052 0.28301356 0.33043475 0.32368801 0.2683466  0.30427995
 0.3028678  0.29718168 0.28909895 0.31241488 0.29217313 0.32062924
 0.22946016 0.28909895 0.26360153 0.31533297 0.35173897 0.28173127
 0.2683466  0.29528669 0.31919834 0.35244781 0.24583149 0.32667513
 0.27551102 0.34219408 0.32159142 0.2703514  0.30211844 0.28324452
 0.27865464 0.24948406 0.23981167 0.22561468 0.29740451 0.36046831
 0.3475254  0.32105491 0.33792763 0.34267562 0.30164609 0.29529317
 0.27712011 0.30169309 0.28926796 0.32591969 0.33060787 0.30010279
 0.32655694 0.31662658 0.30376388 0.36021304 0.27337739 0.27773512
 0.2787044  0.27397092 0.3585255  0.34950251 0.36176139 0.29979328
 0.25848882 0.37695649 0.26527694 0.29101681 0.32839816 0.3119736
 0.34441629 0.35082706 0.31726682 0.36345695 0.34604408 0.31417692
 0.27381725 0.28725051 0.31236815 0.28299117 0.29986843 0.27122204
 0.29533392 0.26548416 0.32913605 0.30365826 0.33189567 0.29672726
 0.28752678 0.28510626 0.34026866 0.27773512 0.25617957 0.26834

In [8]:
# Model Performance (Gradient Boosting)

# mean squared error between the held-out targets and the gradient boosting predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Mean squared error:', mse)

Mean squared error: 0.0023213523479738768


In [9]:
## Decision Tree
# Train the model, then predict on the held-out players

from sklearn.tree import DecisionTreeRegressor

# build the seeded decision tree regressor and train it on the training split
regressor = DecisionTreeRegressor(random_state=0).fit(X_train, y_train)

# predicted on-base percentages for the test split, printed for inspection
y_pred = regressor.predict(X_test)
print(y_pred)


# Source: https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html

[0.278  0.3075 0.376  0.358  0.255  0.303  0.394  0.286  0.314  0.316
 0.281  0.33   0.194  0.297  0.2    0.31   0.394  0.258  0.293  0.298
 0.334  0.314  0.152  0.24   0.317  0.357  0.305  0.2    0.247  0.311
 0.253  0.194  0.278  0.342  0.286  0.286  0.335  0.32   0.308  0.383
 0.242  0.353  0.278  0.301  0.216  0.316  0.286  0.2375 0.34   0.322
 0.3    0.24   0.242  0.256  0.215  0.202  0.351  0.348  0.351  0.31
 0.255  0.351  0.233  0.34   0.349  0.263  0.353  0.334  0.319  0.372
 0.351  0.293  0.208  0.379  0.312  0.382  0.31   0.311  0.3    0.208
 0.329  0.317  0.328  0.28   0.282  0.2    0.236  0.256  0.356  0.26
 0.362  0.305  0.466  0.27   0.316  0.383  0.2    0.281  0.312  0.311
 0.233  0.346  0.347  0.328  0.314  0.311  0.322  0.39   0.302  0.355
 0.312  0.304  0.262  0.348  0.373 ]


In [10]:
# Model Performance (Decision Tree)

# mean squared error between the held-out targets and the decision tree predictions
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print('Mean squared error:', mse)

Mean squared error: 0.0038627782608695637


In [11]:
## Model Comparison

# All three models predicted each player's on-base percentage for the 2021 season reasonably
# well. The Random Forest and Gradient Boosting models performed better (lower test MSE) than
# the single Decision Tree, likely because they are ensemble methods that combine many trees
# and so can handle larger and more complex datasets; it is possible that the Decision Tree
# overfit (or underfit) the data. For future predictions, in order to improve the models, we
# should first find a better way to handle missing values in the data, and we may also be
# able to tune the models' hyperparameters so that they perform better on this dataset.