# MLB Predictor Project

Group 21, Plotters for Success

Gerardo Skrut, Victor Gikunda, Mathew Huang

In [1]:
import pandas as pd
import seaborn as sn
import numpy as np
from datetime import datetime

## Data Cleaning

Prior to importing the data, we cleaned and explored the existing data.

Finally, After we consolidated two datasets with pitching and batting, we are going to separate each portion to inputs and outputs respectively. 

Our inputs overall would include Left Field, Right Field, and Centerfield Distance, Maximum and minimum wall height, Day/night, Attendance, Precipitation, Sky Condition, Temperature, Wind Direction, and Wind Speed. 

For Pitching specifically, we will be using the pitcher's **Season ERA** from the 2023 Season. 

For Batting Specifically, we will be using the batter's **Season Batting Average** from the 2023 Season.

Our outputs would be game specific statistics. 

For Pitching, we would have the number of Hits Allowed, Runs Allowed, Earned Runs, Walks Given, Hit by Pitches, and Wild Pitches.

For Batting, we would have the number of Hits, Doubles, Triples, Home Runs, RBIs, Walks, and Strikeouts. 

## Pitching Data

In [3]:
pitching_data = pd.read_csv('2023_complete_pitching_data.csv')

categorical_data = ['daynight', 'precip', 'sky', 'winddir']


pitching_inputs= ['left_field', 'center_field', 'right_field','min_wall_height','max_wall_height',
                                    'attendance','temp','windspeed','season_era', 'daynight_day', 'daynight_night', 'precip_drizzle', 'precip_none', 'precip_rain', 
                                    'precip_snow', 'sky_cloudy', 'sky_dome', 'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 'winddir_ltor', 
                                    'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 'winddir_unknown']

pitching_outputs = ['p_ipouts','p_h','p_r','p_er','p_w','p_hbp','p_wp']



pitching_data.loc[:, 'precip'] = pitching_data['precip'].fillna('none')
pitching_data = pitching_data.dropna()
pitching_data = pd.get_dummies(pitching_data, columns=categorical_data)


#post encoding: 
p_encoded_variables = ['daynight_day', 'daynight_night', 'precip_drizzle', 
                       'precip_none', 'precip_rain', 'precip_snow', 'sky_cloudy', 'sky_dome', 
                       'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 
                       'winddir_ltor', 'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 
                       'winddir_unknown']

#some columns were object classees
pitching_data[['left_field', 'center_field', 'min_wall_height']] = pitching_data[['left_field', 'center_field', 'min_wall_height']].astype(float)
pitching_data[p_encoded_variables] = pitching_data[p_encoded_variables].astype(int)


pitching_input_data = pitching_data[pitching_inputs]
pitching_output_data = pitching_data[pitching_outputs]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)  # Show all rows

print(pitching_input_data.info())  # Check data types
print(pitching_input_data.head())  # Check first few rows

<class 'pandas.core.frame.DataFrame'>
Index: 21042 entries, 0 to 21061
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   left_field       21042 non-null  float64
 1   center_field     21042 non-null  float64
 2   right_field      21042 non-null  int64  
 3   min_wall_height  21042 non-null  float64
 4   max_wall_height  21042 non-null  int64  
 5   attendance       21042 non-null  float64
 6   temp             21042 non-null  float64
 7   windspeed        21042 non-null  float64
 8   season_era       21042 non-null  float64
 9   daynight_day     21042 non-null  int64  
 10  daynight_night   21042 non-null  int64  
 11  precip_drizzle   21042 non-null  int64  
 12  precip_none      21042 non-null  int64  
 13  precip_rain      21042 non-null  int64  
 14  precip_snow      21042 non-null  int64  
 15  sky_cloudy       21042 non-null  int64  
 16  sky_dome         21042 non-null  int64  
 17  sky_overcast     

## Batting Data

In [4]:
batting_data = pd.read_csv('2023_complete_batting_data.csv')

batting_inputs= ['left_field', 'center_field', 'right_field','min_wall_height','max_wall_height',
                 'attendance','temp','windspeed','season_batting_avg', 'daynight_day', 'daynight_night', 'precip_drizzle', 'precip_none', 'precip_rain', 
                 'precip_snow', 'sky_cloudy', 'sky_dome', 'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 'winddir_ltor', 
                 'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 'winddir_unknown']

batting_outputs = ['b_ab','b_h', 'b_d','b_t','b_hr','b_rbi','b_w','b_k']


batting_data.loc[:, 'precip'] = batting_data['precip'].fillna('none')
batting_data = batting_data.dropna()
batting_data = pd.get_dummies(batting_data, columns=categorical_data)


#post encoding: 
b_encoded_variables = ['daynight_day', 'daynight_night', 'precip_drizzle', 
                       'precip_none', 'precip_rain', 'precip_snow', 'sky_cloudy', 'sky_dome', 
                       'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 
                       'winddir_ltor', 'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 
                       'winddir_unknown']

#some columns were object classees
batting_data[['left_field', 'center_field', 'min_wall_height']] = batting_data[['left_field', 'center_field', 'min_wall_height']].astype(float)
batting_data[p_encoded_variables] = batting_data[b_encoded_variables].astype(int)

batting_input_data = batting_data[batting_inputs]
batting_output_data = batting_data[batting_outputs]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)  # Show all rows

print(pitching_input_data.info())  # Check data types
print(pitching_input_data.head())  # Check first few rows

<class 'pandas.core.frame.DataFrame'>
Index: 21042 entries, 0 to 21061
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   left_field       21042 non-null  float64
 1   center_field     21042 non-null  float64
 2   right_field      21042 non-null  int64  
 3   min_wall_height  21042 non-null  float64
 4   max_wall_height  21042 non-null  int64  
 5   attendance       21042 non-null  float64
 6   temp             21042 non-null  float64
 7   windspeed        21042 non-null  float64
 8   season_era       21042 non-null  float64
 9   daynight_day     21042 non-null  int64  
 10  daynight_night   21042 non-null  int64  
 11  precip_drizzle   21042 non-null  int64  
 12  precip_none      21042 non-null  int64  
 13  precip_rain      21042 non-null  int64  
 14  precip_snow      21042 non-null  int64  
 15  sky_cloudy       21042 non-null  int64  
 16  sky_dome         21042 non-null  int64  
 17  sky_overcast     

  batting_data = pd.read_csv('2023_complete_batting_data.csv')


In [7]:
print(pitching_output_data.dtypes)
print(pitching_output_data.values.shape)

p_ipouts    int64
p_h         int64
p_r         int64
p_er        int64
p_w         int64
p_hbp       int64
p_wp        int64
dtype: object
(21042, 7)


# Implementing Neural Networks

To Tune or Neural Network (NN), we are using different numbers. To do so, we will use the gridsearch CV function to process our Data 

In [8]:
import sklearn as sk
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
import pickle
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, accuracy_score


In [9]:
def multioutput_accuracy(y_true, y_pred):
    # Compute accuracy for each target column and average them
    accuracies = [accuracy_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])]
    return np.mean(accuracies)

multioutput_scorer = make_scorer(multioutput_accuracy)

Firstly we will start with our pitching neural network. We start off with scaling our data and reducing our number of dimensions. From there, we will run it through our MLPClassifier Algorithm from Sci-kit learn. We will determine what hyperparameters work best for our neural network by using the GridSearchCV function to get a cross validation accuracy.

In [16]:
# Create Pipeline of processes to run through
pline = Pipeline([('scaling', sk.preprocessing.StandardScaler()), ('pca', PCA()),
                  ('nnet', MultiOutputClassifier(MLPClassifier(max_iter = 1000, early_stopping= True)))])

# Defines Parameters to Test
param_grid = {
    'pca__n_components':[5,10,15,20, 25, 30],
    'nnet__estimator__hidden_layer_sizes':[30, 45, 60],
    'nnet__estimator__activation': ['relu'],
    'nnet__estimator__alpha':[0.0001, 0.001]

}

gs_pitching_input_data = pitching_input_data.sample(10000, random_state=42)
gs_pitching_output_data = pitching_output_data.sample(10000, random_state=42)

# Grid Search + Scoring
gs = GridSearchCV(pline, param_grid, cv=5, scoring=multioutput_scorer, n_jobs=-1)

# Cross-validate using the subsampled data
pitching_nested_score = cross_val_score(gs, gs_pitching_input_data.values, gs_pitching_output_data.values, 
                                        cv=3,scoring=multioutput_scorer, n_jobs=-1)

print("Nested cross-validation scores:", pitching_nested_score)
print("Mean Accuracy: ", pitching_nested_score.mean() * 100)


30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = f

Nested cross-validation scores: [0.59983718 0.6047319  0.60421756]
Mean Accuracy:  60.29288805990891


After tuning our hyperparameters, we will now build our final model

In [17]:
# Extract the Best Parameters
gs.fit(pitching_input_data, pitching_output_data)
best_params = gs.best_params_
print(best_params)


Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 173, in pandas._libs.index.IndexEngine.get_loc
TypeError: '(slice(None, None, None), 0)' is an invalid key

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign *

{'nnet__estimator__activation': 'relu', 'nnet__estimator__alpha': 0.0001, 'nnet__estimator__hidden_layer_sizes': 30, 'pca__n_components': 5}


In [18]:
# Generate Final Algorithm

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    pitching_input_data.values,  # Ensure NumPy arrays
    pitching_output_data.values, 
    test_size=0.2, 
    random_state=42
)

# Generate Final Algorithm with the best parameters
final_model = Pipeline([
    ('scaling', sk.preprocessing.StandardScaler()), 
    ('pca', PCA(n_components=best_params['pca__n_components'])),
    ('nnet', MultiOutputClassifier(MLPClassifier(
        activation=best_params['nnet__estimator__activation'],
        hidden_layer_sizes=best_params['nnet__estimator__hidden_layer_sizes'],
        alpha=best_params['nnet__estimator__alpha'],
        max_iter=1000,
        early_stopping=True
    )))
])

# Train the final model on the training set
final_model.fit(X_train, y_train)
print("Final model training completed.")

y_pred = final_model.predict(X_test)
# Compute and print the accuracy
test_accuracy = multioutput_accuracy(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Generate detailed classification reports for each output variable
for i, col in enumerate(pitching_output_data.columns):
    print(f"Classification Report for {col}:")
    print(sk.metrics.classification_report(y_test[:, i], y_pred[:, i]))


Final model training completed.
Test Accuracy: 60.25%
Classification Report for p_ipouts:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.00      0.00      0.00       281
           2       0.00      0.00      0.00       380
           3       0.41      1.00      0.58      1734
           4       0.00      0.00      0.00       226
           5       0.00      0.00      0.00       146
           6       0.00      0.00      0.00       285
           7       0.00      0.00      0.00        40
           8       0.00      0.00      0.00        35
           9       0.00      0.00      0.00        81
          10       0.00      0.00      0.00        36
          11       0.00      0.00      0.00        35
          12       0.00      0.00      0.00        80
          13       0.00      0.00      0.00        40
          14       0.00      0.00      0.00        66
          15       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [23]:
with open("pitching_model.pkl", "wb") as f:
    pickle.dump(final_model, f)
print("Final model saved as 'pitching_model.pkl'.")

Final model saved as 'pitching_model.pkl'.


In [20]:
# Create Pipeline of processes to run through
pline2 = Pipeline([('scaling', sk.preprocessing.StandardScaler()), ('pca', PCA()),
                   ('nnet', MultiOutputClassifier(MLPClassifier(max_iter=1000, early_stopping=True)))])

# Defines Parameters to Test
param_grid2 = {
    'pca__n_components': [5, 10, 15, 20, 25, 30],
    'nnet__estimator__hidden_layer_sizes': [30, 45, 60],
    'nnet__estimator__activation': ['relu'],
    'nnet__estimator__alpha': [0.0001, 0.001]
}

# Subsample the data for grid search
gs_batting_input_data = batting_input_data.sample(10000, random_state=42)
gs_batting_output_data = batting_output_data.sample(10000, random_state=42)

# Grid Search + Scoring
gs2 = GridSearchCV(pline2, param_grid2, cv=5, scoring=multioutput_scorer, n_jobs=-1)

# Cross-validate using the subsampled data
batting_nested_score = cross_val_score(gs2, 
                                       gs_batting_input_data.values, 
                                       gs_batting_output_data.values, 
                                       cv=3, 
                                       scoring=multioutput_scorer, 
                                       n_jobs=-1)

print("Nested cross-validation scores:", batting_nested_score)
print("Mean Accuracy: ", batting_nested_score.mean() * 100)


30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = f

Nested cross-validation scores: [0.67610228 0.67127963 0.67394239]
Mean Accuracy:  67.37747672487706


In [21]:
# Extract the Best Parameters
gs2.fit(batting_input_data, batting_output_data)
best_params2 = gs2.best_params_
print(best_params2)

Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 173, in pandas._libs.index.IndexEngine.get_loc
TypeError: '(slice(None, None, None), 0)' is an invalid key

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign *

{'nnet__estimator__activation': 'relu', 'nnet__estimator__alpha': 0.0001, 'nnet__estimator__hidden_layer_sizes': 30, 'pca__n_components': 5}


In [22]:
# Split the dataset into training and testing sets
X_train2, X_test2, y_train2, y_test2 = sk.model_selection.train_test_split(
    batting_input_data.values,  # Ensure NumPy arrays
    batting_output_data.values, 
    test_size=0.2, 
    random_state=42
)

# Generate Final Algorithm with the best parameters
final_model2 = Pipeline([
    ('scaling', sk.preprocessing.StandardScaler()), 
    ('pca', PCA(n_components=best_params2['pca__n_components'])),
    ('nnet', MultiOutputClassifier(MLPClassifier(
        activation=best_params2['nnet__estimator__activation'],
        hidden_layer_sizes=best_params2['nnet__estimator__hidden_layer_sizes'],
        alpha=best_params2['nnet__estimator__alpha'],
        max_iter=1000,
        early_stopping=True
    )))
])

# Train the final model on the training set
final_model2.fit(X_train2, y_train2)
print("Final model training completed.")

# Test the model and compute predictions
y_pred2 = final_model2.predict(X_test2)

# Compute and print the accuracy
test_accuracy2 = multioutput_accuracy(y_test2, y_pred2)
print(f"Test Accuracy: {test_accuracy2 * 100:.2f}%")

# Generate detailed classification reports for each output variable
for i, col in enumerate(batting_output_data.columns):
    print(f"Classification Report for {col}:")
    print(sk.metrics.classification_report(y_test2[:, i], y_pred2[:, i]))


Final model training completed.
Test Accuracy: 67.29%
Classification Report for b_ab:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       825
           2       0.00      0.00      0.00      1036
           3       0.25      0.01      0.01      2648
           4       0.42      0.99      0.59      4120
           5       0.00      0.00      0.00      1127
           6       0.00      0.00      0.00        56
           7       0.00      0.00      0.00         4

    accuracy                           0.42      9816
   macro avg       0.10      0.14      0.09      9816
weighted avg       0.24      0.42      0.25      9816

Classification Report for b_h:
              precision    recall  f1-score   support

           0       0.42      0.75      0.54      4087
           1       0.39      0.27      0.32      3731
           2       0.00      0.00      0.00      1564
           3       0.00      0.00      0.00       367
           4   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [24]:
with open("batting_model.pkl", "wb") as f:
    pickle.dump(final_model2, f)
print("Final model saved as 'batting_model.pkl'.")

Final model saved as 'batting_model.pkl'.
