# ML Baseball Prediction

## Imports

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import confusion_matrix
import seaborn as sns; sns.set()
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.metrics import mean_squared_error,r2_score, mean_absolute_error, accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import RandomizedSearchCV

## Data Processing
Gathering data from txt documents

In [2]:
!git clone https://github.com/mamarcus64/Baseball-Predictor.git

Cloning into 'Baseball-Predictor'...
remote: Enumerating objects: 1216, done.[K
remote: Total 1216 (delta 0), reused 0 (delta 0), pack-reused 1216[K
Receiving objects: 100% (1216/1216), 131.55 MiB | 9.25 MiB/s, done.
Resolving deltas: 100% (177/177), done.
Checking out files: 100% (1184/1184), done.


Preprocessing the results into DataFrames (do not run this more than once since it edits the files directly)



In [3]:
def insert(originalfile,string):
    with open(originalfile,'r') as f:
        with open('newfile.txt','w') as f2: 
            f2.write(string)
            f2.write(f.read())
    os.rename('newfile.txt',originalfile)
s = "feature 0"
for i in range(1,316):
    s += str(",feature " + str(i))
s += '\n'
insert('Baseball-Predictor/data/features.txt', s)
insert('Baseball-Predictor/data/labels.txt', 'labels\n')

Reading into dataframes, cleaning, and splitting

In [4]:
features = pd.read_csv('Baseball-Predictor/data/features.txt')
labels = pd.read_csv('Baseball-Predictor/data/labels.txt')
features = features.replace('\[|\]','', regex=True)
features = features.astype('float64')
labels = labels.astype('int32')
features = features.drop(columns=['feature 315', 'feature 314'])
Xtrain, Xtest, ytrain, ytest = train_test_split(features, labels, random_state=0)

  interactivity=interactivity, compiler=compiler, result=result)


## ML Models


### Random Forest Classifier

In [5]:
forest_model = RandomForestClassifier(n_estimators=1000)
forest_model.fit(Xtrain, ytrain)
forestpred = forest_model.predict(Xtest)
forest_accuracy = accuracy_score(ytest, forestpred)
forest_accuracy

  


0.5275163957798689

To optimize hyperparameters using randomized grid search. This takes about 1.5 hours to run on the colab GPU.

In [6]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 20, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(Xtrain, ytrain)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


KeyboardInterrupt: ignored

In [7]:
rf_random.best_params_ # Print out the best hyperparameters

AttributeError: ignored

This prints out:

{'bootstrap': True,
 'max_depth': 10,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 5,
 'n_estimators': 200}


In [None]:
rf_pred = rf_random.predict(Xtest)
rf_accuracy = accuracy_score(ytest, rf_pred)
rf_accuracy

0.5489021956087824

Which beats the default accuracy by

In [None]:
forest_optimization_improvement = rf_accuracy - forest_accuracy
forest_optimization_improvement

0.02338180781294552

### Elastic Net

In [None]:
enet_model = ElasticNet().fit(Xtrain, ytrain) # default model
elasticpred = enet_model.predict(Xtest)
elastic_accuracy = 1 - mean_absolute_error(ytest, elasticpred)
elastic_accuracy

  coef_, l1_reg, l2_reg, X, y, max_iter, tol, rng, random, positive


0.5072314205333175

In [None]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# define model
ratios = np.arange(0, 1, 0.01)
alphas = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0]
model = ElasticNetCV(l1_ratio=ratios, alphas=alphas, cv=cv, n_jobs=-1)
# fit model
model.fit(Xtrain, ytrain)

Checking the optimization accuracy, it actually got worse somehow, so let's ignore this one...

In [None]:
elastic_optimized = model.predict(Xtest)
optimized_elastic_accuracy = 1 - mean_absolute_error(ytest, elastic_optimized)
optimized_elastic_accuracy

0.5067068751295367

In [None]:
elastic_optimization_improvement = optimized_elastic_accuracy - elastic_accuracy
elastic_optimization_improvement

-0.000524545403780774

###XGBoost

In [None]:
xgb_model = XGBClassifier()
xgb_model.fit(Xtrain, ytrain)
xgbpred = xgb_model.predict(Xtest)
xgb_accuracy = accuracy_score(ytest, xgbpred)
xgb_accuracy

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0.5406330196749358

In [None]:
test_params = {
 'max_depth':[4,8,12]
}

xgb_optimized_model = GridSearchCV(estimator = xgb_model,param_grid = test_params)
xgb_optimized_model.fit(Xtrain, ytrain)
xgb_optimized_model.best_params_

In [None]:
optimized_xgbpred = xgb_model.predict(Xtest)
optimized_xgb_accuracy = accuracy_score(ytest, optimized_xgbpred)
optimized_xgb_accuracy

0.5406330196749358

## Comparison

Constructing dataframe of results

In [None]:
results = pd.DataFrame({'Random Forest': [np.sqrt(mean_squared_error(ytest,forestpred)),
                                          mean_absolute_error(ytest, forestpred),
                                          r2_score(ytest, forestpred),
                                          accuracy_score(ytest, forestpred)], 
                        'ElasticNet': [np.sqrt(mean_squared_error(ytest,elasticpred)),
                                          mean_absolute_error(ytest, elasticpred),
                                          r2_score(ytest, elasticpred), 
                                       1 - mean_absolute_error(ytest, elasticpred)],
                        'XGBoost': [np.sqrt(mean_squared_error(ytest,xgbpred)),
                                          mean_absolute_error(ytest, xgbpred),
                                          r2_score(ytest, xgbpred),
                                          accuracy_score(ytest, xgbpred)]})
results = results.rename(index={0: "Mean Squared Error",
                                1: "Mean Absolute Error", 
                                2: "r^2 Error", 
                                3: "Accuracy Score"})
results

Unnamed: 0,Random Forest,ElasticNet,XGBoost
Mean Squared Error,0.686544,0.49683,0.677766
Mean Absolute Error,0.471343,0.492769,0.459367
r^2 Error,-0.895712,0.007227,-0.847545
Accuracy Score,0.528657,0.507231,0.540633


In [None]:
import pandas as pd
optimized_results = pd.DataFrame({'Random Forest': [0.5489021956087824,
                                                    abs(0.5255203877958369 - 
                                                    0.5489021956087824)], 
                                  'ElasticNet': ['N/A',
                                                    'N/A'],
                                  'XGBoost': [0.5508982035928144,
                                                  0.5508982035928144 - 
                                              0.5406330196749358]}) #TODO fix me
optimized_results = optimized_results.rename(index={0: "Accuracy Score", 
                                                    1: "Improvement"})
optimized_results.to_latex

<bound method NDFrame.to_latex of                 Random Forest ElasticNet   XGBoost
Accuracy Score       0.548902        N/A  0.550898
Improvement          0.023382        N/A  0.010265>

**Best Hyperparameters**
 * Random Forest
  * bootstrap: True
  * max_depth: 10
  * max_features: 'sqrt'
  * min_samples_leaf: 2
  * min_samples_split: 5
  * n_estimators: 200
 * ElasticNet
  * Any change from default hyperparameters decreases the accuracy of the model
 * XGBoost
  * max_depth=2
  * n_estimators=98