# MLB Predictor Project

Group 21, Plotters for Success

Gerardo Skrut, Victor Gikunda, Mathew Huang

In [2]:
import pickle
from datetime import datetime

import chardet
import matplotlib
import pandas as pd
import seaborn as sn
import sklearn as sk
from meteostat import Point, Hourly
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler



In [3]:
# Request a small window of hourly weather observations:
# midnight through 01:00 on 2018-01-01 (both endpoints are returned).
start = datetime(2018, 1, 1)
end = datetime(2018, 1, 1, hour=1)

# The query is keyed by a station ID string, not by a lat/lon Point.
# NOTE(review): confirm which location station "72219" corresponds to.
data = Hourly("72219", start, end).fetch()

print(data)

                     temp  dwpt  rhum  prcp  snow   wdir  wspd  wpgt    pres  \
time                                                                           
2018-01-01 00:00:00   0.6  -5.6  63.0   NaN   NaN  340.0  18.4   NaN  1024.6   
2018-01-01 01:00:00  -1.1  -6.6  66.0   0.0   NaN  320.0  16.6   NaN  1025.4   

                     tsun  coco  
time                             
2018-01-01 00:00:00   NaN   NaN  
2018-01-01 01:00:00   NaN   NaN  


## Data Cleaning

In [15]:
# Load the ballpark table; pandas takes the column names from the file's first row by default.
ballpark_data = pd.read_csv("ballparks.csv")
ballpark_data.head()

Unnamed: 0,team_name,ballpark,left_field,center_field,right_field,min_wall_height,max_wall_height,hr_park_effects,extra_distance,avg_temp,elevation,roof,daytime
0,ATL,Truist Park,335,400,325,11.0,15,99,1.8,79.2,1001,0.0,0.31
1,AZ,Chase Field,328,407,335,7.6,25,84,13.2,80.8,1086,0.81,0.31
2,BAL,Oriole Park at Camden Yards,333,400,318,7.0,21,107,-0.5,76.4,33,0.0,0.35
3,BOS,Fenway Park,310,420,302,3.0,37,102,-4.7,69.5,21,0.0,0.31
4,CHC,Wrigley Field,355,400,353,11.5,15,97,-3.4,70.2,595,0.0,0.55


Our batting and pitching stats are not in standard CSV form because their fields are separated by semicolons rather than commas. We therefore treat each file as plain text: we first detect its character encoding, then import it as a CSV with `;` as the delimiter.

In [11]:
# Guess the file's text encoding from its raw bytes before handing it to pandas.
with open('batting_stats.csv', mode='rb') as raw_file:
    result = chardet.detect(raw_file.read())
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [10]:
# Fields are semicolon-separated, and the encoding is the one chardet reported for this file.
batting_data = pd.read_csv(
    'batting_stats.csv',
    sep=';',
    encoding='ISO-8859-1',
)
batting_data.head()


Unnamed: 0,Rk,Name,Age,Tm,Lg,G,PA,AB,R,H,...,OBP,SLG,OPS,OPS+,TB,GDP,HBP,SH,SF,IBB
0,1,CJ Abrams*,22,WSN,NL,89,340,316,47,82,...,0.306,0.434,0.739,105,137,5,8,3,0,0
1,2,José Abreu,36,HOU,AL,95,400,368,33,90,...,0.293,0.353,0.646,79,130,11,3,0,5,1
2,3,Ronald Acuna Jr.,25,ATL,NL,97,446,391,86,129,...,0.408,0.578,0.986,160,226,7,4,0,2,2
3,4,Willy Adames,27,MIL,NL,89,383,336,44,71,...,0.291,0.411,0.702,90,138,9,3,0,5,0
4,5,Riley Adams,27,WSN,NL,23,87,79,4,22,...,0.337,0.506,0.844,133,40,4,1,1,0,0


In [12]:
# Guess the file's text encoding from its raw bytes before handing it to pandas.
with open('pitching_stats.csv', mode='rb') as raw_file:
    result = chardet.detect(raw_file.read())
print(result)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}


In [13]:
# Fields are semicolon-separated, and the encoding is the one chardet reported for this file.
pitching_data = pd.read_csv(
    'pitching_stats.csv',
    sep=';',
    encoding='ISO-8859-1',
)
pitching_data.head()


Unnamed: 0,Rk,Name,Age,Tm,Lg,W,L,W-L%,ERA,G,...,WP,BF,ERA+,FIP,WHIP,H9,HR9,BB9,SO9,SO/W
0,1,Fernando Abad*,37,COL,NL,1,0,1.0,4.26,6,...,0,32,124,8.18,2.211,15.6,2.8,4.3,2.8,0.67
1,2,Andrew Abbott*,24,CIN,NL,5,2,0.714,2.1,9,...,0,213,226,4.08,0.916,5.3,1.3,2.9,9.2,3.17
2,3,Cory Abbott,27,WSN,NL,0,1,0.0,5.49,10,...,1,85,77,5.83,1.475,9.2,1.8,4.1,7.3,1.78
3,4,Albert Abreu,27,NYY,AL,2,2,0.5,4.14,33,...,2,182,103,4.69,1.331,7.4,1.3,4.6,10.0,2.19
4,5,Bryan Abreu,26,HOU,AL,3,2,0.6,2.72,47,...,1,192,155,3.42,1.144,6.6,1.2,3.7,13.4,3.63


# Implementing Neural Networks

To tune our neural network (NN), we try several candidate values for its hyperparameters. To do so, we use scikit-learn's `GridSearchCV`, which evaluates every combination in the grid with cross-validation on our data.

In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In this portion, we use a pipeline to standardize the input features before they are passed to the neural network, and we test various hyperparameter combinations to determine which one works best.

In [None]:
# NOTE(review): data_x (features) and labels (targets) are not defined in any
# cell shown above; they must be created before this cell can run.

# Fixed seed so the MLP's random weight initialisation -- and therefore the
# reported accuracy -- is the same on every Restart & Run All.
RANDOM_STATE = 42

# Scaling lives inside the pipeline so that, within each CV split, the scaler
# is fitted on the training folds only and never sees the held-out fold.
pline = Pipeline([
    ('scaling', StandardScaler()),
    ('nnet', MLPClassifier(random_state=RANDOM_STATE)),
])

# Hyperparameters to search; the 'nnet__' prefix routes each one to the
# pipeline step named 'nnet'.
param_grid = {
    'nnet__hidden_layer_sizes': [30, 40, 50, 60],
    'nnet__activation': ['tanh', 'relu'],
    'nnet__alpha': [0.0001, 0.05],
}

# Inner loop: 5-fold grid search that picks the best combination by accuracy.
gs = GridSearchCV(pline, param_grid, cv=5, scoring='accuracy')

# Outer loop: 5-fold CV around the whole grid search (nested CV), so the
# accuracy below is measured on data the tuning step never used.
nested_score = cross_val_score(gs, data_x, labels, cv=5)

print(nested_score)
print("Accuracy: ", nested_score.mean()*100)

After tuning our hyperparameters, we will now build our final model

In [None]:
# Run the grid search on the full data set. GridSearchCV defaults to
# refit=True, so once fit() returns it holds the winning pipeline retrained
# on all of data_x / labels.
gs.fit(data_x, labels)
best_params = gs.best_params_
print(best_params)

# Selected hyperparameters, kept under individual names for reporting and
# for any later cell that refers to them.
layer_size = best_params.get('nnet__hidden_layer_sizes')
activation_func = best_params.get('nnet__activation')
model_alpha = best_params.get('nnet__alpha')

# Take the already-fitted best pipeline as the final model. Constructing a
# new Pipeline from the parameters above would yield the same configuration
# but leave it unfitted, so it could not predict until trained again.
final_model = gs.best_estimator_

