# MLB Predictor Project

Group 21, Plotters for Success

Gerardo Skrut, Victor Gikunda, Mathew Huang

In [2]:
import pickle
from datetime import datetime

import chardet
import matplotlib
import pandas as pd
import seaborn as sn
import sklearn as sk
from meteostat import Point, Hourly
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler



## Data Cleaning

Before using the data for modeling, we cleaned and explored the raw data files.

In [13]:
# Batting data cleaning: reduce the raw 2023 batting file to the columns listed below.
columns_to_keep = [
    'gid', 'id', 'team',
    'b_pa', 'b_ab', 'b_h', 'b_d', 'b_t', 'b_hr', 'b_rbi', 'b_w', 'b_k',
    'date', 'wl',
]

# Read the complete raw file, then select only the listed columns.
df = pd.read_csv('2023batting.csv')
df_filtered = df.loc[:, columns_to_keep]

# Write the trimmed table to disk (no index column) and load it back, so
# `batting_data` holds exactly what is stored in the filtered CSV.
df_filtered.to_csv('2023_filtered_batting_data.csv', index=False)
batting_data = pd.read_csv('2023_filtered_batting_data.csv')

# Preview the first rows as the cell output.
batting_data.head()

Unnamed: 0,gid,id,team,b_pa,b_ab,b_h,b_d,b_t,b_hr,b_rbi,b_w,b_k,date,wl
0,BOS202303300,mullc002,BAL,6,4,1,0,0,0,1,2,1,20230330,w
1,BOS202303300,rutsa001,BAL,6,5,5,0,0,1,4,1,0,20230330,w
2,BOS202303300,santa003,BAL,6,6,2,1,0,0,0,0,2,20230330,w
3,BOS202303300,mcker001,BAL,0,0,0,0,0,0,0,0,0,20230330,w
4,BOS202303300,mounr001,BAL,6,4,1,1,0,0,1,2,0,20230330,w


In [11]:
# Pitching data cleaning: reduce the raw 2023 pitching file to the columns listed below.
columns_to_keep = [
    'gid', 'id', 'team',
    'p_seq', 'p_h', 'p_r', 'p_er', 'p_w', 'p_hbp', 'p_wp',
    'date', 'wl',
]

# Read the complete raw file, then select only the listed columns.
df = pd.read_csv('2023pitching.csv')
df_filtered = df.loc[:, columns_to_keep]

# Write the trimmed table to disk (no index column) and load it back, so
# `pitching_data` holds exactly what is stored in the filtered CSV.
df_filtered.to_csv('2023_filtered_pitching_data.csv', index=False)
pitching_data = pd.read_csv('2023_filtered_pitching_data.csv')

# Preview the first rows as the cell output.
pitching_data.head()

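We also filtered the game-information file, keeping the attendance, field-condition, weather, and umpire columns.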

In [15]:
# Game-info cleaning: reduce the raw 2023 game-information file to the columns listed below.
columns_to_keep = [
    'gid', 'attendance',
    'fieldcond', 'precip', 'sky', 'temp', 'winddir',
    'umplf', 'umprf',
]

# Read the complete raw file, then select only the listed columns.
df = pd.read_csv('2023gameinfo.csv')
df_filtered = df.loc[:, columns_to_keep]

# Write the trimmed table to disk (no index column) and load it back, so
# `game_data` holds exactly what is stored in the filtered CSV.
df_filtered.to_csv('2023_filtered_game_data.csv', index=False)
game_data = pd.read_csv('2023_filtered_game_data.csv')

# Preview the first rows as the cell output.
game_data.head()

Unnamed: 0,gid,attendance,fieldcond,precip,sky,temp,winddir,umplf,umprf
0,BOS202303300,36049,unknown,none,sunny,38,ltor,(none),(none)
1,CHN202303300,36054,unknown,none,sunny,42,rtol,(none),(none)
2,CIN202303300,44063,unknown,none,sunny,61,fromrf,(none),(none)
3,HOU202303300,43032,unknown,none,dome,73,unknown,(none),(none)
4,KCA202303300,38351,unknown,none,cloudy,67,rtol,(none),(none)


# Implementing Neural Networks

To tune our neural network (NN), we try a range of hyperparameter values. To do so, we use scikit-learn's `GridSearchCV` to search over them with cross-validation.

In [4]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In this portion, we use a pipeline that standardizes the features, reduces them with PCA, and feeds them to the neural network, and we test various hyperparameter combinations to determine which works best.

In [None]:
# Preprocessing + model pipeline: standardize the features, project them onto
# principal components, then classify with a multi-layer perceptron.
# StandardScaler and PCA are imported by name in the top import cell rather than
# reached through `sk.preprocessing` / `sk.decomposition`, which only resolve if
# those submodules happen to have been imported already.
# random_state is fixed on the stochastic steps so reruns give the same scores.
pline = Pipeline([
    ('scaling', StandardScaler()),
    ('pca', PCA(random_state=42)),
    ('nnet', MLPClassifier(random_state=42)),
])

# Hyperparameter grid; keys follow the `<step name>__<parameter>` convention.
# NOTE(review): n_components up to 18 assumes data_x has at least 18 features
# -- TODO confirm.
param_grid = {
    'pca__n_components': list(range(5, 19)),       # 5 through 18 components
    'nnet__hidden_layer_sizes': [30, 40, 50, 60],  # a single int = one hidden layer of that width
    'nnet__activation': ['tanh', 'relu'],
    'nnet__alpha': [0.0001, 0.05],
}

# Inner loop: 5-fold grid search that picks the combination with the best accuracy.
gs = GridSearchCV(pline, param_grid, cv=5, scoring='accuracy')

# Outer loop (nested cross-validation): the whole grid search is re-run inside
# each of 5 outer folds, so the reported scores come from data the search never saw.
# NOTE(review): data_x and labels are not defined in the cells shown here --
# presumably created in an earlier cell; confirm they exist on a fresh kernel.
nested_score = sk.model_selection.cross_val_score(gs, data_x, labels, cv=5)

# Per-fold scores, then their mean expressed as a percentage.
print(nested_score)
print("Accuracy: ", nested_score.mean()*100)

After tuning our hyperparameters, we now build our final model using the best values found.

In [None]:
# Run the grid search on the full dataset and report the winning combination.
# NOTE(review): data_x and labels are not defined in the cells shown here --
# presumably created in an earlier cell; confirm they exist on a fresh kernel.
gs.fit(data_x, labels)
best_params = gs.best_params_
print(best_params)

# Pull out every tuned value. Indexing (rather than .get) fails loudly if a key
# is missing instead of silently passing None into the estimators.
n_components = best_params['pca__n_components']
layer_size = best_params['nnet__hidden_layer_sizes']
activation_func = best_params['nnet__activation']
model_alpha = best_params['nnet__alpha']

# Rebuild the pipeline with the same steps that were tuned. The PCA step is
# included because the network hyperparameters were selected jointly with
# pca__n_components; dropping it would yield a model the search never evaluated.
# random_state is fixed on the stochastic steps so reruns give the same model.
# This cell only constructs final_model; it does not fit it.
final_model = Pipeline([
    ('scaling', StandardScaler()),
    ('pca', PCA(n_components=n_components, random_state=42)),
    ('nnet', MLPClassifier(activation=activation_func,
                           hidden_layer_sizes=layer_size,
                           alpha=model_alpha,
                           random_state=42)),
])

