# MLB Predictor Project

Group 21, Plotters for Success

Gerardo Skrut, Victor Gikunda, Mathew Huang

In [11]:
import pandas as pd
import seaborn as sn
import numpy as np
import csv
from datetime import datetime

## Data Cleaning

Prior to importing the data, we cleaned and explored the existing data.

Finally, After we consolidated two datasets with pitching and batting, we are going to separate each portion to inputs and outputs respectively. 

Our inputs overall would include Left Field, Right Field, and Centerfield Distance, Maximum and minimum wall height, Day/night, Attendance, Precipitation, Sky Condition, Temperature, Wind Direction, and Wind Speed. 

For Pitching specifically, we will be using the pitcher's **Season ERA** from the 2023 Season. 

For Batting Specifically, we will be using the batter's **Season Batting Average** from the 2023 Season.

Our outputs would be game specific statistics. 

For Pitching, we would have the number of Hits Allowed, Runs Allowed, Earned Runs, Walks Given, Hit by Pitches, and Wild Pitches.

For Batting, we would have the number of Hits, Doubles, Triples, Home Runs, RBIs, Walks, and Strikeouts. 

### Ballpark Dataset Cleaning

In [25]:
# Load ballparks data
data = pd.read_csv('ballparks.csv')

# Filter for relevant columns
columns_to_keep = ['team_name', 'ballpark', 'left_field', 'center_field', 'right_field', 'min_wall_height', 'max_wall_height']
data_filtered = data[columns_to_keep]

# Calculate average for numeric columns
average_values = data_filtered[['left_field', 'center_field', 'right_field', 'min_wall_height', 'max_wall_height']].mean()

# Print the average values
print("Average Column Values:")
print(average_values)

# Save the filtered data
data_filtered.to_csv('2023_filtered_ballpark_data.csv', index=False)


Average Column Values:
left_field         331.833333
center_field       404.166667
right_field        328.333333
min_wall_height      7.553333
max_wall_height     14.266667
dtype: float64


### Game Information Data Cleaning

In [26]:
# Load game info data
data = pd.read_csv('2023gameinfo.csv')

# Filter for relevant columns
columns_to_keep = ['gid', 'daynight', 'attendance', 'precip', 'sky', 'temp', 'winddir', 'windspeed']
data_filtered = data[columns_to_keep]

# Calculate average for numeric columns
numeric_columns = ['attendance', 'temp', 'windspeed']
average_values = data_filtered[numeric_columns].mean()

# Find the most frequent value for categorical columns
categorical_columns = ['daynight', 'precip', 'sky', 'winddir']
most_frequent_values = data_filtered[categorical_columns].mode().iloc[0]

# Print results
print("Average Values for Numeric Columns:")
print(average_values)
print("\nMost Frequent Values for Categorical Columns:")
print(most_frequent_values)

# Save filtered data
data_filtered.to_csv('2023_filtered_gameinfo_data.csv', index=False)


Average Values for Numeric Columns:
attendance    29356.347087
temp             72.413835
windspeed         6.466828
dtype: float64

Most Frequent Values for Categorical Columns:
daynight      night
precip         none
sky          cloudy
winddir     unknown
Name: 0, dtype: object


### Batting Data Cleaning

In [14]:
'''filters for relevant batting player data'''
# load csv file
data = pd.read_csv('2023batting.csv')

# filters for columns with relevant data
columns_to_keep = ['gid', 'id', 'team', 'b_ab', 'b_h', 'b_d', 'b_t', 'b_hr', 'b_rbi', 'b_w', 'b_k', 'date', 'wl']  

# creates a new data frame
data_filtered = data[columns_to_keep]

# save new csv file
data_filtered.to_csv('2023_filtered_batting_data.csv', index=False)

In [15]:
'''Dataset includes batting data for players who do not bat, this filters out those players who had zero at bats (i.e. pitchers)'''

updated_rows = []
with open('2023_filtered_batting_data.csv', 'r') as data_file:  
    data_reader = csv.reader(data_file)
    header = next(data_reader)  
    updated_rows.append(header)

    # reads each row
    for row in data_reader:
        # checks if players number of plate appearances is zero
        if int(row[3]) == 0:
            # skips if plate appearance is equal to zero  
            continue  

        # appends data if not    
        updated_rows.append(row)

# creates a new csv file with updated data
with open('2023_batting_data_cleaned.csv', 'w', newline='') as updated_file:
    writer = csv.writer(updated_file)
    writer.writerows(updated_rows)

In [16]:
'''Merges relevant data into a one csv file'''

# loads ball park data into a dictionary
stadium_data = {}
with open('2023_filtered_ballpark_data.csv', 'r') as stadium_file:  
    stadium_reader = csv.DictReader(stadium_file)
    for row in stadium_reader:
        # uses the team name as the key for each entry
        team_name = row['team_name']  
        # stores the data for the corresponding team
        stadium_data[team_name] = row  

# loads game info data into a dictionary       
gameinfo_data = {}
with open('2023_filtered_gameinfo_data.csv', 'r') as gameinfo_file:  
    gameinfo_reader = csv.DictReader(gameinfo_file)
    for row in gameinfo_reader:
        # uses the game id as the key for each entry
        gid = row['gid']  
        # stores the data for the corresponding game id 
        gameinfo_data[gid] = row  
        

# read batting data, merge with ball park data, store updated rows
updated_rows = []
with open('2023_batting_data_cleaned.csv', 'r') as game_log_file: 
    batting_log = csv.reader(game_log_file)
    # captures the header row
    header = next(batting_log)

    # extracts column names of the ball park data
    stadium_columns = list(stadium_data.values())[0].keys() 
    # extracts column names of the game info data, excluding 'gid' since it already exist in the dataset 
    gameinfo_columns = list(gameinfo_data.values())[0].keys()
    new_gameinfo_columns = []
    
    for col in gameinfo_columns:
        if col != 'gid':
            new_gameinfo_columns.append(col)

    # appens the original header with additional stadium and game info columns           
    updated_rows.append(header + list(stadium_columns) + new_gameinfo_columns)          

    # iterates through each row of the batting log data      
    for row in batting_log:
        
        # extracts game id
        gid = row[0] 
        # extracts first three letters of the game id, which is the team id
        team_id = gid[:3]

        # checks if the team id exists in the stadium data        
        if team_id in stadium_data:
            # retieves ball park data for that team and appends it to the row
            # this can be done because the first three characters of the game id represent the team id of the home team 
            stadium_info = stadium_data[team_id]
            row.extend(stadium_info.values())
        
        # checks if game id is in the game info data
        if gid in gameinfo_data:
            
            # retrieves game info data to corresponding game id
            gameinfo = gameinfo_data[gid]
            updated_gameinfo = []
            # removes the duplicate game id column and appends the other values
            for key, value in gameinfo.items():
                if key != 'gid':
                    updated_gameinfo.append(value)
            
            # adds game info data to current row
            row.extend(updated_gameinfo)

        # adds new data to row   
        updated_rows.append(row)


# new csv file output with updated data
with open('2023_merged_batting_data.csv', 'w', newline='') as updated_file:
    writer = csv.writer(updated_file)
    writer.writerows(updated_rows)

In [17]:
'''adds a row with calculated season average for each player since the data set only includes specific game log data'''
 
# Dictionary to store season totals
season_avg_data = {}

# Process input file to calculate season totals for each player
with open('2023_merged_batting_data.csv', 'r') as stat_file:
    stat_reader = csv.DictReader(stat_file)
    header = stat_reader.fieldnames + ['season_batting_avg']

    for row in stat_reader:
        name = row['id']
        bats = int(row['b_ab'])
        hits = int(row['b_h'])

        # Update season totals for the player
        if name in season_avg_data:
            season_avg_data[name]['total_at_bats'] += bats
            season_avg_data[name]['total_hits'] += hits
        else:
            season_avg_data[name] = {'total_at_bats': bats, 'total_hits': hits}

# Prepare updated rows with calculated season batting averages
updated_rows = []
with open('2023_merged_batting_data.csv', 'r') as stat_file:
    stat_reader = csv.DictReader(stat_file)
    for row in stat_reader:
        name = row['id']
        total_bats = season_avg_data[name]['total_at_bats']
        total_hits = season_avg_data[name]['total_hits']

        # Calculate player's season batting average
        batting_average = total_hits / total_bats
        row['season_batting_avg'] = f"{batting_average:.3f}"

        updated_rows.append(row)

# Write updated data to a new CSV file
with open('2023_complete_batting_data.csv', 'w', newline='') as updated_file:
    writer = csv.DictWriter(updated_file, fieldnames=header)
    writer.writeheader()
    writer.writerows(updated_rows)

# Calculate overall average batting average (NEW ADDITION)
batting_averages = [
    stats['total_hits'] / stats['total_at_bats']
    for stats in season_avg_data.values()
    if stats['total_at_bats'] > 0  # Avoid division by zero
]

average_batting_avg = sum(batting_averages) / len(batting_averages) if batting_averages else 0

# Print the overall average batting average (does not affect functionality)
print(f"Average Batting Average: {average_batting_avg:.3f}")

Average Batting Average: 0.226


In [18]:
'''formats finalzied dataset properly'''

# creates a mapping of player IDs to their full names
name_id_map = {}
with open('2023allplayers.csv', 'r') as name_file:  
    name_reader = csv.DictReader(name_file)
    for row in name_reader:
        # combines first and last name columns to form the full name and maps it respectively 
        name_id_map[row['id']] = f"{row['first']} {row['last']}"

# reads the baseball data file and replace IDs with full names
updated_rows = []
with open('2023_complete_batting_data.csv', 'r') as data_file: 
    data_reader = csv.reader(data_file)
    header = next(data_reader)  
    updated_rows.append(header)

    # iterates through each row of the data
    for row in data_reader:
        
        # converts the date format from 'YYYYMMDD' to 'MM/DD/YYYY'
        date_str = row[11]
        date_format = datetime.strptime(date_str, '%Y%m%d').strftime('%m/%d/%Y')
        row[11] = date_format
        
        # replaces the player id with the player's full name
        player_id = row[1] 
        if player_id in name_id_map:
            row[1] = name_id_map[player_id] 

        # adds the updated row
        updated_rows.append(row)

# writes the updated csv file with all data and cleaned
with open('2023_full_batting_stats_cleaned.csv', 'w', newline='') as updated_file:
    writer = csv.writer(updated_file)
    writer.writerows(updated_rows)

In [19]:
final_csv = '2023_full_batting_stats_cleaned.csv'

data = pd.read_csv(final_csv)

print(data.head())

            gid                 id team  b_ab  b_h  b_d  b_t  b_hr  b_rbi  \
0  BOS202303300     Cedric Mullins  BAL     4    1    0    0     0      1   
1  BOS202303300    Adley Rutschman  BAL     5    5    0    0     1      4   
2  BOS202303300  Anthony Santander  BAL     6    2    1    0     0      0   
3  BOS202303300   Ryan Mountcastle  BAL     4    1    1    0     0      1   
4  BOS202303300   Gunnar Henderson  BAL     3    0    0    0     0      0   

   b_w  ...  min_wall_height max_wall_height daynight attendance precip  \
0    2  ...              3.0              37      day    36049.0   none   
1    1  ...              3.0              37      day    36049.0   none   
2    0  ...              3.0              37      day    36049.0   none   
3    2  ...              3.0              37      day    36049.0   none   
4    2  ...              3.0              37      day    36049.0   none   

     sky  temp  winddir windspeed  season_batting_avg  
0  sunny  38.0     ltor      1

  data = pd.read_csv(final_csv)


### Pitching Data Cleaning

In [20]:
'''filters for relevant player pitching data'''
# load csv file
data = pd.read_csv('2023pitching.csv')

# filters for columns with relevant data
columns_to_keep = ['gid', 'id', 'team', 'p_ipouts', 'p_seq', 'p_h', 'p_r', 'p_er', 'p_w', 'p_hbp', 'p_wp', 'date', 'wl']  

# creates a new data frame
data_filtered = data[columns_to_keep]

# save new csv file
data_filtered.to_csv('2023_filtered_pitching_data.csv', index=False)

In [21]:
'''merges pitcher game log data with ball park and game info data'''

# load ball park data into a dictionary
stadium_data = {}
with open('2023_filtered_ballpark_data.csv', 'r') as stadium_file:  
    stadium_reader = csv.DictReader(stadium_file)
    for row in stadium_reader:
        # extracts the team name column
        team_name = row['team_name'] 
        # uses team name as a key and the row as the value 
        stadium_data[team_name] = row  

# loads game info data into a dictionary        
gameinfo_data = {}
with open('2023_filtered_gameinfo_data.csv', 'r') as gameinfo_file:  
    gameinfo_reader = csv.DictReader(gameinfo_file)
    for row in gameinfo_reader:
        # extracts the 'gid' column
        gid = row['gid']  
        # uses the game id as a key and the row as the value
        gameinfo_data[gid] = row  
        

# read pitching data, merge with ball park data, store updated rows
updated_rows = []
with open('2023_filtered_pitching_data.csv', 'r') as game_log_file: 
    pitching_log = csv.reader(game_log_file)
    # captures the header row
    header = next(pitching_log)  


    # extarcts column names of ball park data
    stadium_columns = list(stadium_data.values())[0].keys()  
    # extracts column names of the game info data, excluding 'gid' since it already exist in the dataset
    gameinfo_columns = list(gameinfo_data.values())[0].keys()
    new_gameinfo_columns = []
    
    for col in gameinfo_columns:
        if col != 'gid':
            new_gameinfo_columns.append(col)

    # appends the original header with additional stadium and game info columns           
    updated_rows.append(header + list(stadium_columns) + new_gameinfo_columns)


    # interates through each row of the pitching log data
    for row in pitching_log:
        # extracts the game id
        gid = row[0]  
        # extracts the first three letter of the game id, which is the team id
        team_id = gid[:3] 

        # checks if the team id exists in the staidum data 
        if team_id in stadium_data:
            
            # retrieves he ball park data for that team and appends it to the row
            # this can be done because the first three character of the game id represent the team if of the home team
            stadium_info = stadium_data[team_id]
            row.extend(stadium_info.values())
        
        # checks if the game id is in the game info data 
        if gid in gameinfo_data:
            
            # retrieves game info data to corresponding game id 
            gameinfo = gameinfo_data[gid]
            updated_gameinfo = []
            # removes the duplicate game id column and appends the other values
            for key, value in gameinfo.items():
                if key != 'gid':
                    updated_gameinfo.append(value)

            # adds game info data to current row
            row.extend(updated_gameinfo)

        # adds new data to row   
        updated_rows.append(row)


# new csv file output with updated data
with open('2023_merged_pitching_data.csv', 'w', newline='') as updated_file:
    writer = csv.writer(updated_file)
    writer.writerows(updated_rows)

In [22]:
'''calculates the season ERA for each pitcher and merge the dataset'''
import csv

# Initialize dictionary to store season totals
season_avg_data = {}

# Process input file to calculate season ERA totals for each pitcher
with open('2023_merged_pitching_data.csv', 'r') as stat_file:
    stat_reader = csv.DictReader(stat_file)
    header = stat_reader.fieldnames + ['season_era']

    for row in stat_reader:
        name = row['id']
        outs = int(row['p_ipouts'])
        earned_runs = int(row['p_er'])

        # Update season totals for the pitcher
        if name in season_avg_data:
            season_avg_data[name]['total_innings'] += (outs / 3)
            season_avg_data[name]['total_earned_runs'] += earned_runs
        else:
            season_avg_data[name] = {'total_innings': (outs / 3), 'total_earned_runs': earned_runs}

# Prepare updated rows with calculated season ERA
updated_rows = []
with open('2023_merged_pitching_data.csv', 'r') as stat_file:
    stat_reader = csv.DictReader(stat_file)
    for row in stat_reader:
        name = row['id']
        innings = season_avg_data[name]['total_innings']
        earned_runs = season_avg_data[name]['total_earned_runs']

        # Calculate season ERA
        season_era = 9 * (earned_runs / innings)
        row['season_era'] = f"{season_era:.3f}"

        updated_rows.append(row)

# Write updated data to a new CSV file
with open('2023_complete_pitching_data.csv', 'w', newline='') as updated_file:
    writer = csv.DictWriter(updated_file, fieldnames=header)
    writer.writeheader()
    writer.writerows(updated_rows)

# Calculate overall average season ERA (NEW ADDITION)
season_eras = [
    9 * (stats['total_earned_runs'] / stats['total_innings'])
    for stats in season_avg_data.values()
    if stats['total_innings'] > 0  # Avoid division by zero
]

average_season_era = sum(season_eras) / len(season_eras) if season_eras else 0

# Print the overall average season ERA (does not affect functionality)
print(f"Average Season ERA: {average_season_era:.3f}")

Average Season ERA: 5.870


In [23]:
'''formats finalized dataset properly'''

# creates a mapping of player IDs to their full names
name_id_map = {}
with open('2023allplayers.csv', 'r') as name_file:  
    name_reader = csv.DictReader(name_file)
    for row in name_reader:
        # combines first and last name columns to form the full name and maps it respectively 
        name_id_map[row['id']] = f"{row['first']} {row['last']}"

# reads the baseball data file and replaces IDs with full names
updated_rows = []
with open('2023_complete_pitching_data.csv', 'r') as data_file: 
    data_reader = csv.reader(data_file)
    header = next(data_reader)  
    updated_rows.append(header)

    # iterates through each row of the data
    for row in data_reader:
        
        # converts the data format from 'YYYYMMDD' to 'MM/DD/YYYY'
        date_str = row[11]
        date_format = datetime.strptime(date_str, '%Y%m%d').strftime('%m/%d/%Y')
        row[11] = date_format
        
        # replaces the player id with the player's full name
        player_id = row[1] 
        if player_id in name_id_map:
            row[1] = name_id_map[player_id]  

        # adds the updated row
        updated_rows.append(row)

# writes the updated csv file all data and cleaned
with open('2023_full_pitching_stats_cleaned.csv', 'w', newline='') as updated_file:
    writer = csv.writer(updated_file)
    writer.writerows(updated_rows)



### Pitching Data Head

In [24]:
final_csv = '2023_full_pitching_stats_cleaned.csv'

data = pd.read_csv(final_csv)

print(data.head())

            gid               id team  p_ipouts  p_seq  p_h  p_r  p_er  p_w  \
0  BOS202303300      Kyle Gibson  BAL        15      1    6    4     4    1   
1  BOS202303300      Keegan Akin  BAL         3      2    1    0     0    0   
2  BOS202303300     Cionel Perez  BAL         3      3    0    0     0    0   
3  BOS202303300      Bryan Baker  BAL         2      4    2    3     3    1   
4  BOS202303300  Logan Gillaspie  BAL         1      5    0    0     0    0   

   p_hbp  ...  min_wall_height max_wall_height daynight attendance precip  \
0      1  ...              3.0              37      day    36049.0   none   
1      0  ...              3.0              37      day    36049.0   none   
2      0  ...              3.0              37      day    36049.0   none   
3      1  ...              3.0              37      day    36049.0   none   
4      0  ...              3.0              37      day    36049.0   none   

     sky  temp  winddir windspeed  season_era  
0  sunny  38.0

## Pre-input processing

Prior to using the cleaned data, we need to process the data into a more readable format. This would mean fully separating into inputs and outputs as well as converting any categorical variables into binaries. 

To do so, we use pd.get_dummies to "one-hot-encode" our categorical variables to not place too much importance on any given point.

## Pitching Data

In [23]:
pitching_data = pd.read_csv('2023_complete_pitching_data.csv')

categorical_data = ['daynight', 'precip', 'sky', 'winddir']


pitching_inputs= ['left_field', 'center_field', 'right_field','min_wall_height','max_wall_height',
                                    'attendance','temp','windspeed','season_era', 'daynight_day', 'daynight_night', 'precip_drizzle', 'precip_none', 'precip_rain', 
                                    'precip_snow', 'sky_cloudy', 'sky_dome', 'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 'winddir_ltor', 
                                    'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 'winddir_unknown']

pitching_outputs = ['p_ipouts','p_h','p_r','p_er','p_w','p_hbp','p_wp']



pitching_data.loc[:, 'precip'] = pitching_data['precip'].fillna('none')
pitching_data = pitching_data.dropna()
pitching_data = pd.get_dummies(pitching_data, columns=categorical_data)


#post encoding: 
p_encoded_variables = ['daynight_day', 'daynight_night', 'precip_drizzle', 
                       'precip_none', 'precip_rain', 'precip_snow', 'sky_cloudy', 'sky_dome', 
                       'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 
                       'winddir_ltor', 'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 
                       'winddir_unknown']

#some columns were object classees
pitching_data[['left_field', 'center_field', 'min_wall_height']] = pitching_data[['left_field', 'center_field', 'min_wall_height']].astype(float)
pitching_data[p_encoded_variables] = pitching_data[p_encoded_variables].astype(int)


pitching_input_data = pitching_data[pitching_inputs]
pitching_output_data = pitching_data[pitching_outputs]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)  # Show all rows

print(pitching_input_data.info())  # Check data types
print(pitching_input_data.head())  # Check first few rows

<class 'pandas.core.frame.DataFrame'>
Index: 21042 entries, 0 to 21061
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   left_field       21042 non-null  float64
 1   center_field     21042 non-null  float64
 2   right_field      21042 non-null  int64  
 3   min_wall_height  21042 non-null  float64
 4   max_wall_height  21042 non-null  int64  
 5   attendance       21042 non-null  float64
 6   temp             21042 non-null  float64
 7   windspeed        21042 non-null  float64
 8   season_era       21042 non-null  float64
 9   daynight_day     21042 non-null  int64  
 10  daynight_night   21042 non-null  int64  
 11  precip_drizzle   21042 non-null  int64  
 12  precip_none      21042 non-null  int64  
 13  precip_rain      21042 non-null  int64  
 14  precip_snow      21042 non-null  int64  
 15  sky_cloudy       21042 non-null  int64  
 16  sky_dome         21042 non-null  int64  
 17  sky_overcast     

## Batting Data

In [4]:
batting_data = pd.read_csv('2023_complete_batting_data.csv')

batting_inputs= ['left_field', 'center_field', 'right_field','min_wall_height','max_wall_height',
                 'attendance','temp','windspeed','season_batting_avg', 'daynight_day', 'daynight_night', 'precip_drizzle', 'precip_none', 'precip_rain', 
                 'precip_snow', 'sky_cloudy', 'sky_dome', 'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 'winddir_ltor', 
                 'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 'winddir_unknown']

batting_outputs = ['b_ab','b_h', 'b_d','b_t','b_hr','b_rbi','b_w','b_k']


batting_data.loc[:, 'precip'] = batting_data['precip'].fillna('none')
batting_data = batting_data.dropna()
batting_data = pd.get_dummies(batting_data, columns=categorical_data)


#post encoding: 
b_encoded_variables = ['daynight_day', 'daynight_night', 'precip_drizzle', 
                       'precip_none', 'precip_rain', 'precip_snow', 'sky_cloudy', 'sky_dome', 
                       'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 
                       'winddir_ltor', 'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 
                       'winddir_unknown']

#some columns were object classees
batting_data[['left_field', 'center_field', 'min_wall_height']] = batting_data[['left_field', 'center_field', 'min_wall_height']].astype(float)
batting_data[p_encoded_variables] = batting_data[b_encoded_variables].astype(int)

batting_input_data = batting_data[batting_inputs]
batting_output_data = batting_data[batting_outputs]
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)  # Show all rows

print(pitching_input_data.info())  # Check data types
print(pitching_input_data.head())  # Check first few rows

<class 'pandas.core.frame.DataFrame'>
Index: 21042 entries, 0 to 21061
Data columns (total 28 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   left_field       21042 non-null  float64
 1   center_field     21042 non-null  float64
 2   right_field      21042 non-null  int64  
 3   min_wall_height  21042 non-null  float64
 4   max_wall_height  21042 non-null  int64  
 5   attendance       21042 non-null  float64
 6   temp             21042 non-null  float64
 7   windspeed        21042 non-null  float64
 8   season_era       21042 non-null  float64
 9   daynight_day     21042 non-null  int64  
 10  daynight_night   21042 non-null  int64  
 11  precip_drizzle   21042 non-null  int64  
 12  precip_none      21042 non-null  int64  
 13  precip_rain      21042 non-null  int64  
 14  precip_snow      21042 non-null  int64  
 15  sky_cloudy       21042 non-null  int64  
 16  sky_dome         21042 non-null  int64  
 17  sky_overcast     

  batting_data = pd.read_csv('2023_complete_batting_data.csv')


For predictions, we are also going to use average valuesand modes. For numerical data we will use the mean. For categorical datas, we will use the mode.

# Implementing Neural Networks

To Tune or Neural Network (NN), we are using different numbers. To do so, we will use the gridsearch CV function to process our Data 

In [39]:
import sklearn as sk
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.multioutput import MultiOutputClassifier
import pickle
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, accuracy_score


To support our multi-output classifier, we needed to create a multioutput evaluation method to score our tests

In [24]:
def multioutput_accuracy(y_true, y_pred):
    # Compute accuracy for each target column and average them
    accuracies = [accuracy_score(y_true[:, i], y_pred[:, i]) for i in range(y_true.shape[1])]
    return np.mean(accuracies)

multioutput_scorer = make_scorer(multioutput_accuracy)

Firstly we will start with our pitching neural network. We start off with scaling our data and reducing our number of dimensions. From there, we will run it through our MLPClassifier Algorithm from Sci-kit learn. We will determine what hyperparameters work best for our neural network by using the GridSearchCV function to get a cross validation accuracy.

In [36]:
# Create Pipeline of processes to run through
pline = Pipeline([('scaling', sk.preprocessing.StandardScaler()), ('pca', PCA()),
                  ('nnet', MultiOutputClassifier(MLPClassifier( early_stopping= True)))])

# Defines Parameters to Test
param_grid = {
    # 'pca__n_components':[5,10,15,20, 25, 30],
    # 'nnet__estimator__hidden_layer_sizes':[30, 45, 60],
    # 'nnet__estimator__activation': ['relu'],
    # 'nnet__estimator__alpha':[0.0001, 0.001]
    'pca__n_components':list(range(3,10)),
    'nnet__estimator__hidden_layer_sizes':[10, 20,25, 30],
    'nnet__estimator__activation': ['relu'],
    'nnet__estimator__alpha':[0.0001, 0.00001],
    # 'nnet__estimator__max_iter':[500, 1000, 1500]
    'nnet__estimator__max_iter':[500]

}

gs_pitching_input_data = pitching_input_data.sample(10000, random_state=42)
gs_pitching_output_data = pitching_output_data.sample(10000, random_state=42)

# Grid Search + Scoring
gs = GridSearchCV(pline, param_grid, cv=5, scoring=multioutput_scorer, n_jobs=-1)

# Cross-validate using the subsampled data
pitching_nested_score = cross_val_score(gs, gs_pitching_input_data.values, gs_pitching_output_data.values, 
                                        cv=3,scoring=multioutput_scorer, n_jobs=-1)

print("Nested cross-validation scores:", pitching_nested_score)
print("Mean Accuracy: ", pitching_nested_score.mean() * 100)




KeyboardInterrupt: 

After tuning our hyperparameters, we will now build our final model

In [33]:
# Extract the Best Parameters
gs.fit(pitching_input_data, pitching_output_data)
best_params = gs.best_params_
print(best_params)


#{'nnet__estimator__activation': 'relu', 'nnet__estimator__alpha': 0.0001, 'nnet__estimator__hidden_layer_sizes': 30, 'pca__n_components': 5} From previous try

Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 173, in pandas._libs.index.IndexEngine.get_loc
TypeError: '(slice(None, None, None), 0)' is an invalid key

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign *

{'nnet__estimator__activation': 'relu', 'nnet__estimator__alpha': 0.0001, 'nnet__estimator__hidden_layer_sizes': 10, 'pca__n_components': 3}


In [18]:
# Generate Final Algorithm

X_train, X_test, y_train, y_test = sk.model_selection.train_test_split(
    pitching_input_data.values,  # Ensure NumPy arrays
    pitching_output_data.values, 
    test_size=0.2, 
    random_state=42
)

# Generate Final Algorithm with the best parameters
final_model = Pipeline([
    ('scaling', sk.preprocessing.StandardScaler()), 
    ('pca', PCA(n_components=best_params['pca__n_components'])),
    ('nnet', MultiOutputClassifier(MLPClassifier(
        activation=best_params['nnet__estimator__activation'],
        hidden_layer_sizes=best_params['nnet__estimator__hidden_layer_sizes'],
        alpha=best_params['nnet__estimator__alpha'],
        max_iter=1000,
        early_stopping=True
    )))
])

# Train the final model on the training set
final_model.fit(X_train, y_train)
print("Final model training completed.")

y_pred = final_model.predict(X_test)
# Compute and print the accuracy
test_accuracy = multioutput_accuracy(y_test, y_pred)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Generate detailed classification reports for each output variable
for i, col in enumerate(pitching_output_data.columns):
    print(f"Classification Report for {col}:")
    print(sk.metrics.classification_report(y_test[:, i], y_pred[:, i]))


Final model training completed.
Test Accuracy: 60.25%
Classification Report for p_ipouts:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        35
           1       0.00      0.00      0.00       281
           2       0.00      0.00      0.00       380
           3       0.41      1.00      0.58      1734
           4       0.00      0.00      0.00       226
           5       0.00      0.00      0.00       146
           6       0.00      0.00      0.00       285
           7       0.00      0.00      0.00        40
           8       0.00      0.00      0.00        35
           9       0.00      0.00      0.00        81
          10       0.00      0.00      0.00        36
          11       0.00      0.00      0.00        35
          12       0.00      0.00      0.00        80
          13       0.00      0.00      0.00        40
          14       0.00      0.00      0.00        66
          15       0.00      0.00      0.00  

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [23]:
with open("pitching_model.pkl", "wb") as f:
    pickle.dump(final_model, f)
print("Final model saved as 'pitching_model.pkl'.")

Final model saved as 'pitching_model.pkl'.


## Batting Model Training

In [20]:
# Create Pipeline of processes to run through
pline2 = Pipeline([('scaling', sk.preprocessing.StandardScaler()), ('pca', PCA()),
                   ('nnet', MultiOutputClassifier(MLPClassifier(max_iter=1000, early_stopping=True)))])

# Defines Parameters to Test
param_grid2 = {
    'pca__n_components': [5, 10, 15, 20, 25, 30],
    'nnet__estimator__hidden_layer_sizes': [30, 45, 60],
    'nnet__estimator__activation': ['relu'],
    'nnet__estimator__alpha': [0.0001, 0.001]
}

# Subsample the data for grid search
gs_batting_input_data = batting_input_data.sample(10000, random_state=42)
gs_batting_output_data = batting_output_data.sample(10000, random_state=42)

# Grid Search + Scoring
gs2 = GridSearchCV(pline2, param_grid2, cv=5, scoring=multioutput_scorer, n_jobs=-1)

# Cross-validate using the subsampled data
batting_nested_score = cross_val_score(gs2, 
                                       gs_batting_input_data.values, 
                                       gs_batting_output_data.values, 
                                       cv=3, 
                                       scoring=multioutput_scorer, 
                                       n_jobs=-1)

print("Nested cross-validation scores:", batting_nested_score)
print("Mean Accuracy: ", batting_nested_score.mean() * 100)


30 fits failed out of a total of 180.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 469, in fit
    Xt = self._fit(X, y, routed_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/pipeline.py", line 406, in _fit
    X, fitted_transformer = f

Nested cross-validation scores: [0.67610228 0.67127963 0.67394239]
Mean Accuracy:  67.37747672487706


In [21]:
# Extract the Best Parameters
gs2.fit(batting_input_data, batting_output_data)
best_params2 = gs2.best_params_
print(best_params2)

Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/pandas/core/indexes/base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 173, in pandas._libs.index.IndexEngine.get_loc
TypeError: '(slice(None, None, None), 0)' is an invalid key

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/mathew/.local/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign *

{'nnet__estimator__activation': 'relu', 'nnet__estimator__alpha': 0.0001, 'nnet__estimator__hidden_layer_sizes': 30, 'pca__n_components': 5}


In [22]:
# Split the dataset into training and testing sets
X_train2, X_test2, y_train2, y_test2 = sk.model_selection.train_test_split(
    batting_input_data.values,  # Ensure NumPy arrays
    batting_output_data.values, 
    test_size=0.2, 
    random_state=42
)

# Generate Final Algorithm with the best parameters
final_model2 = Pipeline([
    ('scaling', sk.preprocessing.StandardScaler()), 
    ('pca', PCA(n_components=best_params2['pca__n_components'])),
    ('nnet', MultiOutputClassifier(MLPClassifier(
        activation=best_params2['nnet__estimator__activation'],
        hidden_layer_sizes=best_params2['nnet__estimator__hidden_layer_sizes'],
        alpha=best_params2['nnet__estimator__alpha'],
        max_iter=1000,
        early_stopping=True
    )))
])

# Train the final model on the training set
final_model2.fit(X_train2, y_train2)
print("Final model training completed.")

# Test the model and compute predictions
y_pred2 = final_model2.predict(X_test2)

# Compute and print the accuracy
test_accuracy2 = multioutput_accuracy(y_test2, y_pred2)
print(f"Test Accuracy: {test_accuracy2 * 100:.2f}%")

# Generate detailed classification reports for each output variable
for i, col in enumerate(batting_output_data.columns):
    print(f"Classification Report for {col}:")
    print(sk.metrics.classification_report(y_test2[:, i], y_pred2[:, i]))


Final model training completed.
Test Accuracy: 67.29%
Classification Report for b_ab:
              precision    recall  f1-score   support

           1       0.00      0.00      0.00       825
           2       0.00      0.00      0.00      1036
           3       0.25      0.01      0.01      2648
           4       0.42      0.99      0.59      4120
           5       0.00      0.00      0.00      1127
           6       0.00      0.00      0.00        56
           7       0.00      0.00      0.00         4

    accuracy                           0.42      9816
   macro avg       0.10      0.14      0.09      9816
weighted avg       0.24      0.42      0.25      9816

Classification Report for b_h:
              precision    recall  f1-score   support

           0       0.42      0.75      0.54      4087
           1       0.39      0.27      0.32      3731
           2       0.00      0.00      0.00      1564
           3       0.00      0.00      0.00       367
           4   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

In [24]:
with open("batting_model.pkl", "wb") as f:
    pickle.dump(final_model2, f)
print("Final model saved as 'batting_model.pkl'.")

Final model saved as 'batting_model.pkl'.


## Using the Model

After tuning and training the model, we will now use the model. You can load in the model by using the Pickle load method.

In [40]:
pitching_model = pickle.load(open('pitching_model.pkl', 'rb'))
print("Pitching model imported from 'pitching_model.pkl' as pitching_model")

Pitching model imported from 'pitching_model.pkl' as pitching_model


## Creation of the Player Class

We created the base player class to have the player object contain all the non-player based classification data and then have it separate with subclasses as pitchers and batters. We included default values for environmental and player stats.

For batters, we have an average batting average of 0.226

For pitchers, we have an average season ERA of 5.870

For environmental variables, we used either the mean of numerical values or used the mode of categorical variables. 
This provided us with stadium values of the following:

Average Column Values:  
left_field         331.833333  
center_field       404.166667  
right_field        328.333333  
min_wall_height      7.553333  
max_wall_height     14.266667  
  
We also got game variables of the following:  

Average Values for Numeric Columns:  
attendance    29356.347087  
temp             72.413835  
windspeed         6.466828  
  
Most Frequent Values for Categorical Columns:  
daynight      night  
precip         none  
sky          cloudy  
winddir     unknown  

In [35]:
class Player:
    left_field, center_field, right_field, min_wall_height, max_wall_height, attendance, temp, windspeed = (-1,) * 8
    daynight, precip, sky, winddir = ('',)*4
    def __init__(self, name = 'unknown', lf = 331.833, cf = 404.167, rf = 328.333, min_wh = 7.553, max_wh = 14.27, att = 29356,
                 t = 72.414, ws = 6.467, dn = 'night', pp = 'none', s = 'cloudy', wd = 'unknown'):
        self.name = name
        self.left_field = lf
        self.center_field = cf
        self.right_field = rf
        self.min_wall_height= min_wh
        self.max_wall_height = max_wh
        self.attendance = att
        self.temp = t 
        self.windspeed = ws
        self.daynight = dn 
        self.precip = pp
        self.sky = s
        self.winddir = wd
        self.stat = -1

    def get_name(self):
        return self.name

    #one-hot-encode helper method
    def one_hot_encode(self, value, categories):
        return [1 if value == category else 0 for category in categories]

    def __iter__(self):
        # Base numeric features
        features = [
            self.left_field, 
            self.center_field, 
            self.right_field, 
            self.min_wall_height, 
            self.max_wall_height, 
            self.attendance, 
            self.temp, 
            self.windspeed,
            self.stat #for batter or pitcher
        ]

        # One-hot encode categorical variables
        features += self.one_hot_encode(self.daynight, ['day', 'night'])
        features += self.one_hot_encode(self.precip, ['drizzle', 'none', 'rain', 'snow'])
        features += self.one_hot_encode(self.sky, ['cloudy', 'dome', 'overcast', 'sunny'])
        features += self.one_hot_encode(self.winddir, [
            'fromcf', 'fromlf', 'fromrf', 'ltor', 'rtol', 'tocf', 'tolf', 'torf', 'unknown'
        ])

        return iter(features)

In [36]:
class Pitcher(Player):
    def __init__(self, era=5.870, **kwargs):
        super().__init__(**kwargs)  # Pass all parent arguments to Player
        self.stat = era  # Set ERA as the stat


class Batter(Player):
    def __init__(self, batting_avg=0.226, **kwargs):
        super().__init__(**kwargs)  # Pass all parent arguments to Player
        self.stat = batting_avg  # Set batting average as the stat


In [37]:
bob = Pitcher()
print(list(bob))

[331.833, 404.167, 328.333, 7.553, 14.27, 29356, 72.414, 6.467, 5.87, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [45]:
bob_data = np.array(list(bob)).reshape(1,-1)
pred_class = pitching_model.predict(bob_data)
pred_prob = pitching_model.predict_proba(bob_data)
# ['left_field', 'center_field', 'right_field','min_wall_height','max_wall_height',
                                    # 'attendance','temp','windspeed','season_era', 'daynight_day', 'daynight_night', 'precip_drizzle', 'precip_none', 'precip_rain', 
                                    # 'precip_snow', 'sky_cloudy', 'sky_dome', 'sky_overcast', 'sky_sunny', 'winddir_fromcf', 'winddir_fromlf', 'winddir_fromrf', 'winddir_ltor', 
                                    # 'winddir_rtol', 'winddir_tocf', 'winddir_tolf', 'winddir_torf', 'winddir_unknown']

In [22]:
print(pred_class)
print(pred_prob)

[[3 0 0 0 0 0 0]]
[array([[1.91315578e-04, 1.75529938e-02, 6.63668291e-02, 8.09934357e-01,
        2.28810021e-02, 3.84205536e-03, 1.22857993e-02, 7.97109313e-04,
        3.46737133e-05, 2.47956668e-05, 3.91959189e-04, 8.35930315e-03,
        8.11778038e-04, 5.74990890e-05, 3.09591162e-04, 2.16819438e-02,
        9.51838278e-04, 9.42886546e-05, 3.12501411e-02, 3.11182747e-04,
        3.11280217e-04, 9.84340198e-04, 1.06834366e-05, 1.09768559e-05,
        2.58925467e-04, 2.61371783e-04, 6.28165901e-06, 2.56838662e-05]]), array([[3.69829358e-01, 3.23964340e-01, 9.72784186e-02, 6.25816193e-02,
        3.49610802e-02, 3.85918238e-02, 6.36890215e-02, 7.50601094e-03,
        8.86831551e-04, 6.81525450e-04, 2.73384166e-06, 8.23928257e-07,
        7.13562224e-08, 1.08781657e-07, 2.62329712e-05]]), array([[7.85999334e-01, 1.08562807e-01, 4.59237353e-02, 4.29812841e-02,
        8.99256518e-03, 2.03155446e-03, 5.33486817e-03, 1.72039565e-04,
        1.19593730e-06, 2.87433097e-07, 1.63153653e-08,