In [1]:
import pybaseball as pyb
import pandas as pd
import numpy as np

In [None]:
batting_df = pd.read_csv('fangraphs_batting_stats.csv')
pitching_df = pd.read_csv('fangraphs_pitching_stats.csv')
mvp_df = pd.read_csv('mvp_winners_2000_2024.csv')
# Load the 2025 raw data
batting_df_2025 = pd.read_csv('batting_stats_2025.csv')
pitching_df_2025 = pd.read_csv('pitching_stats_2025.csv')


# Look at column names, data types, missing values
print("\n--- Batting DataFrame Info ---")
batting_df.info()
print("\n--- Pitching DataFrame Info ---")
pitching_df.info()
#print("\n--- Team Standings DataFrame Info ---")
#team_standings_df.info()
print("\n--- MVP Winners DataFrame Info ---")
mvp_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Columns: 320 entries, IDfg to L-WAR
dtypes: float64(262), int64(54), object(4)
memory usage: 360.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51 entries, 0 to 50
Columns: 393 entries, IDfg to Pit+ FO
dtypes: float64(319), int64(70), object(4)
memory usage: 156.7+ KB


In [3]:
print(f"\nDuplicates in batting_df: {batting_df.duplicated().sum()}")
print(f"Duplicates in pitching_df: {pitching_df.duplicated().sum()}")
print(f"Duplicates in team_standings_df: {mvp_df.duplicated().sum()}")



Duplicates in batting_df: 0
Duplicates in pitching_df: 0
Duplicates in team_standings_df: 0


Let's define what our obvious features may be as our "keeper columns".

In [4]:
# --- Define a wider set of "keeper" columns for each dataframe ---

# Kept columns for batters
batting_keepers = [
    # Identifiers & Shared Info
    'IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'WAR',
    # Traditional Stats
    'PA', 'HR', 'R', 'RBI', 'SB', 'BB%', 'K%', 'AVG', 'OBP', 'SLG', 'OPS',
    # Advanced Value
    'wOBA', 'wRC+', 'BABIP',
    # Plate Discipline
    'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'SwStr%',
    # Batted Ball Profile
    'LD%', 'GB%', 'FB%', 'Hard%', 'Soft%', 'Pull%', 'Cent%', 'Oppo%', 'Barrel%'
]

# Kept columns for pitchers
pitching_keepers = [
    # Identifiers & Shared Info
    'IDfg', 'Season', 'Name', 'Team', 'Age', 'G', 'WAR',
    # Traditional Stats
    'W', 'L', 'SV', 'IP', 'BB%', 'K%', 'HR/9', 'ERA', 'WHIP', 'FIP',
    # Advanced Value
    'ERA-', 'FIP-', 'xFIP-', 'BABIP',
    # Plate Discipline (opponent's)
    'O-Swing%', 'Z-Swing%', 'Swing%', 'O-Contact%', 'Z-Contact%', 'Contact%', 'Zone%', 'SwStr%',
    # Batted Ball Profile (opponent's)
    'LD%', 'GB%', 'FB%', 'Hard%', 'Soft%', 'Barrel%'
]

Now we need to merge these DataFrames.

In [5]:
# Create new, smaller dataframes using the keeper lists
# We use .copy() to avoid SettingWithCopyWarning
small_bat_df = batting_df[batting_keepers].copy()
small_pitch_df = pitching_df[pitching_keepers].copy()

small_bat_df_2025 = batting_df_2025[batting_keepers].copy()
small_pitch_df_2025 = pitching_df_2025[pitching_keepers].copy()


# Merge the two SMALL dataframes
merged_df = pd.merge(
    small_bat_df,
    small_pitch_df,
    on=['IDfg', 'Season'],
    how='outer',
    suffixes=('_bat', '_pitch')
)

merged_2025 = pd.merge(small_bat_df_2025, small_pitch_df_2025, on=['IDfg', 'Season'], how='outer', suffixes=('_bat', '_pitch'))



In [6]:
# Consolidate the few truly shared descriptive columns
merged_df['Name'] = merged_df['Name_bat'].fillna(merged_df['Name_pitch'])
merged_df['Team'] = merged_df['Team_bat'].fillna(merged_df['Team_pitch'])
merged_df['Age'] = merged_df['Age_bat'].fillna(merged_df['Age_pitch'])
merged_df['G'] = merged_df['G_bat'].fillna(merged_df['G_pitch'])

# Combine WAR from both sides, filling NaNs with 0 for single-sided players
merged_df['WAR'] = merged_df['WAR_bat'].fillna(0) + merged_df['WAR_pitch'].fillna(0)

# Drop the old, redundant suffixed columns
merged_df.drop(columns=[
    'Name_bat', 'Name_pitch', 'Team_bat', 'Team_pitch',
    'Age_bat', 'Age_pitch', 'G_bat', 'G_pitch', 'WAR_bat', 'WAR_pitch'
], inplace=True)

#Do the same for the Merged 2025 data
# Consolidate the few truly shared descriptive columns
merged_2025['Name'] = merged_2025['Name_bat'].fillna(merged_2025['Name_pitch'])
merged_2025['Team'] = merged_2025['Team_bat'].fillna(merged_2025['Team_pitch'])
merged_2025['Age'] = merged_2025['Age_bat'].fillna(merged_2025['Age_pitch'])
merged_2025['G'] = merged_2025['G_bat'].fillna(merged_2025['G_pitch'])

# Combine WAR from both sides, filling NaNs with 0 for single-sided players
merged_2025['WAR'] = merged_2025['WAR_bat'].fillna(0) + merged_2025['WAR_pitch'].fillna(0)

# Drop the old, redundant suffixed columns
merged_2025.drop(columns=[
    'Name_bat', 'Name_pitch', 'Team_bat', 'Team_pitch',
    'Age_bat', 'Age_pitch', 'G_bat', 'G_pitch', 'WAR_bat', 'WAR_pitch'
], inplace=True)



Now we should have all of the players merged together with a new column for if they won the MVP award or not. 

In [7]:
# Add the MVP flag
mvp_df['MVP'] = True
merged_df = pd.merge(merged_df, mvp_df[['Name', 'Season', 'League', 'MVP']], on=['Name', 'Season'], how='left')
merged_df['MVP'] = merged_df['MVP'].fillna(False)

# --- Assign League to all players based on team ---
al_teams = ['BAL', 'BOS', 'NYY', 'TBR', 'TBD', 'TOR', 'CHW', 'CWS', 'CLE', 'DET', 'KCR', 'MIN', 'LAA', 'OAK', 'SEA', 'TEX']
nl_teams = ['ATL', 'MIA', 'FLA', 'NYM', 'PHI', 'WSN', 'WSH', 'MON', 'CHC', 'CIN', 'MIL', 'PIT', 'STL', 'ARI', 'COL', 'LAD', 'SDP', 'SFG', 'SF']

def assign_league(row):
    if row['Team'] == 'HOU':
        return 'AL' if row['Season'] >= 2013 else 'NL'
    if row['Team'] in al_teams:
        return 'AL'
    elif row['Team'] in nl_teams:
        return 'NL'
    return np.nan

# Fill the League column for non-MVP players
merged_df['League_from_Team'] = merged_df.apply(assign_league, axis=1)
merged_df['League'] = merged_df['League'].fillna(merged_df['League_from_Team'])
merged_df.drop(columns=['League_from_Team'], inplace=True)

merged_2025['League'] = merged_2025.apply(assign_league, axis=1)


# Save the final, processed file
merged_df.to_csv('processed_mvp_data.csv', index=False)
merged_2025.to_csv('processed_2025_data.csv', index=False)

print("Processed data saved to 'processed_mvp_data.csv'")
print(f"Final DataFrame shape: {merged_df.shape}")
print(f"Final DataFrame shape: {merged_2025.shape}")
print(merged_df.head())

  merged_df['MVP'] = merged_df['MVP'].fillna(False)


Processed data saved to 'processed_mvp_data.csv'
Final DataFrame shape: (32960, 68)
Final DataFrame shape: (195, 67)
   IDfg  Season     PA   HR     R   RBI    SB  BB%_bat  K%_bat    AVG  ...  \
0     1    2002   13.0  0.0   3.0   2.0   1.0    0.000   0.077  0.538  ...   
1     1    2003  120.0  2.0  15.0   7.0   2.0    0.075   0.192  0.210  ...   
2     1    2004  105.0  2.0  12.0  11.0   3.0    0.029   0.229  0.161  ...   
3     1    2005    9.0  0.0   3.0   0.0   1.0    0.111   0.000  0.250  ...   
4     1    2006  378.0  3.0  42.0  19.0  20.0    0.087   0.122  0.260  ...   

   Hard%_pitch  Soft%_pitch  Barrel%_pitch             Name   Team   Age  \
0          NaN          NaN            NaN  Alfredo Amezaga    ANA  24.0   
1          NaN          NaN            NaN  Alfredo Amezaga    ANA  25.0   
2          NaN          NaN            NaN  Alfredo Amezaga    ANA  26.0   
3          NaN          NaN            NaN  Alfredo Amezaga  - - -  27.0   
4          NaN          NaN       