**Predicting the MLB's Most Valuable Player!**

In [1]:
import pybaseball as pyb
import pandas as pd

In [2]:
# Season window used for every FanGraphs pull in this notebook.
# Both bounds are inclusive, so 2000-2024 spans 25 seasons of MVP races.
start_season, end_season = 2000, 2024

In [3]:
# Announce the request before it starts, then pull one row per player-season
# from FanGraphs for every season in the configured window.
print(f"Fetching batting stats from FanGraphs for seasons {start_season}-{end_season}...")
batting_stats = pyb.batting_stats(start_season, end_season)

# Only report success once the result has been checked: an empty frame would
# otherwise flow silently into the CSV export and everything built on it.
assert not batting_stats.empty, (
    f"FanGraphs returned no batting rows for seasons {start_season}-{end_season}"
)
print("Batting stats fetched successfully!")

Fetching batting stats from FanGraphs for seasons 2000-2024...
Batting stats fetched successfully!


**Let's take a look at some sample batting statistics.**

In [4]:
# Preview the first rows and report the frame's dimensions in a single print call;
# sep="\n" puts each piece on its own line, exactly as three separate prints would.
print(
    "\nSample Batting Stats:",
    batting_stats.head(),
    f"\nShape of batting_stats: {batting_stats.shape}",
    sep="\n",
)


Sample Batting Stats:
     IDfg  Season         Name Team  Age    G   AB   PA    H  1B  ...  maxEV  \
0    1109    2002  Barry Bonds  SFG   37  143  403  612  149  70  ...    NaN   
2    1109    2001  Barry Bonds  SFG   36  153  476  664  156  49  ...    NaN   
1    1109    2004  Barry Bonds  SFG   39  147  373  617  135  60  ...    NaN   
7   15640    2024  Aaron Judge  NYY   32  158  559  704  180  85  ...  117.5   
18  15640    2022  Aaron Judge  NYY   30  157  570  696  177  87  ...  118.4   

    HardHit  HardHit%  Events  CStr%   CSW%  xBA  xSLG  xwOBA  L-WAR  
0       NaN       NaN       0  0.127  0.191  NaN   NaN    NaN   12.7  
2       NaN       NaN       0    NaN    NaN  NaN   NaN    NaN   12.5  
1       NaN       NaN       0  0.124  0.164  NaN   NaN    NaN   11.9  
7     238.0     0.609     391  0.146  0.267  NaN   NaN    NaN   11.6  
18    246.0     0.609     404  0.169  0.287  NaN   NaN    NaN   11.4  

[5 rows x 320 columns]

Shape of batting_stats: (3682, 320)


**We can't forget the pitchers, either!**

In [5]:
# Announce the request before it starts, then pull one row per pitcher-season
# from FanGraphs for the same season window used for the batting data.
print(f"\nFetching pitching stats from FanGraphs for seasons {start_season}-{end_season}...")
pitching_stats = pyb.pitching_stats(start_season, end_season)

# Only report success once the result has been checked: an empty frame would
# otherwise flow silently into the CSV export and everything built on it.
assert not pitching_stats.empty, (
    f"FanGraphs returned no pitching rows for seasons {start_season}-{end_season}"
)
print("Pitching stats fetched successfully!")


Fetching pitching stats from FanGraphs for seasons 2000-2024...
Pitching stats fetched successfully!


**Let's take a look at some sample pitching statistics.**

In [6]:
# Preview the first rows and report the frame's dimensions in a single print call;
# sep="\n" puts each piece on its own line, exactly as three separate prints would.
print(
    "\nSample Pitching Stats:",
    pitching_stats.head(),
    f"\nShape of pitching_stats: {pitching_stats.shape}",
    sep="\n",
)



Sample Pitching Stats:
     IDfg  Season            Name Team  Age   W   L   WAR   ERA   G  ...  \
70     60    2001   Randy Johnson  ARI   37  21   6  10.4  2.49  35  ...   
104    60    2000   Randy Johnson  ARI   36  19   7   9.6  2.64  35  ...   
95     60    2004   Randy Johnson  ARI   40  16  14   9.6  2.60  35  ...   
4     200    2000  Pedro Martinez  BOS   28  18   6   9.4  1.74  29  ...   
384    73    2002  Curt Schilling  ARI   35  23   7   9.3  3.23  36  ...   

     Pit+ FC  Stf+ FS  Loc+ FS  Pit+ FS  Stuff+  Location+  Pitching+  \
70       NaN      NaN      NaN      NaN     NaN        NaN        NaN   
104      NaN      NaN      NaN      NaN     NaN        NaN        NaN   
95       NaN      NaN      NaN      NaN     NaN        NaN        NaN   
4        NaN      NaN      NaN      NaN     NaN        NaN        NaN   
384      NaN      NaN      NaN      NaN     NaN        NaN        NaN   

     Stf+ FO  Loc+ FO  Pit+ FO  
70       NaN      NaN      NaN  
104      NaN  

**Now I can save both the hitting and pitching stats as CSVs, so we do not have to scrape them every time.**

In [7]:
# Write each stats frame to its own CSV in the working directory.
# index=False leaves the frame's row index out of the file.
for stats_frame, csv_path in (
    (batting_stats, 'fangraphs_batting_stats.csv'),
    (pitching_stats, 'fangraphs_pitching_stats.csv'),
):
    stats_frame.to_csv(csv_path, index=False)
print("\nData saved to CSV files.")


Data saved to CSV files.


**Now, I will input all of the MVP winners and their information. Let's start with all of the AL MVPs!**

In [8]:
# American League MVP winners, 2000-2024: one (season, player, club) entry per year.
# The league tag is identical for every row, so it is added once while the final
# (Season, Name, League, Team) tuples are built.
# NOTE(review): the 2004 entry uses 'LAA'; the club presumably appears under a
# different code (likely 'ANA') in 2004 stats tables - confirm before joining on team.
al_mvps = [
    (season, player, 'AL', club)
    for season, player, club in (
        (2000, 'Jason Giambi', 'OAK'),
        (2001, 'Ichiro Suzuki', 'SEA'),
        (2002, 'Miguel Tejada', 'OAK'),
        (2003, 'Alex Rodriguez', 'TEX'),
        (2004, 'Vladimir Guerrero', 'LAA'),
        (2005, 'Alex Rodriguez', 'NYY'),
        (2006, 'Justin Morneau', 'MIN'),
        (2007, 'Alex Rodriguez', 'NYY'),
        (2008, 'Dustin Pedroia', 'BOS'),
        (2009, 'Joe Mauer', 'MIN'),
        (2010, 'Josh Hamilton', 'TEX'),
        (2011, 'Justin Verlander', 'DET'),
        (2012, 'Miguel Cabrera', 'DET'),
        (2013, 'Miguel Cabrera', 'DET'),
        (2014, 'Mike Trout', 'LAA'),
        (2015, 'Josh Donaldson', 'TOR'),
        (2016, 'Mike Trout', 'LAA'),
        (2017, 'Jose Altuve', 'HOU'),
        (2018, 'Mookie Betts', 'BOS'),
        (2019, 'Mike Trout', 'LAA'),
        (2020, 'José Abreu', 'CHW'),
        (2021, 'Shohei Ohtani', 'LAA'),
        (2022, 'Aaron Judge', 'NYY'),
        (2023, 'Shohei Ohtani', 'LAA'),
        (2024, 'Aaron Judge', 'NYY'),
    )
]

# Start the combined winners list as an independent copy of the AL rows.
mvp_data = list(al_mvps)

**Now for the NL MVPs!**

In [9]:
# National League MVP winners, one (season, name, league, team) row per year.
# Team codes follow the FanGraphs `Team` column used by the stats tables in this
# notebook (e.g. 'SFG', 'WSN') so both sources spell each club the same way.
nl_mvps = [
    (2000, 'Jeff Kent', 'NL', 'SFG'),
    (2001, 'Barry Bonds', 'NL', 'SFG'),
    (2002, 'Barry Bonds', 'NL', 'SFG'),
    (2003, 'Barry Bonds', 'NL', 'SFG'),
    (2004, 'Barry Bonds', 'NL', 'SFG'),
    (2005, 'Albert Pujols', 'NL', 'STL'),
    (2006, 'Ryan Howard', 'NL', 'PHI'),
    (2007, 'Jimmy Rollins', 'NL', 'PHI'),
    (2008, 'Albert Pujols', 'NL', 'STL'),
    (2009, 'Albert Pujols', 'NL', 'STL'),
    (2010, 'Joey Votto', 'NL', 'CIN'),
    (2011, 'Ryan Braun', 'NL', 'MIL'),
    (2012, 'Buster Posey', 'NL', 'SFG'),
    (2013, 'Andrew McCutchen', 'NL', 'PIT'),
    (2014, 'Clayton Kershaw', 'NL', 'LAD'),
    (2015, 'Bryce Harper', 'NL', 'WSN'),
    (2016, 'Kris Bryant', 'NL', 'CHC'),
    (2017, 'Giancarlo Stanton', 'NL', 'MIA'),
    (2018, 'Christian Yelich', 'NL', 'MIL'),
    (2019, 'Cody Bellinger', 'NL', 'LAD'),
    (2020, 'Freddie Freeman', 'NL', 'ATL'),
    (2021, 'Bryce Harper', 'NL', 'PHI'),
    (2022, 'Paul Goldschmidt', 'NL', 'STL'),
    (2023, 'Ronald Acuña Jr.', 'NL', 'ATL'),
    (2024, 'Shohei Ohtani', 'NL', 'LAD'),
]

# Rebuild rather than blindly extend: drop any NL rows left behind by an earlier
# run of this cell, then append the current list. Re-running the cell therefore
# never duplicates winners, and the slice assignment updates the existing list
# object in place.
mvp_data[:] = [row for row in mvp_data if row[2] != 'NL'] + nl_mvps


**Let's bring them together into one dataframe.**

In [10]:
# Assemble the combined AL + NL winner rows into one labelled table.
mvp_df = pd.DataFrame(mvp_data, columns=['Season', 'Name', 'League', 'Team'])

# Each league has exactly one winner per season in the lists above, so a repeated
# (Season, League) pair means a list cell was run more than once and appended its
# rows again. Fail loudly here instead of saving a table with duplicated winners.
assert not mvp_df.duplicated(['Season', 'League']).any(), (
    "Duplicate (Season, League) rows in mvp_data - re-run the AL and NL list cells "
    "once each, in order, then rebuild mvp_df."
)

print(mvp_df.head())
print(mvp_df.tail())
print(f"\nTotal MVP winners recorded: {len(mvp_df)}")


   Season               Name League Team
0    2000       Jason Giambi     AL  OAK
1    2001      Ichiro Suzuki     AL  SEA
2    2002      Miguel Tejada     AL  OAK
3    2003     Alex Rodriguez     AL  TEX
4    2004  Vladimir Guerrero     AL  LAA
    Season              Name League Team
45    2020   Freddie Freeman     NL  ATL
46    2021      Bryce Harper     NL  PHI
47    2022  Paul Goldschmidt     NL  STL
48    2023  Ronald Acuña Jr.     NL  ATL
49    2024     Shohei Ohtani     NL  LAD

Total MVP winners recorded: 50


**Once again, I will save this as a CSV.**

In [11]:
# Persist the winners table next to the stats CSVs.
# index=False leaves the frame's row index out of the file.
mvp_df.to_csv(path_or_buf='mvp_winners_2000_2024.csv', index=False)
print("\nMVP winners data saved to 'mvp_winners_2000_2024.csv'")


MVP winners data saved to 'mvp_winners_2000_2024.csv'
