In [None]:
import pandas as pd
import requests
import time

In [None]:
def scrape_batting_data(start_year, end_year, num_of_players, delay=5, timeout=30):
    '''
    Webscrape MLB.com for batting statistics

    One request is sent per season; the ``stats`` records of every
    successful response are concatenated into a single list. A season whose
    request or payload parsing fails is reported and skipped (best effort),
    so the result may be missing seasons. Skipped seasons are listed in the
    final summary line so gaps are not overlooked.

    Parameters:
    -----------
    start_year : int
        First year to scrape (inclusive)
    end_year : int
        Last year to scrape (inclusive)
    num_of_players : int
        Maximum number of players to retrieve per year
    delay : int or float, optional
        Seconds to sleep before each request (rate limiting). Default 5.
    timeout : int or float, optional
        Value passed to ``requests.get`` as ``timeout``. Default 30.

    Returns:
    --------
    list
        List of player batting statistics dictionaries
    '''
    batting_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season={}&sportId=1&stats=season&group=hitting&gameType=R&limit={}&offset=0&sortStat=atBats&order=desc&playerPool=ALL'
    data = []
    failed_years = []

    for year in range(start_year, end_year + 1):
        print(f"Scraping batting data for {year}...")
        time.sleep(delay)  # Rate limiting

        try:
            # A timeout is required: without one, requests waits forever on a
            # stalled connection and the whole notebook run hangs.
            response = requests.get(
                batting_url.format(year, num_of_players),
                timeout=timeout,
            )
            response.raise_for_status()
            new_data = response.json()['stats']
            data.extend(new_data)
        except (requests.RequestException, KeyError, TypeError, ValueError) as e:
            # Network/HTTP errors, invalid JSON, or an unexpected payload
            # shape: skip this season but remember it for the summary.
            print(f"  Error for {year}: {e}")
            failed_years.append(year)
            continue

        print(f"  Retrieved {len(new_data)} players")

    print(f"\nTotal batting records: {len(data)}")
    if failed_years:
        print(f"Years skipped due to errors: {failed_years}")
    return data

In [None]:
# Fetch batting records for the 2009-2025 seasons.
# 2009 is fetched so that prior-year stats are available for 2010.
data = scrape_batting_data(start_year=2009, end_year=2025, num_of_players=4000)

In [None]:
# Build the batting table from the scraped records and report its size
batting_df = pd.DataFrame(data)
print(f"Batting DataFrame shape: {batting_df.shape}")

# Record count for each value of the 'year' column, ordered by year
print("\nYears covered:")
batting_year_counts = batting_df['year'].value_counts().sort_index()
print(batting_year_counts)

batting_df.head()

In [None]:
# Write the batting table to CSV without the row index
batting_csv_path = "MLB_batting_stats_2009_2025.csv"
batting_df.to_csv(batting_csv_path, index=False)
print(f"Saved to {batting_csv_path}")

In [None]:
def scrape_pitching_data(start_year, end_year, num_of_players, delay=5, timeout=30):
    '''
    Webscrape MLB.com for pitching statistics

    One request is sent per season; the ``stats`` records of every
    successful response are concatenated into a single list. A season whose
    request or payload parsing fails is reported and skipped (best effort),
    so the result may be missing seasons. Skipped seasons are listed in the
    final summary line so gaps are not overlooked.

    Parameters:
    -----------
    start_year : int
        First year to scrape (inclusive)
    end_year : int
        Last year to scrape (inclusive)
    num_of_players : int
        Maximum number of players to retrieve per year
    delay : int or float, optional
        Seconds to sleep before each request (rate limiting). Default 5.
    timeout : int or float, optional
        Value passed to ``requests.get`` as ``timeout``. Default 30.

    Returns:
    --------
    list
        List of player pitching statistics dictionaries
    '''
    pitching_url = 'https://bdfed.stitch.mlbinfra.com/bdfed/stats/player?stitch_env=prod&season={}&sportId=1&stats=season&group=pitching&gameType=R&limit={}&offset=0&sortStat=earnedRunAverage&order=asc&playerPool=ALL'
    data = []
    failed_years = []

    for year in range(start_year, end_year + 1):
        print(f"Scraping pitching data for {year}...")
        time.sleep(delay)  # Rate limiting

        try:
            # A timeout is required: without one, requests waits forever on a
            # stalled connection and the whole notebook run hangs.
            response = requests.get(
                pitching_url.format(year, num_of_players),
                timeout=timeout,
            )
            response.raise_for_status()
            new_data = response.json()["stats"]
            data.extend(new_data)
        except (requests.RequestException, KeyError, TypeError, ValueError) as e:
            # Network/HTTP errors, invalid JSON, or an unexpected payload
            # shape: skip this season but remember it for the summary.
            print(f"  Error for {year}: {e}")
            failed_years.append(year)
            continue

        print(f"  Retrieved {len(new_data)} players")

    print(f"\nTotal pitching records: {len(data)}")
    if failed_years:
        print(f"Years skipped due to errors: {failed_years}")
    return data

In [None]:
# Fetch pitching records for the 2009-2025 seasons
data2 = scrape_pitching_data(start_year=2009, end_year=2025, num_of_players=3000)

In [None]:
# Build the pitching table from the scraped records and report its size
pitching_df = pd.DataFrame(data2)
print(f"Pitching DataFrame shape: {pitching_df.shape}")

# Record count for each value of the 'year' column, ordered by year
print("\nYears covered:")
pitching_year_counts = pitching_df['year'].value_counts().sort_index()
print(pitching_year_counts)

pitching_df.head()

In [None]:
# Write the pitching table to CSV without the row index
pitching_csv_path = "MLB_pitching_stats_2009_2025.csv"
pitching_df.to_csv(pitching_csv_path, index=False)
print(f"Saved to {pitching_csv_path}")

## Summary

Data scraped:
- **Batting stats**: 2009-2025 (17 years)
- **Pitching stats**: 2009-2025 (17 years)

Note: 2009 is included so that prior-year stats are available for 2010 season predictions.