# Statcast Data

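Downloads the full, raw Statcast data for each of the specified years and saves it as one CSV file per calendar month. Months whose file already exists are skipped, so the notebook can be re-run to fill in months that failed.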

In [2]:
from pybaseball import statcast
import pandas as pd
from datetime import datetime, timedelta
import os
from pathlib import Path


# Seasons to download; each listed year is fetched month by month.
YEARS = [2022, 2023, 2024]

In [3]:
# Download the raw Statcast events for every year in YEARS, one CSV per calendar month.
# A month whose CSV already exists is skipped, so re-running this cell only retries
# months that previously failed or returned no data.

# Root of the data tree (relative path; defined once because it does not depend on the year).
data_dir = Path("../../data")

for YEAR in YEARS:
    start_date = datetime(YEAR, 1, 1)
    end_date = datetime(YEAR, 12, 31)

    # Setup directories
    output_dir = data_dir / "historical/statcast" / str(YEAR)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Track errors (reset for each year and reported after that year's months are done)
    errors = []
    current_date = start_date

    # Inclusive bound: a final chunk that starts exactly on end_date is still processed.
    while current_date <= end_date:
        # Calculate end of current month
        if current_date.month == 12:
            month_end = datetime(current_date.year, 12, 31)
        else:
            # First day of the next month minus one day = last day of this month.
            month_end = datetime(current_date.year, current_date.month + 1, 1) - timedelta(days=1)

        # Never run past the requested range.
        month_end = min(month_end, end_date)
        month_label = current_date.strftime('%Y-%m')

        # Define output file for this month
        month_file = output_dir / f"statcast_{current_date.strftime('%Y_%m')}.csv"
        # Scratch path used while writing. The final name only appears once the CSV is
        # complete, so an interrupted write can never be mistaken for a finished month
        # by the "already exists" check below.
        partial_file = month_file.with_name(month_file.name + ".part")

        # Skip if month file already exists
        if month_file.exists():
            print(f"Skipping {month_label}, file already exists")
            current_date = month_end + timedelta(days=1)
            continue

        print(f"Fetching data from {current_date.strftime('%Y-%m-%d')} to {month_end.strftime('%Y-%m-%d')}")

        try:
            # Get data for current month
            data = statcast(start_dt=current_date.strftime('%Y-%m-%d'),
                            end_dt=month_end.strftime('%Y-%m-%d'))

            if data is not None and not data.empty:
                # Save month's data: write to the scratch file first, then move it into
                # place in a single step.
                data.to_csv(partial_file, index=False)
                partial_file.replace(month_file)
                print(f"Saved {len(data)} statcast events for {month_label}")
            else:
                # No file is written for an empty month, so it is retried on the next run.
                errors.append(f"No data returned for month of {month_label}")

        except Exception as e:
            # Best effort: record the failure and carry on with the next month.
            # Remove any half-written scratch file so it does not linger on disk.
            if partial_file.exists():
                partial_file.unlink()
            errors.append(f"Error fetching data for month of {month_label}: {str(e)}")

        # Move to next month
        current_date = month_end + timedelta(days=1)

    print(f"\nCompleted saving monthly data to {output_dir}")
    if errors:
        print("\nErrors encountered:")
        for error in errors:
            print(error)


Fetching data from 2022-01-01 to 2022-01-31
This is a large query, it may take a moment to complete


Skipping offseason dates


0it [00:00, ?it/s]

Fetching data from 2022-02-01 to 2022-02-28
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Skipping 2022-03, file already exists
Skipping 2022-04, file already exists
Skipping 2022-05, file already exists
Skipping 2022-06, file already exists
Skipping 2022-07, file already exists
Skipping 2022-08, file already exists
Skipping 2022-09, file already exists
Skipping 2022-10, file already exists
Skipping 2022-11, file already exists
Fetching data from 2022-12-01 to 2022-12-31
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]


Completed saving monthly data to ../../data/historical/statcast/2022

Errors encountered:
No data returned for month of 2022-01
No data returned for month of 2022-02
No data returned for month of 2022-12
Fetching data from 2023-01-01 to 2023-01-31
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Fetching data from 2023-02-01 to 2023-02-28
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Skipping 2023-03, file already exists
Skipping 2023-04, file already exists
Skipping 2023-05, file already exists
Skipping 2023-06, file already exists
Skipping 2023-07, file already exists
Skipping 2023-08, file already exists
Skipping 2023-09, file already exists
Skipping 2023-10, file already exists
Skipping 2023-11, file already exists
Fetching data from 2023-12-01 to 2023-12-31
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]


Completed saving monthly data to ../../data/historical/statcast/2023

Errors encountered:
No data returned for month of 2023-01
No data returned for month of 2023-02
No data returned for month of 2023-12
Fetching data from 2024-01-01 to 2024-01-31
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Fetching data from 2024-02-01 to 2024-02-29
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]

Skipping 2024-03, file already exists
Skipping 2024-04, file already exists





Fetching data from 2024-05-01 to 2024-05-31
This is a large query, it may take a moment to complete


  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_cop

Skipping 2024-06, file already exists
Skipping 2024-07, file already exists
Skipping 2024-08, file already exists
Skipping 2024-09, file already exists
Skipping 2024-10, file already exists
Fetching data from 2024-11-01 to 2024-11-30
This is a large query, it may take a moment to complete





Skipping offseason dates


100%|██████████| 15/15 [00:00<00:00, 18.24it/s]

Fetching data from 2024-12-01 to 2024-12-31
This is a large query, it may take a moment to complete





Skipping offseason dates


0it [00:00, ?it/s]


Completed saving monthly data to ../../data/historical/statcast/2024

Errors encountered:
No data returned for month of 2024-01
No data returned for month of 2024-02
Error fetching data for month of 2024-05: HTTPSConnectionPool(host='baseballsavant.mlb.com', port=443): Max retries exceeded with url: /statcast_search/csv?all=true&hfPT=&hfAB=&hfBBT=&hfPR=&hfZ=&stadium=&hfBBL=&hfNewZones=&hfGT=R%7CPO%7CS%7C=&hfSea=&hfSit=&player_type=pitcher&hfOuts=&opponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=2024-05-08&game_date_lt=2024-05-08&team=&position=&hfRO=&home_road=&hfFlag=&metric_1=&hfInn=&min_pitches=0&min_results=0&group_by=name&sort_col=pitches&player_event_sort=h_launch_speed&sort_order=desc&min_abs=0&type=details& (Caused by SSLError(SSLError(1, '[SSL: TLSV1_ALERT_DECODE_ERROR] tlsv1 alert decode error (_ssl.c:1028)')))
No data returned for month of 2024-11
No data returned for month of 2024-12



