# Load pitch data using the pybaseball package
- The pybaseball package scrapes data from [baseball savant](https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2021%7C2020%7C2019%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc#results) to get pitch level data
- https://github.com/jldbc/pybaseball


In [82]:
import pandas as pd
import tqdm # for tracking status of loops
from pybaseball import statcast, pitching_stats, pitching_stats_range


In [34]:
# get season level data for each pitcher and save it to a parquet file
df = pitching_stats(2010,2025)
df.to_parquet('by_season_data.gzip', compression='gzip')

In [40]:
pd.read_parquet('by_season_data.gzip').columns

Index(['IDfg', 'Season', 'Name', 'Team', 'Age', 'W', 'L', 'WAR', 'ERA', 'G',
       ...
       'Pit+ FC', 'Stf+ FS', 'Loc+ FS', 'Pit+ FS', 'Stuff+', 'Location+',
       'Pitching+', 'Stf+ FO', 'Loc+ FO', 'Pit+ FO'],
      dtype='object', length=393)

In [46]:
# Create an empty dictionary to fill in with monthly data pulled from pitching_stats_range
data_dict = {}

In [83]:
# we could also go month by month for 14 years
date_range = (
    pd.Series(pd.date_range("2010-01-01", periods=12 * 14, freq="MS", ))
    .dt.strftime('%Y-%m-%d')
    .to_list()
)
date_range = list(zip(date_range[:-1], date_range[1:]))

# Use tqdm to monitor status of loopz
for start_date, end_date in tqdm.tqdm(date_range):
    if start_date in data_dict:
        continue
    try:
        data_dict[start_date] = pitching_stats_range(start_dt=start_date, end_dt=end_date)
    except IndexError:
        # If the month has no data, an index error will be raised
        # This can simply be ignore and move onto the next month.
        # A better solution than a try except clause would be to 
        # only use months that have baseball games in them
        continue

# TO DO: convert dictionary to dataframe, then to_parquet file

 56%|█████▋    | 94/167 [09:52<07:17,  6.00s/it]