# Load pitch data using the pybaseball package
- The pybaseball package scrapes data from [Baseball Savant](https://baseballsavant.mlb.com/statcast_search?hfPT=&hfAB=&hfGT=R%7C&hfPR=&hfZ=&hfStadium=&hfBBL=&hfNewZones=&hfPull=&hfC=&hfSea=2021%7C2020%7C2019%7C&hfSit=&player_type=pitcher&hfOuts=&hfOpponent=&pitcher_throws=&batter_stands=&hfSA=&game_date_gt=&game_date_lt=&hfMo=&hfTeam=&home_road=&hfRO=&position=&hfInfield=&hfOutfield=&hfInn=&hfBBT=&hfFlag=&metric_1=&group_by=name&min_pitches=0&min_results=0&min_pas=0&sort_col=pitches&player_event_sort=api_p_release_speed&sort_order=desc#results) to get pitch-level data
- https://github.com/jldbc/pybaseball



In [4]:
import pandas as pd
import tqdm # for tracking status of loops
from pybaseball import statcast, pitching_stats, pitching_stats_range

import time


In [9]:
# Download pitching stats one calendar month at a time and write one parquet
# file per season. Requesting every year at once errors out, so each season
# (2010 through 2024, 15 seasons) is fetched in month-sized requests.
years = range(2010, 2025)
pause_seconds = 30  # pause between seasons to go easy on the data source

# `tqdm` is imported as a module at the top of the notebook, but a later cell
# runs `from tqdm import tqdm`, which rebinds the name to the progress-bar
# class. Resolve whichever one is currently bound so this cell can be re-run.
progress_bar = getattr(tqdm, "tqdm", tqdm)

for year in years:

    data_dict = {}

    # One (first day, last day) window per month, March through September.
    # The window ends on the month's LAST day rather than on the first day of
    # the following month, so consecutive windows never share a date and games
    # played on the 1st are not counted in two months.
    # NOTE(review): this assumes pitching_stats_range treats end_dt as
    # inclusive -- confirm against the pybaseball documentation.
    # Games played in October are not covered by these windows.
    date_range = [
        (
            month.start_time.strftime("%Y-%m-%d"),
            month.end_time.strftime("%Y-%m-%d"),
        )
        for month in pd.period_range(f"{year}-03", periods=7, freq="M")
    ]

    # Use tqdm to monitor status of loop
    for start_date, end_date in progress_bar(date_range):
        try:
            # Get the month-long data frame from the API
            month_df = pitching_stats_range(start_dt=start_date, end_dt=end_date)
        except IndexError:
            # If the month has no data, an IndexError is raised.
            # It can simply be ignored before moving on to the next month.
            # A better solution than try/except would be to request only
            # months that have baseball games in them.
            print(f"Error on month: {start_date}")
            continue

        # Tag every row with the first day of the month it belongs to
        month_df["month"] = start_date
        data_dict[start_date] = month_df

    if data_dict:
        # Stack the monthly frames and persist the season. The codec is set
        # explicitly so the contents match the ".gzip" suffix (pandas would
        # otherwise fall back to its default codec).
        df = pd.concat(data_dict.values())
        df.to_parquet(f"{year}_data.parquet.gzip", compression="gzip")
        status = "complete"
    else:
        # pd.concat raises ValueError on an empty collection, so a season in
        # which every month failed is skipped instead of crashing the run.
        status = "had no data, nothing written"

    if year == years[-1]:
        # Nothing left to download, so there is no reason to wait.
        print(f"{year=} {status}.")
    else:
        print(f"{year=} {status}. Sleeping for {pause_seconds} seconds")
        time.sleep(pause_seconds)

 14%|█▍        | 1/7 [00:00<00:00,  6.35it/s]

Error on month: 2010-03-01


100%|██████████| 7/7 [00:36<00:00,  5.24s/it]


year=2010 complete. Sleeping for 30 seconds


100%|██████████| 7/7 [00:36<00:00,  5.24s/it]


year=2011 complete. Sleeping for 30 seconds


100%|██████████| 7/7 [00:40<00:00,  5.85s/it]


year=2012 complete. Sleeping for 30 seconds


100%|██████████| 7/7 [00:39<00:00,  5.70s/it]


year=2013 complete. Sleeping for 30 seconds


100%|██████████| 7/7 [00:39<00:00,  5.63s/it]


year=2014 complete. Sleeping for 30 seconds


 14%|█▍        | 1/7 [00:00<00:00,  6.21it/s]

Error on month: 2015-03-01


100%|██████████| 7/7 [00:36<00:00,  5.24s/it]


year=2015 complete. Sleeping for 30 seconds


 14%|█▍        | 1/7 [00:00<00:01,  3.05it/s]

Error on month: 2016-03-01


100%|██████████| 7/7 [00:36<00:00,  5.24s/it]


year=2016 complete. Sleeping for 30 seconds


 14%|█▍        | 1/7 [00:00<00:01,  4.43it/s]

Error on month: 2017-03-01


100%|██████████| 7/7 [00:36<00:00,  5.24s/it]


year=2017 complete. Sleeping for 30 seconds


100%|██████████| 7/7 [00:36<00:00,  5.24s/it]


year=2018 complete. Sleeping for 30 seconds


100%|██████████| 7/7 [00:36<00:00,  5.24s/it]


year=2019 complete. Sleeping for 30 seconds


 14%|█▍        | 1/7 [00:00<00:01,  3.11it/s]

Error on month: 2020-03-01


 29%|██▊       | 2/7 [00:06<00:18,  3.60s/it]

Error on month: 2020-04-01


 43%|████▎     | 3/7 [00:12<00:18,  4.70s/it]

Error on month: 2020-05-01


 57%|█████▋    | 4/7 [00:18<00:15,  5.24s/it]

Error on month: 2020-06-01


100%|██████████| 7/7 [00:36<00:00,  5.23s/it]


year=2020 complete. Sleeping for 30 seconds


100%|██████████| 7/7 [00:36<00:00,  5.23s/it]


year=2021 complete. Sleeping for 30 seconds


 14%|█▍        | 1/7 [00:00<00:01,  3.34it/s]

Error on month: 2022-03-01


100%|██████████| 7/7 [00:36<00:00,  5.23s/it]


year=2022 complete. Sleeping for 30 seconds


100%|██████████| 7/7 [00:36<00:00,  5.23s/it]


year=2023 complete. Sleeping for 30 seconds


 57%|█████▋    | 4/7 [00:18<00:14,  4.81s/it]

Error on month: 2024-06-01


 71%|███████▏  | 5/7 [00:24<00:10,  5.25s/it]

Error on month: 2024-07-01


 86%|████████▌ | 6/7 [00:30<00:05,  5.49s/it]

Error on month: 2024-08-01


100%|██████████| 7/7 [00:36<00:00,  5.17s/it]

Error on month: 2024-09-01
year=2024 complete. Sleeping for 30 seconds





KeyboardInterrupt: 

In [18]:
from glob import glob # this will allow us to grab a bunch of files with the same file name
from tqdm import tqdm

# Collect every per-year parquet file in the current directory.
# glob() returns matches in arbitrary order, so sort them to make the row
# order of the combined frame reproducible. The file names start with the
# year, so sorted order is also chronological.
file_list = sorted(glob("*_data.parquet.gzip"))

if not file_list:
    # pd.concat would fail with an opaque "No objects to concatenate";
    # say what is actually missing instead.
    raise FileNotFoundError(
        "No files matching '*_data.parquet.gzip' found in the current directory"
    )

# TODO: define what an iterator is in P's own words
# Generator expression: each file is read only when pd.concat asks for the
# next frame, so no intermediate list of DataFrames is built here.
df_iter = (
    pd.read_parquet(filename) for filename in file_list
)

# Stack all seasons into one frame; the original per-file index is kept.
df = pd.concat(df_iter)

In [20]:
# Display the combined DataFrame (a notebook renders a cell's last expression)
df

Unnamed: 0,Name,Age,#days,Lev,Tm,G,GS,W,L,SV,...,StS,GB/FB,LD,PU,WHIP,BAbip,SO9,SO/W,mlbID,month
1,Albert Abreu,26,757,Maj-AL,Texas,6,0,,,,...,0.11,0.50,0.22,0.00,1.957,0.125,9.4,0.73,656061,2022-04-01
2,Bryan Abreu,25,757,Maj-AL,Houston,7,0,,,,...,0.15,0.44,0.28,0.08,1.759,0.458,13.0,2.80,650556,2022-04-01
3,Domingo Acevedo,28,757,Maj-AL,Oakland,10,0,,1.0,,...,0.18,0.41,0.30,0.04,1.552,0.375,10.2,3.67,642758,2022-04-01
4,Jason Adam,30,758,Maj-AL,Tampa Bay,8,0,,1.0,,...,0.18,0.43,0.07,0.14,0.500,0.000,11.3,2.50,592094,2022-04-01
5,Austin Adams,31,774,Maj-NL,San Diego,2,0,1.0,,,...,0.11,1.00,0.00,0.00,1.286,0.000,7.7,0.67,613534,2022-04-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
589,Ryan Yarbrough,26,2064,Maj-AL,Tampa Bay,6,0,4.0,1.0,,...,0.12,0.39,0.31,0.06,1.456,0.328,7.5,2.11,642232,2018-09-01
590,Kirby Yates,31,2064,Maj-NL,San Diego,12,0,1.0,,6.0,...,0.25,0.40,0.10,0.10,0.794,0.211,15.1,4.75,489446,2018-09-01
591,Daniel Zamora,25,2065,Maj-NL,New York,8,0,1.0,,,...,0.14,0.25,0.25,0.25,0.500,0.250,18.0,8.00,623354,2018-09-01
592,Brad Ziegler,38,2065,Maj-NL,Arizona,14,0,1.0,,,...,0.09,0.69,0.23,0.00,1.286,0.265,6.9,1.80,446899,2018-09-01
