# Data Management

This notebook imports general data management packages and loads the four Statcast Parquet files for the 2021–2024 MLB seasons.

In [97]:
# Import general data management packages
import polars as pl
import numpy as np

In [98]:
# Load Statcast Parquet files for 2021–2024
statcast_2021 = pl.read_parquet('data/statcast_2021.parquet')
statcast_2022 = pl.read_parquet('data/statcast_2022.parquet')
statcast_2023 = pl.read_parquet('data/statcast_2023.parquet')
statcast_2024 = pl.read_parquet('data/statcast_2024.parquet')

In [99]:
# Sum the shapes of all 4 tables
total_rows = (statcast_2021.shape[0] + statcast_2022.shape[0] + 
              statcast_2023.shape[0] + statcast_2024.shape[0])
total_cols = statcast_2021.shape[1]

(total_rows, total_cols)

(2853845, 119)

## Finding Variables
### We know they have the exact same columns from last notebooks exploration. Part 1 is gaining knowledge on what variables we need to keep to run analysis

In [100]:
# Display column schema from 2021 data (all years have the same columns)
statcast_2021.schema

Schema([('pitch_type', String),
        ('game_date', Datetime(time_unit='ns', time_zone=None)),
        ('release_speed', Float64),
        ('release_pos_x', Float64),
        ('release_pos_z', Float64),
        ('player_name', String),
        ('batter', Int64),
        ('pitcher', Int64),
        ('events', String),
        ('description', String),
        ('spin_dir', Int64),
        ('spin_rate_deprecated', Int64),
        ('break_angle_deprecated', Int64),
        ('break_length_deprecated', Int64),
        ('zone', Int64),
        ('des', String),
        ('game_type', String),
        ('stand', String),
        ('p_throws', String),
        ('home_team', String),
        ('away_team', String),
        ('type', String),
        ('hit_location', Int64),
        ('bb_type', String),
        ('balls', Int64),
        ('strikes', Int64),
        ('game_year', Int64),
        ('pfx_x', Float64),
        ('pfx_z', Float64),
        ('plate_x', Float64),
        ('plate_z', Float64),
 

In [101]:
# Select columns relevant to relief pitcher rest days and performance analysis
relevant_columns = [
    # Pitcher identification
    'pitcher',
    
    # Date/time for calculating rest days
    'game_date', 'game_pk',
    
    # Pitch characteristics (velocity and spin rate - primary metrics)
    'release_speed', 'release_spin_rate', 'pitch_type', 'pitch_name',
    
    # Pitch outcomes for strike percentage and control metrics
    'description', 'events', 'type', 'zone',
    
    # Count information for control analysis
    'balls', 'strikes', 'outs_when_up',
    
    # Game context
    'inning', 'inning_topbot', 'home_team', 'away_team',
    'at_bat_number', 'pitch_number',
    
    # Additional outcome metrics (for walks and ERA calculations)
    'bb_type', 'on_1b', 'on_2b', 'on_3b',
    
    # Runs data for ERA calculation
    'post_away_score', 'post_home_score', 'fld_score', 'bat_score',
    
    # For identifying relief pitchers and game situation
    'game_type', 'stand', 'p_throws'
]

# Filter to relevant columns
statcast_2021_filtered = statcast_2021.select(relevant_columns)
statcast_2022_filtered = statcast_2022.select(relevant_columns)
statcast_2023_filtered = statcast_2023.select(relevant_columns)
statcast_2024_filtered = statcast_2024.select(relevant_columns)

statcast_2021_filtered.head()

pitcher,game_date,game_pk,release_speed,release_spin_rate,pitch_type,pitch_name,description,events,type,zone,balls,strikes,outs_when_up,inning,inning_topbot,home_team,away_team,at_bat_number,pitch_number,bb_type,on_1b,on_2b,on_3b,post_away_score,post_home_score,fld_score,bat_score,game_type,stand,p_throws
i64,datetime[ns],i64,f64,i64,str,str,str,str,str,i64,i64,i64,i64,i64,str,str,str,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,str,str,str
519293,2021-10-03 00:00:00,632254,92.3,2330,"""FF""","""4-Seam Fastball""","""hit_into_play""","""field_out""","""X""",1,1,2,2,9,"""Top""","""ATL""","""NYM""",61,4,"""ground_ball""",,,,0,5,5,0,"""R""","""R""","""L"""
519293,2021-10-03 00:00:00,632254,80.6,2254,"""SL""","""Slider""","""foul""",,"""S""",4,1,1,2,9,"""Top""","""ATL""","""NYM""",61,3,,,,,0,5,5,0,"""R""","""R""","""L"""
519293,2021-10-03 00:00:00,632254,75.5,1940,"""CU""","""Curveball""","""foul""",,"""S""",5,1,0,2,9,"""Top""","""ATL""","""NYM""",61,2,,,,,0,5,5,0,"""R""","""R""","""L"""
519293,2021-10-03 00:00:00,632254,75.0,2017,"""CU""","""Curveball""","""ball""",,"""B""",12,0,0,2,9,"""Top""","""ATL""","""NYM""",61,1,,,,,0,5,5,0,"""R""","""R""","""L"""
519293,2021-10-03 00:00:00,632254,91.2,2281,"""FF""","""4-Seam Fastball""","""hit_into_play""","""field_out""","""X""",4,1,0,1,9,"""Top""","""ATL""","""NYM""",60,2,"""ground_ball""",,,,0,5,5,0,"""R""","""L""","""L"""


In [102]:
# Rename columns to be easier to type and read
column_rename_map = {
    'pitcher': 'pitcher_id',
    'game_date': 'date',
    'game_pk': 'game_id',
    'release_speed': 'velocity',
    'release_spin_rate': 'spin_rate',
    'pitch_type': 'pitch_type_abbr',
    'pitch_name': 'pitch_type',
    'description': 'pitch_result',
    'events': 'ab_result',
    'type': 'pitch_call',
    'zone': 'strike_zone',
    'balls': 'ball_count',
    'strikes': 'strike_count',
    'outs_when_up': 'outs',
    'inning_topbot': 'top_bottom',
    'at_bat_number': 'ab_number',
    'pitch_number': 'pitch_num',
    'bb_type': 'batted_ball_type',
    'on_1b': 'runner_1b',
    'on_2b': 'runner_2b',
    'on_3b': 'runner_3b',
    'post_away_score': 'away_score',
    'post_home_score': 'home_score',
    'fld_score': 'defense_score',
    'bat_score': 'offense_score',
    'stand': 'batter_side',
    'p_throws': 'pitcher_hand'
}

# Apply renaming to all filtered datasets
statcast_2021_filtered = statcast_2021_filtered.rename(column_rename_map)
statcast_2022_filtered = statcast_2022_filtered.rename(column_rename_map)
statcast_2023_filtered = statcast_2023_filtered.rename(column_rename_map)
statcast_2024_filtered = statcast_2024_filtered.rename(column_rename_map)

statcast_2021_filtered.head()

pitcher_id,date,game_id,velocity,spin_rate,pitch_type_abbr,pitch_type,pitch_result,ab_result,pitch_call,strike_zone,ball_count,strike_count,outs,inning,top_bottom,home_team,away_team,ab_number,pitch_num,batted_ball_type,runner_1b,runner_2b,runner_3b,away_score,home_score,defense_score,offense_score,game_type,batter_side,pitcher_hand
i64,datetime[ns],i64,f64,i64,str,str,str,str,str,i64,i64,i64,i64,i64,str,str,str,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,str,str,str
519293,2021-10-03 00:00:00,632254,92.3,2330,"""FF""","""4-Seam Fastball""","""hit_into_play""","""field_out""","""X""",1,1,2,2,9,"""Top""","""ATL""","""NYM""",61,4,"""ground_ball""",,,,0,5,5,0,"""R""","""R""","""L"""
519293,2021-10-03 00:00:00,632254,80.6,2254,"""SL""","""Slider""","""foul""",,"""S""",4,1,1,2,9,"""Top""","""ATL""","""NYM""",61,3,,,,,0,5,5,0,"""R""","""R""","""L"""
519293,2021-10-03 00:00:00,632254,75.5,1940,"""CU""","""Curveball""","""foul""",,"""S""",5,1,0,2,9,"""Top""","""ATL""","""NYM""",61,2,,,,,0,5,5,0,"""R""","""R""","""L"""
519293,2021-10-03 00:00:00,632254,75.0,2017,"""CU""","""Curveball""","""ball""",,"""B""",12,0,0,2,9,"""Top""","""ATL""","""NYM""",61,1,,,,,0,5,5,0,"""R""","""R""","""L"""
519293,2021-10-03 00:00:00,632254,91.2,2281,"""FF""","""4-Seam Fastball""","""hit_into_play""","""field_out""","""X""",4,1,0,1,9,"""Top""","""ATL""","""NYM""",60,2,"""ground_ball""",,,,0,5,5,0,"""R""","""L""","""L"""


In [103]:
# Add year column to each dataset and combine all years
statcast_2021_filtered = statcast_2021_filtered.with_columns(pl.lit(2021).alias('year'))
statcast_2022_filtered = statcast_2022_filtered.with_columns(pl.lit(2022).alias('year'))
statcast_2023_filtered = statcast_2023_filtered.with_columns(pl.lit(2023).alias('year'))
statcast_2024_filtered = statcast_2024_filtered.with_columns(pl.lit(2024).alias('year'))

# Combine all years into one dataset
statcast_combined = pl.concat([
    statcast_2021_filtered,
    statcast_2022_filtered,
    statcast_2023_filtered,
    statcast_2024_filtered
])

# Sort by pitcher and date for proper rest day calculations
statcast_combined = statcast_combined.sort(['pitcher_id', 'date', 'game_id', 'pitch_num'])

statcast_combined.shape

(2853845, 32)

## Filter to Relief Pitchers Only

Relief pitchers experience more variable rest intervals compared to starting pitchers, who typically pitch every 5 days. 

**Methodology**: Identify the starting pitcher (first pitcher to throw in inning 1) for each game-half and exclude those specific starting appearances. This approach:
- Excludes starting appearances only (not entire pitchers)
- Includes relief appearances by pitchers who occasionally start
- Captures the full spectrum of relief roles (long relievers, setup men, closers)

This filtering ensures we're analyzing actual relief appearances with diverse rest day patterns.

In [113]:
# Identify starting pitchers (first pitcher in inning 1 of each game-half)
# Get only inning 1 pitchers, group by game/inning, take first pitcher who threw
(starters := (statcast_combined
           .filter(pl.col('inning') == 1)
           .group_by(['game_id', 'top_bottom'])
           .agg(pl.col('pitcher_id').first())
           .select(['game_id', 'top_bottom', 'pitcher_id'])))


game_id,top_bottom,pitcher_id
i64,str,i64
634441,"""Top""",656288
745430,"""Top""",663362
745354,"""Bot""",668678
745808,"""Top""",640455
633192,"""Top""",605400
…,…,…
661221,"""Top""",593334
716646,"""Top""",543037
718350,"""Top""",656232
746634,"""Top""",663474


In [114]:
 # Create a set of (game_id, top_bottom, pitcher_id) combinations where pitcher started
# This allows us to exclude only the specific games where a pitcher started,
# not all their appearances if they ever started once
starter_games = starters.with_columns(pl.lit(True).alias('is_starter'))

# Left join to mark which game appearances were starts
statcast_with_starts = statcast_combined.join(
    starter_games, 
    on=['game_id', 'top_bottom', 'pitcher_id'], 
    how='left'
)

# Filter to only relief appearances (where is_starter is null)
statcast_relievers = statcast_with_starts.filter(pl.col('is_starter').is_null()).drop('is_starter')

# Check unique pitcher counts
print(f"Total unique pitchers: {statcast_combined['pitcher_id'].n_unique()}")
print(f"Relief pitcher IDs (pitchers with relief appearances): {statcast_relievers['pitcher_id'].n_unique()}")
print(f"Pitchers who both started and relieved: {statcast_combined['pitcher_id'].n_unique() - statcast_relievers['pitcher_id'].n_unique()}")



Total unique pitchers: 1569
Relief pitcher IDs (pitchers with relief appearances): 1445
Pitchers who both started and relieved: 124


In [115]:
# Summary statistics
total_pitchers = statcast_combined['pitcher_id'].n_unique()
relief_pitchers = statcast_relievers['pitcher_id'].n_unique()
total_pitches = statcast_combined.height
relief_pitches = statcast_relievers.height

print(
    f"Total unique pitchers: {total_pitchers}\n"
    f"Relief unique pitchers: {relief_pitchers}\n"
    f"Total pitches in dataset: {total_pitches:,}\n"
    f"Relief pitches analyzing: {relief_pitches:,}\n"
    f"Relief pitcher dataset shape: {statcast_relievers.shape}\n"
)

Total unique pitchers: 1569
Relief unique pitchers: 1445
Total pitches in dataset: 2,853,845
Relief pitches analyzing: 1,207,931
Relief pitcher dataset shape: (1207931, 32)



In [116]:
# Verify we have relief pitcher data from all 4 years
year_check = statcast_relievers.group_by('year').agg([
    pl.col('pitcher_id').n_unique().alias('unique_pitchers'),
    pl.col('game_id').n_unique().alias('unique_games'),
    pl.len().alias('total_pitches')
]).sort('year')

print("Relief pitcher data by year:")
print(year_check)

Relief pitcher data by year:
shape: (4, 4)
┌──────┬─────────────────┬──────────────┬───────────────┐
│ year ┆ unique_pitchers ┆ unique_games ┆ total_pitches │
│ ---  ┆ ---             ┆ ---          ┆ ---           │
│ i32  ┆ u32             ┆ u32          ┆ u32           │
╞══════╪═════════════════╪══════════════╪═══════════════╡
│ 2021 ┆ 772             ┆ 2429         ┆ 309617        │
│ 2022 ┆ 712             ┆ 2428         ┆ 296927        │
│ 2023 ┆ 715             ┆ 2429         ┆ 306082        │
│ 2024 ┆ 674             ┆ 2424         ┆ 295305        │
└──────┴─────────────────┴──────────────┴───────────────┘


In [117]:
# Preview the relief pitcher dataset
statcast_relievers.head(10)

pitcher_id,date,game_id,velocity,spin_rate,pitch_type_abbr,pitch_type,pitch_result,ab_result,pitch_call,strike_zone,ball_count,strike_count,outs,inning,top_bottom,home_team,away_team,ab_number,pitch_num,batted_ball_type,runner_1b,runner_2b,runner_3b,away_score,home_score,defense_score,offense_score,game_type,batter_side,pitcher_hand,year
i64,datetime[ns],i64,f64,i64,str,str,str,str,str,i64,i64,i64,i64,i64,str,str,str,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,str,str,str,i32
405395,2022-05-15 00:00:00,661984,63.7,1695,"""FA""","""Other""","""ball""",,"""B""",12,0,0,0,9,"""Top""","""STL""","""SF""",79,1,,,,,2,15,15,2,"""R""","""R""","""R""",2022
405395,2022-05-15 00:00:00,661984,64.4,1571,"""FA""","""Other""","""foul""",,"""S""",1,0,0,2,9,"""Top""","""STL""","""SF""",85,1,,,,,6,15,15,6,"""R""","""L""","""R""",2022
405395,2022-05-15 00:00:00,661984,63.5,1699,"""FA""","""Other""","""ball""",,"""B""",12,0,0,0,9,"""Top""","""STL""","""SF""",80,1,,573131.0,,,2,15,15,2,"""R""","""R""","""R""",2022
405395,2022-05-15 00:00:00,661984,53.6,1450,"""FA""","""Other""","""ball""",,"""B""",13,0,0,2,9,"""Top""","""STL""","""SF""",84,1,,,,,5,15,15,5,"""R""","""R""","""R""",2022
405395,2022-05-15 00:00:00,661984,54.0,1485,"""FA""","""Other""","""called_strike""",,"""S""",4,0,0,1,9,"""Top""","""STL""","""SF""",81,1,,573131.0,,,2,15,15,2,"""R""","""R""","""R""",2022
405395,2022-05-15 00:00:00,661984,62.2,1672,"""FA""","""Other""","""called_strike""",,"""S""",4,0,0,2,9,"""Top""","""STL""","""SF""",83,1,,642731.0,,573131.0,2,15,15,2,"""R""","""L""","""R""",2022
405395,2022-05-15 00:00:00,661984,69.6,1735,"""FA""","""Other""","""ball""",,"""B""",12,0,0,1,9,"""Top""","""STL""","""SF""",82,1,,446334.0,573131.0,,2,15,15,2,"""R""","""R""","""R""",2022
405395,2022-05-15 00:00:00,661984,55.2,1554,"""CU""","""Curveball""","""blocked_ball""",,"""B""",13,1,0,1,9,"""Top""","""STL""","""SF""",82,2,,446334.0,573131.0,,2,15,15,2,"""R""","""R""","""R""",2022
405395,2022-05-15 00:00:00,661984,66.8,1648,"""FA""","""Other""","""foul""",,"""S""",6,0,1,2,9,"""Top""","""STL""","""SF""",85,2,,,,,6,15,15,6,"""R""","""L""","""R""",2022
405395,2022-05-15 00:00:00,661984,64.6,1776,"""FA""","""Other""","""ball""",,"""B""",14,1,0,0,9,"""Top""","""STL""","""SF""",79,2,,,,,2,15,15,2,"""R""","""R""","""R""",2022


## Add Pitcher Names

Use pybaseball to lookup pitcher names from their MLB IDs.

In [118]:
# Import pybaseball for player ID lookup
from pybaseball import playerid_reverse_lookup

# Get unique pitcher IDs
unique_pitcher_ids = statcast_relievers['pitcher_id'].unique().to_list()

# Lookup player names (this may take a minute)
pitcher_names = playerid_reverse_lookup(unique_pitcher_ids, key_type='mlbam')

# Convert to Polars and select relevant columns
pitcher_lookup = pl.from_pandas(pitcher_names[['key_mlbam', 'name_first', 'name_last']]).rename({
    'key_mlbam': 'pitcher_id',
    'name_first': 'first_name',
    'name_last': 'last_name'
})

# Create full name column
pitcher_lookup = pitcher_lookup.with_columns(
    (pl.col('last_name') + ', ' + pl.col('first_name')).alias('pitcher_name')
).select(['pitcher_id', 'pitcher_name'])

pitcher_lookup.head()

pitcher_id,pitcher_name
i64,str
594987,"""sadler, casey"""
680911,"""miller, owen"""
703231,"""meeker, james"""
660636,"""castillo, diego"""
570666,"""cessa, luis"""


In [119]:
# Join pitcher names to the relievers dataset
statcast_relievers = statcast_relievers.join(pitcher_lookup, on='pitcher_id', how='left')

# Reorder columns to put pitcher_name after pitcher_id
cols = statcast_relievers.columns
cols.remove('pitcher_name')
pitcher_id_index = cols.index('pitcher_id')
cols.insert(pitcher_id_index + 1, 'pitcher_name')
statcast_relievers = statcast_relievers.select(cols)

print("Pitcher names added successfully!")
statcast_relievers.head()

Pitcher names added successfully!


pitcher_id,pitcher_name,date,game_id,velocity,spin_rate,pitch_type_abbr,pitch_type,pitch_result,ab_result,pitch_call,strike_zone,ball_count,strike_count,outs,inning,top_bottom,home_team,away_team,ab_number,pitch_num,batted_ball_type,runner_1b,runner_2b,runner_3b,away_score,home_score,defense_score,offense_score,game_type,batter_side,pitcher_hand,year
i64,str,datetime[ns],i64,f64,i64,str,str,str,str,str,i64,i64,i64,i64,i64,str,str,str,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,str,str,str,i32
405395,"""pujols, albert""",2022-05-15 00:00:00,661984,63.7,1695,"""FA""","""Other""","""ball""",,"""B""",12,0,0,0,9,"""Top""","""STL""","""SF""",79,1,,,,,2,15,15,2,"""R""","""R""","""R""",2022
405395,"""pujols, albert""",2022-05-15 00:00:00,661984,64.4,1571,"""FA""","""Other""","""foul""",,"""S""",1,0,0,2,9,"""Top""","""STL""","""SF""",85,1,,,,,6,15,15,6,"""R""","""L""","""R""",2022
405395,"""pujols, albert""",2022-05-15 00:00:00,661984,63.5,1699,"""FA""","""Other""","""ball""",,"""B""",12,0,0,0,9,"""Top""","""STL""","""SF""",80,1,,573131.0,,,2,15,15,2,"""R""","""R""","""R""",2022
405395,"""pujols, albert""",2022-05-15 00:00:00,661984,53.6,1450,"""FA""","""Other""","""ball""",,"""B""",13,0,0,2,9,"""Top""","""STL""","""SF""",84,1,,,,,5,15,15,5,"""R""","""R""","""R""",2022
405395,"""pujols, albert""",2022-05-15 00:00:00,661984,54.0,1485,"""FA""","""Other""","""called_strike""",,"""S""",4,0,0,1,9,"""Top""","""STL""","""SF""",81,1,,573131.0,,,2,15,15,2,"""R""","""R""","""R""",2022


## Filter Out Position Players Who Pitched

Position players occasionally pitch in blowout games but should be excluded from analysis. Real relief pitchers should have pitched at least 10 games in at least one season, indicating they had an actual relief role.

This ensures we're analyzing actual relief pitchers only.

In [120]:
# Identify real pitchers: at least 10 games in at least one season
pitcher_games_by_year = statcast_relievers.group_by(['pitcher_id', 'year']).agg(
    pl.col('game_id').n_unique().alias('games')
)
real_pitcher_ids = pitcher_games_by_year.filter(pl.col('games') >= 10)['pitcher_id'].unique()

print(f"Pitchers before filter: {statcast_relievers['pitcher_id'].n_unique()}")
print(f"Relievers pitchers (10+ games in any season): {len(real_pitcher_ids)}")

# Filter to real pitchers only
statcast_relievers = statcast_relievers.filter(pl.col('pitcher_id').is_in(real_pitcher_ids))

print(f"Pitchers after filter: {statcast_relievers['pitcher_id'].n_unique()}")
print(f"Dataset shape: {statcast_relievers.shape}")

Pitchers before filter: 1445
Relievers pitchers (10+ games in any season): 781
Pitchers after filter: 781
Dataset shape: (1129247, 33)


Please use `implode` to return to previous behavior.

See https://github.com/pola-rs/polars/issues/22149 for more information.
  statcast_relievers = statcast_relievers.filter(pl.col('pitcher_id').is_in(real_pitcher_ids))


In [121]:
# Save final cleaned relief pitcher dataset
statcast_relievers.write_parquet('data/statcast_relievers.parquet')

print("Final cleaned relief pitcher dataset saved!")
statcast_relievers.head()

Final cleaned relief pitcher dataset saved!


pitcher_id,pitcher_name,date,game_id,velocity,spin_rate,pitch_type_abbr,pitch_type,pitch_result,ab_result,pitch_call,strike_zone,ball_count,strike_count,outs,inning,top_bottom,home_team,away_team,ab_number,pitch_num,batted_ball_type,runner_1b,runner_2b,runner_3b,away_score,home_score,defense_score,offense_score,game_type,batter_side,pitcher_hand,year
i64,str,datetime[ns],i64,f64,i64,str,str,str,str,str,i64,i64,i64,i64,i64,str,str,str,i64,i64,str,i64,i64,i64,i64,i64,i64,i64,str,str,str,i32
429722,"""santana, ervin""",2021-04-20 00:00:00,634407,83.5,2133,"""SL""","""Slider""","""foul""",,"""S""",8,0,0,1,5,"""Top""","""KC""","""TB""",44,1,,,595281.0,,6,4,4,6,"""R""","""R""","""R""",2021
429722,"""santana, ervin""",2021-04-20 00:00:00,634407,80.4,2388,"""SL""","""Slider""","""ball""",,"""B""",11,0,0,2,4,"""Top""","""KC""","""TB""",37,1,,,,,6,4,4,6,"""R""","""R""","""R""",2021
429722,"""santana, ervin""",2021-04-20 00:00:00,634407,92.3,2207,"""FF""","""4-Seam Fastball""","""hit_into_play""","""home_run""","""X""",9,0,0,2,4,"""Top""","""KC""","""TB""",36,1,"""fly_ball""",,,,6,4,4,5,"""R""","""L""","""R""",2021
429722,"""santana, ervin""",2021-04-20 00:00:00,634407,84.8,2240,"""SL""","""Slider""","""called_strike""",,"""S""",5,0,0,1,5,"""Top""","""KC""","""TB""",45,1,,,,642715.0,7,4,4,7,"""R""","""R""","""R""",2021
429722,"""santana, ervin""",2021-04-20 00:00:00,634407,83.2,2354,"""SL""","""Slider""","""blocked_ball""",,"""B""",14,0,0,2,5,"""Top""","""KC""","""TB""",46,1,,,,642715.0,7,4,4,7,"""R""","""L""","""R""",2021


## Summary

This notebook creates a cleaned relief pitcher dataset for analyzing the relationship between rest days and performance (2021-2024 MLB regular seasons).

**Data Processing Steps:**
1. Loaded 2021-2024 Statcast pitch-level data (4 parquet files)
2. Selected 31 relevant columns for analysis (velocity, spin rate, pitch outcomes, game context)
3. Combined all years and sorted by pitcher → date → game → pitch

**Relief Pitcher Filtering:**
- Identified starting pitchers as first pitcher in inning 1 per game-half
- Excluded specific starting appearances (not entire pitchers)
- Retained relief appearances by pitchers who occasionally start
- Added pitcher names via pybaseball lookup
- Filtered out position players (required 10+ games in at least one season)

**Final Dataset:**
- **Scope**: Relief appearances only, 2021-2024 regular season
- **Unit of observation**: Individual pitch
- **Key variables**: pitcher_id, pitcher_name, date, velocity, spin_rate, pitch outcomes
- **Saved to**: `data/statcast_relievers.parquet`

Ready for rest day calculations and performance analysis.