In [1]:
import pandas as pd
from pybaseball import statcast

In [8]:
# Define filter parameters
start_date = '2023-04-01'
end_date = '2023-09-30'
minPA = 10  # Minimum plate appearances
minIP = 5   # Minimum innings pitched

In [5]:
# Fetch Statcast data
data = statcast(start_dt=start_date, end_dt=end_date)
# Select relevant columns
selected_columns = [
    'pitch_type', 'release_speed', 'batter', 'events', 
    'plate_x', 'plate_z', 'description', 'game_date', 'inning', 'pitcher',
    'balls', 'strikes'
]
data = data[selected_columns]

This is a large query, it may take a moment to complete


That's a nice request you got there. It'd be a shame if something were to happen to it.
We strongly recommend that you enable caching before running this. It's as simple as `pybaseball.cache.enable()`.
Since the Statcast requests can take a *really* long time to run, if something were to happen, like: a disconnect;
gremlins; computer repair by associates of Rudy Giuliani; electromagnetic interference from metal trash cans; etc.;
you could lose a lot of progress. Enabling caching will allow you to immediately recover all the successful
subqueries if that happens.
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[column].apply(pd.to_datetime, errors='ignore', format=date_format)
  data_copy[column] = data_copy[col

In [9]:
# Calculate plate appearances (PA) and innings pitched (IP)
# Group by batter for PA and pitcher for IP
pa_counts = data.groupby('batter').size().reset_index(name='PA')
ip_counts = data.groupby('pitcher')['inning'].nunique().reset_index(name='IP')
# Merge PA and IP counts back to the main dataset
data = data.merge(pa_counts, on='batter', how='left')
data = data.merge(ip_counts, on='pitcher', how='left')
# Filter data by minPA and minIP
data = data[(data['PA'] >= minPA) & (data['IP'] >= minIP)]

In [10]:
# Define success events for analysis
success_events = ['single', 'double', 'triple', 'home_run', 'walk']
data['success'] = data['events'].isin(success_events).astype(int)


In [11]:
# Function to analyze scenarios based on count
def analyze_scenario(balls, strikes):
    scenario_data = data[(data['balls'] == balls) & (data['strikes'] == strikes)]
    if scenario_data.empty:
        print(f"No data available for count {balls}-{strikes}.")
        return

    # Group by pitch type
    analysis = scenario_data.groupby('pitch_type').agg(
        total_pitches=('pitch_type', 'size'),
        successes=('success', 'sum')
    ).reset_index()

    # Calculate success rates
    analysis['success_rate'] = analysis['successes'] / analysis['total_pitches']

    print(f"Analysis for count {balls}-{strikes}:")
    print(analysis)


In [12]:
# Example: Analyze 3-2 count
analyze_scenario(3, 2)


Analysis for count 3-2:
   pitch_type  total_pitches  successes  success_rate
0          CH           3240       1105      0.341049
1          CS              1          1      1.000000
2          CU           1687        577      0.342027
3          FA              1          0      0.000000
4          FC           2812        875      0.311166
5          FF          12563       3853      0.306694
6          FO             43         23      0.534884
7          FS            676        228      0.337278
8          KC            432        161      0.372685
9          KN              2          0      0.000000
10         SC              2          2      1.000000
11         SI           5475       1846      0.337169
12         SL           5344       1684      0.315120
13         ST           1436        493      0.343315
14         SV            109         28      0.256881


In [16]:
# Function to analyze success rate for a specific pitch sequence
def analyze_pitch_sequence(sequence):
    sequence_length = len(sequence)

    # Identify rows with the given pitch sequence using a custom approach
    data['sequence_match'] = data['pitch_type'].shift(0).eq(sequence[0])
    for i in range(1, sequence_length):
        data['sequence_match'] &= data['pitch_type'].shift(-i).eq(sequence[i])

    # Filter data for matching sequences
    sequence_data = data[data['sequence_match']]

    if sequence_data.empty:
        print(f"No data available for pitch sequence {sequence}.")
        return

    # Group by final pitch type in the sequence
    analysis = sequence_data.groupby('pitch_type').agg(
        total_pitches=('pitch_type', 'size'),
        successes=('success', 'sum')
    ).reset_index()

    # Calculate success rates
    analysis['success_rate'] = analysis['successes'] / analysis['total_pitches']

    print(f"Analysis for pitch sequence {sequence}:")
    print(analysis)

In [18]:
# Example: Analyze a pitch sequence ['FF', 'CH']
analyze_pitch_sequence(['FF', 'CH'])

Analysis for pitch sequence ['FF', 'CH']:
  pitch_type  total_pitches  successes  success_rate
0         FF          24579       2045      0.083201


In [22]:
def analyze_scenario_and_sequence(balls, strikes, sequence):
    # Filter by count
    scenario_data = data[(data['balls'] == balls) & (data['strikes'] == strikes)]
    if scenario_data.empty:
        print(f"No data available for count {balls}-{strikes}.")
        return

    # Identify rows with the given pitch sequence
    sequence_length = len(sequence)
    scenario_data['sequence_match'] = False

    for idx in range(len(scenario_data) - sequence_length + 1):
        # Check if the sequence matches
        if list(scenario_data['pitch_type'].iloc[idx:idx + sequence_length]) == sequence:
            scenario_data.iloc[idx + sequence_length - 1, scenario_data.columns.get_loc('sequence_match')] = True

    # Filter data for matching sequences
    sequence_data = scenario_data[scenario_data['sequence_match']]

    if sequence_data.empty:
        print(f"No data available for pitch sequence {sequence} at count {balls}-{strikes}.")
        return

    # Group by final pitch type in the sequence
    analysis = sequence_data.groupby('pitch_type').agg(
        total_pitches=('pitch_type', 'size'),
        successes=('success', 'sum')
    ).reset_index()

    # Calculate success rates
    analysis['success_rate'] = analysis['successes'] / analysis['total_pitches']

    print(f"Analysis for pitch sequence {sequence} at count {balls}-{strikes}:")
    print(analysis)

In [23]:
# Example: Analyze 3-2 count with a pitch sequence ['FF', 'CH']
analyze_scenario_and_sequence(3, 2, ['FF', 'CH'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  scenario_data['sequence_match'] = False


Analysis for pitch sequence ['FF', 'CH'] at count 3-2:
  pitch_type  total_pitches  successes  success_rate
0         CH           1168        397      0.339897
