# feats using polar yay

In [2]:
import logging
import sys
import time
from pathlib import Path

# import pandas as pd
import polars as pl
import pybaseball as pb
import yaml

logger = logging.getLogger("featurize")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)
cfg = pl.Config()
cfg.set_tbl_rows(2000)

polars.config.Config

In [3]:
with open("../params.yaml", "r") as file:
    params = yaml.safe_load(file)
input_file_path = params["featurize"]["input_data_path"]

df = pl.read_parquet("../" / Path(input_file_path))

In [4]:
df = df.with_columns(
    df.select(
        pl.col(pl.String).exclude(["player_name"]).cast(pl.Categorical).to_physical()
    ),
)

In [5]:
df = df.sort(
    [
        "game_date",
        "game_pk",
        "at_bat_number",
        "pitch_number",
    ],
    descending=False,
)
# create target variable
df = df.with_columns(
    df.select(pl.col("pitch_type").shift().alias("next_pitch")),
).drop_nulls("next_pitch")

In [6]:
df.select(pl.col(["pitch_type", "next_pitch"])).head(20)

pitch_type,next_pitch
u32,u32
0,0
0,0
2,0
5,2
1,5
0,1
1,0
1,1
5,1
1,5


In [7]:
df = df.with_columns(
    (pl.col("balls").cast(pl.String) + " - " + pl.col("strikes").cast(pl.String))
    .alias("count")
    .cast(pl.Categorical)
    .to_physical()
)
df = df.drop(["balls", "strikes"])

In [8]:
print(df.select("count").describe())
df.head()


shape: (9, 2)
┌────────────┬──────────┐
│ statistic  ┆ count    │
│ ---        ┆ ---      │
│ str        ┆ f64      │
╞════════════╪══════════╡
│ count      ┆ 591111.0 │
│ null_count ┆ 0.0      │
│ mean       ┆ 3.569081 │
│ std        ┆ 3.025384 │
│ min        ┆ 0.0      │
│ 25%        ┆ 1.0      │
│ 50%        ┆ 3.0      │
│ 75%        ┆ 6.0      │
│ max        ┆ 12.0     │
└────────────┴──────────┘


pitch_type,game_date,release_speed,player_name,batter,pitcher,type,hit_location,bb_type,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,hc_x,hc_y,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_run_exp,bat_speed,swing_length,next_pitch,count
u32,datetime[μs],f64,str,i64,i64,u32,f64,u32,f64,f64,f64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,u32,u32,f64,f64,f64,f64,u32,u32
0,2015-04-06 00:00:00,90.2,"""Keuchel, Dallas""",456422,572971,0,4.0,2.0,-0.2,2.42,,,,0,1,,,-4.628975,-131.971479,-5.967792,9.441,25.153,-14.651,3.32,1.5,,,,,,,413654,,,0.0,,0.0,0.0,,1,2,0,0,0,0,,,,-0.192,,,0,0
0,2015-04-06 00:00:00,89.7,"""Keuchel, Dallas""",543401,572971,2,,,0.02,1.64,,,,1,1,,,-3.799464,-131.134671,-7.613539,7.162,21.465,-16.083,3.34,1.48,,,,,,,413654,,,,,,,,2,1,0,0,0,0,,,,-0.025,,,0,1
2,2015-04-06 00:00:00,87.2,"""Keuchel, Dallas""",543401,572971,1,,,-1.54,0.73,,,,1,1,,,-6.769911,-127.314531,-7.249607,3.693,20.834,-24.441,3.27,1.49,,,,,,,413654,,,,,,,,2,2,0,0,0,0,,,,0.017,,,0,0
5,2015-04-06 00:00:00,89.8,"""Keuchel, Dallas""",543401,572971,1,,,-1.06,2.54,,,,1,1,,,-8.908004,-131.414862,-3.799711,17.985,26.595,-23.057,3.24,1.49,,,,89.6,2132.0,6.2,413654,,,,,,,,2,3,0,0,0,0,3.0,0.0,,0.031,,,2,2
1,2015-04-06 00:00:00,80.8,"""Keuchel, Dallas""",543401,572971,0,7.0,1.0,-0.82,1.43,,,,1,1,,,-2.59669,-118.371671,-1.826686,-6.587,22.218,-34.947,3.33,1.49,217.0,67.9,33.0,79.9,2151.0,5.7,413654,0.76,0.721,0.0,1.0,0.0,0.0,4.0,2,4,0,0,0,0,3.0,1.0,,-0.179,,,5,3


In [9]:
df = df.with_columns(df.select(["on_1b", "on_2b", "on_3b"]).fill_null(-1))
df = df.with_columns(
    pl.col("on_1b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_1b"),
    pl.col("on_2b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_2b"),
    pl.col("on_3b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_3b"),
)
df = df.with_columns(
    (
        pl.col("on_1b").cast(pl.String)
        + "/"
        + pl.col("on_2b").cast(pl.String)
        + "/"
        + pl.col("on_3b").cast(pl.String)
    )
    .alias("base_state")
    .cast(pl.Categorical)
    .to_physical()
)
df = df.drop(["on_1b", "on_2b", "on_3b"])

In [10]:
print(df.select("base_state").describe())

df.head()

shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ base_state │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 591111.0   │
│ null_count ┆ 0.0        │
│ mean       ┆ 0.923358   │
│ std        ┆ 1.605332   │
│ min        ┆ 0.0        │
│ 25%        ┆ 0.0        │
│ 50%        ┆ 0.0        │
│ 75%        ┆ 1.0        │
│ max        ┆ 7.0        │
└────────────┴────────────┘


pitch_type,game_date,release_speed,player_name,batter,pitcher,type,hit_location,bb_type,plate_x,plate_z,outs_when_up,inning,hc_x,hc_y,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_run_exp,bat_speed,swing_length,next_pitch,count,base_state
u32,datetime[μs],f64,str,i64,i64,u32,f64,u32,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,u32,u32,f64,f64,f64,f64,u32,u32,u32
0,2015-04-06 00:00:00,90.2,"""Keuchel, Dallas""",456422,572971,0,4.0,2.0,-0.2,2.42,0,1,,,-4.628975,-131.971479,-5.967792,9.441,25.153,-14.651,3.32,1.5,,,,,,,413654,,,0.0,,0.0,0.0,,1,2,0,0,0,0,,,,-0.192,,,0,0,0
0,2015-04-06 00:00:00,89.7,"""Keuchel, Dallas""",543401,572971,2,,,0.02,1.64,1,1,,,-3.799464,-131.134671,-7.613539,7.162,21.465,-16.083,3.34,1.48,,,,,,,413654,,,,,,,,2,1,0,0,0,0,,,,-0.025,,,0,1,0
2,2015-04-06 00:00:00,87.2,"""Keuchel, Dallas""",543401,572971,1,,,-1.54,0.73,1,1,,,-6.769911,-127.314531,-7.249607,3.693,20.834,-24.441,3.27,1.49,,,,,,,413654,,,,,,,,2,2,0,0,0,0,,,,0.017,,,0,0,0
5,2015-04-06 00:00:00,89.8,"""Keuchel, Dallas""",543401,572971,1,,,-1.06,2.54,1,1,,,-8.908004,-131.414862,-3.799711,17.985,26.595,-23.057,3.24,1.49,,,,89.6,2132.0,6.2,413654,,,,,,,,2,3,0,0,0,0,3.0,0.0,,0.031,,,2,2,0
1,2015-04-06 00:00:00,80.8,"""Keuchel, Dallas""",543401,572971,0,7.0,1.0,-0.82,1.43,1,1,,,-2.59669,-118.371671,-1.826686,-6.587,22.218,-34.947,3.33,1.49,217.0,67.9,33.0,79.9,2151.0,5.7,413654,0.76,0.721,0.0,1.0,0.0,0.0,4.0,2,4,0,0,0,0,3.0,1.0,,-0.179,,,5,3,0


In [11]:
print(df.shape)

# # find columns with null percentage > 10%
# # Calculate null percentages for all columns
# null_percentages = df.select(pl.all().is_null().sum() / df.height)

# # Filter columns where null percentage > 10%
# columns_with_too_many_nulls = (
#     null_percentages.melt().filter(pl.col("value") > 0.1).select("variable")
# )

# print("Columns with >10% null values:")
# print(columns_with_too_many_nulls)

# drop columns with too many nulls
df = df.drop(
    df.select(pl.all().is_null().sum() / df.height)
    .unpivot()
    .filter(pl.col("value") > 0.05)
    .select("variable")
    .to_series()
    .to_list()
)
print(df.shape)
df.head()

(591111, 52)
(591111, 35)


pitch_type,game_date,release_speed,player_name,batter,pitcher,type,plate_x,plate_z,outs_when_up,inning,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,at_bat_number,pitch_number,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,delta_run_exp,next_pitch,count,base_state
u32,datetime[μs],f64,str,i64,i64,u32,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,u32,u32,f64,u32,u32,u32
0,2015-04-06 00:00:00,90.2,"""Keuchel, Dallas""",456422,572971,0,-0.2,2.42,0,1,-4.628975,-131.971479,-5.967792,9.441,25.153,-14.651,3.32,1.5,,,,413654,1,2,0,0,0,0,,,-0.192,0,0,0
0,2015-04-06 00:00:00,89.7,"""Keuchel, Dallas""",543401,572971,2,0.02,1.64,1,1,-3.799464,-131.134671,-7.613539,7.162,21.465,-16.083,3.34,1.48,,,,413654,2,1,0,0,0,0,,,-0.025,0,1,0
2,2015-04-06 00:00:00,87.2,"""Keuchel, Dallas""",543401,572971,1,-1.54,0.73,1,1,-6.769911,-127.314531,-7.249607,3.693,20.834,-24.441,3.27,1.49,,,,413654,2,2,0,0,0,0,,,0.017,0,0,0
5,2015-04-06 00:00:00,89.8,"""Keuchel, Dallas""",543401,572971,1,-1.06,2.54,1,1,-8.908004,-131.414862,-3.799711,17.985,26.595,-23.057,3.24,1.49,89.6,2132.0,6.2,413654,2,3,0,0,0,0,3.0,0.0,0.031,2,2,0
1,2015-04-06 00:00:00,80.8,"""Keuchel, Dallas""",543401,572971,0,-0.82,1.43,1,1,-2.59669,-118.371671,-1.826686,-6.587,22.218,-34.947,3.33,1.49,79.9,2151.0,5.7,413654,2,4,0,0,0,0,3.0,1.0,-0.179,5,3,0


In [12]:
# release point consistency feature


df.head()

pitch_type,game_date,release_speed,player_name,batter,pitcher,type,plate_x,plate_z,outs_when_up,inning,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,at_bat_number,pitch_number,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,delta_run_exp,next_pitch,count,base_state
u32,datetime[μs],f64,str,i64,i64,u32,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,u32,u32,f64,u32,u32,u32
0,2015-04-06 00:00:00,90.2,"""Keuchel, Dallas""",456422,572971,0,-0.2,2.42,0,1,-4.628975,-131.971479,-5.967792,9.441,25.153,-14.651,3.32,1.5,,,,413654,1,2,0,0,0,0,,,-0.192,0,0,0
0,2015-04-06 00:00:00,89.7,"""Keuchel, Dallas""",543401,572971,2,0.02,1.64,1,1,-3.799464,-131.134671,-7.613539,7.162,21.465,-16.083,3.34,1.48,,,,413654,2,1,0,0,0,0,,,-0.025,0,1,0
2,2015-04-06 00:00:00,87.2,"""Keuchel, Dallas""",543401,572971,1,-1.54,0.73,1,1,-6.769911,-127.314531,-7.249607,3.693,20.834,-24.441,3.27,1.49,,,,413654,2,2,0,0,0,0,,,0.017,0,0,0
5,2015-04-06 00:00:00,89.8,"""Keuchel, Dallas""",543401,572971,1,-1.06,2.54,1,1,-8.908004,-131.414862,-3.799711,17.985,26.595,-23.057,3.24,1.49,89.6,2132.0,6.2,413654,2,3,0,0,0,0,3.0,0.0,0.031,2,2,0
1,2015-04-06 00:00:00,80.8,"""Keuchel, Dallas""",543401,572971,0,-0.82,1.43,1,1,-2.59669,-118.371671,-1.826686,-6.587,22.218,-34.947,3.33,1.49,79.9,2151.0,5.7,413654,2,4,0,0,0,0,3.0,1.0,-0.179,5,3,0


In [13]:
# # Late inning pressure (boolean)
# df = df.with_columns(
#     ((pl.col("inning") >= 7) & (pl.col("bat_score") - pl.col("fld_score")).abs() <= 3)
#     .cast(pl.Int32)
#     .alias("high_pressure")
# )

# df.head()

In [14]:
def sort_pitch_data(df):
    return df.sort(
        [
            "game_date",  # Primary: chronological order
            "game_pk",  # Secondary: unique game identifier
            "inning",  # Tertiary: game sequence
            "at_bat_number",  # Game at-bat order
            "pitch_number",  # At-bat pitch sequence
        ],
        descending=False,  # Ascending order for all
    )


df = sort_pitch_data(df)
df.head()

pitch_type,game_date,release_speed,player_name,batter,pitcher,type,plate_x,plate_z,outs_when_up,inning,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,at_bat_number,pitch_number,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,delta_run_exp,next_pitch,count,base_state
u32,datetime[μs],f64,str,i64,i64,u32,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,u32,u32,f64,u32,u32,u32
0,2015-04-06 00:00:00,90.2,"""Keuchel, Dallas""",456422,572971,0,-0.2,2.42,0,1,-4.628975,-131.971479,-5.967792,9.441,25.153,-14.651,3.32,1.5,,,,413654,1,2,0,0,0,0,,,-0.192,0,0,0
0,2015-04-06 00:00:00,89.7,"""Keuchel, Dallas""",543401,572971,2,0.02,1.64,1,1,-3.799464,-131.134671,-7.613539,7.162,21.465,-16.083,3.34,1.48,,,,413654,2,1,0,0,0,0,,,-0.025,0,1,0
2,2015-04-06 00:00:00,87.2,"""Keuchel, Dallas""",543401,572971,1,-1.54,0.73,1,1,-6.769911,-127.314531,-7.249607,3.693,20.834,-24.441,3.27,1.49,,,,413654,2,2,0,0,0,0,,,0.017,0,0,0
5,2015-04-06 00:00:00,89.8,"""Keuchel, Dallas""",543401,572971,1,-1.06,2.54,1,1,-8.908004,-131.414862,-3.799711,17.985,26.595,-23.057,3.24,1.49,89.6,2132.0,6.2,413654,2,3,0,0,0,0,3.0,0.0,0.031,2,2,0
1,2015-04-06 00:00:00,80.8,"""Keuchel, Dallas""",543401,572971,0,-0.82,1.43,1,1,-2.59669,-118.371671,-1.826686,-6.587,22.218,-34.947,3.33,1.49,79.9,2151.0,5.7,413654,2,4,0,0,0,0,3.0,1.0,-0.179,5,3,0


In [15]:
df.head()

pitch_type,game_date,release_speed,player_name,batter,pitcher,type,plate_x,plate_z,outs_when_up,inning,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,at_bat_number,pitch_number,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,delta_run_exp,next_pitch,count,base_state
u32,datetime[μs],f64,str,i64,i64,u32,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,u32,u32,f64,u32,u32,u32
0,2015-04-06 00:00:00,90.2,"""Keuchel, Dallas""",456422,572971,0,-0.2,2.42,0,1,-4.628975,-131.971479,-5.967792,9.441,25.153,-14.651,3.32,1.5,,,,413654,1,2,0,0,0,0,,,-0.192,0,0,0
0,2015-04-06 00:00:00,89.7,"""Keuchel, Dallas""",543401,572971,2,0.02,1.64,1,1,-3.799464,-131.134671,-7.613539,7.162,21.465,-16.083,3.34,1.48,,,,413654,2,1,0,0,0,0,,,-0.025,0,1,0
2,2015-04-06 00:00:00,87.2,"""Keuchel, Dallas""",543401,572971,1,-1.54,0.73,1,1,-6.769911,-127.314531,-7.249607,3.693,20.834,-24.441,3.27,1.49,,,,413654,2,2,0,0,0,0,,,0.017,0,0,0
5,2015-04-06 00:00:00,89.8,"""Keuchel, Dallas""",543401,572971,1,-1.06,2.54,1,1,-8.908004,-131.414862,-3.799711,17.985,26.595,-23.057,3.24,1.49,89.6,2132.0,6.2,413654,2,3,0,0,0,0,3.0,0.0,0.031,2,2,0
1,2015-04-06 00:00:00,80.8,"""Keuchel, Dallas""",543401,572971,0,-0.82,1.43,1,1,-2.59669,-118.371671,-1.826686,-6.587,22.218,-34.947,3.33,1.49,79.9,2151.0,5.7,413654,2,4,0,0,0,0,3.0,1.0,-0.179,5,3,0


In [16]:
print(df.height)
filtered = []
for pitcher_df in df.group_by("pitcher"):
    pitcher_code = pitcher_df[0]
    pitcher_df = pitcher_df[1]
    val_counts = (
        pitcher_df.select(
            pl.col("pitch_type").value_counts(normalize=True).alias("pitch_type_counts")
        )
        .unnest("pitch_type_counts")
        .sort("proportion", descending=True)
    )

    # Get pitch types with proportion < 0.01
    low_prop_pitches = (
        val_counts.filter(pl.col("proportion") < 0.01)["pitch_type"].unique().to_list()
    )

    # Filter out those pitch types
    pitcher_df = pitcher_df.filter(~pl.col("pitch_type").is_in(low_prop_pitches))

    # combine back to main df
    filtered.append(pitcher_df)
# combine back to one df
df = pl.concat(filtered)

591111


In [17]:
batting_df = pb.batting_stats_bref(params["clean"]["start_year"])
player_ids = list(df.select("batter").unique().to_pandas()["batter"])
batting_df = pl.DataFrame(batting_df[batting_df["mlbID"].isin(player_ids)])
batting_df = batting_df.drop(
    [
        "Name",
        "Age",
        "#days",
        "Lev",
        "Tm",
        "G",
        "PA",
        "AB",
        "SO",
        "HBP",
        "SH",
        "SF",
        "SB",
        "CS",
    ]
)
df = df.join(batting_df, left_on="batter", right_on="mlbID", how="left")

df = df.fill_null(-1)
df = df.fill_nan(-1)

In [18]:
df.head(20)

pitch_type,game_date,release_speed,player_name,batter,pitcher,type,plate_x,plate_z,outs_when_up,inning,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,at_bat_number,pitch_number,bat_score,fld_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,delta_run_exp,next_pitch,count,base_state,R,H,2B,3B,HR,RBI,BB,IBB,GDP,BA,OBP,SLG,OPS
i64,datetime[μs],f64,str,i64,i64,i64,f64,f64,i64,i64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,f64,f64,f64
5,2015-04-12 00:00:00,97.1,"""Sale, Chris""",542454,519242,2,-0.08,3.11,0,1,-11.928961,-142.008098,-1.908309,21.781,35.521,-18.095,3.44,1.47,96.4,2418.0,6.4,413732,1,1,0,0,0,0,0,0,-0.032,1,1,0,30,56,10,5,0,21,6,1,7,0.215,0.241,0.291,0.532
5,2015-04-12 00:00:00,89.8,"""Sale, Chris""",542454,519242,0,0.06,1.93,0,1,-10.04573,-131.389047,-1.895191,16.023,27.088,-26.441,3.33,1.46,89.6,2093.0,6.3,413732,1,2,0,0,0,0,0,1,-0.192,5,0,0,30,56,10,5,0,21,6,1,7,0.215,0.241,0.291,0.532
5,2015-04-12 00:00:00,97.3,"""Sale, Chris""",572821,519242,1,-1.0,1.41,1,1,-13.444729,-142.165101,-5.347231,20.977,36.72,-20.349,3.02,1.25,96.6,2122.0,6.4,413732,2,1,0,0,0,0,2,0,0.024,5,1,0,101,148,39,4,28,77,61,2,10,0.236,0.307,0.444,0.751
5,2015-04-12 00:00:00,97.3,"""Sale, Chris""",572821,519242,2,1.13,2.6,1,1,-9.851442,-142.369666,-4.262231,27.16,30.982,-13.014,2.93,1.25,97.2,2465.0,6.3,413732,2,2,0,0,0,0,2,0,-0.031,5,7,0,101,148,39,4,28,77,61,2,10,0.236,0.307,0.444,0.751
5,2015-04-12 00:00:00,97.8,"""Sale, Chris""",572821,519242,2,0.45,3.58,1,1,-10.435978,-143.120975,-1.179134,22.877,33.682,-16.346,2.93,1.25,97.4,2384.0,6.3,413732,2,3,0,0,0,0,2,0,-0.039,5,2,0,101,148,39,4,28,77,61,2,10,0.236,0.307,0.444,0.751
4,2015-04-12 00:00:00,87.8,"""Sale, Chris""",572821,519242,0,1.04,1.31,1,1,-10.261341,-128.483704,-3.795101,24.104,26.959,-22.726,2.93,1.25,87.4,2115.0,6.2,413732,2,4,0,0,0,0,2,0,-0.109,5,6,0,101,148,39,4,28,77,61,2,10,0.236,0.307,0.444,0.751
5,2015-04-12 00:00:00,94.8,"""Sale, Chris""",408045,519242,1,-1.19,1.54,2,1,-15.192858,-138.240298,-7.270097,24.441,34.459,-10.95,3.72,1.66,93.6,2286.0,6.2,413732,3,1,0,0,0,0,0,0,0.014,4,1,0,69,157,34,2,10,66,67,12,22,0.265,0.338,0.38,0.718
5,2015-04-12 00:00:00,92.7,"""Sale, Chris""",408045,519242,2,0.43,2.0,2,1,-10.221469,-135.724839,-1.617749,20.716,31.033,-28.852,3.77,1.66,92.5,2119.0,6.5,413732,3,2,0,0,0,0,0,0,-0.021,5,7,0,69,157,34,2,10,66,67,12,22,0.265,0.338,0.38,0.718
4,2015-04-12 00:00:00,86.2,"""Sale, Chris""",408045,519242,0,-0.24,1.83,2,1,-12.45622,-125.853312,-2.322514,22.327,25.367,-23.254,3.42,1.66,85.6,1956.0,6.2,413732,3,3,0,0,0,0,0,0,0.13,5,2,0,69,157,34,2,10,66,67,12,22,0.265,0.338,0.38,0.718
5,2015-04-12 00:00:00,92.2,"""Sale, Chris""",116338,519242,2,1.0,1.48,2,1,-10.324501,-135.120157,-3.392345,24.858,35.341,-25.042,3.47,1.51,90.8,2133.0,6.1,413732,4,1,0,0,0,0,0,0,-0.035,4,1,1,67,125,22,0,22,81,35,1,14,0.24,0.293,0.409,0.702


In [30]:
for pitcher_df in df.group_by("pitcher"):
    # get the pitcher code (statcast id)
    pitcher_code = pitcher_df[0]

    # get the pitcher dataframe
    pitcher_df = pitcher_df[1]

    # pitcher_df.select(
    #     pl.col("pitch_type").value_counts().sort(descending=False).head(1)
    # ).item()["count"] / len(pitcher_df)
    print(
        pitcher_df.select(
            pl.col("pitch_type").value_counts(normalize=True, sort=True).head(1)
        ).item()["proportion"]
    )

0.41926558176616796
0.4656529889606332
0.4985260520303633
0.3180760647642562
0.2951825579877279
0.4912361623616236
0.4064202621116183
0.5653712937235945
0.5506124566093444
0.499915537687084
0.31733870967741934
0.44836547667897686
0.44796624306952554
0.2912889910208945
0.43912404737856947
0.35263702171664946
0.31200444609114486
0.35349747948173144
0.42418370712401055
0.3464450051951907
0.3865761444019839
0.4005735354362966
0.41203323290422716
0.364190012180268
0.3450628366247756
