# feats using polar yay

In [53]:
import logging
import sys
import time
from pathlib import Path

# import pandas as pd
import polars as pl
import pybaseball as pb
import yaml

logger = logging.getLogger("featurize")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)
cfg = pl.Config()
cfg.set_tbl_rows(2000)

polars.config.Config

In [54]:
with open("../params.yaml", "r") as file:
    params = yaml.safe_load(file)
input_file_path = params["featurize"]["input_data_path"]

df = pl.read_parquet("../" / Path(input_file_path))

In [55]:
df = df.with_columns(
    df.select(
        pl.col(pl.String).exclude(["player_name"]).cast(pl.Categorical).to_physical()
    ),
)

In [56]:
df = df.sort(
    [
        "game_date",
        "game_pk",
        "at_bat_number",
        "pitch_number",
    ],
    descending=False,
)
# create target variable
df = df.with_columns(
    df.select(pl.col("pitch_type").shift(-1).alias("next_pitch")),
).drop_nulls("next_pitch")

In [57]:
df.select(pl.col(["pitch_type", "next_pitch"])).head(20)

pitch_type,next_pitch
u32,u32
1,3
3,1
1,1
1,1
1,1
1,1
1,1
1,3
3,1
1,5


In [58]:
df = df.with_columns(
    (pl.col("balls").cast(pl.String) + " - " + pl.col("strikes").cast(pl.String))
    .alias("count")
    .cast(pl.Categorical)
    .to_physical()
)
df = df.drop(["balls", "strikes"])

In [59]:
print(df.select("count").describe())
df.head()


shape: (9, 2)
┌────────────┬──────────┐
│ statistic  ┆ count    │
│ ---        ┆ ---      │
│ str        ┆ f64      │
╞════════════╪══════════╡
│ count      ┆ 30170.0  │
│ null_count ┆ 0.0      │
│ mean       ┆ 3.282996 │
│ std        ┆ 2.928879 │
│ min        ┆ 0.0      │
│ 25%        ┆ 0.0      │
│ 50%        ┆ 3.0      │
│ 75%        ┆ 5.0      │
│ max        ┆ 11.0     │
└────────────┴──────────┘


pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,stand,type,hit_location,bb_type,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,bat_speed,swing_length,next_pitch,count
u32,datetime[μs],f64,f64,f64,str,i64,i64,u32,u32,f64,u32,f64,f64,f64,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,u32,u32,f64,f64,f64,u32,u32
1,2022-03-19 00:00:00,92.2,-2.56,5.43,"""Nola, Aaron""",605480,605400,0,1,,,-1.49,1.31,-0.84,2.43,,,,0,1,0,,,7.60704,-134.022722,-4.632891,-19.575483,28.632765,-15.589008,3.3,1.5,,,,,,,706872,54.13,,,,,,,,1,1,0,0,,,,,,3,0
3,2022-03-19 00:00:00,78.4,-2.43,5.57,"""Nola, Aaron""",605480,605400,0,1,,,0.68,-0.69,0.5,2.11,,,,0,1,0,,,5.097511,-114.097729,0.697378,4.920583,22.400799,-38.53246,3.3,1.5,,,,,,,706872,54.13,,,,,,,,1,2,0,0,,,,,,1,1
1,2022-03-19 00:00:00,92.4,-2.49,5.49,"""Nola, Aaron""",605480,605400,0,2,,,-1.47,1.35,0.58,3.4,,,,0,1,0,,,10.986446,-134.099297,-2.413662,-20.087748,28.685899,-15.513516,3.3,1.5,,,,,,,706872,54.13,,,,,,,,1,3,0,0,,,,,,1,2
1,2022-03-19 00:00:00,93.2,-2.35,5.45,"""Nola, Aaron""",605480,605400,0,1,2.0,,-1.56,1.29,-0.98,2.75,,,,0,1,0,,,6.967558,-135.527336,-3.979329,-20.786844,30.865196,-15.61834,3.3,1.5,,,,,,,706872,54.13,,,0.0,,0.0,0.0,,1,4,0,0,,,,,,1,3
1,2022-03-19 00:00:00,92.7,-2.51,5.43,"""Nola, Aaron""",666182,605400,1,1,,,-1.25,1.52,-0.06,2.77,,,,1,1,0,,,9.000929,-134.537277,-4.232589,-17.263002,31.813808,-13.015395,3.37,1.53,,,,,,,706872,54.13,,,,,,,,2,1,0,0,,,,,,1,0


In [60]:
df = df.with_columns(df.select(["on_1b", "on_2b", "on_3b"]).fill_null(-1))
df = df.with_columns(
    pl.col("on_1b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_1b"),
    pl.col("on_2b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_2b"),
    pl.col("on_3b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_3b"),
)
df = df.with_columns(
    (
        pl.col("on_1b").cast(pl.String)
        + "/"
        + pl.col("on_2b").cast(pl.String)
        + "/"
        + pl.col("on_3b").cast(pl.String)
    )
    .alias("base_state")
    .cast(pl.Categorical)
    .to_physical()
)
df = df.drop(["on_1b", "on_2b", "on_3b"])

In [61]:
print(df.select("base_state").describe())

df.head()

shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ base_state │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 30170.0    │
│ null_count ┆ 0.0        │
│ mean       ┆ 1.015976   │
│ std        ┆ 1.685017   │
│ min        ┆ 0.0        │
│ 25%        ┆ 0.0        │
│ 50%        ┆ 0.0        │
│ 75%        ┆ 2.0        │
│ max        ┆ 7.0        │
└────────────┴────────────┘


pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,stand,type,hit_location,bb_type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,hc_x,hc_y,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,bat_speed,swing_length,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,u32,u32,f64,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,u32,u32,f64,f64,f64,u32,u32,u32
1,2022-03-19 00:00:00,92.2,-2.56,5.43,"""Nola, Aaron""",605480,605400,0,1,,,-1.49,1.31,-0.84,2.43,0,1,0,,,7.60704,-134.022722,-4.632891,-19.575483,28.632765,-15.589008,3.3,1.5,,,,,,,706872,54.13,,,,,,,,1,1,0,0,,,,,,3,0,0
3,2022-03-19 00:00:00,78.4,-2.43,5.57,"""Nola, Aaron""",605480,605400,0,1,,,0.68,-0.69,0.5,2.11,0,1,0,,,5.097511,-114.097729,0.697378,4.920583,22.400799,-38.53246,3.3,1.5,,,,,,,706872,54.13,,,,,,,,1,2,0,0,,,,,,1,1,0
1,2022-03-19 00:00:00,92.4,-2.49,5.49,"""Nola, Aaron""",605480,605400,0,2,,,-1.47,1.35,0.58,3.4,0,1,0,,,10.986446,-134.099297,-2.413662,-20.087748,28.685899,-15.513516,3.3,1.5,,,,,,,706872,54.13,,,,,,,,1,3,0,0,,,,,,1,2,0
1,2022-03-19 00:00:00,93.2,-2.35,5.45,"""Nola, Aaron""",605480,605400,0,1,2.0,,-1.56,1.29,-0.98,2.75,0,1,0,,,6.967558,-135.527336,-3.979329,-20.786844,30.865196,-15.61834,3.3,1.5,,,,,,,706872,54.13,,,0.0,,0.0,0.0,,1,4,0,0,,,,,,1,3,0
1,2022-03-19 00:00:00,92.7,-2.51,5.43,"""Nola, Aaron""",666182,605400,1,1,,,-1.25,1.52,-0.06,2.77,1,1,0,,,9.000929,-134.537277,-4.232589,-17.263002,31.813808,-13.015395,3.37,1.53,,,,,,,706872,54.13,,,,,,,,2,1,0,0,,,,,,1,0,0


In [62]:
print(df.shape)

# # find columns with null percentage > 10%
# # Calculate null percentages for all columns
# null_percentages = df.select(pl.all().is_null().sum() / df.height)

# # Filter columns where null percentage > 10%
# columns_with_too_many_nulls = (
#     null_percentages.melt().filter(pl.col("value") > 0.1).select("variable")
# )

# print("Columns with >10% null values:")
# print(columns_with_too_many_nulls)

# drop columns with too many nulls
df = df.drop(
    df.select(pl.all().is_null().sum() / df.height)
    .unpivot()
    .filter(pl.col("value") > 0.05)
    .select("variable")
    .to_series()
    .to_list()
)
print(df.shape)
df.head()

(30170, 56)
(30170, 40)


pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,stand,type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,u32,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,i64,i64,i64,u32,u32,f64,u32,u32,u32
1,2022-03-19 00:00:00,92.2,-2.56,5.43,"""Nola, Aaron""",605480,605400,0,1,-1.49,1.31,-0.84,2.43,0,1,0,7.60704,-134.022722,-4.632891,-19.575483,28.632765,-15.589008,3.3,1.5,,,,706872,54.13,1,1,0,0,,,,3,0,0
3,2022-03-19 00:00:00,78.4,-2.43,5.57,"""Nola, Aaron""",605480,605400,0,1,0.68,-0.69,0.5,2.11,0,1,0,5.097511,-114.097729,0.697378,4.920583,22.400799,-38.53246,3.3,1.5,,,,706872,54.13,1,2,0,0,,,,1,1,0
1,2022-03-19 00:00:00,92.4,-2.49,5.49,"""Nola, Aaron""",605480,605400,0,2,-1.47,1.35,0.58,3.4,0,1,0,10.986446,-134.099297,-2.413662,-20.087748,28.685899,-15.513516,3.3,1.5,,,,706872,54.13,1,3,0,0,,,,1,2,0
1,2022-03-19 00:00:00,93.2,-2.35,5.45,"""Nola, Aaron""",605480,605400,0,1,-1.56,1.29,-0.98,2.75,0,1,0,6.967558,-135.527336,-3.979329,-20.786844,30.865196,-15.61834,3.3,1.5,,,,706872,54.13,1,4,0,0,,,,1,3,0
1,2022-03-19 00:00:00,92.7,-2.51,5.43,"""Nola, Aaron""",666182,605400,1,1,-1.25,1.52,-0.06,2.77,1,1,0,9.000929,-134.537277,-4.232589,-17.263002,31.813808,-13.015395,3.37,1.53,,,,706872,54.13,2,1,0,0,,,,1,0,0


In [63]:
# release point consistency feature


df.head()

pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,stand,type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,u32,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,i64,i64,i64,u32,u32,f64,u32,u32,u32
1,2022-03-19 00:00:00,92.2,-2.56,5.43,"""Nola, Aaron""",605480,605400,0,1,-1.49,1.31,-0.84,2.43,0,1,0,7.60704,-134.022722,-4.632891,-19.575483,28.632765,-15.589008,3.3,1.5,,,,706872,54.13,1,1,0,0,,,,3,0,0
3,2022-03-19 00:00:00,78.4,-2.43,5.57,"""Nola, Aaron""",605480,605400,0,1,0.68,-0.69,0.5,2.11,0,1,0,5.097511,-114.097729,0.697378,4.920583,22.400799,-38.53246,3.3,1.5,,,,706872,54.13,1,2,0,0,,,,1,1,0
1,2022-03-19 00:00:00,92.4,-2.49,5.49,"""Nola, Aaron""",605480,605400,0,2,-1.47,1.35,0.58,3.4,0,1,0,10.986446,-134.099297,-2.413662,-20.087748,28.685899,-15.513516,3.3,1.5,,,,706872,54.13,1,3,0,0,,,,1,2,0
1,2022-03-19 00:00:00,93.2,-2.35,5.45,"""Nola, Aaron""",605480,605400,0,1,-1.56,1.29,-0.98,2.75,0,1,0,6.967558,-135.527336,-3.979329,-20.786844,30.865196,-15.61834,3.3,1.5,,,,706872,54.13,1,4,0,0,,,,1,3,0
1,2022-03-19 00:00:00,92.7,-2.51,5.43,"""Nola, Aaron""",666182,605400,1,1,-1.25,1.52,-0.06,2.77,1,1,0,9.000929,-134.537277,-4.232589,-17.263002,31.813808,-13.015395,3.37,1.53,,,,706872,54.13,2,1,0,0,,,,1,0,0


In [64]:
# # Late inning pressure (boolean)
# df = df.with_columns(
#     ((pl.col("inning") >= 7) & (pl.col("bat_score") - pl.col("fld_score")).abs() <= 3)
#     .cast(pl.Int32)
#     .alias("high_pressure")
# )

# df.head()

In [65]:
def sort_pitch_data(df):
    return df.sort(
        [
            "game_date",  # Primary: chronological order
            "game_pk",  # Secondary: unique game identifier
            "inning",  # Tertiary: game sequence
            "at_bat_number",  # Game at-bat order
            "pitch_number",  # At-bat pitch sequence
        ],
        descending=False,  # Ascending order for all
    )


df = sort_pitch_data(df)
df.head()

pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,stand,type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,u32,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,i64,i64,i64,u32,u32,f64,u32,u32,u32
1,2022-03-19 00:00:00,92.2,-2.56,5.43,"""Nola, Aaron""",605480,605400,0,1,-1.49,1.31,-0.84,2.43,0,1,0,7.60704,-134.022722,-4.632891,-19.575483,28.632765,-15.589008,3.3,1.5,,,,706872,54.13,1,1,0,0,,,,3,0,0
3,2022-03-19 00:00:00,78.4,-2.43,5.57,"""Nola, Aaron""",605480,605400,0,1,0.68,-0.69,0.5,2.11,0,1,0,5.097511,-114.097729,0.697378,4.920583,22.400799,-38.53246,3.3,1.5,,,,706872,54.13,1,2,0,0,,,,1,1,0
1,2022-03-19 00:00:00,92.4,-2.49,5.49,"""Nola, Aaron""",605480,605400,0,2,-1.47,1.35,0.58,3.4,0,1,0,10.986446,-134.099297,-2.413662,-20.087748,28.685899,-15.513516,3.3,1.5,,,,706872,54.13,1,3,0,0,,,,1,2,0
1,2022-03-19 00:00:00,93.2,-2.35,5.45,"""Nola, Aaron""",605480,605400,0,1,-1.56,1.29,-0.98,2.75,0,1,0,6.967558,-135.527336,-3.979329,-20.786844,30.865196,-15.61834,3.3,1.5,,,,706872,54.13,1,4,0,0,,,,1,3,0
1,2022-03-19 00:00:00,92.7,-2.51,5.43,"""Nola, Aaron""",666182,605400,1,1,-1.25,1.52,-0.06,2.77,1,1,0,9.000929,-134.537277,-4.232589,-17.263002,31.813808,-13.015395,3.37,1.53,,,,706872,54.13,2,1,0,0,,,,1,0,0


In [66]:
df.head()

pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,stand,type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,u32,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,i64,i64,i64,u32,u32,f64,u32,u32,u32
1,2022-03-19 00:00:00,92.2,-2.56,5.43,"""Nola, Aaron""",605480,605400,0,1,-1.49,1.31,-0.84,2.43,0,1,0,7.60704,-134.022722,-4.632891,-19.575483,28.632765,-15.589008,3.3,1.5,,,,706872,54.13,1,1,0,0,,,,3,0,0
3,2022-03-19 00:00:00,78.4,-2.43,5.57,"""Nola, Aaron""",605480,605400,0,1,0.68,-0.69,0.5,2.11,0,1,0,5.097511,-114.097729,0.697378,4.920583,22.400799,-38.53246,3.3,1.5,,,,706872,54.13,1,2,0,0,,,,1,1,0
1,2022-03-19 00:00:00,92.4,-2.49,5.49,"""Nola, Aaron""",605480,605400,0,2,-1.47,1.35,0.58,3.4,0,1,0,10.986446,-134.099297,-2.413662,-20.087748,28.685899,-15.513516,3.3,1.5,,,,706872,54.13,1,3,0,0,,,,1,2,0
1,2022-03-19 00:00:00,93.2,-2.35,5.45,"""Nola, Aaron""",605480,605400,0,1,-1.56,1.29,-0.98,2.75,0,1,0,6.967558,-135.527336,-3.979329,-20.786844,30.865196,-15.61834,3.3,1.5,,,,706872,54.13,1,4,0,0,,,,1,3,0
1,2022-03-19 00:00:00,92.7,-2.51,5.43,"""Nola, Aaron""",666182,605400,1,1,-1.25,1.52,-0.06,2.77,1,1,0,9.000929,-134.537277,-4.232589,-17.263002,31.813808,-13.015395,3.37,1.53,,,,706872,54.13,2,1,0,0,,,,1,0,0


In [69]:
print(df.height)
filtered = []
for pitcher_df in df.group_by("pitcher"):
    pitcher_code = pitcher_df[0]
    pitcher_df = pitcher_df[1]
    val_counts = (
        pitcher_df.select(
            pl.col("pitch_type").value_counts(normalize=True).alias("pitch_type_counts")
        )
        .unnest("pitch_type_counts")
        .sort("proportion", descending=True)
    )

    # Get pitch types with proportion < 0.01
    low_prop_pitches = (
        val_counts.filter(pl.col("proportion") < 0.01)["pitch_type"].unique().to_list()
    )

    # Filter out those pitch types
    pitcher_df = pitcher_df.filter(~pl.col("pitch_type").is_in(low_prop_pitches))

    # combine back to main df
    filtered.append(pitcher_df)
# combine back to one df
df = pl.concat(filtered)

30170


In [70]:
df.height

30062