# feats using polar yay

In [19]:
import logging
import sys
import time
from pathlib import Path

# import pandas as pd
import polars as pl
import pybaseball as pb
import yaml

logger = logging.getLogger("featurize")
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(
    logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
)
logger.addHandler(handler)
cfg = pl.Config()
cfg.set_tbl_rows(2000)

polars.config.Config

In [20]:
with open("../params.yaml", "r") as file:
    params = yaml.safe_load(file)
input_file_path = params["featurize"]["input_data_path"]

df = pl.read_parquet("../" / Path(input_file_path))

In [21]:
df = df.with_columns(
    df.select(
        pl.col(pl.String).exclude(["player_name"]).cast(pl.Categorical).to_physical()
    ),
)

In [22]:
df = df.sort(
    [
        "game_date",
        "game_pk",
        "at_bat_number",
        "pitch_number",
    ],
    descending=False,
)
# create target variable
df = df.with_columns(
    df.select(pl.col("pitch_type").shift(-1).alias("next_pitch")),
).drop_nulls("next_pitch")

In [23]:
df.select(pl.col(["pitch_type", "next_pitch"])).head(20)

pitch_type,next_pitch
u32,u32
0,0
0,0
0,4
4,0
0,4
4,0
0,0
0,0
0,0
0,1


In [24]:
df = df.with_columns(
    (pl.col("balls").cast(pl.String) + " - " + pl.col("strikes").cast(pl.String))
    .alias("count")
    .cast(pl.Categorical)
    .to_physical()
)
df = df.drop(["balls", "strikes"])

In [25]:
print(df.select("count").describe())
df.head()


shape: (9, 2)
┌────────────┬──────────┐
│ statistic  ┆ count    │
│ ---        ┆ ---      │
│ str        ┆ f64      │
╞════════════╪══════════╡
│ count      ┆ 137514.0 │
│ null_count ┆ 0.0      │
│ mean       ┆ 3.387059 │
│ std        ┆ 3.069877 │
│ min        ┆ 0.0      │
│ 25%        ┆ 0.0      │
│ 50%        ┆ 3.0      │
│ 75%        ┆ 6.0      │
│ max        ┆ 11.0     │
└────────────┴──────────┘


pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,stand,type,hit_location,bb_type,pfx_x,pfx_z,plate_x,plate_z,on_3b,on_2b,on_1b,outs_when_up,inning,inning_topbot,hc_x,hc_y,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,bat_speed,swing_length,next_pitch,count
u32,datetime[μs],f64,f64,f64,str,i64,i64,f64,u32,u32,f64,u32,f64,f64,f64,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,u32,u32,f64,f64,f64,u32,u32
0,2015-04-06 00:00:00,91.2,-3.46,5.52,"""Scherzer, Max""",434158,453286,13.0,1,2,,,-1.6,1.12,-1.6,1.87,,,,0,1,1,,,8.691649,-133.497709,-4.950985,-20.321,30.518,-18.145,3.18,1.41,,,,90.9,2338.0,6.5,413657,54.5,,,,,,,,1,1,0,0,1,0,,,,0,0
0,2015-04-06 00:00:00,94.2,-3.39,5.46,"""Scherzer, Max""",434158,453286,11.0,1,2,,,-1.11,1.38,-0.91,2.49,,,,0,1,1,,,9.33088,-137.854849,-4.455706,-15.832,33.65,-14.258,3.25,1.54,,,,92.5,2613.0,5.8,413657,54.5,,,,,,,,1,2,0,0,1,0,,,,0,1
0,2015-04-06 00:00:00,95.6,-3.46,5.27,"""Scherzer, Max""",434158,453286,11.0,1,1,,,-1.11,1.41,-1.58,2.93,,,,0,1,1,,,7.86545,-139.948899,-3.141207,-15.845,33.239,-13.656,3.14,1.38,,,,94.6,2639.0,6.1,413657,54.5,,,,,,,,1,3,0,0,1,0,,,,4,2
4,2015-04-06 00:00:00,87.2,-3.35,5.57,"""Scherzer, Max""",434158,453286,14.0,1,1,,,-1.01,0.43,0.36,0.24,,,,0,1,1,,,11.317288,-127.242561,-6.411426,-12.939,26.441,-26.385,3.17,1.34,,,,86.1,1765.0,6.1,413657,54.5,,,,,,,,1,4,0,0,1,0,,,,0,3
0,2015-04-06 00:00:00,94.7,-3.51,5.25,"""Scherzer, Max""",434158,453286,11.0,1,1,,,-1.05,1.05,-1.93,4.25,,,,0,1,1,,,6.896021,-138.768748,1.434366,-14.653,33.106,-19.517,3.1,1.42,,,,93.3,2486.0,5.8,413657,54.5,,,,,,,,1,5,0,0,1,0,,,,4,4


In [26]:
df = df.with_columns(df.select(["on_1b", "on_2b", "on_3b"]).fill_null(-1))
df = df.with_columns(
    pl.col("on_1b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_1b"),
    pl.col("on_2b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_2b"),
    pl.col("on_3b")
    .map_elements(lambda s: 0 if s == -1.0 else 1, return_dtype=pl.Int32)
    .alias("on_3b"),
)
df = df.with_columns(
    (
        pl.col("on_1b").cast(pl.String)
        + "/"
        + pl.col("on_2b").cast(pl.String)
        + "/"
        + pl.col("on_3b").cast(pl.String)
    )
    .alias("base_state")
    .cast(pl.Categorical)
    .to_physical()
)
df = df.drop(["on_1b", "on_2b", "on_3b"])

In [27]:
print(df.select("base_state").describe())

df.head()

shape: (9, 2)
┌────────────┬────────────┐
│ statistic  ┆ base_state │
│ ---        ┆ ---        │
│ str        ┆ f64        │
╞════════════╪════════════╡
│ count      ┆ 137514.0   │
│ null_count ┆ 0.0        │
│ mean       ┆ 1.162616   │
│ std        ┆ 2.023535   │
│ min        ┆ 0.0        │
│ 25%        ┆ 0.0        │
│ 50%        ┆ 0.0        │
│ 75%        ┆ 1.0        │
│ max        ┆ 7.0        │
└────────────┴────────────┘


pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,stand,type,hit_location,bb_type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,hc_x,hc_y,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,hit_distance_sc,launch_speed,launch_angle,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,estimated_ba_using_speedangle,estimated_woba_using_speedangle,woba_value,woba_denom,babip_value,iso_value,launch_speed_angle,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,bat_speed,swing_length,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,f64,u32,u32,f64,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,f64,f64,f64,f64,f64,f64,f64,i64,i64,i64,i64,u32,u32,f64,f64,f64,u32,u32,u32
0,2015-04-06 00:00:00,91.2,-3.46,5.52,"""Scherzer, Max""",434158,453286,13.0,1,2,,,-1.6,1.12,-1.6,1.87,0,1,1,,,8.691649,-133.497709,-4.950985,-20.321,30.518,-18.145,3.18,1.41,,,,90.9,2338.0,6.5,413657,54.5,,,,,,,,1,1,0,0,1,0,,,,0,0,0
0,2015-04-06 00:00:00,94.2,-3.39,5.46,"""Scherzer, Max""",434158,453286,11.0,1,2,,,-1.11,1.38,-0.91,2.49,0,1,1,,,9.33088,-137.854849,-4.455706,-15.832,33.65,-14.258,3.25,1.54,,,,92.5,2613.0,5.8,413657,54.5,,,,,,,,1,2,0,0,1,0,,,,0,1,0
0,2015-04-06 00:00:00,95.6,-3.46,5.27,"""Scherzer, Max""",434158,453286,11.0,1,1,,,-1.11,1.41,-1.58,2.93,0,1,1,,,7.86545,-139.948899,-3.141207,-15.845,33.239,-13.656,3.14,1.38,,,,94.6,2639.0,6.1,413657,54.5,,,,,,,,1,3,0,0,1,0,,,,4,2,0
4,2015-04-06 00:00:00,87.2,-3.35,5.57,"""Scherzer, Max""",434158,453286,14.0,1,1,,,-1.01,0.43,0.36,0.24,0,1,1,,,11.317288,-127.242561,-6.411426,-12.939,26.441,-26.385,3.17,1.34,,,,86.1,1765.0,6.1,413657,54.5,,,,,,,,1,4,0,0,1,0,,,,0,3,0
0,2015-04-06 00:00:00,94.7,-3.51,5.25,"""Scherzer, Max""",434158,453286,11.0,1,1,,,-1.05,1.05,-1.93,4.25,0,1,1,,,6.896021,-138.768748,1.434366,-14.653,33.106,-19.517,3.1,1.42,,,,93.3,2486.0,5.8,413657,54.5,,,,,,,,1,5,0,0,1,0,,,,4,4,0


In [28]:
print(df.shape)

# # find columns with null percentage > 10%
# # Calculate null percentages for all columns
# null_percentages = df.select(pl.all().is_null().sum() / df.height)

# # Filter columns where null percentage > 10%
# columns_with_too_many_nulls = (
#     null_percentages.melt().filter(pl.col("value") > 0.1).select("variable")
# )

# print("Columns with >10% null values:")
# print(columns_with_too_many_nulls)

# drop columns with too many nulls
df = df.drop(
    df.select(pl.all().is_null().sum() / df.height)
    .unpivot()
    .filter(pl.col("value") > 0.05)
    .select("variable")
    .to_series()
    .to_list()
)
print(df.shape)
df.head()

(137514, 57)
(137514, 40)


pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,stand,type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,f64,u32,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,i64,i64,i64,u32,u32,u32,u32,u32
0,2015-04-06 00:00:00,91.2,-3.46,5.52,"""Scherzer, Max""",434158,453286,13.0,1,2,-1.6,1.12,-1.6,1.87,0,1,1,8.691649,-133.497709,-4.950985,-20.321,30.518,-18.145,3.18,1.41,90.9,2338.0,6.5,413657,54.5,1,1,0,0,1,0,0,0,0
0,2015-04-06 00:00:00,94.2,-3.39,5.46,"""Scherzer, Max""",434158,453286,11.0,1,2,-1.11,1.38,-0.91,2.49,0,1,1,9.33088,-137.854849,-4.455706,-15.832,33.65,-14.258,3.25,1.54,92.5,2613.0,5.8,413657,54.5,1,2,0,0,1,0,0,1,0
0,2015-04-06 00:00:00,95.6,-3.46,5.27,"""Scherzer, Max""",434158,453286,11.0,1,1,-1.11,1.41,-1.58,2.93,0,1,1,7.86545,-139.948899,-3.141207,-15.845,33.239,-13.656,3.14,1.38,94.6,2639.0,6.1,413657,54.5,1,3,0,0,1,0,4,2,0
4,2015-04-06 00:00:00,87.2,-3.35,5.57,"""Scherzer, Max""",434158,453286,14.0,1,1,-1.01,0.43,0.36,0.24,0,1,1,11.317288,-127.242561,-6.411426,-12.939,26.441,-26.385,3.17,1.34,86.1,1765.0,6.1,413657,54.5,1,4,0,0,1,0,0,3,0
0,2015-04-06 00:00:00,94.7,-3.51,5.25,"""Scherzer, Max""",434158,453286,11.0,1,1,-1.05,1.05,-1.93,4.25,0,1,1,6.896021,-138.768748,1.434366,-14.653,33.106,-19.517,3.1,1.42,93.3,2486.0,5.8,413657,54.5,1,5,0,0,1,0,4,4,0


In [29]:
# release point consistency feature


df.head()

pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,stand,type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,f64,u32,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,i64,i64,i64,u32,u32,u32,u32,u32
0,2015-04-06 00:00:00,91.2,-3.46,5.52,"""Scherzer, Max""",434158,453286,13.0,1,2,-1.6,1.12,-1.6,1.87,0,1,1,8.691649,-133.497709,-4.950985,-20.321,30.518,-18.145,3.18,1.41,90.9,2338.0,6.5,413657,54.5,1,1,0,0,1,0,0,0,0
0,2015-04-06 00:00:00,94.2,-3.39,5.46,"""Scherzer, Max""",434158,453286,11.0,1,2,-1.11,1.38,-0.91,2.49,0,1,1,9.33088,-137.854849,-4.455706,-15.832,33.65,-14.258,3.25,1.54,92.5,2613.0,5.8,413657,54.5,1,2,0,0,1,0,0,1,0
0,2015-04-06 00:00:00,95.6,-3.46,5.27,"""Scherzer, Max""",434158,453286,11.0,1,1,-1.11,1.41,-1.58,2.93,0,1,1,7.86545,-139.948899,-3.141207,-15.845,33.239,-13.656,3.14,1.38,94.6,2639.0,6.1,413657,54.5,1,3,0,0,1,0,4,2,0
4,2015-04-06 00:00:00,87.2,-3.35,5.57,"""Scherzer, Max""",434158,453286,14.0,1,1,-1.01,0.43,0.36,0.24,0,1,1,11.317288,-127.242561,-6.411426,-12.939,26.441,-26.385,3.17,1.34,86.1,1765.0,6.1,413657,54.5,1,4,0,0,1,0,0,3,0
0,2015-04-06 00:00:00,94.7,-3.51,5.25,"""Scherzer, Max""",434158,453286,11.0,1,1,-1.05,1.05,-1.93,4.25,0,1,1,6.896021,-138.768748,1.434366,-14.653,33.106,-19.517,3.1,1.42,93.3,2486.0,5.8,413657,54.5,1,5,0,0,1,0,4,4,0


In [30]:
# # Late inning pressure (boolean)
# df = df.with_columns(
#     ((pl.col("inning") >= 7) & (pl.col("bat_score") - pl.col("fld_score")).abs() <= 3)
#     .cast(pl.Int32)
#     .alias("high_pressure")
# )

# df.head()

In [32]:
def sort_pitch_data(df):
    return df.sort(
        [
            "game_date",  # Primary: chronological order
            "game_pk",  # Secondary: unique game identifier
            "inning",  # Tertiary: game sequence
            "at_bat_number",  # Game at-bat order
            "pitch_number",  # At-bat pitch sequence
        ],
        descending=False,  # Ascending order for all
    )


df = sort_pitch_data(df)
df.head()

pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,zone,stand,type,pfx_x,pfx_z,plate_x,plate_z,outs_when_up,inning,inning_topbot,vx0,vy0,vz0,ax,ay,az,sz_top,sz_bot,effective_speed,release_spin_rate,release_extension,game_pk,release_pos_y,at_bat_number,pitch_number,bat_score,fld_score,if_fielding_alignment,of_fielding_alignment,next_pitch,count,base_state
u32,datetime[μs],f64,f64,f64,str,i64,i64,f64,u32,u32,f64,f64,f64,f64,i64,i64,u32,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,i64,f64,i64,i64,i64,i64,u32,u32,u32,u32,u32
0,2015-04-06 00:00:00,91.2,-3.46,5.52,"""Scherzer, Max""",434158,453286,13.0,1,2,-1.6,1.12,-1.6,1.87,0,1,1,8.691649,-133.497709,-4.950985,-20.321,30.518,-18.145,3.18,1.41,90.9,2338.0,6.5,413657,54.5,1,1,0,0,1,0,0,0,0
0,2015-04-06 00:00:00,94.2,-3.39,5.46,"""Scherzer, Max""",434158,453286,11.0,1,2,-1.11,1.38,-0.91,2.49,0,1,1,9.33088,-137.854849,-4.455706,-15.832,33.65,-14.258,3.25,1.54,92.5,2613.0,5.8,413657,54.5,1,2,0,0,1,0,0,1,0
0,2015-04-06 00:00:00,95.6,-3.46,5.27,"""Scherzer, Max""",434158,453286,11.0,1,1,-1.11,1.41,-1.58,2.93,0,1,1,7.86545,-139.948899,-3.141207,-15.845,33.239,-13.656,3.14,1.38,94.6,2639.0,6.1,413657,54.5,1,3,0,0,1,0,4,2,0
4,2015-04-06 00:00:00,87.2,-3.35,5.57,"""Scherzer, Max""",434158,453286,14.0,1,1,-1.01,0.43,0.36,0.24,0,1,1,11.317288,-127.242561,-6.411426,-12.939,26.441,-26.385,3.17,1.34,86.1,1765.0,6.1,413657,54.5,1,4,0,0,1,0,0,3,0
0,2015-04-06 00:00:00,94.7,-3.51,5.25,"""Scherzer, Max""",434158,453286,11.0,1,1,-1.05,1.05,-1.93,4.25,0,1,1,6.896021,-138.768748,1.434366,-14.653,33.106,-19.517,3.1,1.42,93.3,2486.0,5.8,413657,54.5,1,5,0,0,1,0,4,4,0


In [18]:
print(df.select("next_pitch", "pitch_type").head(20))

shape: (20, 2)
┌────────────┬────────────┐
│ next_pitch ┆ pitch_type │
│ ---        ┆ ---        │
│ u32        ┆ u32        │
╞════════════╪════════════╡
│ 0          ┆ 0          │
│ 0          ┆ 0          │
│ 4          ┆ 0          │
│ 0          ┆ 4          │
│ 4          ┆ 0          │
│ 0          ┆ 4          │
│ 0          ┆ 0          │
│ 0          ┆ 0          │
│ 0          ┆ 0          │
│ 1          ┆ 0          │
│ 1          ┆ 1          │
│ 0          ┆ 1          │
│ 0          ┆ 0          │
│ 4          ┆ 0          │
│ 0          ┆ 4          │
│ 0          ┆ 0          │
│ 0          ┆ 0          │
│ 0          ┆ 0          │
│ 4          ┆ 0          │
│ 0          ┆ 4          │
└────────────┴────────────┘
