In [None]:
from pybaseball import statcast
import pandas as pd
import pickle
import numpy as np
import random

In [None]:
### CONSTANTS

# Coarse four-way grouping of raw pitch codes: FB / CH / SL / CU.
# Each raw code appears exactly once. A repeated key in a dict literal is
# silently overwritten by its last occurrence, which previously hid a stray
# 'CU': 'FB' entry behind the real 'CU': 'CU' one.
pitch_mapping_simple = {
    # Fastballs
    'FF': 'FB',  # Four seam fastball
    'FT': 'FB',  # Two seam fastball
    'FC': 'FB',  # Cutter
    'FS': 'FB',
    'SF': 'FB',  # listed in pitch_mapping_binary / pitch_mapping_std as well
    'FA': 'FB',
    'SI': 'FB',
    # Changeups
    'SC': 'CH',  # Screwball
    'CH': 'CH',
    # Sliders
    'SL': 'SL',
    'SV': 'SL',  # Slurve
    'ST': 'SL',  # Sweeper
    # Curveballs
    'FO': 'CU',
    'CU': 'CU',
    'CS': 'CU',  # Slow curve
    'KC': 'CU',
    # Special Pitch - included in CU
    'EP': 'CU',
    'KN': 'CU',
}

# Two-way grouping of raw pitch codes: every code maps to either 'FAST' or 'OFF'.
pitch_mapping_binary = {
    **dict.fromkeys(
        ['FF', 'SI', 'FC', 'FS', 'FA', 'FT', 'SF'],
        'FAST',
    ),
    **dict.fromkeys(
        ['SL', 'CH', 'CU', 'KC', 'CS', 'KN', 'EP', 'SV', 'SC', 'ST', 'FO'],
        'OFF',
    ),
}

# Identity mapping: every raw pitch code is kept as its own class.
pitch_mapping_std = {
    code: code
    for code in [
        'FF', 'SI', 'FC', 'FS', 'FA', 'FT', 'SF',
        'SL', 'CH', 'CU', 'KC', 'CS', 'KN', 'EP', 'SV', 'SC', 'ST', 'FO',
    ]
}

In [None]:
# Query Statcast through pybaseball for the 2016-08-01 .. 2017-08-01 window
# (team="DET") and store the raw result as parquet before any processing.
data = statcast(
    '2016-08-01',
    '2017-08-01',
    team="DET",
    parallel=True,
)
data.to_parquet('data/data_160801_170801_DET.parquet')

In [None]:
# Columns retained from the raw frame, in the order they are kept.
# Group labels repeat the per-column annotations:
#   "Sep. Emb. (Cat.)", "Cat.", "Cont.", "Removed before input".
keptCols = [
    'pitch_type',
    # Removed before input
    'game_pk',
    # Sep. Emb. (Cat.)
    'batter', 'pitcher', 'fielder_2',
    # Cat.
    'stand', 'p_throws', 'game_year', 'balls', 'strikes',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning_topbot',
    'events', 'description', 'home_team', 'away_team',
    'hit_location', 'bb_type',
    # Cont.
    'inning', 'at_bat_number', 'pitch_number', 'fld_score', 'bat_score',
    'pfx_x', 'pfx_z', 'hc_x', 'hc_y',
    'vx0', 'vy0', 'vz0', 'ax', 'ay', 'az',
    'hit_distance_sc', 'launch_speed', 'launch_angle',
    'release_speed', 'release_spin_rate', 'release_extension',
    'release_pos_x', 'release_pos_y', 'release_pos_z', 'spin_axis',
    # Removed before input
    'pitch_name',
]
# Persist the kept-column list as a pickle.
with open('full_multi/data/keepCols.pkl', 'wb') as kept_cols_file:
    pickle.dump(keptCols, kept_cols_file)

In [None]:
# Restrict to the kept columns, order rows by game / pitcher / at-bat / pitch,
# renumber the index, then drop rows that have no pitch_type.
rmdata = (
    data[keptCols]
    .copy()
    .sort_values(
        by=["game_pk", "pitcher", "at_bat_number", "pitch_number"],
        ascending=True,
    )
    .reset_index(drop=True)
    .dropna(subset=['pitch_type'])
)

In [None]:
def one_zero_embed(df, colname):
    """Overwrite ``df[colname]`` in place with a 0/1 presence flag.

    A value becomes 1 when it is non-null, not equal to 0 and not the empty
    string; every other value becomes 0. Nothing is returned.
    """
    def _presence_flag(value):
        if pd.notna(value) and value != 0 and value != '':
            return 1
        return 0

    df[colname] = df[colname].apply(_presence_flag)

In [None]:
def _is_missing_scalar(value):
    """Return True when ``value`` is a scalar that pandas considers missing."""
    flag = pd.isna(value)
    # pd.isna returns an array for list-like input; such values are not missing scalars.
    return bool(flag) if isinstance(flag, (bool, np.bool_)) else False


def build_map_dict(df, colname, savefile=None, basefile=None, verbose=False, apply=True):
    """Build (or extend) a value -> integer-id dictionary for one column.

    Ids are assigned in order of first appearance in ``df[colname].unique()``,
    continuing from the size of the starting dictionary. A fresh dictionary
    starts as ``{np.nan: 0}``, so id 0 is reserved for missing values.

    All missing values share a single key. A NaN yielded by ``unique()`` is not
    the same object as the ``np.nan`` key and never compares equal to it, so a
    plain ``item not in d`` test would add a second NaN key with its own id.

    Args:
        df: DataFrame holding the column; modified in place when ``apply`` is true.
        colname: name of the column to encode.
        savefile: optional path; the resulting dictionary is pickled there.
        basefile: optional path to a pickled dictionary to extend instead of
            starting from ``{np.nan: 0}``.
        verbose: print progress messages.
        apply: when true, replace ``df[colname]`` with ``df[colname].map(d)``.

    Returns:
        The mapping dictionary.
    """
    if basefile:
        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(basefile, 'rb') as f:
            d = pickle.load(f)
        if verbose:
            print(f"Loaded dict from {basefile}")
    else:
        d = {np.nan: 0}

    has_missing_key = any(_is_missing_scalar(key) for key in d)
    newCount = 0
    dsize = len(d)
    for item in df[colname].unique():
        if _is_missing_scalar(item):
            if has_missing_key:
                continue
            # A loaded dictionary had no missing-value key yet: add the canonical one.
            item = np.nan
            has_missing_key = True
        elif item in d:
            continue
        d[item] = dsize
        dsize += 1
        newCount += 1

    if verbose:
        print(f"Added {newCount} new items to dict from {colname}, new size: {dsize}")

    if savefile:
        with open(savefile, 'wb') as f:
            pickle.dump(d, f)
        if verbose:
            print(f"Saved dict to {savefile}")

    if apply:
        df[colname] = df[colname].map(d)
        if verbose:
            print(f"Applied {colname} dict to data")

    return d

In [None]:
# Columns that get collapsed to a 0/1 flag by one_zero_embed.
one_zero_emb_cols = ['on_3b', 'on_2b', 'on_1b']
# Persist the list of 0/1-flag columns.
with open('full_multi/data/one_zero_emb_cols.pkl', 'wb') as flag_cols_file:
    pickle.dump(one_zero_emb_cols, flag_cols_file)

# Collapse each listed column to a 0/1 flag in place.
for flag_col in one_zero_emb_cols:
    one_zero_embed(rmdata, flag_col)

# Discard rows whose pitch_type is 'IN' or 'PO'.
is_in = rmdata['pitch_type'] == 'IN'
is_po = rmdata['pitch_type'] == 'PO'
rmdata = rmdata[~(is_in | is_po)]

In [None]:
# Choose the pitch-type grouping applied to the data.
# Options: pitch_mapping_binary, pitch_mapping_simple, pitch_mapping_std
# (the original note also mentions commenting this line out).
pitch_map = pitch_mapping_std

# Store the chosen mapping as a pickle.
with open('full_multi/data/pitch_map.pkl', 'wb') as pitch_map_file:
    pickle.dump(pitch_map, pitch_map_file)

# Every pitch code present in the data must have an entry in the mapping.
observed_pitch_types = rmdata['pitch_type'].unique()
for pitch in observed_pitch_types:
    assert pitch in pitch_map, f"Found pitch {pitch} not in pitch mapping. Types of pitches available: {rmdata['pitch_name'].unique()}"

# Replace raw codes with their mapped class.
mapped_pitch_types = rmdata['pitch_type'].map(pitch_map)
rmdata['pitch_type'] = mapped_pitch_types

In [None]:
# pitch_name was only needed for the validation message above; remove it.
rmdata = rmdata.drop('pitch_name', axis='columns')

In [None]:
# Columns that are integer-encoded through build_map_dict, in processing order.
# Group labels repeat the per-column annotations: "Sep. Emb. (Cat.)" and "Cat.".
mapping_cols = [
    'pitch_type',
    # Sep. Emb. (Cat.)
    'batter', 'pitcher', 'fielder_2',
    # Cat.
    'stand', 'p_throws', 'game_year', 'balls', 'strikes',
    'on_3b', 'on_2b', 'on_1b', 'outs_when_up', 'inning_topbot',
    'events', 'description', 'home_team', 'away_team',
    'hit_location', 'bb_type',
]
# Persist the list of encoded columns.
with open('full_multi/data/mapping_cols.pkl', 'wb') as mapping_cols_file:
    pickle.dump(mapping_cols, mapping_cols_file)

# Build one mapping per column, save it under full_multi/mappings/, and apply
# it to rmdata in place (build_map_dict's apply flag defaults to True).
for map_col in mapping_cols:
    build_map_dict(
        rmdata,
        map_col,
        savefile=f"full_multi/mappings/{map_col}.pkl",
        verbose=True,
    )

In [None]:
# Replace the remaining nulls with 0, then force every column to a numeric
# dtype (values that cannot be parsed become NaN because errors='coerce').
cleandata = (
    rmdata
    .fillna(0)
    .apply(pd.to_numeric, errors='coerce')
)

In [None]:
# Write the fully numeric frame to disk.
cleandata_path = 'full_multi/data/1623_full_cleandata.parquet'
cleandata.to_parquet(cleandata_path)

In [None]:
# Randomization seed
seed = 42

# Define the data split ratios.
# test_ratio is not used in the arithmetic below: the test set receives
# whatever remains after the train and validation slices.
train_ratio = 0.8
val_ratio = 0.1
test_ratio = 0.1

cleandata = cleandata.reset_index(drop=True)  # reset index so ids below are row positions

# Seed BOTH generators. The shuffle below uses the stdlib `random` module,
# which np.random.seed() does not affect; without random.seed() the split
# would differ on every run.
np.random.seed(seed)
random.seed(seed)

# One id per (game_pk, pitcher) sequence: the index label of its first row.
seq_ids = cleandata.groupby(["game_pk", "pitcher"]).apply(lambda x: x.index[0]).tolist()
random.shuffle(seq_ids)

num_train = int(train_ratio * len(seq_ids))
num_val = int(val_ratio * len(seq_ids))

train_seq_ids = seq_ids[:num_train]
val_seq_ids = seq_ids[num_train:num_train+num_val]
test_seq_ids = seq_ids[num_train+num_val:]

# The three slices must partition the shuffled ids exactly.
assert len(train_seq_ids) + len(val_seq_ids) + len(test_seq_ids) == len(seq_ids)

with open('full_multi/data/train_full_1623.pkl', 'wb') as f:
    pickle.dump(train_seq_ids, f)
with open('full_multi/data/val_full_1623.pkl', 'wb') as f:
    pickle.dump(val_seq_ids, f)
with open('full_multi/data/test_full_1623.pkl', 'wb') as f:
    pickle.dump(test_seq_ids, f)