In [None]:
import pandas as pd
from pybaseball import pybaseball
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the raw pitch-level CSV into the working frame `df`.
df = pd.read_csv(filepath_or_buffer='../data/2020Onwards.csv')

In [None]:
# Display the frame as loaded from the CSV.
df

In [None]:
# Distinct values currently present in the 'description' column.
set(df["description"])

In [None]:
# Fold the less common 'description' values into three broader labels
# ('ball', 'foul', 'swinging_strike').
#
# One vectorized replace gives the same result as checking each row in turn:
# none of the target labels is itself a key, so a value is rewritten at most
# once. Unlike positional df.loc[row, ...] access it also does not depend on
# the frame having a default 0..n-1 index.
description_aliases = {
    'blocked_ball': 'ball',
    'hit_by_pitch': 'ball',
    'pitchout': 'ball',
    'bunt_foul_tip': 'foul',
    'foul_bunt': 'foul',
    'foul_pitchout': 'foul',
    'foul_tip': 'foul',
    'missed_bunt': 'swinging_strike',
    'swinging_strike_blocked': 'swinging_strike',
}
df["description"] = df["description"].replace(description_aliases)

In [None]:
# Display the frame again after the 'description' values were consolidated.
df

In [None]:
# Distinct 'description' values remaining after consolidation.
set(df["description"])

In [None]:
# Persist the frame with consolidated descriptions.
# NOTE(review): the row index is written as well (to_csv default), and the
# 'Unnamed: 0' column is only dropped in a later cell, so the saved file
# presumably carries two index-like columns -- confirm this is intended.
df.to_csv(path_or_buf='../data/cleaned/filtering.csv')

In [None]:
# List the column labels of df.
df.columns

In [None]:
# Remove the 'Unnamed: 0' column if it is present.
# errors='ignore' makes the cell safe to re-run and tolerant of an input CSV
# that has no such column; without it the second execution raises KeyError.
df.drop(columns=['Unnamed: 0'], errors='ignore', inplace=True)

In [None]:
# Integer code for each consolidated description label; the code is the
# label's position in this list.
outcome_labels = ['ball', 'called_strike', 'swinging_strike', 'foul', 'hit_into_play']
description_map = {label: code for code, label in enumerate(outcome_labels)}

# Series.map yields NaN for any description that is not a key of description_map.
df['outcome'] = df['description'].map(description_map)

In [None]:
# Drop the 'game_type' column from df in place.
df.drop(columns=['game_type'], inplace=True)

In [None]:
# Count rows per batter id and keep the 50 most frequent ids.
# rename_axis / reset_index(name=...) pin the resulting column names to
# 'batter' (the id) and 'count', so later cells that read top_50["batter"]
# get ids regardless of how the installed pandas version names the output of
# value_counts().
top_50 = (
    df['batter']
    .value_counts()
    .head(50)
    .rename_axis('batter')
    .reset_index(name='count')
)

display(top_50)

In [None]:
def get_player_name(id):
    """Return "<first> <last>" for a single player id.

    The id is passed to ``pybaseball.playerid_reverse_lookup`` as a
    one-element list, and the name is built from row label 0 of the returned
    table's ``name_first`` and ``name_last`` columns.

    NOTE(review): presumably ``id`` is the identifier stored in the ``batter``
    column -- confirm it matches the lookup's default key type.
    """
    lookup = pybaseball.playerid_reverse_lookup([id])
    first_name = lookup.loc[0, 'name_first']
    last_name = lookup.loc[0, 'name_last']
    return first_name + " " + last_name

# Resolve every id in top_50["batter"] to a display name, one lookup per row.
top_50["Player Name"] = top_50["batter"].apply(get_player_name)

In [None]:
# Display the top-50 table, now including the 'Player Name' column.
top_50

In [None]:
# Distinct values of top_50["batter"], used below for membership filtering.
main_batters = {batter_id for batter_id in top_50["batter"]}

In [None]:
# Keep only the rows whose batter is one of main_batters.
is_main_batter = df["batter"].isin(main_batters)
main_players_df = df.loc[is_main_batter]

In [None]:
# List the column labels of the subset restricted to main_batters.
main_players_df.columns

In [None]:
# The nine most frequent 'pitch_name' values among the rows in main_players_df.
# The names are read from the value_counts() index, so the result does not
# depend on how the installed pandas version labels the columns produced by
# reset_index() (older versions put the counts, not the names, under
# 'pitch_name').
pitch_types = set(main_players_df["pitch_name"].value_counts().head(9).index)

In [None]:
# Keep only rows whose pitch_name is one of the selected pitch types.
# NOTE(review): the filter is applied to the full `df`, not to
# `main_players_df` from which pitch_types was derived -- confirm that all
# batters are meant to be retained here.
has_selected_pitch = df["pitch_name"].isin(pitch_types)
reduced_pt = df.loc[has_selected_pitch]

In [None]:
# One-hot encode 'pitch_name' into pitch_name_<value> indicator columns.
df_encoded = pd.get_dummies(data=reduced_pt, columns=['pitch_name'])

In [None]:
# Encode the 'stand' column as 'Batter Side': 'R' -> 0, 'L' -> 1.
# Any other value maps to NaN.
description_map = dict(R=0, L=1)
df_encoded['Batter Side'] = df_encoded['stand'].map(description_map)

In [None]:
# List the column labels after one-hot encoding and adding 'Batter Side'.
df_encoded.columns

In [None]:
# Boolean flags for whether a runner occupies each base.
# Statcast's on_1b / on_2b / on_3b hold the runner's player id and are empty
# (NaN) when the base is unoccupied, so "runner on base" is notna(); isna()
# would flag the empty bases instead.
df_encoded["on_1st"] = df_encoded["on_1b"].notna()
df_encoded["on_2nd"] = df_encoded["on_2b"].notna()
df_encoded["on_3rd"] = df_encoded["on_3b"].notna()

# The raw id columns are no longer needed once the flags exist.
df_encoded.drop(['on_3b', 'on_2b', 'on_1b'], axis=1, inplace=True)

In [None]:
# Display df_encoded after adding on_1st/on_2nd/on_3rd and dropping on_1b/on_2b/on_3b.
df_encoded

In [None]:
# Drop the text 'description' column from df_encoded in place.
df_encoded.drop(columns=['description'], inplace=True)

In [None]:
# Column names used to select feature groups from df_encoded.
# Each list is assembled from contiguous sub-lists purely for readability;
# the resulting order is significant and must not change.
pitch_features = (
    # indicator columns created by get_dummies on 'pitch_name'
    [
        'pitch_name_4-Seam Fastball',
        'pitch_name_Changeup',
        'pitch_name_Curveball',
        'pitch_name_Cutter',
        'pitch_name_Knuckle Curve',
        'pitch_name_Sinker',
        'pitch_name_Slider',
        'pitch_name_Split-Finger',
        'pitch_name_Sweeper',
    ]
    # api_break_* and spin_axis columns
    + [
        'api_break_z_with_gravity',
        'api_break_x_arm',
        'api_break_x_batter_in',
        'spin_axis',
    ]
    # v*0 / a* columns and effective_speed
    + ['vx0', 'vy0', 'vz0', 'ax', 'ay', 'az', 'effective_speed']
    # release_* columns
    + ['release_spin_rate', 'release_extension', 'release_pos_y']
    # pfx_*, plate_* and zone columns
    + ['pfx_x', 'pfx_z', 'plate_x', 'plate_z', 'zone']
    # 'Pitcher Side' is added to df_encoded in a later cell
    + ['Pitcher Side', 'balls', 'strikes']
)

descriptive_features = (
    [
        'Batter Side',
        'bat_win_exp',
        'n_priorpa_thisgame_player_at_bat',
        'batter_days_since_prev_game',
        'pitch_number',
    ]
    + ['on_3rd', 'on_2nd', 'on_1st', 'outs_when_up', 'inning']
    + ['sz_top', 'sz_bot', 'batter', 'pitcher']
)

outcome_features = [
    'outcome',
    'estimated_ba_using_speedangle',
]

In [None]:
# Encode the 'p_throws' column as 'Pitcher Side': 'R' -> 0, 'L' -> 1.
# Any other value maps to NaN.
description_map = dict(R=0, L=1)
df_encoded['Pitcher Side'] = df_encoded['p_throws'].map(description_map)

In [None]:
# 'p_throws' is now represented by 'Pitcher Side'; drop it in place.
df_encoded.drop(columns=['p_throws'], inplace=True)

In [None]:
# Select the pitch feature columns, in the order listed in pitch_features.
features_df = df_encoded.loc[:, pitch_features]

In [None]:
# Display the pitch-feature subset of df_encoded.
features_df

In [None]:
# Persist the fully encoded frame (the row index is written too, per the to_csv default).
df_encoded.to_csv(path_or_buf='../data/cleaned/encoded.csv')

In [None]:
# Pairwise covariance between the pitch feature columns.
# NOTE(review): covariance is scale-dependent, so columns with large numeric
# ranges will dominate; consider whether .corr() was intended -- confirm.
cov_matrix = features_df.cov()
print(cov_matrix)

In [None]:
# Render the covariance matrix as an unannotated heatmap.
ax = sns.heatmap(data=cov_matrix, cmap="coolwarm", annot=False, fmt=".2f")
ax.set_title("Covariance Matrix")
plt.show()