In [3]:
import joblib
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [4]:
# Directory prefix (relative to the notebook's working directory) that is
# concatenated with file names when reading the input CSVs.
data_dir = "./datasets/"

In [5]:
# Read the main dataset and the player id/name table from data_dir.
df = pd.read_csv(f"{data_dir}pitchdf_ready_preds.csv")
player_id_name = pd.read_csv(f"{data_dir}player_id_name.csv")
# Lookup of player id -> full name; if an id repeats, the last row wins.
player_id_name_dict = dict(
    zip(player_id_name["id"].values, player_id_name["fullName"].values)
)

In [6]:
# Keep only rows with 1_day_lag_season_sum_ab > 50 and
# pitcher_total_batters_faced > 80 (rows with NaN in either column are dropped).
df = df.loc[
    df["1_day_lag_season_sum_ab"].gt(50)
    & df["pitcher_total_batters_faced"].gt(80)
]

In [7]:
# Attach display names for both players on every row. The lookup is a plain
# dict access, so an id missing from player_id_name_dict raises KeyError.
df["batter_name"] = df["batter"].map(lambda player_id: player_id_name_dict[player_id])
df["pitcher_name"] = df["pitcher"].map(lambda player_id: player_id_name_dict[player_id])

In [8]:
# Class index -> outcome label; the position in the list is the class index.
at_bat_result_dict = dict(enumerate([
    'single',
    'double_triple',
    'home_run',
    'walk',
    'field_out',
    'strikeout',
]))

In [9]:
# Model input columns, in the order the frame is sliced before prediction.
features = [
    # game situation
    'balls', 'strikes', 'on_3b', 'on_2b', 'on_1b', 'outs_when_up',
    # batter columns
    'batter_walk_rate',
    'batter_strikeout_rate',
    'batter_singles_average',
    'batter_home_run_average',
    # pitcher columns
    'pitcher_walk_rate',
    'pitcher_strikeout_percentage',
    'pitcher_batting_average_against',
    'pitcher_home_run_average',
]

In [10]:
# Rows dated 2024-01-01 or later with every model feature present, ordered by
# game_date so that keep="last" below picks a row from the latest date.
season_rows_sorted = (
    df[df["game_date"] >= "2024-01-01"]
    .dropna(subset=features)
    .sort_values("game_date")
)
# One row per batter id / per pitcher id.
batter_df_reduced = season_rows_sorted.drop_duplicates(subset=["batter"], keep="last")
pitcher_df_reduced = season_rows_sorted.drop_duplicates(subset=["pitcher"], keep="last")

In [11]:
# Columns written to pitcher_reduced.csv (row index is not written).
pitcher_export_columns = [
    "pitcher_name",
    'pitcher_walk_rate',
    'pitcher_era',
    'pitcher_strikeout_percentage',
    'pitcher_batting_average_against',
    'pitcher_home_run_average',
    'pitcher_total_batters_faced',
]
pitcher_df_reduced[pitcher_export_columns].to_csv("pitcher_reduced.csv", index=False)

In [12]:
# Export one row per batter to batter_reduced.csv.
# index=False matches the pitcher export: the frame's index is just the row
# position left over from the source file, and writing it would add an unnamed
# leading column to the CSV.
batter_df_reduced[['batter_name', 'batter_batting_average', 'batter_walk_rate', 'batter_strikeout_rate',
       'batter_contact_rate', 'batter_slugging_percent',
       'batter_on_base_percentage', 'batter_whiff_rate',
       'batter_singles_average', 'batter_doubles_triple_average',
       'batter_home_run_average']].to_csv("batter_reduced.csv", index=False)

In [13]:
# Load the fitted model used by plot_probabilities (presumably an XGBoost
# classifier, judging by the file name -- TODO confirm).
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model files from a trusted source.
model = joblib.load('best_xgboost_model.pkl')

In [14]:
# Show which columns are available on the reduced pitcher frame.
pitcher_df_reduced.columns

Index(['game_date', 'game_pk', 'batter', 'pitcher', 'events', 'description',
       'zone', 'pitch_type', 'balls', 'strikes', 'game_year', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'hit_distance_sc', 'launch_speed',
       'launch_angle', 'effective_speed', 'estimated_ba_using_speedangle',
       'estimated_woba_using_speedangle', 'woba_value', 'woba_denom',
       'babip_value', 'iso_value', 'launch_speed_angle', 'at_bat_number',
       'pitch_number', 'swinging_strike', 'hit_into_play', 'foul',
       'batter_batting_average', 'batter_walk_rate', 'batter_strikeout_rate',
       'batter_contact_rate', 'batter_slugging_percent',
       'batter_on_base_percentage', 'batter_whiff_rate',
       'batter_singles_average', 'batter_doubles_triple_average',
       'batter_home_run_average', '1_day_lag_season_sum_ab',
       'pitcher_walk_rate', 'pitcher_era', 'pitcher_strikeout_percentage',
       'pitcher_batting_average_against', 'pitcher_home_run_average',
       'pitcher_total_batt

In [15]:
# Spot-check the stored stats for one pitcher.
pitcher_df_reduced.loc[
    pitcher_df_reduced['pitcher_name'] == "Edwin Díaz",
    [
        'pitcher_walk_rate',
        'pitcher_era',
        'pitcher_strikeout_percentage',
        'pitcher_batting_average_against',
        'pitcher_home_run_average',
        'pitcher_total_batters_faced',
    ],
]

Unnamed: 0,pitcher_walk_rate,pitcher_era,pitcher_strikeout_percentage,pitcher_batting_average_against,pitcher_home_run_average,pitcher_total_batters_faced
1415784,0.091346,3.290993,0.394231,0.158654,0.033654,208.0


In [16]:
# Spot-check the stored stats for one batter.
batter_df_reduced.loc[
    batter_df_reduced['batter_name'] == "Aaron Judge",
    [
        'batter_walk_rate',
        'batter_strikeout_rate',
        'batter_singles_average',
        'batter_doubles_triple_average',
        'batter_home_run_average',
    ],
]

Unnamed: 0,batter_walk_rate,batter_strikeout_rate,batter_singles_average,batter_doubles_triple_average,batter_home_run_average
1423256,0.155556,0.194152,0.152878,0.066547,0.104317


In [17]:
# Spot-check the pitcher columns that feed the model for one pitcher.
pitcher_df_reduced.loc[
    pitcher_df_reduced['pitcher_name'] == "Tarik Skubal",
    [
        'pitcher_walk_rate',
        'pitcher_strikeout_percentage',
        'pitcher_batting_average_against',
        'pitcher_home_run_average',
    ],
]

Unnamed: 0,pitcher_walk_rate,pitcher_strikeout_percentage,pitcher_batting_average_against,pitcher_home_run_average
735350,0.046448,0.301913,0.191257,0.020492


In [59]:
def plot_probabilities(batter_name, pitcher_name, balls, strikes, on_3b, on_2b, on_1b, outs_when_up):
    """Build the outcome-probability bar chart and both stat radar charts for a matchup.

    Looks up the batter and pitcher by name in the module-level
    ``batter_df_reduced`` / ``pitcher_df_reduced`` frames, combines their stats
    with the supplied game situation, scores that single row with ``model`` and
    renders three Plotly figures.

    Args:
        batter_name: Value matched against ``batter_df_reduced['batter_name']``.
        pitcher_name: Value matched against ``pitcher_df_reduced['pitcher_name']``.
        balls, strikes, on_3b, on_2b, on_1b, outs_when_up: Game-situation values
            written as-is into the model input columns of the same names.

    Returns:
        Tuple ``(prob_plot, batter_plot, pitcher_plot)`` of HTML fragments, each
        produced by ``Figure.to_html(full_html=False)``.

    Raises:
        ValueError: If no row matches ``batter_name`` or ``pitcher_name``.
    """
    # Radar columns come first; the remaining columns are only needed as model inputs.
    batter_radar_columns = [
        'batter_batting_average', 'batter_on_base_percentage', 'batter_slugging_percent',
        'batter_walk_rate', 'batter_strikeout_rate']
    batter_columns = batter_radar_columns + ['batter_singles_average', 'batter_home_run_average']
    pitcher_columns = [
        'pitcher_walk_rate', 'pitcher_era', 'pitcher_strikeout_percentage',
        'pitcher_batting_average_against', 'pitcher_home_run_average']

    # Filter dataframe based on selected batter and pitcher
    batter_filtered_df = batter_df_reduced.loc[
        batter_df_reduced['batter_name'] == batter_name, batter_columns]
    pitcher_filtered_df = pitcher_df_reduced.loc[
        pitcher_df_reduced['pitcher_name'] == pitcher_name, pitcher_columns]

    # Fail early with a readable message instead of an opaque error further down.
    if batter_filtered_df.empty:
        raise ValueError(f"No batter named {batter_name!r} in batter_df_reduced")
    if pitcher_filtered_df.empty:
        raise ValueError(f"No pitcher named {pitcher_name!r} in pitcher_df_reduced")

    # Names are not guaranteed unique. Keep the first match on each side so the
    # side-by-side concat below yields exactly one fully populated row.
    batter_filtered_df = batter_filtered_df.head(1).reset_index(drop=True)
    pitcher_filtered_df = pitcher_filtered_df.head(1).reset_index(drop=True)

    # Get max values for each stat column from the dataframe (for normalization)
    batter_max_values = {col: batter_df_reduced[col].max() for col in batter_radar_columns}
    pitcher_max_values = {
        col: pitcher_df_reduced[col].max()
        for col in ('pitcher_walk_rate', 'pitcher_strikeout_percentage',
                    'pitcher_batting_average_against')
    }
    # ERA is scaled against a fixed ceiling rather than the observed maximum.
    era_ceiling = 5

    def _unit(value):
        # The radial axis is fixed to [0, 1]; clamp so outliers (e.g. an ERA
        # above the ceiling) do not produce a radius outside that range.
        return min(max(float(value), 0.0), 1.0)

    # Combine batter and pitcher data
    filtered_df = pd.concat([batter_filtered_df, pitcher_filtered_df], axis=1)
    filtered_df['balls'] = balls
    filtered_df['strikes'] = strikes
    filtered_df['on_3b'] = on_3b
    filtered_df['on_2b'] = on_2b
    filtered_df['on_1b'] = on_1b
    filtered_df['outs_when_up'] = outs_when_up

    # Prepare the input for the model (column order must follow `features`)
    filtered_df = filtered_df[features]
    predict_proba_list = model.predict_proba(filtered_df)

    # Get class labels.
    # NOTE(review): assumes the model's class order matches the key order of
    # at_bat_result_dict (0..5) -- confirm against model.classes_.
    outcomes = list(at_bat_result_dict.values())

    # Prepare data for visualization (probabilities shown as percentages)
    proba_df = pd.DataFrame({
        'at_bat_result': outcomes,
        'probability': predict_proba_list[0]  # single input row
    })
    proba_df['probability'] = proba_df['probability'].apply(lambda x: round(x*100, 2))

    # Create probability bar chart
    prob_plot = px.bar(proba_df, x='at_bat_result', y='probability',
                       title=f"Probabilities of At-Bat Outcomes for {batter_name} vs {pitcher_name}",
                       labels={'at_bat_result': 'At-Bat Result', 'probability': 'Probability (%)'})

    ### Radar plot for batter stats ###
    batter_row = batter_filtered_df.iloc[0]
    batter_k_max = batter_max_values['batter_strikeout_rate']
    batter_stats_normalized = [
        _unit(batter_row['batter_batting_average'] / batter_max_values['batter_batting_average']),
        _unit(batter_row['batter_on_base_percentage'] / batter_max_values['batter_on_base_percentage']),
        _unit(batter_row['batter_slugging_percent'] / batter_max_values['batter_slugging_percent']),
        _unit(batter_row['batter_walk_rate'] / batter_max_values['batter_walk_rate']),
        # Inverted: a lower strikeout rate plots further out.
        _unit((batter_k_max - batter_row['batter_strikeout_rate']) / batter_k_max),
    ]
    batter_categories = ['Batting Avg', 'OBP', 'Slugging %', 'Walk Rate', 'Strikeout Rate']

    batter_plot = go.Figure()
    batter_plot.add_trace(go.Scatterpolar(
        r=batter_stats_normalized,
        theta=batter_categories,
        fill='toself',
        name='Batter Stats',
        # Hover shows the raw (un-normalized) values.
        hovertemplate = (
            f"Batting Avg: {batter_row['batter_batting_average']:.3f}<br>" +
            f"OBP: {batter_row['batter_on_base_percentage']:.3f}<br>" +
            f"Slugging %: {batter_row['batter_slugging_percent']:.3f}<br>" +
            f"Walk Rate: {batter_row['batter_walk_rate']*100:.2f}%<br>" +
            f"Strikeout Rate: {batter_row['batter_strikeout_rate']*100:.2f}%<br>"
        )
    ))

    batter_plot.update_layout(
        polar=dict(radialaxis=dict(visible=False, range=[0, 1])),
        title=f"{batter_name} - Batter Stats Radar",
        showlegend=False
    )

    ### Radar plot for pitcher stats ###
    pitcher_row = pitcher_filtered_df.iloc[0]
    pitcher_bb_max = pitcher_max_values['pitcher_walk_rate']
    pitcher_baa_max = pitcher_max_values['pitcher_batting_average_against']
    pitcher_stats_normalized = [
        # Inverted: fewer walks plots further out.
        _unit((pitcher_bb_max - pitcher_row['pitcher_walk_rate']) / pitcher_bb_max),
        # Inverse for ERA (since lower is better); clamped to 0 above the ceiling.
        _unit((era_ceiling - pitcher_row['pitcher_era']) / era_ceiling),
        _unit(pitcher_row['pitcher_strikeout_percentage'] / pitcher_max_values['pitcher_strikeout_percentage']),
        # Inverted: a lower batting average against plots further out.
        _unit((pitcher_baa_max - pitcher_row['pitcher_batting_average_against']) / pitcher_baa_max),
    ]
    pitcher_categories = ['Walk Rate', 'ERA', 'Strikeout %', 'BA Against']

    pitcher_plot = go.Figure()
    pitcher_plot.add_trace(go.Scatterpolar(
        r=pitcher_stats_normalized,
        theta=pitcher_categories,
        fill='toself',
        name='Pitcher Stats',
        # Hover shows the raw (un-normalized) values.
        hovertemplate = (
            f"Walk Rate: {pitcher_row['pitcher_walk_rate']*100:.2f}%<br>" +
            f"ERA: {pitcher_row['pitcher_era']:.2f}<br>" +
            f"Strikeout %: {pitcher_row['pitcher_strikeout_percentage']*100:.2f}%<br>" +
            f"BA Against: {pitcher_row['pitcher_batting_average_against']:.3f}<br>"
        )
    ))

    pitcher_plot.update_layout(
        polar=dict(radialaxis=dict(visible=False, range=[0, 1])),
        title=f"{pitcher_name} - Pitcher Stats Radar",
        showlegend=False
    )

    # Return embeddable HTML fragments rather than Figure objects.
    prob_plot = prob_plot.to_html(full_html=False)
    batter_plot = batter_plot.to_html(full_html=False)
    pitcher_plot = pitcher_plot.to_html(full_html=False)

    return prob_plot, batter_plot, pitcher_plot


In [56]:
# Find the exact stored spelling of a batter's name via a substring match.
batter_df_reduced.loc[
    batter_df_reduced["batter_name"].str.contains("Arrae"),
    "batter_name",
]

1416450    Luis Arraez
Name: batter_name, dtype: object

In [20]:
# Show which columns are available on the filtered main frame.
df.columns

Index(['game_date', 'game_pk', 'batter', 'pitcher', 'events', 'description',
       'zone', 'pitch_type', 'balls', 'strikes', 'game_year', 'on_3b', 'on_2b',
       'on_1b', 'outs_when_up', 'hit_distance_sc', 'launch_speed',
       'launch_angle', 'effective_speed', 'estimated_ba_using_speedangle',
       'estimated_woba_using_speedangle', 'woba_value', 'woba_denom',
       'babip_value', 'iso_value', 'launch_speed_angle', 'at_bat_number',
       'pitch_number', 'swinging_strike', 'hit_into_play', 'foul',
       'batter_batting_average', 'batter_walk_rate', 'batter_strikeout_rate',
       'batter_contact_rate', 'batter_slugging_percent',
       'batter_on_base_percentage', 'batter_whiff_rate',
       'batter_singles_average', 'batter_doubles_triple_average',
       'batter_home_run_average', '1_day_lag_season_sum_ab',
       'pitcher_walk_rate', 'pitcher_era', 'pitcher_strikeout_percentage',
       'pitcher_batting_average_against', 'pitcher_home_run_average',
       'pitcher_total_batt

In [21]:
# Preview the batter name alongside the ball/strike count columns.
df.loc[:, ["batter_name", 'balls', 'strikes']]

Unnamed: 0,batter_name,balls,strikes
0,Ketel Marte,2,2
1,Ketel Marte,1,2
2,Ketel Marte,1,1
3,Ketel Marte,1,0
4,Ketel Marte,0,0
...,...,...,...
1431874,Bobby Witt Jr.,0,1
1431875,Bobby Witt Jr.,0,0
1431876,Tommy Pham,1,1
1431877,Tommy Pham,0,1
