# Analysing Umpire 3 data

In [2]:
import pandas as pd
import pymc as pm
import numpy as np
import arviz as az
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [9]:
ump3_orig = pd.read_csv('Umpire_3.csv')

In [10]:

# Clean the data by categorizing the different pitch types 
def clean_df(orig_df):
    """
    Pitch types:
        ['SI' 'AB' 'SL' 'FC' 'CH' 'CU' 'FF' 'FS' 'EP' 'KC' 'FO' 'KN' 'SC' 'ST'
         'SV' 'CS' 'FA']
    Handedness: R=1, L=0
    Description: 'ball': 0, 'called_strike': 1
    Error in decision: 'correct': 0, 'incorrect': 1
    """
    df = orig_df.copy()
    # Drop nulls 
    df = df.dropna()
    
    # Map pitch types to binary columns
    unique_pt = df['pitch_type'].unique()
    fastballs = ['FF', 'FA', 'FT', 'SI', 'FC'] # 4 seam, 4 seam again, two seam, sinker, cutter
    changeups = ['CH', 'EP', 'FO', 'FS'] # Change, ephus, fork, split
    off_speed = ['SL', 'CU', 'ST', 'SV', 'CS'] # Slider, curve, sweeper, slurve, slow curve
    rare = ['KC', 'KN', 'SC'] # Knucle curve, knuckle ball, screwball
    # AB and AS are 'automatic ball/strike' so they are not included. This happens when something like a pitch clock violation occurs
    # Against the pitcher or the hitter

    # Create new binary columns
    df['fastball'] = df['pitch_type'].isin(fastballs).astype(int)
    df['changeup'] = df['pitch_type'].isin(changeups).astype(int)
    df['off_speed'] = df['pitch_type'].isin(off_speed).astype(int)
    df['rare'] = df['pitch_type'].isin(rare).astype(int)

    # Drop ptich_type column since it is not categorical
    df = df.drop(columns='pitch_type')
    
    # Same with error
    df['error_in_decision'] = df['error_in_decision'].map({'correct': 0, 'incorrect': 1})

    # Fix handedness
    df['stand'] = df['stand'].map({'R': 1, 'L': 0})
    df['p_throws'] = df['p_throws'].map({'R': 1, 'L': 0})
    
    df['description'] = df['description'].map({'ball': 0, 'called_strike': 1})


    # Max min scale the data to make coefficienst more interpretable
    scaler = MinMaxScaler()
    df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

    return df


ump3 = clean_df(ump3_orig)

In [11]:
X = ump3.sample(n=200, random_state=42) # Only take a small amount so that the modeling doesn't take 4-ever
y_true = np.array(X['error_in_decision'])

In [26]:
# Generate a simulated dataset
np.random.seed(42)
simulated_X = pd.DataFrame({
    'description': np.random.choice([0, 1], size=200),
    'pitch_number': np.random.uniform(0, 1, size=200),
    'at_bat_number': np.random.uniform(0, 1, size=200)
})

# Sample from the priors
with pm.Model() as prior_model:
    # Priors
    beta_balls = pm.HalfNormal('beta_balls', sigma=1)
    beta_strikes = pm.HalfNormal('beta_strikes', sigma=1)
    beta_pitchNum = pm.HalfNormal('beta_pitchNum', sigma=1)
    beta_atBatNum = pm.HalfNormal('beta_atBatNum', sigma=1)
    intercept = pm.Normal('intercept', mu=0, sigma=1)

    # Logistic model
    logits = intercept + beta_balls * simulated_X['description'] + beta_strikes * (1 - simulated_X['description']) + beta_pitchNum * simulated_X['pitch_number'] + beta_atBatNum * simulated_X['at_bat_number']
    p = pm.Deterministic('p', pm.math.sigmoid(logits))
    
    # Sample from the priors
    prior_samples = pm.sample_prior_predictive(500)

prior_samples.extend(pm.sample_posterior_predictive(prior_model))
# Plot the prior predictive checks (PPC) curve
az.plot_ppc(prior_samples, group="prior", data_pairs={"y": "p"})
plt.title("Prior Predictive Check")
plt.show()



Sampling: [beta_atBatNum, beta_balls, beta_pitchNum, beta_strikes, intercept]


TypeError: Unsupported type for `trace` argument: <class 'pymc.model.core.Model'>.

In [None]:

with pm.Model() as count_model:
    # Priors
    beta_balls = pm.Normal('beta_balls', mu=0, sigma=1)
    beta_strikes = pm.Normal('beta_strikes', mu=0, sigma=1)
    beta_pitchNum = pm.Normal('beta_pitchNum', mu=0, sigma=1)
    beta_atBatNum = pm.Normal('beta_atBatNum', mu=0, sigma=1)
    intercept = pm.Normal('intercept', mu = 0, sigma = 1)

    #logistic model 
    logits = intercept + beta_balls * X['description'] + beta_strikes * (1 - X['description']) + beta_pitchNum * X['pitch_number'] + beta_atBatNum * X['at_bat_number']
    p = pm.Deterministic('p', pm.math.sigmoid(logits))
    # Likelihood
    y = pm.Bernoulli('y', p=p, observed=y_true)
    # Sample from the posterior
    count_trace = pm.sample(500)
    count_predictive = pm.sample_posterior_predictive(count_trace)

az.plot_posterior(count_trace, var_names=["balls", "strikes", "pitch_number", 'at_bat_number'])
plt.show()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [beta_balls, beta_strikes, beta_pitchNum, beta_atBatNum, intercept]


Output()

Sampling 4 chains for 1_000 tune and 500 draw iterations (4_000 + 2_000 draws total) took 2 seconds.
Sampling: [y]


Output()

KeyError: 'var names: "[\'balls\' \'strikes\' \'pitch_number\' \'at_bat_number\'] are not present" in dataset'