# baseball.ipynb

This is the original Jupyter notebook used for the in-class presentation. It is not very tidy, but it still works and does not require atBatModel.py. It will not be updated any further.

In [None]:
import numpy as np
import pandas as pd
import pybaseball as pyb
#from sklearn import model_selection
from sklearn import linear_model
#from sklearn import metrics
import time

# Silence pandas' chained-assignment warning: processData and countProbabilities
# below add columns to frames that were obtained by column selection.
pd.options.mode.chained_assignment = None  # default='warn'
# Turn on pybaseball's cache (presumably so repeated Statcast queries are not
# downloaded again -- confirm against the pybaseball docs).
pyb.cache.enable()

In [None]:
# Lookup tables are built once instead of on every call: classifyOutcome runs
# once per pitch, so rebuilding numpy arrays for each row is wasted work.

# Pitch "description" values that settle the outcome without a ball in play.
_DESCRIPTION_OUTCOMES = {
    # strikes
    "foul_tip": "K",
    "called_strike": "K",
    "swinging_strike": "K",
    "swinging_strike_blocked": "K",
    "missed_bunt": "K",
    # balls
    "ball": "B",
    "blocked_ball": "B",
    # fouls
    "foul": "F",
    "foul_bunt": "F",
}

# Plate-appearance "events" values, consulted only when the description did
# not already decide the outcome.
_EVENT_OUTCOMES = {
    # everything treated as a fielded out
    "force_out": "FO",
    "field_error": "FO",
    "field_out": "FO",
    "fielders_choice": "FO",
    "fielders_choice_out": "FO",
    "grounded_into_double_play": "FO",
    "double_play": "FO",
    "sac_fly": "FO",
    # batter reaches base
    "hit_by_pitch": "HBP",
    "single": "1B",
    "double": "2B",
    "home_run": "HR",
    # NOTE: "triple" is deliberately left unmapped (it was disabled in the
    # original notebook and markovMatrix has no state for it).
}


def classifyOutcome(series):
    """Map one pitch record to a short outcome label.

    Parameters
    ----------
    series : mapping-like
        Anything indexable by ``"description"`` and ``"events"`` (a DataFrame
        row, a Series, or a dict).

    Returns
    -------
    str or None
        ``"K"``, ``"B"`` or ``"F"`` when the description is a strike, ball or
        foul; otherwise ``"FO"``, ``"HBP"``, ``"1B"``, ``"2B"`` or ``"HR"``
        based on the event. ``None`` when neither field is recognised
        (including triples and missing values).
    """
    description = series["description"]
    events = series["events"]

    # The description takes priority, exactly as in the original if/elif chain.
    outcome = _DESCRIPTION_OUTCOMES.get(description)
    if outcome is None:
        outcome = _EVENT_OUTCOMES.get(events)
    return outcome

def processData(df):
    """Reduce a raw Statcast frame to model features plus an ``Outcome`` label.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw pitch-level data containing at least the columns selected below.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``pitch_type``, the count (``balls``/``strikes``),
        the pitch measurements and an ``Outcome`` column produced by
        ``classifyOutcome``. The ``events`` and ``description`` columns are
        removed, and every row with a missing value is dropped -- this
        includes pitches whose outcome could not be classified (``None``).
    """
    keptColumns = [
        "pitch_type", "events", "description",
        "release_speed", "release_spin_rate",
        "balls", "strikes",
        "pfx_x", "pfx_z", "plate_x", "plate_z",
        "vx0", "vy0", "vz0",
    ]

    # Copy the selection so the new column is written to an independent frame
    # instead of relying on the chained-assignment warning being silenced.
    df = df[keptColumns].copy()

    # Build the labels as a plain list: unlike DataFrame.apply(axis=1) this
    # also behaves on an empty frame (apply would return a DataFrame there).
    df["Outcome"] = [
        classifyOutcome({"events": events, "description": description})
        for events, description in zip(df["events"], df["description"])
    ]

    df = df.drop(["events", "description"], axis=1)
    return df.dropna()

# Names of the numeric pitch-measurement columns used as the model inputs
# (trainModel) and averaged per pitch type (countProbabilities).
dataFeatures = np.array([
    "release_speed",
    "release_spin_rate",
    "pfx_x",
    "pfx_z",
    "plate_x",
    "plate_z",
    "vx0",
    "vy0",
    "vz0",
])

In [None]:
def getCountData(df, count):
    """Return the rows of ``df`` recorded in one ball-strike count.

    ``count`` is an indexable pair read as ``(balls, strikes)``; the rows are
    matched against the ``balls`` and ``strikes`` columns of ``df``.
    """
    wantedBalls = count[0]
    wantedStrikes = count[1]
    inCount = (df["balls"] == wantedBalls) & (df["strikes"] == wantedStrikes)
    return df[inCount]

In [None]:
def trainModel(pitchData):
    """Fit a logistic-regression classifier of ``Outcome`` on the pitch features.

    The model is trained on every row of ``pitchData`` (no hold-out split),
    using the ``dataFeatures`` columns as inputs and the ``Outcome`` column as
    the target. The fitted estimator is returned.
    """
    features = pitchData[dataFeatures].values
    labels = pitchData["Outcome"].values

    # NOTE(review): recent scikit-learn releases deprecate the multi_class
    # argument -- confirm against the installed version before upgrading.
    classifier = linear_model.LogisticRegression(solver='liblinear', multi_class='ovr')
    classifier.fit(features, labels)
    return classifier

In [None]:
def predictOutcomes(pitch, df):
    """Predict outcome probabilities for one pitch type with the batter model.

    ``df`` holds one row per pitch type; the row labelled ``pitch`` (minus its
    ``Freq`` entry) is fed to the global ``batterModel``. The result is a
    one-row DataFrame indexed by ``pitch`` whose columns are the model's
    classes and whose values are probabilities rounded to three decimals.
    """
    model = batterModel

    # A single sample must be presented to predict_proba as a 2-D (1, n) array.
    featureRow = np.array(df.loc[pitch].drop("Freq")).reshape(1, -1)
    probabilities = np.round(model.predict_proba(featureRow), 3)

    return pd.DataFrame(probabilities, index=[pitch], columns=model.classes_)

In [None]:
def countProbabilities(count):
    """Tabulate predicted outcome probabilities per pitch type for one count.

    Uses the global ``pitcherData`` restricted to ``count`` (via
    ``getCountData``), averages the ``dataFeatures`` columns for each pitch
    type, and asks ``predictOutcomes`` for the outcome probabilities of that
    average pitch.

    Returns
    -------
    pandas.DataFrame
        One row per pitch type, with the columns
        ``'1B', '2B', 'B', 'F', 'FO', 'HBP', 'HR', 'K'`` (0 where the model has
        no such class) followed by ``Freq``, the share of pitches of that type
        thrown in this count. The frame has no rows when the pitcher has no
        data for the count.
    """
    outcomeColumns = ['1B', '2B', 'B', 'F', 'FO', 'HBP', 'HR', 'K']

    countData = getCountData(pitcherData, count)

    # Select the numeric features *before* averaging: the frame also carries
    # the string "Outcome" column, and pandas >= 2.0 raises when asked to take
    # the mean of a non-numeric column.
    pitchStats = countData.groupby("pitch_type")[list(dataFeatures)].mean()

    pitchStats["Freq"] = countData["pitch_type"].value_counts()
    pitchStats["Freq"] = pitchStats["Freq"]/pitchStats["Freq"].sum()

    # Collect the per-pitch rows and concatenate once, rather than growing a
    # frame (seeded with an empty template) inside the loop.
    perPitch = [predictOutcomes(p, pitchStats) for p in pitchStats.index]

    if perPitch:
        countOutcomes = pd.concat(perPitch, axis=0)
        # Guarantee every expected outcome column exists, in the fixed order,
        # while keeping any additional class the model might report.
        extraColumns = [c for c in countOutcomes.columns if c not in outcomeColumns]
        countOutcomes = countOutcomes.reindex(columns=outcomeColumns + extraColumns)
    else:
        countOutcomes = pd.DataFrame([], columns=outcomeColumns)

    return pd.concat([countOutcomes, pitchStats["Freq"]], axis=1).fillna(0)

In [None]:
def markovMatrix():
    """Build the 18-state at-bat transition matrix for the current matchup.

    States 0-11 are the ball-strike counts in the order listed in
    ``vectorsA``; states 12-17 are absorbing results: 12 walk (a ball with
    three balls), 13 hit by pitch, 14 single, 15 double, 16 home run and
    17 out (a fielded out, or a strike with two strikes).

    For every count, ``countProbabilities`` supplies per-pitch outcome
    probabilities, which are weighted by pitch frequency and summed into the
    probability of moving to each destination state.

    Returns
    -------
    numpy.ndarray
        The 18x18 matrix, transposed so that column ``i`` holds the
        transition probabilities out of state ``i``.
    """
    # Each entry: [(balls, strikes), {destination state: outcome labels that lead there}]
    vectorsA = [
        [(0,0), {1: ["B"], 4: ["K","F"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO"]}],   #0
        [(1,0), {2: ["B"], 5: ["K","F"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO"]}],   #1
        [(2,0), {3: ["B"], 6: ["K","F"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO"]}],   #2
        [(3,0), {7: ["K","F"], 12: ["B"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO"]}],  #3
        [(0,1), {5: ["B"], 8: ["K","F"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO"]}],   #4
        [(1,1), {6: ["B"], 9: ["K","F"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO"]}],   #5
        [(2,1), {7: ["B"], 10: ["K","F"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO"]}],  #6
        [(3,1), {11: ["K","F"], 12: ["B"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO"]}], #7
        [(0,2), {8: ["F"], 9: ["B"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO","K"]}],   #8
        [(1,2), {9: ["F"], 10: ["B"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO","K"]}],  #9
        [(2,2), {10: ["F"], 11: ["B"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO","K"]}], #10
        [(3,2), {11: ["F"], 12: ["B"], 13: ["HBP"], 14: ["1B"], 15: ["2B"], 16: ["HR"], 17: ["FO","K"]}]  #11
    ]

    matrixA = np.zeros((18,18), dtype=np.float64)

    # The six result states are absorbing: once reached, they map to themselves.
    for absorbing in range(12, 18):
        matrixA[absorbing, absorbing] = 1

    for i, (count, code) in enumerate(vectorsA):
        outcomeMatrix = countProbabilities(count)
        # Weight each pitch type's outcome probabilities by how often it is thrown.
        outcomeMatrix = outcomeMatrix.multiply(outcomeMatrix["Freq"], axis=0).drop("Freq", axis=1)

        # No rounding here: rounding the transition probabilities makes the
        # row drift away from summing to one.
        outcomes = outcomeMatrix.sum()

        row = np.zeros(18)
        for destination, labels in code.items():
            row[destination] = sum(outcomes[labels])

        # Renormalise so the row is a proper probability distribution; the
        # per-pitch inputs are already rounded, and outcomeStats relies on the
        # result probabilities summing to one (OBP = 1 - pOut). A count with
        # no data yields an all-zero row and is left untouched.
        total = row.sum()
        if total > 0:
            row = row / total

        matrixA[i] = row

    return matrixA.transpose()

In [None]:
def simulateAtBat(pitcherID, batterID, power=100, verbose = False, saveRaws = False,
                  start_dt="2021-01-01", end_dt="2021-10-03"):
    """Simulate one pitcher-versus-batter at-bat as a Markov chain.

    Downloads Statcast data for both players, trains the batter outcome model,
    builds the transition matrix from the pitcher's tendencies and propagates
    the 0-0 count through ``power`` transitions.

    Parameters
    ----------
    pitcherID, batterID
        Player ids passed to ``pyb.statcast_pitcher`` / ``pyb.statcast_batter``.
    power : int
        Number of transitions applied to the start vector.
    verbose : bool
        Print player names, observation counts and the start/end vectors.
    saveRaws : bool
        Also keep the unprocessed downloads in the globals
        ``pitcherDataRaw`` and ``batterDataRaw``.
    start_dt, end_dt : str
        Date range of the Statcast query; the defaults reproduce the
        previously hard-coded 2021 range.

    Returns
    -------
    numpy.ndarray
        Length-18 state vector after ``power`` transitions; entries 12-17 are
        the walk, HBP, single, double, home-run and out states.

    Raises
    ------
    ValueError
        If no Statcast rows are returned for the pitcher or the batter.

    Notes
    -----
    Sets the globals ``pitcherData``, ``batterData``, ``batterModel`` and
    ``matrix``, which the helper functions in this notebook read.
    """
    global pitcherData
    global batterData
    global pitcherDataRaw
    global batterDataRaw
    global batterModel
    global matrix

    pitcherData = pyb.statcast_pitcher(start_dt=start_dt, end_dt=end_dt, player_id = pitcherID)
    # Fail early with a clear message instead of an obscure error further down.
    if pitcherData.empty:
        raise ValueError(f"No Statcast data found for pitcher {pitcherID} between {start_dt} and {end_dt}")
    if verbose:
        print("Pitcher:", pitcherData["player_name"].mode()[0])
        print("Found", pitcherData.shape[0], "observations.\n")

    batterData = pyb.statcast_batter(start_dt=start_dt, end_dt=end_dt, player_id = batterID)
    if batterData.empty:
        raise ValueError(f"No Statcast data found for batter {batterID} between {start_dt} and {end_dt}")
    if verbose:
        print("Batter:", batterData["player_name"].mode()[0])
        print("Found", batterData.shape[0], "observations.\n")

    if saveRaws:
        pitcherDataRaw = pitcherData
        batterDataRaw = batterData

    pitcherData = processData(pitcherData)
    batterData = processData(batterData)

    batterModel = trainModel(batterData)

    if verbose:
        print("Generating Markov matrix.")
    matrix = markovMatrix()

    # Every at-bat starts in state 0, the 0-0 count.
    startVector = np.zeros(18, dtype=float)
    startVector[0] = 1

    if verbose:
        print("Start Vector:\n", startVector, "\n")

    # markovMatrix returns the matrix transposed; transpose it back so the
    # row vector can be multiplied from the left.
    outVector = np.matmul(startVector, np.linalg.matrix_power(matrix.transpose(), power))

    if verbose:
        print("End vector:", np.round(outVector, 3), "\n")

    return outVector

In [None]:
def outcomeStats(pitcherName, batterName, inVector = np.zeros((18))):
    """Summarise a simulated at-bat as a labelled row of statistics.

    Parameters
    ----------
    pitcherName, batterName : str
        Player names; stored upper-cased.
    inVector : array-like of length 18
        State vector from ``simulateAtBat``; entries 12-17 are read as the
        walk, HBP, single, double, home-run and out probabilities. The default
        is only read, never modified.

    Returns
    -------
    pandas.Series
        Indexed by ``Pitcher``, ``Batter``, ``pWalk``, ``pHBP``, ``p1B``,
        ``p2B``, ``pHR``, ``pOut``, ``AVG``, ``OBP``, ``SLG``, ``OPS`` and
        ``wOBA``; all numbers are rounded to three decimals.
    """
    names = pd.Series([pitcherName.upper(), batterName.upper()], index = ["Pitcher", "Batter"])
    statsVector = pd.Series(np.round(inVector[12:18], 3), index = ["pWalk", "pHBP", "p1B", "p2B", "pHR", "pOut"])

    # Series.append was removed in pandas 2.0; pd.concat is its replacement.
    stats = pd.concat([names, statsVector])

    # NOTE(review): these rates are per plate appearance (walks and HBP are
    # not removed from the denominator) -- confirm that is intended.
    stats["AVG"] = np.round(sum(stats[["p1B", "p2B", "pHR"]]), 3)

    stats["OBP"] = np.round((1 - statsVector["pOut"]), 3)

    stats["SLG"] = np.round(stats["p1B"] + 2*stats["p2B"] + 4*stats["pHR"], 3)

    stats["OPS"] = np.round(sum(stats[["OBP", "SLG"]]), 3)

    # Weights line up with inVector[12:17]: walk, HBP, single, double, home run.
    wOBAWeights = np.array([0.69, 0.72, 0.89, 1.27, 2.1])
    stats["wOBA"] = np.round(np.dot(wOBAWeights, inVector[12:17]), 3)

    return stats

In [None]:
# Results table: the cells below add one row of outcomeStats output per
# simulated matchup. Re-running this cell clears it.
stats = pd.DataFrame()

In [None]:
# Simulate a single named matchup and add its statistics to the results table.
# Names are stored as [first, last].
pitcherName = ["andrew", "heaney"]
batterName = ["ji-man", "choi"]

# The lookup is called with the last name first; .iloc[0] takes the first
# match by position rather than assuming the result is labelled 0.
pitcherID = pyb.playerid_lookup(pitcherName[1], pitcherName[0])["key_mlbam"].iloc[0]
batterID = pyb.playerid_lookup(batterName[1], batterName[0])["key_mlbam"].iloc[0]

atBatOutcome = simulateAtBat(pitcherID, batterID, verbose=True)

print("Calculated statistics:")

# outcomeStats returns a Series; turn it into a one-row frame.
statsOutput = pd.DataFrame(outcomeStats(" ".join(pitcherName), " ".join(batterName), atBatOutcome)).transpose()
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
stats = pd.concat([stats, statsOutput], ignore_index=True)

print(statsOutput)


In [None]:
# Simulate k randomly paired pitcher/batter matchups from the 2021 season and
# add one row per matchup to the results table.
k = 50

qual_pitchers = pyb.pitching_stats(2021, qual=75).sample(k)

qual_batters = pyb.batting_stats(2021, qual=400).sample(k)

# Reuse the existing results table when the cell that creates it has already
# been run; otherwise start a fresh one.
try:
    stats
except NameError:
    stats = pd.DataFrame()

# Pair the i-th sampled pitcher with the i-th sampled batter.
matchups = zip(qual_pitchers.iterrows(), qual_batters.iterrows())
for i, ((_, pitcher), (_, batter)) in enumerate(matchups):
    print(i)

    # Statcast queries need MLBAM ids, so translate from the FanGraphs ids;
    # .iloc[0] takes the first match by position.
    pitcherID = pyb.playerid_reverse_lookup([pitcher["IDfg"]], key_type="fangraphs")["key_mlbam"].iloc[0]
    pitcherName = pitcher["Name"].upper()

    batterID = pyb.playerid_reverse_lookup([batter["IDfg"]], key_type="fangraphs")["key_mlbam"].iloc[0]
    batterName = batter["Name"].upper()

    # TODO: skip matchups that are already present in the table (a check on
    # the "Pitcher"/"Batter" columns was sketched here but never enabled).

    output = simulateAtBat(pitcherID, batterID, verbose = False)
    statsRow = pd.DataFrame(outcomeStats(pitcherName, batterName, output)).transpose()

    # DataFrame.append was removed in pandas 2.0; concatenating every
    # iteration keeps the finished rows if a later matchup fails.
    stats = pd.concat([stats, statsRow], ignore_index=True)
    print(statsRow)