In [1]:
import pybaseball as pyb
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import metrics
from sklearn import preprocessing
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [2]:
def classifyOutcome(series):
    # Types of events we classify as strikes, fouls, balls, and field outs
    strikeEvents = np.array(["foul_tip", "called_strike", "swinging_strike", "swinging_strike_blocked", "missed_bunt"])
    foulEvents = np.array(["foul", "foul_bunt"])
    ballEvents = np.array(["ball", "blocked_ball"])
    outEvents = np.array(["force_out","field_error","field_out","fielders_choice","fielders_choice_out","grounded_into_double_play","double_play","sac_fly"])

    events = series["events"]
    description = series["description"]

    out = np.zeros(8, dtype=int)

    if np.isin(description, strikeEvents):
        return "K"
    elif np.isin(description, ballEvents):
        return "B"
    elif np.isin(description, foulEvents):
        return "F"
    elif np.isin(events, outEvents):
        return "FO"
    elif events == "hit_by_pitch":
        return "HBP"
    elif events == "single":
        return "1B"
    elif events == "double":
        return "2B"
    elif events == "home_run":
        return "HR"

def processData(df) -> pd.DataFrame:
    pd.options.mode.chained_assignment = None  # default='warn'
    df = df[["pitch_type","events","description","release_speed","release_spin_rate","balls","strikes","pfx_x","pfx_z","plate_x","plate_z","vx0","vy0","vz0"]]

    outcomes = df.apply(classifyOutcome, axis=1)

    df["Outcome"] = outcomes

    df = df.drop(["events", "description"], axis=1)

    df = df.dropna()
    
    return df

In [222]:
class player:
    def __init__(self, **lookup)-> None:
        pyb.cache.enable()

        playerID = pd.Series(dtype=object)

        # Maybe we have a key?
        for key in ("key_mlbam","key_retro","key_bbref","key_fangraphs"):
            l = lookup.get(key)

            if not (l == None):
                playerID = pyb.playerid_reverse_lookup([l], key_type=key[4:])
        
        # If we still haven't found anyone, see if a name has been given
        if not ((lookup.get("name_first")) == None or (lookup.get("name_last") == None)) and (playerID.shape[0] == 0):
            playerID = pyb.playerid_lookup(first = lookup.get("name_first"), last = lookup.get("name_last"))

        if (playerID.shape[0] == 0):
            # PlayerID still empty, throw exception

            raise AssertionError(str("Failed to find player with these inputs or insufficient information given:\n" + str(lookup)))
        elif (playerID.shape[0] > 1):
            # Found several players, warn that we will be using the first one
            print("Warning: Found", playerID.shape[0], "players with input\n", str(lookup), "\nDefaulting to the first player!")

        self.playerID = playerID.iloc[0]
        self.playerName = (self.playerID["name_first"] + " " + self.playerID["name_last"]).title()

    def getStatcastData(self, playerType = str, dateRange = [], verbose = False) -> pd.DataFrame:
        # By default we will get pitches from games between today and 01/01/(last year)
        if dateRange == []:
            from datetime import date
            today = date.today()

            jan1LastYear = str(today.year - 1) + "-01-01"

            dateRange.append(jan1LastYear)
            dateRange.append(today.strftime("%Y-%m-%d"))

        if playerType.lower() == "batter":
            getSC = pyb.statcast_batter
        else:
            getSC = pyb.statcast_pitcher

        statcastData = getSC(start_dt = dateRange[0], end_dt = dateRange[1], player_id = self.playerID["key_mlbam"])

        if verbose:
            print(playerType.title() + ": " + self.playerName)
            print("Found " + str(statcastData.shape[0]) + " observations from " + dateRange[0] + " to " + dateRange[1] + ".\n")

        self.statcastData = statcastData

        return statcastData

In [223]:
pitcher = player(name_first = "Gerrit", name_last = "Cole")
batter = player(key_mlbam = 545361) # Mike Trout

In [224]:
batter.getStatcastData(playerType="batter", verbose=True, dateRange=["2019-01-01","2022-01-01"])

Gathering Player Data
Batter: Mike Trout
Found 4480 observations from 2019-01-01 to 2022-01-01.



Unnamed: 0,pitch_type,game_date,release_speed,release_pos_x,release_pos_z,player_name,batter,pitcher,events,description,...,fld_score,post_away_score,post_home_score,post_bat_score,post_fld_score,if_fielding_alignment,of_fielding_alignment,spin_axis,delta_home_win_exp,delta_run_exp
0,FF,2021-05-17,95.1,1.85,6.60,"Trout, Mike",545361,656529,walk,ball,...,1,1,0,0,1,Standard,Standard,136.0,0.022,0.187
1,FF,2021-05-17,93.6,1.58,6.66,"Trout, Mike",545361,656529,,ball,...,1,1,0,0,1,Standard,Standard,146.0,0.016,0.137
2,CU,2021-05-17,80.2,1.64,6.77,"Trout, Mike",545361,656529,,called_strike,...,1,1,0,0,1,Standard,Standard,321.0,0.000,-0.061
3,CU,2021-05-17,79.7,1.70,6.64,"Trout, Mike",545361,656529,,ball,...,1,1,0,0,1,Standard,Standard,321.0,0.000,0.070
4,FF,2021-05-17,94.6,1.79,6.66,"Trout, Mike",545361,656529,,ball,...,1,1,0,0,1,Standard,Standard,144.0,0.000,0.047
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4475,,2019-02-26,,,,"Trout, Mike",545361,407845,,ball,...,3,3,3,3,3,,,,0.000,
4476,,2019-02-26,,,,"Trout, Mike",545361,407845,,ball,...,3,3,3,3,3,,,,0.000,
4477,,2019-02-26,,,,"Trout, Mike",545361,571666,single,hit_into_play,...,0,0,0,0,0,,,,-0.036,
4478,,2019-02-24,,,,"Trout, Mike",545361,607219,grounded_into_double_play,hit_into_play,...,5,5,0,0,5,,,,-0.048,


In [7]:
categories = {
    "K":    0,
    "B":    1,
    "F":    2,
    "FO":   3,
    "HBP":  4,
    "1B":   5,
    "2B":   6,
    "HR":   7,
}

In [8]:
sd["Outcome"] = sd["Outcome"].apply(lambda x: categories.get(x)).astype(int).apply(to_categorical, num_classes=8, dtype=int)

NameError: name 'sd' is not defined

In [9]:
sample = sd.sample(frac=0.2)

NameError: name 'sd' is not defined

In [230]:
class markovModel:
    def __init__(self, pitcher = player, batter = player, classifier = "logistic", verbose = False) -> None:
        dataFeatures = np.array(["release_spin_rate","plate_x","plate_z","vx0","vy0","vz0"])
        
        # We will store data here
        self.modelData = dict()

        # See if the batter object has already been initialized with data, otherwise init with default settings.
        try:
            data = batter.statcastData
        except AttributeError:
            data = batter.getStatcastData(playerType="batter")
        finally:
            self.modelData["batter"] = processData(data)

        # See if the pitcher object has already been initialized with data, otherwise init with default settings.
        try:
            data = pitcher.statcastData
        except AttributeError:
            data = pitcher.getStatcastData(playerType="pitcher")
        finally:
            self.modelData["pitcher"] = processData(data)

        self.pitcher = pitcher
        self.batter = batter

        batterTest = self.modelData["batter"].sample(frac=0.2)
        batterTrain = pd.concat([self.modelData["batter"], batterTest]).drop_duplicates(keep=False)

        if classifier == "logistic":
            from sklearn import linear_model
            model = linear_model.LogisticRegression(multi_class='multinomial')

            X_train = batterTrain[dataFeatures].values
            y_train = batterTrain["Outcome"].values

            X_test = batterTest[dataFeatures].values
            y_test = batterTest["Outcome"].values

            self.batterModel = model.fit(X_train, y_train)

            self.score = model.score(X_test, y_test)

            if verbose:
                print("Model accuracy score:", self.score)

                print(metrics.classification_report(y_test, model.predict(X_test)))

        elif classifier == "neural":
            # Have to convert into categorical type that keras accepts
            categories = {
                "K":    0,
                "B":    1,
                "F":    2,
                "FO":   3,
                "HBP":  4,
                "1B":   5,
                "2B":   6,
                "HR":   7,
            }
            data = self.modelData["batter"]
            data["Outcome"] = data["Outcome"].apply(lambda x: categories.get(x)).astype(int).apply(to_categorical, num_classes=8, dtype=int)
            print(data)
            


In [232]:
markovModel(pitcher, batter, classifier="logistic", verbose=True)

Model accuracy score: 0.44457831325301206
              precision    recall  f1-score   support

          1B       0.00      0.00      0.00        24
          2B       0.00      0.00      0.00         8
           B       0.44      1.00      0.62       369
           F       0.00      0.00      0.00       120
          FO       0.00      0.00      0.00        71
         HBP       0.00      0.00      0.00         5
          HR       0.00      0.00      0.00        10
           K       0.00      0.00      0.00       223

    accuracy                           0.44       830
   macro avg       0.06      0.12      0.08       830
weighted avg       0.20      0.44      0.27       830



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<__main__.markovModel at 0x1f5314b58d0>

In [244]:
def swingOrTake(x):
    # Types of events we classify as strikes, fouls, balls, and field outs
    swingEvents = np.array([
        "foul_tip", "foul", "foul_bunt", "swinging_strike", "swinging_strike_blocked", 
        "missed_bunt", "force_out","field_error","field_out","fielders_choice","fielders_choice_out",
        "grounded_into_double_play","double_play","sac_fly","single","double","triple","home_run"])
    takeEvents = np.array(["called_strike", "ball", "blocked_ball", "hit_by_pitch"])

    events = x["events"]
    description = x["description"]

    if np.isin(description, swingEvents):
        return 1
    elif np.isin(description, takeEvents):
        return 0
    elif np.isin(events, swingEvents):
        return 1
    elif np.isin(events, takeEvents):
        return 0

def ballsAndStrikes(x):
    b = x["balls"]
    s = x["strikes"]

    return 3*b + s

cols = pd.Series([
    # "release_speed",
    "release_spin_rate",
    "pfx_x",
    "pfx_z",
    "plate_x",
    "plate_z",
    "vx0",
    "vy0",
    "vz0"
])

def verifyModel(x):
    batter = player(key_fangraphs = x)
    print(batter.playerName)
    bd = batter.getStatcastData(dateRange=["2021-01-01","2022-01-01"], playerType="batter")

    swings = bd.apply(swingOrTake, axis=1)
    counts = pd.DataFrame(to_categorical(bd.apply(ballsAndStrikes, axis=1), num_classes=12), dtype=int).rename(lambda x: "c" + str(x), axis=1)

    bd = bd[cols]

    bd["plate_xz"] = np.sqrt(np.power(bd["plate_x"], 2) + np.power(bd["plate_z"], 2))
    bd = bd.join(counts)

    bd["Swing"] = swings
    bd = bd.dropna().astype({"Swing": int})

    bd_test = bd.sample(frac=0.1)
    bd_train = pd.concat([bd, bd_test]).drop_duplicates(keep=False)

    y_train = bd_train["Swing"].values
    X_train = bd_train.drop("Swing", axis=1).values

    y_test = bd_test["Swing"].values
    X_test = bd_test.drop("Swing", axis=1).values

    # print(np.round(bd[features].corr(method="pearson"), 3))

    scaler = preprocessing.StandardScaler().fit(X_train)

    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    bd = None

    # Logistic Model
    lm = linear_model.LogisticRegression()
    lm.fit(X_train, y_train)
    ms = np.round(lm.score(X_test, y_test), 3)

    # Neural Network Model
    neural = Sequential()
    neural.add(Dense(8, input_dim=X_train.shape[1], activation='relu'))
    neural.add(Dense(5, activation='softmax'))
    neural.add(Dense(1, activation='sigmoid'))

    neural.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    neural.fit(X_train, y_train, epochs=100, batch_size=10)

    ns = neural.evaluate(X_test, y_test)[1]

    # print(np.round(lm.score(X_test, y_test), 3))

    # # print(metrics.classification_report(y_test, model.predict(X_test)))

    # print(np.round(lm.score(X_test, y_test), 3))

    # print(metrics.classification_report(y_test, lm.predict(X_test)))

    #print(neural.evaluate(X_test, y_test))

    return pd.Series([batter.playerName, int(batter.statcastData.shape[0]*0.8), ms, ns], index=["Name","n","Logistic", "Neural"])

In [157]:
bd = batter.statcastData

swings = bd.apply(swingOrTake, axis=1)
counts = pd.DataFrame(to_categorical(bd.apply(ballsAndStrikes, axis=1), num_classes=12), dtype=int).rename(lambda x: "c" + str(x), axis=1)

bd["plate_xz"] = np.sqrt(np.power(bd["plate_x"], 2) + np.power(bd["plate_z"], 2))

bd = bd[cols]
bd = bd.join(counts)

bd["Swing"] = swings
bd = bd.dropna().astype({"Swing": int})

features = pd.Series(bd.columns.values)

bd_test = bd.sample(frac=0.2)
bd_train = pd.concat([bd, bd_test]).drop_duplicates(keep=False)

y_train = bd_train["Swing"].values
X_train = bd_train.drop("Swing", axis=1).values

y_test = bd_test["Swing"].values
X_test = bd_test.drop("Swing", axis=1).values

print(np.round(bd[features].corr(method="pearson"), 3))

scaler = preprocessing.StandardScaler().fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

bd = None

# Logistic Model
lm = linear_model.LogisticRegression()
lm.fit(X_train, y_train)
ms = np.round(lm.score(X_test, y_test), 3)

# Neural Network Model
# neural = Sequential()
# neural.add(Dense(8, input_dim=features.shape[0], activation='softmax'))
# neural.add(Dense(5, input_dim=features.shape[0], activation='softplus'))
# neural.add(Dense(1, activation='sigmoid'))

# neural.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# neural.fit(X_train, y_train, epochs=150, batch_size=10, verbose=0)

ns = 0#neural.evaluate(X_test, y_test)[1]

# print(np.round(lm.score(X_test, y_test), 3))

# # print(metrics.classification_report(y_test, model.predict(X_test)))

# print(np.round(lm.score(X_test, y_test), 3))

# print(metrics.classification_report(y_test, lm.predict(X_test)))

#print(neural.evaluate(X_test, y_test))

                   release_spin_rate  pfx_x  pfx_z  plate_x  plate_z  \
release_spin_rate              1.000  0.150 -0.168    0.101   -0.007   
pfx_x                          0.150  1.000 -0.241    0.205   -0.048   
pfx_z                         -0.168 -0.241  1.000   -0.122    0.410   
plate_x                        0.101  0.205 -0.122    1.000   -0.218   
plate_z                       -0.007 -0.048  0.410   -0.218    1.000   
plate_xz                       0.023 -0.020  0.363   -0.135    0.935   
vx0                            0.094 -0.558  0.009    0.361   -0.098   
vy0                            0.015  0.329 -0.741    0.126   -0.343   
vz0                            0.072  0.208 -0.499   -0.062    0.465   
c0                             0.040  0.009 -0.003    0.005    0.015   
c1                             0.004 -0.011  0.009    0.004    0.014   
c2                             0.011 -0.002 -0.006    0.023    0.024   
c3                            -0.012 -0.006 -0.008   -0.022   -0

array([[-0.68605297,  0.70997439,  0.56824935, ..., -0.11713032,
         5.5651733 , -0.27640573],
       [-0.6317017 ,  0.80589592,  0.59483762, ..., -0.11713032,
        -0.17968892, -0.27640573],
       [ 0.221933  , -0.24924092, -2.3298723 , ..., -0.11713032,
        -0.17968892, -0.27640573],
       ...,
       [-0.60932176,  0.86584688,  0.54166108, ..., -0.11713032,
        -0.17968892, -0.27640573],
       [-1.36704246,  1.77710143, -0.36234017, ..., -0.11713032,
        -0.17968892, -0.27640573],
       [-0.29919979,  1.22555262,  0.46189626, ..., -0.11713032,
        -0.17968892, -0.27640573]])

In [240]:
sample = pyb.batting_stats(start_season=2021, end_season=2021, qual=400)["IDfg"].sample(n=10)

In [245]:
scores = sample.apply(verifyModel)
scores

Dj Lemahieu
Gathering Player Data
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100


  mask |= (ar1 == a)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

  mask |= (ar1 == a)


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

Unnamed: 0,Name,n,Logistic,Neural
132,Dj Lemahieu,2309,0.761,0.775439
40,Jonathan India,2209,0.699,0.744361
82,Kolten Wong,1570,0.749,0.796791
39,Yuli Gurriel,2088,0.685,0.728346
35,J. D. Martinez,2164,0.751,0.812261
21,Brandon Crawford,1832,0.739,0.811927
145,Sean Murphy,1397,0.657,0.739645
0,Bryce Harper,2075,0.73,0.776062
150,Anthony Santander,1537,0.778,0.783784
23,Paul Goldschmidt,2366,0.725,0.824742


In [246]:
print(scores[["n", "Logistic", "Neural"]].mean())
print(scores[["n", "Logistic", "Neural"]].std())

n           1954.700000
Logistic       0.727400
Neural         0.779336
dtype: float64
n           347.045322
Logistic      0.037149
Neural        0.033261
dtype: float64
