In [1]:
import pandas as pd

In [2]:
statcast = pd.read_csv("data/pitches_2016_2018.csv")

In [4]:
statcast = statcast.loc[(statcast["game_date"] < "2019-01-01") & (statcast["game_date"] > "2018-01-01")]

In [6]:
def get_obp_series(df):
    return (df["H"] + df["BB"] + df["HBP"]) / (df["AB"] + df["BB"] + df["HBP"] + df["SF"])

def get_slg_series(df):
    singles = df["H"] - df["HR"] - df["2B"] - df["3B"]
    return (singles + 2*df["2B"] + 3*df["3B"] + 4*df["HR"]) / df["AB"]

def get_ops_series(df):
    return get_obp_series(df) + get_slg_series(df)

def get_avg_series(df):
    return (df["H"]) / (df["AB"])

In [26]:
def get_pca5_series_pitcher(df):
    pab = df.groupby(["pitcher", "game_date", "batter", "inning"]).size()
    return pab.loc[pab > 5].to_frame().reset_index().groupby("pitcher").size()

In [65]:
def get_pca5_series_batter(df):
    bab = df.groupby(["batter", "game_date", "inning"]).size()
    return bab.loc[bab > 5].to_frame().reset_index().groupby("batter").size()

In [60]:
def get_bf(df):
    return df.loc[df["events"].notna()].groupby(["pitcher"]).size()

In [51]:
# returns tuple (release_speed_mean, effective_speed_mean, pfx_x_mean, pfx_z_mean)
def get_movement_series(df):
    d = df.groupby("pitcher").agg({"release_speed": "mean",
                                 "effective_speed": "mean",
                                 "pfx_x": "mean", 
                                 "pfx_z": "mean"}).abs().rename(index=str, columns={"release_speed": "release_speed_mean",
                                                                 "effective_speed": "effective_speed_mean",
                                                                 "pfx_x": "pfx_x_mean",
                                                                 "pfx_z": "pfx_z_mean"})
    return (d["release_speed_mean"], d["effective_speed_mean"], d["pfx_x_mean"], d["pfx_z_mean"])

In [77]:
def get_event_series_pitcher(df, event):
    return df.loc[(df["events"] == event)].groupby("pitcher").size()

def get_event_series_batter(df, event):
    return df.loc[(df["events"] == event)].groupby("batter").size()

In [91]:
# this only estimates ip, it is not perfect
def get_ip_series(df):
    return df.groupby(["pitcher", "game_date"])["inning"].nunique().to_frame().reset_index().groupby("pitcher")["inning"].sum()

In [99]:
def get_whip_series(df):
    walks = df.loc[(df["events"] == "walk")].groupby("pitcher").size()
    hits = df.loc[(df["events"] == "single") |
                        (df["events"] == "double") |
                        (df["events"] == "triple") |
                        (df["events"] == "home_run")].groupby("pitcher").size()
    stuff = pd.concat([get_ip_series(df), walks, hits], keys=["ip", "walks", "hits"], axis=1)
    return (stuff["walks"] + stuff["hits"]) / stuff["ip"]

In [67]:
def remove_infreq_pitchers(df):
    bf = get_bf(df)
    bf.loc[bf > 70].index

    return df.loc[(df["pitcher"].isin(bf.loc[bf > 70].index))]

In [103]:
def make_pitcher_df(df):
    df = remove_infreq_pitchers(df)
    _, avg_speed, avg_x, avg_z = get_movement_series(statcast_p)
    return pd.concat([get_event_series_pitcher(df, "strikeout"),
                      get_event_series_pitcher(df, "home_run"),
                      get_event_series_pitcher(df, "walk"), # I don't think we can do IBB
                      get_pca5_series_pitcher(df),
                      get_bf(df),
                      get_whip_series(df),
                      avg_speed,
                      avg_x,
                      avg_z
                     ],
                     keys=["SO", "HR", "BB", "PCA5", "BF", "WHIP", "avg_speed", "avg_x", "avg_z"],
                     axis=1)

In [105]:
def make_efp_series(pitcher_df):
    df = pitcher_df
    # WE DON'T HAVE IBB, only BB
    return (1.0*df["SO"] - (0.5*df["PCA5"] + 3.0*df["HR"] + 3.0*df["BB"])) / df["BF"] - \
            1.0*df["WHIP"] + 0.1*df["avg_z"] + 0.1*df["avg_x"] + 0.3*df["avg_speed"]

In [115]:
make_efp_series(make_pitcher_df(statcast)).sort_values(ascending=False)

pitcher
663855.0    27.943128
547973.0    27.886378
621242.0    27.876568
622554.0    27.760640
453344.0    27.701912
643338.0    27.604235
448281.0    27.566962
621237.0    27.554503
595014.0    27.551103
594027.0    27.440869
594798.0    27.385340
642152.0    27.315820
623352.0    27.300859
622259.0    27.163126
607192.0    27.120136
445276.0    27.102669
518489.0    27.099703
592789.0    27.075767
592773.0    27.024761
488984.0    26.956611
641729.0    26.932500
605151.0    26.932053
621345.0    26.907919
622772.0    26.898433
520980.0    26.895929
571710.0    26.886311
572021.0    26.872022
571561.0    26.870487
554430.0    26.864035
502083.0    26.840247
              ...    
657610.0    23.616710
592811.0    23.607833
446399.0    23.585023
543548.0    23.540358
621389.0    23.491989
435043.0    23.483427
493247.0    23.482703
621397.0    23.451612
460283.0    23.401049
453281.0    23.255541
446899.0    23.195517
458708.0    23.189536
450306.0    23.159886
598271.0    23.140476
64

In [42]:
from pybaseball import playerid_lookup, playerid_reverse_lookup

In [122]:
playerid_reverse_lookup([547973.0], key_type="mlbam")

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,chapman,aroldis,547973,chapa001,chapmar01,10233,2010.0,2019.0
