In [1]:
import pandas as pd

In [2]:
# Load the pitch-level CSV into a DataFrame (the file name suggests 2016-2018 coverage).
statcast = pd.read_csv("data/pitches_2016_2018.csv")

In [4]:
# Keep only rows whose game_date falls strictly between 2018-01-01 and 2019-01-01.
# NOTE(review): game_date is compared against string literals, so this relies on the
# column holding ISO-formatted (YYYY-MM-DD) strings -- confirm against the CSV.
statcast = statcast.loc[(statcast["game_date"] < "2019-01-01") & (statcast["game_date"] > "2018-01-01")]

In [6]:
def get_obp_series(df):
    """Return on-base percentage per row: (H + BB + HBP) / (AB + BB + HBP + SF)."""
    times_on_base = df["H"] + df["BB"] + df["HBP"]
    opportunities = df["AB"] + df["BB"] + df["HBP"] + df["SF"]
    return times_on_base / opportunities

def get_slg_series(df):
    """Return slugging percentage per row: total bases divided by at-bats.

    Singles are derived as H minus extra-base hits (2B, 3B, HR).
    """
    doubles, triples, homers = df["2B"], df["3B"], df["HR"]
    singles = df["H"] - homers - doubles - triples
    total_bases = singles + 2*doubles + 3*triples + 4*homers
    return total_bases / df["AB"]

def get_ops_series(df):
    """Return OPS per row: on-base percentage plus slugging percentage."""
    on_base = get_obp_series(df)
    slugging = get_slg_series(df)
    return on_base + slugging

def get_avg_series(df):
    """Return batting average per row: H / AB."""
    hits, at_bats = df["H"], df["AB"]
    return hits / at_bats

In [26]:
def get_pca5_series_pitcher(df):
    """Count, per pitcher, the (game_date, batter, inning) groups with more than 5 rows.

    Rows are grouped by pitcher, game_date, batter and inning; groups larger than
    five rows are kept and then tallied per pitcher. Pitchers with no such group
    are absent from the result.
    """
    rows_per_matchup = df.groupby(["pitcher", "game_date", "batter", "inning"]).size()
    long_matchups = rows_per_matchup[rows_per_matchup > 5]
    return long_matchups.reset_index().groupby("pitcher").size()

In [65]:
def get_pca5_series_batter(df):
    """Count, per batter, the (game_date, inning) groups with more than 5 rows.

    Rows are grouped by batter, game_date and inning; groups larger than five
    rows are kept and then tallied per batter. Batters with no such group are
    absent from the result.
    """
    rows_per_trip = df.groupby(["batter", "game_date", "inning"]).size()
    long_trips = rows_per_trip[rows_per_trip > 5]
    return long_trips.reset_index().groupby("batter").size()

In [60]:
def get_bf(df):
    """Return, per pitcher, the number of rows that have a non-null `events` value."""
    has_event = df["events"].notna()
    return df[has_event].groupby(["pitcher"]).size()

In [51]:
# returns tuple (release_speed_mean, effective_speed_mean, pfx_x_mean, pfx_z_mean)
def get_movement_series(df):
    """Return per-pitcher absolute mean velocity and movement as four Series.

    Parameters
    ----------
    df : DataFrame with columns pitcher, release_speed, effective_speed, pfx_x, pfx_z.

    Returns
    -------
    tuple of Series (release_speed_mean, effective_speed_mean, pfx_x_mean,
    pfx_z_mean), each indexed by the original `pitcher` values.

    The index is left untouched: the previous `rename(index=str, ...)` turned the
    pitcher ids into strings, which prevents these Series from aligning with
    other pitcher-indexed Series when concatenated column-wise.
    """
    columns = ["release_speed", "effective_speed", "pfx_x", "pfx_z"]
    # NOTE(review): abs() is applied to the per-pitcher mean (not to each pitch),
    # so opposite-signed movement cancels before the absolute value is taken.
    means = df.groupby("pitcher").agg({col: "mean" for col in columns}).abs()
    means = means.rename(columns={col: col + "_mean" for col in columns})
    return tuple(means[col + "_mean"] for col in columns)

In [77]:
def get_event_series_pitcher(df, event):
    """Return, per pitcher, how many rows have `events` equal to `event`."""
    matching = df[df["events"] == event]
    return matching.groupby("pitcher").size()

def get_event_series_batter(df, event):
    """Return, per batter, how many rows have `events` equal to `event`."""
    matching = df[df["events"] == event]
    return matching.groupby("batter").size()

In [91]:
# this only estimates ip, it is not perfect
def get_ip_series(df):
    """Estimate innings pitched per pitcher.

    For every (pitcher, game_date) pair the number of distinct `inning` values is
    counted, and those counts are summed per pitcher. Partial innings therefore
    count as whole ones.
    """
    innings_per_game = df.groupby(["pitcher", "game_date"])["inning"].nunique()
    per_game = innings_per_game.reset_index()
    return per_game.groupby("pitcher")["inning"].sum()

In [99]:
def get_whip_series(df):
    """Return (walks + hits) / estimated innings pitched, per pitcher.

    Walks are rows with events == "walk"; hits are rows whose event is single,
    double, triple or home_run. Innings come from get_ip_series, which is only
    an estimate.

    A pitcher with no walks (or no hits) has no entry in the corresponding count
    Series and would show up as NaN after the column-wise concat; those gaps are
    treated as 0 so the pitcher still gets a numeric WHIP.
    """
    events = df["events"]
    walks = df.loc[events == "walk"].groupby("pitcher").size()
    hits = df.loc[events.isin(["single", "double", "triple", "home_run"])].groupby("pitcher").size()
    stuff = pd.concat([get_ip_series(df), walks, hits], keys=["ip", "walks", "hits"], axis=1)
    baserunners = stuff["walks"].fillna(0) + stuff["hits"].fillna(0)
    return baserunners / stuff["ip"]

In [67]:
def remove_infreq_pitchers(df, min_bf=70):
    """Drop rows belonging to pitchers with too few batters faced.

    Parameters
    ----------
    df : DataFrame accepted by get_bf, with a `pitcher` column.
    min_bf : pitchers are kept only if their get_bf count is strictly greater
        than this value (default 70, the previously hard-coded threshold).

    Returns
    -------
    The subset of `df` whose pitcher passes the threshold.
    """
    bf = get_bf(df)
    frequent_pitchers = bf.loc[bf > min_bf].index
    return df.loc[df["pitcher"].isin(frequent_pitchers)]

In [141]:
def make_pitcher_df(df):
    """Build a per-pitcher summary frame from pitch-level rows.

    Infrequent pitchers are removed first; every statistic, including the
    movement means, is then computed from that same filtered frame. (The
    movement means were previously read from a global `statcast_p` that is not
    defined in this notebook.)

    Returns a DataFrame indexed by pitcher with columns
    SO, HR, BB, PCA5, BF, WHIP, avg_speed, avg_x, avg_z.
    """
    df = remove_infreq_pitchers(df)
    # get_movement_series returns (release_speed, effective_speed, pfx_x, pfx_z) means;
    # release speed is discarded and effective speed is used as avg_speed.
    _, avg_speed, avg_x, avg_z = get_movement_series(df)
    return pd.concat([get_event_series_pitcher(df, "strikeout"),
                      get_event_series_pitcher(df, "home_run"),
                      get_event_series_pitcher(df, "walk"), # I don't think we can do IBB
                      get_pca5_series_pitcher(df),
                      get_bf(df),
                      get_whip_series(df),
                      avg_speed,
                      avg_x,
                      avg_z
                     ],
                     keys=["SO", "HR", "BB", "PCA5", "BF", "WHIP", "avg_speed", "avg_x", "avg_z"],
                     axis=1)

def make_efp_series(pitcher_df):
    """Combine the per-pitcher columns into a single weighted score.

    score = (SO - (0.5*PCA5 + 3*HR + 3*BB)) / BF
            - WHIP + 0.1*avg_z + 0.1*avg_x + 0.3*avg_speed
    """
    stats = pitcher_df
    # WE DON'T HAVE IBB, only BB
    penalties = 0.5*stats["PCA5"] + 3.0*stats["HR"] + 3.0*stats["BB"]
    per_batter = (1.0*stats["SO"] - penalties) / stats["BF"]
    return per_batter - 1.0*stats["WHIP"] + 0.1*stats["avg_z"] + 0.1*stats["avg_x"] + 0.3*stats["avg_speed"]

In [132]:
#make_efp_series(make_pitcher_df(statcast)).sort_values(ascending=False)

In [154]:
from pybaseball import batting_stats, batting_stats_bref

In [155]:
# 2018 batting stats via pybaseball's batting_stats; later cells read its "Name" column.
b = batting_stats(2018)

In [296]:
# NOTE(review): playerid_lookup is only imported/defined in cells that appear later in
# this notebook, so this cell depends on out-of-order execution.
playerid_lookup("ramirez", "jose").dropna()

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
6,ramirez,jose,542432,ramij004,ramirjo02,10171,2014.0,2018.0
15,ramirez,jose,608070,ramij003,ramirjo01,13510,2013.0,2019.0


In [289]:
# https://github.com/jldbc/pybaseball/pull/49/files
# customized because this data is awful
def batch_playerid_lookup(names, year=None):
    """
    Gets player id's for a list of player names.

    names : iterable : (last, first) pairs, e.g. [[last, first], [last1, first1]].
        A falsy `first` (None or '') matches on last name only.
    year : optional : when given, only players whose mlb_played_first..mlb_played_last
        range contains `year` are returned.
    return : DataFrame : the matching lookup-table rows for every requested player,
        concatenated in request order. When a (last, first) pair cannot be resolved
        even after the spelling fallbacks below, a single placeholder row with
        key_mlbam == -1 is emitted for it. An empty `names` yields an empty frame
        with the lookup table's columns.
    """
    from pybaseball.playerid_lookup import get_lookup_table
    table = get_lookup_table()

    def _query(last, first):
        # Single place for the name/year filter that every attempt uses.
        mask = table['name_last'] == last
        if first is not None:
            mask = mask & (table['name_first'] == first)
        if year is not None:
            mask = mask & (table['mlb_played_first'] <= year) & (table['mlb_played_last'] >= year)
        return table.loc[mask]

    player_ids = []
    for name in names:
        last = name[0].lower()
        # Always rebind `first`, so a missing first name neither raises nor
        # silently reuses the previous player's first name.
        first = name[1].lower() if name[1] else None

        results = _query(last, first)

        if first is not None:
            # Retry with progressively rewritten spellings; each pass applies at most
            # one rewrite, and the attempt count is capped so an unmatched name ends.
            iterations = 0
            while len(results) == 0 and iterations < 10:
                iterations += 1
                if 'jr.' in last:
                    last = last.replace('jr.', '').strip()
                elif 'jr' in last:
                    last = last.replace('jr', '').strip()
                elif '.' in first and ' ' in first:
                    first = first.replace(' ', '').replace('.', '')
                elif '.' in first:
                    first = first[:first.index('.') + 1] + ' ' + first[first.index('.') + 1:]
                elif first == 'nicholas':
                    first = 'nick'
                elif first == 'yolmer' and last == 'sanchez':
                    first = 'carlos'
                elif first == 'raffy' and last == 'lopez':
                    first = 'rafael'
                elif last == 'ervin':
                    first = 'phil'
                elif last == 'wheeler' and first == 'zack':
                    first = 'zach'
                elif last == 'joyce' and first == 'matt':
                    first = 'matthew'
                elif last == 'vogelbach' and first == 'daniel':
                    first = 'dan'
                elif last == 'kang':
                    first = 'jung ho'
                elif last == 'urshela':
                    first = 'gio'
                elif last == 'poncedeleon':
                    last = 'ponce de leon'
                elif last == 'bowman' and first == 'matt':
                    first = 'matthew'
                elif last == 'chargois':
                    first = 'j. t.'
                elif last == 'boyd' and first == 'matthew':
                    first = 'matt'
                elif last == 'gosselin' and first == 'phil':
                    first = 'philip'
                elif last == 'guerra' and first == 'javier':
                    first = 'javy'
                elif last == 'delmonico' and first == 'nicky':
                    first = 'nick'
                elif last == 'wilkerson' and first == 'steve':
                    first = 'stevie'
                elif first == 'john' and last == 'ryan murphy':
                    first = 'j. r.'
                    last = 'murphy'
                else:
                    # Last resort: strip short (<= 3 char) pieces out of the last name.
                    for name_bit in last.split(' '):
                        if len(name_bit) <= 3:
                            last = last.replace(name_bit, '').strip()

                results = _query(last, first)

            # Key the placeholder on "still nothing found" rather than on the attempt
            # count, so a match found on the final attempt is not thrown away.
            if len(results) == 0:
                results = pd.DataFrame([[
                    last,
                    first,
                    -1,
                    'fail001',
                    'fail01',
                    -1,
                    -1,
                    -1
                ]], columns=[
                    'name_last',
                    'name_first',
                    'key_mlbam',
                    'key_retro',
                    'key_bbref',
                    'key_fangraphs',
                    'mlb_played_first',
                    'mlb_played_last'])

        player_ids.append(results.reset_index(drop=True))

    if not player_ids:
        return table.iloc[0:0].reset_index(drop=True)
    # One concat instead of growing a frame inside a loop.
    return pd.concat(player_ids).reset_index(drop=True)

In [290]:
def make_last_first_list(name):
    """Split a full name on spaces into a (last, first) tuple.

    The first space-separated token is the first name; everything after it,
    re-joined with spaces, is the last name.
    """
    first, *rest = name.split(' ')
    return ' '.join(rest), first

# Resolve every name in b["Name"] to lookup-table rows for 2018;
# dropna() then discards any row that has a missing field.
bpl = batch_playerid_lookup(b["Name"].apply(make_last_first_list), year=2018).dropna()

Gathering player lookup table. This may take a moment.


In [310]:
from pybaseball.playerid_lookup import get_lookup_table
# Fetched once here; playerid_lookup below reads this module-level table on every call.
lookup_table = get_lookup_table()

def playerid_lookup(last, first=None, year=None):
    """Look up player id rows by name in the module-level `lookup_table`.

    Parameters
    ----------
    last : last name (case-insensitive).
    first : optional first name (case-insensitive). When None, only the last
        name is matched and no spelling fallbacks are tried.
    year : optional; when given, only players whose
        mlb_played_first..mlb_played_last range contains it are returned.

    Returns
    -------
    DataFrame of matching lookup-table rows with a fresh 0..n-1 index; empty if
    nothing matched, even after the spelling fallbacks below.
    """
    # force input strings to lowercase
    last = last.lower()
    if first:
        first = first.lower()
    table = lookup_table

    def _query(last, first):
        # Single place for the name/year filter that every attempt uses.
        mask = table['name_last'] == last
        if first is not None:
            mask = mask & (table['name_first'] == first)
        if year is not None:
            mask = mask & (table['mlb_played_first'] <= year) & (table['mlb_played_last'] >= year)
        return table.loc[mask]

    results = _query(last, first)

    if first is not None:
        # Retry with progressively rewritten spellings; each pass applies at most one
        # rewrite, and the attempt count is capped so an unmatched name terminates.
        iterations = 0
        while len(results) == 0 and iterations < 10:
            iterations += 1
            if 'jr.' in last:
                last = last.replace('jr.', '').strip()
            elif 'jr' in last:
                last = last.replace('jr', '').strip()
            elif '.' in first and ' ' in first:
                first = first.replace(' ', '').replace('.', '')
            elif '.' in first:
                first = first[:first.index('.') + 1] + ' ' + first[first.index('.') + 1:]
            elif first == 'nicholas':
                first = 'nick'
            elif first == 'yolmer' and last == 'sanchez':
                first = 'carlos'
            elif first == 'raffy' and last == 'lopez':
                first = 'rafael'
            elif last == 'ervin':
                first = 'phil'
            elif last == 'wheeler' and first == 'zack':
                first = 'zach'
            elif last == 'joyce' and first == 'matt':
                first = 'matthew'
            elif last == 'vogelbach' and first == 'daniel':
                first = 'dan'
            elif last == 'kang':
                first = 'jung ho'
            elif last == 'urshela':
                first = 'gio'
            elif last == 'poncedeleon':
                last = 'ponce de leon'
            elif last == 'bowman' and first == 'matt':
                first = 'matthew'
            elif last == 'chargois':
                first = 'j. t.'
            elif last == 'boyd' and first == 'matthew':
                first = 'matt'
            elif last == 'gosselin' and first == 'phil':
                first = 'philip'
            elif last == 'guerra' and first == 'javier':
                first = 'javy'
            elif last == 'delmonico' and first == 'nicky':
                first = 'nick'
            elif last == 'wilkerson' and first == 'steve':
                first = 'stevie'
            elif first == 'john' and last == 'ryan murphy':
                first = 'j. r.'
                last = 'murphy'
            else:
                # Last resort: strip short (<= 3 char) pieces out of the last name.
                for name_bit in last.split(' '):
                    if len(name_bit) <= 3:
                        last = last.replace(name_bit, '').strip()

            results = _query(last, first)

    # reset_index(drop=True) replaces reset_index().drop('index', 1): the positional
    # axis argument to DataFrame.drop is not accepted by pandas 2.x.
    return results.reset_index(drop=True)

def get_mlbam_from_name(last, first=None, year=None):
    """Return the key_mlbam of the first complete lookup row for a player name.

    Parameters
    ----------
    last, first, year : forwarded to playerid_lookup.

    Returns
    -------
    The key_mlbam value of the first row that has no missing fields, or -1
    (after printing the offending name) when no spelling variant resolves.

    On a miss the first name is rewritten once and the lookup is retried with
    the same `year`. A missing first name cannot be rewritten, so it fails
    straight away instead of raising a TypeError.
    """
    try:
        return playerid_lookup(last, first, year=year).dropna().reset_index(drop=True)["key_mlbam"].iloc[0]
    except IndexError:
        if first is None:
            print("bad name: %s, %s" % (last, first))
            return -1

        if '.' in first and ' ' in first:
            first = first.replace(' ', '').replace('.', '')
        elif '.' in first:
            # j.d. martinez -> j. d. martinez
            first = first[:first.index('.') + 1] + ' ' + first[first.index('.') + 1:]
        elif first.lower() == 'nicholas':
            first = 'nick'
        elif first.lower() == 'yolmer' and last.lower() == 'sanchez':
            first = 'carlos'
        else:
            #raise Exception("your name bad: %s, %s" % (last, first))
            print("bad name: %s, %s" % (last, first))
            return -1

        return get_mlbam_from_name(last, first, year=year)
        # j.d. martinez -> j. d. martinez

def add_mlbam_to_fg(df_fg):
    """Add a `key_mlbam` column to `df_fg`, resolved from its `Name` column.

    Each name is split on spaces: the first token is the first name and the rest
    is the last name, and the pair is passed to get_mlbam_from_name. The column
    is written onto `df_fg` itself, and that same frame is returned.
    """
    def _resolve(full_name):
        first, *rest = full_name.split(' ')
        return get_mlbam_from_name(' '.join(rest), first)

    df_fg["key_mlbam"] = df_fg["Name"].apply(_resolve)
    return df_fg

Gathering player lookup table. This may take a moment.


In [None]:
# NOTE: add_mlbam_to_fg writes the key_mlbam column onto b itself before this
# four-column subset is taken.
df_asdf = add_mlbam_to_fg(b)[["Season", "Name", "Team", "key_mlbam"]]

In [None]:
# Rows whose name could not be resolved (get_mlbam_from_name returns -1 in that case).
df_asdf.loc[(df_asdf["key_mlbam"] == -1)]

In [42]:
from pybaseball import playerid_lookup, playerid_reverse_lookup

In [130]:
# Reverse lookup: MLBAM id -> player row(s).
playerid_reverse_lookup([663855.0], key_type="mlbam")

Gathering player lookup table. This may take a moment.


Unnamed: 0,name_last,name_first,key_mlbam,key_retro,key_bbref,key_fangraphs,mlb_played_first,mlb_played_last
0,hicks,jordan,663855,hickj002,hicksjo03,19618,2018.0,2019.0


In [208]:
# Inspect the columns available in the batting-stats frame.
b.columns

Index(['Season', 'Name', 'Team', 'Age', 'G', 'AB', 'PA', 'H', '1B', '2B',
       ...
       'wSL/C (pi)', 'wXX/C (pi)', 'O-Swing% (pi)', 'Z-Swing% (pi)',
       'Swing% (pi)', 'O-Contact% (pi)', 'Z-Contact% (pi)', 'Contact% (pi)',
       'Zone% (pi)', 'Pace (pi)'],
      dtype='object', length=287)

In [209]:
# 2018 batting stats via pybaseball's batting_stats_bref.
b_bref = batting_stats_bref(2018)