In [1]:
import preprocessing_data as prep
import pandas as pd
import numpy as np
pd.set_option('display.max_row', 1000)
pd.set_option('display.max_columns', 50)

# Load Season Player Data

All features:   
date,stadium,inning,side,pitcher,pitch_count,batter,balls,strikes,  
ay,px,ax,sz_bot,vz0,vy0,  
pfx_x,type_confidence,z0,tfs,pz,start_speed,  
az,zone,break_angle,spin_dir,end_speed,vx0,  
sz_top,nasty,descr,pfx_z,break_y,pitch_type,  
tfs_zulu,x,spin_rate,y0,break_length,y,x0,  
on_1b,on_2b,on_3b,umpcall,outcome,offense_score,defense_score

In [2]:
years = [2,3,4,5]
base_dir = "Data/"
# features_to_keep = ["pitcher","batter", "balls","strikes","pitch_count","inning","side", "umpcall"]
features_to_load = ["pitcher","batter","pitch_type","start_speed","spin_dir","umpcall"]
train_regular_season = [base_dir+"MLB_201{0}/MLB_PitchFX_201{0}_RegularSeason.csv".format(i) for i in years]
train_post_season = [base_dir+"MLB_201{0}/MLB_PitchFX_201{0}_RegularSeason.csv".format(i) for i in years]
all_data = prep.read_and_combine_data(train_regular_season+train_post_season,features_to_load)

In [3]:
players = pd.read_csv(base_dir+"MLB_Players.csv")

# Basic Statistics

In [4]:
pitchers_name_s = all_data.pitcher.unique()
batters_name_s = all_data.batter.unique()
all_name_s = np.unique(np.append(pitchers_name_s,batters_name_s))
player_stats = {}

In [5]:
pitch_types = ["FA","FF","FT","FC","FS",
               "SI","SF","SL","CH","CB",
               "CU","KC","KN","EP","UN",
               "XX","PO","FO"
              ]
def gen_empty_pitch_type(types):
    result = {}
    for t in types:
        result[t] = {
            "count":0,
            "start_speed":0,
            "spin_dir":0,
            "umpcall":{
                'X':0,
                'S':0,
                'B':0
            }
        }
    return result
def new_player_stat():
    result = {"bat_count":0, 
             "pitch_count":0,
             "hit_count":0, 
             "pitch_type":gen_empty_pitch_type(pitch_types),
             "at_bats":0}
    return result

In [6]:
old_batter = ""
init = True
for row in all_data.itertuples(index=False):
#     print(row)
    if not init:
        old_batter = batter
    batter = row.batter
    pitcher = row.pitcher
    
    # do batter first
    if batter not in player_stats:
        player_stats[batter] = new_player_stat()
   
    if old_batter != batter:
        player_stats[batter]["at_bats"] += 1
    player_stats[batter]["bat_count"] += 1
    
    if row.umpcall is 'X':
        player_stats[batter]["hit_count"] +=1
    
    
    
    # do pitcher 
    
    if pitcher not in player_stats:
        player_stats[pitcher] = new_player_stat()
    player_stats[pitcher]["pitch_count"] += 1
    
    p = player_stats[pitcher]["pitch_type"][row.pitch_type]
    p["count"]+=1
    p["start_speed"]+=row.start_speed
    p["spin_dir"]+=row.spin_dir
    p["umpcall"][row.umpcall]+=1
    init = False

In [7]:
# Calculate avg stats
for player,stat in player_stats.items():
    strike_count = 0
    for pitch_type,pitch_type_val in stat["pitch_type"].items():
        if(pitch_type_val["count"]>0):
            pitch_type_val["avg_start_speed"] = pitch_type_val["start_speed"]/pitch_type_val["count"]
            pitch_type_val["avg_spin_dir"] = pitch_type_val["spin_dir"]/pitch_type_val["count"]
            pitch_type_val["possibility"] = pitch_type_val["count"]/stat["pitch_count"]
            strike_count +=pitch_type_val["umpcall"]["S"]
        else:
            pitch_type_val["avg_start_speed"] = 0
            pitch_type_val["avg_spin_dir"] = 0
            pitch_type_val["possibility"] = 0
    if stat["pitch_count"] > 0:
        stat["strike_ratio"] = strike_count / stat["pitch_count"]
    else:
        stat["strike_ratio"] = 0
    if stat["bat_count"] > 0:
        stat["hit_ratio"] = stat["hit_count"] / stat["bat_count"]
    else:
        stat["hit_ratio"] = 0

In [37]:
for pitcher in pitchers_name_s[:5]:
    print("\nPlayer: ",pitcher)
    plyer = player_stats[pitcher]
    print("Strike ratio: ",plyer["strike_ratio"])
    for pitch_type,pitch_type_val in plyer["pitch_type"].items():
        if(pitch_type_val["count"]>0):
            print(pitch_type,pitch_type_val)


Player:  johnsjo09
Strike ratio:  0.44223918575063614
FF {'count': 3744, 'start_speed': 347358.60000000033, 'spin_dir': 754628.13999999978, 'umpcall': {'X': 718, 'S': 1732, 'B': 1294}, 'avg_start_speed': 92.777403846153931, 'avg_spin_dir': 201.55666132478626, 'possibility': 0.4763358778625954}
FT {'count': 652, 'start_speed': 60559.60000000002, 'spin_dir': 147892.46999999997, 'umpcall': {'X': 160, 'S': 252, 'B': 240}, 'avg_start_speed': 92.8828220858896, 'avg_spin_dir': 226.82894171779137, 'possibility': 0.08295165394402036}
SL {'count': 1778, 'start_speed': 154080.80000000002, 'spin_dir': 227220.80599999955, 'umpcall': {'X': 268, 'S': 796, 'B': 714}, 'avg_start_speed': 86.659617547806533, 'avg_spin_dir': 127.79572890888613, 'possibility': 0.2262086513994911}
CH {'count': 508, 'start_speed': 44571.199999999968, 'spin_dir': 117045.29799999994, 'umpcall': {'X': 102, 'S': 154, 'B': 252}, 'avg_start_speed': 87.738582677165297, 'avg_spin_dir': 230.40412992125971, 'possibility': 0.064631043

In [8]:

for batter in batters_name_s[:5]:
    print("\nPlayer: ",batter)
    plyer = player_stats[batter]
    print("hit_ratio", plyer["hit_ratio"])
    print("hit", plyer["hit_count"])
    print("bats", plyer["bat_count"])


Player:  beltrca01
hit_ratio 0.19748177601060304
hit 2980
bats 15090

Player:  hollima01
hit_ratio 0.1889347433582667
hit 2930
bats 15508

Player:  freesda01
hit_ratio 0.17575113001861206
hit 2644
bats 15044

Player:  molinya01
hit_ratio 0.2336996336996337
hit 3190
bats 13650

Player:  reyesjo01
hit_ratio 0.23524741995236836
hit 3556
bats 15116


In [24]:
def new_col_names(pitch_types, rest):
    col = [pth for pth in pitch_types]
    col += [pth+"_speed" for pth in pitch_types]
    col += rest
    return col
for c  in new_col_names(pitch_types, ["hit_ratio"]):
    players[c] = 0 
    
temp = players.copy()
temp = temp.drop(temp.index[0:])

for index, player in players.iterrows():
    pid = player["bref_id"]
#     print(pid)
    if pid in player_stats:
        plyer = player_stats[pid]
        for pitch_type,pitch_type_val in plyer["pitch_type"].items():
            if(pitch_type_val["count"]>0):
                player[pitch_type] = pitch_type_val["possibility"]
                player[pitch_type+"_speed"] = pitch_type_val["avg_start_speed"]
        player["hit_ratio"] = plyer["hit_ratio"]
    temp = temp.append(player)

In [54]:
print(players[:5])

   mlb_id    bref_id     last    first throws bats  height  weight  \
0  112526  colonba01    Colon  Bartolo      R    R    71.0   265.0   
1  134181  beltrad01   Beltre   Adrian      R    R    71.0   220.0   
2  136860  beltrca01  Beltran   Carlos      R    S    73.0   215.0   
3  150029  werthja01    Werth   Jayson      R    R    77.0   220.0   
4  150212  cuddymi01  Cuddyer  Michael      R  NaN    74.0   220.0   

          dob  FA  FF  FT  FC  FS  SI  SF  SL  CH  CB  CU  KC  KN  EP  UN  XX  \
0  1973-05-24   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
1  1979-04-07   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
2  1977-04-24   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
3  1979-05-20   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   
4  1979-03-27   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   

   PO  FO  FA_speed  FF_speed  FT_speed  FC_speed  FS_speed  SI_speed  \
0   0   0         0

In [21]:
temp[:5]


Unnamed: 0,mlb_id,bref_id,last,first,throws,bats,height,weight,dob,FA,FF,FT,FC,FS,SI,SF,SL,CH,CB,CU,KC,KN,EP,UN,XX,PO,FO,FA_speed,FF_speed,FT_speed,FC_speed,FS_speed,SI_speed,SF_speed,SL_speed,CH_speed,CB_speed,CU_speed,KC_speed,KN_speed,EP_speed,UN_speed,XX_speed,PO_speed,FO_speed,hit_ratio
0,112526,colonba01,Colon,Bartolo,R,R,71.0,265.0,1973-05-24,0,0.311696,0.533863,0.0,0.0,0.0,0,0.099176,0.055265,0,0.0,0.0,0,0,0,0,0,0,0,90.97724,88.151647,0.0,0.0,0.0,0,82.546201,81.813455,0,0.0,0.0,0,0,0,0,0,0,0.184019
1,134181,beltrad01,Beltre,Adrian,R,R,71.0,220.0,1979-04-07,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0.220411
2,136860,beltrca01,Beltran,Carlos,R,S,73.0,215.0,1977-04-24,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0.197482
3,150029,werthja01,Werth,Jayson,R,R,77.0,220.0,1979-05-20,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0.162191
4,150212,cuddymi01,Cuddyer,Michael,R,,74.0,220.0,1979-03-27,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,0.0,0,0,0,0,0,0,0.191378


In [23]:
player_stats["beltrad01"]

{'at_bats': 4864,
 'bat_count': 17912,
 'hit_count': 3948,
 'hit_ratio': 0.2204108977221974,
 'pitch_count': 0,
 'pitch_type': {'CB': {'avg_spin_dir': 0,
   'avg_start_speed': 0,
   'count': 0,
   'possibility': 0,
   'spin_dir': 0,
   'start_speed': 0,
   'umpcall': {'B': 0, 'S': 0, 'X': 0}},
  'CH': {'avg_spin_dir': 0,
   'avg_start_speed': 0,
   'count': 0,
   'possibility': 0,
   'spin_dir': 0,
   'start_speed': 0,
   'umpcall': {'B': 0, 'S': 0, 'X': 0}},
  'CU': {'avg_spin_dir': 0,
   'avg_start_speed': 0,
   'count': 0,
   'possibility': 0,
   'spin_dir': 0,
   'start_speed': 0,
   'umpcall': {'B': 0, 'S': 0, 'X': 0}},
  'EP': {'avg_spin_dir': 0,
   'avg_start_speed': 0,
   'count': 0,
   'possibility': 0,
   'spin_dir': 0,
   'start_speed': 0,
   'umpcall': {'B': 0, 'S': 0, 'X': 0}},
  'FA': {'avg_spin_dir': 0,
   'avg_start_speed': 0,
   'count': 0,
   'possibility': 0,
   'spin_dir': 0,
   'start_speed': 0,
   'umpcall': {'B': 0, 'S': 0, 'X': 0}},
  'FC': {'avg_spin_dir': 0,
 

In [27]:
temp.to_csv("MLB_Players_Stats.csv" )