In [1]:
import preprocessing_data as prep
import pandas as pd
import numpy as np
# Show up to 1000 rows / 50 columns when a frame is rendered in a cell.
# The full option names are spelled out: pandas resolves abbreviated names by
# pattern matching, which can turn ambiguous if a similarly named option is
# ever added.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 50)

# Load Season Player Data

All features:   
date,stadium,inning,side,pitcher,pitch_count,batter,balls,strikes,  
ay,px,ax,sz_bot,vz0,vy0,  
pfx_x,type_confidence,z0,tfs,pz,start_speed,  
az,zone,break_angle,spin_dir,end_speed,vx0,  
sz_top,nasty,descr,pfx_z,break_y,pitch_type,  
tfs_zulu,x,spin_rate,y0,break_length,y,x0,  
on_1b,on_2b,on_3b,umpcall,outcome,offense_score,defense_score

In [2]:
# Seasons to load; each digit is substituted into "201{0}" in the paths below.
years = [2,3,4,5]
base_dir = "Data/"
# features_to_keep = ["pitcher","batter", "balls","strikes","pitch_count","inning","side", "umpcall"]
# Columns requested from the loader; these are the only fields the later cells read.
features_to_load = ["pitcher","batter","pitch_type","start_speed","spin_dir","umpcall"]
train_regular_season = [base_dir+"MLB_201{0}/MLB_PitchFX_201{0}_RegularSeason.csv".format(i) for i in years]
# NOTE(review): this builds exactly the same "RegularSeason" paths as the line
# above, so every regular-season file is handed to the loader twice and no
# post-season file is read. Presumably a post-season file name was intended —
# confirm the actual name on disk before changing the path.
train_post_season = [base_dir+"MLB_201{0}/MLB_PitchFX_201{0}_RegularSeason.csv".format(i) for i in years]
# The helper lives in preprocessing_data (not shown here); presumably it returns
# one combined DataFrame restricted to features_to_load — verify against that module.
all_data = prep.read_and_combine_data(train_regular_season+train_post_season,features_to_load)

In [3]:
# Player table; its "bref_id" column is later matched against the pitcher/batter ids.
players = pd.read_csv(base_dir+"MLB_Players.csv")

# Basic Statistics

In [4]:
# Distinct pitcher and batter identifiers that occur in the loaded pitches.
pitchers_name_s = all_data["pitcher"].unique()
batters_name_s = all_data["batter"].unique()

# De-duplicated combination of both id arrays (anyone who pitched or batted).
all_name_s = np.unique(
    np.append(pitchers_name_s, batters_name_s)
)

# Per-player accumulator keyed by player id; populated by the per-pitch loop.
player_stats = dict()

In [5]:
# Hard-coded list of pitch-type codes.
# NOTE(review): this literal is never used — the name is rebound immediately
# below to the codes that actually occur in the data.
pitch_types = ["FA","FF","FT","FC","FS",
               "SI","SF","SL","CH","CB",
               "CU","KC","KN","EP","UN",
               "XX","PO","FO"
              ]
# Pitch-type values observed in the loaded data; this is what later cells see.
pitch_types = all_data.pitch_type.unique()
def gen_empty_pitch_type(types):
    """Build a zeroed accumulator with one record per pitch type.

    Args:
        types: iterable of hashable pitch-type keys.

    Returns:
        dict mapping each key in ``types`` to its own fresh record holding
        ``count``, ``start_speed`` and ``spin_dir`` (all 0) plus an
        ``umpcall`` tally with the keys 'X', 'S' and 'B' (all 0).
    """
    # The record literal sits inside the comprehension so every pitch type
    # gets independent dict objects rather than sharing one.
    return {
        pitch: {
            "count": 0,
            "start_speed": 0,
            "spin_dir": 0,
            "umpcall": {'X': 0, 'S': 0, 'B': 0},
        }
        for pitch in types
    }
def new_player_stat():
    """Return a fresh, all-zero statistics record for a single player.

    The record carries the counters ``bat_count``, ``pitch_count``,
    ``hit_count`` and ``at_bats`` plus a ``pitch_type`` sub-dict created by
    ``gen_empty_pitch_type`` for the module-level ``pitch_types``.
    """
    return {
        "bat_count": 0,
        "pitch_count": 0,
        "hit_count": 0,
        # one zeroed record per pitch type currently listed in pitch_types
        "pitch_type": gen_empty_pitch_type(pitch_types),
        "at_bats": 0,
    }

In [6]:
# Single pass over every pitch: update the running totals of both the batter
# and the pitcher in player_stats.
old_batter = ""  # batter of the previous row; "" so the first row counts as a change
for row in all_data.itertuples(index=False):
    batter = row.batter
    pitcher = row.pitcher

    # --- batter side ---
    if batter not in player_stats:
        player_stats[batter] = new_player_stat()
    batter_stat = player_stats[batter]

    # A change of batter between consecutive rows is counted as a new at-bat.
    if old_batter != batter:
        batter_stat["at_bats"] += 1
    batter_stat["bat_count"] += 1

    # Compare by value. `is` tests object identity, which is only true for
    # equal strings as an interpreter detail and triggers a SyntaxWarning on
    # Python 3.8+.
    if row.umpcall == 'X':
        batter_stat["hit_count"] += 1

    # --- pitcher side ---
    if pitcher not in player_stats:
        player_stats[pitcher] = new_player_stat()
    pitcher_stat = player_stats[pitcher]
    pitcher_stat["pitch_count"] += 1

    # Per-pitch-type totals; speed and spin are summed here and averaged later.
    p = pitcher_stat["pitch_type"][row.pitch_type]
    p["count"] += 1
    p["start_speed"] += row.start_speed
    p["spin_dir"] += row.spin_dir
    p["umpcall"][row.umpcall] += 1

    # Remember this row's batter for the at-bat check on the next row.
    old_batter = batter

In [7]:
# Turn the raw per-player totals into averages and ratios.
for player, stat in player_stats.items():
    total_pitches = stat["pitch_count"]
    strike_count = 0

    for pitch_type, pitch_type_val in stat["pitch_type"].items():
        thrown = pitch_type_val["count"]
        if thrown <= 0:
            # Pitch type with no recorded pitches: store zeros, nothing to average.
            pitch_type_val["avg_start_speed"] = 0
            pitch_type_val["avg_spin_dir"] = 0
            pitch_type_val["possibility"] = 0
            continue

        pitch_type_val["avg_start_speed"] = pitch_type_val["start_speed"] / thrown
        pitch_type_val["avg_spin_dir"] = pitch_type_val["spin_dir"] / thrown
        # Share of this pitch type among all pitches thrown by the player.
        pitch_type_val["possibility"] = thrown / total_pitches
        strike_count += pitch_type_val["umpcall"]["S"]

    # Both ratios fall back to 0 when their denominator is 0.
    stat["strike_ratio"] = strike_count / total_pitches if total_pitches > 0 else 0
    stat["hit_ratio"] = stat["hit_count"] / stat["bat_count"] if stat["bat_count"] > 0 else 0

In [None]:
# for pitcher in pitchers_name_s[:5]:
#     print("\nPlayer: ",pitcher)
#     plyer = player_stats[pitcher]
#     print("Strike ratio: ",plyer["strike_ratio"])
#     for pitch_type,pitch_type_val in plyer["pitch_type"].items():
#         if(pitch_type_val["count"]>0):
#             print(pitch_type,pitch_type_val)

In [None]:

# for batter in batters_name_s[:5]:
#     print("\nPlayer: ",batter)
#     plyer = player_stats[batter]
#     print("hit_ratio", plyer["hit_ratio"])
#     print("hit", plyer["hit_count"])
#     print("bats", plyer["bat_count"])

In [8]:
def new_col_names(pitch_types, rest):
    """List the extra column names to add to the player table.

    Args:
        pitch_types: iterable of pitch-type codes (strings).
        rest: iterable of additional column names placed at the end.

    Returns:
        A new list: every pitch type, then every pitch type suffixed with
        "_speed", then the entries of ``rest``.
    """
    names = list(pitch_types)
    names.extend(code + "_speed" for code in pitch_types)
    names.extend(rest)
    return names
# Add one zero-filled column per pitch type, per pitch-type speed, and for
# hit_ratio, so players with no recorded stats still carry a value.
for c in new_col_names(pitch_types, ["hit_ratio"]):
    players[c] = 0

# Fill in one row per player and build the output frame in a single step.
# Growing a frame with DataFrame.append inside the loop is quadratic, and the
# method was removed in pandas 2.0.
stat_rows = []
for index, player in players.iterrows():
    # `player` is the row Series handed out by iterrows; the edited row is
    # collected in stat_rows rather than written back into `players`.
    pid = player["bref_id"]
    if pid in player_stats:
        plyer = player_stats[pid]
        for pitch_type, pitch_type_val in plyer["pitch_type"].items():
            # Pitch types with no recorded pitches keep the default 0.
            if pitch_type_val["count"] > 0:
                player[pitch_type] = pitch_type_val["possibility"]
                player[pitch_type + "_speed"] = pitch_type_val["avg_start_speed"]
        player["hit_ratio"] = plyer["hit_ratio"]
    stat_rows.append(player)

# Each Series keeps its original row label as its name, which becomes the index
# of the new frame; columns= pins the column order (and covers an empty table).
temp = pd.DataFrame(stat_rows, columns=players.columns)

In [None]:
# print(players[:5])

In [None]:
# temp[:5]


In [None]:
# player_stats["beltrad01"]

In [9]:
# Write the per-player table to the working directory (index=False is not passed, so the row index is written too).
temp.to_csv("MLB_Players_Stats.csv" )