In [1]:
import preprocessing_data as prep
import pandas as pd
import numpy as np
pd.set_option('display.max_row', 1000)
pd.set_option('display.max_columns', 50)

# Load Season Player Data

All features:   
date,stadium,inning,side,pitcher,pitch_count,batter,balls,strikes,  
ay,px,ax,sz_bot,vz0,vy0,  
pfx_x,type_confidence,z0,tfs,pz,start_speed,  
az,zone,break_angle,spin_dir,end_speed,vx0,  
sz_top,nasty,descr,pfx_z,break_y,pitch_type,  
tfs_zulu,x,spin_rate,y0,break_length,y,x0,  
on_1b,on_2b,on_3b,umpcall,outcome,offense_score,defense_score

In [2]:
years = [2,3,4,5]
base_dir = "Data/"
# features_to_keep = ["pitcher","batter", "balls","strikes","pitch_count","inning","side", "umpcall"]
features_to_load = ["pitcher","batter","pitch_type","start_speed","spin_dir","umpcall"]
train_regular_season = [base_dir+"MLB_201{0}/MLB_PitchFX_201{0}_RegularSeason.csv".format(i) for i in years]
train_post_season = [base_dir+"MLB_201{0}/MLB_PitchFX_201{0}_RegularSeason.csv".format(i) for i in years]
all_data = prep.read_and_combine_data(train_regular_season+train_post_season,features_to_load)

In [3]:
players = pd.read_csv(base_dir+"MLB_Players.csv")

# Basic Statistics

In [10]:
pitchers_name_s = all_data.pitcher.unique()
batters_name_s = all_data.batter.unique()
all_name_s = np.unique(np.append(pitchers_name_s,batters_name_s))
player_stats = {}

In [11]:
pitch_types = ["FA","FF","FT","FC","FS",
               "SI","SF","SL","CH","CB",
               "CU","KC","KN","EP","UN",
               "XX","PO","FO"
              ]
def gen_empty_pitch_type(types):
    result = {}
    for t in types:
        result[t] = {
            "count":0,
            "start_speed":0,
            "spin_dir":0,
            "umpcall":{
                'X':0,
                'S':0,
                'B':0
            }
        }
    return result

In [18]:
for row in all_data.itertuples(index=False):
#     print(row)
    batter = row.batter
    pitcher = row.pitcher
    
    # do batter first
    if batter in player_stats:
        player_stats[batter]["bat_count"] += 1
    else:
        player_stats[batter] = {"bat_count":1, 
                                 "pitch_count":0,
                                 "pitch_type":gen_empty_pitch_type(pitch_types)
                                }
    
    # do pitcher 
    
    if pitcher in player_stats:
        player_stats[pitcher]["pitch_count"] += 1
    else:
        player_stats[pitcher] = {"bat_count":0, 
                                 "pitch_count":1,
                                 "pitch_type":gen_empty_pitch_type(pitch_types)
                                }
    p = player_stats[pitcher]["pitch_type"][row.pitch_type]
    p["count"]+=1
    p["start_speed"]+=row.start_speed
    p["spin_dir"]+=row.spin_dir
    p["umpcall"][row.umpcall]+=1

In [32]:
# Calculate avg stats
for player,stat in player_stats.items():
    for pitch_type,pitch_type_val in stat["pitch_type"].items():
        if(pitch_type_val["count"]>0):
            pitch_type_val["avg_start_speed"] = pitch_type_val["start_speed"]/pitch_type_val["count"]
            pitch_type_val["avg_spin_dir"] = pitch_type_val["spin_dir"]/pitch_type_val["count"]
            pitch_type_val["possibility"] = pitch_type_val["count"]/stat["pitch_count"]
        else:
            pitch_type_val["avg_start_speed"] = 0
            pitch_type_val["avg_spin_dir"] = 0
            pitch_type_val["possibility"] = 0

In [33]:
for pitcher in pitchers_name_s[:5]:
    print("\nPlayer: ",pitcher)
    plyer = player_stats[pitcher]
    for pitch_type,pitch_type_val in plyer["pitch_type"].items():
        if(pitch_type_val["count"]>0):
            print(pitch_type,pitch_type_val)


Player:  johnsjo09
FF {'count': 11232, 'start_speed': 1042075.8000000054, 'spin_dir': 2263884.4199999943, 'umpcall': {'X': 2154, 'S': 5196, 'B': 3882}, 'avg_start_speed': 92.77740384615433, 'avg_spin_dir': 201.5566613247858, 'possibility': 0.4763358778625954}
FT {'count': 1956, 'start_speed': 181678.79999999987, 'spin_dir': 443677.40999999933, 'umpcall': {'X': 480, 'S': 756, 'B': 720}, 'avg_start_speed': 92.8828220858895, 'avg_spin_dir': 226.82894171779108, 'possibility': 0.08295165394402036}
SL {'count': 5334, 'start_speed': 462242.39999999775, 'spin_dir': 681662.4180000017, 'umpcall': {'X': 804, 'S': 2388, 'B': 2142}, 'avg_start_speed': 86.6596175478061, 'avg_spin_dir': 127.7957289088867, 'possibility': 0.2262086513994911}
CH {'count': 1524, 'start_speed': 133713.5999999999, 'spin_dir': 351135.8940000004, 'umpcall': {'X': 306, 'S': 462, 'B': 756}, 'avg_start_speed': 87.73858267716528, 'avg_spin_dir': 230.40412992126008, 'possibility': 0.06463104325699745}
CU {'count': 3534, 'start_s