In [112]:
import os
import json
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
import time
from datetime import datetime, timedelta
from dateutil import tz

import warnings
warnings.filterwarnings('ignore')

In [113]:
def convert_datetime(dates, offset_hours=7):
    """
    receives list of UTC timestamps as strings and shifts each one back
    by 'offset_hours' hours to sync with MLB.com scheduled dates

    dates        : iterable of strings formatted exactly as
                   'YYYY-MM-DDTHH:MM:SS+00:00' (anything else raises ValueError)
    offset_hours : number of hours subtracted from each timestamp (default 7)

    returns list of shifted calendar dates as 'YYYY-MM-DD' strings,
    in the same order as the input (empty input gives an empty list)
    """
    # NOTE(review): a fixed offset ignores daylight-saving changes -- confirm
    # that a constant shift is what the MLB.com schedule comparison needs
    shift = timedelta(hours=offset_hours)
    # only the calendar date is emitted, so no tzinfo needs to be attached:
    # subtracting a timedelta yields the same date for naive and aware values
    return [
        (datetime.strptime(dt, '%Y-%m-%dT%H:%M:%S+00:00') - shift).strftime('%Y-%m-%d')
        for dt in dates
    ]

In [114]:
def get_meta_cols(df, col_excld=None):
    """
    creates a list of column names in dataframe 'df'
    excluding those listed in col_excld

    df        : dataframe whose column labels are listed
    col_excld : column name(s) to leave out; may be None / empty (keep all),
                a single name given as a string, or an iterable of names

    returns list of column labels, in the dataframe's column order
    """
    if col_excld is None:
        excluded = []
    elif isinstance(col_excld, str):
        # a bare string is one column name; without this, 'col not in col_excld'
        # would be a substring test and would raise TypeError for non-string labels
        excluded = [col_excld]
    else:
        excluded = list(col_excld)
    return [col for col in df.columns if col not in excluded]

In [115]:
def drop_cols(df, cols_list=[]):
    """
    removes, in place, every column of 'df' whose values are all null,
    then removes the columns named in cols_list that are present in 'df'
    (names in cols_list that are not columns of 'df' are ignored)
    returns the same (modified) dataframe 'df'
    """
    # columns holding nothing but nulls go first
    df.dropna(axis='columns', how='all', inplace=True)

    # nothing was requested explicitly, so we are done
    if len(cols_list) == 0:
        return df

    # keep only the requested names that still exist as columns
    present = [name for name in cols_list if name in df.columns]
    df.drop(columns=present, inplace=True)
    return df

In [96]:
def clean_df(path, df, fname, lname):
    """
    cleans the play-by-play data of one game for one pitcher of interest
    loads the json file, de-nests its highly nested columns
    (innings -> halfs -> events -> at_bat -> events) into a flat dataframe
    and keeps 'only' the pitches thrown by the pitcher of interest

    path  : path to the play-by-play file; every line is parsed as json and
            the first parsed entry is the one that is used
    df    : dataframe of pitchers indexed by pitcher id, with the columns
            'first_name', 'preferred_name' and 'last_name'
    fname : first (or preferred) name of the pitcher, case-insensitive
    lname : last name of the pitcher, case-insensitive

    returns the cleaned dataframe, or an empty dataframe (a single 'Empty'
    column and no rows) when the pitcher is unknown, did not pitch in this
    game, or the game has no pitch speeds; callers test this with '.empty'
    """
    # get pitcher_id for pitcher of interest
    fname_lc = fname.lower()
    lname_lc = lname.lower()
    is_pitcher = (df['last_name'].str.lower() == lname_lc) & (
        (df['first_name'].str.lower() == fname_lc) |
        (df['preferred_name'].str.lower() == fname_lc))
    pitcher_id = df.index[is_pitcher].tolist()
    if len(pitcher_id) == 0:
        print("\tPitcher: {} {}, from file below, not found:\n\t{}\n".format(fname, lname, path))
        return pd.DataFrame({'Empty' : []})

    # read the json entries; the context manager closes the file handle
    with open(path) as pbp_file:
        json_data = [json.loads(line) for line in pbp_file]
    game = json_data[0]['game']
    gameday = convert_datetime([game['scheduled']])

    # normalize JSON data to de-nest 'halfs' column for each inning (e.g. number)
    df1 = json_normalize(game['innings'], record_path=['halfs'], meta=['number'])
    # rename number to the correct term inning
    df1.rename(index=str, columns={'number': 'inning'}, inplace=True)

    # store columns to keep for next normalization excluding list of 'column names'
    meta_cols = get_meta_cols(df1, ['events'])
    # convert dataframe to a list of row dictionaries for next normalization
    # ('records' is the full orient name; abbreviations are rejected by newer pandas)
    dict1 = df1.to_dict(orient='records')
    # normalize dictionary to de-nest 'events' column
    df2 = json_normalize(dict1, record_path=['events'], meta=meta_cols)

    # de-nest 'at_bat' using apply(pd.Series) and replace original 'at_bat' column
    df3 = pd.concat([df2.drop('at_bat', axis=1), df2['at_bat'].apply(pd.Series)], axis=1, sort=False)
    # drop columns where all rows are null and those not needed (if present in df)
    to_drop_1 = ['lineup', 'hitter_id', 'warming_up', 'id', 'hitter', 'pitcher']
    df3_rev = drop_cols(df3, to_drop_1)

    # keep only the at-bats thrown by the pitcher of interest
    df_pitcher = df3_rev[(df3_rev['pitcher_id']==pitcher_id[0])]
    if df_pitcher.empty:
        print("\tDouble header game, same day, different pitcher, this file excluded:\n\t{}\n".format(path))
        return pd.DataFrame({'Empty' : []})

    # store columns to keep for next normalization; the exclusion is passed as a
    # list so the name is matched exactly rather than as a substring of 'events'
    meta_cols = get_meta_cols(df_pitcher, ['events'])
    # convert dataframe to a list of row dictionaries for next normalization
    dict2 = df_pitcher.to_dict(orient='records')
    # normalize dictionary to de-nest the 'events' column of each at-bat
    df4 = json_normalize(dict2, record_path=['events'], meta=meta_cols)

    # keep only the rows with type 'pitch'
    if 'type' in df4.columns:
        # copy so the in-place drops below act on an independent frame
        df5 = df4[df4['type']=='pitch'].copy()
    else:
        print("Pitch type missing in:\n{}\n".format(path))
        df5 = df4

    # drop columns where all rows are null and those not needed
    to_drop_2 = ['created_at', 'fielders', 'id', 'status', 'type', 'updated_at']
    df5_rev = drop_cols(df5, to_drop_2)
    # convert dataframe to a list of row dictionaries for next normalization
    dict3 = df5_rev.to_dict(orient='records')

    # normalize once more to flatten the remaining nested dictionaries into
    # dotted column names (e.g. 'pitcher.pitch_speed')
    df5_rev = json_normalize(dict3)

    if 'runners' in df5_rev.columns:
        # de-nest 'runners' using apply(pd.Series) and replace original 'runners' column
        df6 = pd.concat([df5_rev.drop('runners', axis=1), df5_rev['runners'].apply(pd.Series)], axis=1, sort=False)
        # drop columns where all rows are null
        df6_rev = drop_cols(df6)

        # the de-nested runner columns are labelled 0..3; rename them to
        # runner_1..runner_4 and add any that are missing as blank columns
        for col in np.arange(4):
            new_col = 'runner_' + str(col+1)
            if col in df6_rev.columns:
                df6_rev.rename(columns={col: new_col}, inplace=True)
            else:
                df6_rev[new_col]=''
    else:
        print("Runners on base id missing in:\n{}\n".format(path))
        df6_rev = df5_rev

    # reset index of dataframe (reset_index returns a new frame, so keep the result)
    df6_rev = df6_rev.reset_index(drop=True)
    # add new column 'date' to include day of game events occurred
    df6_rev['date']=gameday[0]
    if 'pitcher.first_name' not in df6_rev.columns:
        df6_rev['pitcher.first_name']=fname.capitalize()
    if 'pitcher.last_name' not in df6_rev.columns:
        df6_rev['pitcher.last_name']=lname.capitalize()

    # last check to assure non-pitchers of interest are excluded from dataframe
    if 'pitcher.id' in df6_rev.columns:
        # copy so the column assignments below do not target a slice
        df7 = df6_rev[(df6_rev['pitcher.id']==pitcher_id[0])].copy()
    else:
        df7 = df6_rev

    # a game without a usable pitch speed column is excluded
    if 'pitcher.pitch_speed' not in df7.columns:
        print("Exclude this game for {} {} since there are no pitch speeds in:\n{}\n".format(fname, lname, path))
        return pd.DataFrame({'Empty' : []})
    # NOTE(review): Series.empty is only True when there are zero rows; a column
    # holding nothing but nulls passes this check -- confirm that is intended
    if df7['pitcher.pitch_speed'].empty:
        print("Exclude this game for {} {} since pitch speeds are empty in:\n{}\n".format(fname, lname, path))
        return pd.DataFrame({'Empty' : []})

    # replace null pitch speeds with the (rounded) mean pitch speed of the game
    pitch_speed_mean = round(df7['pitcher.pitch_speed'].mean(), 0)
    df7['pitcher.pitch_speed'] = df7['pitcher.pitch_speed'].fillna(pitch_speed_mean)

    # make sure the pitch type column exists before filling it
    if 'pitcher.pitch_type' not in df7.columns:
        df7['pitcher.pitch_type'] = None
    # if outcome_id == 'bIB' (intentional walk), replace pitcher.pitch_type with 'IB'
    if 'outcome_id' in df7.columns:
        df7.loc[df7['outcome_id'] == 'bIB', 'pitcher.pitch_type'] = 'IB'
    # for all other pitcher.pitch_types with null values, replace with 'UN' (unknown)
    df7['pitcher.pitch_type'] = df7['pitcher.pitch_type'].fillna('UN')

    to_drop_final = ['hitter.jersey_number', 'hitter_hand', 'pitcher.jersey_number', 'pitcher_hand']
    df_cleaned = drop_cols(df7, to_drop_final)

    return df_cleaned

In [157]:
def make_dfs(basepath, years, pitchers, pitcher_ids_path='data/pitcher_logs/pitcher_ids.csv'):
    """
    iterate through list of pitchers for each year in list to open play-by-play files
    then call clean_df() on each file to de-nest it and merge the play-by-play data
    of one pitcher and year into a single csv, written to
    '<basepath>/<year>/merged_by_player/<pitcher>_<year>_merged_games.csv'

    basepath         : folder holding '<year>/<pitcher>' subfolders of play-by-play files
    years            : list of years as strings (they are joined into paths)
    pitchers         : list of pitcher folder names written as '<first>_<last>'
    pitcher_ids_path : csv of pitcher ids with an 'id' column, used as the index

    Note: some pitchers pitched during a double-header (two games on same day) so some
          play-by-play files are skipped if pitcher of interest did not pitch the game

    returns true if completed without errors
    """
    # open pitcher_ids.csv file, set 'id' as index
    df_pitcher_ids = pd.read_csv(pitcher_ids_path, index_col='id')

    for year in years:
        merged_path = basepath + '/' + year + '/merged_by_player'
        # make sure the output folder exists before any csv is written
        os.makedirs(merged_path, exist_ok=True)

        for pitcher in pitchers:
            file_path = basepath + '/' + year + '/' + pitcher
            pitcher_filename = pitcher + '_' + year + '_merged_games.csv'
            outfile_path = os.path.join(merged_path, pitcher_filename)

            f_name, l_name = pitcher.split(sep='_')
            files = sorted(os.listdir(file_path))

            # clean every game file the same way and keep only the usable results;
            # this covers folders with zero, one or many files, and it keeps the
            # empty placeholder frame of a skipped game out of the merged output
            game_dfs = []
            for file_name in files:
                df_temp = clean_df(os.path.join(file_path, file_name), df_pitcher_ids, f_name, l_name)
                if not df_temp.empty:
                    game_dfs.append(df_temp)

            # number of game files that contributed data for pitcher of interest
            plyr_num_files = len(game_dfs)
            if plyr_num_files == 0:
                print("No usable game files for {} {} in:\n{}\n".format(f_name.capitalize(), l_name.capitalize(), file_path))
                continue

            # merge once at the end instead of re-copying the frame for every game
            df_cleaned = pd.concat(game_dfs, ignore_index=True)
            df_cleaned.to_csv(outfile_path, index=False)
            print("Total number of game files for {} {} = {}".format(f_name.capitalize(), l_name.capitalize(), plyr_num_files))
            print("file: ", outfile_path, '-- written\n')

    return True

In [158]:
# root folder holding one subfolder per year, each with one subfolder per pitcher
basepath = 'data/pbp_files'

# years whose play-by-play data is cleaned and merged
years = ['2016', '2017']

# pitchers of interest, written as '<first>_<last>'
pitchers = [
    'aaron_nola',
    'carlos_carrasco',
    'carlos_martinez',
    'chris_archer',
    'chris_sale',
    'clayton_kershaw',
    'corey_kluber',
    'dallas_keuchel',
    'david_price',
    'gerrit_cole',
    'jacob_degrom',
    'jake_arrieta',
    'jose_quintana',
    'marcus_stroman',
    'justin_verlander',
    'max_scherzer',
    'michael_fulmer',
    'stephen_strasburg',
    'yu_darvish',
    'zack_greinke',
]


# clean and merge the play-by-play data of every pitcher and year, one csv each;
# status stays False unless make_dfs runs to completion
status = False
status = make_dfs(basepath, years, pitchers)
status

Total number of game files for Aaron Nola = 20
file:  data/pbp_files/2016/merged_by_player/aaron_nola_2016_merged_games.csv -- written

Total number of game files for Carlos Carrasco = 25
file:  data/pbp_files/2016/merged_by_player/carlos_carrasco_2016_merged_games.csv -- written

	Double header game, same day, different pitcher, this file excluded:
	data/pbp_files/2016/carlos_martinez/2016_07_20_STL_vs_SD_bfa9376a_4fb1_49d1_95fc_0a92f6360b00_pbp.json

	Double header game, same day, different pitcher, this file excluded:
	data/pbp_files/2016/carlos_martinez/2016_07_26_STL_vs_NYM_c75746d4_04f8_466f_a7f2_97aeee8d1453_pbp.json

Total number of game files for Carlos Martinez = 31
file:  data/pbp_files/2016/merged_by_player/carlos_martinez_2016_merged_games.csv -- written

Total number of game files for Chris Archer = 33
file:  data/pbp_files/2016/merged_by_player/chris_archer_2016_merged_games.csv -- written

Total number of game files for Chris Sale = 32
file:  data/pbp_files/2016/merged_

Total number of game files for Max Scherzer = 31
file:  data/pbp_files/2017/merged_by_player/max_scherzer_2017_merged_games.csv -- written

	Double header game, same day, different pitcher, this file excluded:
	data/pbp_files/2017/michael_fulmer/2017_05_27_DET_vs_CWS_9d3541c7_8beb_429d_872c_e59389f8d1bc_pbp.json

Total number of game files for Michael Fulmer = 25
file:  data/pbp_files/2017/merged_by_player/michael_fulmer_2017_merged_games.csv -- written

Total number of game files for Stephen Strasburg = 28
file:  data/pbp_files/2017/merged_by_player/stephen_strasburg_2017_merged_games.csv -- written

	Double header game, same day, different pitcher, this file excluded:
	data/pbp_files/2017/yu_darvish/2017_09_02_LAD_vs_SD_7d0f12d5_7388_4f53_b916_b7867b966eaf_pbp.json

Total number of game files for Yu Darvish = 31
file:  data/pbp_files/2017/merged_by_player/yu_darvish_2017_merged_games.csv -- written

Total number of game files for Zack Greinke = 32
file:  data/pbp_files/2017/merged_by

True

### END