In [5]:
import json
import os
import pandas as pd

# Folder containing the .jsonl files that get compiled into dataframes.
# Set the WSD_HITS_DIR environment variable to point at a different copy of the
# data; the original hard-coded location is kept as the default so an existing
# setup behaves exactly as before.
folder_path = os.environ.get('WSD_HITS_DIR', 'C:/Users/linds/Downloads/wsd-data-main/wsd-hits')

# custom try wrapper used to access components in the json data (whether or not they exist)
# input: a string of Python code to evaluate
# output: the value the code evaluates to (if it works) or "n/a" (if it raises an error)

def return_if_available(thing):
    """Evaluate a string of Python code and return its value, or "n/a" on failure.

    Used to read optional fields out of parsed json data without checking that
    every intermediate key or index exists.

    Args:
        thing: a string holding a Python expression, e.g.
            "data['summary_acts']['pitch']['type']". Because the expression is
            evaluated inside this function, any name it mentions (such as
            ``data``) has to be a module-level / notebook-global variable.

    Returns:
        The value of the expression, or the string "n/a" if evaluating it
        raised an ordinary exception (missing key, index out of range, ...).

    NOTE(review): ``eval`` runs arbitrary code. Only pass string literals
    written in this notebook, never text read from the data files.
    """
    try:
        return eval(thing)
    except Exception:
        # Deliberately not a bare ``except``: KeyboardInterrupt and SystemExit
        # must still stop the run instead of being turned into "n/a".
        return "n/a"

In [6]:
# Initialize the four dataframes. Each one starts with a single row that holds a
# label (unit / source key) for every column; real data rows are appended later.
main_columns = {
    "filename": ['file'],
    "main pitch event id": ['eventId'],
    "main pitch type": ['type'],
    "main pitch result": ['result'],
    "main pitch speed": ['mph'],
    "main pitch spin": ['rpm'],
    "main hit event id": ['eventId'],
    "main hit speed": ['mph'],
    "main hit spin": ['rpm'],
    "main hit type": ['type'],
    "main hit grade": ['rating'],
}
event_columns = {
    "event angle start x": ['x'],
    "event angle start y": ['y'],
    "event type": ['hit/no hit'],
    "event teamid": ['mlbd'],
    "event personid": ['mlbid'],
    "event eventid": ['eventid'],
}
ball_columns = {
    "ball time start": ['secs'],
    "ball pos start x": ['feet'],
    "ball pos start y": ['feet'],
    "ball pos start z": ['feet'],
    "ball vel start x": ['mph'],
    "ball vel start y": ['mph'],
    "ball vel start z": ['mph'],
    "ball acc start x": ['mph/s'],
    "ball acc start y": ['mph/s'],
    "ball acc start z": ['mph/s'],
    "ball time end": ['secs'],
    "ball pos end x": ['feet'],
    "ball pos end y": ['feet'],
    "ball pos end z": ['feet'],
    "ball vel end x": ['mph'],
    "ball vel end y": ['mph'],
    "ball vel end z": ['mph'],
    "ball acc end x": ['mph/s'],
    "ball acc end y": ['mph/s'],
    "ball acc end z": ['mph/s'],
}
bat_columns = {
    "bat time start": ['secs'],
    "bat pos head start x": ['feet'],
    "bat pos head start y": ['feet'],
    "bat pos head start z": ['feet'],
    "bat pos handle start x": ['feet'],
    "bat pos handle start y": ['feet'],
    "bat pos handle start z": ['feet'],
    "bat time end": ['secs'],
    "bat pos head end x": ['feet'],
    "bat pos head end y": ['feet'],
    "bat pos head end z": ['feet'],
    "bat pos handle end x": ['feet'],
    "bat pos handle end y": ['feet'],
    "bat pos handle end z": ['feet'],
}

# one dataframe per group of fields
main_df = pd.DataFrame(main_columns)
event_df = pd.DataFrame(event_columns)
ball_df = pd.DataFrame(ball_columns)
bat_df = pd.DataFrame(bat_columns)

# show every column when a dataframe is displayed (row truncation is left on)
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [7]:
# classify hit type function

def classify_hit_type(main_pitch_result, event_angle_start_y, main_hit_speed):
    """Label a pitch outcome from its result, launch angle and hit speed.

    Args:
        main_pitch_result: the pitch result string; only "HitIntoPlay" and
            "Strike" produce a label.
        event_angle_start_y: second component of the first event's start angle
            (presumably a vertical launch angle in degrees -- TODO confirm),
            or a non-number such as "n/a" when unavailable.
        main_hit_speed: hit speed in mph, or a non-number such as "n/a".

    Returns:
        One of 'Pop Up', 'Power', 'Fly Ball', 'Hard Hit Line Drive',
        'Light Line Drive', 'Hard Hit Ground Ball', 'Light Ground Ball',
        'Strike', or 'none' when the inputs cannot be classified.

    Both the angle and the speed must be real numbers for ANY label to be
    produced, including 'Strike'. Integers are accepted as well as floats,
    because JSON values such as ``100`` are parsed as ``int``.

    NOTE(review): a strike whose angle/speed are "n/a" is therefore labelled
    'none', not 'Strike'. That matches the original behaviour; confirm it is
    intended.
    """
    def _is_number(value):
        # bool is a subclass of int; it is rejected so True/False never count
        # as measurements.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    if not (_is_number(event_angle_start_y) and _is_number(main_hit_speed)):
        return 'none'

    if main_pitch_result == "Strike":
        return 'Strike'
    if main_pitch_result != "HitIntoPlay":
        return 'none'

    # 95 mph separates "hard" from "light" contact in every angle band below.
    hard_hit = main_hit_speed >= 95

    # pop up
    if event_angle_start_y >= 50:
        return 'Pop Up'
    # power hit or fly ball depending on hit speed (angle strictly between 25 and 50)
    if event_angle_start_y > 25:
        return 'Power' if hard_hit else 'Fly Ball'
    # line drives: hard hit vs lightly hit (angle from 10 to 25 inclusive)
    if event_angle_start_y >= 10:
        return 'Hard Hit Line Drive' if hard_hit else 'Light Line Drive'
    # ground balls: hard hit vs lightly hit
    if event_angle_start_y < 10:
        return 'Hard Hit Ground Ball' if hard_hit else 'Light Ground Ball'

    # Only reachable when the angle is NaN (every comparison above is False).
    return 'none'

# classify if hit is good/bad function
def rate_hit_type(main_hit_type):
    """Grade a hit-type label as 'Good', 'Bad' or 'n/a'.

    'none' is graded 'n/a'; 'Power', 'Fly Ball' and 'Hard Hit Line Drive' are
    graded 'Bad'; every other label is graded 'Good'.
    """
    if main_hit_type == 'none':
        return 'n/a'
    bad_labels = ('Power', 'Fly Ball', 'Hard Hit Line Drive')
    return 'Bad' if main_hit_type in bad_labels else 'Good'

In [8]:
# making a comprehensive dataframe (with key data from each json file)
#
# For every .jsonl file in folder_path: parse it, pull out the pitch/hit summary,
# the first event, the first and last ball sample and the first and last bat
# sample (any field that cannot be read becomes "n/a"), classify the hit, and
# append one row to each of main_df / event_df / ball_df / bat_df.
#
# NOTE: return_if_available() eval()s its string argument, and those strings
# refer to the variable `data`, so `data` must stay a notebook-global name.
# NOTE(review): json.load() parses the whole file as ONE json document; a file
# with several json lines would raise here -- confirm each file holds one record.
for filename in os.listdir(folder_path):
    if filename.endswith('.jsonl'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
            # enter in data fields
            main_pitch_eventid = return_if_available("data['summary_acts']['pitch']['eventId']")
            main_pitch_type = return_if_available("data['summary_acts']['pitch']['type']")
            main_pitch_result = return_if_available("data['summary_acts']['pitch']['result']")
            main_pitch_speed = return_if_available("data['summary_acts']['pitch']['speed']['mph']")
            main_pitch_spin = return_if_available("data['summary_acts']['pitch']['spin']['rpm']")
            main_hit_eventid = return_if_available("data['summary_acts']['hit']['eventId']")
            main_hit_speed = return_if_available("data['summary_acts']['hit']['speed']['mph']")
            main_hit_spin = return_if_available("data['summary_acts']['hit']['spin']['rpm']")

            # first entry of the events list
            event_angle_start_x = return_if_available("data['events'][0]['start']['angle'][0]")
            event_angle_start_y = return_if_available("data['events'][0]['start']['angle'][1]")
            event_type = return_if_available("data['events'][0]['type']")
            event_teamid = return_if_available("data['events'][0]['teamId']['mlbId']")
            event_personid = return_if_available("data['events'][0]['personId']['mlbId']")
            event_eventid = return_if_available("data['events'][0]['eventId']")

            # first ("start") and last ("end") ball samples
            ball_time_start = return_if_available("data['samples_ball'][0]['time']")
            ball_pos_start_x = return_if_available("data['samples_ball'][0]['pos'][0]")
            ball_pos_start_y = return_if_available("data['samples_ball'][0]['pos'][1]")
            ball_pos_start_z = return_if_available("data['samples_ball'][0]['pos'][2]")
            ball_vel_start_x = return_if_available("data['samples_ball'][0]['vel'][0]")
            ball_vel_start_y = return_if_available("data['samples_ball'][0]['vel'][1]")
            ball_vel_start_z = return_if_available("data['samples_ball'][0]['vel'][2]")
            ball_acc_start_x = return_if_available("data['samples_ball'][0]['acc'][0]")
            ball_acc_start_y = return_if_available("data['samples_ball'][0]['acc'][1]")
            ball_acc_start_z = return_if_available("data['samples_ball'][0]['acc'][2]")
            ball_time_end = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['time']")
            ball_pos_end_x = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][0]")
            ball_pos_end_y = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][1]")
            ball_pos_end_z = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][2]")
            ball_vel_end_x = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][0]")
            ball_vel_end_y = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][1]")
            ball_vel_end_z = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][2]")
            ball_acc_end_x= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][0]")
            ball_acc_end_y= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][1]")
            ball_acc_end_z= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][2]")

            # first ("start") and last ("end") bat samples
            bat_time_start = return_if_available("data['samples_bat'][0]['time']")
            bat_pos_head_start_x = return_if_available("data['samples_bat'][0]['head']['pos'][0]")
            bat_pos_head_start_y = return_if_available("data['samples_bat'][0]['head']['pos'][1]")
            bat_pos_head_start_z = return_if_available("data['samples_bat'][0]['head']['pos'][2]")
            bat_pos_handle_start_x = return_if_available("data['samples_bat'][0]['handle']['pos'][0]")
            bat_pos_handle_start_y = return_if_available("data['samples_bat'][0]['handle']['pos'][1]")
            bat_pos_handle_start_z = return_if_available("data['samples_bat'][0]['handle']['pos'][2]")
            bat_time_end = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['time']")
            bat_pos_head_end_x = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][0]")
            bat_pos_head_end_y = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][1]")
            bat_pos_head_end_z = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][2]")
            bat_pos_handle_end_x = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][0]")
            bat_pos_handle_end_y = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][1]")
            bat_pos_handle_end_z = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][2]")

            # derived labels
            main_hit_type = classify_hit_type(main_pitch_result, event_angle_start_y, main_hit_speed)
            main_hit_grade = rate_hit_type(main_hit_type)

            row = {"filename": filename, "main pitch event id":main_pitch_eventid, "main pitch type":main_pitch_type, "main pitch result":main_pitch_result, "main pitch speed":main_pitch_speed, "main pitch spin":main_pitch_spin, "main hit event id":main_hit_eventid, "main hit speed":main_hit_speed, "main hit spin":main_hit_spin, "main hit type":main_hit_type, "main hit grade":main_hit_grade}
            main_df.loc[len(main_df.index)] = row # add row to end of df
            row = {"event angle start x": event_angle_start_x, "event angle start y": event_angle_start_y, "event type": event_type, "event teamid":event_teamid, "event personid":event_personid, "event eventid":event_eventid}
            event_df.loc[len(event_df.index)] = row # add row to end of df
            row = {"ball time start":ball_time_start, "ball pos start x":ball_pos_start_x, "ball pos start y":ball_pos_start_y, "ball pos start z":ball_pos_start_z, "ball vel start x":ball_vel_start_x, "ball vel start y":ball_vel_start_y, "ball vel start z":ball_vel_start_z, "ball acc start x":ball_acc_start_x, "ball acc start y":ball_acc_start_y, "ball acc start z":ball_acc_start_z, "ball time end":ball_time_end, "ball pos end x":ball_pos_end_x, "ball pos end y":ball_pos_end_y, "ball pos end z":ball_pos_end_z, "ball vel end x":ball_vel_end_x, "ball vel end y":ball_vel_end_y, "ball vel end z":ball_vel_end_z, "ball acc end x":ball_acc_end_x, "ball acc end y":ball_acc_end_y, "ball acc end z":ball_acc_end_z}
            ball_df.loc[len(ball_df.index)] = row # add row to end of df
            row = {"bat time start":bat_time_start, "bat pos head start x":bat_pos_head_start_x, "bat pos head start y":bat_pos_head_start_y, "bat pos head start z":bat_pos_head_start_z, "bat pos handle start x":bat_pos_handle_start_x, "bat pos handle start y":bat_pos_handle_start_y, "bat pos handle start z":bat_pos_handle_start_z, "bat time end":bat_time_end, "bat pos head end x":bat_pos_head_end_x, "bat pos head end y":bat_pos_head_end_y, "bat pos head end z":bat_pos_head_end_z, "bat pos handle end x":bat_pos_handle_end_x, "bat pos handle end y":bat_pos_handle_end_y, "bat pos handle end z":bat_pos_handle_end_z}
            bat_df.loc[len(bat_df.index)] = row # add row to end of df

# Combine the four frames side by side ONCE, after all files are loaded.
# (Previously this ran inside the loop, rebuilding the full frame for every file
# and leaving `df` undefined when the folder contained no .jsonl files.)
df = pd.concat([main_df, event_df, ball_df, bat_df], axis=1).reindex(main_df.index)



In [None]:
# display everything (the full combined dataframe, before any clean-up)

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Show `df` here, not `df_processed`: `df_processed` is only created by the
# clean-up cell further down, so referencing it at this point raises a
# NameError when the notebook is run top to bottom in a fresh kernel.
display(df)

In [12]:
# clean up data / make data easily processable (just the numbers)
#
# Built as one chain of non-mutating steps so `df` is left untouched and the
# cell gives the same result every time it is re-run (no inplace edits on a
# slice, which is what triggers pandas' SettingWithCopyWarning).
df_processed = (
    df
    # delete the first row (it holds the unit labels, not data)
    .iloc[1:]
    # remove identifier / text-label columns that cannot be turned into numbers
    .drop(columns=['filename', 'main pitch event id', 'main hit type', 'main pitch type', 'main hit event id', 'main hit grade', 'event personid', 'event eventid'])
    # convert relevant non-numeric data to numeric codes; values not listed in a
    # mapping become NaN and are removed by the row filter below
    .assign(**{
        'main pitch result': lambda frame: frame['main pitch result'].map({'HitIntoPlay': 1, 'Strike': 0}),  # strike = 0, hitintoplay = 1
        'event type': lambda frame: frame['event type'].map({'Hit': 1, 'n/a': 0}),  # hit = 1, n/a = 0
    })
)

# Drop every ROW (not column) that has a non-numeric or missing value in any
# column: anything pd.to_numeric cannot parse (e.g. "n/a") is coerced to NaN and
# the row is removed. The surviving columns end up with real numeric dtypes.
df_processed = df_processed.apply(pd.to_numeric, errors='coerce').dropna()

In [1]:
# display cleaned up data
# NOTE: this cell needs `pd` (import cell) and `df_processed` (clean-up cell) to
# already exist in the kernel; run on its own in a fresh kernel it fails with a
# NameError.

pd.set_option('display.max_rows', None)  # do not truncate rows
pd.set_option('display.max_columns', None)  # do not truncate columns
display(df_processed)

NameError: name 'pd' is not defined

In [14]:
# making correlation table (Pearson correlation between every pair of columns)
# note: the "event type" row/column shows up as NaN. Pearson correlation is
# undefined for a column whose values never vary, which is presumably the case
# for "event type" after the clean-up filtering (TODO confirm). event type isn't
# very relevant here anyway.

corr_matrix = df_processed.corr(method='pearson', numeric_only = False)

# styling correlation df
def cond_formatting(x, threshold=0.6):
    """Return the CSS that highlights a strong correlation, or None.

    Args:
        x: a single correlation value (one table cell).
        threshold: magnitude above which a value counts as strong. Defaults to
            0.6, the cut-off this notebook has always used.

    Returns:
        'background-color: lightgreen' when x is strictly above ``threshold``
        or strictly below ``-threshold``; otherwise None (no styling). NaN is
        never highlighted because the comparison is False for NaN.
    """
    if abs(x) > threshold:
        return 'background-color: lightgreen'
    return None
    
# display DataFrame with conditional formatting applied
# NOTE: this rebinds `corr_matrix` from the plain correlation DataFrame to the
# styled object returned by .style.map(); the export cell below calls .to_html()
# on that styled object.
corr_matrix = corr_matrix.style.map(cond_formatting)
display(corr_matrix)

Unnamed: 0,main pitch result,main pitch speed,main pitch spin,main hit speed,main hit spin,event angle start x,event angle start y,event type,event teamid,ball time start,ball pos start x,ball pos start y,ball pos start z,ball vel start x,ball vel start y,ball vel start z,ball acc start x,ball acc start y,ball acc start z,ball time end,ball pos end x,ball pos end y,ball pos end z,ball vel end x,ball vel end y,ball vel end z,ball acc end x,ball acc end y,ball acc end z,bat time start,bat pos head start x,bat pos head start y,bat pos head start z,bat pos handle start x,bat pos handle start y,bat pos handle start z,bat time end,bat pos head end x,bat pos head end y,bat pos head end z,bat pos handle end x,bat pos handle end y,bat pos handle end z
main pitch result,1.0,-0.091295,-0.005582,0.363987,-0.127317,-0.15146,-0.211553,,-0.001715,0.01179,-0.00066,0.000488,-0.155999,-0.059736,0.094682,0.09942,0.033067,-0.054005,-0.130001,0.482869,-0.014634,0.206716,-0.321551,0.162166,-0.656677,0.013974,-0.179298,0.539216,-0.140055,-0.001489,-0.023183,0.11812,-0.037175,-0.044336,0.028567,-0.005621,-0.020186,-0.072078,0.007269,0.064747,-0.056289,0.087578,-0.274675
main pitch speed,-0.091295,1.0,-0.035054,0.109768,-0.128855,-0.041593,0.069277,,-0.092426,0.125825,0.018041,-0.209654,0.166849,0.253675,-0.997637,-0.735303,-0.498528,0.837768,0.833124,-0.120313,-0.022104,-0.027097,0.038355,-0.045086,0.049175,-0.001703,0.055539,-0.034922,0.014289,-0.060217,0.102831,0.025566,0.077101,0.142968,0.046133,0.129934,-0.149967,0.138084,-0.044844,0.049404,0.142401,0.019688,0.091513
main pitch spin,-0.005582,-0.035054,1.0,-0.126748,0.006744,0.083332,-0.056479,,-0.069077,-0.028628,-0.075034,0.031565,0.108754,0.014285,0.033116,0.055888,0.12954,-0.042782,-0.061944,0.032777,-0.036952,-0.07691,-0.011358,0.032648,0.043808,0.044642,-0.041649,-0.089264,-0.093138,-0.055615,-0.102754,0.071165,-0.097312,-0.082168,0.077911,-0.00966,-0.055242,-0.066647,0.01205,-0.036129,-0.090723,-0.022618,-0.05223
main hit speed,0.363987,0.109768,-0.126748,1.0,0.043732,-0.128144,-0.05838,,-0.045307,0.066636,0.056147,-0.073687,0.041431,-0.018681,-0.106639,-0.144474,-0.004833,0.099355,0.123057,0.183591,-0.083757,0.213843,-0.158781,-0.030222,-0.31976,-0.058726,0.006855,0.292994,-0.019259,0.086342,0.01228,0.084604,0.109512,0.025291,-0.132564,0.181247,0.115777,-0.048416,-0.144758,0.198793,0.022982,0.062718,0.004296
main hit spin,-0.127317,-0.128855,0.006744,0.043732,1.0,-0.097174,0.041586,,0.092434,0.07604,-0.043482,-0.062409,0.023164,0.041582,0.132489,0.029855,-0.003279,-0.094599,-0.10926,-0.04151,0.106371,-0.020146,0.090849,0.044955,0.098168,-0.145132,-0.046962,-0.08602,0.076682,-0.08356,-0.047536,0.083623,-0.050351,-0.043854,-0.012102,-0.024459,-0.136684,-0.00629,-0.033912,-0.061549,-0.059303,-0.068792,0.054861
event angle start x,-0.15146,-0.041593,0.083332,-0.128144,-0.097174,1.0,0.234854,,0.092145,-0.146804,0.014894,0.148761,0.022976,-0.047107,0.03967,0.073081,0.133297,0.007637,-0.04663,-0.147664,0.110564,0.012702,0.006168,-0.021602,0.14076,-0.084607,0.038312,-0.112008,0.075737,0.001511,-0.286906,-0.047067,-0.086191,-0.291073,0.113418,-0.023113,-0.031253,-0.221273,-0.01247,-0.001147,-0.277323,-0.084338,0.126097
event angle start y,-0.211553,0.069277,-0.056479,-0.05838,0.041586,0.234854,1.0,,0.067716,-0.036277,0.010478,0.024168,0.051281,-0.002582,-0.068372,0.031584,0.027553,0.096714,0.174415,-0.047341,0.128824,0.112961,-0.094311,-0.076171,0.126278,-0.310884,0.149553,-0.06482,0.275415,0.106071,-0.035028,0.05658,0.0884,-0.025909,0.058298,0.118372,0.08387,-0.026338,-0.033442,-0.037953,-0.005494,-0.178397,0.304815
event type,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
event teamid,-0.001715,-0.092426,-0.069077,-0.045307,0.092434,0.092145,0.067716,,1.0,-0.131278,-0.010612,0.138316,0.158785,0.032017,0.086734,0.110514,-0.128613,0.033142,-0.170256,0.100868,0.159024,-0.034932,-0.10985,0.055225,0.031662,-0.046149,0.073318,-0.025764,0.054564,-0.120027,-0.368191,0.242174,-0.1642,-0.362146,0.292706,-0.019033,-0.1247,-0.36831,-0.027755,-0.015658,-0.404911,-0.102329,0.048213
ball time start,0.01179,0.125825,-0.028628,0.066636,0.07604,-0.146804,-0.036277,,-0.131278,1.0,-0.019088,-0.994701,-0.016917,0.111843,-0.080888,-0.219013,-0.08121,0.053238,0.145181,0.001622,0.09918,0.026062,-0.09394,-0.073498,0.042838,-0.077325,0.066182,-0.0441,0.092722,-0.015652,0.136439,0.004157,0.047796,0.150054,-0.056229,0.04559,-0.033963,0.118619,-0.0246,0.005597,0.147289,-0.049729,0.062491


In [None]:
# download styled dataframe as an html file

# folder the html file is written to
download_location = "C:/Users/linds/Downloads/"


corr_html = corr_matrix.to_html()
# `with` closes the file even if the write fails. os.path.join gives the same
# path as before and also works if download_location has no trailing slash.
# The encoding is stated explicitly so the output does not depend on the
# machine's default locale encoding.
with open(os.path.join(download_location, "corr_styled_df.html"), "w", encoding="utf-8") as html_file:
    html_file.write(corr_html)