In [3]:
import json
import os
import pandas as pd

# the folder location of .jsonl files you're compiling into dataframes
folder_path = 'C:/Users/linds/Downloads/wsd-data-main/wsd-hits'

# custom try statement used to access components in json data (whether or whether not it exists)
# input: a string of code
# output: the return value of the string of code (if works) or "n/a" (doesn't work)

def return_if_available(thing):
    try:
        return eval(thing)
    except:
        return "n/a"

In [4]:
# initialize all 
# initialize all 
main_columns = {"filename": ['file'], "main pitch event id":['eventId'], "main pitch type":['type'], "main pitch result":['result'], "main pitch speed":['mph'], "main pitch spin":['rpm'], "main hit event id":['eventId'], "main hit speed":['mph'], "main hit spin":['rpm'], "main hit type": ['type'], "main hit grade": ['rating']}
event_columns = {"event angle start x":['x'], "event angle start y":['y'], "event type":['hit/no hit'], "event teamid":['mlbd'], "event personid":['mlbid'], "event eventid":['eventid']}
ball_columns = {"ball time start":['secs'], "ball pos start x":['feet'], "ball pos start y":['feet'], "ball pos start z":['feet'], "ball vel start x":['mph'], "ball vel start y":['mph'], "ball vel start z":['mph'], "ball acc start x":['mph/s'], "ball acc start y":['mph/s'], "ball acc start z":['mph/s'], "ball time end":['secs'], "ball pos end x":['feet'], "ball pos end y":['feet'], "ball pos end z":['feet'], "ball vel end x":['mph'], "ball vel end y":['mph'], "ball vel end z":['mph'], "ball acc end x":['mph/s'], "ball acc end y":['mph/s'], "ball acc end z":['mph/s']}
bat_columns = {"bat time start":['secs'], "bat pos head start x":['feet'], "bat pos head start y":['feet'], "bat pos head start z":['feet'], "bat pos handle start x":['feet'], "bat pos handle start y":['feet'], "bat pos handle start z":['feet'], "bat time end":['secs'], "bat pos head end x":['feet'], "bat pos head end y":['feet'], "bat pos head end z":['feet'], "bat pos handle end x":['feet'], "bat pos handle end y":['feet'], "bat pos handle end z":['feet']}

main_df = pd.DataFrame(main_columns)
event_df = pd.DataFrame(event_columns)
ball_df = pd.DataFrame(ball_columns)
bat_df = pd.DataFrame(bat_columns)

pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [5]:
# classify hit type function

def classify_hit_type(main_pitch_result, event_angle_start_y, main_hit_speed):
    hit_type = 'none'
    if isinstance(event_angle_start_y, float)  and isinstance(main_hit_speed, float):
        if main_pitch_result == "HitIntoPlay":
            # pop up
            if(event_angle_start_y >= 50 ):
                hit_type = 'Pop Up'
                    
            # power hit or pop fly depending on exit velocity
            if((event_angle_start_y > 25 and event_angle_start_y < 50 )):
                if (main_hit_speed >= 95):
                    hit_type = 'Power'
                else: 
                    hit_type = 'Fly Ball'
                        
    
            # line drives: hard hit vs lightly hit
            if((event_angle_start_y >= 10 and event_angle_start_y <= 25 )):
                if (main_hit_speed >= 95):
                    hit_type = 'Hard Hit Line Drive'
                else: 
                    hit_type = 'Light Line Drive'
        
            # ground balls: hard hit vs lightly hit
            elif (event_angle_start_y < 10 ):
                if (main_hit_speed >= 95):
                    hit_type = 'Hard Hit Ground Ball'
                else: 
                    hit_type = 'Light Ground Ball'
        elif main_pitch_result == "Strike":
            hit_type = 'Strike'

    return hit_type

# classify if hit is good/bad function
def rate_hit_type(main_hit_type):
    if main_hit_type == 'none':
        return 'n/a'
    if(main_hit_type == 'Power' or main_hit_type == 'Fly Ball' or main_hit_type == 'Hard Hit Line Drive'):
        return 'Bad'
    else: 
        return 'Good'

In [6]:
# making a comprehensive dataframe (with key data from each json file)
for filename in os.listdir(folder_path):
    if filename.endswith('.jsonl'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
            # enter in data fields
            main_pitch_eventid = return_if_available("data['summary_acts']['pitch']['eventId']")
            main_pitch_type = return_if_available("data['summary_acts']['pitch']['type']")
            main_pitch_result = return_if_available("data['summary_acts']['pitch']['result']")
            main_pitch_speed = return_if_available("data['summary_acts']['pitch']['speed']['mph']")
            main_pitch_spin = return_if_available("data['summary_acts']['pitch']['spin']['rpm']")
            main_hit_eventid = return_if_available("data['summary_acts']['hit']['eventId']")
            main_hit_speed = return_if_available("data['summary_acts']['hit']['speed']['mph']")
            main_hit_spin = return_if_available("data['summary_acts']['hit']['spin']['rpm']")

            event_angle_start_x = return_if_available("data['events'][0]['start']['angle'][0]")
            event_angle_start_y = return_if_available("data['events'][0]['start']['angle'][1]")
            event_type = return_if_available("data['events'][0]['type']")
            event_teamid = return_if_available("data['events'][0]['teamId']['mlbId']")
            event_personid = return_if_available("data['events'][0]['personId']['mlbId']")
            event_eventid = return_if_available("data['events'][0]['eventId']")

            ball_time_start = return_if_available("data['samples_ball'][0]['time']")
            ball_pos_start_x = return_if_available("data['samples_ball'][0]['pos'][0]")
            ball_pos_start_y = return_if_available("data['samples_ball'][0]['pos'][1]")
            ball_pos_start_z = return_if_available("data['samples_ball'][0]['pos'][2]")
            ball_vel_start_x = return_if_available("data['samples_ball'][0]['vel'][0]")
            ball_vel_start_y = return_if_available("data['samples_ball'][0]['vel'][1]")
            ball_vel_start_z = return_if_available("data['samples_ball'][0]['vel'][2]")
            ball_acc_start_x = return_if_available("data['samples_ball'][0]['acc'][0]")
            ball_acc_start_y = return_if_available("data['samples_ball'][0]['acc'][1]")
            ball_acc_start_z = return_if_available("data['samples_ball'][0]['acc'][2]")
            ball_time_end = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['time']")
            ball_pos_end_x = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][0]")
            ball_pos_end_y = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][1]")
            ball_pos_end_z = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][2]")
            ball_vel_end_x = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][0]")
            ball_vel_end_y = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][1]")
            ball_vel_end_z = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][2]")
            ball_acc_end_x= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][0]")
            ball_acc_end_y= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][1]")
            ball_acc_end_z= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][2]")

            bat_time_start = return_if_available("data['samples_bat'][0]['time']")
            bat_pos_head_start_x = return_if_available("data['samples_bat'][0]['head']['pos'][0]")
            bat_pos_head_start_y = return_if_available("data['samples_bat'][0]['head']['pos'][1]")
            bat_pos_head_start_z = return_if_available("data['samples_bat'][0]['head']['pos'][2]")
            bat_pos_handle_start_x = return_if_available("data['samples_bat'][0]['handle']['pos'][0]")
            bat_pos_handle_start_y = return_if_available("data['samples_bat'][0]['handle']['pos'][1]")
            bat_pos_handle_start_z = return_if_available("data['samples_bat'][0]['handle']['pos'][2]")
            bat_time_end = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['time']")
            bat_pos_head_end_x = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][0]")
            bat_pos_head_end_y = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][1]")
            bat_pos_head_end_z = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][2]")
            bat_pos_handle_end_x = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][0]")
            bat_pos_handle_end_y = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][1]")
            bat_pos_handle_end_z = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][2]")

            main_hit_type = classify_hit_type(main_pitch_result, event_angle_start_y, main_hit_speed)
            main_hit_grade = rate_hit_type(main_hit_type)

            row = {"filename": filename, "main pitch event id":main_pitch_eventid, "main pitch type":main_pitch_type, "main pitch result":main_pitch_result, "main pitch speed":main_pitch_speed, "main pitch spin":main_pitch_spin, "main hit event id":main_hit_eventid, "main hit speed":main_hit_speed, "main hit spin":main_hit_spin, "main hit type":main_hit_type, "main hit grade":main_hit_grade}
            main_df.loc[len(main_df.index)] = row # add row to end of df
            row = {"event angle start x": event_angle_start_x, "event angle start y": event_angle_start_y, "event type": event_type, "event teamid":event_teamid, "event personid":event_personid, "event eventid":event_eventid}
            event_df.loc[len(event_df.index)] = row # add row to end of df
            row = {"ball time start":ball_time_start, "ball pos start x":ball_pos_start_x, "ball pos start y":ball_pos_start_y, "ball pos start z":ball_pos_start_z, "ball vel start x":ball_vel_start_x, "ball vel start y":ball_vel_start_y, "ball vel start z":ball_vel_start_z, "ball acc start x":ball_acc_start_x, "ball acc start y":ball_acc_start_y, "ball acc start z":ball_acc_start_z, "ball time end":ball_time_end, "ball pos end x":ball_pos_end_x, "ball pos end y":ball_pos_end_y, "ball pos end z":ball_pos_end_z, "ball vel end x":ball_vel_end_x, "ball vel end y":ball_vel_end_y, "ball vel end z":ball_vel_end_z, "ball acc end x":ball_acc_end_x, "ball acc end y":ball_acc_end_y, "ball acc end z":ball_acc_end_z}
            ball_df.loc[len(ball_df.index)] = row # add row to end of df
            row = {"bat time start":bat_time_start, "bat pos head start x":bat_pos_head_start_x, "bat pos head start y":bat_pos_head_start_y, "bat pos head start z":bat_pos_head_start_z, "bat pos handle start x":bat_pos_handle_start_x, "bat pos handle start y":bat_pos_handle_start_y, "bat pos handle start z":bat_pos_handle_start_z, "bat time end":bat_time_end, "bat pos head end x":bat_pos_head_end_x, "bat pos head end y":bat_pos_head_end_y, "bat pos head end z":bat_pos_head_end_z, "bat pos handle end x":bat_pos_handle_end_x, "bat pos handle end y":bat_pos_handle_end_y, "bat pos handle end z":bat_pos_handle_end_z}
            bat_df.loc[len(bat_df.index)] = row # add row to end of df
            
            df = pd.concat([main_df, event_df, ball_df, bat_df], axis=1).reindex(main_df.index)



In [7]:
# display everything

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
display(df_processed)

NameError: name 'df_processed' is not defined

In [None]:
# clean up data / make data easily processable (just the numbers)

df_processed = df.copy()
# delete the first row (irrelevant data)
df_processed = df_processed.iloc[1:]

# convert relevant non-numeric data with numeric data
# df_processed.drop(['filename', 'main pitch event id', 'main hit event id', 'event personid', 'event eventid'], axis=1, inplace=True)
df_processed.drop(['filename', 'main pitch event id', 'main hit type', 'main pitch type', 'main hit event id', 'main hit grade', 'event personid', 'event eventid'], axis=1, inplace=True)
df_processed['main pitch result'] = df_processed['main pitch result'].map({'HitIntoPlay': 1, 'Strike': 0}) # for main pitch result, all strike = 0, hitintoplay = 1
df_processed['event type'] = df_processed['event type'].map({'Hit': 1, 'n/a': 0}) # for event type, all hit = 1, n/a = 0

# drop all columns with non-numeric values
for col in df_processed.columns:
    df_processed = df_processed[pd.to_numeric(df_processed[col], errors='coerce').notnull()]

In [None]:
# display cleaned up data

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
display(df_processed)

In [None]:
# making correlation table
# note: "event type" may be displayed as "nan" but it's just a math thing. event type isnt super relevant anyway

corr_matrix = df_processed.corr(method='pearson', numeric_only = False)

# styling correlation df
def cond_formatting(x):
    if x > 0.6 or x < -0.6:
        return 'background-color: lightgreen'
    else:
        return None
    
#display DataFrame with conditional formatting applied    
corr_matrix = corr_matrix.style.map(cond_formatting)
display(corr_matrix)

In [None]:
# download styled dataframe as an html file

download_location = "C:/Users/linds/Downloads/"


corr_html = corr_matrix.to_html()
file = open(download_location + "corr_styled_df.html", "w")
file.write(corr_html)
file.close()