In [204]:
import json
import os
import pandas as pd

# the folder location of .jsonl files you're compiling into dataframes
folder_path = 'C:/Users/linds/Downloads/wsd-data/wsd-data-main/wsd-hits'

# custom try statement used to access components in json data (whether or whether not it exists)
# input: a string of code
# output: the return value of the string of code (if works) or "n/a" (doesn't work)

def return_if_available(thing):
    try:
        return eval(thing)
    except:
        return "n/a"

In [205]:
# initialize all 
columns = {"filename": ['file'], "main pitch event id":['eventId'], "main pitch result":['result'], "main pitch speed":['mph'], "main pitch spin":['rpm'], "main hit event id":['eventId'], "main hit speed":['mph'], "main hit spin":['rpm'], "event angle start":['x, y'], "event type":['hit/no hit'], "event teamid":['mlbd'], "event personid":['mlbid'], "event eventid":['eventid'], "ball time start":['secs'], "ball pos start":['feet'], "ball vel start":['mph'], "ball acc start":['mph/s'], "ball time end":['secs'], "ball pos end":['feet'], "ball vel end":['mph'], "ball acc end":['mph/s'], "bat time start":['secs'], "bat pos head start":['feet'], "bat pos handle start":['feet'], "bat time end":['secs'], "bat pos head end":['feet'], "bat pos handle end":['feet']}

df = pd.DataFrame(columns)

In [206]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
display(df)

Unnamed: 0,filename,main pitch event id,main pitch result,main pitch speed,main pitch spin,main hit event id,main hit speed,main hit spin,event angle start,event type,event teamid,event personid,event eventid,ball time start,ball pos start,ball vel start,ball acc start,ball time end,ball pos end,ball vel end,ball acc end,bat time start,bat pos head start,bat pos handle start,bat time end,bat pos head end,bat pos handle end
0,file,eventId,result,mph,rpm,eventId,mph,rpm,"x, y",hit/no hit,mlbd,mlbid,eventid,secs,feet,mph,mph/s,secs,feet,mph,mph/s,secs,feet,feet,secs,feet,feet


In [207]:
# dataset

for filename in os.listdir(folder_path):
    if filename.endswith('.jsonl'):
        file_path = os.path.join(folder_path, filename)
        with open(file_path, 'r') as json_file:
            data = json.load(json_file)
            # enter in data fields
            main_pitch_eventid = return_if_available("data['summary_acts']['pitch']['eventId']")
            main_pitch_result = return_if_available("data['summary_acts']['pitch']['result']")
            main_pitch_speed = return_if_available("data['summary_acts']['pitch']['speed']['mph']")
            main_pitch_spin = return_if_available("data['summary_acts']['pitch']['spin']['rpm']")
            main_hit_eventid = return_if_available("data['summary_acts']['hit']['eventId']")
            main_hit_speed = return_if_available("data['summary_acts']['hit']['speed']['mph']")
            main_hit_spin = return_if_available("data['summary_acts']['hit']['spin']['rpm']")

            event_angle_start = return_if_available("data['events'][0]['start']['angle']")
            event_type = return_if_available("data['events'][0]['type']")
            event_teamid = return_if_available("data['events'][0]['teamId']['mlbId']")
            event_personid = return_if_available("data['events'][0]['personId']['mlbId']")
            event_eventid = return_if_available("data['events'][0]['eventId']")

            ball_time_start = return_if_available("data['samples_ball'][0]['time']")
            ball_pos_start = return_if_available("data['samples_ball'][0]['pos']")
            ball_vel_start = return_if_available("data['samples_ball'][0]['vel']")
            ball_acc_start = return_if_available("data['samples_ball'][0]['acc']")
            ball_time_end = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['time']")
            ball_pos_end = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos']")
            ball_vel_end = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel']")
            ball_acc_end = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc']")

            bat_time_start = return_if_available("data['samples_bat'][0]['time']")
            bat_pos_head_start = return_if_available("data['samples_bat'][0]['head']['pos']")
            bat_pos_handle_start = return_if_available("data['samples_bat'][0]['handle']['pos']")
            bat_time_end = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['time']")
            bat_pos_head_end = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos']")
            bat_pos_handle_end = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos']")
            

            row = {"filename": filename, "main pitch event id":main_pitch_eventid, "main pitch result":main_pitch_result, "main pitch speed":main_pitch_speed, "main pitch spin":main_pitch_spin, "main hit event id":main_hit_eventid, "main hit speed":main_hit_speed, "main hit spin":main_hit_spin, "event angle start": event_angle_start, "event type": event_type, "event teamid":event_teamid, "event personid":event_personid, "event eventid":event_eventid, "ball time start":ball_time_start, "ball pos start":ball_pos_start, "ball vel start":ball_vel_start, "ball acc start":ball_acc_start, "ball time end":ball_time_end, "ball pos end":ball_pos_end, "ball vel end":ball_vel_end, "ball acc end":ball_acc_end, "bat time start":bat_time_start, "bat pos head start":bat_pos_head_start, "bat pos handle start":bat_pos_handle_start, "bat time end":bat_time_end, "bat pos head end":bat_pos_head_end, "bat pos handle end":bat_pos_handle_end}
            df.loc[len(df.index)] = row # add row to end of df

In [339]:
# clean up data / make data easily processable

df_processed = df.copy()
# delete the first row (irrelevant data)
df_processed = df_processed.iloc[1:]

# convert relevant non-numeric data with numeric data
df_processed.drop(['filename', 'main pitch event id', 'main hit event id', 'event personid', 'event eventid'], axis=1, inplace=True)
df_processed['main pitch result'] = df_processed['main pitch result'].map({'HitIntoPlay': 1, 'Strike': 0}) # for main pitch result, all strike = 0, hitintoplay = 1
df_processed['event type'] = df_processed['event type'].map({'Hit': 1, 'n/a': 0}) # for main pitch result, all strike = 0, hitintoplay = 1

# split event angle start into 2 columns
df_eas = pd.DataFrame(df_processed["event angle start"].to_list(), columns=['event angle start x', 'event angle start y', 'mystery'])
df_eas = df_eas.drop('mystery', axis=1)
df_processed.drop('event angle start', axis=1, inplace=True)
df_processed = pd.concat([df_processed, df_eas], axis=1) # adds an extra row which offsets all added data??? confused

# drop all columns with non-numeric values
#for col in df_processed.columns:
    #df_processed = df_processed[pd.to_numeric(df_processed[col], errors='coerce').notnull()]

In [331]:
# display everything

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
display(df_processed)

Unnamed: 0,main pitch result,main pitch speed,main pitch spin,main hit speed,main hit spin,event type,event teamid,ball time start,ball pos start,ball vel start,ball acc start,ball time end,ball pos end,ball vel end,ball acc end,bat time start,bat pos head start,bat pos handle start,bat time end,bat pos head end,bat pos handle end,event angle start x,event angle start y
1,1.0,83.0,2350.0,24.0,1590.0,1.0,63813.0,0.00694,"[-2.596849315782403, 53.288876602035714, 3.383...","[3.26718411686325, -82.84755537151418, 4.58114...","[-10.855688369972766, 15.832688145936599, -21....",19.087453,"[1.8525602046375322, 67.83203316732605, 3.7462...","[0.1377004833101617, 23.803842266510213, -14.3...","[-0.13717183685812664, -3.421288638938082, -21...",-0.382787,"[-3.321382283749604, -1.2247888339284578, 6.24...","[-1.762194487351856, -2.4654863204140436, 4.39...",0.846878,"[-1.50356778639817, -1.0968493324304913, 4.673...","[-4.174916931934301, -0.586949869139028, 4.812...",-47.13422,29.57366
2,0.0,97.0,2300.0,86.0,2640.0,1.0,63813.0,0.015076,"[-2.062841063630847, 51.99717987465831, 5.3355...","[4.8787063473231544, -96.49554931214212, -2.23...","[-10.898124801309347, 21.61589051317451, -9.70...",4.46758,"[0.06770781372823284, 48.265007882984754, 7.08...","[-0.7387899878027722, 35.69761174560369, -7.44...","[0.07098392281136053, -5.601026426366868, -19....",-0.448233,"[3.08620016501022, -2.054621948233432, 7.06043...","[1.582198005625443, -2.3846775682789887, 4.814...",0.831608,"[2.770838566946811, -1.5830482419264198, 6.141...","[4.73639181434095, -0.06881005186392501, 5.019...",8.56738,36.176988
3,1.0,97.0,2260.0,89.0,2910.0,1.0,63813.0,0.041829,"[-2.0928475031082585, 48.239632554641965, 5.22...","[3.8294463095840006, -95.62229817342217, -3.40...","[-7.812121561581491, 21.31215590161704, -9.558...",13.140121,"[-2.561458478497107, 41.30671703695338, 5.2005...","[-1.099706854845727, -33.06295288831144, -17.3...","[0.3963572473536583, 3.5459340511462525, -21.2...",-0.444781,"[-4.230369081630526, -0.18146767184422885, 5.6...","[-1.727313453263074, -1.2450145300495987, 5.47...",0.791679,"[-2.8613633429656575, -2.0666281546003753, 5.6...","[-4.412134611879191, -0.21556267168153598, 4.3...",-23.310262,-35.649607
4,1.0,87.0,2870.0,97.0,5100.0,1.0,63813.0,0.024473,"[-2.876076304819204, 51.02717612071745, 4.9903...","[4.373762566650416, -86.36000112696351, -0.230...","[2.3131806432063504, 15.196678360067393, -22.9...",9.074988,"[7.410160311840269, 51.54851353950857, 3.87279...","[-29.721641330500596, -10.111926419172738, -11...","[3.9818480757148653, 1.3307188655425133, -21.4...",-0.402128,"[-3.5721670209709755, -0.922298037947788, 6.38...","[-1.5984871316110245, -1.8868538285535903, 4.7...",0.844334,"[-2.282921379064816, -1.6096880007298726, 5.30...","[-3.2414530446086935, 0.3907797799067867, 3.72...",83.529292,48.865278
5,0.0,94.0,2240.0,86.0,3950.0,1.0,63813.0,0.044344,"[-1.7196494392685657, 48.09576415306349, 5.482...","[7.612508273390842, -92.42699248739596, -2.735...","[-14.307129755359684, 17.882604437744902, -15....",3.804043,"[1.7065419753762534, 21.74927167074408, 10.367...","[-0.28443036084268497, 34.954278219955434, 3.0...","[0.03570855856399541, -5.187268825601601, -22....",-0.435464,"[-3.5954687691055662, -0.8159099112692123, 6.1...","[-1.5388423189588551, -1.9022641259940432, 4.7...",0.814435,"[-3.549686297036735, -0.47094279291539953, 6.6...","[-2.7919684627826924, 1.115526799412778, 4.584...",16.612458,29.022325
6,1.0,95.0,2250.0,90.0,2290.0,1.0,63813.0,0.04005,"[-1.2866787770406098, 48.61718233851773, 5.275...","[3.01098482589832, -93.44367454803424, -4.3299...","[-10.80968295020825, 24.289476515974116, -8.08...",8.59849,"[-59.59721166719756, 68.25901085742657, 0.8341...","[-26.117041575280133, -16.555158434837708, -5....","[3.544203533973247, 2.21628353599971, -22.7441...",-0.436415,"[-4.337222281746261, -0.32154857622510136, 5.8...","[-1.8365398851614667, -1.290347570734468, 5.39...",0.806829,"[-1.582852018333703, -2.170900754952979, 4.306...","[-3.987000861873379, -0.9358636622283327, 4.63...",24.836491,69.823967
7,1.0,90.0,2330.0,82.0,5780.0,1.0,90068.0,0.019891,"[-1.4982382232462375, 51.52981632454363, 5.598...","[3.464425704945426, -89.02599053333583, -2.505...","[-9.761549826643847, 20.089941135368473, -10.4...",6.399657,"[73.09212383333082, 141.33975911928388, -0.043...","[6.142702982514196, 11.598134083024204, -57.71...","[-1.1352566174345025, -2.143495868928057, -13....",-0.40991,"[-3.277695156941292, -0.5858384941240582, 6.82...","[-1.6265256941319568, -1.9770109949962105, 5.1...",0.809999,"[-1.2336408291239036, -1.9159286765409778, 3.5...","[-3.416171086812885, -1.3422125201155781, 5.04...",88.188223,47.522762
8,0.0,93.0,2180.0,69.0,3080.0,1.0,90068.0,0.01219,"[-1.936451792497752, 52.47129390039718, 5.1912...","[4.654131016993987, -92.88386249128108, -3.151...","[-7.273017606616933, 18.68730585395476, -11.70...",5.574335,"[1.703568770960814, 49.94496796167402, 4.11224...","[0.29512209093614794, 27.999223547096697, -14....","[0.4194926750380557, -4.6678826055111395, -20....",-0.434233,"[-4.130509466348237, -0.8448318017996654, 4.45...","[-1.4119740798417069, -0.9217569273314099, 4.5...",0.81221,"[-1.778626517251866, -2.2692472856355637, 4.16...","[-3.6966753212216696, -0.3639375945530297, 4.4...",-13.436606,9.382088
9,1.0,95.0,3350.0,104.0,570.0,1.0,63813.0,0.01968,"[-1.1446768567302474, 51.39707458709522, 5.473...","[4.472360150552755, -94.61538491357287, -4.197...","[-12.090697768091669, 21.709611475591608, -7.3...",18.756122,"[34.83104771681171, 6.771882462373257, 0.03290...","[6.058034792062719, -11.408768520985134, -8.89...","[-1.0934342445343779, 0.566386835592129, -22.5...",-0.440068,"[-3.8146703606987673, -0.8006537936357024, 6.1...","[-1.6229670967454761, -1.7839892076272947, 4.9...",0.79637,"[-1.6547202899818176, -0.10666062137405985, 5....","[-2.529271633689986, 1.9948888409424443, 4.176...",4.524567,13.396622
10,1.0,92.0,2080.0,101.0,620.0,1.0,63813.0,0.016639,"[-1.7723031327314809, 51.91188379429852, 5.715...","[5.178543559526201, -90.89916338277044, -4.266...","[-9.33171393380526, 21.24068795116257, -10.502...",8.5111,"[-76.67614398898954, 66.48937425691643, 1.4143...","[-18.79276798311662, -28.605347304392097, -7.0...","[2.9153171332376355, 4.5769311745158685, -22.8...",-0.419839,"[-3.5802434334491906, -1.318874359187737, 6.61...","[-1.8175278702052378, -1.8358598364025553, 4.6...",0.820061,"[-5.7338966419796105, -0.9553590694208155, 5.5...","[-5.136931803327552, 0.303047644336052, 3.2441...",-3.735868,13.848007


In [210]:
# just testing out stuff here

print(df.columns)
# print(df[df.columns[3]].tolist())
print(len(df.columns))

from itertools import combinations
# print( list(combinations(df.columns, 2)) )
print( list(combinations([2, 3, 8], 2)) )

Index(['filename', 'main pitch event id', 'main pitch result',
       'main pitch speed', 'main pitch spin', 'main hit event id',
       'main hit speed', 'main hit spin', 'event angle start', 'event type',
       'event teamid', 'event personid', 'event eventid', 'ball time start',
       'ball pos start', 'ball vel start', 'ball acc start', 'ball time end',
       'ball pos end', 'ball vel end', 'ball acc end', 'bat time start',
       'bat pos head start', 'bat pos handle start', 'bat time end',
       'bat pos head end', 'bat pos handle end'],
      dtype='object')
27
[(2, 3), (2, 8), (3, 8)]


In [333]:
corr_matrix = df_processed.corr()
print("Correlation matrix:")
print(corr_matrix)

ValueError: could not convert string to float: 'n/a'