In [174]:
import json
import os
import pandas as pd

# the folder location of .jsonl files you're compiling into dataframes
folder_path = 'C:/Users/linds/Downloads/wsd-data-main/wsd-hits'

# helper for reading a field from the json data whether or not that field exists
# input: a string containing a Python expression
# output: the value of the expression (if it evaluates) or "n/a" (if it raises an error)

def return_if_available(thing):
    try:
        return eval(thing)
    except:
        return "n/a"

In [175]:
# Initialise the four dataframes. Each one starts with a single row (index 0)
# holding a unit/description label for every column; data rows are appended later.
main_columns = {
    "filename": ['file'],
    "main pitch event id": ['eventId'],
    "main pitch type": ['type'],
    "main pitch result": ['result'],
    "main pitch speed": ['mph'],
    "main pitch spin": ['rpm'],
    "main hit event id": ['eventId'],
    "main hit speed": ['mph'],
    "main hit spin": ['rpm'],
    "main hit type": ['type'],
    "main hit grade": ['rating'],
}
event_columns = {
    "event angle start x": ['x'],
    "event angle start y": ['y'],
    "event type": ['hit/no hit'],
    "event teamid": ['mlbd'],
    "event personid": ['mlbid'],
    "event eventid": ['eventid'],
}
ball_columns = {
    # first ball sample
    "ball time start": ['secs'],
    "ball pos start x": ['feet'],
    "ball pos start y": ['feet'],
    "ball pos start z": ['feet'],
    "ball vel start x": ['mph'],
    "ball vel start y": ['mph'],
    "ball vel start z": ['mph'],
    "ball acc start x": ['mph/s'],
    "ball acc start y": ['mph/s'],
    "ball acc start z": ['mph/s'],
    # last ball sample
    "ball time end": ['secs'],
    "ball pos end x": ['feet'],
    "ball pos end y": ['feet'],
    "ball pos end z": ['feet'],
    "ball vel end x": ['mph'],
    "ball vel end y": ['mph'],
    "ball vel end z": ['mph'],
    "ball acc end x": ['mph/s'],
    "ball acc end y": ['mph/s'],
    "ball acc end z": ['mph/s'],
}
bat_columns = {
    # first bat sample
    "bat time start": ['secs'],
    "bat pos head start x": ['feet'],
    "bat pos head start y": ['feet'],
    "bat pos head start z": ['feet'],
    "bat pos handle start x": ['feet'],
    "bat pos handle start y": ['feet'],
    "bat pos handle start z": ['feet'],
    # last bat sample
    "bat time end": ['secs'],
    "bat pos head end x": ['feet'],
    "bat pos head end y": ['feet'],
    "bat pos head end z": ['feet'],
    "bat pos handle end x": ['feet'],
    "bat pos handle end y": ['feet'],
    "bat pos handle end z": ['feet'],
}

# one single-row dataframe per column group
main_df, event_df, ball_df, bat_df = (
    pd.DataFrame(column_group)
    for column_group in (main_columns, event_columns, ball_columns, bat_columns)
)

# never truncate columns when displaying a dataframe
pd.set_option('display.max_columns', None)
#pd.set_option('display.max_rows', None)

In [176]:
# classify hit type function

def classify_hit_type(main_pitch_result, event_angle_start_y, main_hit_speed):
    """Label a pitch outcome from its result, vertical launch angle and hit speed.

    Parameters
    ----------
    main_pitch_result : str
        Pitch result; only "HitIntoPlay" and "Strike" are classified.
    event_angle_start_y : int or float
        The angle value compared against the 10 / 25 / 50 thresholds below.
    main_hit_speed : int or float
        Hit speed; values >= 95 select the "Power" / "Hard Hit" labels.

    Returns
    -------
    str
        'Pop Up', 'Power', 'Fly Ball', 'Hard Hit Line Drive', 'Light Line Drive',
        'Hard Hit Ground Ball', 'Light Ground Ball', 'Strike', or 'none' when the
        angle or speed is not a number (e.g. "n/a" or None), the angle is NaN,
        or the pitch result is neither "HitIntoPlay" nor "Strike".
    """
    def _is_number(value):
        # JSON numbers written without a decimal point are parsed as int, so
        # accept both int and float. bool is a subclass of int and is rejected.
        return isinstance(value, (int, float)) and not isinstance(value, bool)

    # Both measurements are required, even for a "Strike" result.
    if not (_is_number(event_angle_start_y) and _is_number(main_hit_speed)):
        return 'none'

    if main_pitch_result == "Strike":
        return 'Strike'
    if main_pitch_result != "HitIntoPlay":
        return 'none'

    hard_hit = main_hit_speed >= 95

    # pop up
    if event_angle_start_y >= 50:
        return 'Pop Up'
    # power hit or fly ball depending on hit speed (25 < angle < 50)
    if event_angle_start_y > 25:
        return 'Power' if hard_hit else 'Fly Ball'
    # line drives: hard hit vs lightly hit (10 <= angle <= 25)
    if event_angle_start_y >= 10:
        return 'Hard Hit Line Drive' if hard_hit else 'Light Line Drive'
    # ground balls: hard hit vs lightly hit
    if event_angle_start_y < 10:
        return 'Hard Hit Ground Ball' if hard_hit else 'Light Ground Ball'

    # Only reachable when the angle is NaN (every comparison above is False).
    return 'none'

# classify if hit is good/bad function
def rate_hit_type(main_hit_type):
    """Grade a hit-type label.

    Returns 'n/a' for the label 'none', 'Bad' for 'Power', 'Fly Ball' and
    'Hard Hit Line Drive', and 'Good' for every other label.
    """
    if main_hit_type == 'none':
        return 'n/a'
    bad_hit_types = ('Power', 'Fly Ball', 'Hard Hit Line Drive')
    return 'Bad' if main_hit_type in bad_hit_types else 'Good'

In [177]:
# making a comprehensive dataframe (with key data from each json file)
# Append one row per .jsonl file to main_df / event_df / ball_df / bat_df.
# NOTE: rows are appended to the existing frames, so re-running this cell
# without re-running the initialisation cell appends the same files again.
# Filenames are sorted because os.listdir() returns them in arbitrary order.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.jsonl'):
        continue
    file_path = os.path.join(folder_path, filename)
    # Each file is parsed as ONE json document (json.load), not line by line.
    with open(file_path, 'r', encoding='utf-8') as json_file:
        # `data` must stay a notebook-level name: return_if_available() eval()s
        # the expression strings below and resolves `data` from the globals.
        data = json.load(json_file)

    # enter in data fields (any field that cannot be read becomes "n/a")
    main_pitch_eventid = return_if_available("data['summary_acts']['pitch']['eventId']")
    main_pitch_type = return_if_available("data['summary_acts']['pitch']['type']")
    main_pitch_result = return_if_available("data['summary_acts']['pitch']['result']")
    main_pitch_speed = return_if_available("data['summary_acts']['pitch']['speed']['mph']")
    main_pitch_spin = return_if_available("data['summary_acts']['pitch']['spin']['rpm']")
    main_hit_eventid = return_if_available("data['summary_acts']['hit']['eventId']")
    main_hit_speed = return_if_available("data['summary_acts']['hit']['speed']['mph']")
    main_hit_spin = return_if_available("data['summary_acts']['hit']['spin']['rpm']")

    # first entry of data['events']
    event_angle_start_x = return_if_available("data['events'][0]['start']['angle'][0]")
    event_angle_start_y = return_if_available("data['events'][0]['start']['angle'][1]")
    event_type = return_if_available("data['events'][0]['type']")
    event_teamid = return_if_available("data['events'][0]['teamId']['mlbId']")
    event_personid = return_if_available("data['events'][0]['personId']['mlbId']")
    event_eventid = return_if_available("data['events'][0]['eventId']")

    # first ("start") and last ("end") entries of data['samples_ball']
    ball_time_start = return_if_available("data['samples_ball'][0]['time']")
    ball_pos_start_x = return_if_available("data['samples_ball'][0]['pos'][0]")
    ball_pos_start_y = return_if_available("data['samples_ball'][0]['pos'][1]")
    ball_pos_start_z = return_if_available("data['samples_ball'][0]['pos'][2]")
    ball_vel_start_x = return_if_available("data['samples_ball'][0]['vel'][0]")
    ball_vel_start_y = return_if_available("data['samples_ball'][0]['vel'][1]")
    ball_vel_start_z = return_if_available("data['samples_ball'][0]['vel'][2]")
    ball_acc_start_x = return_if_available("data['samples_ball'][0]['acc'][0]")
    ball_acc_start_y = return_if_available("data['samples_ball'][0]['acc'][1]")
    ball_acc_start_z = return_if_available("data['samples_ball'][0]['acc'][2]")
    ball_time_end = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['time']")
    ball_pos_end_x = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][0]")
    ball_pos_end_y = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][1]")
    ball_pos_end_z = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['pos'][2]")
    ball_vel_end_x = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][0]")
    ball_vel_end_y = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][1]")
    ball_vel_end_z = return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['vel'][2]")
    ball_acc_end_x= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][0]")
    ball_acc_end_y= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][1]")
    ball_acc_end_z= return_if_available("data['samples_ball'][len(data['samples_ball']) - 1]['acc'][2]")

    # first ("start") and last ("end") entries of data['samples_bat']
    bat_time_start = return_if_available("data['samples_bat'][0]['time']")
    bat_pos_head_start_x = return_if_available("data['samples_bat'][0]['head']['pos'][0]")
    bat_pos_head_start_y = return_if_available("data['samples_bat'][0]['head']['pos'][1]")
    bat_pos_head_start_z = return_if_available("data['samples_bat'][0]['head']['pos'][2]")
    bat_pos_handle_start_x = return_if_available("data['samples_bat'][0]['handle']['pos'][0]")
    bat_pos_handle_start_y = return_if_available("data['samples_bat'][0]['handle']['pos'][1]")
    bat_pos_handle_start_z = return_if_available("data['samples_bat'][0]['handle']['pos'][2]")
    bat_time_end = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['time']")
    bat_pos_head_end_x = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][0]")
    bat_pos_head_end_y = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][1]")
    bat_pos_head_end_z = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['head']['pos'][2]")
    bat_pos_handle_end_x = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][0]")
    bat_pos_handle_end_y = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][1]")
    bat_pos_handle_end_z = return_if_available("data['samples_bat'][len(data['samples_bat']) - 1]['handle']['pos'][2]")

    # derived labels
    main_hit_type = classify_hit_type(main_pitch_result, event_angle_start_y, main_hit_speed)
    main_hit_grade = rate_hit_type(main_hit_type)

    row = {"filename": filename, "main pitch event id":main_pitch_eventid, "main pitch type":main_pitch_type, "main pitch result":main_pitch_result, "main pitch speed":main_pitch_speed, "main pitch spin":main_pitch_spin, "main hit event id":main_hit_eventid, "main hit speed":main_hit_speed, "main hit spin":main_hit_spin, "main hit type":main_hit_type, "main hit grade":main_hit_grade}
    main_df.loc[len(main_df.index)] = row # add row to end of df
    row = {"event angle start x": event_angle_start_x, "event angle start y": event_angle_start_y, "event type": event_type, "event teamid":event_teamid, "event personid":event_personid, "event eventid":event_eventid}
    event_df.loc[len(event_df.index)] = row # add row to end of df
    row = {"ball time start":ball_time_start, "ball pos start x":ball_pos_start_x, "ball pos start y":ball_pos_start_y, "ball pos start z":ball_pos_start_z, "ball vel start x":ball_vel_start_x, "ball vel start y":ball_vel_start_y, "ball vel start z":ball_vel_start_z, "ball acc start x":ball_acc_start_x, "ball acc start y":ball_acc_start_y, "ball acc start z":ball_acc_start_z, "ball time end":ball_time_end, "ball pos end x":ball_pos_end_x, "ball pos end y":ball_pos_end_y, "ball pos end z":ball_pos_end_z, "ball vel end x":ball_vel_end_x, "ball vel end y":ball_vel_end_y, "ball vel end z":ball_vel_end_z, "ball acc end x":ball_acc_end_x, "ball acc end y":ball_acc_end_y, "ball acc end z":ball_acc_end_z}
    ball_df.loc[len(ball_df.index)] = row # add row to end of df
    row = {"bat time start":bat_time_start, "bat pos head start x":bat_pos_head_start_x, "bat pos head start y":bat_pos_head_start_y, "bat pos head start z":bat_pos_head_start_z, "bat pos handle start x":bat_pos_handle_start_x, "bat pos handle start y":bat_pos_handle_start_y, "bat pos handle start z":bat_pos_handle_start_z, "bat time end":bat_time_end, "bat pos head end x":bat_pos_head_end_x, "bat pos head end y":bat_pos_head_end_y, "bat pos head end z":bat_pos_head_end_z, "bat pos handle end x":bat_pos_handle_end_x, "bat pos handle end y":bat_pos_handle_end_y, "bat pos handle end z":bat_pos_handle_end_z}
    bat_df.loc[len(bat_df.index)] = row # add row to end of df

# Combine the four frames side by side ONCE, after every file has been read.
# (Doing this inside the loop rebuilt the whole table per file, and left `df`
# undefined when the folder contained no .jsonl files.)
df = pd.concat([main_df, event_df, ball_df, bat_df], axis=1).reindex(main_df.index)



In [178]:
# Lift pandas' display limits so every column and every row is rendered,
# then show the combined dataframe as the cell output.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
df

Unnamed: 0,filename,main pitch event id,main pitch type,main pitch result,main pitch speed,main pitch spin,main hit event id,main hit speed,main hit spin,main hit type,main hit grade,event angle start x,event angle start y,event type,event teamid,event personid,event eventid,ball time start,ball pos start x,ball pos start y,ball pos start z,ball vel start x,ball vel start y,ball vel start z,ball acc start x,ball acc start y,ball acc start z,ball time end,ball pos end x,ball pos end y,ball pos end z,ball vel end x,ball vel end y,ball vel end z,ball acc end x,ball acc end y,ball acc end z,bat time start,bat pos head start x,bat pos head start y,bat pos head start z,bat pos handle start x,bat pos handle start y,bat pos handle start z,bat time end,bat pos head end x,bat pos head end y,bat pos head end z,bat pos handle end x,bat pos handle end y,bat pos handle end z
0,file,eventId,type,result,mph,rpm,eventId,mph,rpm,type,rating,x,y,hit/no hit,mlbd,mlbid,eventid,secs,feet,feet,feet,mph,mph,mph,mph/s,mph/s,mph/s,secs,feet,feet,feet,mph,mph,mph,mph/s,mph/s,mph/s,secs,feet,feet,feet,feet,feet,feet,secs,feet,feet,feet,feet,feet,feet
1,12345634_1020.jsonl,f1c5834c-ba80-419b-883f-56b665cb2e79,Sinker,Strike,89.0,2210,{},,,none,,,,,,,,0.00387,-3.019402,53.629209,5.281497,6.408269,-88.673374,0.072381,-8.980923,18.745075,-12.683055,4.262712,0.223031,58.100688,7.306118,-0.136285,32.123517,-10.852781,0.755103,-4.841392,-19.754323,-0.409264,-3.578308,-0.941109,6.825889,-1.629894,-2.093661,5.312428,0.823976,-1.077554,-2.352818,3.241268,-3.219796,-1.820186,4.835746
2,12345634_10282.jsonl,ac29b4ab-63bc-4672-a29c-f4517fd03c85,Changeup,HitIntoPlay,83.0,2350,daf9742c-869c-4370-9d66-59e217be1c89,24.0,1590,Pop Up,Good,27.482589,65.83796,Hit,63813,797796542,daf9742c-869c-4370-9d66-59e217be1c89,0.00694,-2.596849,53.288877,3.383166,3.267184,-82.847555,4.581147,-10.855688,15.832688,-21.049011,19.087453,1.85256,67.832033,3.746256,0.1377,23.803842,-14.351191,-0.137172,-3.421289,-21.20476,-0.382787,-3.321382,-1.224789,6.246467,-1.762194,-2.465486,4.390434,0.846878,-1.503568,-1.096849,4.673711,-4.174917,-0.58695,4.812109
3,12345634_10475.jsonl,8efde6c7-6ab0-40aa-a197-c1ad42bb7ee7,FourSeamFastball,HitIntoPlay,96.0,2200,62848ee8-bff2-4410-9c61-eb672c283a60,96.0,2160,Hard Hit Ground Ball,Good,37.200279,-9.146789,Hit,63813,352830460,62848ee8-bff2-4410-9c61-eb672c283a60,0.022692,-2.268567,50.971066,5.119148,6.588262,-94.775204,-3.389627,-8.737078,20.930298,-8.062493,3.424558,0.937829,-2.624121,0.167556,-38.113817,-50.563387,-13.247431,9.730987,14.90036,-19.11551,-0.440474,3.44278,-1.734959,6.780129,1.947712,-2.460268,4.622825,0.782766,1.770474,-4.242519,3.121144,2.94964,-1.82102,3.522601
4,12345634_10501.jsonl,b4727ec0-5df2-48ae-baab-1dea20f53f15,Curveball,Strike,78.0,2890,{},,,none,,,,,,,,0.019768,-3.416028,51.873311,5.122782,3.361396,-77.82572,1.00718,5.988397,14.333164,-24.465683,3.271371,-0.654045,58.655092,4.919008,-2.005755,44.563092,-9.06179,3.80621,-7.5134,-19.126702,-0.356866,-3.215408,-0.044018,6.373001,-1.546199,-1.980956,5.436373,0.846296,-1.900242,-2.50405,5.682833,-3.554418,-0.44421,5.022514
5,12345634_10636.jsonl,69931eb8-37bc-4b1c-91f8-7325b54faa72,Slider,Strike,84.0,2800,{},,,none,,,,,,,,-0.00334,-2.106492,54.542563,5.30093,1.546509,-83.591052,-0.734216,5.199078,14.972504,-21.561555,3.482975,2.212394,58.71811,4.897385,0.681248,34.070354,-11.077481,0.809986,-5.377468,-20.031893,-0.389824,-3.697975,-1.2654,6.560641,-2.041854,-1.981862,4.521229,0.809852,-1.780458,-1.455937,3.890136,-4.367648,-1.034994,4.628048
6,12345634_10962.jsonl,7139315c-faea-486f-94a4-c33e022866a9,Sinker,Strike,97.0,2300,49f533e3-075e-428b-abc3-3303e0871a34,86.0,2640,Strike,Good,-47.13422,29.57366,Hit,63813,412098649,49f533e3-075e-428b-abc3-3303e0871a34,0.015076,-2.062841,51.99718,5.335502,4.878706,-96.495549,-2.233093,-10.898125,21.615891,-9.702114,4.46758,0.067708,48.265008,7.081156,-0.73879,35.697612,-7.440813,0.070984,-5.601026,-19.996086,-0.448233,3.0862,-2.054622,7.060436,1.582198,-2.384678,4.814484,0.831608,2.770839,-1.583048,6.141535,4.736392,-0.06881,5.01951
7,12345634_11227.jsonl,444a6133-f51f-4fda-bef8-066259b9de2f,FourSeamFastball,HitIntoPlay,97.0,2260,6851a681-5242-42ff-906c-125b455c7b08,89.0,2910,Fly Ball,Bad,8.56738,36.176988,Hit,63813,797957728,6851a681-5242-42ff-906c-125b455c7b08,0.041829,-2.092848,48.239633,5.222792,3.829446,-95.622298,-3.400947,-7.812122,21.312156,-9.558819,13.140121,-2.561458,41.306717,5.200573,-1.099707,-33.062953,-17.367285,0.396357,3.545934,-21.295913,-0.444781,-4.230369,-0.181468,5.613366,-1.727313,-1.245015,5.476112,0.791679,-2.861363,-2.066628,5.613175,-4.412135,-0.215563,4.354635
8,12345634_11462.jsonl,443da200-358b-44b9-a8c2-b0a5cbe80d9b,Curveball,Strike,79.0,2920,54ca7cb0-2e2b-49ac-a782-0abd52515ce7,67.0,4190,Strike,Good,-78.757461,-44.25667,Hit,90068,459722179,54ca7cb0-2e2b-49ac-a782-0abd52515ce7,0.027719,-3.287808,50.922416,5.142105,3.97674,-78.797249,0.82135,5.650453,15.002227,-25.193983,5.265359,-0.912186,60.820065,4.299127,-2.636001,45.70726,-9.035567,2.538158,-8.971074,-17.474846,-0.358777,-3.598836,-0.963198,6.585433,-1.482871,-1.979612,5.205268,0.844224,-2.039378,-2.355384,4.333123,-3.967031,-0.438337,4.17709
9,12345634_11798.jsonl,f0de98ba-6090-408b-a259-882abf8b087d,Slider,Strike,79.0,2650,{},,,none,,,,,,,,0.025464,-1.958465,51.186708,5.200136,2.357634,-78.729499,-1.324209,7.971951,13.809189,-20.603901,19.150125,1.315992,59.679494,6.194348,-0.094086,28.113745,-13.709054,-0.788544,-4.434136,-20.839562,-0.3577,-3.780658,-0.699113,5.971335,-1.703002,-2.013671,4.800633,0.828875,-0.864876,-2.333591,3.810297,-2.625115,-0.835836,5.250278


In [179]:
# Print a one-line summary of every row that has a pitch type, a launch angle
# and a hit speed.
for index, row in df.iterrows():
    has_pitch_type = str(row['main pitch type']) != '{}'
    has_launch_angle = row['event angle start y'] != 'n/a'
    # A missing hit speed is stored as None/NaN (json null) or "n/a" (field not
    # found). The earlier comparison against the *string* 'None' never matched
    # either, so it filtered nothing.
    hit_speed = row['main hit speed']
    has_hit_speed = pd.notna(hit_speed) and hit_speed != 'n/a'
    if has_pitch_type and has_launch_angle and has_hit_speed:
        print(index, row['filename'], row['main pitch type'], row['main pitch result'], row['event angle start y'], row['main hit speed'], row['main hit type'], row['main hit grade'])

0 file type result y mph type rating
2 12345634_10282.jsonl Changeup HitIntoPlay 65.83795982962585 24.0 Pop Up Good
3 12345634_10475.jsonl FourSeamFastball HitIntoPlay -9.146789451447502 96.0 Hard Hit Ground Ball Good
6 12345634_10962.jsonl Sinker Strike 29.573659989937717 86.0 Strike Good
7 12345634_11227.jsonl FourSeamFastball HitIntoPlay 36.176988409934935 89.0 Fly Ball Bad
8 12345634_11462.jsonl Curveball Strike -44.25666999833081 67.0 Strike Good
10 12345634_11892.jsonl Cutter HitIntoPlay -35.6496074915076 97.0 Hard Hit Ground Ball Good
12 12345634_12076.jsonl Sinker HitIntoPlay -11.943482730055928 82.0 Light Ground Ball Good
14 12345634_12324.jsonl Sinker Strike 9.933994469797232 96.0 Strike Good
17 12345634_12705.jsonl Curveball HitIntoPlay 10.774237222311674 53.0 Light Line Drive Good
19 12345634_12936.jsonl Sinker Strike 48.86527817961585 86.0 Strike Good
20 12345634_14023.jsonl Slider HitIntoPlay 2.607411976003535 97.0 Hard Hit Ground Ball Good
23 12345634_1574.jsonl Sinker H

In [180]:
# finding correlations between ALL of the numbers... all of them

In [181]:
# clean up data / make data easily processable (just the numbers)

# Start from everything except the first row (the units/labels row), and remove
# the identifier / text-label columns that cannot be turned into numbers.
# (alternative kept for reference: drop only 'filename', 'main pitch event id',
#  'main hit event id', 'event personid', 'event eventid')
df_processed = df.iloc[1:].drop(
    columns=[
        'filename',
        'main pitch event id',
        'main hit type',
        'main pitch type',
        'main hit event id',
        'main hit grade',
        'event personid',
        'event eventid',
    ]
)

# encode the two remaining text columns as numbers
# main pitch result: HitIntoPlay = 1, Strike = 0 (anything else becomes NaN)
df_processed['main pitch result'] = df_processed['main pitch result'].map({'HitIntoPlay': 1, 'Strike': 0})
# event type: Hit = 1, n/a = 0 (anything else becomes NaN)
df_processed['event type'] = df_processed['event type'].map({'Hit': 1, 'n/a': 0})

# keep only the ROWS in which every column holds a numeric value
df_processed = df_processed[
    df_processed.apply(pd.to_numeric, errors='coerce').notnull().all(axis=1)
]

In [182]:
# display everything

# show the full cleaned table, with no row or column truncation
pd.options.display.max_rows = None
pd.options.display.max_columns = None
display(df_processed)

Unnamed: 0,main pitch result,main pitch speed,main pitch spin,main hit speed,main hit spin,event angle start x,event angle start y,event type,event teamid,ball time start,ball pos start x,ball pos start y,ball pos start z,ball vel start x,ball vel start y,ball vel start z,ball acc start x,ball acc start y,ball acc start z,ball time end,ball pos end x,ball pos end y,ball pos end z,ball vel end x,ball vel end y,ball vel end z,ball acc end x,ball acc end y,ball acc end z,bat time start,bat pos head start x,bat pos head start y,bat pos head start z,bat pos handle start x,bat pos handle start y,bat pos handle start z,bat time end,bat pos head end x,bat pos head end y,bat pos head end z,bat pos handle end x,bat pos handle end y,bat pos handle end z
2,1.0,83.0,2350,24.0,1590,27.482589,65.83796,1,63813,0.00694,-2.596849,53.288877,3.383166,3.267184,-82.847555,4.581147,-10.855688,15.832688,-21.049011,19.087453,1.85256,67.832033,3.746256,0.1377,23.803842,-14.351191,-0.137172,-3.421289,-21.20476,-0.382787,-3.321382,-1.224789,6.246467,-1.762194,-2.465486,4.390434,0.846878,-1.503568,-1.096849,4.673711,-4.174917,-0.58695,4.812109
3,1.0,96.0,2200,96.0,2160,37.200279,-9.146789,1,63813,0.022692,-2.268567,50.971066,5.119148,6.588262,-94.775204,-3.389627,-8.737078,20.930298,-8.062493,3.424558,0.937829,-2.624121,0.167556,-38.113817,-50.563387,-13.247431,9.730987,14.90036,-19.11551,-0.440474,3.44278,-1.734959,6.780129,1.947712,-2.460268,4.622825,0.782766,1.770474,-4.242519,3.121144,2.94964,-1.82102,3.522601
6,0.0,97.0,2300,86.0,2640,-47.13422,29.57366,1,63813,0.015076,-2.062841,51.99718,5.335502,4.878706,-96.495549,-2.233093,-10.898125,21.615891,-9.702114,4.46758,0.067708,48.265008,7.081156,-0.73879,35.697612,-7.440813,0.070984,-5.601026,-19.996086,-0.448233,3.0862,-2.054622,7.060436,1.582198,-2.384678,4.814484,0.831608,2.770839,-1.583048,6.141535,4.736392,-0.06881,5.01951
7,1.0,97.0,2260,89.0,2910,8.56738,36.176988,1,63813,0.041829,-2.092848,48.239633,5.222792,3.829446,-95.622298,-3.400947,-7.812122,21.312156,-9.558819,13.140121,-2.561458,41.306717,5.200573,-1.099707,-33.062953,-17.367285,0.396357,3.545934,-21.295913,-0.444781,-4.230369,-0.181468,5.613366,-1.727313,-1.245015,5.476112,0.791679,-2.861363,-2.066628,5.613175,-4.412135,-0.215563,4.354635
8,0.0,79.0,2920,67.0,4190,-78.757461,-44.25667,1,90068,0.027719,-3.287808,50.922416,5.142105,3.97674,-78.797249,0.82135,5.650453,15.002227,-25.193983,5.265359,-0.912186,60.820065,4.299127,-2.636001,45.70726,-9.035567,2.538158,-8.971074,-17.474846,-0.358777,-3.598836,-0.963198,6.585433,-1.482871,-1.979612,5.205268,0.844224,-2.039378,-2.355384,4.333123,-3.967031,-0.438337,4.17709
10,1.0,87.0,2870,97.0,5100,-23.310262,-35.649607,1,63813,0.024473,-2.876076,51.027176,4.990343,4.373763,-86.360001,-0.230943,2.313181,15.196678,-22.951512,9.074988,7.41016,51.548514,3.872795,-29.721641,-10.111926,-11.460872,3.981848,1.330719,-21.445486,-0.402128,-3.572167,-0.922298,6.384278,-1.598487,-1.886854,4.775092,0.844334,-2.282921,-1.609688,5.304694,-3.241453,0.39078,3.725263
12,1.0,92.0,2210,82.0,2070,-31.334474,-11.943483,1,90068,0.031799,-2.865309,49.872089,5.119521,6.389228,-91.056439,-1.383618,-12.07117,19.193081,-14.209802,22.230078,26.649101,13.837972,0.381575,13.537513,-21.478904,-17.439765,-1.031375,1.840056,-19.941108,-0.428175,-3.376889,0.066594,6.623578,-1.54428,-1.633857,5.5441,0.801726,-1.540236,-3.982284,2.257236,-3.334122,-2.155617,3.184877
14,0.0,95.0,2280,96.0,50,-60.395851,9.933994,1,63813,0.021795,-2.228282,51.099069,5.355856,5.099644,-94.695735,-2.53487,-9.516926,23.047352,-10.262659,4.66421,1.491574,48.393083,5.475655,0.400801,32.13012,-10.107157,0.355739,-3.665815,-20.727574,-0.441383,3.152427,-1.875767,7.128561,1.731314,-2.408265,4.867556,0.831847,3.101523,-1.564266,4.623905,4.294259,0.847961,5.040798
17,1.0,78.0,2920,53.0,2150,41.948324,10.774237,1,90068,-0.003723,-3.408526,54.55696,5.143427,3.683754,-77.55629,2.092421,6.609625,14.684765,-25.815119,1.836097,61.312806,62.672696,2.207236,20.431337,22.026122,-0.170359,-0.043337,-3.396565,-23.324749,-0.346938,-3.122437,-0.714694,6.08413,-1.42236,-2.482976,4.901686,0.859611,-1.350998,-1.820034,3.49773,-3.604779,-0.567391,4.373294
19,0.0,94.0,2240,86.0,3950,83.529292,48.865278,1,63813,0.044344,-1.719649,48.095764,5.48259,7.612508,-92.426992,-2.735903,-14.30713,17.882604,-15.465568,3.804043,1.706542,21.749272,10.367554,-0.28443,34.954278,3.019046,0.035709,-5.187269,-22.501712,-0.435464,-3.595469,-0.81591,6.138747,-1.538842,-1.902264,4.722633,0.814435,-3.549686,-0.470943,6.663956,-2.791968,1.115527,4.584476


In [183]:
# Build the Pearson correlation table of every column against every other column.
# Note: the "event type" row/column comes out as NaN. In the rows displayed above
# every remaining "event type" value is 1, and the correlation with a constant
# column is undefined (zero variance), so this is expected and can be ignored.

corr_matrix = df_processed.corr(method='pearson', numeric_only = False)

# styling correlation df
def cond_formatting(x):
    """Return the CSS used to highlight a strong correlation.

    A value whose magnitude is greater than 0.6 gets a light-green background;
    anything else (including NaN) gets no styling (None).
    """
    is_strong = abs(x) > 0.6
    return 'background-color: lightgreen' if is_strong else None
    
# Apply cond_formatting to every cell and display the highlighted table.
# NOTE: this rebinds `corr_matrix` from the DataFrame returned by .corr() to the
# styled object returned by `.style.map(...)`; re-running only these lines would
# then call `.style` on the styled object (presumably an error - re-run the
# .corr() line first).
corr_matrix = corr_matrix.style.map(cond_formatting)
display(corr_matrix)

Unnamed: 0,main pitch result,main pitch speed,main pitch spin,main hit speed,main hit spin,event angle start x,event angle start y,event type,event teamid,ball time start,ball pos start x,ball pos start y,ball pos start z,ball vel start x,ball vel start y,ball vel start z,ball acc start x,ball acc start y,ball acc start z,ball time end,ball pos end x,ball pos end y,ball pos end z,ball vel end x,ball vel end y,ball vel end z,ball acc end x,ball acc end y,ball acc end z,bat time start,bat pos head start x,bat pos head start y,bat pos head start z,bat pos handle start x,bat pos handle start y,bat pos handle start z,bat time end,bat pos head end x,bat pos head end y,bat pos head end z,bat pos handle end x,bat pos handle end y,bat pos handle end z
main pitch result,1.0,-0.091295,-0.005582,0.363987,-0.127317,-0.15146,-0.211553,,-0.001715,0.01179,-0.00066,0.000488,-0.155999,-0.059736,0.094682,0.09942,0.033067,-0.054005,-0.130001,0.482869,-0.014634,0.206716,-0.321551,0.162166,-0.656677,0.013974,-0.179298,0.539216,-0.140055,-0.001489,-0.023183,0.11812,-0.037175,-0.044336,0.028567,-0.005621,-0.020186,-0.072078,0.007269,0.064747,-0.056289,0.087578,-0.274675
main pitch speed,-0.091295,1.0,-0.035054,0.109768,-0.128855,-0.041593,0.069277,,-0.092426,0.125825,0.018041,-0.209654,0.166849,0.253675,-0.997637,-0.735303,-0.498528,0.837768,0.833124,-0.120313,-0.022104,-0.027097,0.038355,-0.045086,0.049175,-0.001703,0.055539,-0.034922,0.014289,-0.060217,0.102831,0.025566,0.077101,0.142968,0.046133,0.129934,-0.149967,0.138084,-0.044844,0.049404,0.142401,0.019688,0.091513
main pitch spin,-0.005582,-0.035054,1.0,-0.126748,0.006744,0.083332,-0.056479,,-0.069077,-0.028628,-0.075034,0.031565,0.108754,0.014285,0.033116,0.055888,0.12954,-0.042782,-0.061944,0.032777,-0.036952,-0.07691,-0.011358,0.032648,0.043808,0.044642,-0.041649,-0.089264,-0.093138,-0.055615,-0.102754,0.071165,-0.097312,-0.082168,0.077911,-0.00966,-0.055242,-0.066647,0.01205,-0.036129,-0.090723,-0.022618,-0.05223
main hit speed,0.363987,0.109768,-0.126748,1.0,0.043732,-0.128144,-0.05838,,-0.045307,0.066636,0.056147,-0.073687,0.041431,-0.018681,-0.106639,-0.144474,-0.004833,0.099355,0.123057,0.183591,-0.083757,0.213843,-0.158781,-0.030222,-0.31976,-0.058726,0.006855,0.292994,-0.019259,0.086342,0.01228,0.084604,0.109512,0.025291,-0.132564,0.181247,0.115777,-0.048416,-0.144758,0.198793,0.022982,0.062718,0.004296
main hit spin,-0.127317,-0.128855,0.006744,0.043732,1.0,-0.097174,0.041586,,0.092434,0.07604,-0.043482,-0.062409,0.023164,0.041582,0.132489,0.029855,-0.003279,-0.094599,-0.10926,-0.04151,0.106371,-0.020146,0.090849,0.044955,0.098168,-0.145132,-0.046962,-0.08602,0.076682,-0.08356,-0.047536,0.083623,-0.050351,-0.043854,-0.012102,-0.024459,-0.136684,-0.00629,-0.033912,-0.061549,-0.059303,-0.068792,0.054861
event angle start x,-0.15146,-0.041593,0.083332,-0.128144,-0.097174,1.0,0.234854,,0.092145,-0.146804,0.014894,0.148761,0.022976,-0.047107,0.03967,0.073081,0.133297,0.007637,-0.04663,-0.147664,0.110564,0.012702,0.006168,-0.021602,0.14076,-0.084607,0.038312,-0.112008,0.075737,0.001511,-0.286906,-0.047067,-0.086191,-0.291073,0.113418,-0.023113,-0.031253,-0.221273,-0.01247,-0.001147,-0.277323,-0.084338,0.126097
event angle start y,-0.211553,0.069277,-0.056479,-0.05838,0.041586,0.234854,1.0,,0.067716,-0.036277,0.010478,0.024168,0.051281,-0.002582,-0.068372,0.031584,0.027553,0.096714,0.174415,-0.047341,0.128824,0.112961,-0.094311,-0.076171,0.126278,-0.310884,0.149553,-0.06482,0.275415,0.106071,-0.035028,0.05658,0.0884,-0.025909,0.058298,0.118372,0.08387,-0.026338,-0.033442,-0.037953,-0.005494,-0.178397,0.304815
event type,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
event teamid,-0.001715,-0.092426,-0.069077,-0.045307,0.092434,0.092145,0.067716,,1.0,-0.131278,-0.010612,0.138316,0.158785,0.032017,0.086734,0.110514,-0.128613,0.033142,-0.170256,0.100868,0.159024,-0.034932,-0.10985,0.055225,0.031662,-0.046149,0.073318,-0.025764,0.054564,-0.120027,-0.368191,0.242174,-0.1642,-0.362146,0.292706,-0.019033,-0.1247,-0.36831,-0.027755,-0.015658,-0.404911,-0.102329,0.048213
ball time start,0.01179,0.125825,-0.028628,0.066636,0.07604,-0.146804,-0.036277,,-0.131278,1.0,-0.019088,-0.994701,-0.016917,0.111843,-0.080888,-0.219013,-0.08121,0.053238,0.145181,0.001622,0.09918,0.026062,-0.09394,-0.073498,0.042838,-0.077325,0.066182,-0.0441,0.092722,-0.015652,0.136439,0.004157,0.047796,0.150054,-0.056229,0.04559,-0.033963,0.118619,-0.0246,0.005597,0.147289,-0.049729,0.062491


In [184]:
# download styled dataframe as an html file

download_location = "C:/Users/linds/Downloads/"

# Export is opt-in so a new chart is not written every time the notebook runs.
# Set SAVE_CORR_HTML = True to write the styled correlation table to disk.
SAVE_CORR_HTML = False

if SAVE_CORR_HTML:
    # `corr_matrix` is expected to be the styled table built in the cell above
    corr_html = corr_matrix.to_html()
    # `with` closes the file even if the write fails
    with open(download_location + "corr_styled_df.html", "w", encoding="utf-8") as html_file:
        html_file.write(corr_html)
    # corr_matrix.to_csv(r'' + download_location + 'main_df.csv')

'\ncorr_html = corr_matrix.to_html()\nfile = open(download_location + "corr_styled_df.html", "w")\nfile.write(corr_html)\nfile.close()\n# corr_matrix.to_csv(r\'\' + download_location + \'main_df.csv\')\n'

In [222]:
# finding correlations between pitch type, hit type, good/bad hit

In [226]:
# clean up data

df_light = df[['main pitch type', 'main hit type', 'main hit grade']].copy()
df_light = df_light.drop(index=[0]) # drop the first row (units/sample data)

# A row is unusable when ANY of its three values is a missing-data marker, or
# when its pitch type is an empty dict. (The earlier loop overwrote its flag on
# every column, so only the last column was actually checked.)
missing_markers = ('none', 'n/a')
rows_to_drop = [
    row_index
    for row_index, row in df_light.iterrows()
    if str(row['main pitch type']) == '{}'
    or any(col_data in missing_markers for col_data in row)
]
# drop all unusable rows in one call instead of mutating the frame mid-iteration
df_light = df_light.drop(index=rows_to_drop)
df_light = df_light.reset_index(drop=True)
display(df_light)

Unnamed: 0,main pitch type,main hit type,main hit grade
0,Changeup,Pop Up,Good
1,FourSeamFastball,Hard Hit Ground Ball,Good
2,Sinker,Strike,Good
3,FourSeamFastball,Fly Ball,Bad
4,Curveball,Strike,Good
5,Cutter,Hard Hit Ground Ball,Good
6,Sinker,Light Ground Ball,Good
7,Sinker,Strike,Good
8,Curveball,Light Line Drive,Good
9,Sinker,Strike,Good


In [248]:
# method 1: each type is enumerated

# Encode the three label columns as numbers:
#   - pitch type and hit type become integer category codes
#   - hit grade becomes Good = 1, Bad = 0
df_method1 = df_light.assign(**{
    'main pitch type': df_light['main pitch type'].astype('category').cat.codes,
    'main hit type': df_light['main hit type'].astype('category').cat.codes,
    'main hit grade': df_light['main hit grade'].map({'Good': 1, 'Bad': 0}),
})

# Pearson correlation table of the three encoded columns.
# NOTE(review): category codes are just numeric labels for the categories, so
# correlations involving the two "type" columns depend on how the labels happen
# to be numbered - interpret with care.
corr_matrix = df_method1.corr(method='pearson')

# styling correlation df
def cond_formatting(x):
    """Return the CSS used to highlight a strong correlation.

    A value whose magnitude is greater than 0.6 gets a light-green background;
    anything else (including NaN) gets no styling (None).
    """
    is_strong = abs(x) > 0.6
    return 'background-color: lightgreen' if is_strong else None
    
# Run the highlight rule over every cell of the correlation table and render it.
# Note: this rebinds `corr_matrix` from a DataFrame to a pandas Styler.
corr_matrix = corr_matrix.style.map(func=cond_formatting)
display(corr_matrix)

Unnamed: 0,main pitch type,main hit type,main hit grade
main pitch type,1.0,0.231416,0.014219
main hit type,0.231416,1.0,0.470082
main hit grade,0.014219,0.470082,1.0


In [246]:
# method 2: make a category for each hit type/pitch type (one-hot encoding)

# Labels in order of first appearance; each one becomes a 0/1 indicator column.
pitch_type_cols = df_light['main pitch type'].unique()
hit_type_cols = df_light['main hit type'].unique()

cols = list(pitch_type_cols) + list(hit_type_cols)
print(pitch_type_cols)
print(hit_type_cols)

# A label present in both columns would produce two indicator columns with the
# same name, which makes the correlation table ambiguous.
assert not set(pitch_type_cols) & set(hit_type_cols), "pitch and hit types share a label"

# Derive the indicator columns from whatever labels the data contains instead of
# a hard-coded list of names and a fixed-length row of zeros, so new or missing
# categories no longer break the cell. `reindex` keeps the first-appearance
# column order (get_dummies sorts its columns).
pitch_dummies = pd.get_dummies(df_light['main pitch type'], dtype=int).reindex(
    columns=list(pitch_type_cols), fill_value=0
)
hit_dummies = pd.get_dummies(df_light['main hit type'], dtype=int).reindex(
    columns=list(hit_type_cols), fill_value=0
)
df_method2 = pd.concat([pitch_dummies, hit_dummies], axis=1)

# add main hit grade (from df_light) to this dataframe
hit_grade = df_light['main hit grade'].map({'Good': 1, 'Bad': 0}) # for main hit grade, all Good = 1, Bad = 0
df_method2 = pd.concat([df_method2, hit_grade], axis=1)

corr_matrix = df_method2.corr(method='pearson')

# styling correlation df
# NOTE(review): `cond_formatting` is also defined in the "method 1" cell; consider
# defining it once in a shared cell so the two copies cannot drift apart.
def cond_formatting(x, threshold=0.6):
    """Return the CSS used to highlight a strongly correlated cell.

    Parameters:
        x: a single correlation value from the table.
        threshold: magnitude that must be strictly exceeded for the cell to be
            highlighted (default 0.6).

    Returns:
        'background-color: lightgreen' when x > threshold or x < -threshold,
        otherwise None (no styling). NaN compares False both ways, so it is
        never highlighted.
    """
    if abs(x) > threshold:
        return 'background-color: lightgreen'
    return None
    
# Run the highlight rule over every cell of the correlation table and render it.
# Note: this rebinds `corr_matrix` from a DataFrame to a pandas Styler.
corr_matrix = corr_matrix.style.map(func=cond_formatting)
display(corr_matrix)

Unnamed: 0,Changeup,FourSeamFastball,Sinker,Curveball,Cutter,Slider,Pop Up,Hard Hit Ground Ball,Strike,Fly Ball,Light Ground Ball,Light Line Drive,Hard Hit Line Drive,Power


['Changeup' 'FourSeamFastball' 'Sinker' 'Curveball' 'Cutter' 'Slider']
['Pop Up' 'Hard Hit Ground Ball' 'Strike' 'Fly Ball' 'Light Ground Ball'
 'Light Line Drive' 'Hard Hit Line Drive' 'Power']


Unnamed: 0,Changeup,FourSeamFastball,Sinker,Curveball,Cutter,Slider,Pop Up,Hard Hit Ground Ball,Strike,Fly Ball,Light Ground Ball,Light Line Drive,Hard Hit Line Drive,Power,main hit grade
Changeup,1.0,-0.140422,-0.241558,-0.090642,-0.073002,-0.178185,0.13036,-0.009563,-0.283282,0.05835,0.307802,-0.090642,0.170453,-0.050936,-0.11476
FourSeamFastball,-0.140422,1.0,-0.344046,-0.129099,-0.103975,-0.253785,-0.103975,0.095346,-0.195973,0.100301,-0.053838,0.258199,0.089443,-0.072548,-0.095346
Sinker,-0.241558,-0.344046,1.0,-0.22208,-0.178861,-0.436568,-0.058592,-0.033935,0.212665,-0.055927,-0.028742,-0.22208,-0.015917,0.043034,0.033935
Curveball,-0.090642,-0.129099,-0.22208,1.0,-0.067116,-0.163817,-0.067116,0.010258,0.029765,-0.090642,0.046337,0.097222,-0.057735,-0.046829,0.123091
Cutter,-0.073002,-0.103975,-0.178861,-0.067116,1.0,-0.131936,-0.054054,0.06196,0.023972,-0.073002,-0.083967,0.15101,-0.046499,-0.037716,0.099136
Slider,-0.178185,-0.253785,-0.436568,-0.163817,-0.131936,1.0,0.138881,-0.076413,0.088537,0.030814,-0.111464,-0.051732,-0.113496,0.096903,-0.006368
Pop Up,0.13036,-0.103975,-0.058592,-0.067116,-0.054054,0.138881,1.0,-0.099136,-0.209756,-0.073002,-0.083967,-0.067116,-0.046499,-0.037716,0.099136
Hard Hit Ground Ball,-0.009563,0.095346,-0.033935,0.010258,0.06196,-0.076413,-0.099136,1.0,-0.384697,-0.133887,-0.153998,-0.123091,-0.08528,-0.069171,0.181818
Strike,-0.283282,-0.195973,0.212665,0.029765,0.023972,0.088537,-0.209756,-0.384697,1.0,-0.283282,-0.325834,-0.260441,-0.180439,-0.146355,0.384697
Fly Ball,0.05835,0.100301,-0.055927,-0.090642,-0.073002,0.030814,-0.073002,-0.133887,-0.283282,1.0,-0.113401,-0.090642,-0.062799,-0.050936,-0.736379
