# Functions to extract information from json files

To do list:
----------

- [x] Create Trajectory data frame (separate)

- [x] Create a flag indicating which points have accessible trajectory data

- [ ] Add in `statsData` indicator columns for Dfs, Aces, points won, etc

- [ ] Figure out what returner and server coordinates are (location at last shot????)

In [1]:
import json
import pandas as pd
import os
import numpy as np

# Show every column and every row when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
# Use the fully qualified option name: the bare pattern "max_rows" is ambiguous
# in pandas versions that also define `styler.render.max_rows`, where it
# raises OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_rows', None)

In [2]:
def get_match_level_info(tracking_data_json, year):
    '''
    Build one match-level summary row from a parsed tracking json.

    Args:
    -----
    tracking_data_json [dict]: Parsed json; only the first entry of its
        'courtVisionData' list is read
    year: match year (copied into the row unchanged)

    Returns:
    --------
    dict of row to append into a dataframe
    '''
    match_data = tracking_data_json['courtVisionData'][0]

    # Only the first listed member of each team is used
    teams = match_data['playersData']
    first_player = teams['playerTeam'][0]
    second_player = teams['opponentTeam'][0]

    return {
        'year': year,

        # Player information
        'player1': first_player['name'],
        'player2': second_player['name'],
        'player1_id': first_player['id'],
        'player1_country': first_player['country'],
        'player1_seed': first_player['seed'],
        'player2_id': second_player['id'],
        'player2_country': second_player['country'],
        'player2_seed': second_player['seed'],

        # Meaning of the match-level point id is still unclear
        'point_id': match_data['pointId'],
        'court_name': match_data['courtName'],
        'court_id': match_data['courtId'],
        'num_sets_completed': match_data['setsCompleted'],

        # Mens/Womens Singles
        'match_type': match_data['eventType'],

        # Complete status?
        'match_status': match_data['matchStatus'],
    }
    
    
    

In [5]:
### Build one match-level row for every json file in the directory
data_list = []

for filename in os.listdir('../json_data/'):
    # Anything that is not a json file is ignored
    if not filename.endswith(".json"):
        continue

    with open('../json_data/' + filename) as file_name:
        tracking_data_json = json.load(file_name)

    # The year is the first all-digit token of the underscore-separated name
    file_year = [int(s) for s in filename.split('_') if s.isdigit()]
    data_list.append(get_match_level_info(tracking_data_json, year = file_year[0]))


# One row per match, ordered by year, saved alongside the notebook
available_matches = pd.DataFrame(data_list)
available_matches.sort_values(by=['year'], inplace = True)
available_matches.to_csv('matches_in_repo.csv')

#available_matches[available_matches['match_type'] == "Men's Singles"]

In [24]:
#available_matches[available_matches['match_type'] == "Women's Singles"]

### Notes:
-----

* I suspect `point_id` is the last point played ?
    * Ex: `2_7_6` is | Set 2 | Game 7 | 6 ? |
    * Ex: `3_9_9` is | Set 3 | Game ? Nadal vs Djokovic was game 12... | ... |

### Rough Notes

In [3]:
# Open one tracking json file (for debugging***)
# NOTE: `filename` is bound to the open file handle here, not to a path string.
with open('json_data/year_2020_SM001_tracking_data.json') as filename:
  atp_tracking_data_json = json.load(filename)

In [5]:
# Take the first entry of 'courtVisionData' and list the fields it exposes
atp_tracking_data_dict = atp_tracking_data_json['courtVisionData'][0]
atp_tracking_data_dict.keys()

dict_keys(['pointsData', 'eventType', 'matchStatus', 'playersData', 'pointId', 'isMatchComplete', 'statsData', 'courtName', 'courtId', 'setsCompleted'])

### Play-by-Play processing 

In [33]:
def categorise_serve_direction(serveBounceCordinate_y):
    '''
    Classify a serve as 'T', 'Body' or 'Wide' from the y coordinate of its bounce.

    The distance of the bounce from y = 0 is compared against thirds of 4.115
    (presumably half the width of a singles court, 8.23 m / 2 -- confirm).

    Args:
    -----
    serveBounceCordinate_y [float or None]: y coordinate of the serve bounce

    Returns:
    --------
    'T' if the bounce is within one third of y = 0 (inclusive),
    'Body' if it is strictly between one and two thirds,
    'Wide' if it is at two thirds or beyond,
    None if the coordinate is None or cannot be compared (e.g. NaN)

    Assumes Serve bounce coordinate is given in metres
    '''

    if serveBounceCordinate_y is None:
        return None

    one_third_length = 4.115/3

    # Tenuous at the moment
    # What if a player really miss-hits the ball, and it bounces to the opposite side of the court?
    # The bands are symmetric about y = 0, so only the distance from it matters.
    distance_from_centre = abs(serveBounceCordinate_y)

    if distance_from_centre <= one_third_length:
        return 'T'
    if distance_from_centre < 2*one_third_length:
        return 'Body'
    if distance_from_centre >= 2*one_third_length:
        return 'Wide'

    # Only reached when every comparison above is False (NaN input)
    return None

In [34]:
# Sanity check: a missing bounce coordinate should give None
categorise_serve_direction(None)

In [35]:
def get_point_level_info(one_point_sequence):
    '''
    Flatten the raw record of one point into a single row.

    Args:
    -----
    one_point_sequence [dict]: Raw record of one point (one entry of 'pointsData')

    Returns:
    --------
    dict of row to append into a dataframe

    Notes:
    ------
    Values are copied as found in the json (no conversion to integers yet).
    '''
    point = one_point_sequence
    missing_speed = ('0', 'NA')

    # Fall back on the return speed when the ball speed is recorded as '0' or 'NA'
    serve_speed_kph = point['ballSpeedFrench']
    if serve_speed_kph in missing_speed:
        serve_speed_kph = point['returnSpeedFrench']

    serve_speed_v2 = point['ballSpeed']
    if serve_speed_v2 in missing_speed:
        serve_speed_v2 = point['returnSpeed']

    # Flag for whether we have tracking data on this point sequence
    is_track_avail = len(point['trajectoryData']) > 0

    # Identify whether serve bounce is Body, Wide, or Down the T
    serve_bounce = point['serveBounceCordinate']
    serve_dir = categorise_serve_direction(serve_bounce['y'])

    ball_hit = point['ballHitCordinate']
    ball_bounce = point['ballBounceCordinate']
    server_pos = point['serverCordinate']
    returner_pos = point['receiverCordinate']

    # Alternative fields 'id', 'setNumber' and 'pointNumber' are deliberately not exported
    return {
        # Match situation information
        'point_ID': point['pointId'],
        'set_num': point['set'],
        'game_num': point['game'],
        'point_num': point['point'],
        'serve_num': point['serve'],

        # players involved
        'server_id': point['serverId'],
        'returner_id': point['receiverId'],
        'point_winner_id': point['scorerId'],
        'court_side': point['court'],

        # Serve Stats
        'serve_speed_kph': serve_speed_kph,
        'serve_speed_v2': serve_speed_v2,
        'serve_type': point['serveType'],
        'fault_distance_missed_ft': point['distanceOutsideCourt'],
        'fault_distance_missed_m': point['distanceOutsideCourtFrench'],

        # How point ended
        'rally_length': point['rallyLength'],
        'point_end_type': point['pointEndType'],
        'error_type': point['errorType'],
        'trapped_by_net': point['trappedByNet'],

        'strokeType': point['strokeType'],
        'hand': point['hand'],

        # 0 is ground height...height does not start on top of the net!!!
        'last_stroke_net_height_ft': point['heightAboveNet'],
        'last_stroke_net_height_m': point['heightAboveNetFrench'],

        'winner_placement': point['winnerPlacement'],
        'unforcedErrorPlacement': point['unforcedErrorPlacement'],
        'is_break_point': point['breakPoint'],
        'is_break_point_converted': point['breakPointConverted'],
        'runAroundForeHand': point['runAroundForeHand'],

        # Tracking info
        'is_track_avail': is_track_avail,

        'serveBounceCordinate_x': serve_bounce['x'],
        'serveBounceCordinate_y': serve_bounce['y'],
        'serveBounceCordinate_z': serve_bounce['z'],
        'serve_dir': serve_dir,

        # (initial) Ball coordinate on last shot
        'ballHitCordinate_x': ball_hit['x'],
        'ballHitCordinate_y': ball_hit['y'],
        'ballHitCordinate_z': ball_hit['z'],

        # Ball coordinate on its last bounce of rally
        'ballBounceCordinate_x': ball_bounce['x'],
        'ballBounceCordinate_y': ball_bounce['y'],
        'ballBounceCordinate_z': ball_bounce['z'],

        # Server and Returner coordinates
        'server_coord_x': server_pos['x'],
        'server_coord_y': server_pos['y'],
        'server_coord_z': server_pos['z'],
        'returner_coord_x': returner_pos['x'],
        'returner_coord_y': returner_pos['y'],
        'returner_coord_z': returner_pos['z'],

        # unknowns
        'spin_rpm': point['spin'],
        'cruciality': point['cruciality'],
        'returnPlacement': point['returnPlacement'],
    }

In [36]:
# Open one tracking json file
# NOTE: `filename` is bound to the open file handle here, not to a path string.
with open('json_data/year_2020_SM001_tracking_data.json') as filename:
  atp_tracking_data_json = json.load(filename)

# Pick a single point out of 'pointsData' by its key and flatten it into a row
atp_tracking_data_dict = atp_tracking_data_json['courtVisionData'][0]
point_id = '1_1_6_1'
one_point_sequence = atp_tracking_data_dict['pointsData'][point_id]


get_point_level_info(one_point_sequence)

{'point_ID': '1_1_6_1',
 'set_num': '1',
 'game_num': '1',
 'point_num': '6',
 'serve_num': '1',
 'server_id': '9801',
 'returner_id': '7792',
 'point_winner_id': '7792',
 'court_side': 'AdCourt',
 'serve_speed_kph': '190 KPH',
 'serve_speed_v2': '190 KPH',
 'serve_type': 'Flat',
 'fault_distance_missed_ft': 'NA',
 'fault_distance_missed_m': 'NA',
 'rally_length': 10,
 'point_end_type': 'Winner',
 'error_type': 'NA',
 'trapped_by_net': False,
 'strokeType': 'Ground',
 'hand': 'BackHand',
 'last_stroke_net_height_ft': '5.01 Feet',
 'last_stroke_net_height_m': '1.53 Metre',
 'winner_placement': 'Cross Court',
 'unforcedErrorPlacement': 'NA',
 'is_break_point': False,
 'is_break_point_converted': False,
 'runAroundForeHand': False,
 'is_track_avail': True,
 'serveBounceCordinate_x': 5.643,
 'serveBounceCordinate_y': -0.478,
 'serveBounceCordinate_z': 0.039,
 'serve_dir': 'T',
 'ballHitCordinate_x': 11.172,
 'ballHitCordinate_y': 2.161,
 'ballHitCordinate_z': 1.04,
 'ballBounceCordinate_x'

In [37]:
def get_match_point_level_info(raw_json_file):
    '''
    Build the point-by-point table for one match.

    Args:
    -----
    raw_json_file [dict]: Parsed tracking json; points are read from
        ['courtVisionData'][0]['pointsData']

    Returns:
    --------
    pandas DataFrame with one row per point sequence, sorted by
    set, game, point and serve number
    '''
    points_by_id = raw_json_file['courtVisionData'][0]['pointsData']

    # One flattened row per point, visited in sorted key order
    rows = [get_point_level_info(points_by_id[key]) for key in sorted(points_by_id)]
    match_point_df = pd.DataFrame(rows)

    # The ordering columns are cast to int so the sort is numeric
    order_cols = ['set_num', 'game_num', 'point_num', 'serve_num']
    match_point_df[order_cols] = match_point_df[order_cols].astype(int)

    return match_point_df.sort_values(by = order_cols)



In [38]:
# Build the point-by-point table for the json loaded above and save it as csv (row index not written)
test_match = get_match_point_level_info(atp_tracking_data_json)
test_match.to_csv('point_sequence_djokovic_nadal_2020_rolandgarros_pbp.csv', index = False)
#test_match[['point_ID','server_coord_x', 'server_coord_y', 'server_coord_z', 'returner_coord_x','returner_coord_y', 'returner_coord_z']]

#test_match[['point_end_type', 'error_type', 'fault_distance_missed_m','last_stroke_net_height_m', 'serveBounceCordinate_x', 'serveBounceCordinate_y', 'serveBounceCordinate_z']]


In [40]:
#test_match[['point_ID', 'point_end_type', 'strokeType', 'hand']]
#test_match[['is_track_avail','point_ID', 'point_end_type', 'error_type', 'serve_num', 'distance_missed_fault_ft', 'distance_missed_fault_m', 'last_stroke_net_clearance_m']]
#test_match

In [41]:
# Save Nadal vs. Federer (2019)

# Open one tracking json file (for debugging***)
# NOTE: this is an absolute path under one user's home directory; adjust it before running elsewhere.
with open('/Users/petertea/tennis_analytics/projects/roland_garros_tracking_data/collect_data/json_data_2021/year_2019_SM003_tracking_data.json') as filename:
  rafa_fed_2019_tracking_data_json = json.load(filename)

# Build the point-by-point table and save it as csv (row index not written)
rafa_fed_match = get_match_point_level_info(rafa_fed_2019_tracking_data_json)
rafa_fed_match.to_csv('point_sequence_federer_nadal_2019_rolandgarros_pbp.csv', index = False)

### Save Ball Trajectory Data

In [6]:
def save_trajectory_data_one_rally(one_point_sequence):
    '''
    Turn the ball trajectory of one point into a DataFrame.

    Args:
    -----
    one_point_sequence [dict]: Raw record of one point; must hold
        'trajectoryData' (list of records with a 'position' field) and, when
        that list is non-empty, 'pointId', 'set', 'game', 'point' and 'serve'

    Returns:
    --------
    pandas DataFrame (for one point sequence) with the trajectory columns plus
    'strike_index', 'point_ID', 'set_num', 'game_num', 'point_num' and
    'serve_num'. An empty DataFrame is returned when there is no trajectory data.

    Notes:
    ------
    Don't convert them to integers...yet
    'strike_index' is the number of 'hit' rows seen so far, so every row is
    labelled with the stroke it belongs to (1 for the first hit, 2 for the
    second, ...). Rows that come before the first 'hit' get 0.
    '''

    ball_trajectory_df = pd.DataFrame(one_point_sequence['trajectoryData'])

    if ball_trajectory_df.empty:
        return ball_trajectory_df

    #######################################################################
    #                           Strike index                              #
    #######################################################################
    # In the usual case, we expect this sequence: Hit --> Peak --> Net --> Bounce
    # But what if it's a half volley? (Hit --> Peak --> Net)
    # But what if it's a hit on the rise?  Hit --> Peak --> Net --> Bounce --> Peak
    # *** Ball trajectory also includes erroneous balls (mishits)...so we sometimes get strike_index = 1 + rally_index
    #
    # A running count of hits gives the same labels as repeating each stroke
    # number over its block of rows, and also works when the data has no
    # 'hit' row or does not start with one (the block-length approach raised
    # in both cases).
    is_hit = ball_trajectory_df['position'] == 'hit'
    ball_trajectory_df['strike_index'] = is_hit.cumsum()

    ##################################################
    #          Match situation information           #
    ##################################################
    ball_trajectory_df['point_ID'] = one_point_sequence['pointId']
    ball_trajectory_df['set_num'] = one_point_sequence['set']
    ball_trajectory_df['game_num'] = one_point_sequence['game']
    ball_trajectory_df['point_num'] = one_point_sequence['point']
    ball_trajectory_df['serve_num'] = one_point_sequence['serve']

    return ball_trajectory_df
    
    
    

In [17]:
#save_trajectory_data_one_rally(one_point_sequence)

In [7]:
def get_match_point_ball_trajectory_data(raw_json_file):
    '''
    Stack the ball trajectories of every point in one match.

    Args:
    -----
    raw_json_file [dict]: Parsed tracking json; points are read from
        ['courtVisionData'][0]['pointsData']

    Returns:
    --------
    pandas DataFrame with columns point_ID, set_num, game_num, point_num,
    serve_num, strike_index, position, x, y, z. Points without trajectory data
    are skipped; if no point has any, an empty DataFrame with those columns is
    returned. The row index restarts for every point (it is not reset).
    '''
    column_order = ['point_ID', 'set_num', 'game_num', 'point_num', 'serve_num',
                    'strike_index', 'position', 'x', 'y', 'z']

    all_tracking_data_dict = raw_json_file['courtVisionData'][0]['pointsData']

    match_ball_trajectory_list = []

    for point_id_key in sorted(all_tracking_data_dict):
        ball_trajectory_df = save_trajectory_data_one_rally( all_tracking_data_dict[point_id_key] )

        # Points without tracking data contribute nothing
        if not ball_trajectory_df.empty:
            match_ball_trajectory_list.append( ball_trajectory_df )

    # pd.concat raises on an empty list, which happens for matches where no
    # point has trajectory data
    if not match_ball_trajectory_list:
        return pd.DataFrame(columns=column_order)

    match_ball_trajectory_df = pd.concat(match_ball_trajectory_list)

    ### Reorder columns
    return match_ball_trajectory_df[column_order]


In [23]:
test_trajectory = get_match_point_ball_trajectory_data(atp_tracking_data_json)

# reset_index returns a new DataFrame, so the result has to be kept;
# this replaces the per-point indices with one running index
test_trajectory = test_trajectory.reset_index(drop = True)
test_trajectory.to_csv('ball_trajectory_djokovic_nadal_2020_rolandgarros_pbp.csv', index = False)

### For each match, check how many points have rally (trajectory) data available

In [9]:
# One summary row per json file: how many points have trajectory data
data_list = []

for filename in os.listdir('./json_data/'):
    # Anything that is not a json file is ignored
    if not filename.endswith(".json"):
        continue

    with open('./json_data/' + filename) as file_name:
        atp_tracking_data_json = json.load(file_name)

    # Point-level table; the availability flag sums to the number of tracked points
    test_match = get_match_point_level_info(atp_tracking_data_json)
    num_points_avail = test_match['is_track_avail'].sum()
    num_points = test_match.shape[0]

    # The year is the first all-digit token of the underscore-separated name
    year = [int(s) for s in filename.split('_') if s.isdigit()][0]

    # Files whose name starts with 'wta' are flagged 'wta'; everything else 'atp'
    atp_flag = 'wta' if filename.startswith('wta') else 'atp'

    # -----------------------------
    # Player information
    # -----------------------------
    tracking_data_dict = atp_tracking_data_json['courtVisionData'][0]
    player1_info = tracking_data_dict['playersData']['playerTeam']
    player2_info = tracking_data_dict['playersData']['opponentTeam']
    player1 = player1_info[0]['name']
    player2 = player2_info[0]['name']

    summary_dict = {
        'player1': player1,
        'player2': player2,
        'num_points_avail': num_points_avail,
        'num_points': num_points,
        'year': year,
        'atp_flag': atp_flag,
        'filename': filename,
    }
    data_list.append(summary_dict)




In [10]:
# Seems like we have a decent amount of available points!
# One row per json file, built from the summary dicts collected in data_list
summary_points_avail_df = pd.DataFrame(data_list)

summary_points_avail_df

Unnamed: 0,player1,player2,num_points_avail,num_points,year,atp_flag,filename
0,M.GASPARYAN,E.MERTENS,0,169,2020,wta,wta_year_2020_SD087_tracking_data.json
1,Y. HANFMANN,R. NADAL,207,209,2019,atp,year_2019_SM127_tracking_data.json
2,S. TSITSIPAS,S. WAWRINKA,517,526,2019,atp,year_2019_SM012_tracking_data.json
3,J.MUNAR,S.TSITSIPAS,335,335,2020,atp,year_2020_SM095_tracking_data.json
4,Kr.PLISKOVA,G.MUGURUZA,156,157,2020,wta,wta_year_2020_SD051_tracking_data.json
5,S.HALEP,I.BEGU,150,150,2020,wta,wta_year_2020_SD032_tracking_data.json
6,S. HALEP,I. SWIATEK,0,102,2019,wta,wta_year_2019_SD010_tracking_data.json
7,N. DJOKOVIC,D. THIEM,409,416,2019,atp,year_2019_SM002_tracking_data.json
8,D.SHAPOVALOV,G.SIMON,383,387,2020,atp,year_2020_SM088_tracking_data.json
9,F. FERRO,K. MLADENOVIC,190,196,2019,wta,wta_year_2019_SD125_tracking_data.json


In [11]:
# Save the per-file availability summary as csv (row index not written)
summary_points_avail_df.to_csv('summary_points_avail_df.csv', index=False)