# Functions to extract information from json tracking files

### All of these functions have been moved into a more concise module (`process.py`) in the source directory.

To do list:
----------

- [x] Get pre-match level information
- [x] Get play-by-play information
- [x] Create Trajectory data frame (separate)

- [x] Create a flag marking the points for which accessible trajectory data exist

- [ ] Add in `statsData` indicator columns for Dfs, Aces, points won, etc

- [ ] Figure out what returner and server coordinates are (location at last shot????)

In [5]:
import json
import pandas as pd
import os
import numpy as np

# Show every column and every row when a DataFrame is displayed.
# Full option names are used on purpose: an abbreviated name such as "max_rows"
# is treated as a regular expression and raises an OptionError as soon as it
# matches more than one registered pandas option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [6]:
def get_match_level_info(tracking_data_json, year):
    '''
    Summarise one match (player names, ids, countries, seeds, court, status).

    Args:
    -----
    tracking_data_json [dict]: parsed json tracking file
    year: match year, copied as-is into the result

    Returns:
    --------
    dict holding one row to append into a dataframe
    '''

    # Everything of interest sits in the first entry of 'courtVisionData'
    court_vision = tracking_data_json['courtVisionData'][0]

    # Each team is a list; only its first entry is read
    team_one = court_vision['playersData']['playerTeam']
    team_two = court_vision['playersData']['opponentTeam']

    return {
        # -----------------------------
        # Player information
        # -----------------------------
        'year': year,
        'player1': team_one[0]['name'],
        'player2': team_two[0]['name'],
        'player1_id': team_one[0]['id'],
        'player1_country': team_one[0]['country'],
        'player1_seed': team_one[0]['seed'],
        'player2_id': team_two[0]['id'],
        'player2_country': team_two[0]['country'],
        'player2_seed': team_two[0]['seed'],

        # -----------------------------
        # Match information
        # -----------------------------
        # Meaning of 'pointId' at match level is not established (last point played?)
        'point_id': court_vision['pointId'],
        'court_name': court_vision['courtName'],
        'court_id': court_vision['courtId'],
        'num_sets_completed': court_vision['setsCompleted'],

        # Mens/Womens Singles
        'match_type': court_vision['eventType'],

        # Complete status?
        'match_status': court_vision['matchStatus'],
    }
    
    
    

In [4]:
### Build a table with one row per json tracking file found in the data directory
json_dir = '../../json_data/'
data_list = []

for filename in os.listdir(json_dir):
    # Anything that is not a json file is ignored
    if not filename.endswith(".json"):
        continue

    with open(json_dir + filename) as json_handle:
        tracking_data_json = json.load(json_handle)

    # The year is the first all-digit token of the underscore-separated file name
    year_tokens = [int(token) for token in filename.split('_') if token.isdigit()]

    match_row = get_match_level_info(tracking_data_json, year=year_tokens[0])
    match_row['filename'] = filename
    data_list.append(match_row)

available_matches = pd.DataFrame(data_list)
available_matches.sort_values(by=['year', 'match_type'], inplace=True)
available_matches.to_csv('matches_in_repo.csv', index=False)
#available_matches.head()
available_matches[available_matches['match_type'] == "Men's Singles"].head()

Unnamed: 0,year,player1,player2,player1_id,player1_country,player1_seed,player2_id,player2_country,player2_seed,point_id,court_name,court_id,num_sets_completed,match_type,match_status,filename
76,2018,R. NADAL,J. DEL POTRO,7792,ESP,1.0,11713,ARG,5.0,1_9_9,Court Philippe-Chatrier,1,3,Men's Singles,C,year_2018_SM002_tracking_data.json
1,2019,Y. HANFMANN,R. NADAL,22568,GER,,7792,ESP,2.0,3_9_6,Court Philippe-Chatrier,1,3,Men's Singles,I,year_2019_SM127_tracking_data.json
2,2019,S. TSITSIPAS,S. WAWRINKA,35398,GRE,6.0,7815,SUI,24.0,5_9_5,Court Suzanne-Lenglen,2,5,Men's Singles,C,year_2019_SM012_tracking_data.json
7,2019,N. DJOKOVIC,D. THIEM,9801,SRB,1.0,26348,AUT,4.0,5_9_8,Court Philippe-Chatrier,1,5,Men's Singles,I,year_2019_SM002_tracking_data.json
10,2019,N. DJOKOVIC,H. LAAKSONEN,9801,SRB,1.0,19041,SUI,,3_9_6,Court Suzanne-Lenglen,2,3,Men's Singles,C,year_2019_SM032_tracking_data.json


In [4]:
# Preview the first rows whose match_type is "Women's Singles"
available_matches[available_matches['match_type'] == "Women's Singles"].head()

Unnamed: 0,year,player1,player2,player1_id,player1_country,player1_seed,player2_id,player2_country,player2_seed,point_id,court_name,court_id,num_sets_completed,match_type,match_status,filename
6,2019,S. HALEP,I. SWIATEK,18033,ROU,3.0,40613,POL,,2_6_6,Court Philippe-Chatrier,1,2,Women's Singles,C,wta_year_2019_SD010_tracking_data.json
9,2019,F. FERRO,K. MLADENOVIC,30524,FRA,,19921,FRA,,2_9_9,Court Suzanne-Lenglen,2,2,Women's Singles,C,wta_year_2019_SD125_tracking_data.json
14,2019,S. HALEP,L. TSURENKO,18033,ROU,3.0,21519,UKR,27.0,2_7_8,Court Philippe-Chatrier,1,2,Women's Singles,C,wta_year_2019_SD020_tracking_data.json
18,2019,S. CIRSTEA,K. JUVAN,14440,ROU,,40087,SLO,,3_9_4,Court Suzanne-Lenglen,2,3,Women's Singles,C,wta_year_2019_SD095_tracking_data.json
23,2019,S. STEPHENS,M. DOI,21500,USA,7.0,17949,JPN,,2_9_8,Court Suzanne-Lenglen,2,2,Women's Singles,C,wta_year_2019_SD096_tracking_data.json


### Notes:
-----

* I suspect `point_id` is the last point played ?
    * Ex: `2_7_6` is | Set 2 | Game 7 | 6 ? |
    * Ex: `3_9_9` is | Set 3 | Game ? Nadal vs Djokovic was game 12... | ... |

### Rough Notes

In [8]:
# Open one tracking json file (for debugging***)
# The parsed content is kept in `atp_tracking_data_json` for the cells that follow
with open('../../json_data/year_2019_SM012_tracking_data.json') as filename:
  atp_tracking_data_json = json.load(filename)

In [9]:
# Take the first entry of 'courtVisionData' and list the fields it provides
atp_tracking_data_dict = atp_tracking_data_json['courtVisionData'][0]
atp_tracking_data_dict.keys()

dict_keys(['isMatchComplete', 'eventType', 'courtName', 'courtId', 'pointsData', 'playersData', 'statsData', 'setsCompleted', 'pointId', 'matchStatus'])

In [10]:
#atp_tracking_data_dict['pointsData']['1_2_7_2']
# Look up a single point by its id string; the id must exist in this match's
# 'pointsData', otherwise the lookup raises KeyError
atp_tracking_data_dict['pointsData']['1_2_7_2']

KeyError: '1_2_7_2'

### Play-by-Play processing 

In [5]:
def categorise_serve_direction(serveBounceCordinate_y):
    '''
    Classify a serve bounce as Wide, Body, or T from its lateral coordinate.

    Args:
    -----
    serveBounceCordinate_y [number or None]: y coordinate of the serve bounce

    Returns:
    --------
    'T', 'Body' or 'Wide'; None when the coordinate is None or cannot be
    ordered against the band limits (e.g. NaN)

    Notes:
    ------
    Assumes the serve bounce coordinate is given in metres.
    (0,0,0) is taken to be the middle of the net.
    A singles court is 23.77 m long and 8.23 m wide. The y coordinate is
    treated here as the distance from the centre line, across the width
    (assumption -- the trajectory samples shown in this notebook have x
    running along the length of the court; confirm for other data sources).

    Each half-width (4.115 m) is split into three equal bands:
    the band nearest the centre line is 'T', the middle band is 'Body',
    and everything further out is 'Wide'.
    '''

    if serveBounceCordinate_y is None:
        return None

    # Half the width of the 8.23 m singles court, split into three bands
    one_third_length = 4.115/3

    # The bands are mirrored about the centre line, so only the distance matters
    # Tenuous at the moment
    # What if a player really miss-hits the ball, and it bounces to the opposite side of the court?
    lateral_distance = abs(serveBounceCordinate_y)

    if lateral_distance <= one_third_length:
        return 'T'
    if lateral_distance < 2*one_third_length:
        return 'Body'
    if lateral_distance >= 2*one_third_length:
        return 'Wide'

    # No comparison held (e.g. NaN)
    return None

In [6]:
# Quick checks: a missing coordinate gives None, a bounce at y = 0 is classed as 'T'
categorise_serve_direction(None)
categorise_serve_direction(0)

'T'

In [7]:
def get_point_level_info(one_point_sequence):
    '''
    Flatten the raw record of one point sequence into a single row.

    Args:
    -----
    one_point_sequence [dict]: one entry of a match's 'pointsData'

    Returns:
    --------
    dict holding one row to append into a dataframe

    Notes:
    ------
    Besides copying the match situation, serve and rally fields, the row holds:
      * the serve speed, replaced by the return speed when coded '0' or 'NA'
      * `is_track_avail`: whether 'trajectoryData' holds any sample
      * serve contact location, peak height and net clearance, read from the
        first three trajectory samples when they carry the expected labels
      * `serve_dir`: the serve bounce classified as Wide, Body or T
    'Index Error...' is printed whenever one of the first three trajectory
    samples does not exist.
    '''

    # -- Serve speed (the 'French' fields and the plain fields are both kept)
    serve_speed_kph = one_point_sequence['ballSpeedFrench']
    if serve_speed_kph == '0' or serve_speed_kph == 'NA':
        serve_speed_kph = one_point_sequence['returnSpeedFrench']

    serve_speed_kph_v2 = one_point_sequence['ballSpeed']
    if serve_speed_kph_v2 == '0' or serve_speed_kph_v2 == 'NA':
        serve_speed_kph_v2 = one_point_sequence['returnSpeed']

    # -- Flag for whether we have tracking data on this point sequence
    trajectory = one_point_sequence['trajectoryData']
    is_track_avail = len(trajectory) != 0

    def labelled_sample(index, label):
        # Trajectory sample at `index`, or None when it is absent or labelled differently
        try:
            sample = trajectory[index]
        except IndexError:
            print('Index Error...')
            return None
        if sample['position'] == label:
            return sample
        return None

    # -- Defaults used when there is no (usable) tracking data
    z_net_serve = None
    x_ball_at_serve = None
    y_ball_at_serve = None
    z_ball_at_serve = None
    z_peak_serve = None

    if is_track_avail:
        # Serve net clearance: third sample, expected to be labelled 'net'
        net_sample = labelled_sample(2, 'net')
        if net_sample is not None:
            z_net_serve = net_sample['z']

        # Ball location at contact of serve: first sample, expected to be labelled 'hit'
        hit_sample = labelled_sample(0, 'hit')
        if hit_sample is not None:
            x_ball_at_serve = hit_sample['x']
            y_ball_at_serve = hit_sample['y']
            z_ball_at_serve = hit_sample['z']

        # Max (peak) ball height of serve: second sample, expected to be labelled 'peak'
        peak_sample = labelled_sample(1, 'peak')
        if peak_sample is not None:
            z_peak_serve = peak_sample['z']

    # -- Identify whether serve bounce is Body, Wide, or Down the T
    serve_bounce = one_point_sequence['serveBounceCordinate']
    serve_dir = categorise_serve_direction(serve_bounce['y'])

    return {
        # Match situation information
        'point_ID': one_point_sequence['pointId'],
        'set_num': one_point_sequence['set'],
        'game_num': one_point_sequence['game'],
        'point_num': one_point_sequence['point'],
        'serve_num': one_point_sequence['serve'],

        # Players involved
        'server_id': one_point_sequence['serverId'],
        'returner_id': one_point_sequence['receiverId'],
        'point_winner_id': one_point_sequence['scorerId'],
        'court_side': one_point_sequence['court'],

        # Serve stats
        'serve_speed_kph': serve_speed_kph,
        'serve_speed_kph_v2': serve_speed_kph_v2,
        'serve_type': one_point_sequence['serveType'],
        'fault_distance_missed_ft': one_point_sequence['distanceOutsideCourt'],
        'fault_distance_missed_m': one_point_sequence['distanceOutsideCourtFrench'],
        'x_ball_at_serve': x_ball_at_serve,
        'y_ball_at_serve': y_ball_at_serve,
        'z_ball_at_serve': z_ball_at_serve,

        # How point ended
        'rally_length': one_point_sequence['rallyLength'],
        'point_end_type': one_point_sequence['pointEndType'],
        'error_type': one_point_sequence['errorType'],
        'trapped_by_net': one_point_sequence['trappedByNet'],

        'strokeType': one_point_sequence['strokeType'],
        'hand': one_point_sequence['hand'],

        # 0 is ground height...height does not start on top of the net!!!
        'last_stroke_net_height_ft': one_point_sequence['heightAboveNet'],
        'last_stroke_net_height_m': one_point_sequence['heightAboveNetFrench'],

        'winner_placement': one_point_sequence['winnerPlacement'],
        'unforcedErrorPlacement': one_point_sequence['unforcedErrorPlacement'],
        'is_break_point': one_point_sequence['breakPoint'],
        'is_break_point_converted': one_point_sequence['breakPointConverted'],
        'runAroundForeHand': one_point_sequence['runAroundForeHand'],

        # Tracking info
        'is_track_avail': is_track_avail,

        'serveBounceCordinate_x': serve_bounce['x'],
        'serveBounceCordinate_y': serve_bounce['y'],
        'serveBounceCordinate_z': serve_bounce['z'],
        'serve_dir': serve_dir,
        'z_net_serve': z_net_serve,
        'z_peak_serve': z_peak_serve,

        # (initial) Ball coordinate on last shot
        'ballHitCordinate_x': one_point_sequence['ballHitCordinate']['x'],
        'ballHitCordinate_y': one_point_sequence['ballHitCordinate']['y'],
        'ballHitCordinate_z': one_point_sequence['ballHitCordinate']['z'],

        # Ball coordinate on its last bounce of rally
        'ballBounceCordinate_x': one_point_sequence['ballBounceCordinate']['x'],
        'ballBounceCordinate_y': one_point_sequence['ballBounceCordinate']['y'],
        'ballBounceCordinate_z': one_point_sequence['ballBounceCordinate']['z'],

        # Server and Returner coordinates
        'server_coord_x': one_point_sequence['serverCordinate']['x'],
        'server_coord_y': one_point_sequence['serverCordinate']['y'],
        'server_coord_z': one_point_sequence['serverCordinate']['z'],
        'returner_coord_x': one_point_sequence['receiverCordinate']['x'],
        'returner_coord_y': one_point_sequence['receiverCordinate']['y'],
        'returner_coord_z': one_point_sequence['receiverCordinate']['z'],

        # Unknowns
        'spin_rpm': one_point_sequence['spin'],
        'cruciality': one_point_sequence['cruciality'],
        'returnPlacement': one_point_sequence['returnPlacement'],
    }

In [9]:
# Open one tracking json file
with open('../json_data/year_2020_SM001_tracking_data.json') as filename:
  atp_tracking_data_json = json.load(filename)

# Pick one point by its id string and flatten it into a single row
atp_tracking_data_dict = atp_tracking_data_json['courtVisionData'][0]
point_id = '1_1_8_2'
one_point_sequence = atp_tracking_data_dict['pointsData'][point_id]


get_point_level_info(one_point_sequence)

{'point_ID': '1_1_8_2',
 'set_num': '1',
 'game_num': '1',
 'point_num': '8',
 'serve_num': '2',
 'server_id': '9801',
 'returner_id': '7792',
 'point_winner_id': '7792',
 'court_side': 'AdCourt',
 'serve_speed_kph': '134 KPH',
 'serve_speed_kph_v2': '134 KPH',
 'serve_type': 'Unclassified',
 'fault_distance_missed_ft': 'NA',
 'fault_distance_missed_m': 'NA',
 'x_ball_at_serve': -11.252,
 'y_ball_at_serve': 1.014,
 'z_ball_at_serve': 2.817,
 'rally_length': 8,
 'point_end_type': 'Forced Error',
 'error_type': 'NA',
 'trapped_by_net': False,
 'strokeType': 'Ground',
 'hand': 'BackHand',
 'last_stroke_net_height_ft': '5.21 Feet',
 'last_stroke_net_height_m': '1.59 Metre',
 'winner_placement': 'NA',
 'unforcedErrorPlacement': 'NA',
 'is_break_point': True,
 'is_break_point_converted': True,
 'runAroundForeHand': False,
 'is_track_avail': True,
 'serveBounceCordinate_x': 3.975,
 'serveBounceCordinate_y': -2.174,
 'serveBounceCordinate_z': 0.03,
 'serve_dir': 'Body',
 'z_net_serve': 1.146,


In [10]:
one_point_sequence

{'cruciality': 'false',
 'returnPlacement': 3.5084999999999997,
 'trajectoryData': [{'x': -11.252, 'y': 1.014, 'z': 2.817, 'position': 'hit'},
  {'x': -11.252, 'y': 1.014, 'z': 2.817, 'position': 'peak'},
  {'x': 0.0, 'y': -1.55, 'z': 1.146, 'position': 'net'},
  {'x': 3.975, 'y': -2.174, 'z': 0.03, 'position': 'bounce'},
  {'x': 14.61, 'y': -4.51, 'z': 1.14, 'position': 'hit'},
  {'x': 6.461, 'y': -1.762, 'z': 2.018, 'position': 'peak'},
  {'x': 0.0, 'y': 0.443, 'z': 1.328, 'position': 'net'},
  {'x': -4.19, 'y': 1.852, 'z': 0.043, 'position': 'bounce'},
  {'x': -9.465, 'y': 3.799, 'z': 1.274, 'position': 'peak'},
  {'x': -11.644, 'y': 4.592, 'z': 1.036, 'position': 'hit'},
  {'x': -2.194, 'y': 3.86, 'z': 1.593, 'position': 'peak'},
  {'x': 0.0, 'y': 3.731, 'z': 1.557, 'position': 'net'},
  {'x': 11.113, 'y': 3.344, 'z': 0.041, 'position': 'bounce'},
  {'x': 15.419, 'y': 3.151, 'z': 0.884, 'position': 'hit'},
  {'x': 3.651, 'y': 2.379, 'z': 2.392, 'position': 'peak'},
  {'x': 0.0, 'y'

In [54]:
def get_match_point_level_info(raw_json_file):
    '''
    Collect all play-by-play information for a match into a pandas DataFrame.

    Args:
    -----
    raw_json_file [dict]: parsed json tracking file of one match

    Returns:
    --------
    pandas DataFrame with one row per entry of 'pointsData', ordered by set,
    game, point and serve number, with extra flag columns (is_fault,
    is_doublefault, is_tiebreak, is_prev_doublefault, is_ace, is_prev_ace)
    and the server / returner scores
    '''
    points_by_id = raw_json_file['courtVisionData'][0]['pointsData']

    point_rows = [get_point_level_info(points_by_id[point_key])
                  for point_key in sorted(points_by_id.keys())]
    match_point_df = pd.DataFrame(point_rows)

    # Sort Dataframe by Set Number, Game number, Point Number, Serve Number
    order_cols = ['set_num', 'game_num', 'point_num', 'serve_num']
    match_point_df[order_cols] = match_point_df[order_cols].astype(int)
    match_point_df.sort_values(by=order_cols, inplace=True)

    # -- Fix court side categorization (Ad / Deuce court): even point numbers are Ad court
    is_even_point = match_point_df['point_num'] % 2 == 0
    match_point_df['court_side'] = np.where(is_even_point, 'AdCourt', 'DeuceCourt')

    # -- Reset row indices
    match_point_df.reset_index(drop=True, inplace=True)

    # -- Add Fault Flag
    coded_as_fault = np.isin(match_point_df['point_end_type'], ['Faulty Serve', 'DoubleFault'])
    is_fault = np.where(coded_as_fault, 1, 0)

    # |--> some Faults are coded as 'NA'
    # A row repeating the (set, game, point) of an earlier row is a further serve
    # of the same point, so the row right before it is flagged as a fault too
    is_repeat_serve = np.where(
        match_point_df[['set_num', 'game_num', 'point_num']].duplicated(), 1, 0
    ).tolist()
    next_is_repeat_serve = np.array(is_repeat_serve[1:] + [0])
    is_fault = np.where(next_is_repeat_serve == 1, 1, is_fault)

    match_point_df['is_fault'] = is_fault

    # -- Add Double Fault Flag
    is_doublefault = np.isin(match_point_df['point_end_type'], ['DoubleFault'])
    match_point_df['is_doublefault'] = np.where(is_doublefault, 1, 0)

    # -- Add Tiebreak flag
    match_point_df['is_tiebreak'] = np.where(match_point_df['game_num'] > 12, 1, 0)

    # -- Add ace flag, plus flags telling whether the previous row was a double fault / ace
    #    (each flag column shifted down one row, with 0 for the first row)
    match_point_df['is_prev_doublefault'] = np.insert(
        arr=match_point_df['is_doublefault'][:-1].values, obj=0, values=0)
    match_point_df['is_ace'] = np.where(match_point_df['point_end_type'] == 'Ace', 1, 0)
    match_point_df['is_prev_ace'] = np.insert(
        arr=match_point_df['is_ace'][:-1].values, obj=0, values=0)

    # -- Add server score & returner score columns
    match_point_df = add_server_and_returner_scores(match_point_df)

    # -- Add cumulative games won and sets won
    #match_point_df =  add_cum_games_and_sets(match_point_df)

    # -- Reset row indices
    match_point_df.reset_index(drop=True, inplace=True)

    return match_point_df



In [13]:
def add_server_and_returner_scores(match_pbp):
    '''
    Add the score of the server and of the returner at the beginning of each row.

    Args:
    -----
    match_pbp [pandas DataFrame]: play-by-play rows in playing order, with the
        columns point_num, is_fault, is_doublefault, is_tiebreak, server_id,
        returner_id and point_winner_id

    Returns:
    --------
    The same DataFrame (modified in place) with 2 new columns, `server_score`
    and `returner_score`, counted in points won within the current game

    Notes:
    ------
    * Rows are processed by position, so the frame's index does not matter.
    * Every row with point_num == 1 restarts both scores at 0.
    * A row following a first-serve fault repeats the scores of that fault.
      A previous row is also treated as a fault when it has the same
      point_num as the current row, because some faults are not coded as such.
    * Both scores are expressed from the point of view of the row's server;
      in a tiebreak they are swapped whenever the server changes.
    * The first row is expected to have point_num == 1; otherwise there is no
      previous score to start from and an IndexError is raised.
    '''

    # Plain lists: positional access, independent of the frame's index labels
    point_nums = match_pbp['point_num'].tolist()
    fault_flags = match_pbp['is_fault'].tolist()
    doublefault_flags = match_pbp['is_doublefault'].tolist()
    tiebreak_flags = match_pbp['is_tiebreak'].tolist()
    server_ids = match_pbp['server_id'].tolist()
    winner_ids = match_pbp['point_winner_id'].tolist()

    server_score_vec = []
    returner_score_vec = []

    for row in range(len(point_nums)):

        # -- If first point of a game, set both scores to 0
        if point_nums[row] == 1:
            server_score_vec.append(0)
            returner_score_vec.append(0)
            continue

        prev = row - 1

        # -- Get server & returner's current scores
        current_server_score = server_score_vec[prev]
        current_returner_score = returner_score_vec[prev]

        # -- Sometimes, Faults aren't properly encoded... Ex sometimes coded as 'NA'
        #    Two consecutive rows of the same point mean the first one was a fault
        prev_is_fault = (fault_flags[prev] == 1) or (point_nums[row] == point_nums[prev])

        # -- If 1st Serve Fault, score does not change
        if prev_is_fault and doublefault_flags[prev] == 0:
            server_score_vec.append(current_server_score)
            returner_score_vec.append(current_returner_score)
            continue

        # -- Credit the previous point to whoever won it
        #    (anything other than the server's id counts for the returner)
        new_server_score = current_server_score
        new_returner_score = current_returner_score
        if server_ids[prev] == winner_ids[prev]:
            new_server_score += 1
        else:
            new_returner_score += 1

        # -- Deal with Tiebreaks: when the server changes, the roles swap
        if tiebreak_flags[prev] == 1 and server_ids[row] != server_ids[prev]:
            new_server_score, new_returner_score = new_returner_score, new_server_score

        server_score_vec.append(new_server_score)
        returner_score_vec.append(new_returner_score)

    match_pbp['server_score'] = server_score_vec
    match_pbp['returner_score'] = returner_score_vec

    return match_pbp


In [55]:
# Build the play-by-play table for the match currently held in `atp_tracking_data_json`
match_point_df = get_match_point_level_info(atp_tracking_data_json)
match_point_df

Unnamed: 0,point_ID,set_num,game_num,point_num,serve_num,server_id,returner_id,point_winner_id,court_side,serve_speed_kph,serve_speed_kph_v2,serve_type,fault_distance_missed_ft,fault_distance_missed_m,x_ball_at_serve,y_ball_at_serve,z_ball_at_serve,rally_length,point_end_type,error_type,trapped_by_net,strokeType,hand,last_stroke_net_height_ft,last_stroke_net_height_m,winner_placement,unforcedErrorPlacement,is_break_point,is_break_point_converted,runAroundForeHand,is_track_avail,serveBounceCordinate_x,serveBounceCordinate_y,serveBounceCordinate_z,serve_dir,z_net_serve,z_peak_serve,ballHitCordinate_x,ballHitCordinate_y,ballHitCordinate_z,ballBounceCordinate_x,ballBounceCordinate_y,ballBounceCordinate_z,server_coord_x,server_coord_y,server_coord_z,returner_coord_x,returner_coord_y,returner_coord_z,spin_rpm,cruciality,returnPlacement,is_fault,is_doublefault,is_tiebreak,is_prev_doublefault,is_ace,is_prev_ace,server_score,returner_score
0,1_1_1_1,1,1,1,1,9801,7792,,DeuceCourt,0 KPH,,,,,,,,0,Faulty Serve,,False,,BackHand,,0 Metre,,,False,False,False,False,,,,,,,,,,,,,,,,,,,0.0,False,,1,0,0,0,0,0,0,0
1,1_1_1_2,1,1,1,2,9801,7792,9801.0,DeuceCourt,136 KPH,136 KPH,,,,,,,3,Unforced Error,,False,,ForeHand,,0 Metre,,,False,False,False,False,,,,,,,,,,,,,,,,,,,0.0,False,,0,0,0,0,0,0,0,0
2,1_1_2_1,1,1,2,1,9801,7792,9801.0,AdCourt,0 KPH,,,,,,,,0,Faulty Serve,,False,,BackHand,,0 Metre,,,False,False,False,False,,,,,,,,,,,,,,,,,,,0.0,False,,1,0,0,0,0,0,1,0
3,1_1_2_2,1,1,2,2,9801,7792,7792.0,AdCourt,0 KPH,,,,,,,,6,Forced Error,,False,,BackHand,,0 Metre,,,False,False,False,False,,,,,,,,,,,,,,,,,,,0.0,False,,0,0,0,0,0,0,1,0
4,1_1_3_1,1,1,3,1,9801,7792,7792.0,DeuceCourt,0 KPH,,,,,,,,0,Faulty Serve,,False,,BackHand,,0 Metre,,,False,False,False,False,,,,,,,,,,,,,,,,,,,0.0,False,,1,0,0,0,0,0,1,1
5,1_1_3_2,1,1,3,2,9801,7792,9801.0,DeuceCourt,137 KPH,137 KPH,,,,,,,5,Forced Error,,False,,BackHand,,0 Metre,,,False,False,False,False,,,,,,,,,,,,,,,,,,,0.0,False,,0,0,0,0,0,0,1,1
6,1_1_4_2,1,1,4,2,9801,7792,9801.0,AdCourt,153 KPH,153 KPH,,,,,,,11,Winner,,False,,BackHand,,0 Metre,,,False,False,False,False,,,,,,,,,,,,,,,,,,,0.0,False,,0,0,0,0,0,0,2,1
7,1_1_5_1,1,1,5,1,9801,7792,7792.0,DeuceCourt,194 KPH,194 KPH,,,,,,,2,Unforced Error,,False,,BackHand,,,,,False,False,False,False,,,,,,,,,,,,,,,,,,,0.0,False,,0,0,0,0,0,0,3,1
8,1_1_6_1,1,1,6,1,9801,7792,7792.0,AdCourt,190 KPH,190 KPH,Flat,,,-11.209,0.737,2.832,10,Winner,,False,Ground,BackHand,5.01 Feet,1.53 Metre,Cross Court,,False,False,False,True,5.643,-0.478,0.039,T,1.127,2.832,11.172,2.161,1.04,-9.23,-4.023,0.033,-16.408,2.816,0.0,11.882,1.555,0.0,3162.74,False,3.50975,0,0,0,0,0,0,3,2
9,1_1_7_1,1,1,7,1,9801,7792,7792.0,DeuceCourt,174 KPH,174 KPH,Unclassified,,,-11.178,-0.692,2.819,10,Forced Error,,False,Ground,ForeHand,,,,,False,False,False,True,5.36,3.922,0.038,Wide,1.156,2.819,-15.478,-2.848,0.98,11.928,3.185,0.041,-15.361,-1.227,0.0,12.472,-1.548,0.0,1214.72,False,4.3955,0,0,0,0,0,0,3,3


In [14]:
def add_cum_games_and_sets(match_pbp):
    
    '''
    Add the running number of games and sets won by each player.

    Args:
    -----
    match_pbp [pandas DataFrame]: play-by-play rows; the columns read here are
        server_id, returner_id, set_num, game_num, point_num, serve_num,
        server_score and returner_score
    
    Returns:
    --------
    pandas DataFrame: `match_pbp` left-joined with the columns player1,
    player2, p1_cum_games, p2_cum_games, p1_cum_sets and p2_cum_sets
    
    Notes:
    ------
    Row labels are used as positions throughout (label 0, `label - 1`
    arithmetic), so this presumably expects the default 0..n-1 index --
    TODO confirm against callers.
    '''
    
    # Arbitrarily set the 1st server as 'player1' and the other as 'player2'
    player1 = match_pbp['server_id'][0]
    player2 = match_pbp['returner_id'][0]

    # Ids for the beginning point of each game 
    # By virtue of the data, some points might begin with a 2nd serve!!!! 
    #beginning_point_ids = np.where((match_pbp['point_num'] == 1) & (match_pbp['serve_num'] == 1))[0]
    # Keep the first row with point_num == 1 of every (set, game) pair
    beginning_point_ids = match_pbp[match_pbp['point_num'] == 1][['set_num', 'game_num', 'serve_num']].drop_duplicates(subset=['set_num', 'game_num'], keep = 'first').index


    # Ids for the last point of each game
    # i.e. the row just before the start of every game except the first one;
    # the final game of the match therefore has no entry here
    last_point_ids = np.asarray(beginning_point_ids)[1:] - 1

    # Dataframe of all last points played
    last_points_df = match_pbp.loc[last_point_ids] 

    # Add indicator columns of whether server or returner won the last point of each game
    # NOTE(review): the comparison uses the server_score / returner_score columns of the
    # game's last row -- confirm these identify the game winner
    # NOTE(review): last_points_df comes from a .loc selection of match_pbp, so adding
    # columns to it may raise pandas' SettingWithCopyWarning -- confirm
    last_points_df['server_game_won'] = np.where(last_points_df['server_score'] > last_points_df['returner_score'],
                                                 1,0)


    # Exactly the complement of server_game_won (equal scores count for the returner)
    last_points_df['returner_game_won'] = np.where(last_points_df['server_game_won']  == 0,
                                                   1,0)

    last_points_df.reset_index(drop=True, inplace=True)


    # Calculate the Cumulative number of games won
    p1_cum_games = []
    p2_cum_games = []


    for set_number in np.unique(last_points_df['set_num']):
        # Running totals for this set, starting at 0-0
        p1_game_vec = [0]
        p2_game_vec = [0]

        # For each set, calculate the cumulative number of games won for each player    
        set_pbp = last_points_df[last_points_df['set_num'] == set_number]
        set_pbp.reset_index(drop=True, inplace=True)

        for point_id in range(set_pbp.shape[0]):
            if set_pbp['server_game_won'][point_id] == 1:
                 winner_id = set_pbp['server_id'][point_id]

            if set_pbp['returner_game_won'][point_id] == 1:
                winner_id = set_pbp['returner_id'][point_id]


            if winner_id == player1:
                p1_game_vec.append(p1_game_vec[point_id] +1)
                p2_game_vec.append(p2_game_vec[point_id])

            elif winner_id == player2:
                p1_game_vec.append(p1_game_vec[point_id])
                p2_game_vec.append(p2_game_vec[point_id]+1)
            else:
                # NOTE(review): nothing is appended for this game, so the running lists
                # become shorter than point_id + 1 and later lookups by point_id may fail
                continue

        p1_cum_games.append(p1_game_vec)
        p2_cum_games.append(p2_game_vec)

        # Flatten the per-set lists collected so far
        # NOTE(review): this and the filtering below are recomputed on every pass of the
        # set loop; only the values from the last pass are used after the loop
        p1_cum_games_np = np.array([item for sublist in p1_cum_games for item in sublist])
        p2_cum_games_np = np.array([item for sublist in p2_cum_games for item in sublist])

        # Remove Cumulative games that denote final game of a set
        # Ex: A set that is 6-0 is done, and won't be required for the calculation of point importance
        # Note: Roland Garros uses an ADVANTAGE SET
        # NOTE(review): last_set and last_game are not used anywhere below
        last_set = match_pbp['set_num'][match_pbp.shape[0]-1]
        last_game = match_pbp['game_num'][match_pbp.shape[0]-1]

        absolute_diff = abs(p1_cum_games_np - p2_cum_games_np)
        #last_game_ids = np.cumsum( np.array(match_pbp.groupby('set_num')['game_num'].max()+1))
        # True where a player has more than 6 games, or exactly 6 with a lead of at least 2
        last_game_ids = np.logical_or( np.logical_or(p1_cum_games_np >6, p2_cum_games_np >6),
                                       np.logical_or(np.logical_and(p1_cum_games_np ==6, absolute_diff >=2),
                                                     np.logical_and(p2_cum_games_np ==6, absolute_diff >=2))
                 )
        p1_cum_games_to_add = p1_cum_games_np[np.logical_not(last_game_ids)]
        p2_cum_games_to_add = p2_cum_games_np[np.logical_not(last_game_ids)]

    # -- Left join play-by-play data with columns denoting cumulative number of games won by each player
    # NOTE(review): p1_cum_games_to_add / p2_cum_games_to_add only exist if the loop above
    # ran at least once
    games_df = match_pbp.loc[beginning_point_ids]
    games_df['player1'] = player1
    games_df['player2'] = player2
    
    # -- Roland Garros plays an Advantage Set!!!! **** This needs fixing
    games_df = games_df[games_df['game_num'] <= 13]
    
    # NOTE(review): these assignments need the arrays to have exactly one value per
    # remaining row of games_df -- confirm the two filters always agree
    games_df['p1_cum_games'] = p1_cum_games_to_add 
    games_df['p2_cum_games'] = p2_cum_games_to_add 

    added_cum_games_df = pd.merge(left = match_pbp,
                                  right = games_df[['game_num','set_num','player1','player2','p1_cum_games','p2_cum_games']],
                                  how="left",
                                  on = ['game_num','set_num'])

    # -- add cumulative sets won

    # Find indices where we change set
    # (the first row always differs from "nothing before it", so it is dropped)
    set_change_id = np.delete(np.array(added_cum_games_df[added_cum_games_df['set_num'].diff() != 0].index),0)
    last_points_in_set_df = added_cum_games_df.loc[set_change_id-1]
    last_points_in_set_df.reset_index(drop=True, inplace=True)

    # Running totals of sets won, starting at 0-0
    p1_cum_sets = [0]
    p2_cum_sets = [0]

    for set_index in range(last_points_in_set_df.shape[0]):
        # -- Figure out who won each set
        # NOTE(review): when both scores are equal, set_winner_id keeps the value of the
        # previous pass (or is undefined on the first pass)
        if last_points_in_set_df['server_score'][set_index] > last_points_in_set_df['returner_score'][set_index]:
            set_winner_id = last_points_in_set_df['server_id'][set_index]

        if last_points_in_set_df['server_score'][set_index] < last_points_in_set_df['returner_score'][set_index]:
            set_winner_id = last_points_in_set_df['returner_id'][set_index]

        if set_winner_id == player1:
            p1_cum_sets.append(p1_cum_sets[set_index] + 1) 
            p2_cum_sets.append(p2_cum_sets[set_index])

        if set_winner_id == player2:
            p1_cum_sets.append(p1_cum_sets[set_index]) 
            p2_cum_sets.append(p2_cum_sets[set_index] + 1)


    # First row of every set, carrying the set totals at the start of that set
    changeover_set_df = added_cum_games_df.loc[np.append(0, set_change_id)]
    changeover_set_df.reset_index(drop=True, inplace=True)
    changeover_set_df['p1_cum_sets'] = p1_cum_sets
    changeover_set_df['p2_cum_sets'] = p2_cum_sets

    added_cum_games_and_sets_df = pd.merge(left = added_cum_games_df,
                                           right = changeover_set_df[['set_num','player1','player2','p1_cum_sets','p2_cum_sets']],
                                           how="left",
                                           on = ['set_num','player1','player2' ])
    
    return(added_cum_games_and_sets_df)


In [82]:
# Load one match's tracking JSON into `atp_tracking_data_json`.
# The commented-out variants are alternative match files; only the
# uncommented `with` block at the bottom actually runs.
#with open('../json_data/year_2019_SM012_tracking_data.json') as filename:
#  atp_tracking_data_json = json.load(filename)

#with open('../json_data/year_2019_SM103_tracking_data.json') as filename:
#  atp_tracking_data_json = json.load(filename)

# NOTE: despite its name, `filename` is bound to the open file object here,
# not to a path string.
with open('../json_data/year_2020_SM112_tracking_data.json') as filename:
  atp_tracking_data_json = json.load(filename)


In [83]:
# Build the point-level table from the currently loaded tracking JSON.
# The commented-out lines below are optional inspections / a CSV export.
test_match = get_match_point_level_info(atp_tracking_data_json)
#test_match
#test_match.to_csv('point_sequence_djokovic_nadal_2020_rolandgarros_pbp.csv', index = False)
#test_match[['point_ID','server_coord_x', 'server_coord_y', 'server_coord_z', 'returner_coord_x','returner_coord_y', 'returner_coord_z']]

#test_match[['point_end_type', 'error_type', 'fault_distance_missed_m','last_stroke_net_height_m', 'serveBounceCordinate_x', 'serveBounceCordinate_y', 'serveBounceCordinate_z']]


In [85]:
#test_match

In [89]:
# Run the same point-level extraction on a file from the `ao_json_data`
# directory. This overwrites both `atp_tracking_data_json` and `test_match`.
with open('../ao_json_data/ao_year_2020_MS111_tracking_data.json') as filename:
  atp_tracking_data_json = json.load(filename)

test_match = get_match_point_level_info(atp_tracking_data_json)

# Bare expression: shown as the cell output when run in a notebook.
test_match

Index Error...


Unnamed: 0,point_ID,set_num,game_num,point_num,serve_num,server_id,returner_id,point_winner_id,court_side,serve_speed_kph,serve_speed_kph_v2,serve_type,fault_distance_missed_ft,fault_distance_missed_m,x_ball_at_serve,y_ball_at_serve,z_ball_at_serve,rally_length,point_end_type,error_type,trapped_by_net,strokeType,hand,last_stroke_net_height_ft,last_stroke_net_height_m,winner_placement,unforcedErrorPlacement,is_break_point,is_break_point_converted,runAroundForeHand,is_track_avail,serveBounceCordinate_x,serveBounceCordinate_y,serveBounceCordinate_z,serve_dir,z_net_serve,z_peak_serve,ballHitCordinate_x,ballHitCordinate_y,ballHitCordinate_z,ballBounceCordinate_x,ballBounceCordinate_y,ballBounceCordinate_z,server_coord_x,server_coord_y,server_coord_z,returner_coord_x,returner_coord_y,returner_coord_z,spin_rpm,cruciality,returnPlacement,is_fault,is_doublefault,is_tiebreak,server_score,returner_score
0,1_1_1_1,1,1,1,1,ATPD994,ATPBH09,ATPBH09,DeuceCourt,202 KPH,202 KPH,Slice,,,-11.374,-0.808,2.729,2,Unforced Error,,True,Ground,ForeHand,2.55 Feet,0.78 Metre,,Net Error,False,False,False,True,5.66,0.593,0.028,T,1.103,2.729,-8.77,-0.239,1.497,-7.105,-0.106,0.028,-9.409,0.619,0.0,14.883,-0.574,0.0,,False,3.5215,0,0,0,0,0
1,1_1_2_1,1,1,2,1,ATPD994,ATPBH09,ATPD994,AdCourt,191 KPH,191 KPH,Unclassified,,,-11.112,1.363,2.74,11,Unforced Error,,False,Ground,ForeHand,5.2 Feet,1.58 Metre,,,False,False,False,True,4.987,-0.501,0.028,T,1.041,2.74,13.305,-1.683,0.916,0.305,1.63,0.026,-13.933,0.843,0.0,13.585,-2.869,0.0,3111.0,False,3.5765,0,0,0,0,1
2,1_1_3_1,1,1,3,1,ATPD994,ATPBH09,ATPD994,DeuceCourt,185 KPH,185 KPH,,,,5.536,3.857,0.031,1,Forced Error,,False,,ForeHand,,,,,False,False,False,True,,,,,,,5.536,3.857,0.031,,,,-11.426,-0.929,0.0,13.922,3.823,0.0,3591.79,False,4.3925,0,0,0,1,1
3,1_1_4_1,1,1,4,1,ATPD994,ATPBH09,ATPBH09,AdCourt,0 KPH,,Kick,,,-11.243,1.303,2.762,1,,,False,,BackHand,3.8 Feet,1.16 Metre,,,False,False,False,True,5.913,-4.664,0.031,Wide,1.159,2.762,-11.243,1.303,2.762,5.913,-4.664,0.031,-11.487,1.143,0.0,14.316,-3.658,0.0,0.0,False,2.9134,1,0,0,2,1
4,1_1_4_2,1,1,4,2,ATPD994,ATPBH09,ATPD994,AdCourt,174 KPH,174 KPH,Slice,,,-11.209,1.594,2.688,3,Forced Error,,False,Ground,ForeHand,3.93 Feet,1.2 Metre,,,False,False,False,True,5.421,-2.331,0.032,Body,1.185,2.688,13.595,4.752,0.41,-10.337,4.341,0.034,-6.928,0.488,0.0,14.239,3.831,0.0,2227.93,False,3.468,0,0,0,2,1
5,1_1_5_1,1,1,5,1,ATPD994,ATPBH09,ATPD994,DeuceCourt,0 KPH,,Unclassified,0.83 Feet,0.25 Metre,-11.186,-0.785,2.73,1,Faulty Serve,Net Error,True,,BackHand,2.85 Feet,0.87 Metre,,,False,False,False,True,-0.254,0.533,0.028,T,0.868,2.73,-11.186,-0.785,2.73,-0.254,0.533,0.028,-11.438,-1.065,0.0,13.839,3.575,0.0,0.0,False,1.43,1,0,0,3,1
6,1_1_5_2,1,1,5,2,ATPD994,ATPBH09,ATPBH09,DeuceCourt,154 KPH,154 KPH,Unclassified,,,-11.572,-0.503,2.693,4,Unforced Error,,True,Ground,ForeHand,2.45 Feet,0.75 Metre,,Net Error,False,False,False,True,4.918,0.958,0.028,T,1.286,2.693,-12.961,-3.033,0.952,-9.531,-2.075,0.033,-13.419,-2.383,0.0,13.503,2.114,0.0,2813.15,False,3.1005,0,0,0,3,1
7,1_1_6_1,1,1,6,1,ATPD994,ATPBH09,ATPBH09,AdCourt,0 KPH,,Unclassified,3.16 Feet,0.96 Metre,-11.259,1.34,2.747,1,Faulty Serve,Net Error,True,,BackHand,2.95 Feet,0.9 Metre,,,False,False,False,True,-0.964,-0.409,0.031,T,0.899,2.747,-11.259,1.34,2.747,-0.964,-0.409,0.031,-11.476,1.089,0.0,14.181,-3.698,0.0,0.0,False,1.4792,1,0,0,3,2
8,1_1_6_2,1,1,6,2,ATPD994,ATPBH09,ATPD994,AdCourt,152 KPH,152 KPH,Slice,,,-11.185,1.582,2.633,5,Forced Error,,False,Ground,ForeHand,5.4 Feet,1.65 Metre,,,False,False,False,True,5.014,-2.855,0.03,Wide,1.33,2.633,12.382,4.889,0.858,-8.748,-4.615,0.037,-12.8,-1.422,0.0,13.345,3.947,0.0,2419.51,False,3.633,0,0,0,3,2
9,1_2_1_1,1,2,1,1,ATPBH09,ATPD994,ATPBH09,DeuceCourt,192 KPH,192 KPH,Unclassified,,,-11.226,-0.479,2.823,3,Unforced Error,,True,Ground,BackHand,2.98 Feet,0.91 Metre,,Net Error,False,False,False,True,4.931,0.808,0.028,T,1.044,2.823,12.929,-4.081,1.222,0.322,-0.082,0.026,-13.237,1.728,0.0,13.455,-3.678,0.0,1586.1,False,3.7695,0,0,0,0,0


In [76]:
# Scratch cell: step-by-step computation of the cumulative number of games won
# by each player, joined back onto the point-by-point table.
# Works on whatever `test_match` currently holds.
match_pbp = test_match




# Arbitrarily set the 1st server as 'player1' and the other as 'player2'
# (taken from the row labelled 0 of the point-by-point table)
player1 = match_pbp['server_id'][0]
player2 = match_pbp['returner_id'][0]

# Ids for the beginning point of each game 
# In this data the first recorded row of a point can be a 2nd serve, so we
# cannot filter on serve_num == 1; instead keep the first row with
# point_num == 1 for each (set_num, game_num) pair.
#beginning_point_ids = np.where((match_pbp['point_num'] == 1) & (match_pbp['serve_num'] == 1))[0]
beginning_point_ids = match_pbp[match_pbp['point_num'] == 1][['set_num', 'game_num', 'serve_num']].drop_duplicates(subset=['set_num', 'game_num'], keep = 'first').index



# Ids for the last point of each game
# (the row just before the next game's first row, so the final game of the
#  match gets no entry here)
# NOTE(review): assumes the index labels are consecutive integers in play
# order -- confirm against the output of get_match_point_level_info.
last_point_ids = np.asarray(beginning_point_ids)[1:] - 1

# Dataframe of all last points played
last_points_df = match_pbp.loc[last_point_ids] 

# Add indicator columns of whether server or returner won each game, decided by
# comparing server_score and returner_score on the game's last row
last_points_df['server_game_won'] = np.where(last_points_df['server_score'] > last_points_df['returner_score'],
                                             1,0)


# Complement of the flag above: a row with equal scores counts as a returner win
last_points_df['returner_game_won'] = np.where(last_points_df['server_game_won']  == 0,
                                               1,0)

last_points_df.reset_index(drop=True, inplace=True)


# Calculate the Cumulative number of games won
# (one list per set is appended to each of these)
p1_cum_games = []
p2_cum_games = []


for set_number in np.unique(last_points_df['set_num']):
    # Each set starts from 0 games for both players
    p1_game_vec = [0]
    p2_game_vec = [0]

    # For each set, calculate the cumulative number of games won for each player    
    set_pbp = last_points_df[last_points_df['set_num'] == set_number]
    set_pbp.reset_index(drop=True, inplace=True)

    # `point_id` is the position of a completed game within this set
    for point_id in range(set_pbp.shape[0]):
        if set_pbp['server_game_won'][point_id] == 1:
             winner_id = set_pbp['server_id'][point_id]

        if set_pbp['returner_game_won'][point_id] == 1:
            winner_id = set_pbp['returner_id'][point_id]


        if winner_id == player1:
            p1_game_vec.append(p1_game_vec[point_id] +1)
            p2_game_vec.append(p2_game_vec[point_id])

        elif winner_id == player2:
            p1_game_vec.append(p1_game_vec[point_id])
            p2_game_vec.append(p2_game_vec[point_id]+1)
        else:
            # NOTE(review): nothing is appended when the winner matches neither
            # player, so later `[point_id]` lookups in this set may run past the
            # end of the vectors -- confirm this branch is never reached.
            continue

    p1_cum_games.append(p1_game_vec)
    p2_cum_games.append(p2_game_vec)

    # Everything below is indented inside the set loop, so it is recomputed on
    # every iteration; only the values from the final iteration are used after
    # the loop.
    # Flatten the per-set lists into a single array per player
    p1_cum_games_np = np.array([item for sublist in p1_cum_games for item in sublist])
    p2_cum_games_np = np.array([item for sublist in p2_cum_games for item in sublist])

    # Remove Cumulative games that denote final game of a set
    # Ex: A set that is 6-0 is done, and won't be required for the calculation of point importance
    # Note: Roland Garros uses an ADVANTAGE SET
    # (`last_set` and `last_game` are assigned here but not used later in this cell)
    last_set = match_pbp['set_num'][match_pbp.shape[0]-1]
    last_game = match_pbp['game_num'][match_pbp.shape[0]-1]

    absolute_diff = abs(p1_cum_games_np - p2_cum_games_np)
    #last_game_ids = np.cumsum( np.array(match_pbp.groupby('set_num')['game_num'].max()+1))
    # Mask of scorelines treated as a finished set: either player has more than
    # 6 games, or a player has exactly 6 with a margin of at least 2
    last_game_ids = np.logical_or( np.logical_or(p1_cum_games_np >6, p2_cum_games_np >6),
                                   np.logical_or(np.logical_and(p1_cum_games_np ==6, absolute_diff >=2),
                                                 np.logical_and(p2_cum_games_np ==6, absolute_diff >=2))
             )
    p1_cum_games_to_add = p1_cum_games_np[np.logical_not(last_game_ids)]
    p2_cum_games_to_add = p2_cum_games_np[np.logical_not(last_game_ids)]

# -- Left join play-by-play data with columns denoting cumulative number of games won by each player
# One row per game: the first recorded row of each (set, game)
games_df = match_pbp.loc[beginning_point_ids]
games_df['player1'] = player1
games_df['player2'] = player2

# -- Roland Garros plays an Advantage Set!!!! **** This needs fixing
# (games numbered above 13 are dropped)
games_df = games_df[games_df['game_num'] <= 13]

# NOTE(review): these assignments need the arrays to have exactly one entry per
# remaining row of games_df; a mismatch raises here -- confirm for every match.
games_df['p1_cum_games'] = p1_cum_games_to_add 
games_df['p2_cum_games'] = p2_cum_games_to_add 

# Every point row receives the games score attached to its (game_num, set_num)
added_cum_games_df = pd.merge(left = match_pbp,
                              right = games_df[['game_num','set_num','player1','player2','p1_cum_games','p2_cum_games']],
                              how="left",
                              on = ['game_num','set_num'])



In [31]:
# Nadal vs. Federer (2019): load and preview the point-level table.
# The CSV export at the bottom is commented out, so nothing is written to disk.

# Open one tracking json file (for debugging***)
# NOTE: absolute path to one user's home directory; adjust before running elsewhere.
with open('/Users/petertea/tennis_analytics/projects/roland_garros_tracking_data/json_data/year_2019_SM003_tracking_data.json') as filename:
  rafa_fed_2019_tracking_data_json = json.load(filename)

rafa_fed_match = get_match_point_level_info(rafa_fed_2019_tracking_data_json)
# Preview of the first 10 rows
rafa_fed_match.head(10)
#rafa_fed_match.to_csv('point_sequence_federer_nadal_2019_rolandgarros_pbp.csv', index = False)

Unnamed: 0,point_ID,set_num,game_num,point_num,serve_num,server_id,returner_id,point_winner_id,court_side,serve_speed_kph,serve_speed_v2,serve_type,fault_distance_missed_ft,fault_distance_missed_m,rally_length,point_end_type,error_type,trapped_by_net,strokeType,hand,last_stroke_net_height_ft,last_stroke_net_height_m,winner_placement,unforcedErrorPlacement,is_break_point,is_break_point_converted,runAroundForeHand,is_track_avail,serveBounceCordinate_x,serveBounceCordinate_y,serveBounceCordinate_z,serve_dir,z_net_serve,ballHitCordinate_x,ballHitCordinate_y,ballHitCordinate_z,ballBounceCordinate_x,ballBounceCordinate_y,ballBounceCordinate_z,server_coord_x,server_coord_y,server_coord_z,returner_coord_x,returner_coord_y,returner_coord_z,spin_rpm,cruciality,returnPlacement
1,1_1_3_1,1,1,3,1,7792,2508,7792,AdCourt,176 KPH,176 KPH,Unclassified,,,5,Unforced Error,,False,Ground,BackHand,6.21 Feet,1.89 Metre,,,False,False,False,True,5.052,0.59,0.039,T,1.132,13.267,-4.656,0.961,0.074,1.385,0.037,-14.055,0.145,0.0,13.944,-3.942,0.0,3108.53,False,3.1125
2,1_1_4_1,1,1,4,1,7792,2508,7792,AdCourt,170.7 KPH,170.7 KPH,Unclassified,2.96 Feet,0.9 Metre,0,Faulty Serve,,True,,BackHand,3.2 Feet,0.97 Metre,,,False,False,False,True,-0.903,-2.948,0.034,Wide,0.974,-11.643,0.915,2.928,-0.903,-2.948,0.034,-11.589,0.815,0.0,12.861,-3.973,0.0,0.0,False,2.4504
3,1_1_4_2,1,1,4,2,7792,2508,2508,AdCourt,149 KPH,149 KPH,Pronated,,,4,Unforced Error,,False,Ground,BackHand,11.04 Feet,3.36 Metre,,,False,False,False,True,4.306,-2.952,0.032,Wide,1.125,14.677,-1.639,1.246,11.237,-0.135,0.035,-14.277,-0.864,0.0,14.442,-2.402,0.0,,False,3.515
4,1_1_5_1,1,1,5,1,7792,2508,2508,DeuceCourt,187.57 KPH,187.57 KPH,Unclassified,2.09 Feet,0.64 Metre,0,Faulty Serve,,True,,BackHand,11.04 Feet,3.36 Metre,,,False,False,False,True,-0.636,0.601,0.043,T,,-11.467,-0.459,2.866,-0.636,0.601,0.043,-11.595,-0.637,0.0,12.898,3.495,0.0,0.0,False,2.9045
5,1_1_5_2,1,1,5,2,7792,2508,2508,DeuceCourt,144 KPH,144 KPH,Slice,,,4,Winner,,False,Ground,BackHand,4.47 Feet,1.36 Metre,Cross Court,,False,False,False,True,4.793,0.106,0.037,T,1.272,10.507,-5.299,1.315,-5.029,3.513,0.034,-14.935,1.696,0.0,11.434,-4.668,0.0,3101.03,False,1.92525
6,1_1_6_1,1,1,6,1,7792,2508,7792,AdCourt,164 KPH,164 KPH,Unclassified,,,1,Forced Error,,False,,BackHand,3.12 Feet,0.95 Metre,,,True,False,False,True,4.082,-3.884,0.032,Wide,1.049,11.482,-6.435,1.432,0.058,-1.584,0.035,-11.282,-0.027,0.0,12.113,-5.055,0.0,,False,3.72225
7,1_1_7_1,1,1,7,1,7792,2508,7792,DeuceCourt,174 KPH,174 KPH,Unclassified,,,1,Forced Error,,True,,BackHand,2.43 Feet,0.74 Metre,,,False,False,False,True,4.138,1.042,0.035,T,1.018,12.359,1.467,1.163,0.592,-0.51,0.048,-11.121,-0.976,0.0,12.753,2.143,0.0,1028.09,False,2.9
8,1_1_8_1,1,1,8,1,7792,2508,2508,AdCourt,177 KPH,177 KPH,Pronated,,,8,Winner,,False,Overhead,ForeHand,4.84 Feet,1.48 Metre,Down the Line,,False,False,False,True,4.865,-3.677,0.03,Wide,1.107,4.326,-1.533,2.806,-4.315,-2.532,0.043,-14.634,0.701,0.0,4.632,-1.606,0.0,,False,3.85575
9,1_1_9_1,1,1,9,1,7792,2508,2508,DeuceCourt,177.09 KPH,177.09 KPH,Unclassified,0.6 Feet,0.18 Metre,0,Faulty Serve,,True,,BackHand,3 Feet,0.91 Metre,,,False,False,False,True,-0.184,0.559,0.037,T,0.913,-11.551,-0.544,2.908,-0.184,0.559,0.037,-11.619,-0.769,0.0,12.87,3.522,0.0,0.0,False,2.4364
10,1_1_9_2,1,1,9,2,7792,2508,7792,DeuceCourt,151 KPH,151 KPH,Pronated,,,7,Unforced Error,,False,Ground,ForeHand,7.01 Feet,2.14 Metre,,Wide Error,False,False,False,True,4.771,1.497,0.036,Body,1.244,11.073,-3.494,1.431,-10.698,4.615,0.027,-15.452,1.879,0.0,11.285,-4.285,0.0,2130.83,False,1.85625


### Save Ball Trajectory Data (i.e. Rally Data)

In [6]:
def save_trajectory_data_one_rally(one_point_sequence):
    '''
    Build the ball-trajectory data frame for a single point.

    Args:
    -----
    one_point_sequence [dict]: Entry for one point. Must contain
        'trajectoryData' (records that include a 'position' field) and the
        keys 'pointId', 'set', 'game', 'point' and 'serve'.

    Returns:
    --------
    pandas DataFrame (for one point sequence) holding the trajectory records
    plus the columns 'strike_index', 'point_ID', 'set_num', 'game_num',
    'point_num' and 'serve_num'. If 'trajectoryData' yields an empty frame,
    that empty frame is returned unchanged (without the extra columns).

    Notes:
    ------
    'strike_index' numbers the strokes of the rally: every row with
    position == 'hit' starts a new stroke (1, 2, ...), and the rows that follow
    belong to that stroke until the next 'hit'. Rows recorded before the first
    'hit' (or a trajectory without any 'hit') get strike_index 0 instead of
    raising an error.
    '''

    ball_trajectory_df = pd.DataFrame(one_point_sequence['trajectoryData'])

    if ball_trajectory_df.empty:
        return ball_trajectory_df

    #######################################################################
    #                        Stroke (strike) index                        #
    #######################################################################
    # In the usual case, we expect this sequence: Hit --> Peak --> Net --> Bounce
    # But what if it's a half volley? (Hit --> Peak --> Net)
    # But what if it's a hit on the rise?  Hit --> Peak --> Net --> Bounce --> Peak
    # *** Ball trajectory also includes erroneous balls (mishits)
    #
    # A running count of 'hit' rows labels each row with the stroke it belongs
    # to, whatever the number of rows recorded per stroke.
    is_hit = ball_trajectory_df['position'] == 'hit'
    ball_trajectory_df['strike_index'] = is_hit.cumsum().astype(int)

    ##################################################
    #          Match situation information           #
    ##################################################
    ball_trajectory_df['point_ID'] = one_point_sequence['pointId']
    ball_trajectory_df['set_num'] = one_point_sequence['set']
    ball_trajectory_df['game_num'] = one_point_sequence['game']
    ball_trajectory_df['point_num'] = one_point_sequence['point']
    ball_trajectory_df['serve_num'] = one_point_sequence['serve']

    return ball_trajectory_df
    
    
    

In [17]:
#save_trajectory_data_one_rally(one_point_sequence)

In [32]:
def get_match_point_ball_trajectory_data(raw_json_file):
    '''
    Stack the ball-trajectory data of every point of a match into one frame.

    Args:
    -----
    raw_json_file [dict]: Parsed tracking JSON. The points are read from
        raw_json_file['courtVisionData'][0]['pointsData'].

    Returns:
    --------
    pandas DataFrame with the columns 'point_ID', 'set_num', 'game_num',
    'point_num', 'serve_num', 'strike_index', 'position', 'x', 'y', 'z'.
    Points without trajectory data are skipped. If no point has trajectory
    data, an empty DataFrame with those columns is returned.

    Notes:
    ------
    Points are processed in sorted order of their keys in 'pointsData'.
    Each point's rows keep their own 0-based index, so index values repeat
    across points; call `.reset_index(drop=True)` on the result if a unique
    index is needed.
    '''
    output_columns = ['point_ID', 'set_num', 'game_num', 'point_num', 'serve_num',
                      'strike_index', 'position', 'x', 'y', 'z']

    all_tracking_data_dict = raw_json_file['courtVisionData'][0]['pointsData']

    match_ball_trajectory_list = []

    for point_id_key in sorted(all_tracking_data_dict.keys()):
        ball_trajectory_df = save_trajectory_data_one_rally( all_tracking_data_dict[point_id_key] )

        # Points with no tracked trajectory contribute nothing
        if not ball_trajectory_df.empty:
            match_ball_trajectory_list.append( ball_trajectory_df )

    # pd.concat raises on an empty list, so handle matches without any
    # trajectory data explicitly
    if not match_ball_trajectory_list:
        return pd.DataFrame(columns=output_columns)

    match_ball_trajectory_df = pd.concat(match_ball_trajectory_list)

    ### Reorder columns
    return match_ball_trajectory_df[output_columns]


In [23]:
# Extract the ball trajectories of the currently loaded match and save them.
test_trajectory = get_match_point_ball_trajectory_data(atp_tracking_data_json)

# `reset_index` returns a new frame rather than modifying in place, so the
# result has to be assigned back for the fresh 0..n-1 index to be kept.
test_trajectory = test_trajectory.reset_index(drop = True)
test_trajectory.to_csv('ball_trajectory_djokovic_nadal_2020_rolandgarros_pbp.csv', index = False)

### For each match, check how many points have rally (trajectory) data

In [9]:
# One summary dict per match file: who played, how many points were recorded
# and how many of those have trajectory data available.
data_list = []

for filename in os.listdir('./json_data/'):
    # Anything that is not a JSON file is ignored
    if not filename.endswith(".json"):
        continue

    with open('./json_data/' + filename) as file_name:
        atp_tracking_data_json = json.load(file_name)

    # Point-level table for this match
    test_match = get_match_point_level_info(atp_tracking_data_json)
    num_points_avail = test_match['is_track_avail'].sum()
    num_points = test_match.shape[0]

    # The year is the first all-digit token of the underscore-separated name
    year = [int(s) for s in filename.split('_') if s.isdigit()][0]

    # Files whose name starts with 'wta' are flagged 'wta', all others 'atp'
    atp_flag = 'wta' if filename[:3] == 'wta' else 'atp'

    # -----------------------------
    # Player information
    # -----------------------------
    tracking_data_dict = atp_tracking_data_json['courtVisionData'][0]
    player1_info = tracking_data_dict['playersData']['playerTeam']
    player2_info = tracking_data_dict['playersData']['opponentTeam']
    player1 = player1_info[0]['name']
    player2 = player2_info[0]['name']

    summary_dict = {
        'player1': player1,
        'player2': player2,
        'num_points_avail': num_points_avail,
        'num_points': num_points,
        'year': year,
        'atp_flag': atp_flag,
        'filename': filename,
    }
    data_list.append(summary_dict)




In [10]:
# Seems like we have a decent amount of available points!
summary_points_avail_df = pd.DataFrame(data_list)

summary_points_avail_df

Unnamed: 0,player1,player2,num_points_avail,num_points,year,atp_flag,filename
0,M.GASPARYAN,E.MERTENS,0,169,2020,wta,wta_year_2020_SD087_tracking_data.json
1,Y. HANFMANN,R. NADAL,207,209,2019,atp,year_2019_SM127_tracking_data.json
2,S. TSITSIPAS,S. WAWRINKA,517,526,2019,atp,year_2019_SM012_tracking_data.json
3,J.MUNAR,S.TSITSIPAS,335,335,2020,atp,year_2020_SM095_tracking_data.json
4,Kr.PLISKOVA,G.MUGURUZA,156,157,2020,wta,wta_year_2020_SD051_tracking_data.json
5,S.HALEP,I.BEGU,150,150,2020,wta,wta_year_2020_SD032_tracking_data.json
6,S. HALEP,I. SWIATEK,0,102,2019,wta,wta_year_2019_SD010_tracking_data.json
7,N. DJOKOVIC,D. THIEM,409,416,2019,atp,year_2019_SM002_tracking_data.json
8,D.SHAPOVALOV,G.SIMON,383,387,2020,atp,year_2020_SM088_tracking_data.json
9,F. FERRO,K. MLADENOVIC,190,196,2019,wta,wta_year_2019_SD125_tracking_data.json


In [11]:
# Write the per-match summary to the working directory, without the index column.
summary_points_avail_df.to_csv('summary_points_avail_df.csv', index=False)