# Cleaning Roland Garros tracking data

This is an exploratory script for inspecting what data is available in each tracking JSON file.

In [1]:
import json
import pandas as pd
import numpy as np

In [3]:
# Load one match's tracking JSON file into memory.
# JSON is UTF-8 by specification, so the encoding is passed explicitly rather
# than relying on the platform's locale default.
with open('../../json_data/year_2020_SM001_tracking_data.json', encoding='utf-8') as json_file:
    atp_tracking_data_json = json.load(json_file)

In [4]:
atp_tracking_data_json.keys()

dict_keys(['courtVisionData'])

In [5]:
# 'courtVisionData' is indexed positionally; keep its first element, which
# carries the match-level fields listed by .keys() below.
atp_tracking_data_dict = atp_tracking_data_json['courtVisionData'][0]
atp_tracking_data_dict.keys()

dict_keys(['isMatchComplete', 'eventType', 'courtName', 'courtId', 'pointsData', 'playersData', 'statsData', 'setsCompleted', 'pointId', 'matchStatus'])

In [6]:
atp_tracking_data_dict['statsData'][0].keys()

dict_keys(['aces', 'convertedBreakPoints', 'doubleFault', 'firstServeIn', 'firstServePointsWon', 'netPoints', 'pointsWon', 'returnPoints', 'secondServeIn', 'secondServePointsWon', 'unforcedError', 'winner'])

In [7]:
atp_tracking_data_dict['statsData'][0]['aces'].keys()

dict_keys(['percentage', 'crucialPercentage', 'percentagePlayer', 'percentageOpponent', 'percentagePlayerCrucial', 'percentageOpponentCrucial', 'count', 'onCourt', 'belowCourt', 'playerPositionsData'])

In [8]:
atp_tracking_data_dict['statsData'][0]['aces']['percentagePlayer']

{'set0': {'adCourt': {'percentageT': '0 %',
   'percentageM': '0 %',
   'percentageW': '100 %'},
  'deuceCourt': {'percentageT': '0 %',
   'percentageM': '0 %',
   'percentageW': '0 %'}},
 'set1': {'adCourt': {'percentageT': '0 %',
   'percentageM': '0 %',
   'percentageW': '0 %'},
  'deuceCourt': {'percentageT': '0 %',
   'percentageM': '0 %',
   'percentageW': '0 %'}},
 'set2': {'adCourt': {'percentageT': '0 %',
   'percentageM': '0 %',
   'percentageW': '100 %'},
  'deuceCourt': {'percentageT': '0 %',
   'percentageM': '0 %',
   'percentageW': '0 %'}},
 'set3': {'adCourt': {'percentageT': '0 %',
   'percentageM': '0 %',
   'percentageW': '0 %'},
  'deuceCourt': {'percentageT': '0 %',
   'percentageM': '0 %',
   'percentageW': '0 %'}},
 'set4': None,
 'set5': None}

In [9]:
# Match-level metadata: everything in the payload that is not point-by-point.

# =================================================================================
#                Scrape match stats (anything that isn't point-by-point)
# =================================================================================

# Event category (men's / women's, singles / doubles)
atp_tracking_data_dict['eventType']

# Completion status of the match
atp_tracking_data_dict['matchStatus']
atp_tracking_data_dict['isMatchComplete']


# -----------------------------
# Player information
# -----------------------------
player1_info = atp_tracking_data_dict['playersData']['playerTeam']
player2_info = atp_tracking_data_dict['playersData']['opponentTeam']

# Only the first entry of each team list is read.
player1, player1_id, player1_country, player1_seed = (
    player1_info[0][field] for field in ('name', 'id', 'country', 'seed')
)
player2, player2_id, player2_country, player2_seed = (
    player2_info[0][field] for field in ('name', 'id', 'country', 'seed')
)

# -----------------------------
# Court and progress
# -----------------------------
court_name, court_id, num_sets_completed = (
    atp_tracking_data_dict[field]
    for field in ('courtName', 'courtId', 'setsCompleted')
)

# -------------------------------
# Meaning of the match-level 'pointId' is not established yet
# (the sample output shows '3_9_9').
# -------------------------------
point_id = atp_tracking_data_dict['pointId']


# Gather everything into one flat record, players first.
non_pbp_dict = {
    'player1': player1,
    'player1_id': player1_id,
    'player1_country': player1_country,
    'player1_seed': player1_seed,
    'player2': player2,
    'player2_id': player2_id,
    'player2_country': player2_country,
    'player2_seed': player2_seed,
    'point_id': point_id,
    'court_name': court_name,
    'court_id': court_id,
    'num_sets_completed': num_sets_completed,
}

non_pbp_dict

{'player1': 'N.DJOKOVIC',
 'player1_id': '9801',
 'player1_country': 'SRB',
 'player1_seed': '1',
 'player2': 'R.NADAL',
 'player2_id': '7792',
 'player2_country': 'ESP',
 'player2_seed': '2',
 'point_id': '3_9_9',
 'court_name': 'Court Philippe CHATRIER',
 'court_id': 1,
 'num_sets_completed': 3}

### Play-by-play tracking data

In [40]:
# -- Sort point ids in order
#sorted(atp_tracking_data_dict['pointsData'].keys())

In [10]:
# Play-by-play tracking data: pull a single point out of 'pointsData' by its key.
# NOTE: this overwrites the point_id taken from atp_tracking_data_dict['pointId'].
# NOTE(review): the key looks like set_game_point_serve -- TODO confirm.
point_id = '1_1_8_2'
one_point_sequence = atp_tracking_data_dict['pointsData'][point_id]


In [11]:
one_point_sequence.keys()

dict_keys(['cruciality', 'returnPlacement', 'trajectoryData', 'errorType', 'winnerPlacement', 'unforcedErrorPlacement', 'pointId', 'serverId', 'scorerId', 'receiverId', 'ballSpeed', 'returnSpeed', 'returnSpeedFrench', 'rallyLength', 'spin', 'heightAboveNet', 'ballSpeedFrench', 'heightAboveNetFrench', 'distanceOutsideCourt', 'distanceOutsideCourtFrench', 'pointEndType', 'strokeType', 'serveType', 'court', 'setNumber', 'set', 'game', 'point', 'serve', 'hand', 'breakPoint', 'runAroundForeHand', 'breakPointConverted', 'trappedByNet', 'ballHitCordinate', 'ballPeakCordinate', 'ballNetCordinate', 'ballBounceCordinate', 'ballLastCordinate', 'serverCordinate', 'receiverCordinate', 'serveBounceCordinate', 'id', 'pointNumber'])

In [12]:
one_point_sequence

{'cruciality': 'false',
 'returnPlacement': 3.5084999999999997,
 'trajectoryData': [{'x': -11.252, 'y': 1.014, 'z': 2.817, 'position': 'hit'},
  {'x': -11.252, 'y': 1.014, 'z': 2.817, 'position': 'peak'},
  {'x': 0.0, 'y': -1.55, 'z': 1.146, 'position': 'net'},
  {'x': 3.975, 'y': -2.174, 'z': 0.03, 'position': 'bounce'},
  {'x': 14.61, 'y': -4.51, 'z': 1.14, 'position': 'hit'},
  {'x': 6.461, 'y': -1.762, 'z': 2.018, 'position': 'peak'},
  {'x': 0.0, 'y': 0.443, 'z': 1.328, 'position': 'net'},
  {'x': -4.19, 'y': 1.852, 'z': 0.043, 'position': 'bounce'},
  {'x': -9.465, 'y': 3.799, 'z': 1.274, 'position': 'peak'},
  {'x': -11.644, 'y': 4.592, 'z': 1.036, 'position': 'hit'},
  {'x': -2.194, 'y': 3.86, 'z': 1.593, 'position': 'peak'},
  {'x': 0.0, 'y': 3.731, 'z': 1.557, 'position': 'net'},
  {'x': 11.113, 'y': 3.344, 'z': 0.041, 'position': 'bounce'},
  {'x': 15.419, 'y': 3.151, 'z': 0.884, 'position': 'hit'},
  {'x': 3.651, 'y': 2.379, 'z': 2.392, 'position': 'peak'},
  {'x': 0.0, 'y'

In [14]:
len(one_point_sequence['trajectoryData'])

39

In [15]:
# Scoreboard position and participant ids, each cast to int.
(
    set_num,
    set_num_v2,
    game_num,
    server_id,
    returner_id,
    point_winner_id,
) = (
    int(one_point_sequence[field])
    for field in ('set', 'setNumber', 'game', 'serverId', 'receiverId', 'scorerId')
)
court_side, rally_length = (
    one_point_sequence[field] for field in ('court', 'rallyLength')
)

# How the point ended
point_end_type, error_type, winner_placement = (
    one_point_sequence[field]
    for field in ('pointEndType', 'errorType', 'winnerPlacement')
)

# Serve stats
# NOTE(review): 'ballSpeedFrench' is presumably the km/h reading and
# 'ballSpeed' the other unit -- TODO confirm.
serve_number, serve_speed_kph, serve_speed_v2, serve_type = (
    one_point_sequence[field]
    for field in ('serve', 'ballSpeedFrench', 'ballSpeed', 'serveType')
)

# Guesses (interpretation of these fields is not confirmed)
net_clearance_ft, net_clearance_ft_m, point_id = (
    one_point_sequence[field]
    for field in ('heightAboveNet', 'heightAboveNetFrench', 'pointId')
)

# Unknowns (meaning still to be worked out; 'point' was flagged with *** for follow-up)
(
    trapped_by_net,
    ID,
    point_number,
    point_number_2,
    distanceOutsideCourt,
    distanceOutsideCourt_fr,
    runAroundForeHand,
) = (
    one_point_sequence[field]
    for field in (
        'trappedByNet',
        'id',
        'point',
        'pointNumber',
        'distanceOutsideCourt',
        'distanceOutsideCourtFrench',
        'runAroundForeHand',
    )
)

# Remaining shot descriptors
hand, cruciality, return_speed, spin, return_placement = (
    one_point_sequence[field]
    for field in ('hand', 'cruciality', 'returnSpeed', 'spin', 'returnPlacement')
)

{'y': -1.123, 'x': 0.0, 'z': 1.527, 'erroneousBall': False}

In [16]:
### Coordinate data from pointsData
# These are bare look-ups with no assignment: they only check that each key
# exists. In a notebook cell only the value of the final expression is echoed.

# Ball coordinates
one_point_sequence['ballHitCordinate']
one_point_sequence['ballPeakCordinate']
one_point_sequence['ballBounceCordinate']
one_point_sequence['trajectoryData']

# Player coordinates, then the ball's coordinate at the net
one_point_sequence['serverCordinate']
one_point_sequence['receiverCordinate']
one_point_sequence['ballNetCordinate']

{'x': 0.0, 'y': 4.169, 'z': 1.588, 'erroneousBall': False}

In [11]:
#atp_tracking_data_dict['statsData']

In [17]:
one_point_sequence['serverCordinate']
#one_point_sequence['receiverCordinate']

{'x': -11.68, 'y': 3.436, 'z': 0.0, 'erroneousBall': False}

In [18]:
one_point_sequence['trajectoryData']

[{'x': -11.252, 'y': 1.014, 'z': 2.817, 'position': 'hit'},
 {'x': -11.252, 'y': 1.014, 'z': 2.817, 'position': 'peak'},
 {'x': 0.0, 'y': -1.55, 'z': 1.146, 'position': 'net'},
 {'x': 3.975, 'y': -2.174, 'z': 0.03, 'position': 'bounce'},
 {'x': 14.61, 'y': -4.51, 'z': 1.14, 'position': 'hit'},
 {'x': 6.461, 'y': -1.762, 'z': 2.018, 'position': 'peak'},
 {'x': 0.0, 'y': 0.443, 'z': 1.328, 'position': 'net'},
 {'x': -4.19, 'y': 1.852, 'z': 0.043, 'position': 'bounce'},
 {'x': -9.465, 'y': 3.799, 'z': 1.274, 'position': 'peak'},
 {'x': -11.644, 'y': 4.592, 'z': 1.036, 'position': 'hit'},
 {'x': -2.194, 'y': 3.86, 'z': 1.593, 'position': 'peak'},
 {'x': 0.0, 'y': 3.731, 'z': 1.557, 'position': 'net'},
 {'x': 11.113, 'y': 3.344, 'z': 0.041, 'position': 'bounce'},
 {'x': 15.419, 'y': 3.151, 'z': 0.884, 'position': 'hit'},
 {'x': 3.651, 'y': 2.379, 'z': 2.392, 'position': 'peak'},
 {'x': 0.0, 'y': 1.994, 'z': 2.173, 'position': 'net'},
 {'x': -7.225, 'y': 0.951, 'z': 0.037, 'position': 'bounc

In [19]:
# Summary stats stored at index 1 of 'statsData'; list the stat categories.
stats_data1 = atp_tracking_data_dict['statsData'][1]
stats_data1.keys()

dict_keys(['aces', 'convertedBreakPoints', 'doubleFault', 'firstServeIn', 'firstServePointsWon', 'netPoints', 'pointsWon', 'returnPoints', 'secondServeIn', 'secondServePointsWon', 'unforcedError', 'winner'])

In [20]:
atp_tracking_data_dict['statsData'][0].keys()

dict_keys(['aces', 'convertedBreakPoints', 'doubleFault', 'firstServeIn', 'firstServePointsWon', 'netPoints', 'pointsWon', 'returnPoints', 'secondServeIn', 'secondServePointsWon', 'unforcedError', 'winner'])

In [21]:
stats_data1['doubleFault']

{'onCourt': ['2_8_4_1', '2_8_4_2'],
 'belowCourt': {'avgDoubleFaultInTournamentByPlayer': 1,
  'avgDoubleFaultInTournament': 3,
  'doubleFaultInCurrentMatchByPlayer': 1},
 'playerPositionsData': {'set1': None,
  'set2': 7.917700000000001,
  'set3': None,
  'set4': None,
  'set5': None}}

In [33]:
# Tabulate the point's 'trajectoryData' entries as a DataFrame.
ball_trajectory_df = pd.DataFrame(one_point_sequence['trajectoryData'])

# Rough shot count assuming 4 trajectory rows per shot; floor division drops
# any remainder, so this is only exact when every shot has exactly 4 rows.
hit_num = len(one_point_sequence['trajectoryData'])//4
ball_trajectory_df

Unnamed: 0,y,x,z,position
0,0.737,-11.209,2.832,hit
1,0.737,-11.209,2.832,peak
2,-0.129,0.0,1.127,net
3,-0.478,5.643,0.039,bounce
4,-1.155,16.766,1.129,hit
5,-0.37,4.571,2.393,peak
6,-0.077,0.0,2.168,net
7,0.534,-9.336,0.04,bounce
8,0.734,-13.251,1.331,hit
9,-0.49,-5.037,1.733,peak


In [27]:
# Shot numbers 1..rally_length (this first value is computed and discarded).
list(range(1, rally_length+1))
# Each shot number repeated 4 times, i.e. rally_length * 4 labels in total.
np.repeat(list(range(1, rally_length+1)), repeats=4)

array([ 1,  1,  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  5,
        5,  5,  5,  6,  6,  6,  6,  7,  7,  7,  7,  8,  8,  8,  8,  9,  9,
        9,  9, 10, 10, 10, 10, 11, 11, 11, 11])

In [29]:
# Label every trajectory row with the 1-based number of the shot it belongs to.
# A shot starts at each 'hit' row, so a running count of 'hit' rows gives the
# shot number directly. Repeating each number a fixed 4 times only works when
# every shot has exactly 4 rows; otherwise the number of labels differs from
# the number of rows and the column assignment raises ValueError.
ball_trajectory_df['rally_index'] = (ball_trajectory_df['position'] == 'hit').cumsum()

In [49]:
# Index labels of the rows where the ball is struck, followed by the total row
# count as a closing boundary for the final shot.
# NOTE(review): treating labels as row positions assumes the default 0..n-1
# index -- TODO confirm.
is_hit = ball_trajectory_df['position'] == 'hit'
hit_indices = ball_trajectory_df.index[is_hit].tolist()
hit_indices += [len(ball_trajectory_df)]

# Number of trajectory rows belonging to each shot (expect 4 or 5).
# Usual sequence:            Hit --> Peak --> Net --> Bounce
# Half volley (open question): Hit --> Peak --> Net
# Hit on the rise (open question): Hit --> Peak --> Net --> Bounce --> Peak
hit_indices_diff_len = [
    stop - start for start, stop in zip(hit_indices, hit_indices[1:])
]

rally_length = len(hit_indices_diff_len)

# One array per shot: the 1-based shot number repeated once per row of that shot.
to_see = [
    np.repeat(shot_number, repeats=n_rows)
    for shot_number, n_rows in enumerate(hit_indices_diff_len, start=1)
]
    

#np.where(ball_trajectory_df['position'] == 'hit',1,0)

In [57]:
# Join the per-shot label arrays end to end (axis 0 is the default) and store
# them as the shot number of each trajectory row.
ball_trajectory_df['rally_index'] = np.concatenate(to_see)

In [58]:
ball_trajectory_df

Unnamed: 0,y,x,z,position,rally_index
0,0.737,-11.209,2.832,hit,1
1,0.737,-11.209,2.832,peak,1
2,-0.129,0.0,1.127,net,1
3,-0.478,5.643,0.039,bounce,1
4,-1.155,16.766,1.129,hit,2
5,-0.37,4.571,2.393,peak,2
6,-0.077,0.0,2.168,net,2
7,0.534,-9.336,0.04,bounce,2
8,0.734,-13.251,1.331,hit,3
9,-0.49,-5.037,1.733,peak,3
