<a href="https://colab.research.google.com/github/nathanij/atpPredictor/blob/main/Tennis_Model_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup

In [None]:
!pip install --upgrade tables

In [None]:
import pandas as pd
import numpy as np
import random
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
import datetime
from google.colab import drive
drive.mount('/content/drive')

Import all data since 2010

In [None]:
!git clone https://github.com/nathanij/atp-data.git
filenames = ['/content/atp-data/atp_matches_' + str(year) + '.csv' for year in range(2010,2022)]
data = pd.concat([pd.read_csv(file) for file in filenames], ignore_index= True)

Remove unused columns and add an explicit index column (`ind`) so each row can be looked up by its position

In [48]:
drops = ['tourney_name', 'draw_size', 'match_num', 'best_of', 'score']
players = ['winner_', 'loser_']
attrs = ['name', 'seed', 'ioc', 'rank_points']
# Match-level columns plus the per-player columns (winner_name, loser_seed, ...)
# are removed in a single call instead of one drop per column.
unusedColumns = drops + [p + a for p in players for a in attrs]
data = data.drop(columns=unusedColumns)
# 'ind' mirrors the row label so functions applied row-wise can look up
# neighbouring rows with data.loc.
data.insert(0, 'ind', data.index)

Functions for calculating usable stats from existing info

In [49]:
def calcTime(id, row): #calculates time on court
  """Sum the `minutes` of the same-tournament matches won by player `id` next to `row`.

  Reads the module-level frame `data`. Starting at the row adjacent to `row`,
  it walks while the rows share `row`'s `tourney_id` and adds `minutes` for
  every row whose `winner_id` equals `id`.

  The walk goes forward (row + 1, row + 2, ...) when the match year is 2019
  or later and backward otherwise.
  NOTE(review): presumably this mirrors the row order of the source files
  (earlier rounds listed after later ones from 2019 on) - confirm.

  Returns the accumulated minutes (0 when no adjacent row qualifies).
  """
  tourney = data.loc[row, 'tourney_id']
  rowCount = len(data)
  # +1 scans towards later rows, -1 towards earlier rows.
  step = 1 if data.loc[row, 'tourney_date'] // 10000 >= 2019 else -1
  total = 0
  cursor = row + step
  while 0 <= cursor < rowCount and data.loc[cursor, 'tourney_id'] == tourney:
    if data.loc[cursor, 'winner_id'] == id:
      total += data.loc[cursor, 'minutes']
    cursor += step
  return total

In [22]:
def withinYears(x, year, month, row): #for calculating if data is applicable (within previous x years)
  """Tell whether the match at `row` is recent enough relative to (`year`, `month`).

  `tourney_date` of the row is read from the module-level frame `data` and
  decoded as YYYYMMDD. The match qualifies when its year is at most `x - 1`
  years before `year`, or exactly `x` years before with a month strictly
  greater than `month`.

  Returns True or False.
  """
  stamp = data.loc[row, 'tourney_date']
  matchMonth = (stamp % 10000) // 100
  gap = year - stamp // 10000
  recentEnough = gap <= x - 1
  # Exactly x years back only counts for months later in the calendar year.
  onBoundary = gap == x and matchMonth > month
  return bool(recentEnough or onBoundary)

In [53]:
#returned arrays are of the following form
#serve stats = [svpt, 1stIn, 1stWon, 2ndWon, bpsaved, bpfaced, svGames, ace, df, totalWon, 2ndIn, bplost, svgamesWon]
#return stats = [retPt, 1stIn, 1stWon, 2ndWon, bpwon, bpchances, dfs faced, retGames, totalWon, 2ndIn]
def calcStats(id, row):
  """Aggregate player `id`'s serve and return totals over the year before `row`.

  Reads the module-level frame `data` and walks backward from the row before
  `row`, after first stepping over every adjacent row that belongs to the
  same tournament, for as long as `withinYears(1, ...)` accepts the row.
  Missing (NaN) stat values are skipped.

  Fixes relative to the previous version:
    * matches the player lost now contribute all nine serve categories; the
      loop was bounded by len(rStat) (8), so double faults from lost matches
      were silently dropped;
    * the current tournament is skipped for every year, not only <= 2018.
      calcTime in this notebook treats rows *after* the current one as the
      earlier rounds for 2019+, so the same-tournament rows *before* it
      would be later rounds of the event being predicted.

  Returns:
    (serveStats, returnStats): lists of 13 and 10 numbers laid out as in the
    comment above this function.
  """
  sStat = ['svpt', '1stIn', '1stWon', '2ndWon', 'bpSaved', 'bpFaced', 'SvGms', 'ace', 'df'] #serve stat categories
  rStat = ['svpt', '1stIn', '1stWon', '2ndWon', 'bpSaved', 'bpFaced', 'df', 'SvGms'] #opponent serve columns feeding the return stats
  tourney = data.loc[row, 'tourney_id'] #to make sure current tourney not included
  date = data.loc[row, 'tourney_date']
  year = date // 10000
  month = (date % 10000) // 100
  serveStats = [0] * 13
  returnStats = [0] * 10
  match = row - 1
  while match >= 0 and data.loc[match, 'tourney_id'] == tourney:
    match -= 1
  while match >= 0 and withinYears(1, year, month, match):
    # Own serve numbers come from the player's side of the row, return
    # numbers from the opponent's serve columns.
    if data.loc[match, 'winner_id'] == id:
      own, opp = 'w_', 'l_'
    elif data.loc[match, 'loser_id'] == id:
      own, opp = 'l_', 'w_'
    else:
      own = None
    if own is not None:
      for i, name in enumerate(sStat):
        s = data.loc[match, own + name]
        if not pd.isna(s):
          serveStats[i] += s
      for j, name in enumerate(rStat):
        s = data.loc[match, opp + name]
        if not pd.isna(s):
          returnStats[j] += s
    match -= 1
  #fill in extra columns and edit current using relationships
  serveStats[9] = serveStats[2] + serveStats[3]                    # total serve points won
  serveStats[10] = serveStats[0] - serveStats[1] - serveStats[8]   # second serves in = points - first in - double faults
  serveStats[11] = serveStats[5] - serveStats[4]                   # break points lost
  serveStats[12] = serveStats[6] - serveStats[11]                  # service games minus break points lost
  returnStats[2] = returnStats[1] - returnStats[2]                 # first-serve return points won
  returnStats[9] = returnStats[0] - returnStats[1] - returnStats[6] # opponent second serves in
  returnStats[3] = returnStats[9] - returnStats[3]                 # second-serve return points won (needs [9] first)
  returnStats[4] = returnStats[5] - returnStats[4]                 # break points converted
  returnStats[8] = returnStats[2] + returnStats[3]                 # total return points won
  return (serveStats, returnStats)

In [82]:
def tupleDivide(row, player, s, i1, i2): #couldn't figure out how to vectorize with tuples
  """Return entry `i1` divided by entry `i2` of one raw stat list, or 0.5 when `i2` is zero.

  The stat list is element `s` of the tuple stored in `data.loc[row, player+'raw']`
  (module-level frame `data`).
  """
  stats = data.loc[row, player+'raw'][s]
  denominator = stats[i2]
  # A zero denominator (no recorded points of that kind) falls back to 0.5.
  return stats[i1] / denominator if denominator != 0 else 0.5

Use functions to make calculations

In [83]:
players = ['winner_', 'loser_']
sStat = ['svpt', '1stIn', '1stWon', '2ndWon', 'bpSaved', 'bpFaced', 'SvGms', 'ace', 'df']
# (column suffix, raw tuple element: 0 = serve / 1 = return, numerator index, denominator index)
ratioSpecs = [
  ('ratio',   0, 7, 8),
  ('1stSv%',  0, 1, 0),
  ('1stSvW%', 0, 2, 1),
  ('2ndSvW%', 0, 3, 10),
  ('bpSave%', 0, 4, 5),
  ('svGam%',  0, 12, 6),
  ('svPt%',   0, 9, 0),
  ('1stRet%', 1, 2, 1),
  ('2ndRet%', 1, 3, 9),
  ('bpConv%', 1, 4, 5),
  ('retGam%', 1, 4, 7),
  ('retPt%',  1, 8, 0),
]
# Raw (serveStats, returnStats) tuples for both players must exist before any ratio is derived.
for p in players:
  stats = data.apply(lambda x: calcStats(x[p+'id'], x['ind']), axis=1)
  data[p+'raw'] = stats
for p in players:
  data[p+'court_time'] = data.apply(lambda x: calcTime(x[p+'id'], x['ind']), axis=1)
  for suffix, s, i1, i2 in ratioSpecs:
    data[p+suffix] = data.apply(lambda x: tupleDivide(x['ind'], p, s, i1, i2), axis=1)

Drop now unwanted columns

In [84]:
plys = ['w_', 'l_']
# The raw stat tuples, the per-match w_/l_ serve columns and 'minutes' have
# all been folded into derived features by now.
spentColumns = ['winner_raw', 'loser_raw']
spentColumns += [p + s for p in plys for s in sStat]
spentColumns.append('minutes')
data = data.drop(columns=spentColumns)

Save calculations so you can start from here

In [None]:
# Checkpoint the frame to a local HDF5 file under key 'a', then copy that
# file into the tennisData folder of the mounted Drive.
data.to_hdf('ptCalc(key=a).h5', key = 'a')
!cp ptCalc\(key\=a\).h5 /content/drive/MyDrive/tennisData

In [18]:
# Restart point: reload the stats checkpoint from Drive instead of recomputing it.
data = pd.read_hdf('/content/drive/MyDrive/tennisData/ptCalc(key=a).h5', key = 'a')

Functions to calculate current forms and head-to-head

In [27]:
#calculates overall form and form on a specific surface
#for the previous year (at a specific tournament level)
#return in form [ovrForm, surForm]
def checkForms(id, row):
  """Win rates of player `id` over the year before `row`, at `row`'s tournament level.

  Reads the module-level frame `data` and walks backward from the row before
  `row`, after stepping over every adjacent row of the same tournament, for
  as long as `withinYears(1, ...)` accepts the row. Only matches whose
  `tourney_level` equals that of `row` are counted.

  Fixes relative to the previous version:
    * the cursor now moves on every iteration; it used to be decremented
      only inside the level check, so the first match played at a different
      level made the loop spin forever;
    * the current tournament is skipped for every year, not only <= 2018.
      calcTime in this notebook treats rows *after* the current one as the
      earlier rounds for 2019+, so the same-tournament rows *before* it
      would be later rounds of the event being predicted.

  Returns:
    (overall, surface): wins / matches at that level, and the same restricted
    to `row`'s surface. Each is 0 when there are no matches to divide by.
  """
  tourney = data.loc[row, 'tourney_id'] #to make sure current tourney not included
  level = data.loc[row, 'tourney_level']
  surface = data.loc[row, 'surface']
  date = data.loc[row, 'tourney_date']
  year = date // 10000
  month = (date % 10000) // 100
  match = row - 1
  while match >= 0 and data.loc[match, 'tourney_id'] == tourney:
    match -= 1
  win = 0
  surWin = 0
  count = 0
  surCount = 0
  while match >= 0 and withinYears(1, year, month, match):
    if data.loc[match, 'tourney_level'] == level:
      sameSurface = data.loc[match, 'surface'] == surface
      if data.loc[match, 'winner_id'] == id:
        count += 1
        win += 1
        if sameSurface:
          surCount += 1
          surWin += 1
      elif data.loc[match, 'loser_id'] == id:
        count += 1
        if sameSurface:
          surCount += 1
    # Always step back, whether or not this match counted.
    match -= 1
  a = win / count if count != 0 else 0
  b = surWin / surCount if surCount != 0 else 0
  return a, b

In [None]:
def calcH2H(p0, p1, row): #calculates win% for p0 against p1 in last 3 years
  """Share of earlier meetings between `p0` and `p1` that `p0` won.

  Reads the module-level frame `data` and walks backward from the row before
  `row`, after stepping over every adjacent row of the same tournament, for
  as long as `withinYears(3, ...)` accepts the row.

  The current tournament is now skipped for every year, not only <= 2018.
  calcTime in this notebook treats rows *after* the current one as the
  earlier rounds for 2019+, so the same-tournament rows *before* it would be
  later rounds of the event being predicted.

  Returns:
    wins / meetings, or 0.5 when the two players have no counted meeting.
  """
  tourney = data.loc[row, 'tourney_id'] #to make sure current tourney not included
  date = data.loc[row, 'tourney_date']
  year = date // 10000
  month = (date % 10000) // 100
  count = 0
  win = 0
  match = row - 1
  while match >= 0 and data.loc[match, 'tourney_id'] == tourney:
    match -= 1
  while match >= 0 and withinYears(3, year, month, match):
    winner = data.loc[match, 'winner_id']
    loser = data.loc[match, 'loser_id']
    if p0 == winner and p1 == loser:
      count += 1
      win += 1
    elif p0 == loser and p1 == winner:
      count += 1
    match -= 1
  if count == 0:
    return 0.5
  return win/count

Make the calculations

In [None]:
players = ['winner_', 'loser_']
for p in players:
  # checkForms returns an (overall, surface) tuple per row. result_type='expand'
  # spreads each tuple over columns 0 and 1; unpacking the plain Series of
  # tuples into two targets raises ValueError unless the frame has exactly two rows.
  forms = data.apply(lambda x: checkForms(x[p+'id'], x['ind']), axis=1, result_type='expand')
  data[p+'form'] = forms[0]
  data[p+'surForm'] = forms[1]
# Head-to-head is computed from the winner's point of view.
data['h2h'] = data.apply(lambda x: calcH2H(x['winner_id'], x['loser_id'], x['ind']), axis = 1)

Save pre-randomization

In [None]:
# Checkpoint the frame to a local HDF5 file under key 'a', then copy that
# file into the tennisData folder of the mounted Drive.
data.to_hdf('forms(key=a).h5', key = 'a')
!cp forms\(key\=a\).h5 /content/drive/MyDrive/tennisData

In [25]:
# Restart point: reload the forms checkpoint from Drive instead of recomputing it.
data = pd.read_hdf('/content/drive/MyDrive/tennisData/forms(key=a).h5', key = 'a')

Randomize winners for training

In [None]:
att = ['id', 'rank', 'entry', 'age', 'ht', 'hand', 'court_time', 'ratio', '1stSv%',
       '1stSvW%', '2ndSvW%', 'bpSave%', 'svGam%', 'svPt%', '1stRet%', '2ndRet%',
       'bpConv%', 'retGam%', 'retPt%']
ln = len(data)
pl = ['p0_', 'p1_']
# Defined here as well so the cell also runs on a fresh kernel started from
# the reload checkpoint; it previously relied on `players` left behind by an
# earlier cell.
players = ['winner_', 'loser_']
# One coin flip per match, drawn in row order: 0 keeps the winner as p0,
# 1 puts the winner in the p1 slot. 'winner' is the resulting label.
# NOTE(review): `random` is not seeded anywhere visible - seed it before this
# cell if the assignment has to be reproducible.
flips = np.array([random.randint(0, 1) for _ in range(ln)], dtype='int64')
data['winner'] = flips
winnerFirst = flips == 0
# Whole-column selection replaces the per-cell .loc writes, which pre-filled
# the columns with int zeros and then stored strings/floats into them (a
# dtype-incompatible assignment that pandas deprecates) and were very slow.
# Column creation order is kept: 'winner', every p0_ column, every p1_ column.
for a in att:
  data['p0_'+a] = data['winner_'+a].where(winnerFirst, data['loser_'+a])
for a in att:
  data['p1_'+a] = data['loser_'+a].where(winnerFirst, data['winner_'+a])
# NOTE(review): winner_/loser_ 'form' and 'surForm' and the winner-oriented
# 'h2h' are not in `att`, so they keep their winner/loser orientation after
# this cell - confirm they are re-oriented or dropped before training.
data = data.drop(columns=[p+a for p in players for a in att])

In [None]:
# Spot-check the first rows of the frame after randomization.
data.head()

Save randomized data

In [None]:
# Checkpoint the frame to a local HDF5 file under key 'a', then copy that
# file into the tennisData folder of the mounted Drive.
data.to_hdf('rand(key=a).h5', key = 'a')
!cp rand\(key\=a\).h5 /content/drive/MyDrive/tennisData

In [None]:
# Restart point: reload the randomized checkpoint from Drive.
data = pd.read_hdf('/content/drive/MyDrive/tennisData/rand(key=a).h5', key = 'a')

Clean the data: convert categorical values to numbers and filter out rows with missing data

Functions for cleaning:

In [None]:
def cSurface(x): #converts surface to a number, Grass 0, Hard 1, Clay 2
  """Encode a surface label: 'Grass' -> 0, 'Hard' -> 1, any other value -> 2."""
  return 0 if x == 'Grass' else (1 if x == 'Hard' else 2)

In [None]:
def cEntry(x): #qualifier 1, non-qualifier 0 (for fatigue reasons)
  """Encode the entry type: 'Q' -> 1, any other value -> 0."""
  return 1 if x == 'Q' else 0

In [None]:
def cHand(x): #0 for right hand, 1 for left
  """Encode the playing hand: 'R' -> 0, any other value -> 1."""
  return 0 if x == 'R' else 1

In [None]:
def cRound(x):
  """Encode a round label as its depth in the draw.

  'R128' -> 1, 'R64' -> 2, 'R32' -> 3, 'R16' -> 4, 'QF' -> 5, 'SF' -> 6;
  any other value -> 7.
  """
  ladder = ('R128', 'R64', 'R32', 'R16', 'QF', 'SF')
  for depth, label in enumerate(ladder, start=1):
    if x == label:
      return depth
  return 7