In [1]:
# imports
import pandas as pd
import numpy as np
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [2]:
# hide warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# import the csv file
START_YR = 2000 # min is 1968
END_YR = 2020 # inclusive

def createDf(start_yr, end_yr):
    if (end_yr - start_yr) < 0:
        return None
    
    year_range = range(start_yr, end_yr + 1)
    url = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_%s.csv' % year_range[0]
    df = pd.read_csv(url).fillna(0)
    print("Downloading %s" % year_range[0])
    
    if (end_yr - start_yr) > 0: 
        for yr in year_range[1:]:
            url = 'https://raw.githubusercontent.com/JeffSackmann/tennis_atp/master/atp_matches_%s.csv' % yr
            newDf = pd.read_csv(url).fillna(0)
            print("Downloading %s" % yr)
            df = df.append(newDf)
    return df

df = createDf(START_YR, END_YR)
        
# df_2018 = pd.read_csv("./atp_matches_2018.csv").fillna(0)
# df_2019 = pd.read_csv("./atp_matches_2019.csv").fillna(0)
# df = df_2018.append(df_2019)
df['y'] = 1
print(df.size)
print(df.columns)

Downloading 2000
Downloading 2001
Downloading 2002
Downloading 2003
Downloading 2004
Downloading 2005
Downloading 2006
Downloading 2007
Downloading 2008
Downloading 2009
Downloading 2010
Downloading 2011
Downloading 2012
Downloading 2013
Downloading 2014
Downloading 2015
Downloading 2016
Downloading 2017
Downloading 2018
Downloading 2019
Downloading 2020
3121050
Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank

In [4]:
# randomize order of player 1 and player 2 columns and y value
np.random.seed(1)

df['shouldSwap'] = np.random.randint(2, size = df.shape[0])
df.loc[df['shouldSwap'],['winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'loser_rank', 'loser_rank_points']] = df.loc[df['shouldSwap'],['loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'loser_rank', 'loser_rank_points', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'winner_rank', 'winner_rank_points']].values
df['y'] = df['shouldSwap'].apply(lambda x: int(not x))

# rename columns from winner to p1 and loser to p2
def renameWinnerLoser(x):
    if x == 'draw_size':
        return x
    return x.replace('winner_', 'p1_').replace('w_', 'p1_').replace('loser_', 'p2_').replace('l_', 'p2_')
df = df.rename(renameWinnerLoser, axis='columns')
df

Unnamed: 0,tourney_id,tourney_name,surface,draw_size,tourney_level,tourney_date,match_num,p1_id,p1_seed,p1_entry,...,p2_2ndWon,p2_SvGms,p2_bpSaved,p2_bpFaced,p1_rank,p1_rank_points,p2_rank,p2_rank_points,y,shouldSwap
0,2000-301,Auckland,Hard,32,A,20000110,1,101543,0,0,...,28.0,17.0,3.0,5.0,63.0,595.0,11.0,1612.0,0,1
1,2000-301,Auckland,Hard,32,A,20000110,2,102644,0,0,...,13.0,12.0,5.0,6.0,49.0,723.0,211.0,157.0,0,1
2,2000-301,Auckland,Hard,32,A,20000110,3,103252,0,0,...,7.0,8.0,7.0,11.0,48.0,726.0,59.0,649.0,1,0
3,2000-301,Auckland,Hard,32,A,20000110,4,103507,7,0,...,14.0,10.0,6.0,8.0,45.0,768.0,61.0,616.0,1,0
4,2000-301,Auckland,Hard,32,A,20000110,5,102103,0,Q,...,18.0,12.0,5.0,9.0,167.0,219.0,34.0,873.0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
742,2020-M-DC-2020-WG2-PO-POL-HKG-01,Davis Cup WG2 PO: POL vs HKG,Hard,4,D,20200306,2,105668,0,0,...,0.0,0.0,0.0,0.0,461.0,68.0,960.0,11.0,0,1
743,2020-M-DC-2020-WG2-PO-POL-HKG-01,Davis Cup WG2 PO: POL vs HKG,Hard,4,D,20200306,4,209874,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,1
744,2020-M-DC-2020-WG2-PO-SYR-ZIM-01,Davis Cup WG2 PO: SYR vs ZIM,Hard,4,D,20200306,1,208518,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,813.0,18.0,1,0
745,2020-M-DC-2020-WG2-PO-SYR-ZIM-01,Davis Cup WG2 PO: SYR vs ZIM,Hard,4,D,20200306,2,111761,0,0,...,0.0,0.0,0.0,0.0,430.0,79.0,0.0,0.0,0,1


In [5]:
# Model 1: basic logistic regression + basic features
# 
# predicting: is_winner_player1
# 
# features:
# for each player = (seed, rank, rank_pts, ht, age, is_right_hand)
# tournament info = (is_4_draw, is_8_draw, is_32_draw, is_64_draw, is_hard, is_grass, is_A_level, 
# is_D_level, is_F_level, is_G_level)

# step 1: feature engineering and data cleaning
def featEngModel1(dataset):
    def seedToInt(seed):
        resultStr = seed
        try:
            result = float(resultStr)
        except:
            result = sys.maxsize
        return result
    
    # add same winner and loser columns
    newDataSet = dataset[['y', 'p1_rank', 'p1_rank_points', 'p1_ht', 'p1_age', 
                          'p2_rank', 'p2_rank_points', 'p2_ht', 'p2_age']]
    
    # add winner compute columns
    newDataSet.loc[:, 'p1_seed'] = dataset['p1_seed'].apply(lambda x: seedToInt(x))
    newDataSet.loc[:, 'p1_is_right_hand'] = dataset['p1_hand'].apply(lambda x: x == 'R')
    
    # add loser columns
    newDataSet.loc[:, 'p2_seed'] = dataset['p2_seed'].apply(lambda x: seedToInt(x))
    newDataSet.loc[:, 'p2_is_right_hand'] = dataset['p2_hand'].apply(lambda x: x == 'R')
    
    # add record between winner and loser
    pastRecord = dict()
    winRecordList = list()
    loseRecordList = list()
    for index, row in dataset.iterrows():
        match = (row['p1_id'], row['p2_id'])
        reverseMatch = (row['p2_id'], row['p1_id'])
        if match not in pastRecord:
            pastRecord[match] = 0
        if reverseMatch not in pastRecord:
            pastRecord[reverseMatch] = 0
        pastRecord[match] += 1
        winRecordList.append(pastRecord[match])
        loseRecordList.append(pastRecord[reverseMatch])
    newDataSet.loc[:, 'p1_past_wins'] = winRecordList
    newDataSet.loc[:, 'p1_past_losses'] = loseRecordList
    newDataSet.loc[:, 'rank_diff'] = newDataSet['p1_rank'].sub(newDataSet['p2_rank'], axis = 0)
    newDataSet.loc[:, 'seed_diff'] = newDataSet['p1_seed'].sub(newDataSet['p2_seed'], axis = 0)
    newDataSet.loc[:, 'age_diff'] = newDataSet['p1_age'].sub(newDataSet['p2_age'], axis = 0)
    newDataSet.loc[:, 'ht_diff'] = newDataSet['p1_ht'].sub(newDataSet['p2_ht'], axis = 0)
    
    # add tournament info columns
    newDataSet.loc[:, 'is_4_draw'] = dataset['draw_size'].apply(lambda x: x == 4)
    newDataSet.loc[:, 'is_8_draw'] = dataset['draw_size'].apply(lambda x: x == 8)
    newDataSet.loc[:, 'is_32_draw'] = dataset['draw_size'].apply(lambda x: x == 32)
    newDataSet.loc[:, 'is_64_draw'] = dataset['draw_size'].apply(lambda x: x == 64)
    newDataSet.loc[:, 'is_hard'] = dataset['surface'].apply(lambda x: x == 'Hard')
    newDataSet.loc[:, 'is_grass'] = dataset['surface'].apply(lambda x: x == 'Grass')
    newDataSet.loc[:, 'is_A_level'] = dataset['tourney_level'].apply(lambda x: x == 'A')
    newDataSet.loc[:, 'is_D_level'] = dataset['tourney_level'].apply(lambda x: x == 'D')
    newDataSet.loc[:, 'is_F_level'] = dataset['tourney_level'].apply(lambda x: x == 'F')
    newDataSet.loc[:, 'is_G_level'] = dataset['tourney_level'].apply(lambda x: x == 'G')
    
    print(newDataSet.columns)
    return newDataSet


# step 2: create datasets
def createTestTrainSplitModel1(dataset): 
    newDataset = featEngModel1(dataset)
    msk = np.arange(len(newDataset)) < (0.8 * newDataset.shape[0])
    train = newDataset[msk]
    test = newDataset[~msk]
    return (train, test)

model1Train, model1Test = createTestTrainSplitModel1(df)
print((model1Train.size, model1Test.size))

# step 3: train model

X = model1Train.loc[:, model1Train.columns != 'y']
y = model1Train['y']
clf_lr = LogisticRegression(random_state=0).fit(X, y)
print(clf_lr.coef_)

# step 4: test model quality
X_test = model1Test.loc[:, model1Test.columns != 'y']
y_test = model1Test['y']
clf_lr.predict(X_test)
clf_lr.score(X_test, y_test)

Index(['y', 'p1_rank', 'p1_rank_points', 'p1_ht', 'p1_age', 'p2_rank',
       'p2_rank_points', 'p2_ht', 'p2_age', 'p1_seed', 'p1_is_right_hand',
       'p2_seed', 'p2_is_right_hand', 'p1_past_wins', 'p1_past_losses',
       'rank_diff', 'seed_diff', 'age_diff', 'ht_diff', 'is_4_draw',
       'is_8_draw', 'is_32_draw', 'is_64_draw', 'is_hard', 'is_grass',
       'is_A_level', 'is_D_level', 'is_F_level', 'is_G_level'],
      dtype='object')
(1448173, 362036)
[[ 3.93467084e-05 -1.21994693e-06  1.68001110e-04 -9.16137453e-04
   2.36604731e-05  1.37076982e-05 -2.80604724e-05 -1.17245447e-03
  -7.57754347e-04  4.38960023e-05 -2.88044357e-04  1.45244622e-05
   2.01912397e-04 -2.93516210e-04  1.56862353e-05 -4.69709990e-04
   2.56317016e-04  1.96061582e-04  4.68607727e-05  2.23842092e-05
  -2.05407900e-05  6.66349308e-05 -9.78390309e-05 -1.97804184e-06
  -8.11323597e-05  4.68607727e-05  2.30891693e-05  1.32015884e-05]]


0.4934315924383211

In [6]:
# Model 2: Random Forest

# train model
clf_rf = RandomForestClassifier(max_depth=3, random_state=0)
clf_rf.fit(X, y)

# eval model
print(clf_rf.feature_importances_)
clf_rf.predict(X_test)
clf_rf.score(X_test, y_test)

[0.08805647 0.08741057 0.02058491 0.08523817 0.05868025 0.09200547
 0.03962424 0.06653641 0.02671961 0.00588708 0.05453974 0.00274217
 0.03115341 0.0257786  0.05307997 0.05034262 0.09283464 0.07842515
 0.00295174 0.01120954 0.         0.0031937  0.00468265 0.00977246
 0.         0.00048781 0.00574866 0.00231397]


0.4926305671259212

In [7]:
# Model 3: SVM
# train model
clf_svm = svm.SVC()
clf_svm.fit(X, y)

#eval model
clf_svm.predict(X_test)
print(clf_svm.score(X_test, y_test))

0.5013617430310798
