In [1]:
# imports
import pandas as pd
import numpy as np
import sys
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm

In [2]:
# import the csv file
df = pd.read_csv("./atp_matches_2019.csv").fillna(-1)
df['y'] = 1
print(df.size)
print(df.columns)

139050
Index(['tourney_id', 'tourney_name', 'surface', 'draw_size', 'tourney_level',
       'tourney_date', 'match_num', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age',
       'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'score', 'best_of', 'round',
       'minutes', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points',
       'y'],
      dtype='object')


In [3]:
# randomize order of player 1 and player 2 columns and y value
np.random.seed(1)

df['shouldSwap'] = np.random.randint(2, size = df.shape[0])
df.loc[df['shouldSwap'],['winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'winner_rank', 'winner_rank_points', 'loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'loser_rank', 'loser_rank_points']] = df.loc[df['shouldSwap'],['loser_id', 'loser_seed', 'loser_entry', 'loser_name', 'loser_hand',
       'loser_ht', 'loser_ioc', 'loser_age', 'l_ace', 'l_df', 'l_svpt',
       'l_1stIn', 'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced', 'loser_rank', 'loser_rank_points', 'winner_id', 'winner_seed', 'winner_entry',
       'winner_name', 'winner_hand', 'winner_ht', 'winner_ioc', 'winner_age', 'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon',
       'w_SvGms', 'w_bpSaved', 'w_bpFaced', 'winner_rank', 'winner_rank_points']].values
df['y'] = df['shouldSwap'].apply(lambda x: int(not x))

# rename columns from winner to p1 and loser to p2
def renameWinnerLoser(x):
    if x == 'draw_size':
        return x
    
    return x.replace('winner_', 'p1_').replace('w_', 'p1_').replace('loser_', 'p2_').replace('l_', 'p2_')
df = df.rename(renameWinnerLoser, axis='columns')
print(df.columns)

Index(['tourney_id', 'tourney_name', 'surface', 'drap1_size', 'tourney_level',
       'tourney_date', 'match_num', 'p1_id', 'p1_seed', 'p1_entry', 'p1_name',
       'p1_hand', 'p1_ht', 'p1_ioc', 'p1_age', 'p2_id', 'p2_seed', 'p2_entry',
       'p2_name', 'p2_hand', 'p2_ht', 'p2_ioc', 'p2_age', 'score', 'best_of',
       'round', 'minutes', 'p1_ace', 'p1_df', 'p1_svpt', 'p1_1stIn',
       'p1_1stWon', 'p1_2ndWon', 'p1_SvGms', 'p1_bpSaved', 'p1_bpFaced',
       'p2_ace', 'p2_df', 'p2_svpt', 'p2_1stIn', 'p2_1stWon', 'p2_2ndWon',
       'p2_SvGms', 'p2_bpSaved', 'p2_bpFaced', 'p1_rank', 'p1_rank_points',
       'p2_rank', 'p2_rank_points', 'y', 'shouldSwap'],
      dtype='object')


In [4]:
# Model 1: basic logistic regression + basic features
# 
# predicting: is_winner_player1
# 
# features:
# for each player = (seed, rank, rank_pts, ht, age, is_right_hand)
# tournament info = (is_4_draw, is_8_draw, is_32_draw, is_64_draw, is_hard, is_grass, is_A_level, 
# is_D_level, is_F_level, is_G_level)

# step 1: feature engineering and data cleaning
def featEngModel1(dataset):
    def seedToInt(seed):
        resultStr = seed
        try:
            result = float(resultStr)
        except:
            result = sys.maxsize
        return result
    
    # add same winner and loser columns
    newDataSet = dataset[['p1_rank', 'p1_rank_points', 'p1_ht', 'p1_age', 
                          'p2_rank', 'p2_rank_points', 'p2_ht', 'p2_age']]
    
    # add winner compute columns
    newDataSet.loc[:, 'p1_seed'] = dataset['p1_seed'].apply(lambda x: seedToInt(x))
    newDataSet.loc[:, 'p1_is_right_hand'] = dataset['p1_hand'].apply(lambda x: x == 'R')
    
    # add loser columns
    newDataSet.loc[:, 'p2_seed'] = dataset['p2_seed'].apply(lambda x: seedToInt(x))
    newDataSet.loc[:, 'p2_is_right_hand'] = dataset['p2_hand'].apply(lambda x: x == 'R')
    
    # add record between winner and loser
    pastRecord = dict()
    winRecordList = list()
    loseRecordList = list()
    for index, row in dataset.iterrows():
        match = (row['p1_id'], row['p2_id'])
        reverseMatch = (row['p2_id'], row['p1_id'])
        if match not in pastRecord:
            pastRecord[match] = 0
        if reverseMatch not in pastRecord:
            pastRecord[reverseMatch] = 0
        pastRecord[match] += 1
        winRecordList.append(pastRecord[match])
        loseRecordList.append(pastRecord[reverseMatch])
    newDataSet.loc[:, 'p1_past_wins'] = winRecordList
    newDataSet.loc[:, 'p1_past_losses'] = loseRecordList
    
    # add tournament info columns
    newDataSet.loc[:, 'is_4_draw'] = dataset['draw_size'].apply(lambda x: x == 4)
    newDataSet.loc[:, 'is_8_draw'] = dataset['draw_size'].apply(lambda x: x == 8)
    newDataSet.loc[:, 'is_32_draw'] = dataset['draw_size'].apply(lambda x: x == 32)
    newDataSet.loc[:, 'is_64_draw'] = dataset['draw_size'].apply(lambda x: x == 64)
    newDataSet.loc[:, 'is_hard'] = dataset['surface'].apply(lambda x: x == 'Hard')
    newDataSet.loc[:, 'is_grass'] = dataset['surface'].apply(lambda x: x == 'Grass')
    newDataSet.loc[:, 'is_A_level'] = dataset['tourney_level'].apply(lambda x: x == 'A')
    newDataSet.loc[:, 'is_D_level'] = dataset['tourney_level'].apply(lambda x: x == 'D')
    newDataSet.loc[:, 'is_F_level'] = dataset['tourney_level'].apply(lambda x: x == 'F')
    newDataSet.loc[:, 'is_G_level'] = dataset['tourney_level'].apply(lambda x: x == 'G')
    
    newDataSet = newDataSet.assign(y = lambda x: 1)
    print(newDataSet.columns)
    return newDataSet


# step 2: create datasets
def swapWinnerAndLoserTrain(trainSet):
    inverseDataset = trainSet.copy()
    inverseDataset = inverseDataset.reindex(columns = ['loser_rank', 'loser_rank_points', 'loser_ht', 'loser_age',
       'winner_rank', 'winner_rank_points', 'winner_ht', 'winner_age',
       'loser_seed', 'loser_is_right_hand', 'winner_seed',
       'winner_is_right_hand', 'is_4_draw', 'is_8_draw', 'is_32_draw',
       'is_64_draw', 'is_hard', 'is_grass', 'is_A_level', 'is_D_level',
       'is_F_level', 'is_G_level', 'winner_past_wins', 'winner_past_losses', 'y'])
    inverseDataset = inverseDataset.assign(y = lambda x: 0)
    newTrainset = trainSet.copy().append(inverseDataset)
    return newTrainset

def createTestTrainSplitModel1(dataset): 
    newDataset = featEngModel1(dataset)
    # newDataset = newDataset.dropna()
    # msk = np.random.rand(len(newDataset)) < 0.8
    msk = np.arange(len(newDataset)) < (0.8 * len(newDataset))
    train = newDataset[msk]
    # train = swapWinnerAndLoserTrain(train)
    test = newDataset[~msk]
    return (train, test)

model1Train, model1Test = createTestTrainSplitModel1(df)
print((model1Train.size, model1Test.size))

# step 3: train model

X = model1Train.loc[:, model1Train.columns != 'y']
y = model1Train['y']
clf_lr = LogisticRegression(random_state=0).fit(X, y)
print(clf_lr.coef_)

# step 4: test model quality
X_test = model1Test.loc[:, model1Test.columns != 'y']
y_test = model1Test['y']
clf_lr.predict(X_test)
clf_lr.score(X_test, y_test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item_labels[indexer[info_axis]]] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


KeyError: 'draw_size'

In [None]:
# Model 2: Random Forest

# train model
clf_rf = RandomForestClassifier(max_depth=3, random_state=0)
clf_rf.fit(X, y)

# eval model
print(clf_rf.feature_importances_)
clf_rf.predict(X_test)
clf_rf.score(X_test, y_test)

In [None]:
# Model 3: SVM
# train model
clf_svm = svm.SVC()
clf_svm.fit(X, y)

#eval model
clf_svm.predict(X_test)
print(clf_svm.score(X_test, y_test))