In [255]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

def generate_matchup_stats(winner, loser, year, stats_df):
    winner_stats = stats_df.loc[
        (stats_df.id == winner) & (stats_df.year < year),
        ['id', 'nmatches', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
    ].groupby(by='id', as_index=False).sum(min_count=1)
    loser_stats = stats_df.loc[
        (stats_df.id == loser) & (stats_df.year < year),
        ['id', 'nmatches', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
    ].groupby(by='id', as_index=False).sum(min_count=1)


    if not winner_stats.empty and not loser_stats.empty:
        winner_stats = winner_stats / winner_stats['nmatches'].iloc[0]
        loser_stats = loser_stats / loser_stats['nmatches'].iloc[0]

        winner_stats.drop(['id', 'nmatches'], axis=1, inplace=True)
        loser_stats.drop(['id', 'nmatches'], axis=1, inplace=True)

        return pd.concat([winner_stats, loser_stats], axis=1).squeeze(axis=0)

### Preprocess Data

In [59]:
path = '../../../Data/'

# import db data from local csv files
tournaments = pd.read_csv(path+'tournament.csv')
games = pd.read_csv(path+'game.csv')
players = pd.read_csv(path+'player_stats_yearly.csv')[
    ['id', 'year', 'nmatches', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
]

  games = pd.read_csv(path+'game.csv')


In [236]:
# restructure match data
matchups = tournaments[['id', 'start_date']].merge(
    right=games[['tourney_id', 'winner_id', 'loser_id']],
    left_on='id',
    right_on='tourney_id',
    how="inner",
)[['start_date', 'winner_id', 'loser_id']]

# convert start date to just year
matchups['year'] = matchups['start_date'].apply(lambda x: int(x[:4]))
matchups = matchups[matchups.year > 1987]
matchups = matchups.drop(['start_date'], axis=1)

matchups.head()

Unnamed: 0,winner_id,loser_id,year
159536,4931,4896,1988
159537,5019,749,1988
159538,5509,4946,1988
159539,5257,4720,1988
159540,4888,4730,1988


In [237]:
# randomly swap 50% of winners with losers to balance dataset
idx = np.random.choice(matchups.shape[0], matchups.shape[0]//2)
matchups.iloc[idx, [0,1]] = matchups.iloc[idx, [1,0]]

# zero = first person won, one = second person won
target = np.zeros(matchups.shape[0])
target[idx] = 1

In [238]:
tqdm.pandas()

# get the stats vectors for each match
match_stats = matchups.progress_apply(
    lambda row: generate_matchup_stats(row['winner_id'], row['loser_id'], row['year'], players), 
    axis=1
)

100%|██████████| 863426/863426 [40:27<00:00, 355.70it/s]  


In [242]:
# add updated target variable
match_stats['win'] = target

# drop match stats where both the winner and loser have all NaN values
match_stats = match_stats.dropna(
    axis=0, 
    thresh=2,
)

### ML Model

In [258]:
# split data
X_train, X_test, y_train, y_test = train_test_split(
    match_stats.iloc[:, :-1].to_numpy(),
    match_stats.iloc[:, -1].to_numpy(), 
    test_size=.2
)

array([[ 0.9375    ,  1.03125   , 14.75      , ...,  5.11864407,
         1.96610169,  3.57627119],
       [ 6.96640827,  2.13953488, 61.81912145, ...,  3.96521739,
         1.82608696,  3.07826087],
       [ 0.91304348,  2.01449275, 38.53623188, ...,  8.07594937,
         2.41772152,  4.01687764],
       ...,
       [ 1.13333333,  0.6       , 19.86666667, ...,  0.33333333,
         0.13043478,  0.23188406],
       [        nan,         nan,         nan, ...,  0.76315789,
         0.65789474,  1.        ],
       [ 0.69230769,  1.        , 24.76923077, ...,  0.70175439,
         0.19298246,  0.33333333]])

In [270]:
clf = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)),
        ('rf', RandomForestClassifier(n_estimators=1000, n_jobs=-2))
    ]
)

In [271]:
clf.fit(X_train, y_train)

In [268]:
clf.score(X_train, y_train)

0.9076032637336677

In [272]:
clf.score(X_test, y_test)

0.6438546597733943

In [4]:
df = []
for year in range(min(players['year']), max(players['year'])):
    x = players[players['year'] < year] # non-inclusive!
    x = x.groupby('id').sum().reset_index()
    x = x[x['nmatches'] > 10] # filter pointless players
    x['year'] = year
    df.append(x)
df = pd.concat(df)
feat_cols = ['ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
for feat in feat_cols:
    df[feat] /= df['nmatches']
df = df[feat_cols+['id', 'year']]
df

Unnamed: 0,ace,df,svpt,1stIn,1stWon,2ndWon,SvGms,bpSaved,bpFaced,id,year
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20841,1889
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20841,1890
51,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20845,1890
62,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20856,1890
47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20841,1891
...,...,...,...,...,...,...,...,...,...,...,...
36589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40520,2022
36600,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40550,2022
36603,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40553,2022
36608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40558,2022


In [5]:
game['year'] = game['tourney_id'].apply(lambda x: int(x.split('-')[0]))
tmp = df.merge(game, left_on=['id', 'year'], right_on=['winner_id', 'year'])
print(tmp.columns)
tmp = df.merge(tmp, left_on=['id', 'year'], right_on=['loser_id', 'year'])
print(tmp.columns)

Index(['ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved',
       'bpFaced', 'id', 'year', 'tourney_id', 'match_num', 'winner_id',
       'winner_seed', 'winner_entry', 'winner_age', 'loser_id', 'loser_seed',
       'loser_entry', 'loser_age', 'score', 'max_sets', 'round', 'minutes',
       'w_ace', 'w_df', 'w_svpt', 'w_1stIn', 'w_1stWon', 'w_2ndWon', 'w_SvGms',
       'w_bpSaved', 'w_bpFaced', 'l_ace', 'l_df', 'l_svpt', 'l_1stIn',
       'l_1stWon', 'l_2ndWon', 'l_SvGms', 'l_bpSaved', 'l_bpFaced',
       'winner_rank', 'winner_rank_points', 'loser_rank', 'loser_rank_points'],
      dtype='object')
Index(['ace_x', 'df_x', 'svpt_x', '1stIn_x', '1stWon_x', '2ndWon_x', 'SvGms_x',
       'bpSaved_x', 'bpFaced_x', 'id_x', 'year', 'ace_y', 'df_y', 'svpt_y',
       '1stIn_y', '1stWon_y', '2ndWon_y', 'SvGms_y', 'bpSaved_y', 'bpFaced_y',
       'id_y', 'tourney_id', 'match_num', 'winner_id', 'winner_seed',
       'winner_entry', 'winner_age', 'loser_id', 'loser_seed', 'loser_e

In [6]:
feats = np.array(tmp[[x+'_x' for x in feat_cols]+[x+'_y' for x in feat_cols]])
feats = feats[np.sum(feats, axis=1) > 0] # remove unnecessary data

N, d = feats.shape

labels = np.ones((N,))

# randomly reverse the ordering for half the data
ind = np.random.permutation(N)[:N//2]
tmp_data = feats[ind, :d//2]
feats[ind, :d//2] = feats[ind, d//2:]
feats[ind, d//2:] = tmp_data
labels[ind] = 0

# split shuffled data into train and test
ind = np.random.permutation(N)
feats, labels = feats[ind], labels[ind]
trn_tst = int(N*0.70)
trn_feats, trn_labels = feats[:trn_tst], labels[:trn_tst]
tst_feats, tst_labels = feats[trn_tst:], labels[trn_tst:]

In [7]:
np.mean(tst_labels == 1) # random guessing baseline!

0.5003591250408097

In [8]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

clf = make_pipeline(StandardScaler(), SVC(gamma='auto', verbose=True))
clf.fit(trn_feats, trn_labels)

[LibSVM]..........................................................................................................................................................................................
*...........................*..
*
optimization finished, #iter = 214370
obj = -233841.181357, rho = 0.254423
nSV = 235453, nBSV = 234465
Total nSV = 235453


Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(gamma='auto', verbose=True))])

# vegas odds: 68.91 % accuracy

In [9]:
trn_pred = clf.predict(trn_feats)
tst_pred = clf.predict(tst_feats)

print('TRN ACC:', np.mean(trn_pred == trn_labels))
print('TST ACC:', np.mean(tst_pred == tst_labels))

TRN ACC: 0.6081475314644709
TST ACC: 0.6082843617368593


# V1 model: 60.82% accuracy

SVM with all 18 feats and entire career stats

In [10]:
import pickle
with open('model_v1.pkl', 'wb') as f:
    pickle.dump(clf, f)