In [13]:
import joblib
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.svm import SVC
from scipy.stats import uniform
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

def generate_matchup_stats(winner, loser, year, stats_df):
    winner_stats = stats_df.loc[
        (stats_df.id == winner) & (stats_df.year < year),
        ['id', 'nmatches', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
    ].groupby(by='id', as_index=False).sum(min_count=1)
    loser_stats = stats_df.loc[
        (stats_df.id == loser) & (stats_df.year < year),
        ['id', 'nmatches', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
    ].groupby(by='id', as_index=False).sum(min_count=1)


    if not winner_stats.empty and not loser_stats.empty:
        winner_stats = winner_stats / winner_stats['nmatches'].iloc[0]
        loser_stats = loser_stats / loser_stats['nmatches'].iloc[0]

        winner_stats.drop(['id', 'nmatches'], axis=1, inplace=True)
        loser_stats.drop(['id', 'nmatches'], axis=1, inplace=True)

        return pd.concat([winner_stats, loser_stats], axis=1).squeeze(axis=0)

### Preprocess Data

In [2]:
path = '../../../Data/'

# import db data from local csv files
tournaments = pd.read_csv(path+'tournament.csv')
games = pd.read_csv(path+'game.csv')
players = pd.read_csv(path+'player_stats_yearly.csv')[
    ['id', 'year', 'nmatches', 'ace', 'df', 'svpt', '1stIn', '1stWon', '2ndWon', 'SvGms', 'bpSaved', 'bpFaced']
]

  games = pd.read_csv(path+'game.csv')


In [3]:
# restructure match data
matchups = tournaments[['id', 'start_date']].merge(
    right=games[['tourney_id', 'winner_id', 'loser_id']],
    left_on='id',
    right_on='tourney_id',
    how='inner',
)[['start_date', 'winner_id', 'loser_id']]

# convert start date to just year
matchups['year'] = matchups['start_date'].apply(lambda x: int(x[:4]))
matchups = matchups[matchups.year > 1987]
matchups = matchups.drop(['start_date'], axis=1)

matchups.head()

Unnamed: 0,winner_id,loser_id,year
159536,4931,4896,1988
159537,5019,749,1988
159538,5509,4946,1988
159539,5257,4720,1988
159540,4888,4730,1988


In [4]:
# randomly swap 50% of winners with losers to balance dataset
idx = np.random.choice(matchups.shape[0], matchups.shape[0]//2)
matchups.iloc[idx, [0,1]] = matchups.iloc[idx, [1,0]]

# zero = first person won, one = second person won
target = np.zeros(matchups.shape[0])
target[idx] = 1

In [5]:
tqdm.pandas()

# get the stats vectors for each match
match_stats = matchups.progress_apply(
    lambda row: generate_matchup_stats(row['winner_id'], row['loser_id'], row['year'], players), 
    axis=1
)

100%|██████████| 863426/863426 [50:02<00:00, 287.58it/s]  


In [6]:
# add updated target variable
match_stats['win'] = target

# drop match stats where both the winner and loser have all NaN values
match_stats = match_stats.dropna(
    axis=0, 
    thresh=2,
)

### ML Model

In [18]:
# randomized hyparam tuning params
k = 5 # folds to fit
n_models = 100 # models to train
cpus = -2 # number of cpus
seed = 510212

In [19]:
# split data
X_train, X_test, y_train, y_test = train_test_split(
    match_stats.iloc[:, :-1].to_numpy(),
    match_stats.iloc[:, -1].to_numpy(), 
    test_size=.15,
    random_state=seed
)

In [26]:
# define pipeline 
pipe = Pipeline(
    [
        ('scaler', StandardScaler()), # standardize
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1)), # impute -1 for missing values
    ]
)

# define hyperparameter search space
models = {
    'LGR': [
        LogisticRegression(
            penalty='elasticnet', 
            solver='saga', 
            max_iter=2000, 
            random_state=seed
        ),
        {
            'LGR__C': uniform(loc=0,scale=4), 
            'LGR__l1_ratio': uniform(loc=0, scale=1)
        }
    ],
    'RF': [
        RandomForestClassifier(
            random_state=seed,
        ),
        {
            'RF__n_estimators': list(range(50,10000,50)), 
            'RF__max_features': ['sqrt', 'log2'], 
            'RF__min_samples_leaf': uniform(loc=0, scale=1),
            'RF__max_depth': list(range(1,500)) + [None]
        }
    ],
    'GBM': [
        GradientBoostingClassifier(
            n_iter_no_change=10,
            validation_fraction=0.1,
            random_state=seed,
        ),
        {
            'GBM__learning_rate': uniform(loc=0, scale=0.4),
            'GBM__n_estimators': [1] + list(range(25,10000,25)),
            'GBM__min_samples_split': uniform(loc=0, scale=1),
            'GBM__min_samples_leaf': uniform(loc=0, scale=1),
            'GBM__max_depth': list(range(1,25)),
            'GBM__max_features': ['sqrt', 'log2'],
            'GBM__subsample': np.linspace(0.5, 1.0, num=20)
        }
    ]
}

In [None]:
best_name = ''
best_model = None
best_score = 0

for model in models:
    print(f'Randomized CV: {model}')
    
    # add model to pipeline
    pipe.steps.append((model, models[model][0]))

    # create randomized search cv
    rcv = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=models[model][1],
        n_iter=n_models,
        scoring='accuracy',
        refit=True,
        cv=k,
        random_state=seed,
        verbose=2,
        n_jobs=cpus
    )

    # tune hyperparams
    rcv.fit(X_train, y_train)
    if best_score < rcv.best_score_:
        best_score = rcv.best_score_
        best_model = rcv.best_estimator_
        best_name = model

    # remove model from pipeline
    pipe.steps.pop()

In [28]:
# model performance
### using accuracy as the metric since ultimately that's what we care about in the simulation feature
print(f'{best_name} was the best model after hyperparameter tuning')
print(f'{k}-fold CV Accuracy: {best_score*100:.2f}%')
print(f'Test Accuracy: {best_model.score(X_test, y_test)*100:.2f}%')

GBM was the best model after hyperparameter tuning
5-fold CV Accuracy: 64.58%
Test Accuracy: 64.39%


In [29]:
# save model
joblib.dump(best_model, best_name + '_classifier.joblib')

['GBM_classifier.joblib']