In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#### Loading in the processed data and mapping the target column to binary output

In [None]:
# Load the processed match data and discard the 'Unnamed: 0' column
# (presumably a saved index column — TODO confirm).
engin = pd.read_csv("nonsparsetheta.csv").drop(columns='Unnamed: 0')

# Binary target: 0 when `target` equals `player1_name`, 1 otherwise.
m = {"player1": 0, "player2": 1}
player1_won = engin['target'] == engin['player1_name']
winner_slot = pd.Series(np.where(player1_won, 'player1', 'player2'),
                        index=engin.index)
engin['maptarget'] = winner_slot.map(m)

In [None]:
# Predictor columns fed to the models below.
feature_cols = [
    "player1_age",
    "player2_age",
    "player1_ht",
    "player2_ht",
    "player1_rank",
    "player2_rank",
    "player1_h2h",
    "player2_h2h",
    "surface",
    "tourney_level",
    "player_1_recent_form",
    "player_2_recent_form",
    "player_1_theta_form",
    "player_2_theta_form",
    "player1_surface_win_pct",
    "player2_surface_win_pct",
    "player1_level_win_pct",
    "player2_level_win_pct",
]

# .copy() makes enginx an independent frame, so later column assignments on it
# do not raise SettingWithCopyWarning.
enginx = engin[feature_cols].copy()
enginy = engin['maptarget']

In [None]:
# One encoder per categorical column: a single shared encoder only remembers
# the classes of whichever column it was fitted on last.
surface_encoder = LabelEncoder()
tourney_level_encoder = LabelEncoder()

# Encode from the untouched columns in `engin` and build a new frame with
# assign(): re-running this cell gives the same result, and nothing is written
# into a slice of another DataFrame.
enginx = enginx.assign(
    surface=surface_encoder.fit_transform(engin['surface']),
    tourney_level=tourney_level_encoder.fit_transform(engin['tourney_level']),
)

# `le` is reused by later cells; as before, it ends up fitted on tourney_level.
le = tourney_level_encoder

In [None]:
# Hold out 25% of the rows for testing; the fixed seed keeps the split stable.
engintrainx, engintestx, engintrainy, engintesty = train_test_split(
    enginx,
    enginy,
    test_size=0.25,
    random_state=5323,
)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_validate
import xgboost

#### Pipeline for cross-validation and model metrics

In [None]:
def cv(model, X, y, cv=100):
    """Cross-validate ``model`` and summarise the mean metrics per split.

    Parameters
    ----------
    model : estimator handed to ``cross_validate``.
    X, y : features and target handed to ``cross_validate``.
    cv : value forwarded as the ``cv`` argument of ``cross_validate``.

    Returns
    -------
    dict
        Mean accuracy (multiplied by 100), precision, recall and F1 for the
        training and validation folds, followed by the raw ``cross_validate``
        output under the ``"model"`` key.
    """
    # (summary label, key in the cross_validate output), in output order.
    layout = (
        ("Mean Training Accuracy", 'train_accuracy'),
        ("Mean Training Precision", 'train_precision'),
        ("Mean Training Recall", 'train_recall'),
        ("Mean Training F1 Score", 'train_f1'),
        ("Mean Validation Accuracy", 'test_accuracy'),
        ("Mean Validation Precision", 'test_precision'),
        ("Mean Validation Recall", 'test_recall'),
        ("Mean Validation F1 Score", 'test_f1'),
    )
    fold_scores = cross_validate(
        estimator=model,
        X=X,
        y=y,
        cv=cv,
        scoring=['accuracy', 'precision', 'recall', 'f1'],
        return_train_score=True,
        return_estimator=True,
    )

    summary = {label: fold_scores[key].mean() for label, key in layout}
    # Only the accuracy entries are reported as percentages.
    for label in ("Mean Training Accuracy", "Mean Validation Accuracy"):
        summary[label] = summary[label] * 100
    summary["model"] = fold_scores
    return summary

In [None]:
# Baseline: liblinear logistic regression, scored with 5-fold
# cross-validation on the training split.
logit = LogisticRegression(solver='liblinear')
logit_result = cv(model=logit, X=engintrainx, y=engintrainy, cv=5)

In [None]:
# Display the summary metrics; the "model" entry holds the raw cross_validate output.
logit_result

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Seed the estimator as well as the search: seeding only RandomizedSearchCV
# fixes which candidates are sampled, but each forest would still be grown with
# fresh randomness, so the reported best parameters could change between runs.
rf = RandomForestClassifier(random_state=5323)

# Randomized search over a coarse grid to find a promising hyper-parameter range.
n_estimators = list(range(200, 2001, 200))       # 200, 400, ..., 2000
max_features = ['sqrt']
max_depth = list(range(10, 111, 10)) + [None]    # 10, 20, ..., 110, plus None
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [False]

randomgrid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

# Only n_iter=2 candidates are sampled, each scored with 3-fold CV.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=randomgrid,
    n_iter=2,
    cv=3,
    verbose=2,
    random_state=5323,
)
rf_random.fit(engintrainx, engintrainy)

In [None]:
# Show the best hyper-parameter combination found by the randomized search.
rf_random.best_params_

In [None]:
# Random forest with hand-set hyper-parameters.
# NOTE(review): these values are hard-coded rather than read from
# rf_random.best_params_, and min_samples_split=4 is not one of the values in
# the search grid ([2, 5, 10]) — confirm they are the intended settings.
rft = RandomForestClassifier(
    n_estimators=600,
    min_samples_split=4,
    min_samples_leaf=4,
    max_features='sqrt',
    max_depth=20,
    bootstrap=False,
    # Fixed seed so the hold-out accuracy and feature importances are
    # reproducible from one run to the next.
    random_state=5323,
)

rft.fit(engintrainx, engintrainy)

In [None]:
from sklearn.metrics import accuracy_score

# Accuracy of the fitted random forest on the held-out test split.
preds = rft.predict(engintestx)
accuracy_score(y_true=engintesty, y_pred=preds)

In [None]:
import matplotlib.pyplot as plt

# Explicit figure/axes instead of the pyplot state machine; ending with
# plt.show() keeps the tick-label repr out of the cell output.
fig, ax = plt.subplots()
ax.bar(engintrainx.columns, rft.feature_importances_)
ax.set_title("Feature importance per RF model")
ax.set_ylabel("Importance")
ax.tick_params(axis="x", labelrotation=90)
plt.show()

In [None]:
# The cv() helper is not used for XGBoost: per the original author's note it
# does not work with that helper, so the model is fitted once on the training
# split and scored on the hold-out split instead.
xgb = xgboost.XGBClassifier().fit(engintrainx, engintrainy)
xgp = xgb.predict(engintestx)
accuracy_score(y_true=engintesty, y_pred=xgp)

#### Exploring the confidence metric using logistic regression. Is it needed or does rank cover it well enough?

In [None]:
# Columns shared by both model variants, split around the position where the
# theta-form features are inserted so the column order of each frame is kept.
leading_features = [
    "player1_age",
    "player2_age",
    "player1_ht",
    "player2_ht",
    "player1_rank",
    "player2_rank",
    "player1_h2h",
    "player2_h2h",
    "surface",
    "tourney_level",
]
theta_features = [
    "player_1_theta_form",
    "player_2_theta_form",
]
win_pct_features = [
    "player1_surface_win_pct",
    "player2_surface_win_pct",
    "player1_level_win_pct",
    "player2_level_win_pct",
]

# Encode the categoricals straight from `engin`, so this cell does not rely on
# `enginx` having been label-encoded by an earlier cell.
encoded_categoricals = {
    'surface': LabelEncoder().fit_transform(engin['surface']),
    'tourney_level': LabelEncoder().fit_transform(engin['tourney_level']),
}

# assign() returns new frames (existing columns keep their position), so
# nothing is written into a slice of `engin`.
enginxwithout = engin[leading_features + win_pct_features].assign(**encoded_categoricals)
enginxwith = engin[leading_features + theta_features + win_pct_features].assign(**encoded_categoricals)

enginy = engin['maptarget']

In [None]:
# Split both feature sets and the target in a single call: every array is
# partitioned with the same seeded shuffle.
(engintrainxwith, engintestxwith,
 engintrainxwithout, engintestxwithout,
 engintrainy, engintesty) = train_test_split(
    enginxwith,
    enginxwithout,
    enginy,
    test_size=0.25,
    random_state=5323,
)

In [None]:
from statsmodels.api import Logit

# Fit the same statsmodels logit on both feature sets (with and without the
# theta-form columns) against the integer-coded target.
# NOTE(review): no constant column is added to either design matrix, so both
# models are fitted without an intercept — confirm that this is intended.
train_target = engintrainy.astype(int)
logitwith = Logit(train_target, engintrainxwith).fit()
logitwithout = Logit(train_target, engintrainxwithout).fit()

In [None]:
# Round the predicted probabilities to hard 0/1 labels and print the hold-out
# accuracy of each variant: first with the theta-form features, then without.
probs_with = logitwith.predict(engintestxwith)
predswith = [round(p) for p in probs_with]
print(accuracy_score(engintesty, predswith))

probs_without = logitwithout.predict(engintestxwithout)
predswithout = [round(p) for p in probs_without]
print(accuracy_score(engintesty, predswithout))

In [None]:
# Fit summary of the logit model that includes the theta-form features.
print(logitwith.summary())

In [None]:
# Fit summary of the logit model that excludes the theta-form features.
print(logitwithout.summary())