# Hyperopt

In [41]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from scipy.stats import loguniform
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
import hyperopt.pyll.stochastic
import seaborn as sns
import matplotlib as plt
from sklearn.metrics import accuracy_score

### Load and Format Data

In [42]:
os.chdir('/Users/johnoliver/Downloads/grad-nba-wins/data/mod_data')
# data from 2010
df10 = pd.read_csv("mod10.csv")
# get rid of first variable (unique identifier)
df10 = df10.iloc[:,1:]

In [43]:
# define x and y variables
feature_cols = ["h_avg_points","a_avg_points","h_avg_fg","a_avg_fg",
                "h_avg_fga","a_avg_fga","h_avg_3p","a_avg_3p",
                "h_avg_3pa","a_avg_3pa","h_avg_ft","a_avg_ft",
                "h_avg_orb","a_avg_orb","h_avg_drb" ,"a_avg_drb",
                "h_avg_ast","a_avg_ast","h_avg_stl", "a_avg_stl", 
                "h_avg_blk","a_avg_blk","h_avg_tov","a_avg_tov",
                "h_avg_pf","a_avg_pf" ,"h_avg_tsp",  "a_avg_tsp",
                "h_avg_ortg","a_avg_ortg","h_avg_drtg","a_avg_drtg",
                "h_win_perc","a_win_perc","h_back","a_back",
                "home_elo", "away_elo"]

X = df10[feature_cols]
Y = df10.win_status

scaler = StandardScaler()
scale_X = scaler.fit_transform(X)

# split data into training and testing
X_train,X_test,y_train,y_test=train_test_split(X,Y,
                                               test_size=0.3,
                                               random_state=0)
# scaled data
scale_X_train,scale_X_test,y_train,y_test=train_test_split(scale_X,Y,
                                               test_size=0.3,
                                               random_state=0)

# Logistic Regression

In [None]:
%%time
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats = 3, random_state=1)
def hyperopt_train_test(params):
    X_ = X_train[:]
    clf = LogisticRegression(**params)
    return cross_val_score(clf, X_, y_train).mean()

space4lr = {
    'solver': hp.choice('solver', ['newton-cg', 'lbfgs', 'liblinear']),
    'penalty': hp.choice('penalty', ['none', 'l1', 'l2', 'elasticnet']),
    'C': hp.uniform('C', 0, 20),
    'multi_class': hp.choice('multi_class', ["auto", "ovr"]),
    'verbose': hp.choice('verbose', [0,1,2])
}

best = 0
def f(params):
    global best
    acc = hyperopt_train_test(params)
    if acc > best:
        best = acc
    print('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space4lr, algo=tpe.suggest, max_evals=100, trials=trials)

In [56]:
print("Cross Validation Score: 0.709")
print("Difference in CV from Unparameterized: ", round((0.709 - 0.6977156659765356)*100,3))
print("Difference in CV from Random Search: ", round((0.709 - 0.709)*100,3))
print("Hyperopt Run Time: 48.2s")
print("Difference in Run Time: 1 min 55 seconds")
print('best hyps:')
print(best)

Cross Validation Score: 0.709
Difference in CV from Unparameterized:  1.128
Difference in CV from Random Search:  0.0
Random Search Run Time: 1min 2s


# Random Forest

In [1]:
%%time
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats = 3, random_state=1)
def hyperopt_train_test(params):
    X_ = X_train[:]
    clf = RandomForestClassifier(**params)
    return cross_val_score(clf, X_, y_train, cv = cv).mean()

space4rf = {
    'max_depth': hp.choice('max_depth', [1,2,3,4,5,6,7,8,9,10,11,12,13,14,
                                         15,16,17,18,19,20]),
    'max_features': hp.choice('max_features', [1,2,3,4,5]),
    'n_estimators': hp.choice('n_estimators', [1,2,3,4,5,6,7,8,9,10,11,12,13,14,
                                         15,16,17,18,19,20]),
    'criterion': hp.choice('criterion', ["gini", "entropy"])
}

best = 0
def f(params):
    global best
    acc = hyperopt_train_test(params)
    if acc > best:
        best = acc
    print('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}

trials = Trials()
best = fmin(f, space4rf, algo=tpe.suggest, max_evals=100, trials=trials)

NameError: name 'RepeatedStratifiedKFold' is not defined

In [75]:
print("Cross Validation Score: 0.679")
print("Difference in CV from Unparameterized: ", round((0.6805 - 0.6676)*100,3))
print("Difference in CV from Random Search: ", round((0.6805 - 0.679)*100,3))
print("Random Search Run Time: 1 min 44 seconds")
print("Difference in run time: -32 seconds")
print('best hyps:')
print(best)

Cross Validation Score: 0.679
Difference in CV from Unparameterized:  1.29
Difference in CV from Random Search:  0.15
Random Search Run Time: 1 min 44 seconds
Difference in run time: -32 seconds
best hyps:
{'C': 0.54133135022197, 'gamma': 0.5642208213263, 'kernel': 0}


# Support Vector Machine

In [None]:
%%time
def hyperopt_train_test(params):
    X_ = scale_X_train[:]
    clf = SVC(**params)
    return cross_val_score(clf, X_, y_train).mean()

space4svm = {
    'C': hp.uniform('C', 0, 20),
    'kernel': hp.choice('kernel', ['linear', 'sigmoid', 'poly', 'rbf']),
    'gamma': hp.uniform('gamma', 0, 20)
}

best = 0
def f(params):
    global best
    acc = hyperopt_train_test(params)
    if acc > best:
        best = acc
    print('new best:', best, params)
    return {'loss': -acc, 'status': STATUS_OK}


trials = Trials()
best = fmin(f, space4svm, algo=tpe.suggest, max_evals=100, trials=trials)

In [74]:
print("Cross Validation Score: 0.706")
print("Difference in CV from Unparameterized: ", round((0.706 - 0.6676)*100,3))
print("Difference in CV from Random Search: ", round((0.706 - 0.697)*100,3))
print("Random Search Run Time: 1 min 51 seconds")
print("Difference in run time: -36 seconds")
print('best hyps:')
print(best)

Cross Validation Score: 0.706
Difference in CV from Unparameterized:  3.84
Difference in CV from Random Search:  0.9
Random Search Run Time: 1 min 51 seconds
Difference in run time: -36 seconds
best hyps:
{'C': 0.54133135022197, 'gamma': 0.5642208213263, 'kernel': 0}
