# Random Search 

### Load Packages

In [117]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import scipy.stats as stats
from sklearn.metrics import classification_report, confusion_matrix

### Read and Format Data

In [118]:
os.chdir('/Users/johnoliver/Downloads/grad-nba-wins/data/mod_data')
# data from 2010
df10 = pd.read_csv("mod10.csv")
# get rid of first variable (unique identifier)
df10 = df10.iloc[:,1:]

In [119]:
# define x and y variables
feature_cols = ["h_avg_points","a_avg_points","h_avg_fg","a_avg_fg",
                "h_avg_fga","a_avg_fga","h_avg_3p","a_avg_3p",
                "h_avg_3pa","a_avg_3pa","h_avg_ft","a_avg_ft",
                "h_avg_orb","a_avg_orb","h_avg_drb" ,"a_avg_drb",
                "h_avg_ast","a_avg_ast","h_avg_stl", "a_avg_stl", 
                "h_avg_blk","a_avg_blk","h_avg_tov","a_avg_tov",
                "h_avg_pf","a_avg_pf" ,"h_avg_tsp",  "a_avg_tsp",
                "h_avg_ortg","a_avg_ortg","h_avg_drtg","a_avg_drtg",
                "h_win_perc","a_win_perc","h_back","a_back",
                "home_elo", "away_elo"]

X = df10[feature_cols]
Y = df10.win_status

scaler = StandardScaler()
scale_X = scaler.fit_transform(X)

# split data into training and testing
X_train,X_test,y_train,y_test=train_test_split(X,Y,
                                               test_size=0.3,
                                               random_state=0)
# scaled data
scale_X_train,scale_X_test,y_train,y_test=train_test_split(scale_X,Y,
                                               test_size=0.3,
                                               random_state=0)

# Logistic Regression

In [None]:
%%time
# set up k fold cross validation 
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats = 3, random_state=1)

# define model
lr_model = LogisticRegression()
# define hyperparameters to search for
lr_space = dict()
lr_space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
lr_space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
lr_space['C'] = loguniform(1e-5, 100)
lr_space['multi_class'] = ['auto', 'ovr']
lr_space['verbose'] = [0, 1, 2]

# define random search domain
lr_rand_search = RandomizedSearchCV(lr_model, lr_space, n_iter=100,
                            scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
search_results = lr_rand_search.fit(X_train, y_train)

In [128]:
bestmodel = search_results.best_estimator_
print("Cross Validation Score: %s" % round(search_results.score(X_train, y_train),3))
print("Difference in CV Scores: ", round((search_results.score(X_train, y_train) - 0.6977156659765356)*100,3))
print("Test Score: ", round(bestmodel.score(X_test, y_test),3))
print("Difference in Tests: ", round((bestmodel.score(X_test, y_test) - 0.64) *100,3))
print("Random Search Run Time: 2min 43s")
print('Best Parameters: ', search_results.best_params_)

Cross Validation Score: 0.719
Difference in CV Scores:  2.148
Test Score:  0.683
Difference in Tests:  4.333
Random Search Run Time: 2min 43s
Best Parameters:  {'C': 0.05639944644003838, 'multi_class': 'auto', 'penalty': 'l1', 'solver': 'liblinear', 'verbose': 2}


# Random Forest

In [None]:
%%time
#define model
rf_model = RandomForestClassifier()

# define hyperparameters to search for
rf_space = dict()
rf_space['max_depth'] = [1,2,3,4,5,6,7,8,9,10,
                         11,12,13,14,15,16,17,18,19,20]
rf_space['max_features'] = [1,2,3,4,5]
rf_space['n_estimators'] = [1,2,3,4,5,6,7,8,9,10,
                         11,12,13,14,15,16,17,18,19,20]
rf_space['criterion'] = ['gini', 'entropy']

# define random search domain
rf_rand_search = RandomizedSearchCV(rf_model, rf_space, n_iter=100,
                            scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

rf_search_results = rf_rand_search.fit(X_train, y_train)

In [130]:
bestmodel = rf_search_results.best_estimator_
print("Cross Validation Score: %s" % round(rf_search_results.score(X_train, y_train),3))
print("Difference in CV Scores: ", round((rf_search_results.score(X_train, y_train) - 0.6977156659765356)*100,3))
print("Test Score: ", round(bestmodel.score(X_test, y_test),3))
print("Difference in Tests: ", round((bestmodel.score(X_test, y_test) - 0.64) *100,3))
print("Random Search Run Time: 1min 15s")
print('Best Parameters: ', rf_search_results.best_params_)

Cross Validation Score: 0.782
Difference in CV Scores:  8.452
Test Score:  0.683
Difference in Tests:  4.333
Random Search Run Time: 1min 15s
Best Parameters:  {'n_estimators': 16, 'max_features': 5, 'max_depth': 4, 'criterion': 'entropy'}


# Support Vector Machine

In [None]:
%%time
#define models
svc_model = SVC()

# define hyperparameters to search for
svc_space = dict()
svc_space['C'] = loguniform(1e-5, 100)
svc_space['gamma'] = ['auto', 'scale']
svc_space['verbose'] = [1,2,3]
svc_space['kernel'] = ['linear', 'rbf']

# define random search domain
svc_rand_search = RandomizedSearchCV(svc_model, svc_space, n_iter=100,
                            scoring='accuracy', refit = True, cv = cv)

svc_search_results = svc_rand_search.fit(scale_X_train, y_train)

In [None]:
bestmodel = svc_search_results.best_estimator_
print("Cross Validation Score: %s" % round(svc_search_results.score(X_train, y_train),3))
print("Difference in CV Scores: ", round((svc_search_results.score(X_train, y_train) - 0.6977156659765356)*100,3))
print("Test Score: ", round(bestmodel.score(X_test, y_test),3))
print("Difference in Tests: ", round((bestmodel.score(X_test, y_test) - 0.64) *100,3))
print("Random Search Run Time: 1min 15s")
print('Best Parameters: ', svc_search_results.best_params_)

# Naive Bayes

In [None]:
%%time
#define models
nb_model = GaussianNB()

# define hyperparameters to search for
nb_space = dict()
nb_space['var_smoothing'] = np.logspace(0,-9, num=100)

# define random search domain
nb_rand_search = RandomizedSearchCV(nb_model, nb_space, n_iter=100,
                            scoring='accuracy', refit = True)

nb_rand_search.fit(scale_X_train, y_train)

In [101]:
nb_ypred = nb_rand_search.predict(X_test)
print('Test Score: %s' % accuracy_score(y_test, nb_ypred))
print("Nonparameterized Run Time: ")
print('Best Parameters: ', nb_rand_search.best_params_)
print('Difference from non-param: ', (accuracy_score(y_test, nb_ypred) - 0.66)*100, '%')

Test Score: 0.38666666666666666
Nonparameterized Run Time: 
Best Parameters:  {'var_smoothing': 1.0}
Difference from non-param:  -27.333333333333336 %


