# Random Search 

### Load Packages

In [None]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from numpy import mean
from numpy import std
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from scipy.stats import loguniform
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import scipy.stats as stats
from sklearn.metrics import classification_report, confusion_matrix

### Read and Format Data

In [None]:
os.chdir('/Users/johnoliver/Downloads/grad-nba-wins/data/mod_data')
# data from 2010
df10 = pd.read_csv("mod10.csv")
# get rid of first variable (unique identifier)
df10 = df10.iloc[:,1:]

In [None]:
# define x and y variables
feature_cols = ["h_avg_points","a_avg_points","h_avg_fg","a_avg_fg",
                "h_avg_fga","a_avg_fga","h_avg_3p","a_avg_3p",
                "h_avg_3pa","a_avg_3pa","h_avg_ft","a_avg_ft",
                "h_avg_orb","a_avg_orb","h_avg_drb" ,"a_avg_drb",
                "h_avg_ast","a_avg_ast","h_avg_stl", "a_avg_stl", 
                "h_avg_blk","a_avg_blk","h_avg_tov","a_avg_tov",
                "h_avg_pf","a_avg_pf" ,"h_avg_tsp",  "a_avg_tsp",
                "h_avg_ortg","a_avg_ortg","h_avg_drtg","a_avg_drtg",
                "h_win_perc","a_win_perc","h_back","a_back",
                "home_elo", "away_elo"]

X = df10[feature_cols]
Y = df10.win_status

scaler = StandardScaler()
scale_X = scaler.fit_transform(X)

# split data into training and testing
X_train,X_test,y_train,y_test=train_test_split(X,Y,
                                               test_size=0.3,
                                               random_state=0)
# scaled data
scale_X_train,scale_X_test,y_train,y_test=train_test_split(scale_X,Y,
                                               test_size=0.3,
                                               random_state=0)

# Logistic Regression

In [None]:
%%time
# set up k fold cross validation 
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats = 3, random_state=1)

# define model
lr_model = LogisticRegression()
# define hyperparameters to search for
lr_space = dict()
lr_space['solver'] = ['newton-cg', 'lbfgs', 'liblinear']
lr_space['penalty'] = ['none', 'l1', 'l2', 'elasticnet']
lr_space['C'] = loguniform(1e-5, 100)
lr_space['multi_class'] = ['auto', 'ovr']
lr_space['verbose'] = [0, 1, 2]

# define random search domain
lr_rand_search = RandomizedSearchCV(lr_model, lr_space, n_iter=100,
                            scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)
search_results = lr_rand_search.fit(X_train, y_train)

In [149]:
bestmodel = search_results.best_estimator_
print("Cross Validation Score: %s" % round(search_results.score(X_train, y_train),3))
print("Difference in CV Scores: ", round((search_results.score(X_train, y_train) - 0.6977156659765356)*100,3))
print("Random Search Run Time: 2min 43s")
print("Difference in Run Times: 2 mins 38s")
print('Best Parameters: ', search_results.best_params_)

Cross Validation Score: 0.719
Difference in CV Scores:  2.148
Random Search Run Time: 2min 43s
Difference in Run Times: 2 mins 38s
Best Parameters:  {'C': 0.05639944644003838, 'multi_class': 'auto', 'penalty': 'l1', 'solver': 'liblinear', 'verbose': 2}


# Random Forest

In [None]:
%%time
#define model
rf_model = RandomForestClassifier()

# define hyperparameters to search for
rf_space = dict()
rf_space['max_depth'] = [1,2,3,4,5,6,7,8,9,10,
                         11,12,13,14,15,16,17,18,19,20]
rf_space['max_features'] = [1,2,3,4,5]
rf_space['n_estimators'] = [1,2,3,4,5,6,7,8,9,10,
                         11,12,13,14,15,16,17,18,19,20]
rf_space['criterion'] = ['gini', 'entropy']

# define random search domain
rf_rand_search = RandomizedSearchCV(rf_model, rf_space, n_iter=100,
                            scoring='accuracy', n_jobs=-1, cv=cv, random_state=1)

rf_search_results = rf_rand_search.fit(X_train, y_train)

In [165]:
df = pd.DataFrame(rf_search_results.cv_results_)
vals = ['param_n_estimators', 'param_max_features', 'param_max_depth', 'param_criterion', 'mean_test_score']
df[vals]

Unnamed: 0,param_n_estimators,param_max_features,param_max_depth,param_criterion,mean_test_score
0,1,1,3,gini,0.608440
1,19,4,11,gini,0.650462
2,11,1,7,gini,0.640842
3,20,3,2,entropy,0.647502
4,10,4,12,gini,0.625597
...,...,...,...,...,...
95,15,2,11,gini,0.637964
96,14,1,13,entropy,0.634576
97,17,1,5,gini,0.645638
98,11,1,17,entropy,0.629324


In [161]:
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_n_estimators,param_max_features,param_max_depth,param_criterion,params,split0_test_score,...,split23_test_score,split24_test_score,split25_test_score,split26_test_score,split27_test_score,split28_test_score,split29_test_score,mean_test_score,std_test_score,rank_test_score
0,0.011958,0.001853,0.005987,0.001249,1,1,3,gini,"{'n_estimators': 1, 'max_features': 1, 'max_de...",0.585714,...,0.557143,0.628571,0.614286,0.642857,0.485714,0.608696,0.608696,0.608440,0.042825,89
1,0.115827,0.022807,0.015272,0.007582,19,4,11,gini,"{'n_estimators': 19, 'max_features': 4, 'max_d...",0.600000,...,0.671429,0.614286,0.700000,0.742857,0.542857,0.753623,0.724638,0.650462,0.053879,25
2,0.071625,0.025830,0.012312,0.010379,11,1,7,gini,"{'n_estimators': 11, 'max_features': 1, 'max_d...",0.600000,...,0.585714,0.685714,0.614286,0.671429,0.528571,0.681159,0.681159,0.640842,0.050582,47
3,0.101003,0.010171,0.013766,0.004709,20,3,2,entropy,"{'n_estimators': 20, 'max_features': 3, 'max_d...",0.600000,...,0.657143,0.671429,0.671429,0.685714,0.628571,0.681159,0.623188,0.647502,0.036708,33
4,0.066746,0.009053,0.009629,0.002462,10,4,12,gini,"{'n_estimators': 10, 'max_features': 4, 'max_d...",0.657143,...,0.671429,0.585714,0.685714,0.571429,0.542857,0.637681,0.666667,0.625597,0.055317,79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.081917,0.006823,0.011193,0.002237,15,2,11,gini,"{'n_estimators': 15, 'max_features': 2, 'max_d...",0.657143,...,0.571429,0.628571,0.657143,0.614286,0.614286,0.637681,0.637681,0.637964,0.057475,53
96,0.071121,0.007940,0.009933,0.002700,14,1,13,entropy,"{'n_estimators': 14, 'max_features': 1, 'max_d...",0.614286,...,0.585714,0.600000,0.700000,0.628571,0.528571,0.637681,0.623188,0.634576,0.055903,63
97,0.070835,0.006221,0.011174,0.004713,17,1,5,gini,"{'n_estimators': 17, 'max_features': 1, 'max_d...",0.628571,...,0.671429,0.614286,0.700000,0.628571,0.614286,0.637681,0.652174,0.645638,0.032925,37
98,0.071338,0.023369,0.011277,0.004954,11,1,17,entropy,"{'n_estimators': 11, 'max_features': 1, 'max_d...",0.600000,...,0.614286,0.628571,0.714286,0.642857,0.571429,0.608696,0.594203,0.629324,0.054378,74


In [150]:
bestmodel = rf_search_results.best_estimator_
print("Cross Validation Score: %s" % round(rf_search_results.score(X_train, y_train),3))
print("Difference in CV Scores: ", round((rf_search_results.score(X_train, y_train) - 0.6977156659765356)*100,3))
print("Random Search Run Time: 1min 15s")
print("Difference in Run Times: 43s")
print('Best Parameters: ', rf_search_results.best_params_)

Cross Validation Score: 0.782
Difference in CV Scores:  8.452
Random Search Run Time: 1min 15s
Difference in Run Times: 43s
Best Parameters:  {'n_estimators': 16, 'max_features': 5, 'max_depth': 4, 'criterion': 'entropy'}


# Support Vector Machine

In [None]:
%%time
#define models
svc_model = SVC()

# define hyperparameters to search for
svc_space = dict()
svc_space['C'] = loguniform(1e-5, 100)
svc_space['degree'] = [1,2,3]
svc_space['gamma'] = ['auto', 'scale']
svc_space['kernel'] = ['linear', 'rbf']

# define random search domain
svc_rand_search = RandomizedSearchCV(svc_model, svc_space, n_iter=100,
                            scoring='accuracy', cv = cv)

svc_search_results = svc_rand_search.fit(scale_X_train, y_train)

In [None]:
bestmodel = svc_search_results.best_estimator_
print("Cross Validation Score: %s" % round(svc_search_results.score(X_train, y_train),3))
print("Difference in CV Scores: ", round((svc_search_results.score(X_train, y_train) - 0.6977156659765356)*100,3))
print("Test Score: ", round(bestmodel.score(X_test, y_test),3))
print("Difference in Tests: ", round((bestmodel.score(X_test, y_test) - 0.64) *100,3))
print("Random Search Run Time: 12min 21s")
print('Best Parameters: ', svc_search_results.best_params_)

# Naive Bayes

In [None]:
%%time
#define models
nb_model = GaussianNB()

# define hyperparameters to search for
nb_space = dict()
nb_space['var_smoothing'] = loguniform(1e-10, 100)

# define random search domain
nb_rand_search = RandomizedSearchCV(nb_model, nb_space, n_iter=100,
                            scoring='accuracy', refit = True)

nb_search_results = nb_rand_search.fit(scale_X_train, y_train)

In [None]:
bestmodel = nb_search_results.best_estimator_
print("Cross Validation Score: %s" % round(nb_search_results.score(X_train, y_train),3))
print("Difference in CV Scores: ", round((nb_search_results.score(X_train, y_train) - 0.6977156659765356)*100,3))
print("Test Score: ", round(bestmodel.score(X_test, y_test),3))
print("Difference in Tests: ", round((bestmodel.score(X_test, y_test) - 0.64) *100,3))
print("Random Search Run Time: 2.15s")
print('Best Parameters: ', nb_search_results.best_params_)