In [1]:
# import basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import packages for Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# import the modeling packages 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Loaded in the data
df = pd.read_csv('../csv_files/Capstone_p1_final_Classification.csv', index_col=0)
df.head()

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.672228,0.357844,0.718132,0.601149,0.498663,0.265413,0.671275,0.907603,0.867653,0.864242,0.480322,0.501337,0.85586,0.773579,0
1,0.774289,0.272976,0.736725,0.499349,0.615201,0.176942,0.489684,0.749759,0.621326,0.758603,0.528431,0.384799,0.414273,0.799606,0
2,0.693817,0.715626,0.788988,0.765937,0.42761,0.558048,0.825491,0.87488,0.535157,0.721358,0.759112,0.57239,0.202294,0.699409,1
3,0.464181,0.651074,0.652064,0.539056,0.386152,0.386913,0.721012,0.441771,0.440452,0.84241,0.79047,0.613848,0.736733,0.678961,1
4,0.454367,0.497135,0.661054,0.379835,0.6237,0.172729,0.798354,0.626564,0.42946,0.969252,0.703116,0.3763,0.370873,0.967992,0


In [3]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Target # Target

In [4]:
# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_resample, Y_resample = ros.fit_resample(X, Y)



In [5]:
# tweaked code from class to build this block of code

# save features and targets as the X_resample and Y_resample variables 
features, targets = X_resample, Y_resample

# define an empty list that the following models will feed into
models = []

# append the list with all the desired models 
# the first time we ran all the models listed below
# the second we commented out the models we did not want to run and left only our selected models 
models.append(('LogisticRegression', LogisticRegression(solver='liblinear', random_state=2019)))
#models.append(('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=9, max_features=10, criterion='gini', splitter='best', random_state=2019)))
#models.append(('KNeighborsClassifier', KNeighborsClassifier()))
#models.append(('SVC', SVC(kernel='rbf',gamma='auto')))
#models.append(('AdaBoostClassifier', AdaBoostClassifier(random_state=2019)))
#models.append(('XGBoost', xgb.XGBClassifier(n_estimators=250, max_depth=5, col_sample_bytree=.5, learning_rate=0.6, random_state=2019)))
#models.append(('Random Forest:', RandomForestClassifier(n_estimators=75, max_depth=15, max_features=9, random_state=2019)))

# use Cross Validation in the model with a 'stratify' option using the StratifiedKFolds package from sklearn
# specifiy that the scoring method is F1 
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv, scoring='f1')
    print("Model:{0}, F1 Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))
    
    
# Same as above but the scoring option has beeen changed to AUC
cv1 = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv1, scoring='roc_auc')
    print("Model:{0}, AUC Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))

Model:LogisticRegression, F1 Score: mean=0.92132, var=0.00002
Model:LogisticRegression, AUC Score: mean=0.97966, var=0.00001
