In [1]:
# import basic packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# import packages for Cross Validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# import the modeling packages 
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Loaded in the data
df = pd.read_csv('../csv_files/Model_Ready_Classification.csv', index_col=0)
df.head()

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.833,0.461538,0.395604,0.32967,0.506173,0.036585,1.785714,0.952,0.628049,0.463415,0.256098,0.493827,0.10989,2.142857,0
1,0.885,0.430851,0.404255,0.276596,0.538462,0.02439,1.133333,0.87,0.542683,0.414634,0.280488,0.461538,0.053191,2.266667,0
2,0.844,0.590909,0.428571,0.415584,0.486486,0.076923,2.5,0.935,0.512821,0.397436,0.397436,0.513514,0.025974,1.8125,1
3,0.727,0.567568,0.364865,0.297297,0.475,0.053333,2.0,0.71,0.48,0.453333,0.413333,0.525,0.094595,1.727273,1
4,0.722,0.511905,0.369048,0.214286,0.540816,0.02381,2.363636,0.806,0.47619,0.511905,0.369048,0.459184,0.047619,3.166667,0


In [3]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Target # Target

In [4]:
# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_resample, Y_resample = ros.fit_resample(X, Y)



In [61]:
# tweaked code from class to build this block of code

# save features and targets as the X_resample and Y_resample variables 
features, targets = X_resample, Y_resample

# define an empty list that the following models will feed into
models = []

# append the list with all the desired models 
# the first time we ran all the models listed below
# the second we commented out the models we did not want to run and left only our selected models 
#models.append(('LogisticRegression', LogisticRegression(solver='liblinear', random_state=2019)))
#models.append(('DecisionTreeClassifier', DecisionTreeClassifier(max_depth=10, max_features=14, criterion='gini', splitter='best', random_state=2019)))
#models.append(('XGBoost', xgb.XGBClassifier(n_estimators=200, max_depth=6, col_sample_bytree=.5, learning_rate=0.5, random_state=2019)))
models.append(('Random Forest:', RandomForestClassifier(n_estimators=75, max_depth=17, max_features=7, random_state=2019)))

# use Cross Validation in the model with a 'stratify' option using the StratifiedKFolds package from sklearn
# specifiy that the scoring method is F1 
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv, scoring='f1')
    print("Model:{0}, F1 Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))
    
    
# Same as above but the scoring option has beeen changed to AUC
cv1 = StratifiedKFold(n_splits=10, shuffle=True, random_state=2019)
for name, model in models:
    score = cross_val_score(model, features, targets, cv=cv1, scoring='roc_auc')
    print("Model:{0}, AUC Score: mean={1:0.5f}, var={2:0.5f}".format(name, score.mean(), score.var()))

Model:Random Forest:, F1 Score: mean=0.92264, var=0.00019
Model:Random Forest:, AUC Score: mean=0.97864, var=0.00006
