## Decision Tree - Pipeline 1

In [1]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import the packages needed for the Decision Tree
from sklearn.tree import DecisionTreeClassifier  
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus



In [2]:
# read in the csv
df = pd.read_csv('../csv_files/Capstone_p1_final_Classification.csv',index_col=0)
df.head(2)

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.672228,0.357844,0.718132,0.601149,0.498663,0.265413,0.671275,0.907603,0.867653,0.864242,0.480322,0.501337,0.85586,0.773579,0
1,0.774289,0.272976,0.736725,0.499349,0.615201,0.176942,0.489684,0.749759,0.621326,0.758603,0.528431,0.384799,0.414273,0.799606,0


In [3]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Target # Target

In [4]:
# Split dataset into training set and test set
# 70% training and 30% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

In [5]:
# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_train_resample, Y_train_resample = ros.fit_resample(X_train, Y_train)

In [6]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(max_depth=9, max_features=8, criterion='gini', splitter='best', random_state=2019)

# Train Decision Tree Classifer
clf = clf.fit(X_train_resample,Y_train_resample)

#Predict the response for test dataset
Y_pred = clf.predict(X_test)

In [7]:
# Model Accuracy, how often is the Decision Tree correct?
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

# We are going to look at the classification report and also the confusion matrix for the Decision Tree  
print(metrics.classification_report(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))

Accuracy: 0.8236289776574137
              precision    recall  f1-score   support

           0       0.79      0.79      0.79      1245
           1       0.85      0.85      0.85      1709

    accuracy                           0.82      2954
   macro avg       0.82      0.82      0.82      2954
weighted avg       0.82      0.82      0.82      2954

[[ 984  261]
 [ 260 1449]]


In [8]:
# Calculating accuracy, F1 score, and AUC 
print('Accuracy:', metrics.accuracy_score(Y_test, Y_pred))
print('F1_Score:', metrics.f1_score(Y_test, Y_pred, average='weighted'))
print('AUC:', metrics.roc_auc_score(Y_test, Y_pred, average='weighted'))

Accuracy: 0.8236289776574137
F1_Score: 0.8236193403394515
AUC: 0.8191128469407178


In [9]:
# Displaying the feature importance
# define the feature importance variable 
# use the feature importance values from the decision tree above, sort in descending order 
feature_imp = pd.Series(clf.feature_importances_,index=df.iloc[:,:-1].columns).sort_values(ascending=False)

# print the results 
feature_imp

H_EFGPct           0.295884
A_EFGPct           0.279372
A_AST_TOV_Ratio    0.096050
H_AST_TOV_Ratio    0.090899
H_REBPct           0.083958
A_REBPct           0.037977
H_FTR              0.024302
A_FTR              0.023873
A_FTPct            0.016417
H_FTPct            0.014259
A_ThreePARt        0.013058
H_BLKPct           0.009085
A_BLKPct           0.007679
H_ThreePARt        0.007188
dtype: float64