## Decision Tree - Pipeline 2

In [9]:
# import required package for data handling
import pandas as pd
import numpy as np

# import required packages for splitting data
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

# import required packages for evaluating models
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

# import the packages needed for the Decision Tree
from sklearn.tree import DecisionTreeClassifier  
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
from IPython.display import Image  
import pydotplus
from sklearn import tree

In [10]:
# read in the csv
df = pd.read_csv('../csv_files/Capstone_p2_final_Classification.csv',index_col=0)
df.head(2)

Unnamed: 0,H_FTPct,H_EFGPct,H_ThreePARt,H_FTR,H_REBPct,H_BLKPct,H_AST_TOV_Ratio,A_FTPct,A_EFGPct,A_ThreePARt,A_FTR,A_REBPct,A_BLKPct,A_AST_TOV_Ratio,Target
0,0.833,0.461538,0.395604,0.574169,0.506173,0.191273,1.10146,0.952,0.628049,0.463415,0.71138,0.493827,0.10989,1.135444,0
1,0.885,0.430851,0.404255,0.525924,0.538462,0.156174,1.02108,0.87,0.542683,0.414634,0.727744,0.461538,0.053191,1.146123,0


In [11]:
# Splitting up our data into variable and target data
X = df.iloc[:, :-1] # Variable
Y = df.Target # Target

In [12]:
# Split dataset into training set and test set
# 70% training and 30% test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=2019)

In [13]:
# import the RandomOverSampler package from imblearn 
from imblearn.over_sampling import RandomOverSampler

# define the model
ros = RandomOverSampler(random_state=2019)

# fit the data only to the RandomOverSampler model
# this will help address the imbalanced nature of the target variable 
X_train_resample, Y_train_resample = ros.fit_resample(X_train, Y_train)

In [18]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(max_depth=9, max_features=8, criterion='gini', splitter='best',random_state=2019)

# Train Decision Tree Classifer
clf = clf.fit(X_train_resample,Y_train_resample)

#Predict the response for test dataset
Y_pred = clf.predict(X_test)

In [19]:
# Model Accuracy, how often is the Decision Tree correct?
print("Accuracy:",metrics.accuracy_score(Y_test, Y_pred))

# We are going to look at the classification report and also the confusion matrix for the Decision Tree  
print(metrics.classification_report(Y_test, Y_pred))
print(metrics.confusion_matrix(Y_test, Y_pred))

Accuracy: 0.8232904536222072
              precision    recall  f1-score   support

           0       0.78      0.81      0.79      1245
           1       0.86      0.83      0.84      1709

    accuracy                           0.82      2954
   macro avg       0.82      0.82      0.82      2954
weighted avg       0.82      0.82      0.82      2954

[[1012  233]
 [ 289 1420]]


In [20]:
# Calculating accuracy, F1 score, and AUC 
print('Accuracy:', metrics.accuracy_score(Y_test, Y_pred))
print('F1_Score:', metrics.f1_score(Y_test, Y_pred, average='weighted'))
print('AUC:', metrics.roc_auc_score(Y_test, Y_pred, average='weighted'))

Accuracy: 0.8232904536222072
F1_Score: 0.8237621386528182
AUC: 0.8218733330043403


In [21]:
# Displaying the feature importance
# define the feature importance variable 
# use the feature importance values from the decision tree above, sort in descending order 
feature_imp = pd.Series(clf.feature_importances_,index=df.iloc[:,:-1].columns).sort_values(ascending=False)

# print the results 
feature_imp

A_EFGPct           0.293910
H_EFGPct           0.253937
A_REBPct           0.106625
H_AST_TOV_Ratio    0.097293
A_AST_TOV_Ratio    0.076085
H_REBPct           0.034217
A_FTR              0.028985
H_FTR              0.027277
H_FTPct            0.023399
A_FTPct            0.017784
A_ThreePARt        0.015993
H_BLKPct           0.013928
H_ThreePARt        0.007804
A_BLKPct           0.002762
dtype: float64

In [None]:
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus

dot_data = StringIO()

export_graphviz(clf, out_file=dot_data,  
                filled=True, rounded=True,
                special_characters=True)

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())  
Image(graph.create_png())

In [82]:
plot_clf

NameError: name 'plot_clf' is not defined

In [102]:
tree.plot_tree(clf)

[Text(251.04464285714286, 346.49999999999994, 'X[1] <= 0.516\nentropy = 0.483\nsamples = 6890\nvalue = [2819, 4071]'),
 Text(129.8125, 300.29999999999995, 'X[6] <= 0.469\nentropy = 0.479\nsamples = 3619\nvalue = [2178, 1441]'),
 Text(64.76785714285715, 254.09999999999997, 'X[1] <= 0.451\nentropy = 0.381\nsamples = 1203\nvalue = [308, 895]'),
 Text(35.42857142857143, 207.89999999999998, 'X[6] <= 0.419\nentropy = 0.499\nsamples = 408\nvalue = [193, 215]'),
 Text(17.714285714285715, 161.7, 'X[2] <= 0.275\nentropy = 0.38\nsamples = 137\nvalue = [35, 102]'),
 Text(8.857142857142858, 115.5, 'X[8] <= 0.484\nentropy = 0.491\nsamples = 60\nvalue = [26, 34]'),
 Text(4.428571428571429, 69.30000000000001, 'X[1] <= 0.371\nentropy = 0.236\nsamples = 22\nvalue = [3, 19]'),
 Text(2.2142857142857144, 23.100000000000023, 'entropy = 0.0\nsamples = 2\nvalue = [2, 0]'),
 Text(6.642857142857143, 23.100000000000023, 'entropy = 0.095\nsamples = 20\nvalue = [1, 19]'),
 Text(13.285714285714286, 69.3000000000000