In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [7]:
colnames = ['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age', 'outcome']
prima_df = pd.read_csv("prima-indians-diabetes.csv",names=colnames)
prima_df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [8]:
prima_df['outcome'].value_counts()

0    500
1    268
Name: outcome, dtype: int64

In [9]:
prima_df['outcome'] = prima_df['outcome'].replace({0: 'Healthy', 1: 'Diabetic'})

In [10]:
X=prima_df[['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age']]
Y=prima_df['outcome']

In [11]:
xtrain,xtest,ytrain,ytest=train_test_split(X,Y,test_size=0.3,random_state=0)

In [12]:
Xtrain=np.array(xtrain)
Ytrain=np.array(ytrain)

In [13]:
Xtest=np.array(xtest)
Ytest=np.array(ytest)

In [22]:
model=DecisionTreeClassifier(criterion = 'entropy',random_state=0)
model.fit(Xtrain,Ytrain)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [23]:
ypred=model.predict(Xtest)


In [24]:
acc=metrics.accuracy_score(Ytest,ypred)
print(acc)

0.7229437229437229


In [25]:
cm=metrics.confusion_matrix(Ytest,ypred)
print(cm)

[[ 44  30]
 [ 34 123]]


In [26]:
cr=metrics.classification_report(Ytest,ypred)
print(cr)

              precision    recall  f1-score   support

    Diabetic       0.56      0.59      0.58        74
     Healthy       0.80      0.78      0.79       157

   micro avg       0.72      0.72      0.72       231
   macro avg       0.68      0.69      0.69       231
weighted avg       0.73      0.72      0.72       231



In [27]:

print (pd.DataFrame(model.feature_importances_, columns = ["Imp"], index = xtrain.columns))


           Imp
preg  0.112823
glu   0.245911
bp    0.087740
sft   0.059990
ins   0.037504
bmi   0.182333
dpf   0.175725
age   0.097975


In [28]:
#Perform Grid Search Method to find the optimal max_depth size
from sklearn.model_selection import GridSearchCV
parameter={'max_depth':np.arange(1,10)}
GS=GridSearchCV(model,parameter,cv=3)
GS.fit(X,Y)


GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [29]:
GS.best_params_

{'max_depth': 4}

In [41]:
#Perform Grid Search Method to find the optimal max_depth size for without passing criterion value (entropy or gini)
# as it is passed earlier as >> model=DecisionTreeClassifier(criterion = 'entropy',random_state=0)
model=DecisionTreeClassifier(random_state=0)
model.fit(Xtrain,Ytrain)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [39]:
from sklearn.model_selection import GridSearchCV
parameter={'max_depth':np.arange(1,10), 'criterion' : ['entropy', 'gini']}
GS=GridSearchCV(model,parameter,cv=3)
GS.fit(X,Y)

GridSearchCV(cv=3, error_score='raise-deprecating',
       estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': array([1, 2, 3, 4, 5, 6, 7, 8, 9]), 'criterion': ['entropy', 'gini']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [40]:
GS.best_params_

{'criterion': 'entropy', 'max_depth': 4}

In [30]:
model2=DecisionTreeClassifier(criterion = 'entropy',max_depth=4,random_state=0)
model2.fit(Xtrain,Ytrain)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=4,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [31]:
ypred2=model2.predict(Xtest)

In [32]:
acc2=metrics.accuracy_score(Ytest,ypred2)
print(acc2)

0.7316017316017316


In [33]:
cm2=metrics.confusion_matrix(Ytest,ypred2)
print(cm2)

[[ 42  32]
 [ 30 127]]


In [34]:
cr2=metrics.classification_report(Ytest,ypred2)
print(cr2)

              precision    recall  f1-score   support

    Diabetic       0.58      0.57      0.58        74
     Healthy       0.80      0.81      0.80       157

   micro avg       0.73      0.73      0.73       231
   macro avg       0.69      0.69      0.69       231
weighted avg       0.73      0.73      0.73       231



In [35]:
from IPython.display import Image  
from sklearn import tree
from os import system

Diabetic_Tree_File = open('diabetes_tree.dot','w')
dot_data = tree.export_graphviz(model, out_file=Diabetic_Tree_File, 
        feature_names = list(xtrain), class_names = list(ytrain))

Diabetic_Tree_File.close()


# importance of features in the tree building ( The importance of a feature is computed as the 
#(normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance )


In [36]:
system("dot -Tpng diabetes_tree.dot -o diabetic_tree.png")   # This command is to the OS
Image("diabetic_tree.png")        # use the image command to read the .png file and print on screen

TypeError: a bytes-like object is required, not 'str'

TypeError: a bytes-like object is required, not 'str'

<IPython.core.display.Image object>

# To View Desion tree online
copy diabetes_tree word file graphviz data into the Text Area of http://www.webgraphviz.com/ and Generate Graph.