In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Column labels are passed explicitly via `names`, so pandas treats every line
# of the file as data rather than consuming the first line as a header.
colnames = [
    'preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age',
    'outcome',
]
prima_df = pd.read_csv(
    filepath_or_buffer="pima-indians-diabetes-1.data",
    names=colnames,
)
prima_df.head()

Unnamed: 0,preg,glu,bp,sft,ins,bmi,dpf,age,outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
# Display the (rows, columns) dimensions of the loaded frame.
prima_df.shape

(768, 9)

In [4]:
# Class balance check: how many rows carry each value of `outcome`.
prima_df.outcome.value_counts()

0    500
1    268
Name: outcome, dtype: int64

In [5]:
# Swap the numeric outcome codes (0 / 1) for readable class labels.
prima_df['outcome'] = prima_df.outcome.replace(
    {0: 'Healthy', 1: 'Diabetic'}
)

In [6]:
# Store the relabelled outcome column with pandas' categorical dtype.
prima_df['outcome'] = prima_df['outcome'].astype('category')

In [7]:
# Feature matrix: the eight predictor columns, in their original order.
X = prima_df.loc[:, ['preg', 'glu', 'bp', 'sft', 'ins', 'bmi', 'dpf', 'age']]
# Target vector: the outcome column.
Y = prima_df.loc[:, 'outcome']

In [66]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
xtrain, xtest, ytrain, ytest = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)

In [67]:
# Convert the training features and labels into NumPy arrays.
Xtrain, Ytrain = np.array(xtrain), np.array(ytrain)

In [68]:
# Convert the held-out features and labels into NumPy arrays.
Xtest, Ytest = np.array(xtest), np.array(ytest)

In [69]:
# Baseline tree: entropy split criterion, no depth or leaf-count limits set.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a seed, repeated runs can produce different
# trees and therefore different metrics. The seed matches the train/test split.
model = DecisionTreeClassifier(criterion='entropy', random_state=0)
model.fit(Xtrain, Ytrain)

DecisionTreeClassifier(criterion='entropy')

In [70]:
# Predict class labels for the held-out rows with the baseline tree.
ypred=model.predict(Xtest)

In [71]:
# Fraction of held-out rows the baseline tree labels correctly.
acc = metrics.accuracy_score(y_true=Ytest, y_pred=ypred)
print(acc)

0.7316017316017316


In [72]:
# Confusion matrix for the baseline tree: rows are true labels, columns are predictions.
cm = metrics.confusion_matrix(y_true=Ytest, y_pred=ypred)
print(cm)

[[ 45  29]
 [ 33 124]]


In [73]:
# Per-class precision / recall / F1 summary for the baseline tree.
cr = metrics.classification_report(y_true=Ytest, y_pred=ypred)
print(cr)

              precision    recall  f1-score   support

    Diabetic       0.58      0.61      0.59        74
     Healthy       0.81      0.79      0.80       157

    accuracy                           0.73       231
   macro avg       0.69      0.70      0.70       231
weighted avg       0.74      0.73      0.73       231



In [74]:
# Importance score of each feature in `model`, labelled with the training column names.
print(pd.DataFrame({"Imp": model.feature_importances_}, index=xtrain.columns))

           Imp
preg  0.107955
glu   0.257501
bp    0.109647
sft   0.063217
ins   0.028752
bmi   0.179092
dpf   0.146573
age   0.107263


In [75]:
# Regularised tree: same entropy criterion, but capped at depth 5 and 20 leaves.
# random_state is pinned because scikit-learn randomly permutes the candidate
# features at every split; without a seed, repeated runs can produce different
# trees and therefore different metrics. The seed matches the train/test split.
model2 = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=5,
    max_leaf_nodes=20,
    random_state=0,
)
model2.fit(Xtrain, Ytrain)

DecisionTreeClassifier(criterion='entropy', max_depth=5, max_leaf_nodes=20)

In [76]:
# Predict class labels for the held-out rows with the depth/leaf-limited tree.
ypred2=model2.predict(Xtest)

In [77]:
# Fraction of held-out rows the limited tree labels correctly.
acc2 = metrics.accuracy_score(y_true=Ytest, y_pred=ypred2)
print(acc2)

0.7705627705627706


In [78]:
# Confusion matrix for the limited tree: rows are true labels, columns are predictions.
cm2 = metrics.confusion_matrix(y_true=Ytest, y_pred=ypred2)
print(cm2)

[[ 49  25]
 [ 28 129]]


In [79]:
# Per-class precision / recall / F1 summary for the limited tree.
cr2 = metrics.classification_report(y_true=Ytest, y_pred=ypred2)
print(cr2)

              precision    recall  f1-score   support

    Diabetic       0.64      0.66      0.65        74
     Healthy       0.84      0.82      0.83       157

    accuracy                           0.77       231
   macro avg       0.74      0.74      0.74       231
weighted avg       0.77      0.77      0.77       231



In [80]:
from IPython.display import Image
from sklearn import tree
from os import system

# Write the baseline tree (`model`) to a Graphviz .dot file.
# The `with` block guarantees the file is closed even if the export raises.
with open('diabetes_tree.dot', 'w') as Diabetic_Tree_File:
    # class_names must list the distinct classes in ascending order, which is
    # exactly what the fitted estimator exposes as `classes_`. Passing every
    # training label (list(ytrain)) would label nodes with whichever labels
    # happen to come first in the training set.
    # export_graphviz returns None when out_file is given; `dot_data` is kept
    # only so the name remains defined for any later cell that references it.
    dot_data = tree.export_graphviz(
        model,
        out_file=Diabetic_Tree_File,
        feature_names=list(xtrain),
        class_names=list(model.classes_),
    )

# http://graphviz.it/#/gallery/longflat.gv
# Importance of features in tree building: the importance of a feature is computed as the
# (normalized) total reduction of the criterion brought by that feature. It is also known as the Gini importance.
