In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('diabetes.csv')
data

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.282870,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...
9995,1469198,6,95,85,37,267,18.497542,0.660240,31,0
9996,1432736,0,55,51,7,50,21.865341,0.086589,34,0
9997,1410962,5,99,59,47,67,30.774018,2.301594,43,1
9998,1958653,0,145,67,30,21,18.811861,0.789572,26,0


In [23]:
data.describe()

Unnamed: 0,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age
count,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,107.8502,71.2075,28.8176,139.2436,31.567022,0.400944,30.1341
std,31.920909,16.801478,14.50648,133.777919,9.804366,0.381463,12.106047
min,44.0,24.0,7.0,14.0,18.200807,0.078044,21.0
25%,84.0,58.0,15.0,39.0,21.247427,0.137065,22.0
50%,105.0,72.0,31.0,85.0,31.922421,0.199698,24.0
75%,129.0,85.0,41.0,197.0,39.328921,0.621158,35.0
max,192.0,117.0,92.0,796.0,56.034628,2.301594,77.0


In [3]:
row_count = len(data)
print(f'Analysing {row_count} rows of data')

diabetic_count = data['Diabetic'].value_counts()
print(f'Diabetic count: {diabetic_count}')

Analysing 10000 rows of data
Diabetic count: Diabetic
0    6656
1    3344
Name: count, dtype: int64


In [None]:
fig = plt.figure(figsize=(6,6))
ax = fig.gca()
diabetic_count.plot.bar(ax=ax)
ax.set_title('Patients with diabetes')
ax.set_xlabel('Diagnosis')
ax.set_ylabel('Patient count')
plt.show()
fig.savefig('Diabetic.png')
plt.close(fig)
print('Figure saved')

In [5]:
import joblib
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report

In [6]:
import sklearn
sklearn.__version__

'1.6.1'

In [7]:
data

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.282870,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...
9995,1469198,6,95,85,37,267,18.497542,0.660240,31,0
9996,1432736,0,55,51,7,50,21.865341,0.086589,34,0
9997,1410962,5,99,59,47,67,30.774018,2.301594,43,1
9998,1958653,0,145,67,30,21,18.811861,0.789572,26,0


In [8]:
data.columns

Index(['PatientID', 'Pregnancies', 'PlasmaGlucose', 'DiastolicBloodPressure',
       'TricepsThickness', 'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age',
       'Diabetic'],
      dtype='object')

#### Separate features from the label

In [6]:
X, y = data[['PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness', 'SerumInsulin', 'BMI', 
            'DiabetesPedigree', 'Age']].values, data['Diabetic'].values

In [10]:
len(X)

10000

In [11]:
len(y)

10000

#### Split the data

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

print(len(X_train))
print(len(X_test))

7000
3000


#### Train the Model

In [9]:
%time
dtree_model = DecisionTreeClassifier()
dtree_model.fit(X_train, y_train) # Training
dtree_model

CPU times: user 19 μs, sys: 3 μs, total: 22 μs
Wall time: 37.4 μs


In [10]:
predictions = dtree_model.predict(X_test)

In [11]:
acc = np.average(predictions == y_test)
acc

0.8526666666666667

In [12]:
print(confusion_matrix(predictions, y_test))

[[1748  216]
 [ 226  810]]


In [13]:
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89      1964
           1       0.79      0.78      0.79      1036

    accuracy                           0.85      3000
   macro avg       0.84      0.84      0.84      3000
weighted avg       0.85      0.85      0.85      3000



In [14]:
X_new = [[120,83,27,195,37.89166881,0.865807302,22],
         [90,62,54,593,20.49619978,0.548990862,21]]

predictions = dtree_model.predict(X_new)

classnames = ['Non-Diabetic', 'Diabetic']

for prediction in predictions:
    print(classnames[prediction])

Non-Diabetic
Diabetic


#### Visualize the Tree
1. Install Graphviz software (https://graphviz.org/)
2. Install pydot (pip install pydot)
3. import the packages

In [24]:
!pip install pydot

Collecting pydot
  Downloading pydot-3.0.4-py3-none-any.whl.metadata (10 kB)
Downloading pydot-3.0.4-py3-none-any.whl (35 kB)
Installing collected packages: pydot
Successfully installed pydot-3.0.4


In [15]:
from sklearn.tree import export_graphviz
from six import StringIO
from IPython.display import Image
import pydot

array([[1.71000000e+02, 8.00000000e+01, 3.40000000e+01, ...,
        4.35097259e+01, 1.21319135e+00, 2.10000000e+01],
       [9.20000000e+01, 9.30000000e+01, 4.70000000e+01, ...,
        2.12405757e+01, 1.58364981e-01, 2.30000000e+01],
       [1.15000000e+02, 4.70000000e+01, 5.20000000e+01, ...,
        4.15115235e+01, 7.90185680e-02, 2.30000000e+01],
       ...,
       [9.90000000e+01, 5.90000000e+01, 4.70000000e+01, ...,
        3.07740178e+01, 2.30159419e+00, 4.30000000e+01],
       [1.45000000e+02, 6.70000000e+01, 3.00000000e+01, ...,
        1.88118606e+01, 7.89572255e-01, 2.60000000e+01],
       [1.00000000e+02, 5.40000000e+01, 3.40000000e+01, ...,
        3.88409428e+01, 1.75464759e-01, 2.30000000e+01]])

In [19]:
data

Unnamed: 0,PatientID,Pregnancies,PlasmaGlucose,DiastolicBloodPressure,TricepsThickness,SerumInsulin,BMI,DiabetesPedigree,Age,Diabetic
0,1354778,0,171,80,34,23,43.509726,1.213191,21,0
1,1147438,8,92,93,47,36,21.240576,0.158365,23,0
2,1640031,7,115,47,52,35,41.511523,0.079019,23,0
3,1883350,9,103,78,25,304,29.582192,1.282870,43,1
4,1424119,1,85,59,27,35,42.604536,0.549542,22,0
...,...,...,...,...,...,...,...,...,...,...
9995,1469198,6,95,85,37,267,18.497542,0.660240,31,0
9996,1432736,0,55,51,7,50,21.865341,0.086589,34,0
9997,1410962,5,99,59,47,67,30.774018,2.301594,43,1
9998,1958653,0,145,67,30,21,18.811861,0.789572,26,0


In [20]:
data.drop(['PatientID', 'Pregnancies', 'Diabetic'], axis=1, inplace=True)
features = data.columns
features

Index(['PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
       'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age'],
      dtype='object')

In [21]:
dot_data = StringIO()
export_graphviz(dtree_model, out_file=dot_data, feature_names=features, rounded=True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())

In [None]:
Image(graph[0].create_png())

#### RandomForest

In [8]:
from sklearn.ensemble import RandomForestClassifier

In [35]:
%time
randomtree_model = RandomForestClassifier(n_estimators=100)
randomtree_model.fit(X_train, y_train)
randomtree_model

CPU times: user 5 μs, sys: 0 ns, total: 5 μs
Wall time: 9.78 μs


In [33]:
predictions = randomtree_model.predict(X_test)
acc = np.average(predictions == y_test)
acc

0.8946666666666667

In [11]:
print(confusion_matrix(predictions, y_test))

[[1846  168]
 [ 128  858]]


In [12]:
print(classification_report(predictions, y_test))

              precision    recall  f1-score   support

           0       0.94      0.92      0.93      2014
           1       0.84      0.87      0.85       986

    accuracy                           0.90      3000
   macro avg       0.89      0.89      0.89      3000
weighted avg       0.90      0.90      0.90      3000



In [13]:
data.drop(['PatientID', 'Pregnancies', 'Diabetic'], axis=1, inplace=True)
features = data.columns
features

Index(['PlasmaGlucose', 'DiastolicBloodPressure', 'TricepsThickness',
       'SerumInsulin', 'BMI', 'DiabetesPedigree', 'Age'],
      dtype='object')

In [24]:
dot_data = StringIO()
export_graphviz(randomtree_model.estimators_[0], out_file=dot_data, feature_names=features, rounded=True)
graph = pydot.graph_from_dot_data(dot_data.getvalue())

In [None]:
Image(graph[0].create_png())

In [23]:
len(randomtree_model.estimators_)

200