In [740]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [741]:

file = 'diabetes_prediction_dataset.csv'
db = pd.read_csv(file)

**Data Preprocessing**

In [742]:
db.head(20)

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,80.0,0,1,never,25.19,6.6,140,0
1,Female,54.0,0,0,No Info,27.32,6.6,80,0
2,Male,28.0,0,0,never,27.32,5.7,158,0
3,Female,36.0,0,0,current,23.45,5.0,155,0
4,Male,76.0,1,1,current,20.14,4.8,155,0
5,Female,20.0,0,0,never,27.32,6.6,85,0
6,Female,44.0,0,0,never,19.31,6.5,200,1
7,Female,79.0,0,0,No Info,23.86,5.7,85,0
8,Male,42.0,0,0,never,33.64,4.8,145,0
9,Female,32.0,0,0,never,27.32,5.0,100,0


In [743]:
db.shape

(100000, 9)

In [744]:
db.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,100000.0,41.885856,22.51684,0.08,24.0,43.0,60.0,80.0
hypertension,100000.0,0.07485,0.26315,0.0,0.0,0.0,0.0,1.0
heart_disease,100000.0,0.03942,0.194593,0.0,0.0,0.0,0.0,1.0
bmi,100000.0,27.320767,6.636783,10.01,23.63,27.32,29.58,95.69
HbA1c_level,100000.0,5.527507,1.070672,3.5,4.8,5.8,6.2,9.0
blood_glucose_level,100000.0,138.05806,40.708136,80.0,100.0,140.0,159.0,300.0
diabetes,100000.0,0.085,0.278883,0.0,0.0,0.0,0.0,1.0


In [745]:
db.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   gender               100000 non-null  object 
 1   age                  100000 non-null  float64
 2   hypertension         100000 non-null  int64  
 3   heart_disease        100000 non-null  int64  
 4   smoking_history      100000 non-null  object 
 5   bmi                  100000 non-null  float64
 6   HbA1c_level          100000 non-null  float64
 7   blood_glucose_level  100000 non-null  int64  
 8   diabetes             100000 non-null  int64  
dtypes: float64(3), int64(4), object(2)
memory usage: 6.9+ MB


In [746]:
db.age = db.age.astype(int)

In [747]:
db.groupby('diabetes').count()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level
diabetes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,91500,91500,91500,91500,91500,91500,91500,91500
1,8500,8500,8500,8500,8500,8500,8500,8500


In [748]:
pd.set_option('display.max_rows', 200)

In [749]:
db_no = db.query("diabetes == 0")
db_no = db_no.iloc[0:8500]
db_no.shape

(8500, 9)

In [750]:
db_yes = db.query("diabetes == 1")
db_yes.shape

(8500, 9)

In [751]:
db_predict = pd.concat([db_yes, db_no])
db_predict = db_predict.reset_index(drop=True)
db.shape

(100000, 9)

In [752]:
db_predict.head()

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,Female,44,0,0,never,19.31,6.5,200,1
1,Male,67,0,1,not current,27.32,6.5,200,1
2,Male,50,1,0,current,27.32,5.7,260,1
3,Male,73,0,0,former,25.91,9.0,160,1
4,Female,53,0,0,former,27.32,7.0,159,1


In [753]:
db_predict['smoking_history'].value_counts()

never          6309
No Info        4632
former         2341
current        1735
not current    1192
ever            791
Name: smoking_history, dtype: int64

In [754]:
gender = {'Male':1, 'Female':0}
smoking_history = {'never':1, 'No Info':2, 'former':3, 'current':4, 'not current':5, 'ever':6}

db_predict['gender'] = db_predict.loc[:,'gender'].map(gender)
db_predict['smoking_history'] = db_predict.loc[:,'smoking_history'].map(smoking_history)

In [755]:
db_predict

Unnamed: 0,gender,age,hypertension,heart_disease,smoking_history,bmi,HbA1c_level,blood_glucose_level,diabetes
0,0,44,0,0,1,19.31,6.5,200,1
1,1,67,0,1,5,27.32,6.5,200,1
2,1,50,1,0,4,27.32,5.7,260,1
3,1,73,0,0,3,25.91,9.0,160,1
4,0,53,0,0,3,27.32,7.0,159,1
...,...,...,...,...,...,...,...,...,...
16995,1,62,0,0,2,27.32,4.0,145,0
16996,1,45,0,0,2,27.32,6.1,90,0
16997,0,46,0,0,1,27.32,4.0,159,0
16998,0,16,0,0,2,22.95,5.8,155,0


In [756]:
X = db_predict.iloc[:,:-1].values
y = db_predict.iloc[:,-1].values

**Machine Learning Classifiers**

In [757]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [758]:
# Split the dateset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=12)

In [759]:
# Scale the dataset
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

**Decision Tree Classifier**

In [760]:
from sklearn.tree import DecisionTreeClassifier

In [761]:

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

In [762]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [763]:
print('Confusion Matrix')
print('    0 \t  1')
print(f'0 {cm[0]}')
print(f'1 {cm[1]}')

print(f'Accuracy Score: {np.floor(score * 100)}%')

Confusion Matrix
    0 	  1
0 [1473  220]
1 [ 201 1506]
Accuracy Score: 87.0%


**Random Forest Classifier**

In [764]:
from sklearn.ensemble import RandomForestClassifier

In [765]:
rdc = RandomForestClassifier(n_estimators = 100, random_state = 12)
rdc.fit(X_train, y_train)
y_pred = rdc.predict(X_test)

In [766]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [767]:
print('Confusion Matrix')
print('    0 \t  1')
print(f'0 {cm[0]}')
print(f'1 {cm[1]}')

print(f'Accuracy Score: {np.floor(score * 100)}%')

Confusion Matrix
    0 	  1
0 [1523  170]
1 [ 140 1567]
Accuracy Score: 90.0%


**SVM Classifier**

In [768]:
from sklearn.svm import SVC

In [769]:
kernels = ['sigmoid', 'linear', 'poly', 'rbf']

for kernel in kernels:
    svm = SVC(kernel=kernel)
    svm.fit(X_train, y_train)
    y_pred = svm.predict(X_test)

    score = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    
    print(kernel.title())
    print('Confusion Matrix')
    print('    0 \t  1')
    print(f'0 {cm[0]}')
    print(f'1 {cm[1]}')

    print(f'Accuracy Score: {np.floor(score * 100)}%')
    print('')

Sigmoid
Confusion Matrix
    0 	  1
0 [1417  276]
1 [ 287 1420]
Accuracy Score: 83.0%

Linear
Confusion Matrix
    0 	  1
0 [1490  203]
1 [ 200 1507]
Accuracy Score: 88.0%

Poly
Confusion Matrix
    0 	  1
0 [1508  185]
1 [ 171 1536]
Accuracy Score: 89.0%

Rbf
Confusion Matrix
    0 	  1
0 [1486  207]
1 [ 152 1555]
Accuracy Score: 89.0%



**Gradient Boosting Classifier**

In [770]:
from sklearn.ensemble import GradientBoostingClassifier

In [771]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)
y_pred = gbc.predict(X_test)

In [772]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [773]:
print('Confusion Matrix')
print('    0 \t  1')
print(f'0 {cm[0]}')
print(f'1 {cm[1]}')

print(f'Accuracy Score: {np.floor(score * 100)}%')

Confusion Matrix
    0 	  1
0 [1524  169]
1 [ 120 1587]
Accuracy Score: 91.0%


**Naive Bayes**

In [774]:
from sklearn.naive_bayes import GaussianNB

In [775]:
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)
y_pred = nb_classifier.predict(X_test)

In [776]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [777]:
print('Confusion Matrix')
print('    0 \t  1')
print(f'0 {cm[0]}')
print(f'1 {cm[1]}')

print(f'Accuracy Score: {np.floor(score * 100)}%')

Confusion Matrix
    0 	  1
0 [1510  183]
1 [ 390 1317]
Accuracy Score: 83.0%


**K-Nearest Neighbors (KNN) Classifier**

In [778]:
from sklearn.neighbors import KNeighborsClassifier

In [779]:
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, y_train)
y_pred = knn_classifier.predict(X_test)

In [780]:
score = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

In [781]:
print('Confusion Matrix')
print('    0 \t  1')
print(f'0 {cm[0]}')
print(f'1 {cm[1]}')

print(f'Accuracy Score: {np.floor(score * 100)}%')

Confusion Matrix
    0 	  1
0 [1481  212]
1 [ 171 1536]
Accuracy Score: 88.0%
