In [21]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from keras.models import Sequential
from keras.layers import Dense
import pickle

# Reading Data


In [22]:
data = pd.read_csv("/content/diabetes.csv")
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Clean Data


In [23]:
print(data.isnull().sum())

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


In [24]:
medical_cols = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

print("Zeros in columns:")
for col in medical_cols:
    zero_count = (data[col] == 0).sum()
    print(f"{col}: {zero_count} zeros")


Zeros in columns:
Glucose: 5 zeros
BloodPressure: 35 zeros
SkinThickness: 227 zeros
Insulin: 374 zeros
BMI: 11 zeros


Replace zeros with average

In [25]:
data[medical_cols] = data[medical_cols].replace(0,np.nan)

simpel_imputer = SimpleImputer(strategy="median")
data[medical_cols] = simpel_imputer.fit_transform(data[medical_cols])

data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


# Data Preprocessing

In [26]:
x = data.drop("Outcome", axis=1)
y = data["Outcome"]

In [27]:
x.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,6,148.0,72.0,35.0,125.0,33.6,0.627,50
1,1,85.0,66.0,29.0,125.0,26.6,0.351,31
2,8,183.0,64.0,29.0,125.0,23.3,0.672,32
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33


In [28]:
y.head()

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1


Data scaling

In [29]:
scaler = MinMaxScaler()
x = scaler.fit_transform(x)

Split Data

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state= 42)

# Model

Logistic Regression

In [31]:
model_LogisticReg = LogisticRegression()
model_LogisticReg.fit(x_train,y_train)
print("Logistic Regression Score = ",model_LogisticReg.score(x_test,y_test))

Logistic Regression Score =  0.7792207792207793


Random Forest

In [32]:
model_RandomForest = RandomForestClassifier(random_state= 42)
model_RandomForest.fit(x_train,y_train)
print("Random Forest Score = ",model_RandomForest.score(x_test,y_test))

Random Forest Score =  0.7402597402597403


SVM

In [35]:
model_SVM = SVC(kernel="linear", random_state= 42)
model_SVM.fit(x_train,y_train)
print("SVM Score = ",model_SVM.score(x_test,y_test))

SVM Score =  0.7727272727272727


Using Neural Network

In [36]:
model_DL = Sequential([
    Dense(16, activation='relu'),
    Dense(8, activation='relu'),
    Dense(1, activation='sigmoid') # binary
])
model_DL.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model_DL.fit(x = x_train, y = y_train, validation_split = 0.2,batch_size = 16, epochs = 30)


test_loss, test_accuracy = model_DL.evaluate(x_test, y_test)
print(f"Neural Network Test Accuracy: {test_accuracy:.4f}")

Epoch 1/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 31ms/step - accuracy: 0.6755 - loss: 0.6734 - val_accuracy: 0.6098 - val_loss: 0.6796
Epoch 2/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6627 - loss: 0.6552 - val_accuracy: 0.6098 - val_loss: 0.6652
Epoch 3/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6755 - loss: 0.6377 - val_accuracy: 0.6098 - val_loss: 0.6472
Epoch 4/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6729 - loss: 0.6235 - val_accuracy: 0.6423 - val_loss: 0.6286
Epoch 5/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6598 - loss: 0.6149 - val_accuracy: 0.6585 - val_loss: 0.6127
Epoch 6/30
[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6719 - loss: 0.5972 - val_accuracy: 0.6911 - val_loss: 0.6007
Epoch 7/30
[1m31/31[0m [32m━━━━━━━━━

# Deployment

In [37]:
file_name = "Final_project.sav"
pickle.dump(model_LogisticReg,open(file_name,'wb'))

Get info from user to pridect

In [38]:
sex = input("Male Or Women : ")
if sex == "Women" or sex == "women" :
    p = int(input("Enter the number of Pregnancies : "))
else:
    p = 0
g = int(input("Num of Glucose : "))
b = int(input("Num of Blood Pressure : "))
s = int(input("Num of Skin Thickness : "))
i = int(input("Num of Insulin : "))
bmi = float(input("Your BMI : "))
print("Diabetes Pedigree Function = (1 + coefficient of relationship) × family_diabetes_prevalence")
d =  float(input("Your Diabetes Pedigree Function : "))
a = int(input("Your Age : "))

Male Or Women : male
Num of Glucose : 50
Num of Blood Pressure : 99
Num of Skin Thickness : 36
Num of Skin Insulin : 12
Your BMI : 45
Diabetes Pedigree Function = (1 + coefficient of relationship) × family_diabetes_prevalence
Your Diabetes Pedigree Function : 2
Your Age : 25


Show the prediction

In [39]:
loaded_model = pickle.load(open('/content/Final_project.sav','rb'))
result = loaded_model.predict([[p,g,b,s,i,bmi,d,a]])

if result ==  [1] :
    print("You are a diabetic patient")
else:
    print("You are not a diabetic patient")

You are a diabetic patient
