In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv('diabetes.csv')

In [4]:
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [5]:
df.shape

(768, 9)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [7]:
# O has to be converted as Nan Values
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [8]:
df.head(20)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0
6,3,78,50,32,88,31.0,0.248,26,1
7,10,115,0,0,0,35.3,0.134,29,0
8,2,197,70,45,543,30.5,0.158,53,1
9,8,125,96,0,0,0.0,0.232,54,1


In [9]:
cols_with_zero = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zero] = df[cols_with_zero].replace(0, np.nan)
df[cols_with_zero] = df[cols_with_zero].fillna(df[cols_with_zero].median())

In [10]:
df.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [11]:
#Now we have to split it as target data and test data
X = df.drop('Outcome', axis=1)
y = df['Outcome']


In [12]:
from sklearn.preprocessing import StandardScaler

In [13]:
#Scale the X data 
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
#split through train test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [16]:
# Import the models Logistic Regression,Support Vector Machine,Random Forest,XGBoost
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [17]:
import warnings
warnings.filterwarnings("ignore")

In [18]:
#To test best model import matrics
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

In [19]:
#Now chechk the best score
logreg = LogisticRegression()
svm = SVC(kernel='rbf', probability=True)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

models = {
    "Logistic_Regression": logreg,
    "Support_Vector_Machine": svm,
    "Random_Forest": rf,
    "XGBoost": xgb
}

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(f"\n--- {name} ---")
    print("Accuracy:", accuracy_score(y_test, y_pred))
    print("ROC AUC:", roc_auc_score(y_test, y_pred))
    print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
    print("Classification Report:\n", classification_report(y_test, y_pred))



--- Logistic_Regression ---
Accuracy: 0.7532467532467533
ROC AUC: 0.7232323232323232
Confusion Matrix:
 [[82 17]
 [21 34]]
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.81        99
           1       0.67      0.62      0.64        55

    accuracy                           0.75       154
   macro avg       0.73      0.72      0.73       154
weighted avg       0.75      0.75      0.75       154


--- Support_Vector_Machine ---
Accuracy: 0.7467532467532467
ROC AUC: 0.7101010101010101
Confusion Matrix:
 [[83 16]
 [23 32]]
Classification Report:
               precision    recall  f1-score   support

           0       0.78      0.84      0.81        99
           1       0.67      0.58      0.62        55

    accuracy                           0.75       154
   macro avg       0.72      0.71      0.72       154
weighted avg       0.74      0.75      0.74       154


--- Random_Forest ---
Accuracy: 0.740259740259

In [20]:
#  Test on example patient data
example_patients = pd.DataFrame([
    [6, 148, 72, 35, 0, 33.6, 0.627, 50], 
    [1, 89, 66, 23, 94, 28.1, 0.167, 21] 
], columns=X.columns)


In [21]:

# Scale example data using same scaler
example_scaled = scaler.transform(example_patients)

In [22]:
# Predict using trained models
print("\nPredictions for Example Patients")
for name, model in models.items():
    predictions = model.predict(example_scaled)
    print(f"{name}:")
    for i, pred in enumerate(predictions):
        result = "Diabetic" if pred == 1 else "Non-Diabetic"
        print(f"Patient {i+1}: {result}")



Predictions for Example Patients
Logistic_Regression:
Patient 1: Diabetic
Patient 2: Non-Diabetic
Support_Vector_Machine:
Patient 1: Diabetic
Patient 2: Non-Diabetic
Random_Forest:
Patient 1: Diabetic
Patient 2: Non-Diabetic
XGBoost:
Patient 1: Diabetic
Patient 2: Non-Diabetic


In [23]:
import pickle
for name, clf in models.items():
    clf.fit(X_train, y_train)
    with open(f"{name}.pkl", "wb") as f:
        pickle.dump(clf, f)

# Save feature names
with open("columns.pkl", "wb") as f:
    pickle.dump(X.columns.tolist(), f)

print("All models and column names saved successfully.")

All models and column names saved successfully.


In [24]:
import pickle

# After fitting scaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Save the scaler
with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)
