In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
df = pd.read_csv('../Data/cleaned_diabetes_data2.csv')

In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,3,1,89.0,66.0,23.0,113.75,28.1,0.167,21,0
4,4,0,137.0,52.0,35.0,135.75,43.1,1.136,33,1


In [5]:
##independent and dependent feature
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [6]:
X

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,6,148.0,72.0,35.0,125.00,33.6,0.627,50
1,1,1,85.0,66.0,29.0,125.00,26.6,0.351,31
2,2,8,183.0,64.0,29.0,125.00,23.3,0.672,32
3,3,1,89.0,66.0,23.0,113.75,28.1,0.167,21
4,4,0,137.0,52.0,35.0,135.75,43.1,1.136,33
...,...,...,...,...,...,...,...,...,...
762,762,10,101.0,76.0,42.5,135.75,32.9,0.171,58
763,763,2,122.0,70.0,27.0,125.00,36.8,0.340,27
764,764,5,121.0,72.0,23.0,113.75,26.2,0.245,30
765,765,1,126.0,60.0,29.0,125.00,30.1,0.349,47


In [7]:
y

0      1
1      0
2      1
3      0
4      1
      ..
762    0
763    0
764    0
765    1
766    0
Name: Outcome, Length: 767, dtype: int64

In [8]:
df.shape

(767, 10)

In [9]:
##train test split
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y , test_size = 0.25 , random_state = 42)

In [10]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
X_train

array([[-0.1444913 ,  1.98549274, -1.0967839 , ...,  0.48048783,
        -0.27466369,  1.35297628],
       [-1.4253202 ,  0.08957523,  0.26459558, ...,  0.44836966,
        -0.80322387, -0.91038157],
       [-0.16704111, -0.85838352, -1.03037514, ..., -0.82029807,
         0.45165709, -0.81984726],
       ...,
       [-0.5368579 , -0.54239727, -0.43269635, ..., -1.14147978,
        -1.13042779, -1.0914502 ],
       [ 0.20728565,  1.98549274,  0.62984373, ...,  0.81772862,
        -0.75648045,  0.71923608],
       [-1.29453133, -1.17436977,  0.13177807, ..., -1.57507508,
        -0.6917588 , -1.0914502 ]], shape=(575, 9))

In [12]:
from sklearn.linear_model import LogisticRegression
reg = LogisticRegression()

In [13]:
reg.fit(X_train , y_train)

In [14]:
y_pred = reg.predict(X_test)

In [15]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.7604166666666666
Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.84      0.82       124
           1       0.68      0.62      0.65        68

    accuracy                           0.76       192
   macro avg       0.74      0.73      0.73       192
weighted avg       0.76      0.76      0.76       192

Confusion Matrix:
 [[104  20]
 [ 26  42]]


In [16]:
print(y_train.value_counts())


Outcome
0    376
1    199
Name: count, dtype: int64


In [17]:
(df ==0).sum()

Unnamed: 0                    1
Pregnancies                 111
Glucose                       0
BloodPressure                 0
SkinThickness                 0
Insulin                       0
BMI                           0
DiabetesPedigreeFunction      0
Age                           0
Outcome                     500
dtype: int64

In [18]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression

# Define parameter grid
param_grid = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear']  # 'liblinear' supports L1 and L2
}

# Set up the model
model = LogisticRegression()

# Set up GridSearchCV
grid = GridSearchCV(model, param_grid, cv=5, scoring='accuracy')
grid.fit(X_train, y_train)

# Best parameters
print("Best Hyperparameters:", grid.best_params_)
print("Best Score:", grid.best_score_)

# Predict with best estimator
best_model = grid.best_estimator_
y_pred = best_model.predict(X_test)


Best Hyperparameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}
Best Score: 0.768695652173913


In [19]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

from sklearn.metrics import classification_report
print("Random Forest:\n", classification_report(y_test, y_pred_rf))


Random Forest:
               precision    recall  f1-score   support

           0       0.81      0.82      0.82       124
           1       0.67      0.65      0.66        68

    accuracy                           0.76       192
   macro avg       0.74      0.73      0.74       192
weighted avg       0.76      0.76      0.76       192



In [20]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

print("KNN:\n", classification_report(y_test, y_pred_knn))


KNN:
               precision    recall  f1-score   support

           0       0.76      0.80      0.78       124
           1       0.59      0.53      0.56        68

    accuracy                           0.70       192
   macro avg       0.67      0.66      0.67       192
weighted avg       0.70      0.70      0.70       192



In [22]:
df.head()

Unnamed: 0.1,Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,6,148.0,72.0,35.0,125.0,33.6,0.627,50,1
1,1,1,85.0,66.0,29.0,125.0,26.6,0.351,31,0
2,2,8,183.0,64.0,29.0,125.0,23.3,0.672,32,1
3,3,1,89.0,66.0,23.0,113.75,28.1,0.167,21,0
4,4,0,137.0,52.0,35.0,135.75,43.1,1.136,33,1


In [24]:
df.to_csv('../Data/cleaned_diabetes_data3.csv')

In [25]:
import pickle
pickle.dump(scaler,open('scaler.pkl','wb'))
with open('logistic_model.pkl', 'wb') as file:
    pickle.dump(model, file)


In [26]:
df.to_csv('../Data/cleaned_diabetes_data3.csv')