Import Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib

Load Dataset

In [None]:
# Read the diabetes dataset (a CSV expected in the working directory).
data = pd.read_csv('diabetes.csv')

# Confirm the load and preview the first five rows.
print("Dataset loaded successfully", data.head(), sep="\n")


Dataset loaded successfully
   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


Data Cleaning and Handling Missing Values

In [None]:
# Columns in which a recorded 0 is treated as "not measured" rather than a real value.
cols_with_zero_missing = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Swap those zeros for NaN so the imputation step can fill them in.
zeros_as_nan = data[cols_with_zero_missing].replace(to_replace=0, value=np.nan)
data[cols_with_zero_missing] = zeros_as_nan

Fill missing values with the mean of each column and remove duplicate rows

In [None]:
# Impute every NaN with its column mean, then discard exact duplicate rows.
column_means = data.mean()
data = data.fillna(column_means).drop_duplicates()

# Per-column count of remaining missing values (expected to be all zeros).
print("Missing values after cleaning:")
print(data.isna().sum())

Missing values after cleaning:
Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64


Outlier Removal using IQR

In [None]:
# Per-column first and third quartiles, and the inter-quartile range between them.
Q1, Q3 = (data.quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1

Keep only the rows whose values all lie within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]

In [None]:
# Apply the IQR fences to the predictor columns only. 'Outcome' is the label
# that is predicted later on, so it must not take part in outlier detection:
# for a 0/1 label with fewer than 25% positives, Q3 and IQR would both be 0
# and every positive row would be discarded as an "outlier".
feature_cols = [col for col in data.columns if col != 'Outcome']

# Restrict the fences to the same labels (and order) as the compared frame so
# the DataFrame-vs-Series comparison is between identically labelled operands.
lower_fence = (Q1 - 1.5 * IQR)[feature_cols]
upper_fence = (Q3 + 1.5 * IQR)[feature_cols]

features = data[feature_cols]
# A row is an outlier if at least one predictor falls outside its fences.
is_outlier = ((features < lower_fence) | (features > upper_fence)).any(axis=1)

data = data[~is_outlier]
print(f"Dataset shape after outlier removal: {data.shape}")

Dataset shape after outlier removal: (515, 9)


Feature & Target Split

In [None]:
# Target vector: the 'Outcome' column.
y = data['Outcome']
# Feature matrix: every remaining column.
X = data.drop(columns=['Outcome'])

Feature Scaling

In [None]:
# Standardise each feature column to zero mean and unit variance.
# The scaler object is kept so it can be persisted together with the model.
scaler = StandardScaler()
X_scaled = scaler.fit(X).transform(X)

Split Data into Train and Test sets

In [None]:
# Split the *unscaled* features so the held-out rows cannot influence the
# scaling statistics. Splitting an array that was standardised on all rows
# leaks test-set means/variances into training and inflates the evaluation.
# The same random_state and row count give the same train/test partition.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-fit the scaler on the training rows only, then apply that fitted
# transform to the test rows. `scaler` now holds train-only statistics,
# which is also what gets persisted alongside the model.
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

Train Logistic Regression Model

In [None]:
# Logistic-regression classifier with scikit-learn's default hyper-parameters.
model = LogisticRegression()
# Fit on the training split; left as the final expression so the notebook
# still displays the fitted estimator.
model.fit(X=X_train, y=y_train)


Evaluate Model

In [None]:
# Predict labels for the held-out split.
y_pred = model.predict(X_test)

# Report each metric in a fixed order: (printed label, scoring function).
print("Model Performance on Test Set:")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1-score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))

Model Performance on Test Set:
Accuracy: 0.7475728155339806
Precision: 0.8181818181818182
Recall: 0.45
F1-score: 0.5806451612903226


Save Model and Scaler

In [None]:
# Persist the fitted model and its scaler side by side; the scaler is needed
# to transform new inputs the same way before calling the model.
for artifact, target_path in (
    (model, "group7_model.joblib"),
    (scaler, "group7_scaler.joblib"),
):
    joblib.dump(artifact, target_path)

print("Model and scaler saved successfully")

Model and scaler saved successfully
