In [None]:
# Import necessary libraries

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset

In [None]:
# Read the dataset from a CSV file (path is relative to the working
# directory) and preview its first rows as the cell output.
csv_path = "diabetes.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Data overview

In [None]:
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() would add a stray
# "None" line after the summary.
df.info()
# Summary statistics of the numeric columns, shown as the cell's rich output.
df.describe()

In [None]:
# Visualize class distribution

In [None]:
# Bar chart of the number of rows per Outcome class; the exact counts are
# then shown as the cell's displayed value.
outcome_ax = sns.countplot(data=df, x="Outcome")
outcome_ax.set_title("Class Distribution (Outcome)")
plt.show()
df["Outcome"].value_counts()

In [None]:
# Check for zero values in critical columns

In [None]:
# Columns whose zeros are treated as missing values by a later cell.
suspicious_cols = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]

# Report zero counts for the suspicious columns only. Iterating over every
# column of df would also list zeros in columns that are not on this list
# and are never cleaned afterwards.
for col in suspicious_cols:
    print(f"{col}: {(df[col] == 0).sum()} zeros")

In [None]:
# Boxplot for outlier detection

In [None]:
# One boxplot per suspicious column, laid out side by side in a single row.
n_panels = len(suspicious_cols)
fig, axes = plt.subplots(nrows=1, ncols=n_panels, figsize=(20, 4))
for panel, column in zip(axes, suspicious_cols):
    sns.boxplot(y=df[column], ax=panel, color='lightblue')
    panel.set_title(column)
fig.tight_layout()
plt.show()

In [None]:
# Replace zero values with NaN

In [None]:
# Mark zeros in the suspicious columns as missing by swapping them for NaN,
# then display the number of missing values per column.
zeros_as_nan = df[suspicious_cols].replace(to_replace=0, value=np.nan)
df[suspicious_cols] = zeros_as_nan
df.isna().sum()

In [None]:
# Impute missing values (mean for Glucose, BloodPressure, BMI; median for SkinThickness, Insulin)

In [None]:
# Columns filled with their mean vs. their median.
mean_cols = ["Glucose", "BloodPressure", "BMI"]
median_cols = ["SkinThickness", "Insulin"]

# Work out every replacement value first (pandas skips NaN when computing
# mean/median by default), then fill each column with its own statistic.
# The statistics are computed over all rows currently in df.
fill_values = {name: df[name].mean() for name in mean_cols}
fill_values.update({name: df[name].median() for name in median_cols})
for col, value in fill_values.items():
    df[col] = df[col].fillna(value)

# Confirm the per-column count of remaining missing values.
df.isnull().sum()

In [None]:
# Split features and target

In [None]:
# Separate the label column from the predictors: y holds "Outcome",
# X holds every remaining column.
target_col = "Outcome"
y = df[target_col]
X = df.drop(columns=[target_col])

In [None]:
# Feature scaling

In [None]:
# Standardize the features. The scaler is fit on all rows of X and kept
# around so the same transformation can be applied to new inputs later.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# Wrap the scaled array in a DataFrame with the original column names so the
# preview below is readable.
X_scaled_df = pd.DataFrame(data=X_scaled, columns=X.columns)
X_scaled_df.head()

In [None]:
# Model training and evaluation with cross-validation

In [None]:
# Stratified 5-fold cross-validation with a fixed shuffle seed.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
model = LogisticRegression(max_iter=1000, random_state=42)

# Scale inside the cross-validation loop: the pipeline re-fits StandardScaler
# on each training fold only, so the held-out fold never influences the
# scaling statistics. Passing the pre-scaled X_scaled (fit on every row)
# would leak test-fold information into training.
# NOTE: the mean/median imputation done in an earlier cell still uses
# statistics computed over the whole dataset.
cv_pipeline = make_pipeline(StandardScaler(), model)

# A single cross_validate call scores all metrics on the same fitted folds
# instead of re-fitting the model once per metric.
cv_results = cross_validate(
    cv_pipeline,
    X,
    y,
    cv=cv,
    scoring=["accuracy", "precision", "recall", "f1"],
)

# Keep the per-fold score arrays under their original names.
accuracy = cv_results["test_accuracy"]
precision = cv_results["test_precision"]
recall = cv_results["test_recall"]
f1 = cv_results["test_f1"]

print("Accuracy (mean):", accuracy.mean())
print("Precision (mean):", precision.mean())
print("Recall (mean):", recall.mean())
print("F1-score (mean):", f1.mean())

In [None]:
# Train final model on full dataset

In [None]:
# Fit the final classifier on every scaled row; fit() returns the estimator
# itself, so construction and training are chained.
final_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_scaled, y)
# Display the fitted estimator as the cell output.
final_model

In [None]:
# Predict for a new patient

In [None]:
# Feature values for one hypothetical patient, keyed by column name.
patient = {
    "Pregnancies": 2,
    "Glucose": 130,
    "BloodPressure": 80,
    "SkinThickness": 25,
    "Insulin": 100,
    "BMI": 30.0,
    "DiabetesPedigreeFunction": 0.5,
    "Age": 35,
}
# A one-element list of records yields a single-row DataFrame.
new_data = pd.DataFrame([patient])

# Apply the already-fitted scaler before handing the row to the model.
new_data_scaled = scaler.transform(new_data)
prediction = final_model.predict(new_data_scaled)
prob = final_model.predict_proba(new_data_scaled)

# prediction[0] is the predicted label for the single row; prob[0][1] is
# the second probability column for that row.
label = "Diabetic" if prediction[0] == 1 else "Non-Diabetic"
diabetes_probability = round(prob[0][1], 4)
print("Prediction:", label)
print("Diabetes Probability:", diabetes_probability)