In [27]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Load dataset
df = pd.read_csv("data/diabetes_extended_noisy.csv")

# Identify categorical & numeric
cat_cols = df.select_dtypes(include=['object']).columns.tolist()
num_cols = df.select_dtypes(exclude=['object']).columns.tolist()
num_cols.remove("Outcome")

# Replace unrealistic zeros with NaN
zero_na_cols = ['Glucose','BloodPressure','SkinThickness','Insulin','BMI']
for col in zero_na_cols:
    df[col] = df[col].replace(0, np.nan)

# Impute missing values
df[num_cols] = SimpleImputer(strategy='median').fit_transform(df[num_cols])
if cat_cols:
    df[cat_cols] = SimpleImputer(strategy='most_frequent').fit_transform(df[cat_cols])

# Train/Test split
X = df.drop("Outcome", axis=1)
y = df["Outcome"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

print("Data loaded and cleaned. Ready for modeling.")




Data loaded and cleaned. Ready for modeling.


In [28]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

print("Preprocessing pipeline created.")


Preprocessing pipeline created.


In [15]:
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median"))
])

categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

print("Preprocessing pipeline created.")


Preprocessing pipeline created.


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

log_reg = Pipeline([
    ("preprocess", preprocess),
    ("model", LogisticRegression(max_iter=100000))
])

log_reg.fit(X_train, y_train)
y_pred_lr = log_reg.predict(X_test)

print("\n===== LOGISTIC REGRESSION =====")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_lr))
print("\nAccuracy:", accuracy_score(y_test, y_pred_lr))
print("\nReport:\n", classification_report(y_test, y_pred_lr))



===== LOGISTIC REGRESSION =====
Confusion Matrix:
 [[111  20]
 [ 34  35]]

Accuracy: 0.73

Report:
               precision    recall  f1-score   support

           0       0.77      0.85      0.80       131
           1       0.64      0.51      0.56        69

    accuracy                           0.73       200
   macro avg       0.70      0.68      0.68       200
weighted avg       0.72      0.73      0.72       200



In [33]:
from sklearn.ensemble import RandomForestClassifier

rf = Pipeline([
    ("preprocess", preprocess),
    ("model", RandomForestClassifier(random_state=42))
])

rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("\n===== RANDOM FOREST =====")
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_rf))
print("\nAccuracy:", accuracy_score(y_test, y_pred_rf))
print("\nReport:\n", classification_report(y_test, y_pred_rf))



===== RANDOM FOREST =====
Confusion Matrix:
 [[107  24]
 [ 32  37]]

Accuracy: 0.72

Report:
               precision    recall  f1-score   support

           0       0.77      0.82      0.79       131
           1       0.61      0.54      0.57        69

    accuracy                           0.72       200
   macro avg       0.69      0.68      0.68       200
weighted avg       0.71      0.72      0.72       200



In [34]:
from sklearn.ensemble import GradientBoostingClassifier

gb = Pipeline([
    ("preprocess", preprocess),
    ("model", GradientBoostingClassifier(random_state=42))
])

gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_test)

print("\n===== Gradient Boosting =====")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))
print("\nAccuracy:", accuracy_score(y_test, y_pred_gb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_gb))



===== Gradient Boosting =====
Confusion Matrix:
[[106  25]
 [ 25  44]]

Accuracy: 0.75

Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.81      0.81       131
           1       0.64      0.64      0.64        69

    accuracy                           0.75       200
   macro avg       0.72      0.72      0.72       200
weighted avg       0.75      0.75      0.75       200



In [35]:
from sklearn.ensemble import HistGradientBoostingClassifier

hgb = Pipeline([
    ("preprocess", preprocess),
    ("model", HistGradientBoostingClassifier(random_state=42))
])

hgb.fit(X_train, y_train)
y_pred_hgb = hgb.predict(X_test)

print("\n===== HistGradientBoosting =====")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred_hgb))
print("\nAccuracy:", accuracy_score(y_test, y_pred_hgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_hgb))



===== HistGradientBoosting =====
Confusion Matrix:
[[109  22]
 [ 27  42]]

Accuracy: 0.755

Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.83      0.82       131
           1       0.66      0.61      0.63        69

    accuracy                           0.76       200
   macro avg       0.73      0.72      0.72       200
weighted avg       0.75      0.76      0.75       200

