<a href="https://colab.research.google.com/github/muajnstu/DSK-Chain-to-predict-diabeties-/blob/main/Diabetics_Prediction_with_Classical_model_ipynb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.metrics import (accuracy_score, confusion_matrix, roc_auc_score, f1_score)
from sklearn.metrics import (confusion_matrix, accuracy_score, f1_score, roc_auc_score, recall_score, precision_score)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import GaussianNB
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
import shap
import matplotlib.pyplot as plt

In [2]:
# Load the dataset straight from the project's GitHub repository.
df = pd.read_csv(
    'https://raw.githubusercontent.com/muajnstu/DSK-Chain-to-predict-diabeties-/refs/heads/main/update_dataframe%20(1).csv'
)

# Target is the 'Outcome' column; every other column is used as a feature.
y = df['Outcome']
X = df.drop('Outcome', axis=1)

# --- Handle imbalanced data (currently disabled) ---
# SMOTE oversampling is left here for reference but is NOT applied, so the
# models below are trained on the original class distribution.
# print("Class distribution:\n", y.value_counts())
# smote = SMOTE(random_state=42)
# X_resampled, y_resampled = smote.fit_resample(X, y)
# print("Balanced class distribution:\n", pd.Series(y_resampled).value_counts())

# --- Train/test split ---
# Hold out 20% of the rows for testing; stratify on the target so both
# splits are drawn with the same class proportions, and fix the seed so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=46,
    stratify=y,
)


In [4]:
# metrics function
def _safe_rate(numerator, denominator):
    """Divide element-wise, yielding 0 wherever the denominator is not positive."""
    numerator = np.asarray(numerator, dtype=float)
    denominator = np.asarray(denominator, dtype=float)
    return np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator > 0,
    )


def _auroc(y_true, y_prob, is_binary):
    """Return the AUROC for ``y_prob``, or 0.0 when it cannot be computed."""
    if y_prob is None:
        return 0.0
    try:
        # Accept lists / DataFrames as well as ndarrays.
        scores = np.asarray(y_prob)
        if is_binary and scores.ndim == 1:
            # 1-D input is taken as the score of the positive class.
            return float(roc_auc_score(y_true, scores))
        if scores.ndim != 2 or scores.shape[1] < 2:
            return 0.0
        if is_binary:
            # Column 1 holds the positive-class probability.
            return float(roc_auc_score(y_true, scores[:, 1]))
        return float(
            roc_auc_score(y_true, scores, multi_class='ovr', average='macro')
        )
    except (ValueError, TypeError):
        # Best effort: e.g. only one class present in y_true, or scores whose
        # shape does not match the labels. Keep the historical 0 fallback.
        return 0.0


def print_metrics(y_true, y_pred, y_prob=None):
    """Print classification metrics and return them as a dict.

    The metrics are derived from the confusion matrix of ``y_true`` vs
    ``y_pred``. When the matrix is 2x2 the second class (row/column index 1)
    is treated as the positive class and F1 is computed with ``pos_label=1``
    (NOTE(review): this assumes the positive class is labelled 1 - confirm).
    Otherwise every rate is computed one-vs-rest per class and macro-averaged.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        y_prob: Optional class scores. Either a 2-D array-like with one
            column per class (as returned by ``predict_proba``) or, for the
            binary case, a 1-D array-like of positive-class scores. When it
            is ``None`` or unusable, AUROC is reported as 0.

    Returns:
        dict with keys ``accuracy``, ``sensitivity``, ``specificity``,
        ``gmean``, ``type1_error``, ``type2_error``, ``f1`` and ``auroc``
        (the same values that are printed).
    """
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    is_binary = cm.shape[0] == 2

    # One-vs-rest counts for every class, read off the confusion matrix
    # (rows = true class, columns = predicted class).
    tp = np.diag(cm)
    fp = cm.sum(axis=0) - tp
    fn = cm.sum(axis=1) - tp
    tn = cm.sum() - (tp + fp + fn)

    sensitivity_per_class = _safe_rate(tp, tp + fn)
    specificity_per_class = _safe_rate(tn, tn + fp)
    type1_per_class = _safe_rate(fp, fp + tn)
    type2_per_class = _safe_rate(fn, tp + fn)

    if is_binary:
        # Index 1 is the positive class, matching ``cm.ravel()`` order
        # (TN, FP, FN, TP).
        sensitivity = sensitivity_per_class[1]
        specificity = specificity_per_class[1]
        type1 = type1_per_class[1]
        type2 = type2_per_class[1]
        fmeasure = f1_score(y_true, y_pred, pos_label=1)
    else:
        sensitivity = sensitivity_per_class.mean()
        specificity = specificity_per_class.mean()
        type1 = type1_per_class.mean()
        type2 = type2_per_class.mean()
        fmeasure = f1_score(y_true, y_pred, average='macro')

    gmean = np.sqrt(specificity * sensitivity)
    auc = _auroc(y_true, y_prob, is_binary)

    print(f"Accuracy      : {accuracy:.4f}")
    print(f"Sensitivity   : {sensitivity:.4f}")
    print(f"Specificity   : {specificity:.4f}")
    print(f"G-Mean        : {gmean:.4f}")
    print(f"Type I Error  : {type1:.4f}")
    print(f"Type II Error : {type2:.4f}")
    print(f"F1 Score      : {fmeasure:.4f}")
    print(f"AUROC         : {auc:.4f}")

    return {
        "accuracy": float(accuracy),
        "sensitivity": float(sensitivity),
        "specificity": float(specificity),
        "gmean": float(gmean),
        "type1_error": float(type1),
        "type2_error": float(type2),
        "f1": float(fmeasure),
        "auroc": float(auc),
    }

# Universal runner for any model
def run_model(name, model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and print its test-set metrics.

    Args:
        name: Label shown in the banner printed before the metrics.
        model: Estimator exposing ``fit`` and ``predict``; ``predict_proba``
            is used when available.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels used for evaluation.
    """
    print(f"\n===== Running {name} =====")

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # Not every estimator can produce probabilities; fall back to None so
    # the metrics are still printed for those models.
    probabilities = None
    try:
        probabilities = model.predict_proba(X_test)
    except AttributeError:
        pass

    print_metrics(y_test, predictions, probabilities)

# Classical ML models to compare, keyed by the name shown in the output.
# Seeds are fixed wherever the estimator accepts one so runs are repeatable.
ml_models = dict(
    # SVM_linear=SVC(kernel='linear', probability=True, random_state=42),  # disabled
    SVM_rbf=SVC(kernel='rbf', probability=True, random_state=42),
    DecisionTree=DecisionTreeClassifier(random_state=42),
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42),
    GradientBoosting=GradientBoostingClassifier(n_estimators=100, random_state=42),
    LogisticRegression=LogisticRegression(max_iter=1000, random_state=42),
    NaiveBayes=GaussianNB(),
)

# Train and evaluate each model in turn on the same train/test split.
for name, model in ml_models.items():
    run_model(
        name,
        model,
        X_train=X_train,
        X_test=X_test,
        y_train=y_train,
        y_test=y_test,
    )


===== Running SVM_rbf =====
Accuracy      : 0.8148
Sensitivity   : 0.0000
Specificity   : 1.0000
G-Mean        : 0.0000
Type I Error  : 0.0000
Type II Error : 1.0000
F1 Score      : 0.0000
AUROC         : 0.6382

===== Running DecisionTree =====
Accuracy      : 0.7026
Sensitivity   : 0.1748
Specificity   : 0.8226
G-Mean        : 0.3792
Type I Error  : 0.1774
Type II Error : 0.8252
F1 Score      : 0.1788
AUROC         : 0.4972

===== Running RandomForest =====
Accuracy      : 0.7666
Sensitivity   : 0.1220
Specificity   : 0.9131
G-Mean        : 0.3337
Type I Error  : 0.0869
Type II Error : 0.8780
F1 Score      : 0.1622
AUROC         : 0.5786

===== Running GradientBoosting =====
Accuracy      : 0.8268
Sensitivity   : 0.0772
Specificity   : 0.9972
G-Mean        : 0.2775
Type I Error  : 0.0028
Type II Error : 0.9228
F1 Score      : 0.1418
AUROC         : 0.6883

===== Running LogisticRegression =====
Accuracy      : 0.8140
Sensitivity   : 0.0000
Specificity   : 0.9991
G-Mean        : 0.00