<a href="https://colab.research.google.com/github/muajnstu/Large_Scale_Implementation_of_DSK_Chain/blob/main/Downstram_Pipeline_of_Proposed_Method.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from sklearn.metrics import (accuracy_score, confusion_matrix, roc_auc_score, f1_score)
from sklearn.metrics import (confusion_matrix, accuracy_score, f1_score, roc_auc_score, recall_score, precision_score)
from sklearn.neighbors import KNeighborsClassifier
from sklearn import neighbors
from imblearn.over_sampling import SMOTE
from sklearn.base import BaseEstimator, ClassifierMixin
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    BaggingClassifier
)
from sklearn.linear_model import (
    LogisticRegression,
    RidgeClassifier,
    Perceptron,
    SGDClassifier,
    PassiveAggressiveClassifier
)
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/muajnstu/Large_Scale_Implementation_of_DSK_Chain/refs/heads/main/filtered%20data/SCE_data_fraudulent.csv')

X = df.drop(columns=['Is Fraudulent'])
y = df['Is Fraudulent']

In [None]:
df = pd.read_csv('https://raw.githubusercontent.com/muajnstu/Large_Scale_Implementation_of_DSK_Chain/refs/heads/main/filtered%20data/Clustered_EmployeeAttrition.csv')

X = df.drop(columns=['Attrition'])
y = df['Attrition']

In [None]:
y = y.astype(str)

In [None]:
y_cat = pd.Categorical(y)
y_codes = y_cat.codes
original_labels = y_cat.categories

In [None]:
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y_codes)
print("Class distribution after SMOTE:", pd.Series(y_resampled).value_counts())

Class distribution after SMOTE: 1    7985
3    7985
0    7985
2    7985
4    7985
Name: count, dtype: int64


In [None]:
y_resampled_labels = [original_labels[i] for i in y_resampled]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled, y_resampled_labels, test_size=0.2, random_state=46, stratify=y_resampled_labels
)

In [None]:
def print_metrics(y_true, y_pred, y_prob=None):
    cm = confusion_matrix(y_true, y_pred)
    accuracy = accuracy_score(y_true, y_pred)
    num_classes = cm.shape[0]

    if num_classes == 2:
        TN, FP, FN, TP = cm.ravel()
        specificity = TN / (TN + FP) if (TN + FP) > 0 else 0
        sensitivity = TP / (TP + FN) if (TP + FN) > 0 else 0
        gmean = np.sqrt(specificity * sensitivity)
        type1 = FP / (FP + TN) if (FP + TN) > 0 else 0
        type2 = FN / (TP + FN) if (TP + FN) > 0 else 0
        fmeasure = f1_score(y_true, y_pred, pos_label=1)
        auc = 0
        if y_prob is not None and hasattr(y_prob, "shape") and y_prob.shape[1] > 1:
            try:
                auc = roc_auc_score(y_true, y_prob[:, 1])
            except Exception:
                auc = 0
    else:
        TP = np.diag(cm)
        FP = np.sum(cm, axis=0) - TP
        FN = np.sum(cm, axis=1) - TP
        TN = np.sum(cm) - (FP + FN + TP)

        specificity = np.mean([
            TN[i] / (TN[i] + FP[i]) if (TN[i] + FP[i]) > 0 else 0 for i in range(num_classes)
        ])
        sensitivity = np.mean([
            TP[i] / (TP[i] + FN[i]) if (TP[i] + FN[i]) > 0 else 0 for i in range(num_classes)
        ])
        gmean = np.sqrt(specificity * sensitivity)
        type1 = np.mean([
            FP[i] / (FP[i] + TN[i]) if (FP[i] + TN[i]) > 0 else 0 for i in range(num_classes)
        ])
        type2 = np.mean([
            FN[i] / (TP[i] + FN[i]) if (TP[i] + FN[i]) > 0 else 0 for i in range(num_classes)
        ])
        fmeasure = f1_score(y_true, y_pred, average='macro')
        auc = 0
        if y_prob is not None and hasattr(y_prob, "shape") and y_prob.shape[1] > 1:
            try:
                auc = roc_auc_score(y_true, y_prob, multi_class='ovr', average='macro')
            except Exception:
                auc = 0

    print(f"Accuracy      : {accuracy:.4f}")
    print(f"Sensitivity   : {sensitivity:.4f}")
    print(f"Specificity   : {specificity:.4f}")
    print(f"G-Mean        : {gmean:.4f}")
    print(f"Type I Error  : {type1:.4f}")
    print(f"Type II Error : {type2:.4f}")
    print(f"F1 Score      : {fmeasure:.4f}")
    print(f"AUROC         : {auc:.4f}")

In [None]:
def run_model(name, model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    try:
        y_prob = model.predict_proba(X_test)
    except Exception:
        y_prob = None
    print(f"\nModel: {name}")
    print_metrics(y_test, y_pred, y_prob)

In [None]:
ml_models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "ExtraTrees": ExtraTreesClassifier(random_state=42),
    "Bagging": BaggingClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RidgeClassifier": RidgeClassifier(random_state=42),
    "DecisionTree": DecisionTreeClassifier(random_state=42),
    "NaiveBayes": GaussianNB(),
    "Perceptron": Perceptron(random_state=42),
    "SGDClassifier": SGDClassifier(random_state=42),
    "KNN": KNeighborsClassifier(n_neighbors=3),
    "PassiveAggressive": PassiveAggressiveClassifier(random_state=42),
    #"LinearSVM": SVC(kernel='linear', probability=True, random_state=42),
    "RBFSVM": SVC(kernel='rbf', probability=True, random_state=42),
    "LDA": LinearDiscriminantAnalysis(),
    "QDA": QuadraticDiscriminantAnalysis(),
    #"XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(verbosity=-1, random_state=42)
}


In [None]:
for name, model in ml_models.items():
    run_model(name, model, X_resampled, X_test, y_resampled_labels, y_test)


Model: RandomForest
Accuracy      : 1.0000
Sensitivity   : 1.0000
Specificity   : 1.0000
G-Mean        : 1.0000
Type I Error  : 0.0000
Type II Error : 0.0000
F1 Score      : 1.0000
AUROC         : 1.0000

Model: ExtraTrees
Accuracy      : 1.0000
Sensitivity   : 1.0000
Specificity   : 1.0000
G-Mean        : 1.0000
Type I Error  : 0.0000
Type II Error : 0.0000
F1 Score      : 1.0000
AUROC         : 1.0000

Model: Bagging
Accuracy      : 0.9670
Sensitivity   : 0.9672
Specificity   : 0.9934
G-Mean        : 0.9802
Type I Error  : 0.0066
Type II Error : 0.0328
F1 Score      : 0.9670
AUROC         : 0.9992

Model: GradientBoosting
Accuracy      : 0.9764
Sensitivity   : 0.9765
Specificity   : 0.9953
G-Mean        : 0.9858
Type I Error  : 0.0047
Type II Error : 0.0235
F1 Score      : 0.9764
AUROC         : 0.9999


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model: LogisticRegression
Accuracy      : 0.4340
Sensitivity   : 0.4345
Specificity   : 0.8868
G-Mean        : 0.6207
Type I Error  : 0.1132
Type II Error : 0.5655
F1 Score      : 0.4378
AUROC         : 0.8527

Model: RidgeClassifier
Accuracy      : 0.3915
Sensitivity   : 0.3931
Specificity   : 0.8784
G-Mean        : 0.5876
Type I Error  : 0.1216
Type II Error : 0.6069
F1 Score      : 0.3296
AUROC         : 0.0000

Model: DecisionTree
Accuracy      : 1.0000
Sensitivity   : 1.0000
Specificity   : 1.0000
G-Mean        : 1.0000
Type I Error  : 0.0000
Type II Error : 0.0000
F1 Score      : 1.0000
AUROC         : 1.0000

Model: NaiveBayes
Accuracy      : 0.5755
Sensitivity   : 0.5759
Specificity   : 0.9151
G-Mean        : 0.7260
Type I Error  : 0.0849
Type II Error : 0.4241
F1 Score      : 0.5757
AUROC         : 0.9253

Model: Perceptron
Accuracy      : 0.1651
Sensitivity   : 0.1667
Specificity   : 0.8333
G-Mean        : 0.3727
Type I Error  : 0.1667
Type II Error : 0.8333
F1 Score      : 

In [None]:
# Train and evaluate all models
for name, model in ml_models.items():
    run_model(name, model, X_resampled, X_test, y_resampled_labels, y_test)


Model: RandomForest
Accuracy      : 1.0000
Sensitivity   : 1.0000
Specificity   : 1.0000
G-Mean        : 1.0000
Type I Error  : 0.0000
Type II Error : 0.0000
F1 Score      : 1.0000
AUROC         : 1.0000

Model: ExtraTrees
Accuracy      : 1.0000
Sensitivity   : 1.0000
Specificity   : 1.0000
G-Mean        : 1.0000
Type I Error  : 0.0000
Type II Error : 0.0000
F1 Score      : 1.0000
AUROC         : 1.0000

Model: Bagging
Accuracy      : 0.9955
Sensitivity   : 0.9955
Specificity   : 0.9989
G-Mean        : 0.9972
Type I Error  : 0.0011
Type II Error : 0.0045
F1 Score      : 0.9955
AUROC         : 1.0000

Model: GradientBoosting
Accuracy      : 0.9260
Sensitivity   : 0.9260
Specificity   : 0.9815
G-Mean        : 0.9533
Type I Error  : 0.0185
Type II Error : 0.0740
F1 Score      : 0.9221
AUROC         : 0.9881


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



Model: LogisticRegression
Accuracy      : 0.7966
Sensitivity   : 0.7966
Specificity   : 0.9492
G-Mean        : 0.8695
Type I Error  : 0.0508
Type II Error : 0.2034
F1 Score      : 0.7905
AUROC         : 0.9410

Model: RidgeClassifier
Accuracy      : 0.7100
Sensitivity   : 0.7100
Specificity   : 0.9275
G-Mean        : 0.8115
Type I Error  : 0.0725
Type II Error : 0.2900
F1 Score      : 0.6268
AUROC         : 0.0000

Model: DecisionTree
Accuracy      : 1.0000
Sensitivity   : 1.0000
Specificity   : 1.0000
G-Mean        : 1.0000
Type I Error  : 0.0000
Type II Error : 0.0000
F1 Score      : 1.0000
AUROC         : 1.0000

Model: NaiveBayes
Accuracy      : 0.8288
Sensitivity   : 0.8288
Specificity   : 0.9572
G-Mean        : 0.8907
Type I Error  : 0.0428
Type II Error : 0.1712
F1 Score      : 0.8250
AUROC         : 0.9555

Model: Perceptron
Accuracy      : 0.3348
Sensitivity   : 0.3348
Specificity   : 0.8337
G-Mean        : 0.5283
Type I Error  : 0.1663
Type II Error : 0.6652
F1 Score      : 



Accuracy      : 0.5068
Sensitivity   : 0.5068
Specificity   : 0.8767
G-Mean        : 0.6666
Type I Error  : 0.1233
Type II Error : 0.4932
F1 Score      : 0.4150
AUROC         : 0.8142

Model: LightGBM
Accuracy      : 0.9733
Sensitivity   : 0.9733
Specificity   : 0.9933
G-Mean        : 0.9833
Type I Error  : 0.0067
Type II Error : 0.0267
F1 Score      : 0.9729
AUROC         : 0.9992
