<a href="https://colab.research.google.com/github/mannat244/ML_Lab/blob/main/ML_Lab3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
DATASETS = ["iris"]

In [28]:
import os, tempfile
import pandas as pd
import numpy as np
import scipy.io.arff as arff
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score, classification_report,
    ConfusionMatrixDisplay, RocCurveDisplay,
    roc_auc_score, balanced_accuracy_score
)

In [29]:
def load_keel(folder, names):
    out = {}
    for f in os.listdir(folder):
        if f.endswith(".dat") and any(f.startswith(n) for n in names):
            lines = open(folder+f).readlines()
            lines = [l for l in lines if not l.lower().startswith(("@inputs","@output"))]

            tmp = tempfile.NamedTemporaryFile(delete=False, mode="w")
            tmp.writelines(lines); tmp.close()

            d,_ = arff.loadarff(tmp.name)
            df = pd.DataFrame(d)
            df = df.map(lambda x: x.decode() if isinstance(x, bytes) else x)

            out[f] = df
    return out


In [30]:
datasets = load_keel("/content/",DATASETS)

print(datasets.keys())


dict_keys(['iris-5-3tst.dat', 'iris-5-5tst.dat', 'iris-5-3tra.dat', 'iris-5-4tst.dat', 'iris-5-5tra.dat', 'iris-5-2tra.dat', 'iris-5-2tst.dat', 'iris-5-4tra.dat', 'iris-5-1tra.dat', 'iris-5-1tst.dat'])


In [31]:
import numpy as np
from sklearn.metrics import roc_auc_score


In [32]:
def bayesian_soft_scores(X_train, y_train, X_test, positive_class):
    classes = y_train.unique()
    means, vars_, priors = {}, {}, {}

    for c in classes:
        Xc = X_train[y_train == c]
        means[c] = Xc.mean().values
        vars_[c] = Xc.var().values + 1e-6
        priors[c] = len(Xc) / len(X_train)

    def gaussian_log_prob(x, mean, var):
        return -0.5 * np.sum(np.log(2*np.pi*var) + ((x-mean)**2)/var)

    scores = []
    for x in X_test.values:
        log_post = {}
        for c in classes:
            log_post[c] = np.log(priors[c]) + gaussian_log_prob(x, means[c], vars_[c])
        # soft score = P(Setosa | X) ∝ posterior
        scores.append(np.exp(log_post[positive_class]))

    return np.array(scores)


In [37]:
def manual_auc(y_true, scores):
    import numpy as np

    # Convert to numpy arrays
    y_true = np.array(y_true)
    scores = np.array(scores)

    # Sort by score (descending)
    order = np.argsort(scores)[::-1]
    y_true = y_true[order]
    scores = scores[order]

    thresholds = np.unique(scores)
    P = np.sum(y_true == 1)
    N = np.sum(y_true == 0)

    TPR, FPR = [], []

    for t in thresholds:
        y_pred = (scores >= t).astype(int)

        TP = np.sum((y_pred == 1) & (y_true == 1))
        FP = np.sum((y_pred == 1) & (y_true == 0))

        TPR.append(TP / P if P else 0)
        FPR.append(FP / N if N else 0)

    # Convert to arrays
    TPR = np.array(TPR)
    FPR = np.array(FPR)

    # Sort ROC points by increasing FPR
    idx = np.argsort(FPR)
    FPR = FPR[idx]
    TPR = TPR[idx]
    thresholds = thresholds[idx]

    # ---- Optimal Threshold (Youden’s Index) ----
    J = TPR - FPR
    best_idx = np.argmax(J)

    print("\nOptimal Threshold (Youden’s Index):", thresholds[best_idx])
    print("TPR at Optimal Threshold:", round(TPR[best_idx], 4))
    print("FPR at Optimal Threshold:", round(FPR[best_idx], 4))
    print("Youden’s Index:", round(J[best_idx], 4))

    # ---- Trapezoidal Rule for AUC ----
    auc = 0.0
    for i in range(1, len(FPR)):
        auc += (FPR[i] - FPR[i-1]) * (TPR[i] + TPR[i-1]) / 2

    return auc


In [34]:
def evaluate_dataset_bayesian(datasets, name):
    train = datasets[f"{name}-5-1tra.dat"]
    test  = datasets[f"{name}-5-1tst.dat"]

    Xtr, ytr = train.iloc[:, :-1], train.iloc[:, -1]
    Xte, yte = test.iloc[:, :-1], test.iloc[:, -1]

    POS = "Iris-setosa"

    y_bin = (yte == POS).astype(int).values

    scores = bayesian_soft_scores(Xtr, ytr, Xte, POS)

    auc_manual = manual_auc(y_bin, scores)
    auc_sklearn = roc_auc_score(y_bin, scores)

    print("\n" + "="*45)
    print(f"DATASET : {name.upper()}")
    print("CLASSIFIER : Bayesian (Gaussian)")
    print("Positive Class :", POS)
    print("="*45)
    print("Manual AUC (Trapezoidal):", round(auc_manual, 4))
    print("Sklearn AUC:", round(auc_sklearn, 4))


In [38]:
evaluate_dataset_bayesian(datasets, "iris")



Optimal Threshold (Youden’s Index): 0.0025740806672740688
TPR at Optimal Threshold: 1.0
FPR at Optimal Threshold: 0.0
Youden’s Index: 1.0

DATASET : IRIS
CLASSIFIER : Bayesian (Gaussian)
Positive Class : Iris-setosa
Manual AUC (Trapezoidal): 0.9925
Sklearn AUC: 1.0
