<a href="https://colab.research.google.com/github/mannat244/ML_Lab/blob/main/ML_Lab_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **ML Lab 2** 2311201205


---



**Write a program to construct a Decision Tree for any four datasets downloaded from the KEEL dataset repository using the following algorithms:**

a) ID3  
b) C4.5  
c) CART  

Evaluate the performance using the following metrics:

a) Confusion Matrix  
b) Precision and Recall for each class  
c) Area Under the Curve (AUC), assuming Setosa as the positive class  
d) Geometric Mean  

Explain the following terms:

- True Positive Rate (TPR)  
- Area Under the Curve (AUC)  
- F-Measure  


First, we define the four KEEL datasets used throughout this notebook:

In [None]:
# File-name prefixes of the KEEL datasets processed by this notebook.
DATASETS = [
    "iris",
    "glass",
    "haberman",
    "wine",
]

In [None]:
import os, tempfile
import pandas as pd
import numpy as np
import scipy.io.arff as arff
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score, classification_report,
    ConfusionMatrixDisplay, RocCurveDisplay,
    roc_auc_score, balanced_accuracy_score
)

In [None]:
def load_keel(folder, names):
    """Load KEEL ``.dat`` files found in ``folder`` into pandas DataFrames.

    A file is loaded when its name ends with ``.dat`` and starts with one of
    the prefixes in ``names``. Header lines beginning with ``@inputs`` or
    ``@output`` (case-insensitive) are stripped before the remaining text is
    handed to ``scipy.io.arff.loadarff``.

    Parameters
    ----------
    folder : str
        Directory to scan. A trailing path separator is optional.
    names : iterable of str
        File-name prefixes to accept (e.g. ``["iris", "wine"]``).

    Returns
    -------
    dict
        Maps each matching file name (not the full path) to a DataFrame whose
        ``bytes`` values have been decoded to ``str``.
    """
    out = {}
    prefixes = tuple(names)

    # Sorted so that the key order does not depend on the file system.
    for fname in sorted(os.listdir(folder)):
        if not (fname.endswith(".dat") and fname.startswith(prefixes)):
            continue

        # os.path.join works whether or not ``folder`` ends with a separator,
        # and the ``with`` block guarantees the source file is closed.
        with open(os.path.join(folder, fname)) as src:
            lines = [
                line for line in src
                if not line.lower().startswith(("@inputs", "@output"))
            ]

        # loadarff is given a path, so the filtered text goes through a
        # temporary file; it is always removed afterwards so repeated runs do
        # not leave one stray file per dataset file behind.
        tmp = tempfile.NamedTemporaryFile(delete=False, mode="w")
        try:
            with tmp:
                tmp.writelines(lines)
            data, _ = arff.loadarff(tmp.name)
        finally:
            os.unlink(tmp.name)

        df = pd.DataFrame(data)

        # Only object columns can hold bytes values; numeric columns are left
        # untouched instead of being passed cell by cell through a lambda.
        for col in df.columns:
            if df[col].dtype == object:
                df[col] = df[col].map(
                    lambda x: x.decode() if isinstance(x, bytes) else x
                )

        out[fname] = df
    return out


In [None]:
# Parse every ".dat" file under /content/ whose name starts with one of the
# DATASETS prefixes; the result maps file name -> DataFrame.
datasets = load_keel("/content/",DATASETS)

# List the loaded file names so a missing fold file is visible before evaluation.
print(datasets.keys())


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import (
    accuracy_score, classification_report,
    ConfusionMatrixDisplay, RocCurveDisplay,
    balanced_accuracy_score, roc_auc_score
)
import matplotlib.pyplot as plt
import numpy as np

def _geometric_mean(y_true, y_pred):
    """Return the geometric mean of the per-class recalls.

    Classes are taken from ``y_true``; a class that is never predicted
    correctly has recall 0 and therefore drives the result to 0.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    recalls = [
        np.mean(y_pred[y_true == label] == label)
        for label in np.unique(y_true)
    ]
    if not recalls:
        return float("nan")
    return float(np.prod(recalls) ** (1.0 / len(recalls)))


def evaluate_dataset(datasets, name, n_folds=5):
    """Evaluate a CART (Gini) decision tree on the KEEL folds of one dataset.

    For every fold ``i`` in ``1..n_folds`` the frames stored under
    ``"{name}-{n_folds}-{i}tra.dat"`` and ``"{name}-{n_folds}-{i}tst.dat"``
    are used as train and test data; the last column is the target and all
    other columns are features. Accuracy, balanced accuracy, geometric mean
    of per-class recall and a one-vs-rest AUC (positive class = first entry
    of ``clf.classes_``) are averaged over the folds and printed. The
    classification report, confusion matrix, ROC curve and tree plot are
    produced from fold 1 only.

    Parameters
    ----------
    datasets : dict
        Maps KEEL file names to DataFrames.
    name : str
        Dataset prefix, e.g. ``"iris"``.
    n_folds : int, default 5
        Number of folds; also the fold count embedded in the file names.

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 1.
    KeyError
        If a required fold file is missing from ``datasets``.
    """
    if n_folds < 1:
        raise ValueError("n_folds must be at least 1")

    acc, bal_acc, gmeans, aucs = [], [], [], []

    for i in range(1, n_folds + 1):
        train = datasets[f"{name}-{n_folds}-{i}tra.dat"]
        test  = datasets[f"{name}-{n_folds}-{i}tst.dat"]

        Xtr, ytr = train.iloc[:, :-1], train.iloc[:, -1]
        Xte, yte = test.iloc[:, :-1], test.iloc[:, -1]

        clf = DecisionTreeClassifier(criterion="gini", random_state=0)
        clf.fit(Xtr, ytr)

        ypred = clf.predict(Xte)
        yprob = clf.predict_proba(Xte)

        acc.append(accuracy_score(yte, ypred))
        bal_acc.append(balanced_accuracy_score(yte, ypred))
        gmeans.append(_geometric_mean(yte, ypred))

        # One-vs-rest AUC: the first class is the positive one, and its
        # probabilities are column 0 of predict_proba.
        pos = clf.classes_[0]
        y_true_bin = (yte == pos).astype(int)
        y_score = yprob[:, 0]
        aucs.append(roc_auc_score(y_true_bin, y_score))

        # ---- Representative outputs (Fold 1 only) ----
        if i == 1:
            rep_clf, rep_Xte, rep_yte, rep_ypred = clf, Xte, yte, ypred
            rep_ytrue_bin, rep_yscore = y_true_bin, y_score
            # Captured here so the tree plot does not depend on whatever
            # ``Xtr`` happens to hold after the last loop iteration.
            rep_feature_names = list(Xtr.columns)

    # -------- CLEAN REPORT --------
    print("\n" + "="*55)
    print(f"DATASET : {name.upper()}")
    print("ALGORITHM : CART (Gini Index)")
    print("="*55)

    print("Final Accuracy (mean of 5 folds):" if n_folds == 5
          else f"Final Accuracy (mean of {n_folds} folds):",
          round(np.mean(acc), 4))
    print("Balanced Accuracy:",
          round(np.mean(bal_acc), 4))
    print("Geometric Mean:",
          round(np.mean(gmeans), 4))
    print("AUC (One-vs-Rest):",
          round(np.mean(aucs), 4))

    print("\nClassification Report (Representative Fold):")
    print(classification_report(rep_yte, rep_ypred))

    # ---- Visual Outputs ----
    ConfusionMatrixDisplay.from_estimator(rep_clf, rep_Xte, rep_yte)
    plt.title("Confusion Matrix")
    plt.show()

    RocCurveDisplay.from_predictions(
        rep_ytrue_bin,
        rep_yscore,
        name=f"{name} (Positive = {rep_clf.classes_[0]})"
    )
    plt.title("ROC Curve")
    plt.show()

    plt.figure(figsize=(14,6))
    plot_tree(
        rep_clf,
        # Plain lists of strings: some scikit-learn releases reject a pandas
        # Index / ndarray here, and class labels are not guaranteed to be str.
        feature_names=rep_feature_names,
        class_names=[str(c) for c in rep_clf.classes_],
        filled=True
    )
    plt.title("CART Decision Tree")
    plt.show()


In [None]:
# Run the cross-validated CART evaluation (printed metrics, classification
# report and plots) once for each dataset prefix.
for d in DATASETS:
    evaluate_dataset(datasets, d)