In [1]:
import sys
import os
import numpy as np
import pandas as pd

# Put the parent of the current working directory at the front of the import
# path so the `src.*` imports below can be resolved from this notebook.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(PROJECT_ROOT) == 0:
    sys.path.insert(0, PROJECT_ROOT)

from src.models.decision_tree import DecisionTreeModel
from src.dataset import Dataset

from sklearn.metrics import (
    confusion_matrix,
    accuracy_score, recall_score, precision_score, f1_score,
    classification_report
)

# Display configuration for printed DataFrames: use a 1000-character display
# width and turn off frame-repr expansion (per the pandas display options).
pd.options.display.width = 1000
pd.options.display.expand_frame_repr = False


In [2]:
# Build the project dataset and split it on the LUNG_CANCER target column.
# The four returned values are unpacked as train/test features (f_*) and
# train/test labels (l_*) — presumably in that order; verify against
# Dataset.split_data.
dataset = Dataset()
f_train, f_test, l_train, l_test = dataset.split_data(target_column="LUNG_CANCER")

In [3]:
# Instantiate the project's decision-tree wrapper with the chosen
# hyperparameters, then fit it on the training split.
model = DecisionTreeModel(max_depth=7, min_samples_leaf=3)
model.train(f_train, l_train)

In [4]:
# Report the model's score on the training split first, then on the test split.
train_score = model.score(f_train, l_train)
print(train_score)
test_score = model.score(f_test, l_test)
print(test_score)

0.9409090909090909
0.875


In [5]:
def get_predictions(model, X):
    """Return predictions for ``X`` from ``model`` or from its wrapped estimator.

    The lookup order is:

    1. ``model.predict`` when it exists and is callable;
    2. ``model.model.predict`` when the object wraps another estimator in a
       ``model`` attribute and that estimator has a callable ``predict``.

    Args:
        model: Object exposing ``predict`` directly or through ``model.model``.
        X: Input passed unchanged to the ``predict`` callable that is found.

    Returns:
        Whatever the selected ``predict`` callable returns for ``X``.

    Raises:
        AttributeError: If neither location provides a callable ``predict``.
    """
    direct_predict = getattr(model, "predict", None)
    if callable(direct_predict):
        return direct_predict(X)

    # The wrapped estimator is only looked up when the direct lookup fails.
    # It gets the same callable check, so a non-callable attribute leads to the
    # AttributeError below rather than a TypeError at call time.
    wrapped_predict = getattr(getattr(model, "model", None), "predict", None)
    if callable(wrapped_predict):
        return wrapped_predict(X)

    raise AttributeError("Không tìm thấy hàm predict trong model. Hãy kiểm tra DecisionTreeModel.")

# Flatten the test labels and the model's test predictions into 1-D NumPy
# arrays, then show which distinct values each of them contains.
y_test = np.array(l_test).ravel()
y_pred = np.array(get_predictions(model, f_test)).ravel()

print("y_test:", np.unique(y_test))
print("y_pred:", np.unique(y_pred))

y_test: [0 1]
y_pred: [0 1]


In [6]:
# Evaluate the test-set predictions: confusion matrix, then accuracy, recall,
# precision and F1 for the positive class.
# `y_test` and `y_pred` are the flattened NumPy arrays built in the previous
# cell, so they are not converted again here.

# Every label present in the ground truth or in the predictions, ascending.
labels = sorted(set(y_test.tolist()) | set(y_pred.tolist()))

# The positive class is the largest label: 1 for {0, 1}, 2 for {1, 2}.
pos_label = labels[-1]

cm = confusion_matrix(y_test, y_pred, labels=labels)

# Labelled view of the matrix: rows are actual classes, columns are predictions.
cm_df = pd.DataFrame(
    cm,
    index=pd.Index(labels, name="Actual"),
    columns=pd.Index(labels, name="Predicted"),
)

print("\nConfusion Matrix:")
print(cm_df.to_string())

# The TN/FP/FN/TP breakdown only exists for a 2x2 matrix. With ascending
# labels the first row/column is the negative class and the second is the
# positive class, which matches `pos_label` above.
if len(labels) == 2:
    TN, FP, FN, TP = cm.ravel()
    print(f"\nTN={TN}, FP={FP}, FN={FN}, TP={TP}")

# NOTE(review): these calls rely on scikit-learn's default averaging, which
# targets binary labels — confirm before reusing with more than two classes.
# zero_division=0 is passed to all three ratio metrics so an undefined ratio
# is reported as 0 consistently.
acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred, pos_label=pos_label, zero_division=0)
pre = precision_score(y_test, y_pred, pos_label=pos_label, zero_division=0)
f1 = f1_score(y_test, y_pred, pos_label=pos_label, zero_division=0)

print(f"\nAccuracy : {acc:.4f}")
print(f"Recall   : {rec:.4f}   (TP/(TP+FN))")
print(f"Precision: {pre:.4f}   (TP/(TP+FP))")
print(f"F1-score : {f1:.4f}   (2*P*R/(P+R))")


Confusion Matrix:
Predicted  0   1
Actual          
0          6   6
1          1  43

TN=6, FP=6, FN=1, TP=43

Accuracy : 0.8750
Recall   : 0.9773   (TP/(TP+FN))
Precision: 0.8776   (TP/(TP+FP))
F1-score : 0.9247   (2*P*R/(P+R))
