In [1]:
import sys
import os
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score

# Absolute path of the parent of the current working directory.
# NOTE(review): presumably the repository root that contains the `src`
# package imported further down — confirm the notebook is always launched
# from a direct sub-directory of the project.
PROJ_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(PROJ_ROOT) == 0:
    # Prepend (rather than append) so this entry is searched first; the guard
    # keeps re-running the cell from adding duplicate entries.
    sys.path.insert(0, PROJ_ROOT)

from src.dataset import Dataset

from src.models.xgboost import XGBoost
# Widen the console representation of DataFrames and stop pandas from
# wrapping wide frames across several blocks of lines, so printed tables
# stay on single rows.
pd.options.display.width = 1000
pd.options.display.expand_frame_repr = False

In [2]:
# Name of the column passed to `split_data` as the prediction target.
TARGET_COLUMN = "LUNG_CANCER"

# Build the project dataset and split it into four parts.
# NOTE(review): the unpacking order (train features, test features,
# train labels, test labels) is inferred from the variable names; the split
# ratio and any shuffling/seed are defined in src.dataset — confirm there.
dataset = Dataset()
f_train, f_test, l_train, l_test = dataset.split_data(target_column=TARGET_COLUMN)

In [3]:
# Hyper-parameters handed to the project's XGBoost wrapper, gathered in one
# mapping so they can be tuned in a single place.
# NOTE(review): the exact meaning of each keyword (e.g. `lr` as learning
# rate) is defined in src.models.xgboost — confirm there.
xgb_params = {
    "reg_lambda": 1.0,
    "gamma": 1.0,
    "max_depth": 5,
    "lr": 0.5,
    "n_estimators": 50,
}
xgb = XGBoost(**xgb_params)
xgb.fit(f_train, l_train)

# Score on the data the model was fitted on, then on the held-out split.
# NOTE(review): the metric returned by `score` is defined by the wrapper —
# presumably accuracy; confirm.
train_score = xgb.score(f_train, l_train)
print(train_score)
test_score = xgb.score(f_test, l_test)
print(test_score)

0.9363636363636364
0.875


In [4]:
# Aliases under the X_*/y_* naming; they refer to the very same objects as
# the f_*/l_* variables produced by the data-split cell.
X_train, X_test = f_train, f_test
y_train, y_test = l_train, l_test

# Cut-off applied when the model returns scores rather than hard labels.
DECISION_THRESHOLD = 0.5

# The aliases above already require f_test / l_test to exist, so the test
# split is used directly instead of probing globals() for it.
X_ = f_test
# Ground truth as a flat integer vector.
y_test = np.asarray(l_test).ravel().astype(int)

y_pred_raw = np.asarray(xgb.predict(X_))

if y_pred_raw.ndim == 2 and y_pred_raw.shape[1] >= 2:
    # Case 1: two or more columns -> pick the index of the largest value in
    # each row as the predicted class.
    y_pred = np.argmax(y_pred_raw, axis=1)
else:
    # Flatten before inspecting the values so that a column vector of shape
    # (n, 1) is handled exactly like a 1-D vector; casting such float scores
    # straight to int would truncate every value below 1.0 to class 0.
    flat_pred = y_pred_raw.ravel()
    is_float = np.issubdtype(flat_pred.dtype, np.floating)
    if is_float and not np.isin(flat_pred, (0.0, 1.0)).all():
        # Case 2: float values other than exactly 0.0 / 1.0 -> threshold.
        y_pred = (flat_pred >= DECISION_THRESHOLD).astype(int)
    else:
        # Case 3: values are already hard labels -> just cast to int.
        y_pred = flat_pred.astype(int)

# Fix the label order so the confusion matrix is always 2x2 with row/column 0
# for class 0 and row/column 1 for class 1, even if a class is absent from
# y_test or y_pred.
labels = [0, 1]
cm = confusion_matrix(y_test, y_pred, labels=labels)

# Wrap the matrix in a labelled frame purely for readable printing.
cm_df = pd.DataFrame(
    cm,
    index=pd.Index(labels, name="Actual"),
    columns=pd.Index(labels, name="Predicted"),
)

print("\nConfusion Matrix:")
print(cm_df.to_string())

# Row-major flattening of [[TN, FP], [FN, TP]] given labels=[0, 1].
TN, FP, FN, TP = cm.ravel()
print(f"\nTN={TN}, FP={FP}, FN={FN}, TP={TP}")

# Class 1 is treated as the positive class. All ratio metrics share the same
# zero_division setting so a split with no positive samples or no positive
# predictions yields 0 consistently instead of warning for some metrics only.
binary_kwargs = {"pos_label": 1, "zero_division": 0}

acc = accuracy_score(y_test, y_pred)
rec = recall_score(y_test, y_pred, **binary_kwargs)
pre = precision_score(y_test, y_pred, **binary_kwargs)
f1 = f1_score(y_test, y_pred, **binary_kwargs)

print(f"\nAccuracy : {acc:.4f}")
print(f"Recall   : {rec:.4f}   (TP/(TP+FN))")
print(f"Precision: {pre:.4f}   (TP/(TP+FP))")
print(f"F1-score : {f1:.4f}   (2*P*R/(P+R))")


Confusion Matrix:
Predicted  0   1
Actual          
0          6   6
1          1  43

TN=6, FP=6, FN=1, TP=43

Accuracy : 0.8750
Recall   : 0.9773   (TP/(TP+FN))
Precision: 0.8776   (TP/(TP+FP))
F1-score : 0.9247   (2*P*R/(P+R))
