In [1]:
import sys
import os
import numpy as np
import pandas as pd

# The notebook is run from a sub-directory of the project; put the parent
# directory at the front of sys.path so that `src.*` imports resolve.
PROJ_ROOT = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
if sys.path.count(PROJ_ROOT) == 0:
    sys.path.insert(0, PROJ_ROOT)
    
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    confusion_matrix,
    accuracy_score, recall_score, precision_score, f1_score
)


from src.models.random_forest import RandomForestModel
from src.dataset import Dataset
from sklearn.ensemble import RandomForestClassifier

# Keep wide frames (e.g. the confusion matrix) on a single line when printed.
pd.options.display.width = 1000
pd.options.display.expand_frame_repr = False


In [2]:
# Instantiate the project's Dataset wrapper and split it into train/test
# features (f_*) and labels (l_*), using LUNG_CANCER as the target column.
dataset = Dataset()
f_train, f_test, l_train, l_test = dataset.split_data(
    target_column="LUNG_CANCER",
)

In [3]:
# Fit the project's own random-forest wrapper with 120 trees.
model = RandomForestModel(n_trees=120)
model.train(f_train, l_train)

# Report the training score first, then the held-out score, one per line.
print(
    model.score(f_train, l_train),
    model.score(f_test, l_test),
    sep="\n",
)

0.9954545454545455
0.8928571428571429


In [None]:
def get_predictions(model, X):
    """Return predictions for ``X`` from ``model`` or from the estimator it wraps.

    Parameters
    ----------
    model : object
        Either an object exposing a callable ``predict`` itself, or a wrapper
        whose ``model`` attribute exposes a callable ``predict``.
    X : array-like
        Samples to predict on; passed through unchanged.

    Returns
    -------
    Whatever the resolved ``predict`` callable returns.

    Raises
    ------
    AttributeError
        If neither ``model.predict`` nor ``model.model.predict`` is callable.
        The message names the actual type of ``model``.
    """
    # Prefer the object's own predict().
    own_predict = getattr(model, "predict", None)
    if callable(own_predict):
        return own_predict(X)

    # Fall back to a wrapped estimator stored on `.model`.
    wrapped = getattr(model, "model", None)
    wrapped_predict = getattr(wrapped, "predict", None)
    if callable(wrapped_predict):
        return wrapped_predict(X)

    raise AttributeError(
        f"Không tìm thấy hàm predict trong model. Hãy kiểm tra {type(model).__name__}."
    )

# Flatten ground truth and predictions into 1-D arrays so they compare
# element-wise regardless of the container the model returned.
y_test = np.array(l_test).ravel()
y_pred = np.array(get_predictions(model, f_test)).ravel()

# Sanity check: show which label values occur on each side.
print("y_test:", np.unique(y_test))
print("y_pred:", np.unique(y_pred))

y_test: [0 1]
y_pred: [0 1]


In [5]:
def print_eval(y_test, y_pred, labels=None, pos_label=None):
    """Print a confusion matrix plus accuracy, recall, precision and F1.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels; flattened to 1-D.
    y_pred : array-like
        Predicted labels; flattened to 1-D.
    labels : sequence, optional
        Row/column order of the confusion matrix. Defaults to the sorted
        union of the values found in ``y_test`` and ``y_pred``.
    pos_label : optional
        Class treated as "positive" for the binary metrics. Defaults to 1
        for labels {0, 1}, to 2 for labels {1, 2}, otherwise to the last
        entry of ``labels``.

    Returns
    -------
    None
        Everything is written to stdout.

    Notes
    -----
    When more than two distinct classes occur in the data, recall, precision
    and F1 are macro-averaged (``pos_label`` does not apply), because
    scikit-learn rejects its default binary averaging on multiclass targets.
    """
    y_test = np.array(y_test).ravel()
    y_pred = np.array(y_pred).ravel()

    observed = set(y_test.tolist()) | set(y_pred.tolist())
    if labels is None:
        labels = sorted(observed)
    else:
        # Normalise to a list so .index() / membership work for any sequence.
        labels = list(labels)

    if pos_label is None:
        if set(labels) == {0, 1}:
            pos_label = 1
        elif set(labels) == {1, 2}:
            pos_label = 2
        else:
            pos_label = labels[-1]

    cm = confusion_matrix(y_test, y_pred, labels=labels)
    cm_df = pd.DataFrame(
        cm,
        index=pd.Index(labels, name="Actual"),
        columns=pd.Index(labels, name="Predicted")
    )

    print("\nConfusion Matrix:")
    print(cm_df.to_string())

    if len(labels) == 2 and pos_label in labels:
        # Index the matrix by the position of the positive class instead of
        # assuming it is the second label, so TN/FP/FN/TP stay correct when
        # pos_label is labels[0] or the caller supplied a custom label order.
        pos = labels.index(pos_label)
        neg = 1 - pos
        TN, FP = cm[neg, neg], cm[neg, pos]
        FN, TP = cm[pos, neg], cm[pos, pos]
        print(f"\nTN={TN}, FP={FP}, FN={FN}, TP={TP}")

    if len(observed) > 2:
        # Multiclass targets: binary averaging would raise ValueError.
        metric_kwargs = {"average": "macro", "zero_division": 0}
        rec_note = pre_note = f1_note = "(macro average)"
    else:
        metric_kwargs = {"pos_label": pos_label, "zero_division": 0}
        rec_note = "(TP/(TP+FN))"
        pre_note = "(TP/(TP+FP))"
        f1_note = "(2*P*R/(P+R))"

    # zero_division=0 is applied uniformly: undefined metrics are reported
    # as 0 without an UndefinedMetricWarning.
    acc = accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred, **metric_kwargs)
    pre = precision_score(y_test, y_pred, **metric_kwargs)
    f1 = f1_score(y_test, y_pred, **metric_kwargs)

    print(f"\nAccuracy : {acc:.4f}")
    print(f"Recall   : {rec:.4f}   {rec_note}")
    print(f"Precision: {pre:.4f}   {pre_note}")
    print(f"F1-score : {f1:.4f}   {f1_note}")

# Baseline: scikit-learn's RandomForestClassifier on the same split.
X_train, X_test = f_train, f_test
y_train, y_test = l_train, l_test

# Build and fit the forest once. The same fitted estimator serves both the
# held-out evaluation and the out-of-bag estimate, so there is no need to
# train a second, identically configured forest.
rf = RandomForestClassifier(
    n_estimators=300,
    random_state=42,   # fixed seed for reproducible results
    bootstrap=True,    # out-of-bag scoring requires bootstrap sampling
    oob_score=True,
    n_jobs=-1,         # use all available cores
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print_eval(y_test, y_pred)

print(f"OOB score: {rf.oob_score_:.4f}")



Confusion Matrix:
Predicted  0   1
Actual          
0          4   8
1          0  44

TN=4, FP=8, FN=0, TP=44

Accuracy : 0.8571
Recall   : 1.0000   (TP/(TP+FN))
Precision: 0.8462   (TP/(TP+FP))
F1-score : 0.9167   (2*P*R/(P+R))
OOB score: 0.8909
