In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from pathlib import Path

from models.logistic_regression import LogisticRegression
from models.random_forest import RandomForest
from evaluator.evaluator import Evaluator

In [None]:
# Root directory of all datasets, given as a relative path.
DATA_DIR = Path("../data")
# Sub-directory that holds the final train/test splits.
FINAL_DATA_DIR = DATA_DIR.joinpath("final")

# 0. Load dữ liệu

In [None]:
def load_data(file_path: Path):
    """Load the four NumPy arrays of one dataset variant.

    Args:
        file_path: Directory containing ``X_train.npy``, ``y_train.npy``,
            ``X_test.npy`` and ``y_test.npy``.

    Returns:
        The tuple ``(X_train, y_train, X_test, y_test)``; each element is
        whatever ``np.load`` returns for the corresponding file.
    """
    file_names = ("X_train.npy", "y_train.npy", "X_test.npy", "y_test.npy")
    # Files are read in the same order in which they are returned.
    return tuple(np.load(file_path / file_name) for file_name in file_names)


# Original data, before any class balancing was applied.
(
    original_X_train,
    original_y_train,
    original_X_test,
    original_y_test,
) = load_data(FINAL_DATA_DIR / "original")

# Data after under-sampling.
(
    under_sample_X_train,
    under_sample_y_train,
    under_sample_X_test,
    under_sample_y_test,
) = load_data(FINAL_DATA_DIR / "under_sampled")

# Data after over-sampling.
(
    over_sample_X_train,
    over_sample_y_train,
    over_sample_X_test,
    over_sample_y_test,
) = load_data(FINAL_DATA_DIR / "over_sampled")

In [None]:
# Evaluator configured with the five metric names listed here.
evaluator = Evaluator(
    metrics=[
        "accuracy",
        "precision",
        "recall",
        "f1_score",
        "pr_auc",
    ],
)

In [None]:
# NOTE(review): this cell is fully commented out, so nothing runs here. It
# sketches a logistic regression fitted on the original training split and
# predicting on the original test split. Re-enable it or delete the cell.
# original_model = LogisticRegression(
#     learning_rate=0.01,
#     n_iterations=1000,
# )

# original_model.fit(original_X_train, original_y_train)
# y_pred = original_model.predict(original_X_test)

In [None]:
# Logistic regression fitted on the under-sampled training split.
under_sample_model = LogisticRegression(learning_rate=0.01, n_iterations=1000)
under_sample_model.fit(under_sample_X_train, under_sample_y_train)

# Score the model on the matching under-sampled test split.
under_sample_y_pred = under_sample_model.predict(under_sample_X_test)
evaluator.evaluate(
    y_true=under_sample_y_test,
    y_pred=under_sample_y_pred,
    visualize=True,
)

In [None]:
# Apply the model fitted on under-sampled data to the original test split.
y_pred = under_sample_model.predict(original_X_test)
results = evaluator.evaluate(
    y_true=original_y_test,
    y_pred=y_pred,
    visualize=True,
)

In [None]:
# NOTE(review): builds a second Evaluator with the same metric list and scores
# `original_y_test` against whatever `y_pred` currently holds — presumably a
# sanity check of the preceding evaluation; confirm this cell is still needed.
eval_test = Evaluator(
    metrics=["accuracy", "precision", "recall", "f1_score", "pr_auc"],
)
results = eval_test.evaluate(y_true=original_y_test, y_pred=y_pred, visualize=True)

In [None]:
# First random-forest configuration, fitted on the under-sampled training split.
rf_model = RandomForest(
    n_trees=10,
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features="sqrt",
    random_state=42,
)
print("Training Random Forest...")
rf_model.fit(under_sample_X_train, under_sample_y_train)

print("\nPredicting...")
rf_y_pred = rf_model.predict(under_sample_X_test)

# Score the forest on the matching under-sampled test split.
print("\nEvaluating on under-sampled test set:")
evaluator.evaluate(y_true=under_sample_y_test, y_pred=rf_y_pred, visualize=True)

In [None]:
# Random-forest predictions on the original test split. They get their own
# name so the logistic-regression predictions held in `y_pred` (read by the
# earlier evaluation cells) are not overwritten when cells are re-run.
rf_original_y_pred = rf_model.predict(original_X_test)

evaluator.evaluate(
    y_true=original_y_test,
    y_pred=rf_original_y_pred,
    visualize=True,
)

In [None]:
# Second random-forest configuration: fitted on the under-sampled training
# split, then scored directly on the original test split.
rf_model_improved = RandomForest(
    n_trees=50,  # more trees than the first forest (10)
    max_depth=15,  # deeper trees than the first forest (10)
    min_samples_split=5,
    min_samples_leaf=2,
    max_features="sqrt",
    random_state=42,
)
rf_model_improved.fit(under_sample_X_train, under_sample_y_train)

y_pred_improved = rf_model_improved.predict(original_X_test)
evaluator.evaluate(y_true=original_y_test, y_pred=y_pred_improved, visualize=True)