In [7]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint


# 1.Đọc data

In [8]:
DATA_DIR = "../data/cleaned_data"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)

Train: (1721, 23) Val: (192, 23) Test: (213, 23)


# 2.Tách X/Y

In [9]:
feature_cols = [c for c in train_df.columns if c not in ("CLASS","NSP")]

def split(df, target):
    X = df[feature_cols].values
    y = df[target].values
    return X, y

# CLASS
X_tr_c, y_tr_c = split(train_df, "CLASS")
X_val_c, y_val_c = split(val_df, "CLASS")
X_test_c, y_test_c = split(test_df, "CLASS")

# NSP
X_tr_n, y_tr_n = split(train_df, "NSP")
X_val_n, y_val_n = split(val_df, "NSP")
X_test_n, y_test_n = split(test_df, "NSP")

print("Số feature:", len(feature_cols))


Số feature: 21


# 3.Xây pipeline + grid search CV để tìm bộ tham số tối ưu sau này

3.1 Xây pipeline

In [10]:
pipe = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote',  SMOTE(random_state=42)),
    ('rf',     RandomForestClassifier(random_state=42))
])


3.2 Thiết lập parameter search space

In [11]:
param_dist = {
    'rf__n_estimators': randint(100, 300),
    'rf__max_depth': randint(5, 30),
    'rf__min_samples_split': randint(2, 10),
    'rf__min_samples_leaf': randint(1, 10),
    'rf__max_features': ['sqrt', 'log2', None],
    'rf__bootstrap': [True, False]
}

3.3 Cross-validation splitter

In [12]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# 4. Train model voi label cua CLASS

4.1 Train model (10 class)

In [13]:
rnd_c = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 50,
    cv          = cv,
    scoring     = 'accuracy',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
X_train_c = np.vstack([X_tr_c, X_val_c])
y_train_c = np.concatenate([y_tr_c, y_val_c])
rnd_c.fit(X_train_c, y_train_c)

print(">>> Best params (CLASS):", rnd_c.best_params_)
print(">>> Best CV acc:", rnd_c.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
>>> Best params (CLASS): {'rf__bootstrap': False, 'rf__max_depth': 20, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 3, 'rf__min_samples_split': 5, 'rf__n_estimators': 286}
>>> Best CV acc: 0.8975435047093079


4.2 Danh gia tren tap test

In [15]:
X_valtest_c = np.vstack([X_test_c, X_val_c])
y_valtest_c = np.concatenate([y_test_c, y_val_c])
best_c = rnd_c.best_estimator_
y_pred_c = best_c.predict(X_valtest_c)

print("\n=== TEST (CLASS) ===")
print("Accuracy:", accuracy_score(y_valtest_c, y_pred_c))
print(classification_report(y_valtest_c, y_pred_c))
print(confusion_matrix(y_valtest_c, y_pred_c))



=== TEST (CLASS) ===
Accuracy: 0.945679012345679
              precision    recall  f1-score   support

           0       0.93      0.91      0.92        78
           1       0.95      0.94      0.94       109
           2       0.82      0.90      0.86        10
           3       0.92      0.92      0.92        12
           4       0.88      0.82      0.85        17
           5       0.98      1.00      0.99        58
           6       0.98      0.98      0.98        45
           7       0.88      1.00      0.93        14
           8       1.00      1.00      1.00        14
           9       0.94      0.96      0.95        48

    accuracy                           0.95       405
   macro avg       0.93      0.94      0.93       405
weighted avg       0.95      0.95      0.95       405

[[ 71   3   2   0   1   0   0   0   0   1]
 [  1 102   0   1   1   1   1   2   0   0]
 [  1   0   9   0   0   0   0   0   0   0]
 [  0   1   0  11   0   0   0   0   0   0]
 [  0   1   0   0  

# 5. Train model voi label cua NSP

5.1 Train model (NSP)

In [16]:
rnd_n = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 50,
    cv          = cv,
    scoring     = 'accuracy',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
X_train_n = np.vstack([X_tr_n, X_val_n])
y_train_n = np.concatenate([y_tr_n, y_val_n])
rnd_n.fit(X_train_n, y_train_n)

print(">>> Best params (NSP):", rnd_n.best_params_)
print(">>> Best CV acc:", rnd_n.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
>>> Best params (NSP): {'rf__bootstrap': False, 'rf__max_depth': 26, 'rf__max_features': 'sqrt', 'rf__min_samples_leaf': 2, 'rf__min_samples_split': 3, 'rf__n_estimators': 143}
>>> Best CV acc: 0.9419600016403976


5.2 Danh gia tren tap test (NSP)

In [17]:
X_valtest_n = np.vstack([X_test_n, X_val_n])
y_valtest_n = np.concatenate([y_test_n, y_val_n])
best_n = rnd_n.best_estimator_
y_pred_n = best_n.predict(X_valtest_n)

print("\n=== TEST (NSP) ===")
print("Accuracy:", accuracy_score(y_valtest_n, y_pred_n))
print(classification_report(y_valtest_n, y_pred_n))
print(confusion_matrix(y_valtest_n, y_pred_n))



=== TEST (NSP) ===
Accuracy: 0.9654320987654321
              precision    recall  f1-score   support

           0       0.98      0.97      0.98       310
           1       0.89      0.93      0.91        67
           2       0.97      1.00      0.98        28

    accuracy                           0.97       405
   macro avg       0.94      0.97      0.95       405
weighted avg       0.97      0.97      0.97       405

[[301   8   1]
 [  5  62   0]
 [  0   0  28]]
