In [99]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint, uniform

# 1. Đọc dữ lệu

In [100]:
DATA_DIR = "../data/cleaned_data"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)

Train: (1721, 23) Val: (192, 23) Test: (213, 23)


# 2. Tách X/y

In [101]:
feature_cols = [c for c in train_df.columns if c not in ("CLASS","NSP")]

def split(df, target):
    X = df[feature_cols].values
    y = df[target].values
    return X, y

# CLASS
X_tr_c, y_tr_c = split(train_df, "CLASS")
X_val_c, y_val_c = split(val_df,   "CLASS")
X_test_c, y_test_c = split(test_df,  "CLASS")

# NSP
X_tr_n, y_tr_n = split(train_df, "NSP")
X_val_n, y_val_n = split(val_df,   "NSP")
X_test_n, y_test_n = split(test_df,  "NSP")


print("Số feature:", len(feature_cols))

Số feature: 21


# 3. Xây dựng Pipeline & GridSearchCV  

3.1 Định nghĩa pipeline

In [102]:
pipe = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote',   SMOTE(random_state=42)),
    ('knn',     KNeighborsClassifier())
])

3.2 Param grid gồm 2 nhánh: PCA vs SelectKBest

In [103]:
param_dist = {
    'knn__n_neighbors': randint(5, 51),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan', 'minkowski'],
    'knn__p': [1,2,3],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': randint(5, 51)
}

3.3 cross‑val splitter

In [104]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 4. GridSearchCV

* 10 CLASS

In [105]:
rnd_c = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 200,
    cv          = cv,
    scoring     = 'accuracy',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
rnd_c.fit(X_train, y_train_cls)

print(">>> Best params (CLASS):", rnd_c.best_params_)
print(">>> Best CV acc:", rnd_c.best_score_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
>>> Best params (CLASS): {'knn__algorithm': 'auto', 'knn__leaf_size': 5, 'knn__metric': 'minkowski', 'knn__n_neighbors': 6, 'knn__p': 1, 'knn__weights': 'distance'}
>>> Best CV acc: 0.7356066734074822


* NSP

In [106]:
rnd_n = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 200,
    cv          = cv,
    scoring     = 'accuracy',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
rnd_n.fit(X_tr_n, y_tr_n)

print(">>> Best params (NSP):", rnd_n.best_params_)
print(">>> Best CV acc:", rnd_n.best_score_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
>>> Best params (NSP): {'knn__algorithm': 'auto', 'knn__leaf_size': 5, 'knn__metric': 'minkowski', 'knn__n_neighbors': 6, 'knn__p': 1, 'knn__weights': 'distance'}
>>> Best CV acc: 0.8895888102460396


# 5. Đánh giá trên Test

* 10 Class

In [107]:
X_valtest_c = np.vstack([X_test_c, X_val_c])
y_valtest_c = np.concatenate([y_test_c, y_val_c])

In [108]:
best_c = rnd_c.best_estimator_
y_pred_c = best_c.predict(X_valtest_c)

print("\n=== TEST (CLASS) ===")
print("Accuracy:", accuracy_score(y_valtest_c, y_pred_c))
print(classification_report(y_valtest_c, y_pred_c))
print(confusion_matrix(y_valtest_c, y_pred_c))


=== TEST (CLASS) ===
Accuracy: 0.7901234567901234
              precision    recall  f1-score   support

           0       0.73      0.74      0.74        78
           1       0.93      0.79      0.86       109
           2       0.35      0.60      0.44        10
           3       0.80      1.00      0.89        12
           4       0.44      0.41      0.42        17
           5       0.91      0.88      0.89        58
           6       0.78      0.89      0.83        45
           7       1.00      0.86      0.92        14
           8       0.81      0.93      0.87        14
           9       0.69      0.73      0.71        48

    accuracy                           0.79       405
   macro avg       0.75      0.78      0.76       405
weighted avg       0.81      0.79      0.79       405

[[58  2  7  0  5  0  1  0  0  5]
 [ 6 86  3  2  4  4  1  0  0  3]
 [ 2  1  6  0  0  0  0  0  0  1]
 [ 0  0  0 12  0  0  0  0  0  0]
 [ 3  0  0  0  7  0  1  0  0  6]
 [ 0  1  0  1  0 51  5  0

* NSP

In [109]:
X_valtest_n = np.vstack([X_test_n, X_val_n])
y_valtest_n = np.concatenate([y_test_n, y_val_n])

In [110]:
best_n = rnd_n.best_estimator_
y_pred_n = best_n.predict(X_valtest_n)

print("\n=== TEST (NSP) ===")
print("Accuracy:", accuracy_score(y_valtest_n, y_pred_n))
print(classification_report(y_valtest_n, y_pred_n))
print(confusion_matrix(y_valtest_n, y_pred_n))


=== TEST (NSP) ===
Accuracy: 0.8938271604938272
              precision    recall  f1-score   support

           0       0.98      0.92      0.95       310
           1       0.64      0.82      0.72        67
           2       0.82      0.82      0.82        28

    accuracy                           0.89       405
   macro avg       0.81      0.85      0.83       405
weighted avg       0.91      0.89      0.90       405

[[284  26   0]
 [  7  55   5]
 [  0   5  23]]
