In [49]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint, uniform

# 1. Đọc dữ liệu

In [22]:
DATA_DIR = "../data/cleaned_data"

# Load the three CSV splits that live under DATA_DIR, in train / val / test order.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, csv_name))
    for csv_name in ("train.csv", "val.csv", "test.csv")
)

# Quick sanity check of the (rows, columns) shape of each split.
print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)

Train: (1721, 23) Val: (192, 23) Test: (213, 23)


# 2. Tách X/y

In [61]:
# Model inputs: every column of the training frame except the two label columns.
feature_cols = [col for col in train_df.columns if col not in {"CLASS", "NSP"}]


def split(df, target):
    """Return the feature matrix and one label vector from ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column in the module-level ``feature_cols``
        plus the ``target`` column.
    target : str
        Name of the label column to extract (used here with "CLASS" and "NSP").

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is ``df[feature_cols].values`` and ``y`` is
        ``df[target].values``.
    """
    return df[feature_cols].values, df[target].values

# CLASS: (X, y) pairs for the train / val / test frames
(X_tr_c, y_tr_c), (X_val_c, y_val_c), (X_test_c, y_test_c) = (
    split(frame, "CLASS") for frame in (train_df, val_df, test_df)
)

# NSP: same feature columns, different label column
(X_tr_n, y_tr_n), (X_val_n, y_val_n), (X_test_n, y_test_n) = (
    split(frame, "NSP") for frame in (train_df, val_df, test_df)
)


print("Số feature:", len(feature_cols))

Số feature: 21


# 3. Xây dựng Pipeline & RandomizedSearchCV

3.1 Định nghĩa pipeline

In [51]:
# Ordered steps of the shared pipeline: standardise the features, oversample
# with SMOTE (fixed seed), then classify with k-nearest neighbours.
pipeline_steps = [
    ('scaler', StandardScaler()),
    ('smote', SMOTE(random_state=42)),
    ('knn', KNeighborsClassifier()),
]
pipe = ImbPipeline(steps=pipeline_steps)

3.2 Không gian tham số cho RandomizedSearchCV: chỉ gồm siêu tham số của KNN (pipeline hiện tại chưa có bước PCA / SelectKBest)

In [84]:
# Search space sampled by RandomizedSearchCV. Keys use the "<step>__<param>"
# form, so every entry configures the 'knn' step of the pipeline.
# scipy's randint(5, 51) draws integers from 5 to 50 inclusive.
# NOTE(review): per the scikit-learn docs, `p` is only used by the 'minkowski'
# metric, and `algorithm` / `leaf_size` look like speed settings rather than
# accuracy settings - confirm that sampling them is intentional.
param_dist = dict(
    knn__n_neighbors=randint(5, 51),
    knn__weights=['uniform', 'distance'],
    knn__metric=['euclidean', 'manhattan', 'minkowski'],
    knn__p=[1, 2, 3],
    knn__algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    knn__leaf_size=randint(5, 51),
)

3.3 cross‑val splitter

In [53]:
# Shared splitter for both searches: 5 stratified folds, shuffled with a fixed
# seed so the fold assignment is the same on every run.
cv = StratifiedKFold(
    shuffle=True,
    n_splits=5,
    random_state=42,
)

# 4. RandomizedSearchCV

* 10 CLASS

In [59]:
# Merge the train and validation splits for the CLASS target; the search below
# does its own cross-validation on this combined set.
# Uses the arrays produced by split(..., "CLASS"). The previous names
# (X_train, X_val, y_train_cls, y_val_cls) are not defined in this notebook
# and raised NameError on a fresh kernel.
X_trval_c = np.vstack([X_tr_c, X_val_c])
y_trval_c = np.concatenate([y_tr_c, y_val_c])

In [86]:
# Randomised hyper-parameter search for the CLASS target.
# 200 sampled candidates x 5 folds = 1000 pipeline fits, ranked by accuracy.
search_settings_c = dict(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=200,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,        # use every available core
    verbose=2,
    random_state=42,  # reproducible candidate sampling
)
rnd_c = RandomizedSearchCV(**search_settings_c)
rnd_c.fit(X_trval_c, y_trval_c)

print(">>> Best params (CLASS):", rnd_c.best_params_)
print(">>> Best CV acc:", rnd_c.best_score_)

Fitting 5 folds for each of 200 candidates, totalling 1000 fits
>>> Best params (CLASS): {'knn__algorithm': 'brute', 'knn__leaf_size': 37, 'knn__metric': 'manhattan', 'knn__n_neighbors': 5, 'knn__p': 2, 'knn__weights': 'distance'}
>>> Best CV acc: 0.7427979713750632


* NSP

In [65]:
# Merge the train and validation splits for the NSP target: feature rows are
# appended along axis 0, label vectors are joined end to end.
X_trval_n = np.concatenate((X_tr_n, X_val_n), axis=0)
y_trval_n = np.hstack((y_tr_n, y_val_n))

In [None]:
# Randomised hyper-parameter search for the NSP target.
# Same pipeline, search space and CV splitter as the CLASS search, but
# candidates are ranked by macro-averaged F1 instead of accuracy.
rnd_n = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 200,
    cv          = cv,
    scoring     = 'f1_macro',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
rnd_n.fit(X_trval_n, y_trval_n)

print(">>> Best params (NSP):", rnd_n.best_params_)
# best_score_ is the mean cross-validated value of `scoring`, i.e. macro-F1
# here, so the label must not call it accuracy.
print(">>> Best CV macro-F1:", rnd_n.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
>>> Best params (NSP): {'knn__algorithm': 'auto', 'knn__leaf_size': 49, 'knn__metric': 'manhattan', 'knn__n_neighbors': 2, 'knn__p': 2, 'knn__weights': 'uniform'}
>>> Best CV acc: 0.9152994408978442


# 5. Đánh giá trên Test

* 10 Class

In [87]:
# Evaluate the best CLASS pipeline found by the search on the test split.
best_c = rnd_c.best_estimator_
y_pred_c = best_c.predict(X_test_c)

print("\n=== TEST (CLASS) ===")
print("Accuracy:", accuracy_score(y_test_c, y_pred_c))
# Per-class precision/recall/F1 first, then the raw confusion matrix.
for summarise in (classification_report, confusion_matrix):
    print(summarise(y_test_c, y_pred_c))


=== TEST (CLASS) ===
Accuracy: 0.8262910798122066
              precision    recall  f1-score   support

           0       0.76      0.74      0.75        34
           1       0.96      0.82      0.88        56
           2       0.33      0.50      0.40         6
           3       0.83      1.00      0.91         5
           4       0.62      0.62      0.62         8
           5       0.91      0.91      0.91        32
           6       0.83      0.89      0.86        28
           7       1.00      1.00      1.00         9
           8       0.89      0.89      0.89         9
           9       0.72      0.81      0.76        26

    accuracy                           0.83       213
   macro avg       0.79      0.82      0.80       213
weighted avg       0.84      0.83      0.83       213

[[25  0  3  0  3  0  0  0  0  3]
 [ 2 46  2  1  0  3  1  0  0  1]
 [ 2  0  3  0  0  0  0  0  0  1]
 [ 0  0  0  5  0  0  0  0  0  0]
 [ 1  0  0  0  5  0  0  0  0  2]
 [ 0  0  0  0  0 29  3  0

* NSP

In [83]:
# Evaluate the best NSP pipeline found by the search on the test split.
best_n = rnd_n.best_estimator_
y_pred_n = best_n.predict(X_test_n)

print("\n=== TEST (NSP) ===")
print("Accuracy:", accuracy_score(y_test_n, y_pred_n))
# Per-class precision/recall/F1 first, then the raw confusion matrix.
for summarise in (classification_report, confusion_matrix):
    print(summarise(y_test_n, y_pred_n))


=== TEST (NSP) ===
Accuracy: 0.9061032863849765
              precision    recall  f1-score   support

           0       0.96      0.94      0.95       159
           1       0.70      0.78      0.74        36
           2       0.94      0.89      0.91        18

    accuracy                           0.91       213
   macro avg       0.87      0.87      0.87       213
weighted avg       0.91      0.91      0.91       213

[[149  10   0]
 [  7  28   1]
 [  0   2  16]]
