In [3]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Đọc dữ lệu

In [4]:
DATA_DIR = "../data/cleaned_data"
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print("Train:", train_df.shape, "Test:", test_df.shape)

Train: (1700, 23) Test: (426, 23)


# 2. Tách X/y

In [5]:
feature_cols = [c for c in train_df.columns if c not in ("CLASS", "NSP")]
def split(df, target):
    X = df[feature_cols].values
    y = df[target].values
    return X, y

X_train_c, y_train_c = split(train_df, "CLASS")
X_test_c,  y_test_c  = split(test_df,  "CLASS")

X_train_n, y_train_n = split(train_df, "NSP")
X_test_n,  y_test_n  = split(test_df,  "NSP")


# 3. Pipeline - StandardScaler + KNN


In [6]:
from sklearn.pipeline import Pipeline


In [7]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('knn',    KNeighborsClassifier())
])


# 4. Thiết lập siêu tham số cho RandomizedSearchCV


In [8]:
param_dist = {
    'knn__n_neighbors': list(range(5, 51)),
    'knn__weights': ['uniform', 'distance'],
    'knn__metric': ['euclidean', 'manhattan', 'minkowski'],
    'knn__p': [1, 2, 3],
    'knn__algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'knn__leaf_size': list(range(5, 51))
}


In [9]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


# 5. RandomizedSearchCV cho CLASS

* 10 CLASS

In [10]:
rnd_c = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=200,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [11]:
rnd_c.fit(X_train_c, y_train_c)


Fitting 5 folds for each of 200 candidates, totalling 1000 fits


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_distributions,"{'knn__algorithm': ['auto', 'ball_tree', ...], 'knn__leaf_size': [5, 6, ...], 'knn__metric': ['euclidean', 'manhattan', ...], 'knn__n_neighbors': [5, 6, ...], ...}"
,n_iter,200
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,10
,weights,'distance'
,algorithm,'brute'
,leaf_size,44
,p,3
,metric,'manhattan'
,metric_params,
,n_jobs,


In [12]:
print(">>> Best params (CLASS):", rnd_c.best_params_)
print(">>> Best CV acc (CLASS):", rnd_c.best_score_)

>>> Best params (CLASS): {'knn__weights': 'distance', 'knn__p': 3, 'knn__n_neighbors': 10, 'knn__metric': 'manhattan', 'knn__leaf_size': 44, 'knn__algorithm': 'brute'}
>>> Best CV acc (CLASS): 0.7705882352941176


In [13]:
best_c = rnd_c.best_estimator_
y_pred_c = best_c.predict(X_test_c)
print("=== TEST (CLASS) ===")
print("Accuracy:", accuracy_score(y_test_c, y_pred_c))
print(classification_report(y_test_c, y_pred_c))
print(confusion_matrix(y_test_c, y_pred_c))

=== TEST (CLASS) ===
Accuracy: 0.8004694835680751
              precision    recall  f1-score   support

           0       0.74      0.85      0.79        73
           1       0.83      0.89      0.86       114
           2       0.75      0.46      0.57        13
           3       1.00      0.58      0.73        19
           4       0.83      0.29      0.43        17
           5       0.77      0.81      0.79        63
           6       0.81      0.81      0.81        54
           7       1.00      0.88      0.93        16
           8       0.82      0.75      0.78        12
           9       0.76      0.82      0.79        45

    accuracy                           0.80       426
   macro avg       0.83      0.72      0.75       426
weighted avg       0.81      0.80      0.79       426

[[ 62   1   2   0   1   1   0   0   0   6]
 [  7 102   0   0   0   4   1   0   0   0]
 [  5   1   6   0   0   1   0   0   0   0]
 [  0   7   0  11   0   1   0   0   0   0]
 [  2   6   0   0  

# 6. RandomizedSearchCV cho NSP


In [14]:
rnd_n = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=200,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
    verbose=2,
    random_state=42
)

In [15]:
rnd_n.fit(X_train_n, y_train_n)


Fitting 5 folds for each of 200 candidates, totalling 1000 fits


0,1,2
,estimator,Pipeline(step...lassifier())])
,param_distributions,"{'knn__algorithm': ['auto', 'ball_tree', ...], 'knn__leaf_size': [5, 6, ...], 'knn__metric': ['euclidean', 'manhattan', ...], 'knn__n_neighbors': [5, 6, ...], ...}"
,n_iter,200
,scoring,'accuracy'
,n_jobs,-1
,refit,True
,cv,StratifiedKFo... shuffle=True)
,verbose,2
,pre_dispatch,'2*n_jobs'
,random_state,42

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_neighbors,11
,weights,'distance'
,algorithm,'ball_tree'
,leaf_size,38
,p,1
,metric,'manhattan'
,metric_params,
,n_jobs,


In [16]:
print(">>> Best params (NSP):", rnd_n.best_params_)
print(">>> Best CV acc (NSP):", rnd_n.best_score_)

>>> Best params (NSP): {'knn__weights': 'distance', 'knn__p': 1, 'knn__n_neighbors': 11, 'knn__metric': 'manhattan', 'knn__leaf_size': 38, 'knn__algorithm': 'ball_tree'}
>>> Best CV acc (NSP): 0.9129411764705881


In [17]:
best_n = rnd_n.best_estimator_
y_pred_n = best_n.predict(X_test_n)
print("=== TEST (NSP) ===")
print("Accuracy:", accuracy_score(y_test_n, y_pred_n))
print(classification_report(y_test_n, y_pred_n))
print(confusion_matrix(y_test_n, y_pred_n))

=== TEST (NSP) ===
Accuracy: 0.9225352112676056
              precision    recall  f1-score   support

           0       0.94      0.98      0.96       333
           1       0.79      0.72      0.75        64
           2       0.92      0.76      0.83        29

    accuracy                           0.92       426
   macro avg       0.88      0.82      0.85       426
weighted avg       0.92      0.92      0.92       426

[[325   8   0]
 [ 16  46   2]
 [  3   4  22]]
