In [5]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import uniform


In [6]:
DATA_DIR = "data/cleaned_data/"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)


Train: (1721, 23) Val: (192, 23) Test: (213, 23)


In [7]:
feature_cols = [c for c in train_df.columns if c not in ("CLASS","NSP")]

def split(df, target):
    X = df[feature_cols].values
    y = df[target].values
    return X, y

# CLASS
X_tr_c, y_tr_c = split(train_df, "CLASS")
X_val_c, y_val_c = split(val_df, "CLASS")
X_test_c, y_test_c = split(test_df, "CLASS")

# NSP
X_tr_n, y_tr_n = split(train_df, "NSP")
X_val_n, y_val_n = split(val_df, "NSP")
X_test_n, y_test_n = split(test_df, "NSP")

print("Số feature:", len(feature_cols))


Số feature: 21


In [8]:
pipe = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote',  SMOTE(random_state=42)),
    ('svm',    SVC(probability=True, random_state=42))
])


In [9]:
param_dist = {
    'svm__C': uniform(0.1, 10),
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto'],
    'svm__degree': [2, 3, 4]  # Chỉ dùng nếu kernel là poly
}


In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [11]:
rnd_c = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 50,
    cv          = cv,
    scoring     = 'accuracy',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
X_train_c = np.vstack([X_tr_c, X_val_c])
y_train_c = np.concatenate([y_tr_c, y_val_c])
rnd_c.fit(X_train_c, y_train_c)

print(">>> Best params (CLASS):", rnd_c.best_params_)
print(">>> Best CV acc:", rnd_c.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END svm__C=1.6599452033620266, svm__degree=4, svm__gamma=scale, svm__kernel=linear; total time=   1.1s
[CV] END svm__C=3.845401188473625, svm__degree=2, svm__gamma=scale, svm__kernel=poly; total time=   1.3s
[CV] END svm__C=1.6599452033620266, svm__degree=4, svm__gamma=scale, svm__kernel=linear; total time=   1.0s
[CV] END svm__C=3.845401188473625, svm__degree=2, svm__gamma=scale, svm__kernel=poly; total time=   1.4s
[CV] END svm__C=3.845401188473625, svm__degree=2, svm__gamma=scale, svm__kernel=poly; total time=   1.4s
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time=   1.2s
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time=   1.3s
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time=   1.3s
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time

In [12]:
X_valtest_c = np.vstack([X_test_c, X_val_c])
y_valtest_c = np.concatenate([y_test_c, y_val_c])
best_c = rnd_c.best_estimator_
y_pred_c = best_c.predict(X_valtest_c)

print("\n=== TEST (CLASS) ===")
print("Accuracy:", accuracy_score(y_valtest_c, y_pred_c))
print(classification_report(y_valtest_c, y_pred_c))
print(confusion_matrix(y_valtest_c, y_pred_c))



=== TEST (CLASS) ===
Accuracy: 0.9061728395061729
              precision    recall  f1-score   support

           0       0.94      0.81      0.87        78
           1       0.97      0.93      0.95       109
           2       0.64      0.90      0.75        10
           3       0.92      1.00      0.96        12
           4       0.75      0.88      0.81        17
           5       0.93      0.91      0.92        58
           6       0.86      0.98      0.92        45
           7       0.93      0.93      0.93        14
           8       0.88      1.00      0.93        14
           9       0.88      0.90      0.89        48

    accuracy                           0.91       405
   macro avg       0.87      0.92      0.89       405
weighted avg       0.91      0.91      0.91       405

[[ 63   2   5   0   2   0   1   0   0   5]
 [  1 101   0   1   3   3   0   0   0   0]
 [  1   0   9   0   0   0   0   0   0   0]
 [  0   0   0  12   0   0   0   0   0   0]
 [  0   0   0   0 

Train model


In [13]:
rnd_n = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 50,
    cv          = cv,
    scoring     = 'accuracy',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
X_train_n = np.vstack([X_tr_n, X_val_n])
y_train_n = np.concatenate([y_tr_n, y_val_n])
rnd_n.fit(X_train_n, y_train_n)

print(">>> Best params (NSP):", rnd_n.best_params_)
print(">>> Best CV acc:", rnd_n.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time=   0.8s
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time=   0.8s
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time=   0.8s
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time=   0.9s
[CV] END svm__C=3.845401188473625, svm__degree=2, svm__gamma=scale, svm__kernel=poly; total time=   0.9s
[CV] END svm__C=7.896910002727692, svm__degree=2, svm__gamma=scale, svm__kernel=rbf; total time=   0.9s
[CV] END svm__C=3.845401188473625, svm__degree=2, svm__gamma=scale, svm__kernel=poly; total time=   1.0s
[CV] END svm__C=3.845401188473625, svm__degree=2, svm__gamma=scale, svm__kernel=poly; total time=   1.0s
[CV] END svm__C=3.845401188473625, svm__degree=2, svm__gamma=scale, svm__kernel=poly; total time=   1.0

Test Validation


In [14]:
X_valtest_n = np.vstack([X_test_n, X_val_n])
y_valtest_n = np.concatenate([y_test_n, y_val_n])
best_n = rnd_n.best_estimator_
y_pred_n = best_n.predict(X_valtest_n)

print("\n=== TEST (NSP) ===")
print("Accuracy:", accuracy_score(y_valtest_n, y_pred_n))
print(classification_report(y_valtest_n, y_pred_n))
print(confusion_matrix(y_valtest_n, y_pred_n))



=== TEST (NSP) ===
Accuracy: 0.9333333333333333
              precision    recall  f1-score   support

           0       0.98      0.94      0.96       310
           1       0.76      0.91      0.83        67
           2       0.93      0.93      0.93        28

    accuracy                           0.93       405
   macro avg       0.89      0.93      0.91       405
weighted avg       0.94      0.93      0.94       405

[[291  19   0]
 [  4  61   2]
 [  2   0  26]]
