In [2]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import uniform


In [3]:
DATA_DIR = "../data/cleaned_data"

train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
val_df   = pd.read_csv(os.path.join(DATA_DIR, "val.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)


FileNotFoundError: [Errno 2] No such file or directory: '../data/cleaned_data\\train.csv'

In [None]:
feature_cols = [c for c in train_df.columns if c not in ("CLASS","NSP")]

def split(df, target):
    X = df[feature_cols].values
    y = df[target].values
    return X, y

# CLASS
X_tr_c, y_tr_c = split(train_df, "CLASS")
X_val_c, y_val_c = split(val_df, "CLASS")
X_test_c, y_test_c = split(test_df, "CLASS")

# NSP
X_tr_n, y_tr_n = split(train_df, "NSP")
X_val_n, y_val_n = split(val_df, "NSP")
X_test_n, y_test_n = split(test_df, "NSP")

print("Số feature:", len(feature_cols))


In [None]:
pipe = ImbPipeline([
    ('scaler', StandardScaler()),
    ('smote',  SMOTE(random_state=42)),
    ('svm',    SVC(probability=True, random_state=42))
])


In [None]:
param_dist = {
    'svm__C': uniform(0.1, 10),
    'svm__kernel': ['linear', 'rbf', 'poly'],
    'svm__gamma': ['scale', 'auto'],
    'svm__degree': [2, 3, 4]  # Chỉ dùng nếu kernel là poly
}


In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
rnd_c = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 50,
    cv          = cv,
    scoring     = 'accuracy',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
X_train_c = np.vstack([X_tr_c, X_val_c])
y_train_c = np.concatenate([y_tr_c, y_val_c])
rnd_c.fit(X_train_c, y_train_c)

print(">>> Best params (CLASS):", rnd_c.best_params_)
print(">>> Best CV acc:", rnd_c.best_score_)


In [None]:
X_valtest_c = np.vstack([X_test_c, X_val_c])
y_valtest_c = np.concatenate([y_test_c, y_val_c])
best_c = rnd_c.best_estimator_
y_pred_c = best_c.predict(X_valtest_c)

print("\n=== TEST (CLASS) ===")
print("Accuracy:", accuracy_score(y_valtest_c, y_pred_c))
print(classification_report(y_valtest_c, y_pred_c))
print(confusion_matrix(y_valtest_c, y_pred_c))


In [None]:
rnd_n = RandomizedSearchCV(
    estimator   = pipe,
    param_distributions = param_dist,
    n_iter      = 50,
    cv          = cv,
    scoring     = 'accuracy',
    n_jobs      = -1,
    verbose     = 2,
    random_state= 42
)
X_train_n = np.vstack([X_tr_n, X_val_n])
y_train_n = np.concatenate([y_tr_n, y_val_n])
rnd_n.fit(X_train_n, y_train_n)

print(">>> Best params (NSP):", rnd_n.best_params_)
print(">>> Best CV acc:", rnd_n.best_score_)


In [None]:
X_valtest_n = np.vstack([X_test_n, X_val_n])
y_valtest_n = np.concatenate([y_test_n, y_val_n])
best_n = rnd_n.best_estimator_
y_pred_n = best_n.predict(X_valtest_n)

print("\n=== TEST (NSP) ===")
print("Accuracy:", accuracy_score(y_valtest_n, y_pred_n))
print(classification_report(y_valtest_n, y_pred_n))
print(confusion_matrix(y_valtest_n, y_pred_n))
