In [None]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as ImbPipeline
from scipy.stats import randint, uniform

from xgboost import XGBClassifier

I. Data loading


In [2]:
# Directory holding the cleaned train/val/test CSV files (relative to this notebook).
DATA_DIR = "../data/cleaned_data"

# Read the three partitions in a fixed order: train, val, test.
train_df, val_df, test_df = (
    pd.read_csv(os.path.join(DATA_DIR, f"{part}.csv"))
    for part in ("train", "val", "test")
)

# Quick sanity check of the partition sizes.
print("Train:", train_df.shape, "Val:", val_df.shape, "Test:", test_df.shape)

Train: (1721, 23) Val: (192, 23) Test: (213, 23)


In [3]:
# Every column except the two label columns ("CLASS" and "NSP") is a model input.
feature_cols = [col for col in train_df.columns if col != "CLASS" and col != "NSP"]


In [4]:
def split(df, target, features=None):
    """Separate a DataFrame into a feature matrix and a label vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the feature columns and the ``target`` column.
    target : str
        Name of the label column to extract.
    features : sequence of str, optional
        Columns to use as model inputs. When omitted, the notebook-level
        ``feature_cols`` list is used, which keeps existing two-argument
        calls working exactly as before.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``X`` with one column per selected feature and ``y`` with one label
        per row of ``df``.

    Raises
    ------
    KeyError
        If ``target`` or any selected feature column is not present in ``df``.
    """
    # Fall back to the global list only when the caller did not pass columns;
    # an explicit argument makes the function usable without notebook state.
    columns = feature_cols if features is None else list(features)
    X = df[columns].to_numpy()
    y = df[target].to_numpy()
    return X, y


II. Feature/Label split

In [5]:
# Feature matrix / label vector for each partition, with "CLASS" as the target.
(X_tr_c, y_tr_c), (X_val_c, y_val_c), (X_test_c, y_test_c) = (
    split(part, "CLASS") for part in (train_df, val_df, test_df)
)

# Same partitions, with "NSP" as the target.
(X_tr_n, y_tr_n), (X_val_n, y_val_n), (X_test_n, y_test_n) = (
    split(part, "NSP") for part in (train_df, val_df, test_df)
)


III. Pipeline setting

3.1 XGBoost Pipeline

In [29]:
# CLASS pipeline: SMOTE oversampling followed by a multi-class XGBoost classifier.
# NOTE(review): num_class is derived from y_tr_c only, while the search is later
# fit on train + val; XGBClassifier may also infer the class count from the
# labels at fit time -- confirm whether this argument is needed.
pipe_xgb_class = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(
        objective='multi:softprob',
        num_class=np.unique(y_tr_c).size,
        eval_metric='mlogloss',
        random_state=42,
    )),
])

In [30]:
# NSP pipeline: SMOTE oversampling followed by a multi-class XGBoost classifier.
# NOTE(review): num_class is derived from y_tr_n only, while the search is later
# fit on train + val; XGBClassifier may also infer the class count from the
# labels at fit time -- confirm whether this argument is needed.
pipe_xgb_nsp = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('xgb', XGBClassifier(
        objective='multi:softprob',
        num_class=np.unique(y_tr_n).size,
        eval_metric='mlogloss',
        random_state=42,
    )),
])

3.2 Hyperparameter search space

In [32]:
# Search space sampled by RandomizedSearchCV. The "xgb__" prefix routes each
# parameter to the pipeline step named 'xgb'.
# scipy conventions: uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
param_dist_xgb = dict(
    xgb__n_estimators=randint(100, 300),      # 100 .. 299 trees
    xgb__max_depth=randint(3, 30),            # depth 3 .. 29
    xgb__learning_rate=uniform(0.01, 0.3),    # 0.01 .. 0.31
    xgb__subsample=uniform(0.6, 0.4),         # 0.6 .. 1.0
    xgb__colsample_bytree=uniform(0.6, 0.4),  # 0.6 .. 1.0
    xgb__gamma=uniform(0, 5),                 # 0 .. 5
    xgb__reg_alpha=uniform(0, 1),             # 0 .. 1
    xgb__reg_lambda=uniform(0, 1),            # 0 .. 1
    xgb__min_child_weight=randint(1, 10),     # 1 .. 9
)


3.3 Cross-Validation

In [33]:
# 5-fold stratified splitter with seeded shuffling; passed to both the CLASS
# and the NSP hyperparameter searches.
cv = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)


IV. Model train and test

RandomizedSearchCV for target "CLASS"

In [34]:
# Randomised hyperparameter search for the CLASS target: 50 sampled candidates,
# each scored by accuracy on the stratified folds defined by `cv`.
rnd_c_xgb = RandomizedSearchCV(
    pipe_xgb_class,
    param_dist_xgb,
    n_iter=50,
    scoring='accuracy',
    cv=cv,
    random_state=42,
    verbose=2,
)


4.1 Target "CLASS"

Train with target "CLASS"

In [35]:
# The search cross-validates internally, so the separate validation split is
# stacked onto the training rows before fitting.
X_train_c = np.concatenate((X_tr_c, X_val_c), axis=0)
y_train_c = np.concatenate((y_tr_c, y_val_c))
rnd_c_xgb.fit(X_train_c, y_train_c)

# Winning configuration and its cross-validated accuracy.
print(">>> Best params (CLASS):", rnd_c_xgb.best_params_)
print(">>> Best CV acc  (CLASS):", rnd_c_xgb.best_score_)



Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=23, xgb__min_child_weight=7, xgb__n_estimators=221, xgb__reg_alpha=0.15599452033620265, xgb__reg_lambda=0.05808361216819946, xgb__subsample=0.9464704583099741; total time=   0.5s
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=23, xgb__min_child_weight=7, xgb__n_estimators=221, xgb__reg_alpha=0.15599452033620265, xgb__reg_lambda=0.05808361216819946, xgb__subsample=0.9464704583099741; total time=   0.5s
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=23, xgb__min_child_weight=7, xgb__n_estimators=221, xgb__reg_alpha=0.15599452033620265, xgb__reg_lambda=0.05808361216819946, xgb__subsample=0.9464704583099741; total time=   0.5s

Test

In [36]:
# Take the best pipeline found by the search and predict CLASS labels for the
# held-out test features.
best_c_xgb = rnd_c_xgb.best_estimator_
y_pred_c = best_c_xgb.predict(X_test_c)

In [37]:
# Held-out test metrics for the CLASS target: overall accuracy, then the
# per-class report and the confusion matrix on consecutive lines.
print("\n=== TEST (CLASS) ===")
print("Accuracy:", accuracy_score(y_test_c, y_pred_c))
print(
    classification_report(y_test_c, y_pred_c),
    confusion_matrix(y_test_c, y_pred_c),
    sep="\n",
)



=== TEST (CLASS) ===
Accuracy: 0.9389671361502347
              precision    recall  f1-score   support

           0       0.91      0.88      0.90        34
           1       0.96      0.93      0.95        56
           2       0.83      0.83      0.83         6
           3       1.00      1.00      1.00         5
           4       0.67      0.75      0.71         8
           5       0.94      1.00      0.97        32
           6       0.97      1.00      0.98        28
           7       1.00      1.00      1.00         9
           8       1.00      1.00      1.00         9
           9       0.96      0.92      0.94        26

    accuracy                           0.94       213
   macro avg       0.92      0.93      0.93       213
weighted avg       0.94      0.94      0.94       213

[[30  1  1  0  2  0  0  0  0  0]
 [ 0 52  0  0  1  2  1  0  0  0]
 [ 1  0  5  0  0  0  0  0  0  0]
 [ 0  0  0  5  0  0  0  0  0  0]
 [ 0  1  0  0  6  0  0  0  0  1]
 [ 0  0  0  0  0 32  0  0

4.2 Target "NSP"

In [38]:
# Randomised hyperparameter search for the NSP target: 50 sampled candidates,
# each scored by accuracy on the stratified folds defined by `cv`.
rnd_n_xgb = RandomizedSearchCV(
    pipe_xgb_nsp,
    param_dist_xgb,
    n_iter=50,
    scoring='accuracy',
    cv=cv,
    random_state=42,
    verbose=2,
)

Train

In [39]:
# The search cross-validates internally, so the separate validation split is
# stacked onto the training rows before fitting.
X_train_n = np.concatenate((X_tr_n, X_val_n), axis=0)
y_train_n = np.concatenate((y_tr_n, y_val_n))
rnd_n_xgb.fit(X_train_n, y_train_n)

# Winning configuration and its cross-validated accuracy.
print(">>> Best params (NSP):", rnd_n_xgb.best_params_)
print(">>> Best CV acc  (NSP):", rnd_n_xgb.best_score_)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=23, xgb__min_child_weight=7, xgb__n_estimators=221, xgb__reg_alpha=0.15599452033620265, xgb__reg_lambda=0.05808361216819946, xgb__subsample=0.9464704583099741; total time=   0.1s
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=23, xgb__min_child_weight=7, xgb__n_estimators=221, xgb__reg_alpha=0.15599452033620265, xgb__reg_lambda=0.05808361216819946, xgb__subsample=0.9464704583099741; total time=   0.1s
[CV] END xgb__colsample_bytree=0.749816047538945, xgb__gamma=4.75357153204958, xgb__learning_rate=0.22959818254342154, xgb__max_depth=23, xgb__min_child_weight=7, xgb__n_estimators=221, xgb__reg_alpha=0.15599452033620265, xgb__reg_lambda=0.05808361216819946, xgb__subsample=0.9464704583099741; total time=   0.1s

Test

In [40]:
# Take the best pipeline found by the search and predict NSP labels for the
# held-out test features.
best_n_xgb = rnd_n_xgb.best_estimator_
y_pred_n = best_n_xgb.predict(X_test_n)

In [41]:
# Held-out test metrics for the NSP target: overall accuracy, then the
# per-class report and the confusion matrix on consecutive lines.
print("\n=== TEST (NSP) ===")
print("Accuracy:", accuracy_score(y_test_n, y_pred_n))
print(
    classification_report(y_test_n, y_pred_n),
    confusion_matrix(y_test_n, y_pred_n),
    sep="\n",
)



=== TEST (NSP) ===
Accuracy: 0.9530516431924883
              precision    recall  f1-score   support

           0       0.97      0.96      0.97       159
           1       0.84      0.89      0.86        36
           2       1.00      1.00      1.00        18

    accuracy                           0.95       213
   macro avg       0.94      0.95      0.94       213
weighted avg       0.95      0.95      0.95       213

[[153   6   0]
 [  4  32   0]
 [  0   0  18]]
