In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline 

from scipy.stats import randint, uniform
from xgboost import XGBClassifier

I. Data path


In [3]:
DATA_DIR = "../data/cleaned_data"
train_df = pd.read_csv(os.path.join(DATA_DIR, "train.csv"))
test_df  = pd.read_csv(os.path.join(DATA_DIR, "test.csv"))

print("Train:", train_df.shape, "Test:", test_df.shape)

Train: (1700, 23) Test: (426, 23)


In [4]:
feature_cols = [c for c in train_df.columns if c not in ("CLASS", "NSP")]


In [5]:
def split(df, target):
    X = df[feature_cols].values
    y = df[target].values
    return X, y

II. Feature/Label split

In [6]:
X_train_c, y_train_c = split(train_df, "CLASS")
X_test_c,  y_test_c  = split(test_df,  "CLASS")

X_train_n, y_train_n = split(train_df, "NSP")
X_test_n,  y_test_n  = split(test_df,  "NSP")



III. Pipeline setting

3.1 XgBoost Pipeline

In [7]:
model_xgb_c = XGBClassifier(
    random_state=42,
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=len(np.unique(y_train_c))
)

In [8]:
model_xgb_n = XGBClassifier(
    random_state=42,
    objective='multi:softprob',
    eval_metric='mlogloss',
    num_class=len(np.unique(y_train_n))
)

3.2 Super parameters

In [9]:
param_dist_xgb = {
    'n_estimators':     randint(100, 300),
    'max_depth':        randint(3, 30),
    'learning_rate':    uniform(0.01, 0.3),
    'subsample':        uniform(0.6, 0.4),
    'colsample_bytree': uniform(0.6, 0.4),
    'gamma':            uniform(0, 5),
    'reg_alpha':        uniform(0, 1),
    'reg_lambda':       uniform(0, 1),
    'min_child_weight': randint(1, 10)
}



3.3 Cross-Validation

In [10]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


IV. Train & Evaluate for CLASS


RandomizedSearchCV for target "CLASS"

In [12]:
rnd_c_xgb = RandomizedSearchCV(
    estimator=model_xgb_c,
    param_distributions=param_dist_xgb,
    n_iter=50,
    cv=cv,
    scoring='accuracy',
    verbose=2,
    random_state=42
)


In [13]:
rnd_c_xgb.fit(X_train_c, y_train_c)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.22959818254342154, max_depth=23, min_child_weight=7, n_estimators=221, reg_alpha=0.15599452033620265, reg_lambda=0.05808361216819946, subsample=0.9464704583099741; total time=   0.2s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.22959818254342154, max_depth=23, min_child_weight=7, n_estimators=221, reg_alpha=0.15599452033620265, reg_lambda=0.05808361216819946, subsample=0.9464704583099741; total time=   0.2s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.22959818254342154, max_depth=23, min_child_weight=7, n_estimators=221, reg_alpha=0.15599452033620265, reg_lambda=0.05808361216819946, subsample=0.9464704583099741; total time=   0.2s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.22959818254342154, max_depth=23, min_child_weight

In [14]:
print(">>> Best params (CLASS):", rnd_c_xgb.best_params_)
print(">>> Best CV acc  (CLASS):", rnd_c_xgb.best_score_)

>>> Best params (CLASS): {'colsample_bytree': np.float64(0.7364265404201034), 'gamma': np.float64(0.5673676062029454), 'learning_rate': np.float64(0.28740808548356883), 'max_depth': 12, 'min_child_weight': 3, 'n_estimators': 106, 'reg_alpha': np.float64(0.659984046034179), 'reg_lambda': np.float64(0.8172222002012158), 'subsample': np.float64(0.822080324639785)}
>>> Best CV acc  (CLASS): 0.8947058823529412


In [15]:
y_pred_c = rnd_c_xgb.predict(X_test_c)
print("\n=== TEST (CLASS) ===")
print("Accuracy:", accuracy_score(y_test_c, y_pred_c))
print(classification_report(y_test_c, y_pred_c))
print(confusion_matrix(y_test_c, y_pred_c))


=== TEST (CLASS) ===
Accuracy: 0.9154929577464789
              precision    recall  f1-score   support

           0       0.88      0.90      0.89        73
           1       0.92      0.95      0.94       114
           2       0.79      0.85      0.81        13
           3       1.00      0.74      0.85        19
           4       1.00      0.53      0.69        17
           5       0.90      1.00      0.95        63
           6       0.94      0.94      0.94        54
           7       0.88      0.94      0.91        16
           8       0.92      1.00      0.96        12
           9       0.95      0.91      0.93        45

    accuracy                           0.92       426
   macro avg       0.92      0.88      0.89       426
weighted avg       0.92      0.92      0.91       426

[[ 66   1   3   0   0   2   1   0   0   0]
 [  2 108   0   0   0   3   1   0   0   0]
 [  1   1  11   0   0   0   0   0   0   0]
 [  0   4   0  14   0   1   0   0   0   0]
 [  3   3   0   0 

V. Train & Evaluate for NSP

In [16]:
rnd_n_xgb = RandomizedSearchCV(
    estimator=model_xgb_n,
    param_distributions=param_dist_xgb,
    n_iter=50,
    cv=cv,
    scoring='accuracy',
    verbose=2,
    random_state=42
)


In [17]:
rnd_n_xgb.fit(X_train_n, y_train_n)


Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.22959818254342154, max_depth=23, min_child_weight=7, n_estimators=221, reg_alpha=0.15599452033620265, reg_lambda=0.05808361216819946, subsample=0.9464704583099741; total time=   0.0s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.22959818254342154, max_depth=23, min_child_weight=7, n_estimators=221, reg_alpha=0.15599452033620265, reg_lambda=0.05808361216819946, subsample=0.9464704583099741; total time=   0.0s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.22959818254342154, max_depth=23, min_child_weight=7, n_estimators=221, reg_alpha=0.15599452033620265, reg_lambda=0.05808361216819946, subsample=0.9464704583099741; total time=   0.0s
[CV] END colsample_bytree=0.749816047538945, gamma=4.75357153204958, learning_rate=0.22959818254342154, max_depth=23, min_child_weight

In [18]:
print(">>> Best params (NSP):", rnd_n_xgb.best_params_)
print(">>> Best CV acc  (NSP):", rnd_n_xgb.best_score_)

>>> Best params (NSP): {'colsample_bytree': np.float64(0.6777095814048169), 'gamma': np.float64(0.3547045849996383), 'learning_rate': np.float64(0.1290351481641665), 'max_depth': 16, 'min_child_weight': 3, 'n_estimators': 242, 'reg_alpha': np.float64(0.375582952639944), 'reg_lambda': np.float64(0.093981939840869), 'subsample': np.float64(0.8313120563984696)}
>>> Best CV acc  (NSP): 0.9552941176470588


In [19]:
y_pred_n = rnd_n_xgb.predict(X_test_n)
print("\n=== TEST (NSP) ===")
print("Accuracy:", accuracy_score(y_test_n, y_pred_n))
print(classification_report(y_test_n, y_pred_n))
print(confusion_matrix(y_test_n, y_pred_n))


=== TEST (NSP) ===
Accuracy: 0.9530516431924883
              precision    recall  f1-score   support

           0       0.97      0.98      0.97       333
           1       0.88      0.81      0.85        64
           2       0.94      1.00      0.97        29

    accuracy                           0.95       426
   macro avg       0.93      0.93      0.93       426
weighted avg       0.95      0.95      0.95       426

[[325   7   1]
 [ 11  52   1]
 [  0   0  29]]
