In [5]:
import xgboost as xgb
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, average_precision_score, classification_report, confusion_matrix
from data_processor import load_features_and_split3, class_weight_hint  # your helper

# Record the library version next to the results so runs can be compared.
print("xgboost version:", xgb.__version__)

# 1) Data: train / validation / test split produced by the project helper
#    (test_size=0.20, val_size=0.10, stratify=True, fixed random_state).
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = load_features_and_split3(
    "orbit_results.csv",
    test_size=0.20,
    val_size=0.10,
    random_state=42,
    stratify=True,
)
# Imbalance hint derived from the training labels only; it is fed to
# `scale_pos_weight` in the params section.
spw = class_weight_hint(y_train)

# 2) DMatrices, one per split, built in train -> val -> test order.
#    Feature names are passed explicitly so importances are keyed by column name.
feat_names = list(X_train.columns)
dtrain, dval, dtest = (
    xgb.DMatrix(features, label=labels, feature_names=feat_names)
    for features, labels in (
        (X_train, y_train),
        (X_val, y_val),
        (X_test, y_test),
    )
)

# 3) Params (mirror your sklearn config)
params = {
    # Learning task and the metric reported on the watchlist.
    "objective": "binary:logistic",
    "eval_metric": "auc",
    # Shrinkage (learning_rate in the sklearn wrapper) and tree size.
    "eta": 0.03,
    "max_depth": 4,
    # Row / column subsampling.
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    # L2 regularisation on leaf weights.
    "lambda": 1.0,
    # Class-imbalance weight taken from the training labels.
    "scale_pos_weight": spw,
    # 0 = silent library logging.
    "verbosity": 0,
}

# 4) Train with early stopping on the validation set.
#    `num_boost_round` is only an upper bound: training halts once the
#    validation metric has not improved for `early_stopping_rounds` rounds.
num_boost_round = 2000
watchlist = [(dval, "val")]
booster = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=100,  # patience, in boosting rounds
    verbose_eval=False,
)

# 5) Predict on test using only the trees up to the best early-stopping round.
#    The native Booster.predict() uses the *full* model unless told otherwise,
#    so the best iteration has to be passed explicitly.
best_round_count = booster.best_iteration + 1  # best_iteration is 0-based
try:
    # Current API: half-open range of boosting rounds [0, best_round_count).
    proba = booster.predict(dtest, iteration_range=(0, best_round_count))
except TypeError:
    # Legacy path for xgboost releases whose predict() does not know the
    # `iteration_range` keyword. Only TypeError is caught on purpose: any other
    # failure (e.g. a feature mismatch) is a real error and must surface as-is
    # instead of being retried with `ntree_limit`, which newer releases reject.
    proba = booster.predict(
        dtest,
        ntree_limit=getattr(booster, "best_ntree_limit", best_round_count),
    )
# Hard 0/1 labels at a fixed 0.5 probability cut-off.
pred = (proba >= 0.5).astype(int)

# 6) Metrics
# Ranking metrics are scored on the predicted probabilities.
test_roc_auc = roc_auc_score(y_test, proba)
print("Test ROC AUC:", test_roc_auc)
test_pr_auc = average_precision_score(y_test, proba)
print("Test PR  AUC:", test_pr_auc)

# Thresholded metrics are scored on the hard 0/1 predictions.
test_confusion = confusion_matrix(y_test, pred)
print("\nConfusion matrix:\n", test_confusion)
test_report = classification_report(y_test, pred, digits=3)
print("\nReport:\n", test_report)

# 7) Quick feature importances (gain)
# get_score() returns a {feature_name: gain} dict.
# NOTE(review): per the xgboost docs, features never used in a split are
# omitted from this dict rather than reported as 0 - confirm if a full table
# is needed.
gain = booster.get_score(importance_type="gain")
# A dict already supplies its keys as the Series index, so no explicit index is
# needed; dtype=float keeps the Series numeric even when `gain` is empty.
imp = pd.Series(gain, dtype=float).sort_values(ascending=False)
print("\nTop importances (gain):")
print(imp.head(15))


xgboost version: 2.1.4
Test ROC AUC: 0.896780303030303
Test PR  AUC: 0.6497495112411358

Confusion matrix:
 [[164  12]
 [ 10  14]]

Report:
               precision    recall  f1-score   support

           0      0.943     0.932     0.937       176
           1      0.538     0.583     0.560        24

    accuracy                          0.890       200
   macro avg      0.740     0.758     0.749       200
weighted avg      0.894     0.890     0.892       200


Top importances (gain):
vt        15.552489
Lz        14.603193
dtheta    10.220143
vx         9.250250
vy         8.492142
y          8.483853
x          8.258370
vr         6.540431
dtype: float64
