In [1]:
import pandas as pd
import xgboost as xgb
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../data/mu-frac-iron-shower-signals.csv")
x_columns = df.columns[2:-2]
y_columns = df.columns[-2:]

In [3]:
MUONLESS_UPPER_BOUND = 0.075
MUON_ONLY_LOWER_BOUND = 1 - MUONLESS_UPPER_BOUND

In [4]:
X = df[x_columns].to_numpy()
mu_ratio = df[y_columns].to_numpy()

mu_ratio = np.mean(mu_ratio, axis=1)

y = np.ones_like(mu_ratio)
y[mu_ratio < MUONLESS_UPPER_BOUND] = 0  # class: muonless
y[mu_ratio > MUON_ONLY_LOWER_BOUND] = 2  # class: muon-only

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [25]:
model = xgb.XGBClassifier(
    n_estimators=64,
    max_depth=8,
    learning_rate=0.1,
    min_child_weight=2.0,
    tree_method="auto",
    objective="multi:softmax",
    early_stopping_rounds=15,
)

In [26]:
model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=True)

[0]	validation_0-mlogloss:1.01421
[1]	validation_0-mlogloss:0.94352
[2]	validation_0-mlogloss:0.88367
[3]	validation_0-mlogloss:0.83246
[4]	validation_0-mlogloss:0.78842
[5]	validation_0-mlogloss:0.75014
[6]	validation_0-mlogloss:0.71681
[7]	validation_0-mlogloss:0.68763
[8]	validation_0-mlogloss:0.66199
[9]	validation_0-mlogloss:0.63946
[10]	validation_0-mlogloss:0.61960
[11]	validation_0-mlogloss:0.60212
[12]	validation_0-mlogloss:0.58658
[13]	validation_0-mlogloss:0.57279
[14]	validation_0-mlogloss:0.56056
[15]	validation_0-mlogloss:0.54969
[16]	validation_0-mlogloss:0.53999
[17]	validation_0-mlogloss:0.53140
[18]	validation_0-mlogloss:0.52372
[19]	validation_0-mlogloss:0.51682
[20]	validation_0-mlogloss:0.51066
[21]	validation_0-mlogloss:0.50510
[22]	validation_0-mlogloss:0.50012
[23]	validation_0-mlogloss:0.49572
[24]	validation_0-mlogloss:0.49172
[25]	validation_0-mlogloss:0.48815
[26]	validation_0-mlogloss:0.48485
[27]	validation_0-mlogloss:0.48189
[28]	validation_0-mlogloss:0.4

XGBClassifier(base_score=0.5, booster='gbtree', callbacks=None,
              colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
              early_stopping_rounds=15, enable_categorical=False,
              eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
              importance_type=None, interaction_constraints='',
              learning_rate=0.1, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=8, max_leaves=0, min_child_weight=2.0,
              missing=nan, monotone_constraints='()', n_estimators=64, n_jobs=0,
              num_parallel_tree=1, objective='multi:softmax', predictor='auto',
              random_state=0, reg_alpha=0, ...)

In [27]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

y_pred = np.round(model.predict(X_test))
print(f'precision = {precision_score(y_test, y_pred, average="weighted")}')
print(f'recall = {recall_score(y_test, y_pred, average="weighted")}')

print(f'\nconfustion matrix=\n{confusion_matrix(y_test, y_pred)}')

precision = 0.7991234721337611
recall = 0.8131712362879436

confustion matrix=
[[ 6994  6916  2004]
 [ 1592 51254   636]
 [ 1575  1907  5429]]


In [28]:
model.save_model("../trained_models/xgboost-classifier-multi.model")