In [1]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../mu-frac-iron-shower-signals.csv")
x_columns = df.columns[2:-2]
y_columns = df.columns[-2:]

In [9]:
PREDICT_TOP = True
EPS = 0.075

In [10]:
X = df[x_columns].to_numpy()
mu_ratio = df[y_columns].to_numpy()

mu_ratio = mu_ratio[:, 0 if PREDICT_TOP else 1]

y = np.ones_like(mu_ratio)
y[mu_ratio < EPS] = 0  # class: muonless
y[mu_ratio > 1 - EPS] = 0  # class: muon-only

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


In [11]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

In [12]:
param = {
    'max_depth': 8,
    'learning_rate': 0.1,
    'min_child_weight': 2.0,
    # 'objective': 'multi:softmax',
    # 'num_class': 3,
    'objective': 'binary:logistic',
}

epochs = 50

In [13]:
model = xgb.train(param, dtrain, num_boost_round=epochs, evals=[(dtest, 'test')], verbose_eval=5, early_stopping_rounds=5)

[0]	test-logloss:0.64019
[5]	test-logloss:0.48359
[10]	test-logloss:0.41302
[15]	test-logloss:0.37859
[20]	test-logloss:0.36121
[25]	test-logloss:0.35210
[30]	test-logloss:0.34731
[35]	test-logloss:0.34484
[40]	test-logloss:0.34349
[45]	test-logloss:0.34274
[49]	test-logloss:0.34226


In [14]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

y_pred = np.round(model.predict(dtest))
print(f'precision = {precision_score(y_test, y_pred, average="binary")}')
print(f'recall = {recall_score(y_test, y_pred, average="binary")}')

print(f'\nconfustion matrix=\n{confusion_matrix(y_test, y_pred)}')

precision = 0.8511575261305555
recall = 0.9295771927452533

confustion matrix=
[[18118  8416]
 [ 3646 48127]]


In [15]:
model.save_model("../trained_models/xgboost-classifier.model")