In [1]:
import pandas as pd
import xgboost as xgb
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv("../data/mu-frac-iron-shower-signals.csv")
x_columns = df.columns[2:-2]
y_columns = df.columns[-2:]

In [31]:
MUONLESS_UPEER_BOUND = 0.000001
MUON_ONLY_LOWER_BOUND = 0.9

In [32]:
X = df[x_columns].to_numpy()
mu_ratio = df[y_columns].to_numpy()

mu_ratio = np.mean(mu_ratio, axis=1)

y = np.zeros_like(mu_ratio)
y[mu_ratio < MUONLESS_UPEER_BOUND] = 1  # class: muonless
# y[mu_ratio > MUON_ONLY_LOWER_BOUND] = 1  # class: muon-only

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)


In [33]:
total_ones = np.sum(y)
total = y.size

print(f"special signals: {total_ones}")
print(f"regular signals: {total - total_ones}")

special signals: 47194.0
regular signals: 344340.0


In [34]:
SPECIAL_WEIGHT = 2

dtrain = xgb.DMatrix(X_train, label=y_train, weight=1 + (SPECIAL_WEIGHT - 1) * y_train)
dtest = xgb.DMatrix(X_test, label=y_test, weight=1 + (SPECIAL_WEIGHT - 1) * y_test)

In [35]:
param = {
    'max_depth': 8,
    'learning_rate': 0.1,
    'min_child_weight': 2.0,
    # 'objective': 'multi:softmax',
    # 'num_class': 3,
    'objective': 'binary:logistic',
}

epochs = 50

In [36]:
model = xgb.train(param, dtrain, num_boost_round=epochs, evals=[(dtest, 'test')], verbose_eval=5, early_stopping_rounds=5)

[0]	test-logloss:0.62866
[5]	test-logloss:0.43595
[10]	test-logloss:0.34542
[15]	test-logloss:0.29812
[20]	test-logloss:0.27179
[25]	test-logloss:0.25673
[30]	test-logloss:0.24805
[35]	test-logloss:0.24297
[40]	test-logloss:0.23983
[45]	test-logloss:0.23782
[49]	test-logloss:0.23682


In [37]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix

y_pred = np.round(model.predict(dtest))
print(f'precision = {precision_score(y_test, y_pred, average="binary")}')
print(f'recall = {recall_score(y_test, y_pred, average="binary")}')

print(f'\nconfustion matrix=\n{confusion_matrix(y_test, y_pred)}')

precision = 0.6125675993416412
recall = 0.7329441552961036

confustion matrix=
[[96652  6591]
 [ 3797 10421]]


In [38]:
model.save_model("../trained_models/xgboost-classifier.model")