In [1]:
import numpy as np
from src.Preprocessing import create_labels_single_column
from src.Particle import ParticleType
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import recall_score, precision_score, confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import cross_val_predict

In [2]:
# Load the pre-computed jet feature matrices, one .npy array per jet class.
# NOTE(review): file names suggest Energy Flow Polynomial (EFP) features up to
# degree 5 -- confirm against the script that produced these files.
efps_gluon = np.load("../../../Data/g_jets_efp_d5.npy")
efps_quark = np.load("../../../Data/q_jets_efp_d5.npy")

# Report how many jets (rows) each class contributes.
print(f"Number of Gluon Jets: {efps_gluon.shape[0]}")
print(f"Number of LightQuark Jets: {efps_quark.shape[0]}")

Number of Gluon Jets: 177252
Number of LightQuark Jets: 170679


In [3]:
# Joining the data: light-quark jets are stacked first, gluon jets second, so
# quark rows occupy indices [0, n_quark - 1] and gluon rows [n_quark, n_total - 1].
# The two classes are not exactly the same size (see the counts printed above).
X = np.vstack((efps_quark, efps_gluon))
# NOTE(review): the (start, end) tuples are presumably inclusive row ranges and
# the returned label encoding is decided by create_labels_single_column --
# confirm both against src.Preprocessing.
y = create_labels_single_column(jet_inputs={ParticleType.Gluon: (efps_quark.shape[0], X.shape[0] - 1), ParticleType.LightQuark: (0, efps_quark.shape[0] - 1)})
# Shuffle features and labels together (fixed seed for reproducibility).
X, y = shuffle(X, y, random_state=42)

In [4]:
# Split the data into training, validation and test sets.
# The same hold-out fraction is applied twice: first to carve the test set out
# of the full data, then to carve the validation set out of what remains, so
# the validation set is 15% of the remaining 85%, not 15% of the total.
holdout_fraction = 0.15
split_seed = 42

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=holdout_fraction,
    random_state=split_seed,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=holdout_fraction,
    random_state=split_seed,
)

# Report the resulting number of samples in each partition.
print(f"Size of the Training set: {X_train.shape[0]}")
print(f"Size of the Validation set: {X_val.shape[0]}")
print(f"Size of the Test set: {X_test.shape[0]}")

Size of the Training set: 251379
Size of the Validation set: 44362
Size of the Test set: 52190


In [ ]:
# Base learner used as the template for every boosting round.
dt_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=20,
    min_samples_leaf=100,
    random_state=0,
)

# Boosted ensemble of 50 trees. AdaBoostClassifier fits clones of `estimator`,
# so `dt_model` itself stays unfitted; use `adaboost_dt` for predictions.
adaboost_dt = AdaBoostClassifier(
    estimator=dt_model,
    n_estimators=50,
    random_state=0,
)
adaboost_dt.fit(X_train, y_train)

In [ ]:
# Training set
# Predict with the fitted AdaBoost ensemble. `dt_model` is only the template
# handed to AdaBoostClassifier (which fits clones of it), so `dt_model` itself
# is never fitted and calling `dt_model.predict` raises NotFittedError.
y_train_predict = adaboost_dt.predict(X_train)

# In-sample metrics: useful for spotting overfitting when compared with the
# validation scores, not as an estimate of generalisation performance.
# NOTE(review): recall/precision are computed for the positive label (1);
# confirm that create_labels_single_column encodes quark jets as 1.
print("Trainnig set:")
print(f"Recall for Quark tagging: {recall_score(y_train, y_train_predict):.2f}")
print(f"Precision for Quark tagging: {precision_score(y_train, y_train_predict):.2f}")
print("Confusion Matrix")
# Rows are true labels and columns are predicted labels, both ordered [0, 1].
print(confusion_matrix(y_train, y_train_predict, labels=[0, 1]))

In [ ]:
# Validation set
# Predict with the fitted AdaBoost ensemble. `dt_model` is only the template
# handed to AdaBoostClassifier (which fits clones of it), so `dt_model` itself
# is never fitted and calling `dt_model.predict` raises NotFittedError.
y_val_predict = adaboost_dt.predict(X_val)

# NOTE(review): recall/precision are computed for the positive label (1);
# confirm that create_labels_single_column encodes quark jets as 1.
print("Validation set:")
print(f"Recall for Quark tagging: {recall_score(y_val, y_val_predict):.2f}")
print(f"Precision for Quark tagging: {precision_score(y_val, y_val_predict):.2f}")
print("Confusion Matrix")
# Rows are true labels and columns are predicted labels, both ordered [0, 1].
print(confusion_matrix(y_val, y_val_predict, labels=[0, 1]))