In [1]:
import numpy as np
from src.Preprocessing import create_labels_single_column
from src.Particle import ParticleType
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import recall_score, precision_score, confusion_matrix, precision_recall_curve, roc_curve, auc
from sklearn.model_selection import cross_val_predict

In [2]:
# Load the pre-computed EFP feature arrays, one .npy file per jet flavour
# (presumably Energy Flow Polynomials up to degree 5, per the file names - TODO confirm).
efps_gluon = np.load("../../Data/g_jets_efp_d5.npy")
efps_quark = np.load("../../Data/q_jets_efp_d5.npy")

# The first axis indexes jets, so its length is the number of jets per class.
n_gluon_jets = efps_gluon.shape[0]
n_quark_jets = efps_quark.shape[0]
print(f"Number of Gluon Jets: {n_gluon_jets}")
print(f"Number of LightQuark Jets: {n_quark_jets}")

Number of Gluon Jets: 177252
Number of LightQuark Jets: 170679


In [3]:
# Join the two samples into a single feature matrix: the light-quark jets
# occupy the first rows and the gluon jets the remaining ones.
n_quark = efps_quark.shape[0]
X = np.vstack((efps_quark, efps_gluon))
n_total = X.shape[0]

# Inclusive (first_row, last_row) index ranges for each particle type,
# matching the stacking order used above.
label_ranges = {
    ParticleType.Gluon: (n_quark, n_total - 1),
    ParticleType.LightQuark: (0, n_quark - 1),
}
y = create_labels_single_column(jet_inputs=label_ranges)

# Shuffle features and labels together (fixed seed for reproducibility).
X, y = shuffle(X, y, random_state=42)

In [4]:
# Let us use the StandardScaler to leave each feature with the same order of magniture
scaler = StandardScaler()
X = scaler.fit_transform(X)

In [5]:
# Dividing the data into trainning, validation, and test
# diving the set into trainning, validation, and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=42)

print(f"Size of the Training set: {X_train.shape[0]}")
print(f"Size of the Validation set: {X_val.shape[0]}")
print(f"Size of the Test set: {X_test.shape[0]}")

Size of the Training set: 251379
Size of the Validation set: 44362
Size of the Test set: 52190


In [17]:
# Define and fit the model: a linear SVM with an L2 penalty.
# C is the inverse regularization strength, so this small value means strong
# regularization. random_state is fixed like every other stochastic step in
# this notebook so the fit is reproducible (it seeds the data shuffling of the
# dual coordinate-descent solver and has no effect when dual=False).
linear_svc = LinearSVC(penalty='l2', C=0.0004, random_state=42)
linear_svc.fit(X_train, y_train)

# Report the learned hyperplane: coef_[0] holds the per-feature weights,
# intercept_[0] the bias term.
coefs = ', '.join(f'{val:.2f}' for val in linear_svc.coef_[0])
print("Coefficients:", coefs)
print(f"Intercept {linear_svc.intercept_[0]:.3f}")



Coefficients: -1.30, 0.73, 0.09, -0.10, 0.11, 0.36, 0.06, -0.15, -0.10, 0.03, 0.03, -0.20, -0.12, -0.11, 0.04, 0.09, -0.13, 0.03, 0.01, 0.05, -0.04, -0.18, 0.01, -0.09, 0.06, -0.01, -0.02, 0.22, -0.06, 0.02, -0.06, -0.03, 0.03, -0.10, 0.03, -0.08, -0.03, -0.08, 0.08, -0.02, 0.01, 0.01, -0.01, 0.10, -0.01, -0.04, 0.06, 0.07, -0.05, 0.05, 0.00, -0.00, 0.00, 0.35, 0.17, -0.09, 0.01, 0.01, 0.02, 0.14, 0.01, -0.09, 0.01, 0.02, 0.01, 0.08, -0.02, -0.02, 0.05, 0.19, -0.09, -0.02, -0.01, -0.02, -0.06, 0.03, -0.00, 0.03, -0.03, 0.03, 0.02, 0.01, 0.02, 0.02, 0.02, -0.02, -0.08, 0.00, -0.01, 0.00, -0.00, 0.01, 0.00, 0.01, -0.05, 0.01, -0.03, -0.01, -0.03, -0.04, -0.08
Intercept -0.019


In [18]:
# Training-set performance of the fitted classifier.
y_train_predict = linear_svc.predict(X_train)

# recall_score / precision_score use label 1 as the positive class by default
# (presumably the light-quark class here - verify against create_labels_single_column).
train_recall = recall_score(y_train, y_train_predict)
train_precision = precision_score(y_train, y_train_predict)
# Rows are the true labels, columns the predicted labels, both ordered [0, 1].
train_confusion = confusion_matrix(y_train, y_train_predict, labels=[0, 1])

print("Trainnig set:")
print(f"Recall for Quark tagging: {train_recall:.2f}")
print(f"Precision for Quark tagging: {train_precision:.2f}")
print("Confusion Matrix")
print(train_confusion)

Trainnig set:
Recall for Quark tagging: 0.67
Precision for Quark tagging: 0.64
Confusion Matrix
[[82711 45323]
 [41156 82189]]


In [19]:
# Validation-set performance of the fitted classifier.
y_val_predict = linear_svc.predict(X_val)

# This notebook separates quark from gluon jets, so the metrics are labelled
# "Quark tagging" (same wording as the training-set report); the previous
# "Top tagging" wording was a leftover from a different analysis.
# recall_score / precision_score treat label 1 as the positive class by default.
print("Validation set:")
print(f"Recall for Quark tagging: {recall_score(y_val, y_val_predict):.2f}")
print(f"Precision for Quark tagging: {precision_score(y_val, y_val_predict):.2f}")
print("Confusion Matrix")
# Rows are the true labels, columns the predicted labels, both ordered [0, 1].
print(confusion_matrix(y_val, y_val_predict, labels=[0, 1]))

Validation set:
Recall for Top tagging: 0.67
Precision for Top tagging: 0.65
Confusion Matrix
[[14575  8011]
 [ 7215 14561]]
