In [10]:
import pandas as pd
from src.Particle import ParticleType
from src.Preprocessing import PreprocessingEFPs, create_labels_single_column
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, recall_score, precision_score, precision_recall_curve, roc_curve
from sklearn.model_selection import cross_val_predict
import matplotlib.pyplot as plt

#### Loading the data

In [11]:
# The three jet samples share the same on-disk layout: space-separated
# values and no header row, so the parsing options are defined once.
jet_csv_options = {'header': None, 'sep': ' '}

data_gluon = pd.read_csv('../../Data/g_jets.csv', **jet_csv_options)
data_quark = pd.read_csv('../../Data/q_jets.csv', **jet_csv_options)
data_top = pd.read_csv('../../Data/t_jets.csv', **jet_csv_options)

#### Preprocessing the data 

In [3]:
# Stack the three samples (top, light quark, gluon - in that order) into a
# single frame with a fresh 0..N-1 row index.
all_jets = pd.concat([data_top, data_quark, data_gluon], axis=0, ignore_index=True)

# Row ranges occupied by each jet type inside `all_jets`.
# Every entry is an inclusive (first_row, last_row) pair.
n_top, n_quark, n_gluon = len(data_top), len(data_quark), len(data_gluon)
quark_start = n_top
gluon_start = n_top + n_quark
jets_order = {
    ParticleType.Top: (0, n_top - 1),
    ParticleType.LightQuark: (quark_start, gluon_start - 1),
    ParticleType.Gluon: (gluon_start, gluon_start + n_gluon - 1),
}

In [12]:
# Build the EFP preprocessor; the first argument (5) is the degree of the polynomials.
# NOTE(review): ('p==', 1) looks like a selection rule keeping only prime
# EFPs - confirm against PreprocessingEFPs.
efp_processing = PreprocessingEFPs(5, create_labels_single_column, ('p==', 1))

# Construct the polynomials for the *full* concatenated sample, using the
# inclusive row ranges in `jets_order` to label each jet type.
# The previous call used a debugging stub (the first 100 top jets with a
# placeholder label range), which left the later three-class analysis with
# top jets only.
# This takes a while to run.
X = efp_processing.transform(X=all_jets.to_numpy(), y=jets_order)
# NOTE(review): later cells treat label 0 as "Top" - confirm that
# create_labels_single_column assigns 0 to ParticleType.Top.
y = efp_processing.jet_labels

Originally Available EFPs:
  Prime: 23691
  Composite: 21540
  Total:  45231
Current Stored EFPs:
  Prime: 54
  Composite: 0
  Total:  54


In [None]:
# Divide the samples into training, validation, and test sets.
# Step 1: hold out 20% of all samples as the test set.
X_rest, X_test, y_rest, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Step 2: take 15% of what is left as the validation set; the remainder is
# the training set. The same fixed seed keeps both splits reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X_rest, y_rest, test_size=0.15, random_state=42
)

print(f"Size of the training set: {X_train.shape[0]}")
print(f"Size of the validation set: {X_val.shape[0]}")
print(f"Size of the test set: {X_test.shape[0]}")

#### Running the ML model

In [None]:
# Logistic-regression (softmax) classifier with the Lasso (L1) penalty.
# C=0.1 is the inverse regularisation strength (smaller C = stronger penalty).
# The 'saga' solver shuffles the data while optimising, so random_state is
# pinned (same seed as the data split) to make the fitted coefficients - and
# every metric derived from them below - reproducible across runs.
logist_reg = LogisticRegression(penalty='l1', C=0.1, solver='saga', random_state=42)
logist_reg.fit(X_train, y_train)

In [None]:
# Display the fitted feature coefficients of the model.
# NOTE(review): with the L1 penalty some entries are presumably exactly zero - verify in the output.
logist_reg.coef_

In [None]:
# Display the fitted intercept (bias) term(s) of the model.
logist_reg.intercept_

In [None]:
# Predict the class label of every sample in the training set with the
# fitted model (in-sample predictions, so the scores derived from them are optimistic).
y_train_predict = logist_reg.predict(X_train)

In [None]:
# Confusion matrix on the training set
# (rows: true labels, columns: predicted labels).
confusion_matrix(y_train, y_train_predict)

In [None]:
# Predict the class labels of the validation set and display the
# corresponding confusion matrix (rows: true labels, columns: predicted labels).
y_val_predict = logist_reg.predict(X_val)
confusion_matrix(y_val, y_val_predict)

In [None]:
# Precision and recall for Top tagging, treated as a one-vs-rest problem:
# label 0 is taken to be "Top" and every other label is "not Top".
# Boolean masks for the true and predicted labels of the training set ...
y_train_top = y_train == 0
y_train_pred_top = y_train_predict == 0
# ... and of the validation set.
y_val_top = y_val == 0
y_val_pred_top = y_val_predict == 0

# Same report for both sets: header line, then recall, then precision.
for header, is_top, tagged_top in (
    ("Trainnig set:", y_train_top, y_train_pred_top),
    ("Validation set:", y_val_top, y_val_pred_top),
):
    print(header)
    print(f"Recall for Top tagging: {recall_score(is_top, tagged_top):.2f}")
    print(f"Precision for Top tagging: {precision_score(is_top, tagged_top):.2f}")

In [None]:
# Evaluate the class probabilities of each sample in the training data using
# 4-fold cross validation: every sample is scored by a copy of the model that
# was fitted on the other folds, so the probabilities are out-of-fold.
# The result holds, for each sample, the probability of belonging to each class
# (one column per class).
y_probabilities = cross_val_predict(logist_reg, X_train, y_train, cv=4, method='predict_proba')

In [None]:
# Precision and recall for Top tagging as a function of the decision
# threshold, using the cross-validated probabilities.
# NOTE(review): assumes label 0 is "Top" and that column 0 of
# y_probabilities is the probability of that class - confirm.
y_top_score = y_probabilities[:, 0]
precision, recall, thresholds = precision_recall_curve(y_train == 0, y_top_score)
# precision/recall hold one more entry than thresholds; the last entry is
# dropped so the arrays line up.
plt.plot(thresholds, precision[:-1], label='Precision', color='green')
plt.plot(thresholds, recall[:-1], label='Recall', color='red')
plt.xlabel('Threshold')
plt.legend(loc='best')
# Render the figure (the original ended with an argument-less plt.plot(),
# which draws nothing and only echoes an empty list as the cell output).
plt.show()

In [None]:
# ROC curve for Top tagging (label 0 vs. the rest), built from the
# cross-validated probabilities in column 0.
top_truth = y_train == 0
top_scores = y_probabilities[:, 0]
fpr, tpr, thresholds = roc_curve(top_truth, top_scores)

# False-positive rate on x, true-positive rate (recall) on y.
plt.plot(fpr, tpr)
plt.xlabel('FPR')
plt.ylabel('TPR (Recall)')
plt.show()