In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import confusion_matrix
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html
from sklearn.metrics import f1_score
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html
from sklearn.metrics import precision_score
#https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
#We use Support Vector classifier as a classifier
from sklearn.svm import SVC

In [2]:
# Location of the input CSV (directory and file name are kept separate).
path = '/home/malpizar/Documents/MultipleScattering/'
file = 'RecoilsData_MasterCuts.csv'

# Columns that were used to build the multiple-recoil flag; they are removed
# right after loading.
flag_source_columns = ['ar40recoils', 'subeventN', 'neckVetoN']
df = pd.read_csv(path + file).drop(columns=flag_source_columns)

# Discard every row in which at least one column holds the value -99999.
has_sentinel = df.eq(-99999).any(axis=1)
df = df.loc[~has_sentinel]

# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,qPE,fprompt,eventTime,numEarlyPulses,fmaxpe,chargetopring,chargesecondring,chargebottomring,chargesecondbottomring,chargethirdbottomring,...,rprompt60Bayes,mblikelihoodX,mblikelihoodY,mblikelihoodZ,mblikelihoodR,timefit2X,timefit2Y,timefit2Z,deltat,multiplerecoils
0,124.141,0.718412,2482.56,0,0.03759,0.489214,0.0,3.75269,1.7731,2.0141,...,0.705331,-503.805,-130.72,-161.944,545.099,-657.14,-91.7037,-163.845,1217130000.0,0
1,70.9832,0.588649,2430.83,1,0.03866,1.08177,0.797658,0.0,1.50138,1.02298,...,0.336552,550.146,141.997,-136.025,584.232,756.071,-12.6274,-220.268,177796000.0,1
2,102.759,0.593867,2434.92,0,0.043435,1.44395,0.0,4.45313,0.0,0.0,...,0.424662,-248.059,-570.13,-37.1682,622.866,-186.042,-728.982,-298.363,3108520000.0,1
3,91.1607,0.647908,2469.96,1,0.076224,1.00222,0.378861,0.785043,3.02976,1.11181,...,0.715032,-33.0709,557.905,-243.44,609.602,-27.5744,684.606,-236.102,460340000.0,0
4,153.948,0.524905,2384.0,0,0.048278,3.42501,1.21717,3.88327,5.95274,0.0,...,0.21852,189.714,116.252,67.7249,232.578,-87.7712,287.97,62.1698,1325540000.0,1


In [3]:
# Name of the dependent variable, i.e. the column that will be predicted.
var = 'multiplerecoils'

# Target vector: the label column on its own.
y = df[var]

# Feature matrix: every remaining column.
X = df.drop(columns=[var])

# Report the dimensions of the features, then of the target.
for part in (X, y):
    print(part.shape)

(11, 21)
(11,)


In [4]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Train the classifier on the training split (X_train, y_train).
# SVMs are not scale invariant, and the feature columns span very different
# magnitudes, so every feature is standardised before it reaches the linear
# SVC. Wrapping both steps in a pipeline means the scaling statistics are
# learned from the training split only and are re-applied automatically at
# prediction time; `clf` still exposes .predict() like a bare SVC.
clf = make_pipeline(StandardScaler(), SVC(kernel='linear')).fit(X_train, y_train)

In [None]:
# Predict a class label for every held-out row of X_test with the fitted
# classifier and keep the result in y_pred for the evaluation cells below.
y_pred = clf.predict(X_test)

In [None]:
# Compute recall, precision and F1 score, each as a support-weighted average
# over the classes.
rec = recall_score(y_test, y_pred, average='weighted')
pres = precision_score(y_test, y_pred, average='weighted')

# Use sklearn's f1_score instead of the harmonic mean of the averaged
# precision and recall: the weighted F1 is the weighted mean of the per-class
# F1 scores, which is in general a different number, and the hand-written
# 2/((1/rec) + (1/pres)) divides by zero when either metric is 0.
F1_score = f1_score(y_test, y_pred, average='weighted')

print("The recall is: ", rec)
print("The precision is: ", pres)
print("The F1-score is: ", F1_score)

In [None]:
# Sorted class labels. confusion_matrix orders its rows/columns by sorted
# label, whereas Series.unique() returns labels in order of appearance, so
# sorting here keeps the tick labels attached to the right rows/columns.
class_labels = np.unique(df[var])
print(class_labels.dtype)

# Confusion matrix comparing y_test with y_pred. Passing `labels` pins the
# row/column order and keeps one row/column per class even when a class is
# missing from the test split.
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# Plot the confusion matrix. fmt='d' prints the counts as plain integers
# (the default '.2g' switches to scientific notation for counts >= 100), and
# the tick labels are handed to the heatmap so each cell row/column is named
# after its class.
plt.figure(figsize=(5, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')

# Display the plot
plt.show()