In [95]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle as pkl
import seaborn as sns

from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_curve, auc 
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import tensorflow as tf
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

import random

import warnings
warnings.filterwarnings('ignore')

In [96]:
data = pd.read_csv('real_data.csv').drop(columns='Unnamed: 0')
data

Unnamed: 0,mjjs,dyjjs,pt_asyms,lead_etas,sub_etas,lead_ungrs,sub_ungrs,lead_ntrk,sub_ntrk
0,1130.073853,0.493658,113.070160,1.011049,1.521789,29,26,-35.905251,-44.798538
1,2449.469238,1.885328,45.750488,-0.200387,1.692164,45,41,-28.666266,-42.613891
2,1430.661377,1.006497,176.988586,0.312018,1.330938,33,49,-34.928000,-25.952763
3,1950.014404,1.921846,10.478271,-1.455257,0.545058,36,38,-35.587213,-42.304083
4,1432.149048,0.957917,80.403076,0.475771,-0.528239,37,43,-30.941180,-31.971276
...,...,...,...,...,...,...,...,...,...
1999995,1266.208740,0.307657,106.955811,-0.175908,-0.515509,30,65,-36.364752,-7.783872
1999996,2796.294678,1.515204,304.963562,-0.877777,0.652355,62,41,-12.558663,-44.059415
1999997,1558.986206,0.614022,449.194580,0.225624,0.864394,77,19,7.995984,-57.479116
1999998,1763.242310,0.211903,68.324036,-0.222037,-0.009930,21,61,-49.476015,-17.625761


In [125]:
data_features = data.iloc[:, :-2]

In [126]:
df = pd.read_csv('df_bkg_red.csv').drop(columns = 'Unnamed: 0')
models = ['A', 'B', 'C', 'D']
for i in models:
    df = pd.concat([df, pd.read_csv('df_model%s_red.csv'%(i)).drop(columns = 'Unnamed: 0')], ignore_index=True)

df_bin = df.copy()
df_bin.labels = df.labels.replace([2, 3, 4], 1)
labels_bin = df_bin.labels
features = df.iloc[:, :-2]
labels = df.labels
x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)
_, _, y_train_bin, y_test_bin = train_test_split(features, labels_bin, test_size=0.2, random_state=42)
x_test_control = x_test[(x_test.lead_ntrk < 0)]
x_test_control = x_test_control[(x_test_control.sub_ntrk < 0)]
x_train_control = x_train[(x_train.lead_ntrk < 0)]
x_train_control = x_train_control[(x_train_control.sub_ntrk < 0)]

y_test_control = y_test[x_test_control.index]
y_test_control_bin = y_test_bin[x_test_control.index]
y_train_control = y_train[x_train_control.index]
y_train_control_bin = y_train_bin[x_train_control.index]

In [127]:
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train.iloc[:, :7])
x_test = scaler.fit_transform(x_test.iloc[:, :7])

x_train_control = scaler.fit_transform(x_train_control.iloc[:, :7])
x_test_control = scaler.fit_transform(x_test_control.iloc[:, :7])

data_scaled = scaler.fit_transform(data.iloc[:, :7])
data_cont_scaled = scaler.fit_transform(data_features.iloc[:, :7])

In [12]:
bdt = pkl.load(open('bdt_optimized_red.pkl', 'rb'))
bdt_bin = pkl.load(open('bdt_bin_optimized_red.pkl', 'rb'))

In [128]:
tree = DecisionTreeClassifier(criterion='entropy',
                              max_depth=15,
                            min_samples_leaf=0.36931642349065275,
                           min_samples_split=0.7479086208293507)
bdt = AdaBoostClassifier(tree, 
                         learning_rate=0.9912684934237729, 
                         n_estimators=80)
bdt_bin = AdaBoostClassifier(tree, 
                         learning_rate=0.67, 
                         n_estimators=80)

In [129]:
bdt_bin.fit(x_train, y_train_bin)

In [130]:
def numbers():
    sig_mc = 0
    bkg_mc = 0
    for i in mc_preds:
        if i==1:
            sig_mc+=1
        else:
            bkg_mc+=1
        
    sig_data = 0
    bkg_data = 0
    for i in data_preds_red:
        if i==1:
            sig_data+=1
        else:
            bkg_data+=1
    
    #cols = ['Class', 'Percentage of MC Events Found', 'Percentage of Data Events Found']
    classes = ['Signal', 'Background']
    mc_pc = [round(sig_mc/len(mc_preds)*100, 2), round(bkg_mc/len(mc_preds)*100, 2)]
    data_pc = [round(sig_data/len(data_preds_red)*100, 2), round(bkg_data/len(data_preds_red)*100, 2)]
    tmp = {'Class':classes, 'Percentage of MC Events Found':mc_pc, 'Percentage of Data Events Found':data_pc}
    return pd.DataFrame.from_dict(tmp)

In [131]:
mc_preds = bdt_bin.predict(x_test)

In [132]:
data_preds = bdt_bin.predict(data_scaled)

In [133]:
data_preds_red = random.sample(list(data_preds), k=len(mc_preds))

In [134]:
df = numbers()
df

Unnamed: 0,Class,Percentage of MC Events Found,Percentage of Data Events Found
0,Signal,55.55,98.34
1,Background,44.45,1.66


In [135]:
nbc = pkl.load(open('gaussian_bayes_red.pkl', 'rb'))
nbc_bin = pkl.load(open('gaussian_bayes_bin_red.pkl', 'rb'))

In [136]:
nbc_bin.fit(x_train, y_train_bin)

In [137]:
mc_preds = nbc_bin.predict(x_test)

In [138]:
data_preds = nbc_bin.predict(data_scaled)

In [139]:
data_preds_red = random.sample(list(data_preds), k=len(mc_preds))

In [140]:
df = numbers()
df

Unnamed: 0,Class,Percentage of MC Events Found,Percentage of Data Events Found
0,Signal,50.34,98.47
1,Background,49.66,1.53


In [141]:
def build_clf():
    ann = Sequential()
    ann.add(Dense(units = 7, activation = 'relu'))
    ann.add(Dense(units = 28,#best_params['neurons'], 
                  activation = 'relu'))
    ann.add(Dense(units = 1, activation = 'sigmoid'))
    ann.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.01), 
                loss = 'binary_crossentropy', metrics = ['accuracy'])
    return ann

In [142]:
ann_bin = KerasClassifier(build_fn=build_clf)

In [143]:
ann_bin.fit(x_train, y_train_bin, 
        batch_size = 64, #best_params['batch_size'], 
        epochs = 5, 
        validation_split = 0.33)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2478d94fd60>

In [144]:
mc_preds = ann_bin.predict(x_test)



In [145]:
data_preds = ann_bin.predict(data_scaled)



In [146]:
data_preds_red = random.sample(list(data_preds), k=len(mc_preds))

In [147]:
df = numbers()
df

Unnamed: 0,Class,Percentage of MC Events Found,Percentage of Data Events Found
0,Signal,52.37,99.16
1,Background,47.63,0.84
