In [2]:
#importing libraries 
import numpy as np 
import matplotlib.pyplot as plt 
import pandas as pd

#neural network libraries
from tensorflow import keras 
from keras import Sequential


#XGBoost library
import xgboost as xgb

#evaluation code from evaluation.py
import evaluation



In [None]:
# Load the datasets used throughout the notebook.
# File names are relative to the notebook's working directory, i.e. the
# folder that holds the zipped CSV files; pandas reads each .zip directly.

# Test data
data_test = pd.read_csv("test.csv.zip")

# Training data (contains the 'signal' label column used for fitting)
data_train = pd.read_csv("training.csv.zip")

# Second, independent frame of the same training data, used for the
# exploratory plots. Copying the frame that is already in memory avoids
# decompressing and parsing the same archive a second time.
data_training = data_train.copy()

# Agreement-check data (check_agreement.csv)
data_check = pd.read_csv('check_agreement.csv.zip')

# Correlation-check data (check_correlation.csv)
data_correlation = pd.read_csv('check_correlation.csv.zip')




In [11]:

# Columns excluded from the model inputs (secondary training dataset).
variables = [
    'id', 'min_ANNmuon', 'production', 'mass', 'signal',
    'SPDhits', 'dira', 'DOCAone',
    'FlightDistance', 'FlightDistanceError',
    'p1_p', 'p2_p', 'p0_eta', 'p2_eta',
]
# Feature matrix: every training column except the excluded ones.
data_variables = data_train.drop(columns=variables)
# Target column the models are fitted against.
train_labels = data_train['signal']

In [None]:
# Histogram plots for background and signal events from the training dataset.
# One figure per column, with the two event classes overlaid.
signal = data_training.loc[data_training['signal'] == 1]
background = data_training.loc[data_training['signal'] == 0]

for column in signal.columns:
    # Use one set of bin edges, derived from the whole column, for both
    # samples. Otherwise each histogram picks its own range and the two
    # distributions cannot be compared bar-for-bar.
    bin_edges = np.histogram_bin_edges(data_training[column], bins=10)

    fig, ax = plt.subplots()
    # Semi-transparent bars keep the first histogram visible underneath the
    # second one instead of being painted over.
    ax.hist(signal[column], bins=bin_edges, alpha=0.5,
            label=('signal event '+ column))
    ax.hist(background[column], bins=bin_edges, alpha=0.5,
            label=('background event '+ column))
    ax.set_xlabel(column)
    ax.set_ylabel('events')
    ax.legend(loc="best")
    plt.show()

In [None]:
# Final Keras neural network: one hidden layer of 10 swish units feeding a
# single sigmoid output, trained with binary cross-entropy.
#
# The container is built with keras.Sequential so that the model, its layers
# and the optimizer all come from the same `keras` namespace (the one imported
# from tensorflow). Mixing it with a Sequential class imported from a separate
# `keras` package can fail when the two packages are not the same build.
model = keras.Sequential([
    keras.layers.Dense(10, activation='swish'),
    keras.layers.Dense(1, activation='sigmoid'),
])

# Optimizer
adam = keras.optimizers.Adam(learning_rate=0.005)

model.compile(loss='binary_crossentropy', optimizer=adam)
# NOTE(review): the features are passed to the network unscaled; confirm
# whether standardising the inputs was intentionally left out.
model.fit(data_variables, train_labels, epochs=20, shuffle=True)


In [21]:
# Columns removed from both evaluation frames, on top of the frame-specific
# bookkeeping columns listed in each drop below.
_common_drops = [
    'SPDhits', 'dira', 'DOCAone',
    'FlightDistance', 'FlightDistanceError',
    'p1_p', 'p2_p', 'p0_eta', 'p2_eta',
]

# Filtering variables from check_agreement.csv (data_check)
variables_check = data_check.drop(columns=['signal', 'id', 'weight'] + _common_drops)
check_lables = data_check['signal']
check_weights = data_check['weight']

# Filtering variables from check_correlation.csv (data_correlation)
variables_correlation = data_correlation.drop(columns=['id', 'mass'] + _common_drops)



In [None]:
# Calculating performance scores for the Keras network

# Flatten both prediction arrays to 1-D so that each row has exactly one
# score. The network has a single output unit, so predict() is expected to
# return an (n, 1) column; previously only the correlation predictions were
# flattened, leaving the agreement predictions 2-D after boolean masking.
agreement_prob = model.predict(variables_check).reshape(-1,)
correlation_prob = model.predict(variables_correlation)
correlation_prob = correlation_prob.reshape(-1,)

# Row masks and weights for the agreement sample, computed once.
_check_signal = data_check['signal'].values
_check_weight = data_check['weight'].values
_is_background = _check_signal == 0
_is_signal = _check_signal == 1

#KS test for Keras neural network
# Argument order: signal == 0 rows first, then signal == 1 rows, each
# followed later by its matching weights.
ks_keras = evaluation.compute_ks(
    agreement_prob[_is_background],
    agreement_prob[_is_signal],
    _check_weight[_is_background],
    _check_weight[_is_signal])
# 0.09 is presumably the pass threshold for the KS metric - TODO confirm.
print ('KS metric', ks_keras, ks_keras < 0.09)

#CvM test for Keras neural network
cvm_keras = evaluation.compute_cvm(correlation_prob, data_correlation['mass'])
# 0.002 is presumably the pass threshold for the CvM metric - TODO confirm.
print ('CvM metric', cvm_keras, cvm_keras < 0.002)

In [None]:
# Final XGBoost classifier
# Number of boosting rounds.
trees = 100
parameters = {"objective": "binary:logistic",
              "eta": 0.7,
              "max_depth": 12,
              "min_child_weight": 4,
              # "silent" was removed from XGBoost (and 2 was never a valid
              # value); "verbosity": 0 is the supported way to suppress
              # training messages.
              "verbosity": 0,
              "seed": 3}
# Pass the label and the round count by keyword so the call does not depend
# on positional parameter order.
model_xgb = xgb.train(
    parameters,
    xgb.DMatrix(data_variables, label=train_labels),
    num_boost_round=trees,
)

In [None]:
# Performance scores for the XGBoost classifier
xgb_agreement_pred = model_xgb.predict(xgb.DMatrix(variables_check))
xgb_correlation_pred = model_xgb.predict(xgb.DMatrix(variables_correlation))

# Split the agreement sample by label once and reuse the masks below.
_xgb_signal_flags = data_check['signal'].values
_xgb_weights = data_check['weight'].values
_xgb_bkg_rows = _xgb_signal_flags == 0
_xgb_sig_rows = _xgb_signal_flags == 1

# KS test for the XGBoost model (signal == 0 rows first, then signal == 1)
ks_xgboost = evaluation.compute_ks(
    xgb_agreement_pred[_xgb_bkg_rows],
    xgb_agreement_pred[_xgb_sig_rows],
    _xgb_weights[_xgb_bkg_rows],
    _xgb_weights[_xgb_sig_rows],
)
print ('KS Test Score =', ks_xgboost)

# CvM test for the XGBoost model, evaluated against the 'mass' column
_correlation_mass = data_correlation['mass']
cvm_xgboost = evaluation.compute_cvm(xgb_correlation_pred, _correlation_mass)
print ('CvM Test Score = ', cvm_xgboost)

In [None]:
# Summary table of the KS and CvM scores for both models.
from tabulate import tabulate

# One key per table column; row order is Keras first, then XGBoost.
_score_columns = {
    'Model Type': ['Keras', 'XGboost'],
    'KS Test': [ks_keras, ks_xgboost],
    'CvM Metric': [cvm_keras, cvm_xgboost],
}
print(tabulate(_score_columns, headers="keys", tablefmt='fancy_grid'))

In [None]:
# Comparing model performance with ROC curves.
# NOTE(review): both curves are scored on data_check (the agreement-check
# sample), not on a held-out part of the training data - confirm that this is
# the intended evaluation set for comparing the classifiers.

import sklearn.metrics as metrics

# Keras evaluation metrics
probs_k = model.predict(variables_check)
# False/true positive rates for the Keras network. The prediction column is
# flattened so roc_curve receives one score per row, and the same label
# series (check_lables) is used for both models.
fpr_k, tpr_k, threshold_k = metrics.roc_curve(check_lables, probs_k.ravel())
# AUC value for the Keras network
roc_auc_k = metrics.auc(fpr_k, tpr_k)

# XGBoost evaluation metrics
pred_XGB = model_xgb.predict(xgb.DMatrix(variables_check))
# False/true positive rates for XGBoost
fpr_xgb, tpr_xgb, threshold = metrics.roc_curve(check_lables, pred_XGB)
# AUC value for XGBoost
roc_auc_x = metrics.auc(fpr_xgb, tpr_xgb)

# Plotting ROC curves (pyplot is already imported as plt at the top)
plt.title('ROC Model Evaluation')
plt.plot(fpr_k, tpr_k, color='orange', label = 'Keras Model. AUC = %0.03f' % roc_auc_k)
plt.plot(fpr_xgb, tpr_xgb,color='blue',label = 'XGBoost. AUC=  %0.03f ' % roc_auc_x)
# Diagonal reference line: TPR equal to FPR.
plt.plot(np.linspace(0,1),np.linspace(0,1),'k--')

plt.legend(loc = 'best')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.show()
 