# References

This notebook is adapted from my lecture notebook:

https://colab.research.google.com/drive/12LHs9cL8-gXKr_ypaNfnwaO9bLF3I9tQ?usp=sharing

In [None]:
%load_ext autoreload
%autoreload 2

import uproot
import awkward as ak

import matplotlib.pylab as plt
import numpy as np

import time

from hist import Hist

import babar_analysis_tools as bat

from analysis_variables import *

import myPIDselector

import pandas as pd
import seaborn as sns

In [None]:
# ----------------------------------------------------------------------
# Input location: enable exactly one `topdir` for the machine in use.
# ----------------------------------------------------------------------
# At Siena
#topdir = '/mnt/qnap/babar_data/bnv_plambda'

# Josie's laptop
#topdir = "/Users/josieswann/Desktop/important documents"

# On Bellis' laptop
#topdir = '/home/bellis/babar_data/bnv_plambda/'
#topdir = './'

# At Bellis' home (currently active)
topdir = '/home/bellis/babar_data/bnv_plambda'

# ----------------------------------------------------------------------
# SP sample (background + signal modes), read into `data`.
# ----------------------------------------------------------------------
IS_MC = True

filename = f'{topdir}/Background_and_signal_SP_modes_Only_Run_1.parquet'
#filename = f'{topdir}/Background_and_signal_SP_modes_All_runs.parquet'

start = time.time()
data = ak.from_parquet(filename)
print(f"Took {time.time() - start} s")

# ----------------------------------------------------------------------
# Blinded collision data, read into `data_collision`.
# The #''' markers let the whole section be switched off by uncommenting them.
# ----------------------------------------------------------------------
#'''
#filename = f'{topdir}/Background_SP_modes_Only_Run_1.parquet'
filename = f'{topdir}/Data_Only_Run_1_BLINDED.parquet'
#filename = f'{topdir}/Data_All_runs_BLINDED.parquet'

start = time.time()
data_collision = ak.from_parquet(filename)
print(f"Took {time.time() - start} s")

print(type(data_collision))
#'''


In [None]:
# Cross-section information: the inputs used later to compute the
# per-SP-mode scaling values for the histograms.
dataset_information = pd.read_csv("dataset_statistics.csv")
cs_data = pd.read_csv("SP_cross_sections_and_labels.csv")

# Columns that only carry uncertainties / a free-text note; dropped to get a
# compact table for display.
unwanted_columns = [
    "Uncertainty",
    "Note: cross sections found at https://babar-wiki.heprc.uvic.ca/bbr_wiki/index.php/Physics/Cross_sections,_luminosities,_and_other_vital_stats",
]
no_notes = cs_data.drop(columns=unwanted_columns)
no_notes

In [None]:
# List the distinct SP-mode labels present in the sample.
sp = data['spmode']

spmode_values = sp.to_list()
np.unique(spmode_values)

In [None]:
# Display the region definitions used for the cuts below.
# NOTE(review): `region_definitions` is not defined in this notebook; presumably it
# comes from the star-import of `analysis_variables` — confirm.
region_definitions

In [None]:
# NOTE(review): presumably this adds the 'BtagSideMes' field to `data` in place
# (the field is read on the next line and used later) — confirm in babar_analysis_tools.
bat.fill_new_entry_with_tag_side_B(data)
# Display the field as a quick check.
data['BtagSideMes']



In [None]:
# Book the (empty) histograms described by hist_defs.
all_hists = bat.create_empty_histograms(hist_defs)

# SP modes handled in this notebook, as the string labels stored in 'spmode'.
bkg_spmodes = ['998', '1005', '1235', '1237', '3981']
sig_spmodes = ['-999']
spmodes = [*bkg_spmodes, *sig_spmodes]

# Per-mode scaling value computed by bat.scaling_value from the dataset
# statistics and the cross-section table.
weights = {}
for sp in spmodes:
    weights[sp] = bat.scaling_value(
        int(sp),
        dataset_information=dataset_information,
        cs_data=cs_data,
        plot=False,
        verbose=False,
    )

# Manual overrides: scale the signal higher, and give mode '0' unit weight.
weights.update({'-999': 1000, '0': 1})

print(weights)
print()
print(spmodes)

In [None]:
# Build the cut masks once so that the same masks (including the original
# duplicates mask) can be reused for any cuts generated outside the function.
dcuts = bat.get_final_masks(data, region_definitions=region_definitions)

# Show the available cut indices as a plain list (the original wrapped the
# dict_keys view in a one-element list, which only obscured the output).
print(list(dcuts))
print()

# One line per cut: integer index followed by its human-readable name.
for key, cut in dcuts.items():
    print(f'{key:3d} {cut["name"]}')



In [None]:
################################################################################
# Make the masks
# Pick which cut stage to export; alternatives are kept for quick switching.
mask_event = dcuts[3]['event']
#mask_event = dcuts[4]['event']
#mask_event = dcuts[1]['event']
#mask_event = dcuts[-1]['event']
#mask_event = dcuts[2]['event'] & dcuts[3]['event'] & dcuts[4]['event']

# Tag embedded in the output file name so different cut stages do not
# overwrite each other.
#tag = "FINAL_CUTS"
tag = "EARLY_CUT"

mask = mask_event
################################################################################
# Variables copied into the flat table ('spmode' first: it is the label).
subset = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass', \
          'BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS', \
          'R2', 'R2All', \
          'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll', 'sphericityAll', \
          'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2', 'BR2ROE', 'BSphrROE', \
          'BThrustROE']

# Type of a top-level field; an entry of this type inside a field means the
# field is nested (one sub-array per event).
ak_array_type = type(data['spmode'])

# Apply the event mask once instead of once (or twice) per variable.
selected = data[mask]

df_dict = {}
for var in subset:
    x = selected[var]

    # If this is nested, then flatten it.  The length guard avoids an
    # IndexError on x[0] when the mask selects no events.
    if len(x) > 0 and isinstance(x[0], ak_array_type):
        x = ak.flatten(x)

    df_dict[var] = x

# Assemble the flat table
df_out = pd.DataFrame.from_dict(df_dict)

# Write it.  The f-prefix is required: without it the file was literally
# named "output_variables_{tag}.parquet" regardless of the tag.
outfilename = f"output_variables_{tag}.parquet"
df_out.to_parquet(outfilename)

df = df_out

df_out

In [None]:
# Number of non-null entries in every column, grouped by SP mode.
df.groupby('spmode').count()

In [None]:
# Column names of the exported table.
df.columns

In [None]:
# Per-column dtype and non-null summary.
df.info()

In [None]:
# Pair plot of the two kinematic variables for the signal sample only.
# (Named is_signal rather than `filter` so the builtin is not shadowed.)
is_signal = df['spmode'] == '-999'
signal_rows = df[is_signal]

# Plot a random subsample; cap the size so a small selection does not make
# DataFrame.sample raise.
n_points = min(500, len(signal_rows))

g = sns.PairGrid(signal_rows.sample(n_points), vars=['BpostFitMes', 'BpostFitDeltaE'], hue='spmode')
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)

In [None]:
# Keep the column index around; later cells slice it to choose plot variables.
columns = df.columns

print(columns)

In [None]:
# Pair plot of the first few variables for everything that is not signal.
# (Named is_background rather than `filter` so the builtin is not shadowed.)
is_background = df['spmode'] != '-999'
background_rows = df[is_background]

# Plot a small random subsample; cap the size so a small selection does not
# make DataFrame.sample raise.
n_points = min(50, len(background_rows))

#g = sns.PairGrid(background_rows.sample(500), vars=['BpostFitMes', 'BpostFitDeltaE'], hue='spmode')
g = sns.PairGrid(background_rows.sample(n_points), vars=columns[1:6], hue='spmode')

g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

# Neural net

In [None]:
# Importing the necessary sklearn libraries
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score 

from sklearn.metrics import roc_curve, auc


In [None]:
columns = df.columns

print(columns)

# Everything after the first column; 'spmode' (the label) is first because
# the table was built in the order of the `subset` list.
feature_names = columns[1:]
print(feature_names)

In [None]:
# Non-null 'R2' entries per SP mode (a quick look at how many usable rows each mode has).
df.groupby('spmode').count()['R2']

In [None]:
# Extract features and target variables
#
# Build a balanced two-class sample: signal ('-999') against one background
# mode ('998'), with rows containing NaN removed.
#filter = (df['spmode'] == '-999') | (df['spmode'] == '998')
#df_ML = df[filter].dropna().sample(10000)

filter_sig = df['spmode'] == '-999'
filter_bkg = df['spmode'] == '998'

sig_rows = df[filter_sig].dropna()
bkg_rows = df[filter_bkg].dropna()

# Same number of rows from each class.  Capped at what is available so that
# DataFrame.sample does not raise on a small selection, and seeded (same
# value as the train/test split) so the notebook is reproducible run to run.
n_per_class = min(2000, len(sig_rows), len(bkg_rows))
sample_seed = 4

df_sig = sig_rows.sample(n_per_class, random_state=sample_seed)
df_bkg = bkg_rows.sample(n_per_class, random_state=sample_seed)

df_ML = pd.concat([df_sig, df_bkg])


# Full list of exported variables, kept for reference when editing
# vars_to_drop below.
all_vars = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass',
       'BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS', 'R2', 'R2All',
       'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll',
       'sphericityAll', 'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2',
       'BR2ROE', 'BSphrROE', 'BThrustROE']

# Columns removed before training: the label itself plus every variable not
# used as an input feature in this pass.
#vars_to_drop = ["spmode", 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass']
vars_to_drop = ["spmode", 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass','BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS', 'R2', 'R2All',
       'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll',
       'sphericityAll', 'BCosSphr', 'BCosThetaT', 'BCosThrust' ]


x = df_ML.drop(columns=vars_to_drop)
y = df_ML["spmode"]


# Save the feature name and target variables
feature_names = x.columns
labels = y.unique()

print("We will train using the following features")
print(feature_names)
print()

print("Our labels (Outcome) are")
print(labels)
print()

print("The dataset (x) is the numbers, without the column names")
print(x)
print()

print("The variable y holds the 'truth' information about each sample")
print(y)
print()


In [None]:
# Splitting the data into test and train

# Split: 40% of the rows are held out for testing; the fixed random_state
# makes the split itself reproducible.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.40, random_state=4)




In [None]:
# Inspect the training features.
X_train

In [None]:
# Inspect the training labels.
y_train

In [None]:
# How many training rows fall in each class (signal first, then background).
for label in ('-999', '998'):
    print(len(y_train[y_train == label]))

In [None]:
# Build a multi-layer-perceptron classifier with scikit-learn's defaults
model = MLPClassifier()

# Fit it to the training features and their truth labels
model.fit(X=X_train, y=y_train)

In [None]:
# Predict a label for every held-out test row.
y_pred = model.predict(X_test)

# Compare the predictions with the truth: confusion matrix and accuracy (in percent).
confusion_mat = confusion_matrix(y_true=y_test, y_pred=y_pred)
accuracy = 100 * accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Report the test-sample accuracy and the raw confusion matrix.
print("Accuracy for Neural Network is:", accuracy)
print("Confusion Matrix", confusion_mat, sep="\n")


In [None]:
# Turn the confusion matrix into a labelled dataframe.
#
# confusion_matrix (called without `labels`) orders rows/columns by the
# sorted class labels, which is the same ordering as model.classes_.  Using
# that for the index/columns ties each tick label to the right row/column,
# instead of relying on the order in which labels happen to appear in `y`.
class_labels = list(model.classes_)
matrix_df = pd.DataFrame(confusion_mat, index=class_labels, columns=class_labels)

# Plot the result
fig, ax = plt.subplots(figsize=(10,7))

sns.set(font_scale=1.3)

# seaborn takes the tick labels from the dataframe's index and columns.
sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")

# Formatting details here
# Set axis titles
ax.set_title('Confusion Matrix - MLP')
ax.set_xlabel("Predicted label", fontsize =15)
ax.set_ylabel("True Label", fontsize=15)
# Keep the row labels horizontal.
ax.tick_params(axis='y', labelrotation=0)
plt.show()

In [None]:

# Get the predictions for the training and testing samples
#
# `decisions` ends up as [bkg(train), sig(train), bkg(test), sig(test)],
# where background is SP mode '998' and signal is SP mode '-999'.
# Column 1 of predict_proba is the probability of model.classes_[1].
decisions = []
for X, y in ((X_train, y_train), (X_test, y_test)):

    # Use the truth label to split each sample into background and signal
    d1 = model.predict_proba(X[y == '998'])[:, 1]
    d2 = model.predict_proba(X[y == '-999'])[:, 1]
    decisions += [d1, d2]

# Use this for the histogram ranges
low = min(np.min(d) for d in decisions)
high = max(np.max(d) for d in decisions)
low_high = (low, high)

# Make a plot of the training sample predictions (filled histograms)
bins = 50
plt.figure(figsize=(12, 6))
plt.hist(decisions[0],
          color='r', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Background 998 (train)')
plt.hist(decisions[1],
          color='b', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Signal -999 (train)')

# Overlay the testing samples as points with error bars.
test_samples = (
    (decisions[2], 'r', 'Background 998 (test)'),
    (decisions[3], 'b', 'Signal -999 (test)'),
)
for scores, colour, label in test_samples:
    hist, edges = np.histogram(scores, density=True,
                               bins=bins, range=low_high)

    # `scale` converts the density back to raw counts, so the error is
    # sqrt(counts) expressed in density units.  It must use the size of the
    # sample being plotted (the signal points previously reused the
    # background sample size).
    scale = len(scores) / sum(hist)
    err = np.sqrt(hist * scale) / scale

    center = (edges[:-1] + edges[1:]) / 2

    plt.errorbar(center, hist, yerr=err, fmt='o', c=colour, label=label)

plt.xlabel("Classifer output")
plt.ylabel("Arbitrary units")
plt.legend(loc='best')

In [None]:
#decisions
#y_test

print(y_test)

# Integer truth flag for the ROC curve: 0 for signal ('-999'), 1 for everything else.
sig_bkg = np.where(y_test == '-999', 0, 1)

print(sig_bkg)

In [None]:
# Classifier score for every test row (second probability column).
decisions = model.predict_proba(X_test)[:, 1]

# ROC curve and the area under it, using the 0/1 truth flags in sig_bkg
fpr, tpr, thresholds = roc_curve(sig_bkg, decisions)
roc_auc = auc(fpr, tpr)

# Small margin around the unit square so the curve's end points stay visible.
axis_limits = (-0.05, 1.05)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc))

# Diagonal reference line
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
plt.xlim(axis_limits)
plt.ylim(axis_limits)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.grid()
plt.show()


In [None]:
# Which labels are present in the test sample.
y_test.unique()

In [None]:
# Test features plus their truth label in one table, for plotting.
# `.values` attaches the labels by position rather than by index.
df_plot = X_test.assign(spmode=y_test.values)

# Sanity check: all three lengths should agree.
print(len(X_test), len(y_test))
print(len(df_plot))

In [None]:
# One histogram per feature, split by SP mode.
# 'spmode' is only the hue: histogramming the label against itself carries
# no information, so it is left out of the panels.
plot_columns = [col for col in df_plot.columns if col != 'spmode']

# Size the grid from the number of features instead of a fixed 5x4 layout.
ncols = 4
nrows = max(1, -(-len(plot_columns) // ncols))    # ceiling division

# squeeze=False keeps `axes` two-dimensional even for a single row.
fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)
axes = axes.flatten()         # 1d array of nrows*ncols axes
fig.set_size_inches(15, 3.75 * nrows)

for ax, col in zip(axes, plot_columns):
    # common_norm=False normalises each SP mode separately so the shapes can
    # be compared regardless of sample size.
    sns.histplot(df_plot, x=col, ax=ax, hue='spmode', stat='density', common_norm=False)
    ax.set_title(col)

# Hide any panels left over in the last row.
for ax in axes[len(plot_columns):]:
    ax.set_visible(False)

plt.tight_layout()