In [None]:
%load_ext autoreload
%autoreload 2

import uproot
import awkward as ak

import matplotlib.pylab as plt
import numpy as np

import time

from hist import Hist

import babar_analysis_tools as bat

from analysis_variables import *

import myPIDselector

import pandas as pd
import seaborn as sns

import ROOT
import pdf_definitions as pdfs

This one is tricky, because there are two options.

First, we need to define the signal window, something like 5.27 < m_ES < 5.285 GeV (you can fine-tune this).

Then you have two options:

1) Define the "bkg" region as 5.27 < m_ES < 5.285 GeV in the BKG MC. Then the relative size is the MC/data lumi ratio (i.e. the weight you need to multiply to rescale the MC to the data). There are 1 or 3 events in that window (this is difficult to say from the plots) and one signal event. I need to know the MC/data lumi ratio to do the calculation.

2) Define the bkg region as 5.20 < m_ES < 5.27 GeV in the DATA. Then the relative window size is 0.215 (≈ 0.015/0.07). There are two events in the bkg window. In that case, TRolke gives
TRolke tr;
tr.SetCL(0.9);
tr.SetPoissonBkgGaussEff(1,2,0.35,0.215,0.02);  // N_sig, N_bkg, efficiency, relative window size, sigma_efficiency (my guess for illustration)
tr.GetUpperLimit() = 1.8 events

If the background is well described by the MC, then I would go with option 1, since m_ES has a different shape in the bkg and sig regions (it dips in the signal region). If the MC is less than optimal, then switch to option 2. In any case, I would compare the two methods (but you won't be able to claim a blind analysis).

Hope this helps

# Read in the data

In [None]:
# Selection flavour: the empty tag / False pair is the BNV configuration;
# switch both lines together to "_BNC" / True for the BNC configuration.
BNC_tag = ""
BNC_bool = False
#BNC_tag = "_BNC"
#BNC_bool = True

# Tag identifying which trained network produced the 'proba' column.
ntrain_tag = 'features_4_nsig_30000_nbkg_30000_trial5'

# Read in the dfs
# NOTE(review): the file names end in .pkl but are read with read_parquet;
# presumably the files really are parquet -- confirm with whoever wrote them.
infilename_sp = f"DATAFRAME_SP_MODEL_MLPClassifier_CUTS_1_2_3_{ntrain_tag}{BNC_tag}.pkl"
infilename_col = f"DATAFRAME_COL_MODEL_MLPClassifier_CUTS_1_2_3_{ntrain_tag}{BNC_tag}.pkl"

df_sp = pd.read_parquet(infilename_sp)
df_col = pd.read_parquet(infilename_col)

########### FOM ############
df_fom = bat.punzi_fom_nn(df_sp, df_col, region_definitions=region_definitions, BNC=BNC_bool, sigma=4.0)

fom_max = df_fom['fom'].max()
print(fom_max)

# Row(s) at the maximum figure of merit. Named so that the builtin `filter`
# is no longer shadowed for the rest of the notebook.
fom_max_mask = df_fom['fom'] == fom_max

# A bare expression in the middle of a cell is never rendered, so the
# best-FOM row(s) are displayed explicitly.
display(df_fom[fom_max_mask])
###########################
print()

# Take the first row at the maximum; .loc avoids chained indexing.
max_proba_cut = df_fom.loc[fom_max_mask, 'thresh'].iloc[0]
print(f'max_proba_cut: {max_proba_cut}')
sig_eff = df_fom.loc[fom_max_mask, 'sig_pct'].iloc[0]
print(f'sig_eff: {sig_eff}')


In [None]:
# BNV
# M_ES distributions for SP background, SP signal and collision data after the
# NN-output cut and a tight |Delta E| < 0.05 window, one panel per sample.
save_dir = './BNV_pLambda_plots/'
print(f'{max_proba_cut = }')

fig, axes = plt.subplots(3, 1, sharex=True, figsize=(8, 8))

labels = ['SP - bkg', 'SP - sig', 'Collision data']

#var = 'proba'
var = 'BpostFitMes'

# (frame, sample mask) per panel. Every mask is built on the same frame it is
# applied to, so no boolean key has to be reindexed by pandas.
panels = [
    # Background SP: everything that is not the signal mode, excluding events
    # used to train the network.
    (df_sp, (df_sp['spmode'] != '-999') & (~df_sp['used_in_bkg_train'])),
    # Signal SP: signal mode only, excluding events used to train the network.
    (df_sp, (df_sp['spmode'] == '-999') & (~df_sp['used_in_sig_train'])),
    # Collision data.
    (df_col, (df_col['spmode'] == '0')),
]

for ax, label, (df_tmp, mask) in zip(axes, labels, panels):

    mask = mask & (df_tmp['cut_-1'] == True)
    if BNC_bool:
        print("Making BNC cuts")
        mask = mask & (df_tmp['cut_2'] == True) & (df_tmp['cut_3'] == True) & (df_tmp['cut_4'] == True)

    mask = mask & (df_tmp['proba'] > max_proba_cut)

    mask = mask & (df_tmp['BpostFitDeltaE'] < 0.05) & (df_tmp['BpostFitDeltaE'] > -0.05)

    df_tmp.loc[mask, var].hist(bins=50, range=(5.2, 5.3), label=label, ax=ax)
    ax.legend()

axes[2].set_xlabel(r'$M_{ES}$ (GeV/c$^2$)', fontsize=18)

plt.tight_layout()

# `tag` is reused by the later scatter-plot cell when naming its output file.
tag = "UPPER_LIMIT_CALCS"
plt.savefig(f'{save_dir}/mes_tight_de_probcut_{max_proba_cut:.2f}_{tag}{BNC_tag}.png')

In [None]:
# How many spmode-998 events passing cuts 2, 3 and 4 were / were not used in
# the background training sample.
mask = (
    (df_sp['cut_2'] == True)
    & (df_sp['cut_3'] == True)
    & (df_sp['cut_4'] == True)
    & (df_sp['spmode'] == '998')
)
df_sp.loc[mask, 'used_in_bkg_train'].value_counts()

In [None]:
# Delta E vs M_ES scatter plots for SP background, SP signal and collision
# data after the NN-output cut, followed by the M_ES values of the collision
# events that fall inside the tight Delta E window.
fig, axes = plt.subplots(1, 3, figsize=(12, 4))

# NN-output threshold. The original BNC branch assigned an undefined name
# (`max_cut`), which raised NameError whenever BNC_bool was True; both
# configurations use the FOM-optimised threshold.
proba_cut = max_proba_cut

# Tight Delta E window, drawn as dashed guide lines on every panel.
deloline, dehiline = -0.05, 0.05

# Loose |Delta E| range used to select the plotted events.
#de_cut = 0.07
de_cut = 0.2


def _de_vs_mes_selection(df, base_mask):
    """Return `base_mask` combined with the cuts shared by all three panels.

    Applies the BNC cuts (cut_2, cut_3, cut_4) or the 'cut_-1' flag depending
    on BNC_bool, then M_ES > 5.20, |Delta E| < de_cut and proba > proba_cut.
    """
    if BNC_bool:
        sel = base_mask & (df['cut_2'] == True) & (df['cut_3'] == True) & (df['cut_4'] == True)
    else:
        sel = base_mask & (df['cut_-1'] == True)
    sel = sel & (df['BpostFitMes'] > 5.20)
    sel = sel & (df['BpostFitDeltaE'] < de_cut) & (df['BpostFitDeltaE'] > -de_cut)
    return sel & (df['proba'] > proba_cut)


def _draw_de_window(ax):
    """Draw the lower and upper edges of the tight Delta E window on `ax`."""
    for level in (deloline, dehiline):
        ax.plot([5.2, 5.29], [level, level], 'r--', lw=3)


# SP bkg: all non-signal modes, excluding events used in training.
mask = _de_vs_mes_selection(
    df_sp, (df_sp['spmode'] != '-999') & (~df_sp['used_in_bkg_train'])
)
df_sp[mask & (df_sp['spmode'] == '998')].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[0])
df_sp[mask & (df_sp['spmode'] == '1005')].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[0], c='orange')
_draw_de_window(axes[0])
axes[0].set_title(f'Bkg SP (NN > {proba_cut:.2f})')

# SP sig
mask = _de_vs_mes_selection(df_sp, (df_sp['spmode'] == '-999'))
df_sp[mask].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[1], s=0.1, alpha=0.1)
_draw_de_window(axes[1])
axes[1].set_ylim(-0.2, 0.2)
axes[1].set_title(f'Sig SP (NN > {proba_cut:.2f})')

# Data
mask = _de_vs_mes_selection(df_col, (df_col['spmode'] == '0'))
df_col[mask].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[2])
_draw_de_window(axes[2])
axes[2].set_ylim(-0.2, 0.2)
axes[2].set_title(f'Collision data (NN > {proba_cut:.2f})')

plt.tight_layout()

# `save_dir` and `tag` come from the M_ES histogram cell.
plt.savefig(f'{save_dir}/sp_and_collision_de_vs_mes_probcut_{proba_cut:.2f}_{tag}{BNC_tag}.png')

# M_ES of the collision events inside the tight Delta E window
# (`mask` still holds the collision-data selection at this point).
mask_de = (df_col['BpostFitDeltaE'] < 0.05) & (df_col['BpostFitDeltaE'] > -0.05)
df_col.loc[mask & mask_de, 'BpostFitMes'].values

In [None]:
# Per-dataset statistics and SP cross sections, both consumed by
# bat.scaling_value in the weights cell.
dataset_information = pd.read_csv("dataset_statistics.csv")
cs_data = pd.read_csv("SP_cross_sections_and_labels.csv")

# Columns holding free-text notes / uncertainties, dropped for a compact view.
note_columns = [
    "Uncertainty",
    "Note: cross sections found at https://babar-wiki.heprc.uvic.ca/bbr_wiki/index.php/Physics/Cross_sections,_luminosities,_and_other_vital_stats",
]
no_notes = cs_data.drop(columns=note_columns)
no_notes


In [None]:
# Display the full cross-section table, including the note columns that were
# dropped from `no_notes`.
cs_data

In [None]:
# SP mode identifiers, kept as strings to match the 'spmode' column.
bkg_spmodes = ["998", "1005", "3981", "1235", "1237"]
sig_spmodes = ["-999"]

# Only the background modes are weighted here.
spmodes = bkg_spmodes#+sig_spmodes

# One value per SP mode, as returned by bat.scaling_value for that mode.
weights = {
    sp: bat.scaling_value(int(sp), dataset_information=dataset_information, cs_data=cs_data, plot=False, verbose=False)
    for sp in spmodes
}

weights

# #1

1) Define the "bkg" region as 5.27 < m_ES < 5.285 GeV in the BKG MC. Then the relative size is the MC/data lumi ratio (i.e. the weight you need to multiply to rescale the MC to the data). There are 1 or 3 events in that window (this is difficult to say from the plots) and one signal event. I need to know the MC/data lumi ratio to do the calculation.

In [None]:
# Option 1: count SP background events in the signal window
# (5.27 < M_ES <= 5.285, |Delta E| < 0.05, proba > max_proba_cut) and scale
# the count with the per-mode weights.

# SP bkg
mask = (df_sp['spmode'] != '-999')
# Not used in training
mask = mask & (~df_sp['used_in_bkg_train'])
if BNC_bool:
    mask = mask & (df_sp['cut_2'] == True) & (df_sp['cut_3'] == True) & (df_sp['cut_4'] == True)
else:
    mask = mask & (df_sp['cut_-1'] == True)
mask = mask & (df_sp['BpostFitMes'] > 5.27) & (df_sp['BpostFitMes'] <= 5.285)

mask = mask & (df_sp['BpostFitDeltaE'] < 0.05) & (df_sp['BpostFitDeltaE'] > -0.05)

mask = mask & (df_sp['proba'] > max_proba_cut)

events = df_sp.loc[mask, 'BpostFitMes']
print(events)
print()

wt = weights['998']
print(f'{wt = }\n')

n_bkg_sp_org = len(events)

# The selection keeps every non-signal SP mode, so each event is scaled by the
# weight of its own mode rather than by the '998' weight alone.
n_bkg_sp = 0.0
for mode, n_mode in df_sp.loc[mask, 'spmode'].value_counts().items():
    if mode not in weights:
        # Preserve the previous behaviour for modes without a computed weight,
        # but make the fallback visible.
        print(f"WARNING: no weight for spmode {mode!r}; falling back to the '998' weight")
    n_bkg_sp += weights.get(mode, wt) * n_mode

print(f'{n_bkg_sp_org = }    {n_bkg_sp = }')

In [None]:
# Feldman-Cousins interval for a hand-entered observation and background.
# NOTE: n_bkg_sp is overwritten here, replacing the scaled value computed in
# the previous cell.
nobs, n_bkg_sp = 2, 1

feldman_cousins = ROOT.TFeldmanCousins()
# The lower limit is read back after the upper-limit call, as in the original.
ul = feldman_cousins.CalculateUpperLimit(nobs, n_bkg_sp)
ll = feldman_cousins.GetLowerLimit()

print(f'{ul = }   {ll = }')

# #2 `TRolke`

https://root.cern.ch/root/html522/tutorials/math/Rolke.C.html

https://arxiv.org/pdf/0907.3450

In [None]:
# TRolke calculator instance shared by the cells below.
tr = ROOT.TRolke()


In [None]:
# Option 2: Rolke limit with a Poisson background estimate taken from the data
# sideband and a Gaussian uncertainty on the efficiency.
rolke_args = (
    1,      # N_sig
    2,      # N_bkg
    0.35,   # efficiency
    0.215,  # relative window size
    0.02,   # sigma_efficiency (my guess for illustration)
)
# NOTE(review): confirm TRolke's convention for the fourth argument (tau). If
# the model is y ~ Poisson(tau * b), tau would be the background/signal window
# ratio (0.07/0.015) rather than 0.215 -- TODO verify against the TRolke docs.

tr.SetCL(0.9)
tr.SetPoissonBkgGaussEff(*rolke_args)
tr.GetUpperLimit()# = 1.8 event



In [None]:
# Lower limit for the model configured in the previous cell.
tr.GetLowerLimit()

In [None]:
# TRolke::GetCriticalNumber(Int_t& ncrit, Int_t maxtry) writes its result into
# the first argument, so PyROOT needs a mutable C int there; a Python list is
# not accepted for an Int_t&. maxtry=-1 is kept from the original call.
import ctypes

ncrit = ctypes.c_int(0)
found = tr.GetCriticalNumber(ncrit, -1)

# (whether the call reported success, the value written into ncrit)
found, ncrit.value