In [None]:
%load_ext autoreload
%autoreload 2

import uproot
import awkward as ak
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

import time

from hist import Hist

import babar_analysis_tools as bat
from analysis_variables import *
import myPIDselector

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

import joblib

import math

# Some of the negative signs weren't being displayed on axes.
plt.rc('axes', unicode_minus=False)


In [None]:
# Configuration for this notebook run.
# BNC_tag is appended to input/output file names; BNC_bool switches the
# selection used further down between the 'cut_-1' flag (False) and the
# 'cut_2' & 'cut_3' & 'cut_4' flags (True).
BNC_tag = ""
BNC_bool = False
#ntrain_tag = 'nsig_20000_nbkg_20000'

#BNC_tag = "_BNC"
#BNC_bool = True

# ntrain_tag identifies which training output to load; exactly one assignment
# should be left uncommented.
#ntrain_tag = 'nsig_30000_nbkg_30000'
#ntrain_tag = 'nsig_30000_nbkg_30000_trial6'
#ntrain_tag = 'nsig_40000_nbkg_40000_trial0'
#ntrain_tag = 'nsig_40000_nbkg_40000_trial1'
#ntrain_tag = 'features_2_nsig_40000_nbkg_40000_trial0'
#ntrain_tag = 'nsig_60000_nbkg_60000_trial4'
#ntrain_tag = 'features_3_nsig_60000_nbkg_60000_trial1'
ntrain_tag = 'features_4_nsig_30000_nbkg_30000_trial15'
#ntrain_tag = 'features_2_nsig_30000_nbkg_30000_trial1'

# Read in the dfs
# NOTE(review): the files carry a '.pkl' suffix but are read with
# pd.read_parquet -- presumably they were written as parquet; confirm against
# the notebook/script that produced them.
infilename = f"DATAFRAME_SP_MODEL_MLPClassifier_CUTS_1_2_3_{ntrain_tag}{BNC_tag}.pkl"
df_sp = pd.read_parquet(infilename)

infilename = f"DATAFRAME_COL_MODEL_MLPClassifier_CUTS_1_2_3_{ntrain_tag}{BNC_tag}.pkl"
df_col = pd.read_parquet(infilename)



In [None]:
# Load the saved training workspace (a mapping with at least 'model',
# 'x_train', 'y_train', 'x_test', 'y_test', as used later in this notebook).
# NOTE(review): unlike the dataframe file names, this one has an extra '_'
# between ntrain_tag and BNC_tag -- confirm it matches how the file was saved.
# joblib.load unpickles the file, so only load files from a trusted source.
infilename = f'MODEL_MLPClassifier_CUTS_1_2_3_{ntrain_tag}_{BNC_tag}.pkl'
workspace = joblib.load(infilename)

In [None]:
# Display the feature names stored on the loaded model object.
workspace['model'].feature_names

In [None]:
#data.iloc[62]

In [None]:
#data.iloc[0:20].sample(frac=1, replace=True)

In [None]:
def generate_correlation_matrix_with_bootstrap_uncertainties(data, n_bootstraps=1000, random_state=None):
    """Correlation matrix of ``data`` with bootstrap uncertainties.

    The rows of ``data`` are resampled with replacement ``n_bootstraps`` times;
    the standard deviation of each correlation coefficient over the resamples
    is quoted as its uncertainty. The central values are the correlations of
    the full, un-resampled ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Columns to correlate.
    n_bootstraps : int
        Number of bootstrap resamples.
    random_state : int or numpy.random.RandomState, optional
        Seed (or generator) for the resampling. ``None`` (the default) keeps
        the previous behaviour of drawing from NumPy's global random state.

    Returns
    -------
    mean_correlations : pandas.DataFrame
        ``data.corr()``.
    correlation_matrix_with_uncertainties : pandas.DataFrame
        Same axes as ``mean_correlations``; each cell is the string
        ``"<corr> ± <std>"`` with two decimals.
    """
    # A single generator is shared by all resamples: seeding each call to
    # ``sample`` with the same integer would give identical resamples.
    if random_state is None or isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    # Stack of per-resample correlation matrices, shape (n_bootstraps, n, n).
    bootstrap_correlations = np.array([
        data.sample(frac=1, replace=True, random_state=rng).corr().values
        for _ in range(n_bootstraps)
    ])

    # Central value from the full sample; spread from the bootstrap replicas.
    mean_correlations = data.corr()
    std_correlations = np.std(bootstrap_correlations, axis=0)

    # Formatted "value ± uncertainty" strings, on the same axes as the
    # correlation matrix so the result can be used directly as an annotation.
    correlation_matrix_with_uncertainties = pd.DataFrame(
        [
            [f"{corr_val:.2f} ± {std_val:.2f}" for corr_val, std_val in zip(corr_row, std_row)]
            for corr_row, std_row in zip(mean_correlations.values, std_correlations)
        ],
        index=mean_correlations.index,
        columns=mean_correlations.columns,
        dtype=object,
    )

    print("Correlation Matrix with Bootstrap Uncertainties:")
    print(correlation_matrix_with_uncertainties)

    return mean_correlations, correlation_matrix_with_uncertainties

In [None]:
save_dir = './BNV_pLambda_plots/'

def plot_training_variables(df, feature_names=None, tag='DEFAULT'):
    """Plot per-``spmode`` feature distributions and correlation matrices.

    Two kinds of figure are written to ``save_dir``:

    * ``training_variables_{tag}.png`` -- a grid of density-normalised
      histograms, one panel per entry of ``feature_names``, one hue per
      ``spmode`` value.
    * ``correlation_matrix_{spmode}_{tag}.png`` -- one heatmap per ``spmode``
      value over ``BpostFitMes``, ``BpostFitDeltaE`` and the features,
      annotated with bootstrap uncertainties.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``spmode``, ``BpostFitMes``, ``BpostFitDeltaE`` and every
        column named in ``feature_names``.
    feature_names : sequence of str, optional
        Columns to plot. Defaults to every column of ``df``.
    tag : str
        Suffix used in the output file names. ``"_plus_collision_data"`` is
        appended when ``df`` contains the ``'0'`` spmode.
    """

    #########################################################################
    # Plot the variables for the different spmodes
    #########################################################################

    if '0' in list(df['spmode'].unique()):
        tag += "_plus_collision_data"
    print(f"{tag = }")

    if feature_names is None:
        feature_names = list(df.columns)

    print("Plotting the training variables...")
    nvars = len(feature_names)
    print(f"Plotting for {nvars} training variables")
    print(feature_names)
    print()
    print("Plotting for the following spmodes")
    print(df['spmode'].unique())

    # Number of rows depends on how many variables there are; the number of
    # columns is then whatever is needed to fit them all.
    if nvars > 10:
        nrows = 4
    elif nvars == 3:
        nrows = 1
    elif nvars in (4, 5, 6):
        nrows = 2
    else:
        nrows = 3
    # max(1, ...) keeps plt.subplots valid even for an empty feature list.
    ncols = max(1, math.ceil(nvars / nrows))

    print(f"nrows: {nrows}    ncols: {ncols}")

    fig, axes = plt.subplots(nrows=nrows, ncols=ncols)
    axes = axes.flatten()         # 1d view so panels can be zipped with features
    fig.set_size_inches(ncols*3, nrows*3)

    for ax, col in zip(axes, feature_names):
        print(f'Plotting {col}')
        hist_kwargs = dict(x=col, ax=ax, hue='spmode', stat='density', common_norm=False)
        if col == 'BtagSideMes':
            # Fixed bin range for this variable only.
            hist_kwargs['binrange'] = (5.0, 5.3)
        sns.histplot(df, **hist_kwargs)

        # Shrink the legend; it can be missing if nothing was drawn.
        legend = ax.get_legend()
        if legend is not None:
            plt.setp(legend.get_texts(), fontsize='8') # for legend text
            plt.setp(legend.get_title(), fontsize='8') # for legend title

    fig.tight_layout()
    fig.savefig(f'{save_dir}/training_variables_{tag}.png')

    #########################################################################
    # Plot the correlation matrices
    #########################################################################

    # Use everything that was used in training plus the two fit variables.
    # dict.fromkeys drops repeats (keeping order) in case a feature is also
    # one of the fit variables.
    cols_to_use_for_correlation = list(
        dict.fromkeys(['BpostFitMes', 'BpostFitDeltaE'] + list(feature_names))
    )

    # Smaller annotation text for larger matrices.
    n_corr_cols = len(cols_to_use_for_correlation)
    if n_corr_cols >= 15:
        text_size = 6
    elif n_corr_cols >= 12:
        text_size = 8
    elif n_corr_cols >= 9:
        text_size = 12
    else:
        text_size = 16

    for spmode in df['spmode'].unique():
        print(f"Making the correlation matrix for SP-{spmode}...")
        fig, ax = plt.subplots(figsize=(16,16))
        df_mode = df[df['spmode'] == spmode]
        df_corr = df_mode[cols_to_use_for_correlation]

        print("HERE IN THE LOOP")
        print(df_corr.corr())
        print()
        # Extra diagnostics for one specific pair of variables; only printed
        # when both columns are actually present.
        if 'R2All' in df_mode.columns and 'BLegendreP2' in df_mode.columns:
            print(df_mode[['R2All', 'BLegendreP2']].corr())
            print()
            print(df_mode['R2All'].min(), df_mode['BLegendreP2'].min())
            print(df_mode['R2All'].max(), df_mode['BLegendreP2'].max())

        mean_correlations, correlation_matrix_with_uncertainties = \
                generate_correlation_matrix_with_bootstrap_uncertainties(df_corr, n_bootstraps=1000)
        # fmt="" because the annotations are already formatted strings.
        sns.heatmap(mean_correlations, annot=correlation_matrix_with_uncertainties, center=0,
                    cmap='coolwarm', fmt="", vmin=-1, vmax=1, annot_kws={"size": text_size}, ax=ax)

        ax.set_title(f'Correlation matrix SP {spmode}')
        fig.tight_layout()
        fig.savefig(f'{save_dir}/correlation_matrix_{spmode}_{tag}.png')


In [None]:
# Number of rows per spmode value in the SP dataframe.
df_sp['spmode'].value_counts()


In [None]:
# Suffix used in the names of the figures saved by the cells below.
tag = f'{ntrain_tag}{BNC_tag}'

print(tag)


In [None]:
'''
mask = (df_sp['spmode']=='-999') | (df_sp['spmode']=='998')
sns.histplot(df_sp[mask], x='BtagSideMes', hue='spmode', stat='density', common_norm=False, binrange=(5.0, 5.3))
plt.setp(plt.gca().get_legend().get_texts(), fontsize='8') # for legend text
plt.setp(plt.gca().get_legend().get_title(), fontsize='8') # for legend title
''';

In [None]:
# Make a temporary dataframe with the cuts

# This worked before
'''
mask = (df_sp['cut_2']==True) & (df_sp['cut_3']==True)
df_temp = df_sp[mask]
sp_mask = (df_temp['spmode']=='-999') | (df_temp['spmode']=='998')
feature_names = list(workspace['model'].feature_names)
plot_training_variables(df_temp[sp_mask], feature_names = feature_names, tag=tag)
'''

# This works to produce a data comparison
#'''

print(len(df_sp), len(df_col))

# Stack SP and collision rows into one frame so they can be plotted together.
df_stacked = pd.concat([df_sp, df_col], axis=0)
print(len(df_stacked))
# Keep rows passing both the cut_2 and cut_3 flags.
mask = (df_stacked['cut_2']==True) & (df_stacked['cut_3']==True)
df_temp = df_stacked[mask]

# Restrict to the three spmode values that are compared in the plots.
sp_mask = (df_temp['spmode']=='-999') | (df_temp['spmode']=='998') | (df_temp['spmode']=='0')
feature_names = list(workspace['model'].feature_names)

print(len(df_temp))

#print(df_temp[sp_mask])

# The two training-flag columns are dropped before dropna() so that missing
# values in them do not remove rows (presumably they are NaN for the
# collision rows -- TODO confirm).
df_for_pts = df_temp[sp_mask].drop(['used_in_sig_train', 'used_in_bkg_train'], axis=1).dropna()

# Some weird events in the collision data with R2All is around -10000
mask_special = (df_for_pts['R2All']>-1)
df_for_pts_v2 = df_for_pts[mask_special]
    
print(df_for_pts_v2)
print('min: ', df_for_pts_v2['R2All'].min())

# Writes the feature histograms and one correlation matrix per spmode.
plot_training_variables(df_for_pts_v2, feature_names = feature_names, tag=tag)
#'''



In [None]:
# Quick look at the BCosThetaT distribution (all selected spmodes together).
x = df_for_pts['BCosThetaT']
plt.hist(x)

In [None]:
#sns.histplot(df_for_pts.dropna(), x='R2All',  hue='spmode', stat='density', common_norm=False)

#sns.histplot(df_for_pts.dropna(), x='R2All', stat='density')

# Scatter of the two variables, one panel per spmode, plus their correlation.
plt.figure()

varx = 'R2All'
vary = 'BLegendreP2'

for idx,spmode in enumerate(['0', '998']):
    # Named spmode_mask rather than `filter` so the builtin is not shadowed.
    spmode_mask = df_for_pts['spmode']==spmode
    # Select the rows and the two columns once instead of re-masking each time.
    selected = df_for_pts.loc[spmode_mask, [varx, vary]]

    x = selected[varx]
    y = selected[vary]
    plt.subplot(1,2,1+idx)
    plt.plot(x,y,'.', markersize=1, alpha=0.1)
    plt.xlabel(varx)
    plt.ylabel(vary)
    plt.title(f'spmode {spmode}')

    print(selected.corr())

#plt.hist(x,bins=100, range=(-11000,0));



In [None]:
# Show both training-membership flags side by side (the original listed
# 'used_in_bkg_train' twice).
df_sp[['used_in_sig_train', 'used_in_bkg_train']]

In [None]:
# Unpack the trained model and its train/test splits from the workspace.
model = workspace['model']
x_train = workspace['x_train']
y_train = workspace['y_train']
x_test = workspace['x_test']
y_test = workspace['y_test']

# Class probabilities for the training rows labelled '-999'.
model.predict_proba(x_train[y_train=='-999'])

In [None]:
# 
#def model_training_quality(model, x_train, y_train, x_test, y_test):
def model_training_quality(workspace, tag='DEFAULT'):
    """Produce the standard quality plots for a trained classifier.

    Three figures are written to ``save_dir``:

    * ``classifier_output_{tag}.png`` -- classifier output for the ``'998'``
      and ``'-999'`` classes, training samples as filled histograms and test
      samples as points with error bars.
    * ``confusion_matrix_{tag}.png`` -- confusion matrix on the test sample.
    * ``roc_curve_{tag}.png`` -- ROC curve on the test sample.

    The accuracy and confusion matrix are also printed.

    Parameters
    ----------
    workspace : mapping
        Must provide ``'model'`` (with ``predict`` and ``predict_proba``),
        ``'x_train'``, ``'y_train'``, ``'x_test'`` and ``'y_test'``. Labels are
        compared against the strings ``'998'`` and ``'-999'``.
    tag : str
        Suffix used in the output file names.
    """
    model = workspace['model']
    x_train = workspace['x_train']
    y_train = workspace['y_train']
    x_test = workspace['x_test']
    y_test = workspace['y_test']

    ###################################################################
    # Get the predictions for the training and testing samples
    ###################################################################
    # decisions = [train '998', train '-999', test '998', test '-999'].
    # Column 0 of predict_proba is used throughout; presumably this is the
    # '-999' probability since the classes are ordered by sorted label --
    # TODO confirm against model.classes_.
    decisions = []
    for X, y in ((x_train, y_train), (x_test, y_test)):
        d1 = model.predict_proba(X[y == '998'])[:, 0]
        d2 = model.predict_proba(X[y == '-999'])[:, 0]
        decisions += [d1, d2]

    # Common histogram range covering all four samples.
    low = min(np.min(d) for d in decisions)
    high = max(np.max(d) for d in decisions)
    low_high = (low, high)

    ###################################################################
    # Make a plot of the training and testing sample predictions
    ###################################################################

    n_bins = 50
    plt.figure(figsize=(12, 6))
    plt.hist(decisions[0],
              color='r', alpha=0.5, range=low_high, bins=n_bins,
              histtype='stepfilled', density=True,
              label='Bkg (train)')
    plt.hist(decisions[1],
              color='b', alpha=0.5, range=low_high, bins=n_bins,
              histtype='stepfilled', density=True,
              label='Sig (train)')

    # Test samples as points with Poisson-like error bars. Each sample is
    # scaled by its OWN size (the original scaled the signal points by the
    # background sample size).
    test_samples = (
        (decisions[2], 'r', 'Bkg (test)'),
        (decisions[3], 'b', 'Sig (test)'),
    )
    for test_scores, colour, label in test_samples:
        density, edges = np.histogram(test_scores, density=True,
                                      bins=n_bins, range=low_high)
        # density * scale recovers the raw bin counts.
        scale = len(test_scores) / sum(density)
        err = np.sqrt(density * scale) / scale
        center = (edges[:-1] + edges[1:]) / 2
        plt.errorbar(center, density, yerr=err, fmt='o', c=colour, label=label)

    plt.xlabel("Classifer output")
    plt.ylabel("Arbitrary units")
    plt.legend(loc='best')
    plt.savefig(f'{save_dir}/classifier_output_{tag}.png')

    ################################################################################
    # Confusion matrix
    # Testing the model i.e. predicting the labels of the test data.
    y_pred = model.predict(x_test)

    # Label order is taken from the data and passed explicitly, so the tick
    # labels below are guaranteed to match the rows/columns of the matrix.
    class_labels = np.unique(np.concatenate([np.asarray(y_test), np.asarray(y_pred)]))

    # Evaluating the results of the model
    accuracy = accuracy_score(y_test,y_pred)*100 ### percentage of correctly classified samples
    confusion_mat = confusion_matrix(y_test, y_pred, labels=class_labels)

    print("Accuracy for Neural Network is:",accuracy)
    print("Confusion Matrix")
    print(confusion_mat)

    # Turn this into a dataframe
    matrix_df = pd.DataFrame(confusion_mat)

    # Plot the result
    fig, ax = plt.subplots(figsize=(10,7))

    # NOTE: this changes the seaborn theme globally, not just for this figure.
    sns.set(font_scale=1.3)

    sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")

    labels = [str(label) for label in class_labels]

    # Formatting details here
    # Set axis titles
    ax.set_title('Confusion Matrix - MLP')
    ax.set_xlabel("Predicted label", fontsize =15)
    ax.set_xticklabels(labels)
    ax.set_ylabel("True Label", fontsize=15)
    ax.set_yticklabels(labels, rotation = 0)
    plt.savefig(f'{save_dir}/confusion_matrix_{tag}.png')

    ###################################################################
    # Compute ROC curve and area under the curve
    ###################################################################

    # Score is column 1 of predict_proba; the matching truth flag is 1 for
    # every label except '-999'.
    decisions = model.predict_proba(x_test)[:, 1]

    sig_bkg = np.ones_like(y_test, dtype=int)
    sig_bkg[y_test=='-999'] = 0

    fpr, tpr, thresholds = roc_curve(sig_bkg, decisions)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.grid()
    plt.savefig(f'{save_dir}/roc_curve_{tag}.png')



In [None]:
# Write the classifier-output, confusion-matrix and ROC figures for this training.
model_training_quality(workspace, tag=tag)

In [None]:
#def punzi_fom_nn(model_aft_train, sp_data, threshold, sp_998_df, sp_999_df, sig_disc= 4, scaling= 0.3):
def _punzi_preselection(df, BNC):
    """Boolean mask of the rows passing the preselection flags for the chosen mode."""
    if BNC:
        return (df['cut_2'] == True) & (df['cut_3'] == True) & (df['cut_4'] == True)
    return df['cut_-1'] == True


def punzi_fom_nn(df_sp, df_col, sig_sp_mode='-999', region_definitions = None, sigma = 4.0, BNC=False, n_sig_expected=20):
    """Scan the classifier threshold and compute figures of merit.

    For each threshold in ``np.arange(0, 1, 0.01)`` the function counts

    * collision-data events in the two DeltaE sidebands (within the sideband
      MES window) with ``proba`` above the threshold, and
    * SP events of mode ``sig_sp_mode`` that were not used in the signal
      training, inside the signal MES/DeltaE box, with ``proba`` above the
      threshold.

    Parameters
    ----------
    df_sp, df_col : pandas.DataFrame
        SP and collision dataframes. Both need ``BpostFitMes``,
        ``BpostFitDeltaE``, ``proba`` and the cut flags (``cut_-1``, or
        ``cut_2``/``cut_3``/``cut_4`` when ``BNC`` is set); ``df_sp`` also
        needs ``spmode`` and ``used_in_sig_train``.
    sig_sp_mode : str
        ``spmode`` value treated as signal.
    region_definitions : mapping
        ``(low, high)`` pairs under the keys ``'signal MES'``,
        ``'signal DeltaE'``, ``'sideband MES'``, ``'sideband 1 DeltaE'`` and
        ``'sideband 2 DeltaE'``. Required despite the ``None`` default.
    sigma : float
        Significance parameter in the ``fom`` denominator.
    BNC : bool
        Use the ``cut_2 & cut_3 & cut_4`` preselection instead of ``cut_-1``.
    n_sig_expected : float
        Assumed signal yield at threshold 0, used for ``N_S`` and ``fom_std``
        (previously hard-coded to 20).

    Returns
    -------
    pandas.DataFrame
        One row per threshold with columns ``thresh``, ``nbkg_sb1``,
        ``nbkg_sb2``, ``nbkg`` (sum of the two sidebands), ``nsig``,
        ``sig_pct`` (``nsig`` relative to the first threshold), ``N_S``,
        ``fom`` = ``sig_pct / (sqrt(nbkg) + sigma/2)`` and
        ``fom_std`` = ``N_S / sqrt(N_S + nbkg)``.

    Raises
    ------
    TypeError
        If ``region_definitions`` is not given.
    """
    if region_definitions is None:
        raise TypeError("punzi_fom_nn requires region_definitions (mapping of region name -> (low, high))")

    # Collision data: preselection only.
    df_col_tmp = df_col[_punzi_preselection(df_col, BNC)]

    # SP: preselection, signal mode only, and statistically independent of
    # the training sample.
    sp_selection = _punzi_preselection(df_sp, BNC)
    sp_selection = sp_selection & (df_sp['spmode'] == sig_sp_mode)
    sp_selection = sp_selection & (df_sp['used_in_sig_train'] == False)
    df_sp_tmp = df_sp[sp_selection]

    meslo, meshi = region_definitions['signal MES'][0], region_definitions['signal MES'][1]
    delo, dehi = region_definitions['signal DeltaE'][0], region_definitions['signal DeltaE'][1]
    messidelo, messidehi = region_definitions['sideband MES'][0], region_definitions['sideband MES'][1]
    desidelo1, desidehi1 = region_definitions['sideband 1 DeltaE'][0], region_definitions['sideband 1 DeltaE'][1]
    desidelo2, desidehi2 = region_definitions['sideband 2 DeltaE'][0], region_definitions['sideband 2 DeltaE'][1]

    # Print statements
    print(f'{meslo = }        {meshi = }')
    print(f'{messidelo = }    {messidehi = }')
    print(f'{delo = }         {dehi = }')
    print(f'{desidelo1 = }     {desidehi1 = }')
    print(f'{desidelo2 = }     {desidehi2 = }')

    # Collision data: the two sideband regions (threshold-independent part).
    mes_col = df_col_tmp['BpostFitMes']
    de_col = df_col_tmp['BpostFitDeltaE']
    in_sideband_mes = (mes_col>messidelo) & (mes_col<messidehi)
    mask1_col = in_sideband_mes & (de_col>desidelo1) & (de_col<desidehi1)
    mask2_col = in_sideband_mes & (de_col>desidelo2) & (de_col<desidehi2)

    # SP: the signal box.
    mes_sp = df_sp_tmp['BpostFitMes']
    de_sp = df_sp_tmp['BpostFitDeltaE']
    mask_sp = (mes_sp>meslo) & (mes_sp<meshi) & (de_sp>delo) & (de_sp<dehi)

    proba_col = df_col_tmp['proba']
    proba_sp = df_sp_tmp['proba']

    rows = []
    for thresh in np.arange(0,1,0.01):
        # Collision data surviving the threshold in each sideband.
        above_col = proba_col > thresh
        nsb1 = int((mask1_col & above_col).sum())
        nsb2 = int((mask2_col & above_col).sum())

        # SP signal surviving the threshold in the signal box.
        nsig = int((mask_sp & (proba_sp > thresh)).sum())

        rows.append({
            'thresh': thresh,
            'nbkg_sb1': nsb1,
            'nbkg_sb2': nsb2,
            'nbkg': nsb1 + nsb2,
            'nsig': nsig,
        })

    df_fom = pd.DataFrame(rows, columns=['thresh', 'nbkg_sb1', 'nbkg_sb2', 'nbkg', 'nsig'])
    # Signal retained relative to the loosest threshold (NaN if no signal
    # event passes even that).
    df_fom['sig_pct'] = df_fom['nsig'] / df_fom['nsig'].iloc[0]

    # Number of signal estimation
    df_fom['N_S'] = n_sig_expected*df_fom['sig_pct']

    df_fom['fom'] = df_fom['sig_pct'] / (np.sqrt(df_fom['nbkg']) + sigma/2.0)
    df_fom['fom_std'] = df_fom['N_S'] / np.sqrt(df_fom['N_S'] + df_fom['nbkg'])

    return df_fom

In [None]:
# Threshold scan for the loaded training.
# NOTE(review): region_definitions is not defined in this notebook; presumably
# it comes from `from analysis_variables import *` -- confirm.
df_fom = punzi_fom_nn(df_sp, df_col, region_definitions=region_definitions, BNC=BNC_bool, sigma=4.0)


In [None]:
# Side-by-side comparison of the two figures of merit versus threshold.
fig, axes = plt.subplots(1,2,figsize=(12,6), sharex=True)

df_fom.plot(x='thresh', y='fom', ax=axes[0])
df_fom.plot(x='thresh', y='fom_std', ax=axes[1])
axes[0].set_ylabel("FOM")
#plt.xlabel("threshold")


###################################################

# FOM, signal retention and surviving background on a shared threshold axis.
fig, axes = plt.subplots(3,1,figsize=(9,12), sharex=True)

df_fom.plot(x='thresh', y='fom', ax=axes[0])
axes[0].set_ylabel("FOM")
#plt.xlabel("threshold")


df_fom.plot(x='thresh',y='sig_pct', ax=axes[1])
# Raw string: "\%" is not a valid escape sequence in a normal string literal.
axes[1].set_ylabel(r"$\%$ signal retained")
#plt.xlabel("threshold")


df_fom.plot(x='thresh',y='nbkg', ax=axes[2])
axes[2].set_ylabel("# bkg events surviving")
axes[2].set_xlabel("NN value", fontsize=18)

plt.tight_layout()
plt.savefig(f'{save_dir}/FOM_calculation_{tag}.png')

In [None]:
# Largest figure of merit found in the scan.
fom_max = df_fom['fom'].max()

print(fom_max)

# Rows of the scan that reach the maximum. NOTE: `filter` shadows the Python
# builtin and is read again by the next cell.
filter = df_fom['fom'] == fom_max

df_fom[filter]

In [None]:
# Threshold of the first scan row with the maximum figure of merit; used as
# the classifier cut in the cells below.
max_cut = df_fom[filter]['thresh'].values[0]
print(f'max_cut: {max_cut}')

In [None]:
# The last 20 scan rows when sorted by ascending figure of merit.
df_fom.sort_values(by='fom')[-20:]

In [None]:
# List the columns available in the SP dataframe.
df_sp.columns

In [None]:
# Compare one variable between the '998' and '-999' SP modes; each mode is
# normalised separately (common_norm=False).
#df_sp['BCosThetaS']
#col = 'BCosThetaT'
col = 'BCosThetaS'

mask = (df_sp['spmode']=='998') | (df_sp['spmode']=='-999')
sns.histplot(df_sp[mask], x=col, hue='spmode', stat='density', common_norm=False)


In [None]:
# mES histograms after the classifier cut and a tight DeltaE window, for
# SP background, SP signal and collision data (one panel each).

# BNV
proba_cut = max_cut
#proba_cut = 0.00

# Currently a no-op (same value as above); kept as a hook for a separate BNC cut.
if BNC_bool:
    proba_cut = max_cut
    #proba_cut = 0.88

print(f'{proba_cut = }')

fig, axes = plt.subplots(3,1, sharex=True, figsize=(8,8))

labels = ['SP - bkg', 'SP - sig', 'Collision data']

for i in range(0,3):

    idx = None
    spmode = None
    df_tmp = None
    
    if i==0:
        #idx = workspace['idx_bkg_not_train']
        #spmode = '998'
        #df_tmp = df_sp.loc[idx]

        # Use them all
        # (flag OR its negation: selects every row regardless of training use)
        mask = (~df_sp['used_in_bkg_train']) | (df_sp['used_in_bkg_train'])
        spmode = '998'
        df_tmp = df_sp[mask]

    elif i==1:
        #idx = workspace['idx_sig_not_train']
        #spmode = '-999'
        #df_tmp = df_sp.loc[idx]

        # Only signal SP rows that were not used in the training.
        mask = (~df_sp['used_in_sig_train'])
        spmode = '-999'
        df_tmp = df_sp[mask]
    
    elif i==2:
        spmode = '0'
        df_tmp = df_col
    
    spmask = (df_tmp['spmode']==spmode)
    # For the background panel the spmode set above is overridden: every SP
    # mode other than '-999' is included, not just '998'.
    if i==0:# Background
        spmask = (df_tmp['spmode']!='-999')
    
    mask =   (df_tmp['cut_-1']==True)
    if BNC_bool:
        print("Making BNC cuts")
        mask = (df_tmp['cut_2']==True) & (df_tmp['cut_3']==True)  & (df_tmp['cut_4']==True)

    mask = mask & (df_tmp['proba'] > proba_cut)
    #mask = mask & ((df_tmp['BCosThetaT']<-0.8) | (df_tmp['BCosThetaT']>0.999))
    #mask = mask & ((df_tmp['BCosThetaS']<-0.9) | (df_tmp['BCosThetaS']>0.999))
    
    # Both branches currently apply the same |DeltaE| < 0.05 window.
    if BNC_bool:
        mask = mask & (df_tmp['BpostFitDeltaE']<0.05) & (df_tmp['BpostFitDeltaE']>-0.05)
    else:
        mask = mask & (df_tmp['BpostFitDeltaE']<0.05) & (df_tmp['BpostFitDeltaE']>-0.05)

    #var = 'proba'
    var = 'BpostFitMes'

    #plt.subplot(3,1,i+1)
    df_tmp[spmask & mask][var].hist(bins=50, range=(5.2,5.3), label=labels[i], ax=axes[i])#, range=(0,0.99))
    axes[i].legend()
axes[2].set_xlabel(r'$M_{ES}$ (GeV/c$^2$)', fontsize=18)

plt.tight_layout()

# NOTE(review): `tag` is built as f'{ntrain_tag}{BNC_tag}' in an earlier cell,
# so BNC_tag appears twice in this file name when it is non-empty.
plt.savefig(f'{save_dir}/mes_tight_de_probcut_{proba_cut:.2f}_{tag}{BNC_tag}.png')

In [None]:
# Correlation between mES and the classifier output over all SP rows.
df_sp[['BpostFitMes', 'proba']].corr()

In [None]:
# Signal SP ('-999') not used in training, after the cut_-1 flag, the
# classifier cut (proba_cut from the previous plotting cell) and
# |DeltaE| < 0.05: histogram of mES and counts in a few mES ranges.
mask = (~df_sp['used_in_sig_train'])
df_tmp = df_sp[mask]

spmask = (df_tmp['spmode']=='-999')
mask =   (df_tmp['cut_-1']==True)
#mask = (df_tmp['cut_2']==True) & (df_tmp['cut_3']==True)  & (df_tmp['cut_4']==True)

mask = mask & (df_tmp['proba'] > proba_cut)
#mask = mask & ((df_tmp['BCosThetaT']<-0.8) | (df_tmp['BCosThetaT']>0.999))
#mask = mask & ((df_tmp['BCosThetaS']<-0.9) | (df_tmp['BCosThetaS']>0.999))

mask = mask & (df_tmp['BpostFitDeltaE']<0.05) & (df_tmp['BpostFitDeltaE']>-0.05)

df_tmp[mask & spmask].hist('BpostFitMes', bins=100, range=(5.2, 5.3))

# m1: mES > 5.27, m2: mES <= 5.27, m3: 5.27 < mES < 5.285, m4: mES >= 5.285
m1 = df_tmp[mask & spmask]['BpostFitMes'] > 5.27
m2 = df_tmp[mask & spmask]['BpostFitMes'] <= 5.27
m3 = (df_tmp[mask & spmask]['BpostFitMes'] > 5.27) & (df_tmp[mask & spmask]['BpostFitMes'] < 5.285)
m4 = (df_tmp[mask & spmask]['BpostFitMes'] >= 5.285)

# Number of selected events in each of the four ranges.
print(f'{len(m1[m1])}  {len(m2[m2])}    {len(m3[m3])}   {len(m4[m4])}')



In [None]:
# SP '998' rows passing cut_2/cut_3/cut_4 with mES > 5.20, |DeltaE| < 0.07
# and classifier output above 0.5: scatter of classifier output versus mES.
mask = (df_sp['spmode'] == '998')
mask = mask &  (df_sp['cut_2']==True) & (df_sp['cut_3']==True)  & (df_sp['cut_4']==True)
mask = mask & (df_sp['BpostFitMes']>5.20)# & (df_sp['BpostFitDeltaE']>-0.07)

mask = mask & (df_sp['BpostFitDeltaE']<0.07) & (df_sp['BpostFitDeltaE']>-0.07)

mask = mask & (df_sp['proba'] > 0.5)


df_sp[mask].plot.scatter(x='BpostFitMes', y='proba')

In [None]:


# DeltaE versus mES after the classifier cut, in three panels:
# SP background, SP signal and collision data. The same selection
# (cut flags, mES > 5.20, |DeltaE| < de_cut, proba > proba_cut) is built
# three times, once per sample.
fig,axes = plt.subplots(1,3, figsize=(12,4))

# BNV
proba_cut = max_cut
#proba_cut = 0.0

# Currently a no-op (same value as above); kept as a hook for a separate BNC cut.
if BNC_bool:
    proba_cut = max_cut
    #proba_cut = 0.90

# DeltaE values at which the dashed red guide lines are drawn.
deloline, dehiline = -0.05, 0.05

#de_cut = 0.07
de_cut = 0.2

# SP bkg
mask = (df_sp['spmode'] != '-999')

if BNC_bool:
    mask = mask &  (df_sp['cut_2']==True) & (df_sp['cut_3']==True)  & (df_sp['cut_4']==True)
else:
    mask = mask &  (df_sp['cut_-1']==True)

mask = mask & (df_sp['BpostFitMes']>5.20)# & (df_sp['BpostFitDeltaE']>-0.07)

mask = mask & (df_sp['BpostFitDeltaE']<de_cut) & (df_sp['BpostFitDeltaE']>-de_cut)

mask = mask & (df_sp['proba'] > proba_cut)

# Only the '998' and '1005' modes are drawn (the latter in orange), although
# the mask above admits every non-signal mode.
df_sp[mask & (df_sp['spmode']=='998')].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[0])#, label='SP-998')#, label='SP')
df_sp[mask & (df_sp['spmode']=='1005')].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[0], c='orange')#, label='SP-1005')#, label='SP')

axes[0].plot([5.2, 5.29], [deloline, deloline], 'r--', lw=3)
axes[0].plot([5.2, 5.29], [dehiline, dehiline], 'r--', lw=3)
#plt.legend()
axes[0].set_title(f'Bkg SP (NN > {proba_cut:.2f})')

# SP sig
# NOTE: unlike the histogram cells, events used in the signal training are
# not excluded here.
mask = (df_sp['spmode'] == '-999')

if BNC_bool:
    mask = mask &  (df_sp['cut_2']==True) & (df_sp['cut_3']==True)  & (df_sp['cut_4']==True)
else:
    mask = mask &  (df_sp['cut_-1']==True)

mask = mask & (df_sp['BpostFitMes']>5.20)# & (df_sp['BpostFitDeltaE']>-0.07)

mask = mask & (df_sp['BpostFitDeltaE']<de_cut) & (df_sp['BpostFitDeltaE']>-de_cut)

mask = mask & (df_sp['proba'] > proba_cut)


df_sp[mask].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[1], s=0.1, alpha=0.1)#, label='SP')
axes[1].plot([5.2, 5.29], [deloline, deloline], 'r--', lw=3)
axes[1].plot([5.2, 5.29], [dehiline, dehiline], 'r--', lw=3)
axes[1].set_ylim(-0.2, 0.2)
#plt.legend()
axes[1].set_title(f'Sig SP (NN > {proba_cut:.2f})')


# Data
mask = (df_col['spmode'] == '0')

if BNC_bool:
    mask = mask &  (df_col['cut_2']==True) & (df_col['cut_3']==True)  & (df_col['cut_4']==True)
else:
    mask = mask &  (df_col['cut_-1']==True)


mask = mask & (df_col['BpostFitMes']>5.20)# & (df_sp['BpostFitDeltaE']>-0.07)

mask = mask & (df_col['BpostFitDeltaE']<de_cut) & (df_col['BpostFitDeltaE']>-de_cut)

mask = mask & (df_col['proba'] > proba_cut)


df_col[mask].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[2])#, label='Collision data')
axes[2].plot([5.2, 5.29], [deloline, deloline], 'r--', lw=3)
axes[2].plot([5.2, 5.29], [dehiline, dehiline], 'r--', lw=3)
axes[2].set_ylim(-0.2, 0.2)
#plt.legend()
axes[2].set_title(f'Collision data (NN > {proba_cut:.2f})')

plt.tight_layout()

# NOTE(review): `tag` is built as f'{ntrain_tag}{BNC_tag}' in an earlier cell,
# so BNC_tag appears twice in this file name when it is non-empty.
plt.savefig(f'{save_dir}/sp_and_collision_de_vs_mes_probcut_{proba_cut:.2f}_{tag}{BNC_tag}.png')

# mES values of the selected collision events that also fall inside
# |DeltaE| < 0.05 (`mask` is still the collision-data mask at this point).
mask_de = (df_col['BpostFitDeltaE']<0.05) & (df_col['BpostFitDeltaE']>-0.05)
df_col[mask & mask_de]['BpostFitMes'].values

In [None]:
# Correlation of the classifier output with mES and DeltaE for non-signal SP
# events in a chosen region, with scatter plots of each pair.
# The resulting `mask` is reused by the bootstrap cells further down.

# SP
mask = (df_sp['spmode'] != '-999')

# BNC
#mask = mask &  (df_sp['cut_2']==True) & (df_sp['cut_3']==True)  & (df_sp['cut_4']==True)

# BNV
mask = mask &  (df_sp['cut_-1']==True)

#mask = mask & (df_sp['BpostFitMes']>5.20)# & (df_sp['BpostFitDeltaE']>-0.07)

# Region 1
mask = mask & (df_sp['BpostFitMes']>5.20) & (df_sp['BpostFitDeltaE']>-0.05) & (df_sp['BpostFitDeltaE']<0.05)

# Region 2
#mask = mask & (df_sp['BpostFitMes']>5.27) & (df_sp['BpostFitDeltaE']>-0.20) & (df_sp['BpostFitDeltaE']<0.05)

# Region 3
#mask = mask & (df_sp['BpostFitMes']>5.27) & (df_sp['BpostFitDeltaE']>-0.05) & (df_sp['BpostFitDeltaE']<0.20)

#mask = mask & (df_sp['BpostFitDeltaE']<de_cut) & (df_sp['BpostFitDeltaE']>-de_cut)
#mask = mask & (df_sp['BpostFitDeltaE']<de_cut) & (df_sp['BpostFitDeltaE']>-0.05)
#mask = mask & (df_sp['BpostFitDeltaE']<0.05) & (df_sp['BpostFitDeltaE']>-de_cut)

# Fixed classifier cut for this study (not the FOM-optimised proba_cut).
mask = mask & (df_sp['proba'] > 0.7)

#df_sp[mask][['BpostFitMes', 'proba']].corr()
print(df_sp[mask][['BpostFitMes', 'proba']].corr())
print()
print(df_sp[mask][['BpostFitDeltaE', 'proba']].corr())
print()
# Three panels: DeltaE vs mES, DeltaE vs proba, mES vs proba.
plt.figure(figsize=(12,5))
plt.subplot(1,3,1)
sns.scatterplot(df_sp[mask], x='BpostFitMes', y='BpostFitDeltaE')
plt.ylim(-0.2, 0.2)
plt.xlim(5.2, 5.3)

plt.subplot(1,3,2)
sns.scatterplot(df_sp[mask], x='proba', y='BpostFitDeltaE')

plt.subplot(1,3,3)

#plt.figure()
sns.scatterplot(df_sp[mask], x='proba', y='BpostFitMes')
plt.tight_layout()


# Correlations and uncertainties

In [None]:
from scipy.stats import bootstrap

In [None]:
# Paired arrays for the bootstrap below; `mask` is whatever the most recently
# executed selection cell left behind.
#x,y =  df_sp[mask]['BpostFitMes'].values, df_sp[mask]['proba'].values
x,y =  df_sp[mask]['BpostFitDeltaE'].values, df_sp[mask]['proba'].values

In [None]:
# Inspect the first bootstrap input array.
x

In [None]:
def correlation_statistic(x, y):
    """Return the correlation coefficient between ``x`` and ``y``.

    Used as the statistic for the bootstrap resampling in this notebook.
    """
    corr_matrix = np.corrcoef(x, y)
    # Row 0, column 1 of the correlation matrix is the x-y coefficient.
    return corr_matrix[0, 1]

In [None]:
# Correlation of the full (un-resampled) sample.
correlation_statistic(x,y)

In [None]:
# Perform bootstrap resampling with `scipy.stats.bootstrap`
bootstrap_result = bootstrap(
   (x, y),
   correlation_statistic,
   paired=True,
   random_state=1,
   n_resamples=1000, # Adjust as needed
   confidence_level=0.95 # Adjust as needed
)


In [None]:
# Report the bootstrap confidence interval and plot the distribution of the
# resampled correlation coefficients.
confidence_interval = bootstrap_result.confidence_interval
print(f"Bootstrap confidence interval: {confidence_interval}")

fig, ax = plt.subplots()
ax.hist(bootstrap_result.bootstrap_distribution, bins=50)
ax.set_title('Bootstrap Distribution')
ax.set_xlabel('statistic value')
ax.set_ylabel('frequency')

In [None]:
#bootstrap_result.bootstrap_distribution[1]

# Plotting many

In [None]:
def _passes_cut_flow(df, BNC_bool):
    """Return the boolean mask of rows in `df` that pass the preselection flags.

    With `BNC_bool` set, rows must pass 'cut_2', 'cut_3' and 'cut_4'; otherwise
    the single 'cut_-1' flag is required.
    """
    if BNC_bool:
        return (df['cut_2'] == True) & (df['cut_3'] == True) & (df['cut_4'] == True)
    return df['cut_-1'] == True


def _passes_nn_and_delta_e(df, proba_cut, de_lo, de_hi):
    """Return the mask of rows with 'proba' > proba_cut and de_lo < 'BpostFitDeltaE' < de_hi."""
    return (
        (df['proba'] > proba_cut)
        & (df['BpostFitDeltaE'] > de_lo)
        & (df['BpostFitDeltaE'] < de_hi)
    )


def _mark_delta_e_window(ax, de_lo, de_hi):
    """Draw dashed red horizontal lines at de_lo and de_hi, spanning x = 5.2 to 5.29, on `ax`."""
    for level in (de_lo, de_hi):
        ax.plot([5.2, 5.29], [level, level], 'r--', lw=3)


def summarize_trainings(features=5, nsig=60000, nbkg=60000, trials=[1], BNC_tag="", BNC_bool=False, make_features_plots=False):
    """Produce and save the summary plots for one or more classifier trainings.

    For every entry of `trials` the function
      1. reads the SP and collision DataFrames and the model workspace whose file
         names are built from `features`, `nsig`, `nbkg`, the trial and `BNC_tag`;
      2. runs `punzi_fom_nn` (sigma=4.0) and takes the 'thresh' value of the row
         with the largest 'fom' as the classifier cut;
      3. optionally plots the training variables (`plot_training_variables`);
      4. calls `model_training_quality` on the workspace;
      5. saves three figures under `save_dir`: the FOM scan, DeltaE-vs-M_ES scatter
         plots (SP background, SP signal, collision data) and M_ES histograms
         inside the tight DeltaE window.

    The function relies on names defined elsewhere in the notebook:
    `punzi_fom_nn`, `region_definitions`, `plot_training_variables`,
    `model_training_quality` and `save_dir`.

    Parameters
    ----------
    features, nsig, nbkg : int
        Only used to build the file-name tag
        ``features_{features}_nsig_{nsig}_nbkg_{nbkg}_trial{trial}``.
    trials : iterable
        Trial identifiers to process (the default list is never mutated).
    BNC_tag : str
        Suffix inserted into the input and output file names.
    BNC_bool : bool
        Forwarded to `punzi_fom_nn` as ``BNC`` and selects which cut flags are
        required ('cut_2', 'cut_3', 'cut_4' when True, 'cut_-1' otherwise).
    make_features_plots : bool
        When True, also call `plot_training_variables` on the rows that pass
        'cut_2' and 'cut_3' and have spmode '-999' or '998'.

    Returns
    -------
    None
    """
    # Tight DeltaE window: drawn as dashed lines on the 2D panels and applied as
    # the DeltaE cut for the 1D M_ES histograms.
    deloline, dehiline = -0.05, 0.05
    # Loose requirements applied to the 2D scatter panels.
    de_cut = 0.2
    mes_min = 5.20

    for trial in trials:

        ntrain_tag = f'features_{features}_nsig_{nsig}_nbkg_{nbkg}_trial{trial}'
        tag = f'{ntrain_tag}{BNC_tag}'

        # Read in the dfs (the files carry a .pkl suffix but are read as parquet).
        infilename = f"DATAFRAME_SP_MODEL_MLPClassifier_CUTS_1_2_3_{ntrain_tag}{BNC_tag}.pkl"
        print(f"Reading in {infilename}")
        df_sp = pd.read_parquet(infilename)

        infilename = f"DATAFRAME_COL_MODEL_MLPClassifier_CUTS_1_2_3_{ntrain_tag}{BNC_tag}.pkl"
        print(f"Reading in {infilename}")
        df_col = pd.read_parquet(infilename)

        # NOTE: joblib.load unpickles arbitrary objects; only load trusted model files.
        infilename = f'MODEL_MLPClassifier_CUTS_1_2_3_{ntrain_tag}_{BNC_tag}.pkl'
        print(f"Reading in {infilename}")
        workspace = joblib.load(infilename)

        #####################################################
        # FOM: pick the threshold with the largest figure of merit
        #####################################################
        df_fom = punzi_fom_nn(df_sp, df_col, region_definitions=region_definitions, BNC=BNC_bool, sigma=4.0)

        at_fom_max = df_fom['fom'] == df_fom['fom'].max()
        # If several thresholds tie, the first one in df_fom is used.
        max_cut = df_fom[at_fom_max]['thresh'].values[0]
        print(f'max_cut: {max_cut}')

        # The same classifier cut is used for every plot below.
        proba_cut = max_cut

        #####################################################
        # Training variables (optional)
        #####################################################
        if make_features_plots:
            mask = (df_sp['cut_2'] == True) & (df_sp['cut_3'] == True)
            df_temp = df_sp[mask]
            sp_mask = (df_temp['spmode'] == '-999') | (df_temp['spmode'] == '998')
            feature_names = list(workspace['model'].feature_names)
            plot_training_variables(df_temp[sp_mask], feature_names=feature_names, tag=tag)

        #####################################################
        # Quality
        #####################################################
        model_training_quality(workspace, tag=tag)

        #####################################################
        # FOM stuff
        #####################################################
        fig, axes = plt.subplots(3, 1, figsize=(9, 12), sharex=True)

        df_fom.plot(x='thresh', y='fom', ax=axes[0])
        axes[0].set_ylabel("FOM")

        df_fom.plot(x='thresh', y='sig_pct', ax=axes[1])
        # Raw string: "\%" is not a valid escape sequence in a normal string literal.
        axes[1].set_ylabel(r"$\%$ signal retained")

        df_fom.plot(x='thresh', y='nbkg', ax=axes[2])
        axes[2].set_ylabel("# bkg events surviving")
        axes[2].set_xlabel("NN value", fontsize=18)

        fig.tight_layout()
        fig.savefig(f'{save_dir}/FOM_calculation_{tag}.png')

        #####################################################
        # 2D plots: DeltaE vs M_ES after the classifier cut
        #####################################################
        fig, axes = plt.subplots(1, 3, figsize=(12, 4))

        # Selection shared by the two SP panels; only the spmode requirement differs.
        sp_selected = (
            _passes_cut_flow(df_sp, BNC_bool)
            & (df_sp['BpostFitMes'] > mes_min)
            & _passes_nn_and_delta_e(df_sp, proba_cut, -de_cut, de_cut)
        )

        # SP bkg: everything that is not signal SP ('-999'); modes 998 and 1005
        # are drawn separately so they can be told apart by colour.
        mask = (df_sp['spmode'] != '-999') & sp_selected
        df_sp[mask & (df_sp['spmode'] == '998')].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[0])
        df_sp[mask & (df_sp['spmode'] == '1005')].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[0], c='orange')
        _mark_delta_e_window(axes[0], deloline, dehiline)
        axes[0].set_title(f'Bkg SP (NN > {proba_cut:.2f})')

        # SP sig
        mask = (df_sp['spmode'] == '-999') & sp_selected
        df_sp[mask].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[1], s=0.1, alpha=0.1)
        _mark_delta_e_window(axes[1], deloline, dehiline)
        axes[1].set_ylim(-0.2, 0.2)
        axes[1].set_title(f'Sig SP (NN > {proba_cut:.2f})')

        # Data
        mask = (
            (df_col['spmode'] == '0')
            & _passes_cut_flow(df_col, BNC_bool)
            & (df_col['BpostFitMes'] > mes_min)
            & _passes_nn_and_delta_e(df_col, proba_cut, -de_cut, de_cut)
        )
        df_col[mask].plot.scatter(x='BpostFitMes', y='BpostFitDeltaE', ax=axes[2])
        _mark_delta_e_window(axes[2], deloline, dehiline)
        axes[2].set_ylim(-0.2, 0.2)
        axes[2].set_title(f'Collision data (NN > {proba_cut:.2f})')

        fig.tight_layout()

        # `tag` already ends with BNC_tag, so the suffix appears twice in this
        # file name; kept unchanged so previously written files keep their names.
        fig.savefig(f'{save_dir}/sp_and_collision_de_vs_mes_probcut_{proba_cut:.2f}_{tag}{BNC_tag}.png')

        ##########################################
        # 1D cuts: M_ES inside the tight DeltaE window
        ##########################################
        print(f'{proba_cut = }')

        fig, axes = plt.subplots(3, 1, sharex=True, figsize=(8, 8))

        # Signal SP is restricted to rows not used in the signal training; the
        # background panel deliberately uses every SP row (trained-on or not).
        df_sig_not_trained = df_sp[~df_sp['used_in_sig_train']]
        panels = [
            ('SP - bkg', df_sp, df_sp['spmode'] != '-999'),
            ('SP - sig', df_sig_not_trained, df_sig_not_trained['spmode'] == '-999'),
            ('Collision data', df_col, df_col['spmode'] == '0'),
        ]

        for ax, (label, df_tmp, spmask) in zip(axes, panels):
            if BNC_bool:
                print("Making BNC cuts")

            mask = _passes_cut_flow(df_tmp, BNC_bool)
            mask = mask & _passes_nn_and_delta_e(df_tmp, proba_cut, deloline, dehiline)

            df_tmp[spmask & mask]['BpostFitMes'].hist(bins=50, range=(5.2, 5.3), label=label, ax=ax)
            ax.legend()

        axes[2].set_xlabel(r'$M_{ES}$ (GeV/c$^2$)', fontsize=18)

        fig.tight_layout()

        # Same doubled BNC_tag suffix as above, kept for file-name compatibility.
        fig.savefig(f'{save_dir}/mes_tight_de_probcut_{proba_cut:.2f}_{tag}{BNC_tag}.png')

In [None]:
#np.arange(1,21,1,dtype=int)

In [None]:
# Selection variant forwarded to summarize_trainings (file-name suffix + cut switch).
BNC_tag = ""
BNC_bool = False

#BNC_tag = "_BNC"
#BNC_bool = True

# Feature-set indices whose trainings are summarised (earlier runs used subsets
# such as [2, 4, 5] or [8]).
feature_sets = [2, 4, 5, 6, 7, 8]

# Only trial 1 is processed here; np.arange(1, 21, 1, dtype=int) covers trials 1-20.
trial_numbers = np.arange(1, 2, 1, dtype=int)

for idx_feature in feature_sets:
    summarize_trainings(
        features=idx_feature,
        nsig=30000,
        nbkg=30000,
        trials=trial_numbers,
        BNC_tag=BNC_tag,
        BNC_bool=BNC_bool,
        make_features_plots=True,
    )


In [None]:
df_sp


# Bootstrap uncertainties on the correlation matrix

In [None]:
# Toy dataset: 100 rows of uniform random numbers in four named columns.
toy_columns = ['A', 'B', 'C', 'D']
data = pd.DataFrame(np.random.rand(100, len(toy_columns)), columns=toy_columns)

# Pairwise correlation matrix of the toy columns, shown as the cell output.
data.corr()

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Toy sample: 100 rows of uniform random numbers in four columns.
data = pd.DataFrame(np.random.rand(100, 4), columns=['A', 'B', 'C', 'D'])

# 2. Bootstrap: resample the rows with replacement and recompute the correlation
#    matrix each time, giving an array of shape (n_bootstraps, n_cols, n_cols).
n_bootstraps = 1000  # Number of bootstrap samples
bootstrap_correlations = np.array([
    data.sample(frac=1, replace=True).corr().values
    for _ in range(n_bootstraps)
])

# 3. Central values and uncertainties.
#    Despite its name, `mean_correlations` is the correlation matrix of the
#    original sample, not the mean over the bootstrap resamples.
mean_correlations = data.corr()
print(mean_correlations)

# Spread of each matrix element over the bootstrap resamples.
std_correlations = bootstrap_correlations.std(axis=0)

# 4. Table of "value ± uncertainty" strings with the same labels as `data`.
n_rows, n_cols = mean_correlations.shape
formatted_cells = [
    [
        f"{mean_correlations.iloc[row, col]:.2f} ± {std_correlations[row, col]:.2f}"
        for col in range(n_cols)
    ]
    for row in range(n_rows)
]
correlation_matrix_with_uncertainties = pd.DataFrame(
    formatted_cells, index=data.columns, columns=data.columns, dtype=object
)

print("Correlation Matrix with Bootstrap Uncertainties:")
print(correlation_matrix_with_uncertainties)

print(type(mean_correlations), type(correlation_matrix_with_uncertainties))

# Heatmap of the central values only; the annotated variant
# (annot=correlation_matrix_with_uncertainties) is currently disabled.
plt.figure(figsize=(8, 6))
sns.heatmap(mean_correlations, fmt="", cmap='coolwarm', vmin=-1, vmax=1) # Customize 'fmt' as needed

plt.title('Correlation Matrix with Bootstrap Confidence Intervals')
plt.show()


In [None]:
# Look up the first cell (row 0, column 0) by position in a single step.
# The chained form `.iloc[0][0]` ends with an integer `[]` lookup on a
# string-labelled Series, which relies on deprecated positional fallback.
correlation_matrix_with_uncertainties.iloc[0, 0]