In [None]:
import uproot
import awkward as ak
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

import time

from hist import Hist

import babar_analysis_tools as bat
from analysis_variables import *
import myPIDselector

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the simulated (SP) sample into `data` and the collision-data sample into
# `data_collision` from parquet files under `topdir`, printing how long each read took.
start= time.time()

# Base directory that holds the parquet files; exactly one `topdir` line should be active.
## My laptop
topdir= "/Users/josieswann/BaBar_analyses/BNV_pLambda/"

## Bellis computer
#topdir= "/home/bellis/babar_data/bnv_plambda"
#topdir= "/home/bellis/babar_data/bnv_plambda_bnc"


# NOTE(review): the active `topdir` already ends in "/", so the f-strings below
# produce a doubled slash in the path.
# Simulated sample: background and signal SP modes (alternatives kept commented out).
filename= f"{topdir}/Background_and_signal_SP_modes_Only_Run_1.parquet"
#filename= f"{topdir}/Background_and_signal_SP_modes_All_runs.parquet" ## this won't run on mine 
#filename= f"{topdir}/Background_and_signal_SP_modes_BNC_Only_Run_1.parquet"
#filename= f"{topdir}/Background_and_signal_SP_modes_BNC_All_runs.parquet"

data= ak.from_parquet(filename)

print(f"Took {time.time()-start} seconds")

# NOTE(review): this flag is not read by any cell visible here — presumably
# consumed elsewhere; confirm or remove.
IS_MC= True

# Collision data. `filename` is reused, so after this cell it refers to the
# collision-data file, not the simulated one.

#filename = f'{topdir}/Background_SP_modes_Only_Run_1.parquet'
filename = f'{topdir}/Data_Only_Run_1_BLINDED.parquet'
#filename = f'{topdir}/Data_All_runs_BLINDED.parquet'
#filename = f'{topdir}/Data_All_runs_BLINDED.parquet'
#filename = f'{topdir}/Data_BNC_Only_Run_1.parquet'
#filename = f'{topdir}/Data_BNC_All_runs.parquet'

# Restart the timer so the second message covers only the collision-data read.
start= time.time()
data_collision= ak.from_parquet(filename)

print(f"took {time.time()-start} seconds")

print(type(data_collision))

In [None]:
# Bookkeeping tables read from the working directory: per-dataset statistics and
# the SP-mode cross sections/labels.
dataset_information = pd.read_csv("dataset_statistics.csv")
cs_data = pd.read_csv("SP_cross_sections_and_labels.csv")

# Cross-section table without the uncertainty column and the free-text note column.
no_notes = cs_data.drop(
    ["Uncertainty", "Note: cross sections found at https://babar-wiki.heprc.uvic.ca/bbr_wiki/index.php/Physics/Cross_sections,_luminosities,_and_other_vital_stats"],
    axis=1,
)

# SP mode of every entry in the simulated sample, and the distinct modes present.
sp = data["spmode"]
splist = np.unique(sp.to_list())

# Selection masks keyed by cut index; each entry carries at least a "name" and an
# "event" mask (both are read in this notebook).
dcuts = bat.get_final_masks(data, region_definitions=region_definitions)

# Print the keys themselves rather than a list wrapping the dict_keys view.
print(list(dcuts.keys()))
print()

for key in dcuts.keys():
    print(f'{key:3d} {dcuts[key]["name"]}')

# Presumably adds the "BtagSideMes" field in place — confirm in babar_analysis_tools.
# The field accesses right after each call fail loudly if the field is missing.
bat.fill_new_entry_with_tag_side_B(data)
data["BtagSideMes"]
bat.fill_new_entry_with_tag_side_B(data_collision)
data_collision["BtagSideMes"]

all_hists = bat.create_empty_histograms(hist_defs)

bkg_spmodes = ["998", "1005", "3981", "1235", "1237"]
sig_spmodes = ["-999"]

spmodes = bkg_spmodes + sig_spmodes

# One scaling weight per SP mode. A comprehension is used so the loop variable
# does not overwrite the `sp` array defined above.
weights = {
    spmode: bat.scaling_value(
        int(spmode),
        dataset_information=dataset_information,
        cs_data=cs_data,
        plot=False,
        verbose=False,
    )
    for spmode in spmodes
}

weights["-999"] = 1000  # scales signal higher
# TODO: the purpose of the "0" entry is not evident from this notebook — confirm
# (presumably a unit weight for a non-SP sample).
weights["0"] = 1

print(weights)

In [None]:
# Choose which event-level selection to apply. Exactly one `mask_event` line should
# be active; the previously active single-cut assignment (dcuts[3]) was immediately
# overwritten by the combination below, so it is now listed with the alternatives.
#mask_event= dcuts[1]["event"]
#mask_event= dcuts[2]["event"]
#mask_event= dcuts[3]["event"]
#mask_event= dcuts[4]["event"] ## individual cuts
#mask_event= dcuts[-1]["event"] ## all cuts
#mask_event= dcuts[1]["event"] & dcuts[2]["event"] & dcuts[3]["event"] & dcuts[4]["event"] ## combo of cuts
mask_event = dcuts[1]["event"] & dcuts[2]["event"] & dcuts[3]["event"]  ## combo of cuts

# Label for this selection; in this notebook it is only interpolated into the
# name of the parquet file the selected variables are written to.
#tag= "EARLY_CUT"
tag = "FINAL_CUTS"

mask = mask_event

In [None]:
# Variables copied from the awkward array into a flat pandas DataFrame.
# 'spmode' must stay first: later cells drop it with columns[1:].
subset = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass',
          'BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS',
          'R2', 'R2All',
          'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll', 'sphericityAll',
          'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2', 'BR2ROE', 'BSphrROE',
          'BThrustROE']

# Type of an awkward array; kept as a global because model_maker reads it.
ak_array_type = type(data["spmode"])

# Apply the event mask once instead of re-masking the whole array for every variable.
selected = data[mask]

df_dict = {}
for var in subset:
    values = selected[var]
    # Fields whose entries are themselves arrays are flattened to one value per row.
    # The length check avoids indexing into an empty selection.
    # NOTE(review): the columns only line up if every selected event contributes
    # the same number of entries to each flattened field — confirm.
    if len(values) > 0 and isinstance(values[0], ak_array_type):
        values = ak.flatten(values)
    df_dict[var] = values

df_out = pd.DataFrame.from_dict(df_dict)

# Persist the selected variables; `tag` identifies the selection used.
outfilename = f"output_variables_{tag}.parquet"
df_out.to_parquet(outfilename)

df = df_out

df_out


In [None]:
# Signal-only rows (SP mode "-999"). Named so the builtin `filter` is not shadowed.
signal_rows = df[df["spmode"] == "-999"]

# Plot at most 500 randomly chosen rows; DataFrame.sample raises if asked for
# more rows than exist, so the request is capped at the number available.
n_signal_plot = min(500, len(signal_rows))

g = sns.PairGrid(signal_rows.sample(n_signal_plot), vars=["BpostFitMes", "BpostFitDeltaE"], hue="spmode")
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)

In [None]:
# Every row that is not signal (SP mode "-999"). Named so the builtin `filter`
# is not shadowed.
background_rows = df[df['spmode'] != '-999']
columns = df.columns

# Plot at most 50 randomly chosen rows; DataFrame.sample raises if asked for
# more rows than exist, so the request is capped at the number available.
n_background_plot = min(50, len(background_rows))

#g = sns.PairGrid(df[filter].sample(500), vars=['BpostFitMes', 'BpostFitDeltaE'], hue='spmode')
# columns[1:6] skips 'spmode' (column 0) and takes the next five variables.
g = sns.PairGrid(background_rows.sample(n_background_plot), vars=columns[1:6], hue='spmode')

g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
# Every DataFrame column except the first. `columns` is df.columns, and the frame
# was built from `subset`, whose first entry is 'spmode'.
feature_names= columns[1:] ##exclude spmode
print(feature_names)

In [None]:
def model_maker(data, test_size_pct= 0.4, activator= "relu", solve_model= "adam", All_runs= False, sample_random_state= None):
    """Train an MLP to separate signal SP events from SP-998 background and plot its ROC curve.

    Steps: apply the combined event selection (cuts 1, 2 and 3), copy the analysis
    variables into a DataFrame (also written to ``output_variables_FINAL_CUTS.parquet``),
    draw a class-balanced sample of signal ("-999") and background ("998") rows,
    standardize the features, fit an ``MLPClassifier`` and show the ROC curve on
    the held-out test split.

    Parameters
    ----------
    data : awkward array
        Simulated sample with an "spmode" field. It is passed to
        ``bat.fill_new_entry_with_tag_side_B``, which presumably adds the
        "BtagSideMes" field in place (confirm in babar_analysis_tools).
    test_size_pct : float
        Fraction of the sampled rows held out for testing.
    activator : str
        ``activation`` argument of ``MLPClassifier``.
    solve_model : str
        ``solver`` argument of ``MLPClassifier``.
    All_runs : bool
        If true, request 20000 rows per class, otherwise 1000.
    sample_random_state : int or None
        Seed for the row sampling. The default ``None`` keeps the sampling
        unseeded; pass an int to make the whole training reproducible.

    Returns
    -------
    MLPClassifier
        The fitted model. It expects standardized inputs, so the fitted
        ``StandardScaler`` is attached as ``model.input_scaler_`` and the
        training column order as ``model.input_feature_names_``.

    Raises
    ------
    ValueError
        If no signal or no background rows survive the selection and ``dropna``.

    Notes
    -----
    Reads the notebook-level name ``region_definitions``. The per-mode scaling
    weights that used to be computed here were never used and have been removed.
    """
    signal_label = "-999"
    background_label = "998"
    tag = "FINAL_CUTS"

    bat.fill_new_entry_with_tag_side_B(data)
    dcuts = bat.get_final_masks(data, region_definitions= region_definitions)
    mask = dcuts[1]["event"] & dcuts[2]["event"] & dcuts[3]["event"]

    subset = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass',
              'BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS',
              'R2', 'R2All',
              'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll', 'sphericityAll',
              'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2', 'BR2ROE', 'BSphrROE',
              'BThrustROE']

    # Mask once; re-masking the full array for every variable is loop-invariant work.
    selected = data[mask]
    df_dict = {}
    for var in subset:
        values = selected[var]
        # Flatten fields whose entries are themselves arrays; the length check
        # avoids indexing into an empty selection.
        if len(values) > 0 and isinstance(values[0], ak.Array):
            values = ak.flatten(values)
        df_dict[var] = values

    df = pd.DataFrame.from_dict(df_dict)
    outfilename = f"output_variables_{tag}.parquet"
    df.to_parquet(outfilename)

    signal_pool = df[df["spmode"] == signal_label].dropna()
    background_pool = df[df["spmode"] == background_label].dropna()

    # Keep the classes balanced, but never request more rows than are available
    # (DataFrame.sample raises in that case).
    n_requested = 20000 if All_runs else 1000
    n_per_class = min(n_requested, len(signal_pool), len(background_pool))
    if n_per_class == 0:
        raise ValueError(
            f"No rows to train on: {len(signal_pool)} signal and "
            f"{len(background_pool)} background rows pass the selection"
        )
    if n_per_class < n_requested:
        print(f"Only {n_per_class} rows per class available (requested {n_requested})")

    df_sig = signal_pool.sample(n_per_class, random_state=sample_random_state)
    df_bkg = background_pool.sample(n_per_class, random_state=sample_random_state)
    df_ML = pd.concat([df_sig, df_bkg])

    # The label and the variables that define the signal region are excluded
    # from the training inputs.
    x = df_ML.drop(columns= ["spmode","BpostFitMes","BpostFitDeltaE","Lambda0_unc_Mass","BtagSideMes"])
    y = df_ML["spmode"]

    feature_names = x.columns  ##disc vars
    labels = y.unique()  ##diff sp modes

    print("Training features:")
    print(feature_names)
    print()

    print("Labels (Outcome):")
    print(labels)
    print()

    print("The dataset (x) is the numbers without column names---")
    print("The variable y is truth info about the data (signal or bkg)")

    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size= test_size_pct, random_state= 4)
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    model = MLPClassifier(max_iter= 300, random_state= 3, activation= activator, solver= solve_model)  #n_iter_no_change= 15)

    model.fit(x_train, y_train)

    # predict_proba columns follow model.classes_, which is sorted; with string
    # labels "-999" sorts before "998", so the signal column is looked up rather
    # than assumed to be column 1.
    signal_column = list(model.classes_).index(signal_label)
    decisions = model.predict_proba(x_test)[:, signal_column]
    # The labels are strings, so the positive class must be named explicitly.
    fpr, tpr, thresholds = roc_curve(y_test, decisions, pos_label=signal_label)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()

    # Keep what is needed to apply the model to new data with the same preprocessing.
    model.input_scaler_ = scaler
    model.input_feature_names_ = list(feature_names)

    return model

    

In [None]:
# Disabled draft of ML_FOM_Calculator: the text below is a string literal, not
# executed code, so no function is defined by this cell.
# NOTE(review): the opening delimiter has four quotes, so the string itself
# starts with a stray '"' character.
# NOTE(review): before enabling, the draft needs work — see_stuff, sp_data,
# sp_998_df, sp_999_df and sig_disc are not defined in the visible notebook,
# model.predict_proba() is called without inputs, signal_before and signal_after
# are the same length (efficiency would always be 1), and efficiency / N_bkg are
# numbers but are called like functions in the fom line.
""""
def ML_FOM_Calculator(model, MC_data, coll_data, threshold):
    bkg_samp= 1000
    sig_samp= 1000
    #### FOR SP 
    for i in threshold:
        output_df= see_stuff(sig_samp= sig_samp,bkg_samp= bkg_samp, thresh= i, verbose= False, df=MC_data, df_col=coll_data)
        
    #### FOR COLLISION
    y_proba_col_sig = model.predict_proba()
    
    sp998= sp_data["spmode"]== "998"
    N_bkg = len(sp_998_df[sp998]) ## total number of background events (sp 998) 
    signal_before= len(sp_999_df)
    signal_after= len(sp_999_df)
    efficiency = signal_after/signal_before ## the accuracy of the model after training with the SP 
    fom = efficiency(threshold)/(np.sqrt(N_bkg(threshold)+sig_disc/2))
    return fom
"""

### TODO: the figure-of-merit calculator above is not finished yet.