In [None]:
import uproot
import awkward as ak
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

import time

from hist import Hist

import babar_analysis_tools as bat
from analysis_variables import *
import myPIDselector

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

import joblib

In [None]:
# Load the simulated (SP) sample and the collision-data sample from parquet
# files into awkward arrays, timing each read.
start= time.time()

# NOTE(review): absolute, machine-specific paths; switch by (un)commenting.
## My laptop
#topdir= "/Users/josieswann/BaBar_analyses/BNV_pLambda/"

## Bellis computer
topdir= "/home/bellis/babar_data/bnv_plambda"
#topdir= "/home/bellis/babar_data/bnv_plambda_bnc"


# Simulated sample (file name says: background + signal SP modes, Run 1 only).
filename= f"{topdir}/Background_and_signal_SP_modes_Only_Run_1.parquet"
#filename= f"{topdir}/Background_and_signal_SP_modes_All_runs.parquet" ## this won't run on mine
#filename= f"{topdir}/Background_and_signal_SP_modes_BNC_Only_Run_1.parquet"
#filename= f"{topdir}/Background_and_signal_SP_modes_BNC_All_runs.parquet"

data= ak.from_parquet(filename)

print(f"Took {time.time()-start} seconds")

# Not read anywhere else in this notebook's visible cells.
# NOTE(review): presumably flags `data` as Monte Carlo -- confirm or remove.
IS_MC= True

#Collision data

# `filename` is reused here for the collision-data file.
#filename = f'{topdir}/Background_SP_modes_Only_Run_1.parquet'
filename = f'{topdir}/Data_Only_Run_1_BLINDED.parquet'
#filename = f'{topdir}/Data_All_runs_BLINDED.parquet'
#filename = f'{topdir}/Data_All_runs_BLINDED.parquet'
#filename = f'{topdir}/Data_BNC_Only_Run_1.parquet'
#filename = f'{topdir}/Data_BNC_All_runs.parquet'

start= time.time()
data_collision= ak.from_parquet(filename)

print(f"took {time.time()-start} seconds")

print(type(data_collision))

In [None]:
# Read the bookkeeping tables, build the selection masks, add the tag-side B
# entry to both samples, and compute one weight per SP mode.
# Bare expressions in the middle of this cell (no_notes, splist, dcuts[3], ...)
# are evaluated but not displayed: a cell only shows its last expression.
dataset_information= pd.read_csv("dataset_statistics.csv")
cs_data= pd.read_csv("SP_cross_sections_and_labels.csv")

# Cross-section table without its uncertainty and free-text note columns.
no_notes= cs_data.drop(["Uncertainty","Note: cross sections found at https://babar-wiki.heprc.uvic.ca/bbr_wiki/index.php/Physics/Cross_sections,_luminosities,_and_other_vital_stats"], axis= 1)
no_notes

sp= data["spmode"]

# Distinct SP mode labels present in the simulated sample.
splist= np.unique(sp.to_list())
splist

# Keyed by integer cut id; each entry has at least "name" and "event"
# (both are read in this notebook).
dcuts= bat.get_final_masks(data, region_definitions= region_definitions)

# Prints a one-element list wrapping the dict_keys view.
print([dcuts.keys()])
print()

for key in dcuts.keys():
    print(f'{key:3d} {dcuts[key]["name"]}')

dcuts[3]

# NOTE(review): presumably adds the "BtagSideMes" field in place, since it is
# read on the next line -- confirm against babar_analysis_tools.
bat.fill_new_entry_with_tag_side_B(data)
data["BtagSideMes"]
bat.fill_new_entry_with_tag_side_B(data_collision)
data_collision["BtagSideMes"]

all_hists= bat.create_empty_histograms(hist_defs)

bkg_spmodes= ["998","1005","3981","1235","1237"]
sig_spmodes= ["-999"]

spmodes= bkg_spmodes+sig_spmodes

# One weight per SP mode, keyed by the mode's string label.
# Note: this loop rebinds `sp`, which above held data["spmode"].
weights= {}
for sp in spmodes: 
    weights[sp]= bat.scaling_value(int(sp),dataset_information=dataset_information, cs_data= cs_data, plot= False, verbose= False)
    #weights[sp]=1

weights["-999"]= 1000 # overrides the computed signal weight with a fixed 1000
weights["0"]= 1 # TODO: purpose of the "0" entry is unclear -- ask

print(weights)

In [None]:
# Choose the event-level selection used to build the flat output dataframe.
# Exactly one `mask_event` assignment should be active; the alternatives are
# kept commented out for quick switching.
#mask_event= dcuts[1]["event"]
#mask_event= dcuts[2]["event"]
#mask_event= dcuts[3]["event"]
#mask_event= dcuts[4]["event"] ## individual cuts
#mask_event= dcuts[-1]["event"] ## all cuts

#mask_event= dcuts[1]["event"] & dcuts[2]["event"] & dcuts[3]["event"] & dcuts[4]["event"] ## combo of cuts
mask_event= dcuts[1]["event"] & dcuts[2]["event"] & dcuts[3]["event"] ## combo of cuts

# `tag` only labels the output file name (output_variables_{tag}.parquet).
# TODO: ask what EARLY_CUT / FINAL_CUTS are meant to correspond to; the active
# mask combines cuts 1-3 rather than using dcuts[-1].
#tag= "EARLY_CUT"
tag= "FINAL_CUTS"

mask= mask_event

In [None]:
# Flatten the selected events into a pandas DataFrame (one column per variable
# in `subset`), write it to parquet, and keep it as `df` for the plots below.
subset = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass', \
          'BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS', \
          'R2', 'R2All', \
          'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll', 'sphericityAll', \
          'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2', 'BR2ROE', 'BSphrROE', \
          'BThrustROE']

ak_array_type= type(data["spmode"])

# Apply the event mask once instead of once (or twice) per variable.
data_selected= data[mask]

df_dict={}
for var in subset:
    x= data_selected[var]
    # If the first entry is itself an awkward array the field is nested
    # per event, so flatten it to one value per row.
    if type(x[0]) is ak_array_type:
        x= ak.flatten(x)
    df_dict[var] = x

df_out= pd.DataFrame.from_dict(df_dict)

outfilename= f"output_variables_{tag}.parquet"
df_out.to_parquet(outfilename)

df= df_out

df_out


In [None]:
# Pair plot of the two kinematic fit variables for signal (spmode "-999").
# The mask is not called `filter`, so the builtin stays usable in later cells.
sig_filter= df["spmode"]== "-999"
sig_rows= df[sig_filter]

# Cap the sample size at the number of available rows (sample() raises when
# asked for more) and seed it so the figure is reproducible on re-run.
sig_sample= sig_rows.sample(min(500, len(sig_rows)), random_state=0)

g= sns.PairGrid(sig_sample, vars= ["BpostFitMes","BpostFitDeltaE"], hue= "spmode")
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)

In [None]:
# Pair plot of the first five non-label columns for everything that is not
# signal (spmode != "-999"), coloured by SP mode.
bkg_filter = df['spmode'] != '-999'
columns= df.columns  # also used by the next cell to derive feature_names

bkg_rows = df[bkg_filter]
# Cap at the available row count and seed the draw for reproducibility.
bkg_sample = bkg_rows.sample(min(50, len(bkg_rows)), random_state=0)

#g = sns.PairGrid(df[bkg_filter].sample(500), vars=['BpostFitMes', 'BpostFitDeltaE'], hue='spmode')
g = sns.PairGrid(bkg_sample, vars=columns[1:6], hue='spmode')

g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

In [None]:
# All dataframe columns except the first one ('spmode', the class label).
# Relies on `columns` defined in the previous cell.
feature_names= columns[1:] ##exclude spmode
print(feature_names)

In [None]:
# Rebuild the selection masks for the simulated sample and list each cut id
# next to its name.
dcuts = bat.get_final_masks(data, region_definitions=region_definitions)

for key in dcuts:
    print(key, dcuts[key]['name'])

In [None]:
# Sanity check of mask lengths: the full cut-1 mask versus the cut-3 mask
# restricted to the entries that pass cut 1.
cut1 = dcuts[1]['event']
print(len(cut1))

len(dcuts[3]['event'][cut1])

In [None]:
# Build flat dataframes for the simulated (df_sp) and collision (df_col)
# samples. Both keep only events passing cut 1 and carry one boolean column
# per additional cut, so later cells can apply cuts by filtering rows.
subset = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass', \
      'BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS', \
      'R2', 'R2All', \
      'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll', 'sphericityAll', \
      'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2', 'BR2ROE', 'BSphrROE', \
      'BThrustROE']


def _dump_events_with_cut_flags(events, fields_to_dump, cut_ids):
    """Dump events passing cut 1 to a DataFrame and append cut-flag columns.

    Parameters
    ----------
    events : awkward array of events accepted by ``bat.get_final_masks``.
    fields_to_dump : list of field names to copy into the DataFrame.
    cut_ids : iterable of cut ids; each adds a column named ``cut_<id>``.

    Returns
    -------
    (DataFrame, dict) : the flat frame and the mask dictionary it was built from.
    """
    cut_masks = bat.get_final_masks(events, region_definitions=region_definitions)

    # This is the main cut that gets rid of duplicates
    passes_cut1 = cut_masks[1]['event']
    frame = bat.dump_awkward_to_dataframe(events[passes_cut1], fields_to_dump=fields_to_dump)

    # Put the cuts into the dataframe, restricted to the rows that were kept.
    for cut in cut_ids:
        flags = cut_masks[cut]['event'][passes_cut1]
        colname = f'cut_{cut}'
        print(colname, len(flags), flags)
        frame[colname] = flags

    return frame, cut_masks


cuts_to_add = [2, 3, 4, 6, -1]

# Simulation
df_sp, dcuts = _dump_events_with_cut_flags(data, subset, cuts_to_add)

###################################
# Collision
df_col, dcuts_col = _dump_events_with_cut_flags(data_collision, subset, cuts_to_add)


df_sp

In [None]:
# Display the collision-data dataframe built in the previous cell.
df_col

In [None]:
# Histogram of BpostFitMes for simulated events flagged by the 'cut_-1' column.
mask = df_sp['cut_-1'] == True
df_sp.loc[mask].hist(column='BpostFitMes', bins=100)

In [None]:
def model_maker(df, sig_spmode="-999", bkg_spmode= "998", n_sig_bkg=[1000,1000], \
                columns_to_drop=["spmode","BpostFitMes","BpostFitDeltaE","Lambda0_unc_Mass"], \
                test_size_pct= 0.4, activator= "relu", solve_model= "adam", model_filename=None, \
                sample_random_state=None):
    """Train an MLP classifier to separate one signal and one background SP mode.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'spmode' column plus the candidate feature columns.
    sig_spmode, bkg_spmode : str
        'spmode' values selecting the signal and background rows.
    n_sig_bkg : sequence of two ints
        Number of signal and background rows to sample (after dropping rows
        with missing values). Sampling is without replacement, so pandas
        raises ValueError if fewer rows are available.
    columns_to_drop : list of str
        Columns removed before training; everything else is a feature.
    test_size_pct : float
        Fraction of the sampled rows held out for testing.
    activator, solve_model : str
        Passed to MLPClassifier as ``activation`` and ``solver``.
    model_filename : str or None
        If given, the returned workspace is written there with joblib.
    sample_random_state : int or None
        Seed for the row sampling. None (default) keeps the previous
        behaviour of drawing a different sample on every call.

    Returns
    -------
    dict with keys 'model', 'x_train', 'y_train', 'x_test', 'y_test',
    'scaler' and 'feature_names'. x_train / x_test are already standardized.
    'scaler' is the fitted StandardScaler, which has to be applied to any
    other data before it is passed to the model.
    """
    print("columns...")
    print(df.columns)
    print()

    filter_sig= df["spmode"]==sig_spmode
    filter_bkg= df["spmode"]==bkg_spmode

    df_sig= df[filter_sig].dropna().sample(n_sig_bkg[0], random_state=sample_random_state)
    df_bkg= df[filter_bkg].dropna().sample(n_sig_bkg[1], random_state=sample_random_state)

    print(len(df_sig), len(df_bkg))

    df_ML= pd.concat([df_sig,df_bkg])

    # list() so a tuple passed by a caller is not read as one multi-level label.
    x=df_ML.drop(columns= list(columns_to_drop))
    y=df_ML["spmode"]

    print("y")
    print(y)
    print()

    feature_names= x.columns ##disc vars
    labels= y.unique() ##diff sp modes

    print("Training features:")
    print(feature_names)
    print()

    print("Labels (Outcome):")
    print(labels)
    print()

    print("The dataset (x) is the numbers without column names---")
    print("The variable y is truth info about the data (signal or bkg)")

    x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= test_size_pct, random_state= 4)
    scaler= StandardScaler()

    # Look in "tips for practical use"
    # https://scikit-learn.org/stable/modules/neural_networks_supervised.html
    # Fit the scaler on the training split only, then reuse it for the test split.
    x_train= scaler.fit_transform(x_train)
    x_test= scaler.transform(x_test)

    model = MLPClassifier(max_iter= 1000, random_state= 3, activation= activator, solver= solve_model ) #n_iter_no_change= 15)

    model.fit(x_train, y_train)

    # Custom attribute (not part of scikit-learn) read back by later cells.
    model.feature_names = feature_names

    workspace = {}
    workspace['model'] = model
    workspace['x_train'] = x_train
    workspace['y_train'] = y_train
    workspace['x_test'] = x_test
    workspace['y_test'] = y_test
    # Keep the fitted scaler: the model only makes sense on inputs
    # standardized with exactly this transformation.
    workspace['scaler'] = scaler
    workspace['feature_names'] = list(feature_names)

    if model_filename is not None:
        joblib.dump(workspace, model_filename)

    return workspace

    

In [None]:
def plot_training_variables(df):
    """Plot per-variable distributions and per-SP-mode correlation matrices.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'spmode' column. Every column is histogrammed, split by
        'spmode'. Columns whose name starts with 'cut' are left out of the
        correlation matrices.

    Draws one grid of histograms, then one correlation heatmap per distinct
    'spmode' value found in `df`. Returns nothing.
    """

    #########################################################################
    # Plot the variables for the different spmodes
    #########################################################################

    print("Plotting the training variables...")
    nvars = len(df.columns)

    nrows = 5
    ncols = int(nvars / nrows) + 1

    fig, axes = plt.subplots(nrows = nrows, ncols = ncols)    # axes is an (nrows x ncols) array
    axes = axes.flatten()         # 1d array of length nrows*ncols
    fig.set_size_inches(ncols*3, nrows*3)

    # zip stops at the shorter input, so surplus axes are simply left empty.
    for ax, col in zip(axes, df.columns):
      sns.histplot(df, x=col, ax = ax, hue='spmode', stat='density', common_norm=False)
      ax.set_title(col)

    plt.tight_layout()

    #########################################################################
    # Plot the correlation matrices
    #########################################################################

    spmodes_in_file = df['spmode'].unique()

    # Drop the cuts columns. Uses the `df` argument, not a notebook-level
    # dataframe, so the function works on whatever frame it is given.
    cols_temp = [col for col in df.columns if col[0:3]!='cut']

    for spmode in spmodes_in_file:
        print(f"Making the correlation matrix for SP-{spmode}...")
        fig,ax = plt.subplots(figsize=(16,16))
        mask = df['spmode'] == spmode

        sns.heatmap(df[mask][cols_temp].drop(columns=['spmode']).corr(), center=0, cmap='coolwarm', annot=True, fmt='.2f', annot_kws={"size": 8}, ax=ax)
        ax.set_title(f'Correlation matrix SP {spmode}')

In [None]:
'''
cols = df_temp.columns

cols = cols.to_list()

cols_temp = []
for col in cols:
    #print(col)
    if col[0:3]!='cut':
        cols_temp.append(col)

cols_temp
'''

In [None]:
#df_sp['spmode'].unique()

In [None]:
# Number of rows per SP mode in the simulated dataframe.
#df_sp.columns
df_sp['spmode'].value_counts()

In [None]:
# Working dataframe for training: simulated rows flagged by both the
# 'cut_2' and 'cut_3' columns.

passes_cut_2 = df_sp['cut_2'] == True
passes_cut_3 = df_sp['cut_3'] == True
mask = passes_cut_2 & passes_cut_3

df_temp = df_sp[mask]

In [None]:
# Restrict to the two SP modes used for training ('-999' and '998') and plot.
sp_mask = df_temp['spmode'].isin(['-999', '998'])

plot_training_variables(df_temp[sp_mask])

In [None]:
#fig,ax = plt.subplots(figsize=(16,16))
#mask = df_temp['spmode'] == '998'
##corr = df_temp[mask].drop(columns=['spmode']).corr()
##corr.style.background_gradient(cmap='coolwarm', axis=None).format(precision=2)

#sns.heatmap(df_temp[mask].drop(columns=['spmode']).corr(), center=0, cmap='coolwarm', annot=True, fmt='.2f', annot_kws={"size": 8})

In [None]:
#mask = df_sp['spmode'] == '-999'
#corr = df_sp[mask].drop(columns=['spmode']).corr()
#corr.style.background_gradient(cmap='coolwarm', axis=None).format(precision=2)

In [None]:
# Train the classifier on df_temp. Everything listed here is excluded from
# the input features: the cut flags, the label, and the remaining variables.
columns_to_drop = [
    'cut_2', 'cut_3', 'cut_4', 'cut_6', 'cut_-1',
    "spmode", "BpostFitMes", "BpostFitDeltaE", "Lambda0_unc_Mass",
    'BSphr', 'BThrust',
]
#["spmode","BpostFitMes","BpostFitDeltaE","Lambda0_unc_Mass","BtagSideMes"]

model_filename = "TEST_MODEL_SAVE.pkl"
workspace = model_maker(
    df_temp,
    columns_to_drop=columns_to_drop,
    n_sig_bkg=[2000, 2000],
    model_filename=model_filename,
)

# Unpack the pieces of the workspace into notebook-level names.
model, x_train, y_train, x_test, y_test = (
    workspace[name] for name in ('model', 'x_train', 'y_train', 'x_test', 'y_test')
)

model

In [None]:
#model

In [None]:
# Reload the workspace written by model_maker, replacing the in-memory one.
# joblib.load unpickles the file, so only load files you created yourself.
workspace = joblib.load('TEST_MODEL_SAVE.pkl')
#workspace

model = workspace['model']

model

In [None]:
# Display the whole workspace dict (model plus train/test arrays).
workspace

### Not done yet

In [None]:
# 
#def model_training_quality(model, x_train, y_train, x_test, y_test):
def _binned_density_with_errors(scores, n_bins, value_range):
    """Histogram `scores` as a density and compute sqrt(N)-style error bars.

    Returns (density, err, edges). `err` is sqrt(count) per bin, expressed in
    the same density units as `density`.
    """
    density, edges = np.histogram(scores, density=True, bins=n_bins, range=value_range)
    # density * scale recovers the raw per-bin counts, so the scale has to be
    # built from the size of the very sample that was histogrammed.
    scale = len(scores) / sum(density)
    err = np.sqrt(density * scale) / scale
    return density, err, edges


def model_training_quality(workspace, sig_label='-999', bkg_label='998'):
    """Draw the standard quality plots for a trained two-class model.

    Produces three figures: classifier-output distributions (train as filled
    histograms, test as points with error bars), the confusion matrix on the
    test split, and the ROC curve on the test split. Also prints the test
    accuracy (in percent) and the confusion matrix.

    Parameters
    ----------
    workspace : dict
        As returned by model_maker; needs 'model', 'x_train', 'y_train',
        'x_test', 'y_test'. The x arrays are used as stored, without
        further scaling.
    sig_label, bkg_label : str
        Class labels of the signal and background samples in y_train/y_test.

    Notes
    -----
    The classifier output plotted is column 1 of ``predict_proba``, i.e. the
    probability of ``model.classes_[1]``. scikit-learn stores class labels in
    sorted order, so for the default string labels that is '998'.
    """
    model = workspace['model']
    x_train = workspace['x_train']
    y_train = workspace['y_train']
    x_test = workspace['x_test']
    y_test = workspace['y_test']

    # Label order used by predict_proba columns and by the confusion matrix.
    class_labels = list(model.classes_)
    scored_label = class_labels[1]

    ###################################################################
    # Get the predictions for the training and testing samples
    ###################################################################
    decisions = []
    for X, y in ((x_train, y_train), (x_test, y_test)):
        # Select rows by their true label, then score them.
        d_bkg = model.predict_proba(X[y == bkg_label])[:, 1]
        d_sig = model.predict_proba(X[y == sig_label])[:, 1]
        decisions += [d_bkg, d_sig]
    bkg_train, sig_train, bkg_test, sig_test = decisions

    # Use this for the histogram ranges
    low = min(np.min(d) for d in decisions)
    high = max(np.max(d) for d in decisions)
    low_high = (low, high)

    ###################################################################
    # Make a plot of the training sample predictions
    ###################################################################
    n_bins = 50
    plt.figure(figsize=(12, 6))
    plt.hist(bkg_train,
              color='r', alpha=0.5, range=low_high, bins=n_bins,
              histtype='stepfilled', density=True,
              label='Bkg (train)')
    plt.hist(sig_train,
              color='b', alpha=0.5, range=low_high, bins=n_bins,
              histtype='stepfilled', density=True,
              label='Sig (train)')

    # Overlay the testing samples as points with error bars. Each sample's
    # errors are derived from its own size.
    for scores, colour, label in ((bkg_test, 'r', 'Bkg (test)'),
                                  (sig_test, 'b', 'Sig (test)')):
        density, err, edges = _binned_density_with_errors(scores, n_bins, low_high)
        center = (edges[:-1] + edges[1:]) / 2
        plt.errorbar(center, density, yerr=err, fmt='o', c=colour, label=label)

    plt.xlabel("Classifer output")
    plt.ylabel("Arbitrary units")
    plt.legend(loc='best')

    ###################################################################
    # Confusion matrix on the test split
    ###################################################################
    y_pred = model.predict(x_test)

    # accuracy_score returns the fraction classified correctly; shown in percent.
    accuracy = accuracy_score(y_test,y_pred)*100
    # Pin the row/column order explicitly so the tick labels below are
    # guaranteed to line up with the matrix entries.
    confusion_mat = confusion_matrix(y_test,y_pred, labels=class_labels)

    print("Accuracy for Neural Network is:",accuracy)
    print("Confusion Matrix")
    print(confusion_mat)

    matrix_df = pd.DataFrame(confusion_mat)

    fig, ax = plt.subplots(figsize=(10,7))

    sns.set(font_scale=1.3)

    sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")

    ax.set_title('Confusion Matrix - MLP')
    ax.set_xlabel("Predicted label", fontsize =15)
    ax.set_xticklabels(class_labels)
    ax.set_ylabel("True Label", fontsize=15)
    ax.set_yticklabels(class_labels, rotation = 0)

    ###################################################################
    # Compute ROC curve and area under the curve
    ###################################################################
    scores = model.predict_proba(x_test)[:, 1]

    # The positive class is the one whose probability is being scored.
    is_positive = (np.asarray(y_test) == scored_label).astype(int)

    fpr, tpr, thresholds = roc_curve(is_positive, scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, lw=1, label='ROC (area = %0.2f)' % (roc_auc))

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')
    plt.legend(loc="lower right")
    plt.grid()
    plt.show()


In [None]:
# Reload the saved workspace from disk and draw the training-quality plots.
workspace = joblib.load('TEST_MODEL_SAVE.pkl')

model_training_quality(workspace)

In [None]:
#""""
def ML_FOM_Calculator(workspace, df_col):
    """Score the simulated test split and the collision data with the model.

    Work in progress: the figure of merit itself is not computed yet, so the
    function always returns 0. It currently only evaluates the class
    probabilities that the calculation will need.

    Parameters
    ----------
    workspace : dict
        Needs 'model' (with a ``feature_names`` attribute) and 'x_test'.
        If it also has 'scaler' (the scaler fitted during training), that
        scaler is applied to the collision data before prediction.
    df_col : pandas.DataFrame
        Collision data; must contain every column in ``model.feature_names``
        and no missing values in those columns.

    Returns
    -------
    int : always 0 (placeholder).
    """
    import warnings

    model = workspace['model']
    x_test = workspace['x_test']

    # Get the training vars
    training_vars = model.feature_names
    df_col_tmp = df_col[training_vars]

    print(training_vars)

    #### FOR SP
    # x_test is used exactly as stored in the workspace; no scaler is
    # refitted here.
    x_sp_proba = model.predict_proba(x_test)

    #### FOR COLLISION
    # The collision data must go through the same transformation that was
    # fitted on the training split.
    scaler = workspace.get('scaler')
    if scaler is None:
        warnings.warn("workspace has no 'scaler'; collision data is passed to "
                      "the model unscaled, so its predictions are unreliable")
        x_col = df_col_tmp
    else:
        x_col = scaler.transform(df_col_tmp)

    x_col_proba = model.predict_proba(x_col)

    # TODO: compute the figure of merit from x_sp_proba / x_col_proba.
    # Sketch from the original notes:
    #sp998= sp_data["spmode"]== "998"
    #N_bkg = len(sp_998_df[sp998]) ## total number of background events (sp 998)
    #signal_before= len(sp_999_df)
    #signal_after= len(sp_999_df)
    #efficiency = signal_after/signal_before ## the accuracy of the model after training with the SP
    #fom = efficiency(threshold)/(np.sqrt(N_bkg(threshold)+sig_disc/2))
    #return fom
    return 0
#"""

In [None]:
# Drop collision rows with missing values before scoring them. Reassigning
# instead of mutating in place keeps the cell safe to re-run.
df_col = df_col.dropna()
ML_FOM_Calculator(workspace, df_col)

In [None]:
# Pull the model and selected arrays back out of the workspace, then show
# the test labels.
model = workspace['model']

x_train, y_train, y_test = (
    workspace[name] for name in ('x_train', 'y_train', 'y_test')
)

#x_train
#y_train
y_test

In [None]:
# Disabled debugging leftover. `model` is the same object as
# workspace['model'], so this assignment replaced the trained model's feature
# list and made ML_FOM_Calculator look up a column named 'hi' on any re-run.
#model.feature_names = ['hi']


In [None]:
# Show the feature list currently attached to the model.
model.feature_names