## Bellis's Lecture

https://colab.research.google.com/drive/12LHs9cL8-gXKr_ypaNfnwaO9bLF3I9tQ?usp=sharing

In [None]:
%load_ext autoreload
%autoreload 2


import uproot
import awkward as ak
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
import seaborn as sns

import time

from hist import Hist

import babar_analysis_tools as bat
from analysis_variables import *
import myPIDselector

In [None]:
## Top-level data directory (pick the line for the machine you are on).
## My laptop
topdir= "/Users/josieswann/BaBar_analyses/BNV_pLambda/"

## Bellis computer
#topdir= "/home/bellis/babar_data/bnv_plambda"

## --- SP (background + signal modes) sample ---
filename= f"{topdir}/Background_and_signal_SP_modes_Only_Run_1.parquet"
#filename= f"{topdir}/Background_and_signal_SP_modes_All_runs.parquet" ## this won't run on mine

start= time.time()
data= ak.from_parquet(filename)
elapsed= time.time()-start
print(f"Took {elapsed} seconds")

IS_MC= True

## --- Collision data (BLINDED files) ---
## Alternatives:
#filename = f'{topdir}/Background_SP_modes_Only_Run_1.parquet'
#filename = f'{topdir}/Data_All_runs_BLINDED.parquet'
filename = f'{topdir}/Data_Only_Run_1_BLINDED.parquet'

start= time.time()
data_collision= ak.from_parquet(filename)
elapsed= time.time()-start
print(f"took {elapsed} seconds")

print(type(data_collision))


## Cross section info - scaling values 

In [None]:
# Tables used later to compute the per-SP-mode scaling values.
dataset_information= pd.read_csv("dataset_statistics.csv")
cs_data= pd.read_csv("SP_cross_sections_and_labels.csv")

# Compact view of the cross-section table: hide the uncertainty column and
# the free-text note column (whose header is the long sentence below).
note_columns= [
    "Uncertainty",
    "Note: cross sections found at https://babar-wiki.heprc.uvic.ca/bbr_wiki/index.php/Physics/Cross_sections,_luminosities,_and_other_vital_stats",
]
no_notes= cs_data.drop(columns= note_columns)
no_notes

## SP info and Region Defs

In [None]:
# SP-mode label of every event in the simulated sample, and the distinct
# values that occur.
sp= data["spmode"]

splist= np.unique(ak.to_list(sp))
splist

In [None]:
# Display the region definitions that are passed to bat.get_final_masks below.
# (Not defined in this notebook — presumably provided by the star import
# from analysis_variables; confirm.)
region_definitions

## Tag Side B (ask about this)

In [None]:
# Call the babar_analysis_tools helper on the simulated sample, then display
# the "BtagSideMes" field. The helper's return value is ignored, so it
# presumably adds that field to `data` in place — confirm in bat.
bat.fill_new_entry_with_tag_side_B(data)
data["BtagSideMes"]

In [None]:
# Empty histograms for every entry in hist_defs.
all_hists= bat.create_empty_histograms(hist_defs)

# SP modes treated as background and as signal.
bkg_spmodes= ["998","1005","3981","1235","1237"]
sig_spmodes= ["-999"]

spmodes= bkg_spmodes+sig_spmodes

# Scaling value for each SP mode. A comprehension is used so that no loop
# variable leaks into the notebook namespace: the original `for sp in ...`
# loop overwrote the awkward array `sp` created in the SP-info cell.
weights= {
    mode: bat.scaling_value(int(mode), dataset_information=dataset_information,
                            cs_data= cs_data, plot= False, verbose= False)
    for mode in spmodes
}

weights["-999"]= 1000 # scales signal higher
# TODO(ask): purpose of the "0" entry is undocumented — presumably the
# spmode value carried by collision data; confirm.
weights["0"]= 1

print(weights)

## Making the masks 

In [None]:
# Selection masks for the simulated sample, keyed by an integer cut index.
dcuts= bat.get_final_masks(data, region_definitions= region_definitions)

print([dcuts.keys()])
print()

# One line per cut: its key followed by its name.
for key, cut in dcuts.items():
    print(f'{key:3d} {cut["name"]}')

dcuts[3]

In [None]:
# Which entry of dcuts to apply. 1, 2, 3 and 4 are individual cuts and
# -1 is all cuts; several can also be combined, e.g.
#   dcuts[2]["event"] & dcuts[3]["event"] & dcuts[4]["event"]
cut_index= 3
mask_event= dcuts[cut_index]["event"]

# Label used in the output file name ("FINAL_CUTS" is the other value used
# in this notebook).  ### ASK WHAT THESE MEAN
tag= "EARLY_CUT"

mask= mask_event

In [None]:
# Variables copied from the awkward array into a flat pandas DataFrame.
subset = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass', \
          'BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS', \
          'R2', 'R2All', \
          'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll', 'sphericityAll', \
          'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2', 'BR2ROE', 'BSphrROE', \
          'BThrustROE']

ak_array_type= type(data["spmode"])

# Apply the event mask once; it does not depend on the variable, so slicing
# inside the loop (twice per variable in the original) was wasted work.
selected= data[mask]

df_dict={}
for var in subset:
    column= selected[var]
    # Fields whose entries are themselves arrays are flattened to one value
    # per row; fields with plain scalar entries are used as they are.
    if type(column[0]) == ak_array_type:
        column= ak.flatten(column)
    df_dict[var] = column

df_out= pd.DataFrame.from_dict(df_dict)

# Keep a copy on disk, named after the current cut tag.
outfilename= f"output_variables_{tag}.parquet"
df_out.to_parquet(outfilename)

df= df_out

df_out



In [None]:
# Number of non-null entries in every column, per SP mode.
df.groupby("spmode").count()

In [None]:
# Column names of the flattened frame.
df.columns

In [None]:
# Dtypes, non-null counts and memory use.
df.info()

In [None]:
# Pair plot of mES and DeltaE for the signal mode ("-999") only.
# The mask is not called `filter`, so the Python builtin stays usable.
sig_only= df["spmode"]== "-999"

# Plot at most 500 randomly chosen rows. The seed makes the figure
# reproducible, and the min() avoids an error when fewer rows survive the cut.
n_points= min(500, int(sig_only.sum()))

g= sns.PairGrid(df[sig_only].sample(n_points, random_state= 0), vars= ["BpostFitMes","BpostFitDeltaE"], hue= "spmode")
g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)

In [None]:
# Keep the column index in `columns`; later cells slice it to pick variables.
columns= df.columns
columns

In [None]:
# Pair plot of the first five non-label columns for everything except the
# signal mode. The mask is not called `filter`, so the builtin stays usable.
bkg_only = df['spmode'] != '-999'

# Plot at most 50 randomly chosen rows, seeded so the figure is reproducible.
n_points = min(50, int(bkg_only.sum()))

g = sns.PairGrid(df[bkg_only].sample(n_points, random_state=0), vars=columns[1:6], hue='spmode')

g.map_diag(sns.histplot)
g.map_offdiag(sns.scatterplot)
g.add_legend()

## Neural Net

In [None]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix, accuracy_score 
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import StandardScaler


In [None]:
# Every column after the first one (position 0 is "spmode").
feature_names= columns[1:] ##exclude spmode
print(feature_names)

In [None]:
# Rows per SP mode; any column would do for the count, "R2" is just one of them.
df.groupby("spmode").count()["R2"] ## R2 doesn't matter we just want to see how many of each sp mode are there

In [None]:
# Build a balanced signal ("-999") versus background ("998") sample.
filter_sig= df["spmode"]== "-999"
filter_bkg= df["spmode"]== "998"

# Rows with missing values are removed before sampling. The draws are seeded:
# the split and the network below are already seeded, so an unseeded draw
# here was the one step that changed the result from run to run.
n_per_class= 6000
df_sig= df[filter_sig].dropna().sample(n_per_class, random_state= 4)
df_bkg= df[filter_bkg].dropna().sample(n_per_class, random_state= 4)

df_ML= pd.concat([df_sig,df_bkg])

# Features: everything except the truth label, the two fit variables,
# the Lambda0 mass and the tag-side mES.
x= df_ML.drop(columns= ["spmode","BpostFitMes","BpostFitDeltaE","Lambda0_unc_Mass","BtagSideMes"])

# Truth label (SP mode string) for every row of x.
y=df_ML["spmode"]

In [None]:
feature_names= x.columns ## discriminating variables
labels= y.unique() ## distinct SP modes, in order of first appearance

# Print each heading, its values, then a blank line.
for heading, values in (("Training features:", feature_names),
                        ("Labels (Outcome):", labels)):
    print(heading)
    print(values)
    print()

print("The dataset (x) is the numbers without column names---")
print("The variable y is truth info about the data (signal or bkg)")

In [None]:
# Hold out 40% of the rows for testing; the seed fixes the split.
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= 0.4, random_state= 4)


In [None]:
# Feature standardisation, currently disabled: the code sits inside a string
# literal, so it is displayed but never executed.
# NOTE(review): fit_transform returns a NumPy array, while later cells use
# x_test.copy() and add a column to it, so enabling this as written needs
# those cells adapted.
'''
scaler= StandardScaler()
x_train= scaler.fit_transform(x_train)
x_test= scaler.transform(x_test)
'''

In [None]:
# Truth labels of the training rows.
y_train

In [None]:
# Training rows per class.
print(len(y_train[y_train=='-999']))
print(len(y_train[y_train=='998']))

## This should be about half and half since we used the same amount of data for each case (sig and bkg)

In [None]:
# Making the Neural Network Classifier
model = MLPClassifier(max_iter= 300, random_state= 3, activation= "relu", solver= "adam") #n_iter_no_change= 15)

# Training the model on the training data and labels
model.fit(x_train, y_train)

In [None]:
# Testing the model i.e. predicting the labels of the test data.
y_pred = model.predict(x_test)

# Evaluating the results of the model
accuracy = accuracy_score(y_test,y_pred)*100 ### returns the fraction of correctly classified samples 
confusion_mat = confusion_matrix(y_test,y_pred)

In [None]:
print("Accuracy for Neural Network is:",accuracy)
print("Confusion Matrix")
print(confusion_mat)

# For the two-class matrix: the diagonal holds the correctly classified rows,
# the off-diagonal the misclassified ones.
tot_correct= confusion_mat[0, 0] + confusion_mat[1, 1]
tot_wrong= confusion_mat[0, 1] + confusion_mat[1, 0]

## accuracy == tot_correct / (tot_correct + tot_wrong), expressed in percent


In [None]:
# Turn this into a dataframe
matrix_df = pd.DataFrame(confusion_mat)

# Plot the result
fig, ax = plt.subplots(figsize=(10,7))

sns.set(font_scale=1.3)

sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")

#labels = df['target_names'].tolist()
#labels = ['998', '-999'] # NEED TO FIX THIS SO IT IS NOT HARDCODED

# Formatting details here
# Set axis titles
ax.set_title('Confusion Matrix - MLP')
ax.set_xlabel("Predicted label", fontsize =15)
ax.set_xticklabels(labels)
ax.set_ylabel("True Label", fontsize=15)
ax.set_yticklabels(labels, rotation = 0)
plt.show()

In [None]:

# Classifier response per truth class, for the training and the testing sample.
# Column 1 of predict_proba is the probability of model.classes_[1].
# After the loop: decisions = [bkg-train, sig-train, bkg-test, sig-test].
# The loop variables are not called X / y, so the global label series `y`
# is not overwritten.
decisions = []
for features, truth in ((x_train, y_train), (x_test, y_test)):
    decisions.append(model.predict_proba(features[truth == '998'])[:, 1])
    decisions.append(model.predict_proba(features[truth == '-999'])[:, 1])

# Common histogram range covering all four samples.
low = min(np.min(d) for d in decisions)
high = max(np.max(d) for d in decisions)
low_high = (low, high)

# Training samples: filled, normalised histograms.
bins = 50
plt.figure(figsize=(12, 6))
plt.hist(decisions[0],
          color='r', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Bkg (train)')
plt.hist(decisions[1],
          color='b', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Sig (train)')

# Testing samples: points with error bars. The density is rescaled to raw
# counts, sqrt(counts) is taken, and the result is scaled back. Each sample
# uses its OWN length for the rescaling (the original used the background
# length for the signal points as well).
for test_scores, colour, legend in ((decisions[2], 'r', 'Bkg (test)'),
                                    (decisions[3], 'b', 'Sig (test)')):
    hists, edges = np.histogram(test_scores, density=True,
                                bins=bins, range=low_high)
    scale = len(test_scores) / sum(hists)
    err = np.sqrt(hists * scale) / scale
    center = (edges[:-1] + edges[1:]) / 2
    plt.errorbar(center, hists, yerr=err, fmt='o', c=colour, label=legend)

plt.xlabel("Classifer output")
plt.ylabel("Arbitrary units")
plt.legend(loc='best')

In [None]:
print(y_test)

# Integer coding of the truth labels for the ROC curve:
# 0 where the label is '-999', 1 everywhere else.
sig_bkg = np.where(y_test == '-999', 0, 1)

print(sig_bkg)

In [None]:
# Score of every held-out row (column 1 of predict_proba).
decisions = model.predict_proba(x_test)[:, 1]

# ROC curve against the 0/1 coding in sig_bkg, and the area under it.
fpr, tpr, thresholds = roc_curve(sig_bkg, decisions)
roc_auc = auc(fpr, tpr)

plt.figure(figsize=(8,6))
plt.plot(fpr, tpr, lw=1, label=f'ROC (area = {roc_auc:0.2f})')

# Diagonal reference line
plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')
plt.xlim(-0.05, 1.05)
plt.ylim(-0.05, 1.05)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.grid()
plt.show()


In [None]:
# Distinct truth labels present in the held-out sample.
y_test.unique()

In [None]:
print(y_test.values)

# Held-out features plus their truth label in one frame (x_test itself is
# left untouched; assign returns a new frame).
df_plot = x_test.assign(spmode=y_test.values)

print(len(x_test), len(y_test))
print(len(df_plot))

In [None]:
# One normalised histogram per training feature, split by SP mode.
# The label column itself is skipped (it is the hue, not a feature), the grid
# is sized from the number of features, and unused panels are hidden.
feature_cols = [name for name in df_plot.columns if name != 'spmode']

ncols = 4
nrows = max(1, -(-len(feature_cols) // ncols))   # ceiling division

fig, axes = plt.subplots(nrows=nrows, ncols=ncols, squeeze=False)  # 2D array of axes
axes = axes.flatten()                                              # 1D, length nrows*ncols
fig.set_size_inches(15, 15)

for panel, col in zip(axes, feature_cols):
    sns.histplot(df_plot, x=col, ax=panel, hue='spmode', stat='density', common_norm=False)
    panel.set_title(col)

# Blank out the panels that did not receive a feature.
for panel in axes[len(feature_cols):]:
    panel.set_visible(False)

plt.tight_layout()

# Bellis's suggestions for next steps

In [None]:
# Same flattening as for the early-cut frame, now with all cuts applied.
mask_event= dcuts[-1]["event"] ## all cuts

tag= "FINAL_CUTS"

mask= mask_event

subset = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass', \
          'BtagSideMes', 'BSphr', 'BThrust', 'BCosThetaS', \
          'R2', 'R2All', \
          'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll', 'sphericityAll', \
          'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2', 'BR2ROE', 'BSphrROE', \
          'BThrustROE']

ak_array_type= type(data["spmode"])

# Apply the event mask once instead of re-slicing for every variable.
selected= data[mask]

df_dict={}
for var in subset:
    column= selected[var]
    # Flatten fields whose entries are arrays; scalar fields are used as-is.
    if type(column[0]) == ak_array_type:
        column= ak.flatten(column)
    df_dict[var] = column

df_final= pd.DataFrame.from_dict(df_dict)

# Keep a copy on disk, named after the cut tag.
outfilename= f"output_variables_{tag}.parquet"
df_final.to_parquet(outfilename)

df_final



In [None]:
# Rows per SP mode after all cuts; "R2" is just a convenient column to count.
df_final.groupby("spmode").count()["R2"] ## R2 doesn't matter we just want to see how many of each sp mode are there

In [None]:
# Columns that are not inputs to the classifier.
non_training_columns= ["spmode","BpostFitMes","BpostFitDeltaE","Lambda0_unc_Mass","BtagSideMes"]
features_final= df_final.drop(columns= non_training_columns)

# Background ('998') and signal ('-999') rows after all cuts, complete rows only.
mask_final_sp = df_final['spmode']=='998'
x_final_bkg= features_final[mask_final_sp].dropna()

mask_final_sp = df_final['spmode']=='-999'
x_final_sig= features_final[mask_final_sp].dropna()

# Class probabilities from the trained network (one column per class).
proba_final_bkg = model.predict_proba(x_final_bkg)
proba_final_sig = model.predict_proba(x_final_sig)

proba_final_bkg

In [None]:
# Normalised distribution of predict_proba column 0 for each truth class.
for proba, name in ((proba_final_bkg, 'bkg'), (proba_final_sig, 'sig')):
    plt.hist(proba[:,0],bins=20, range=(0,1), alpha=0.5, label=name, density=True)

plt.legend();

## Underfitting

In [None]:
# Balanced signal ("-999") versus background ("998") sample, smaller than before.
filter_sig= df["spmode"]== "-999"
filter_bkg= df["spmode"]== "998"

# Seeded draws, so the sample (and everything trained on it) is reproducible.
n_per_class= 3000
df_sig= df[filter_sig].dropna().sample(n_per_class, random_state= 4)
df_bkg= df[filter_bkg].dropna().sample(n_per_class, random_state= 4)

df_ML= pd.concat([df_sig,df_bkg])

# NOTE(review): unlike the first training, "BtagSideMes" is NOT dropped here,
# so it is one of the inputs to this model — confirm that is intended.
x= df_ML.drop(columns= ["spmode","BpostFitMes","BpostFitDeltaE","Lambda0_unc_Mass",
                       ])

y=df_ML["spmode"]

feature_names= x.columns ##disc vars
labels= y.unique() ##diff sp modes

print("Training features:")
print(feature_names)
print()

print("Labels (Outcome):")
print(labels)
print()

print("The dataset (x) is the numbers without column names---")
print("The variable y is truth info about the data (signal or bkg)")


In [None]:
# Hold out only 6% of the rows for testing; the seed fixes the split.
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= 0.06, random_state= 4)


In [None]:
# Making the Neural Network Classifier
model = MLPClassifier(max_iter= 50, random_state= 3, activation= "identity", solver= "adam", hidden_layer_sizes=2) #n_iter_no_change= 15)

# Training the model on the training data and labels
model.fit(x_train, y_train)

In [None]:
# Testing the model i.e. predicting the labels of the test data.
y_pred = model.predict(x_test)

# Evaluating the results of the model
accuracy = accuracy_score(y_test,y_pred)*100 ### returns the fraction of correctly classified samples 
confusion_mat = confusion_matrix(y_test,y_pred)

# Collision data stuff

In [None]:
# Quick look at the collision-data array loaded at the top of the notebook.
print(data_collision)

In [None]:
# Reading the fields directly did not plot (see Bellis edits): they have to be
# flattened first.

#############################################################
# Bellis edits
#############################################################
# Un-flattened fields of the simulated sample
BPFM= data["BpostFitMes"]
BPFDE= data["BpostFitDeltaE"]

# Flattened fields of the simulated sample
BPFM_sp= ak.flatten(BPFM)
BPFDE_sp= ak.flatten(BPFDE)

# Flattened fields of the collision data
BPFM_coll= ak.flatten(data_collision["BpostFitMes"])
BPFDE_coll= ak.flatten(data_collision["BpostFitDeltaE"])
#############################################################

print(type(BPFM_coll))

plt.figure(figsize= (16,8))

# (subplot position, title, values, histogram range, x label)
panels= (
    (1, "B post fit MES", BPFM_coll, (3.5,5.5), "Mass [GeV/c^2]"),
    (2, "B post fit Delta E", BPFDE_coll, (-1,1), "E [GeV]"),
)
for position, title, values, value_range, xlabel in panels:
    plt.subplot(1,2,position)
    plt.title(title)
    plt.hist(values, bins= 100, range= value_range);
    plt.xlabel(xlabel)

print(BPFM_coll)

In [None]:
# type(BPFM) is not the last expression of the cell, so nothing is displayed for it.
type(BPFM)
# Reload the autoreload extension that was loaded at the top of the notebook.
%reload_ext autoreload

In [None]:
#bat.plot_mes_vs_DeltaE(BPFM_coll, BPFDE_coll)


import hist as hist


In [None]:
from hist import Hist

In [None]:
# 2D histogram of the flattened collision data: first axis filled with
# BpostFitMes, second axis with BpostFitDeltaE.
plt.figure(figsize=(8, 8))

h= Hist(
    hist.axis.Regular(400,3,7,name= "BPFM", label= "mass [GeV/c^2]", flow= True),
    hist.axis.Regular(350,-.75,1,name= "BPFMDE", label= "energy [GeV]", flow= True),
)

# normal fill
h.fill(BPFM_coll, BPFDE_coll)

# Styling is given separately for the main, top and side parts of the plot.
h.plot2d_full(
    main_cmap="coolwarm",
    top_ls="--",
    top_color="orange",
    top_lw=2,
    side_ls=":",
    side_lw=2,
    side_color="steelblue",
)

# NOTE(review): plt.xlim / plt.ylim act on whichever axes is current after
# plot2d_full — confirm the zoom lands on the main 2D panel.
plt.xlim(5.1,5.3)
plt.ylim(-.5,.5)
plt.show()

In [None]:
# Selection masks for the COLLISION data. This rebinds `dcuts`, so from here
# on it no longer refers to the masks of the simulated sample.
dcuts= bat.get_final_masks(data_collision, region_definitions= region_definitions)

print([dcuts.keys()])
print()

# One line per cut: its key followed by its name.
for key, cut in dcuts.items():
    print(f'{key:3d} {cut["name"]}')

dcuts[3]

In [None]:
# Flatten the selected collision events into a pandas DataFrame.
# Note: this list has no 'BtagSideMes' entry, unlike the lists used for the
# simulated sample.
subset = ['spmode', 'BpostFitMes', 'BpostFitDeltaE', 'Lambda0_unc_Mass', \
          'BSphr', 'BThrust', 'BCosThetaS', \
          'R2', 'R2All',\
          'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll', 'sphericityAll', \
          'BCosSphr', 'BCosThetaT', 'BCosThrust', 'BLegendreP2', 'BR2ROE', 'BSphrROE', \
          'BThrustROE']

ak_array_type= type(data_collision["spmode"])

mask_event= dcuts[-1]["event"] ## all cuts
mask= mask_event

# Apply the event mask once instead of re-slicing for every variable.
selected= data_collision[mask]

df_dict={}
for var in subset:
    column= selected[var]
    # Flatten fields whose entries are arrays; scalar fields are used as-is.
    if type(column[0]) == ak_array_type:
        column= ak.flatten(column)
    df_dict[var] = column

df_out= pd.DataFrame.from_dict(df_dict)

# Write under a collision-specific name. Reusing f"output_variables_{tag}.parquet"
# would overwrite the file written for the simulated sample with the same tag.
outfilename= f"output_variables_collision_{tag}.parquet"
df_out.to_parquet(outfilename)

# `df` is deliberately NOT rebound to this frame: the training cells further
# down still read the simulated-sample frame through `df` (they select spmode
# "-999"/"998" and drop "BtagSideMes", which this frame does not contain).

df_out



In [None]:

import hist as hist
from hist import Hist

# Same 2D histogram as above, filled from the flattened frame df_out
# (BpostFitMes on the first axis, BpostFitDeltaE on the second).
plt.figure(figsize=(8, 8))

h= Hist(
    hist.axis.Regular(400,3,7,name= "BPFM", label= "mass [GeV/c^2]", flow= True),
    hist.axis.Regular(350,-.75,1,name= "BPFMDE", label= "energy [GeV]", flow= True),
)

# normal fill
h.fill(df_out["BpostFitMes"], df_out["BpostFitDeltaE"])

# Styling is given separately for the main, top and side parts of the plot.
h.plot2d_full(
    main_cmap="coolwarm",
    top_ls="--",
    top_color="orange",
    top_lw=2,
    side_ls=":",
    side_lw=2,
    side_color="steelblue",
)

# NOTE(review): plt.xlim / plt.ylim act on whichever axes is current after
# plot2d_full — confirm the zoom lands on the main 2D panel.
plt.xlim(5.1,5.3)
plt.ylim(-.5,.5)
plt.show()


In [None]:
# Classifier inputs for the frame in df_out: remove the label, the two fit
# variables and the Lambda0 mass.
df_test= df_out.drop(columns= ["spmode","BpostFitMes","BpostFitDeltaE","Lambda0_unc_Mass"])

In [None]:
# Apply the trained classifier to the collision-data features in df_test.
# Rows with missing values are removed first, as is done before the other
# predict calls in this notebook.
# NOTE(review): df_test must hold exactly the columns the current `model` was
# fitted on. The collision frame has no "BtagSideMes" column, so a model
# trained with that feature cannot be applied here — confirm which model is
# meant to be used.
collision_features = df_test.dropna()
y_pred = model.predict(collision_features)

# The original cell left `y_test = ` unfinished (a SyntaxError). There is
# presumably no truth label to score collision events against (confirm what
# `spmode` holds for data), so accuracy and the confusion matrix are not
# recomputed: `y_test`, `accuracy` and `confusion_mat` keep their values from
# the last evaluation on simulated events. Show how many events fall in each
# predicted class instead.
pd.Series(y_pred, name="predicted spmode").value_counts()

In [None]:
print("Accuracy for Neural Network is:",accuracy)
print("Confusion Matrix")
print(confusion_mat)

# For the two-class matrix: the diagonal holds the correctly classified rows,
# the off-diagonal the misclassified ones.
tot_correct= confusion_mat[0, 0] + confusion_mat[1, 1]
tot_wrong= confusion_mat[0, 1] + confusion_mat[1, 0]

## accuracy == tot_correct / (tot_correct + tot_wrong), expressed in percent


In [None]:
# Turn this into a dataframe
matrix_df = pd.DataFrame(confusion_mat)

# Plot the result
fig, ax = plt.subplots(figsize=(10,7))

sns.set(font_scale=1.3)

sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")

#labels = df['target_names'].tolist()
#labels = ['998', '-999'] # NEED TO FIX THIS SO IT IS NOT HARDCODED

# Formatting details here
# Set axis titles
ax.set_title('Confusion Matrix - MLP')
ax.set_xlabel("Predicted label", fontsize =15)
ax.set_xticklabels(labels)
ax.set_ylabel("True Label", fontsize=15)
ax.set_yticklabels(labels, rotation = 0)
plt.show()

In [None]:

# Classifier response per truth class, for the training and the testing sample.
# Column 1 of predict_proba is the probability of model.classes_[1].
# After the loop: decisions = [bkg-train, sig-train, bkg-test, sig-test].
# The loop variables are not called X / y, so the global label series `y`
# is not overwritten.
decisions = []
for features, truth in ((x_train, y_train), (x_test, y_test)):
    decisions.append(model.predict_proba(features[truth == '998'])[:, 1])
    decisions.append(model.predict_proba(features[truth == '-999'])[:, 1])

# Common histogram range covering all four samples.
low = min(np.min(d) for d in decisions)
high = max(np.max(d) for d in decisions)
low_high = (low, high)

# Training samples: filled, normalised histograms.
bins = 50
plt.figure(figsize=(12, 6))
plt.hist(decisions[0],
          color='r', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Bkg (train)')
plt.hist(decisions[1],
          color='b', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Sig (train)')

# Testing samples: points with error bars. The density is rescaled to raw
# counts, sqrt(counts) is taken, and the result is scaled back. Each sample
# uses its OWN length for the rescaling (the original used the background
# length for the signal points as well).
for test_scores, colour, legend in ((decisions[2], 'r', 'Bkg (test)'),
                                    (decisions[3], 'b', 'Sig (test)')):
    hists, edges = np.histogram(test_scores, density=True,
                                bins=bins, range=low_high)
    scale = len(test_scores) / sum(hists)
    err = np.sqrt(hists * scale) / scale
    center = (edges[:-1] + edges[1:]) / 2
    plt.errorbar(center, hists, yerr=err, fmt='o', c=colour, label=legend)

plt.xlabel("Classifer output")
plt.ylabel("Arbitrary units")
plt.legend(loc='best')

# TRYING TO BIAS/OVERFIT/UNDERFIT

In [None]:
print(f"Accuracy for Neural Network is: {accuracy:.2f}")
print("Confusion Matrix")
print(confusion_mat)

# For the two-class matrix: the diagonal holds the correctly classified rows,
# the off-diagonal the misclassified ones.
tot_correct= confusion_mat[0, 0] + confusion_mat[1, 1]
tot_wrong= confusion_mat[0, 1] + confusion_mat[1, 0]

## accuracy == tot_correct / (tot_correct + tot_wrong), expressed in percent


In [None]:
# Turn this into a dataframe
matrix_df = pd.DataFrame(confusion_mat)

# Plot the result
fig, ax = plt.subplots(figsize=(10,7))

sns.set(font_scale=1.3)

sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")

#labels = df['target_names'].tolist()
#labels = ['998', '-999'] # NEED TO FIX THIS SO IT IS NOT HARDCODED

# Formatting details here
# Set axis titles
ax.set_title('Confusion Matrix - MLP')
ax.set_xlabel("Predicted label", fontsize =15)
ax.set_xticklabels(labels)
ax.set_ylabel("True Label", fontsize=15)
ax.set_yticklabels(labels, rotation = 0)
plt.show()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import svm

In [None]:
# Linear support-vector classifier fitted on the same training split as the network.
clf= svm.SVC(kernel="linear", C=1, random_state=2).fit(x_train,y_train)

In [None]:
# NOTE(review): clf.score is evaluated once on the held-out split, so the
# number printed is a single test-set score, not a cross-validation score
# (cross_val_score is imported above but never called) — confirm the label.
print(f"CV score: {clf.score(x_test,y_test):.2f}")

In [None]:

# Classifier response per truth class, for the training and the testing sample.
# Column 1 of predict_proba is the probability of model.classes_[1].
# After the loop: decisions = [bkg-train, sig-train, bkg-test, sig-test].
# The loop variables are not called X / y, so the global label series `y`
# is not overwritten.
decisions = []
for features, truth in ((x_train, y_train), (x_test, y_test)):
    decisions.append(model.predict_proba(features[truth == '998'])[:, 1])
    decisions.append(model.predict_proba(features[truth == '-999'])[:, 1])

# Common histogram range covering all four samples.
low = min(np.min(d) for d in decisions)
high = max(np.max(d) for d in decisions)
low_high = (low, high)

# Training samples: filled, normalised histograms.
bins = 50
plt.figure(figsize=(12, 6))
plt.hist(decisions[0],
          color='r', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Bkg (train)')
plt.hist(decisions[1],
          color='b', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Sig (train)')

# Testing samples: points with error bars. The density is rescaled to raw
# counts, sqrt(counts) is taken, and the result is scaled back. Each sample
# uses its OWN length for the rescaling (the original used the background
# length for the signal points as well). The bin contents are stored in
# `hists`, not `hist`, so the imported `hist` module is not overwritten.
for test_scores, colour, legend in ((decisions[2], 'r', 'Bkg (test)'),
                                    (decisions[3], 'b', 'Sig (test)')):
    hists, edges = np.histogram(test_scores, density=True,
                                bins=bins, range=low_high)
    scale = len(test_scores) / sum(hists)
    err = np.sqrt(hists * scale) / scale
    center = (edges[:-1] + edges[1:]) / 2
    plt.errorbar(center, hists, yerr=err, fmt='o', c=colour, label=legend)

plt.xlabel("Classifer output")
plt.ylabel("Arbitrary units")
plt.legend(loc='best')

## BIAS

In [None]:
# Balanced signal ("-999") versus background ("998") sample with most of the
# discriminating variables removed.
filter_sig= df["spmode"]== "-999"
filter_bkg= df["spmode"]== "998"

# Seeded draws, so the sample (and everything trained on it) is reproducible.
n_per_class= 3000
df_sig= df[filter_sig].dropna().sample(n_per_class, random_state= 4)
df_bkg= df[filter_bkg].dropna().sample(n_per_class, random_state= 4)

df_ML= pd.concat([df_sig,df_bkg])

# Everything listed here is removed; only the columns not listed remain as features.
x= df_ML.drop(columns= ["spmode","BpostFitMes","BpostFitDeltaE","Lambda0_unc_Mass",\
                       'thrustMag', 'thrustMagAll', 'thrustCosTh', 'thrustCosThAll',\
                        'sphericityAll', 'BCosSphr', 'BCosThetaT', 'BCosThrust', \
                        'BLegendreP2','BR2ROE', 'BSphrROE', 'BThrustROE',"R2",\
                        "BtagSideMes", "BSphr"
                       ])

y=df_ML["spmode"]

feature_names= x.columns ##disc vars
labels= y.unique() ##diff sp modes

print("Training features:")
print(feature_names)
print()

print("Labels (Outcome):")
print(labels)
print()

print("The dataset (x) is the numbers without column names---")
print("The variable y is truth info about the data (signal or bkg)")


In [None]:
# Hold out only 6% of the rows for testing; the seed fixes the split.
x_train, x_test, y_train, y_test= train_test_split(x,y, test_size= 0.06, random_state= 4)


In [None]:
# Making the Neural Network Classifier
model = MLPClassifier(max_iter= 50, random_state= 3, activation= "identity", solver= "adam", hidden_layer_sizes=3) #n_iter_no_change= 15)

# Training the model on the training data and labels
model.fit(x_train, y_train)

In [None]:
# Testing the model i.e. predicting the labels of the test data.
y_pred = model.predict(x_test)

# Evaluating the results of the model
accuracy = accuracy_score(y_test,y_pred)*100 ### returns the fraction of correctly classified samples 
confusion_mat = confusion_matrix(y_test,y_pred)

In [None]:
print(f"Accuracy for Neural Network is: {accuracy:.2f}")
print("Confusion Matrix")
print(confusion_mat)

# For the two-class matrix: the diagonal holds the correctly classified rows,
# the off-diagonal the misclassified ones.
tot_correct= confusion_mat[0, 0] + confusion_mat[1, 1]
tot_wrong= confusion_mat[0, 1] + confusion_mat[1, 0]

## accuracy == tot_correct / (tot_correct + tot_wrong), expressed in percent


In [None]:
# Turn this into a dataframe
matrix_df = pd.DataFrame(confusion_mat)

# Plot the result
fig, ax = plt.subplots(figsize=(10,7))

sns.set(font_scale=1.3)

sns.heatmap(matrix_df, annot=True, fmt="g", ax=ax, cmap="magma")

#labels = df['target_names'].tolist()
#labels = ['998', '-999'] # NEED TO FIX THIS SO IT IS NOT HARDCODED

# Formatting details here
# Set axis titles
ax.set_title('Confusion Matrix - MLP')
ax.set_xlabel("Predicted label", fontsize =15)
ax.set_xticklabels(labels)
ax.set_ylabel("True Label", fontsize=15)
ax.set_yticklabels(labels, rotation = 0)
plt.show()

In [None]:
# Linear support-vector classifier fitted on the same training split as the network.
clf= svm.SVC(kernel="linear", C=1, random_state=2).fit(x_train,y_train)

In [None]:
# NOTE(review): clf.score is evaluated once on the held-out split, so the
# number printed is a single test-set score, not a cross-validation score —
# confirm the label.
print(f"CV score: {clf.score(x_test,y_test):.2f}")

In [None]:

# Classifier response per truth class, for the training and the testing sample.
# Column 1 of predict_proba is the probability of model.classes_[1].
# After the loop: decisions = [bkg-train, sig-train, bkg-test, sig-test].
# The loop variables are not called X / y, so the global label series `y`
# is not overwritten.
decisions = []
for features, truth in ((x_train, y_train), (x_test, y_test)):
    decisions.append(model.predict_proba(features[truth == '998'])[:, 1])
    decisions.append(model.predict_proba(features[truth == '-999'])[:, 1])

# Common histogram range covering all four samples.
low = min(np.min(d) for d in decisions)
high = max(np.max(d) for d in decisions)
low_high = (low, high)

# Training samples: filled, normalised histograms.
bins = 50
plt.figure(figsize=(12, 6))
plt.hist(decisions[0],
          color='r', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Bkg (train)')
plt.hist(decisions[1],
          color='b', alpha=0.5, range=low_high, bins=bins,
          histtype='stepfilled', density=True,
          label='Sig (train)')

# Testing samples: points with error bars. The density is rescaled to raw
# counts, sqrt(counts) is taken, and the result is scaled back. Each sample
# uses its OWN length for the rescaling (the original used the background
# length for the signal points as well). The bin contents are stored in
# `hists`, not `hist`, so the imported `hist` module is not overwritten.
for test_scores, colour, legend in ((decisions[2], 'r', 'Bkg (test)'),
                                    (decisions[3], 'b', 'Sig (test)')):
    hists, edges = np.histogram(test_scores, density=True,
                                bins=bins, range=low_high)
    scale = len(test_scores) / sum(hists)
    err = np.sqrt(hists * scale) / scale
    center = (edges[:-1] + edges[1:]) / 2
    plt.errorbar(center, hists, yerr=err, fmt='o', c=colour, label=legend)

plt.xlabel("Classifer output")
plt.ylabel("Arbitrary units")
plt.legend(loc='best')