# Analyzing Training, Validation, and Testing Sets
Plotting some distributions of the test set.

In [3]:
import sys
import os
import h5py
from collections import Counter
from progressbar import *
import re
import numpy as np
from scipy import signal
import matplotlib
#from watchmal.testing.repeating_classifier_training_utils import *
from functools import reduce

# Add the path to the parent directory to augment search for module
par_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

if par_dir not in sys.path:
    sys.path.append(par_dir)

sys.path.append("../..")
sys.path.append("..")

from testing.repeating_classifier_training import *

%load_ext autoreload
%matplotlib inline
%autoreload 2

from IPython.display import HTML

In [4]:
# HTML('''<script>
# code_show=true; 
# function code_toggle() {
#  if (code_show){
#  $('div.input').hide();
#  } else {
#  $('div.input').show();
#  }
#  code_show = !code_show
# } 
# $( document ).ready(code_toggle);
# </script>
# The raw code for this IPython notebook is by default hidden for easier reading.
# To toggle on/off the raw code, click <a href="javascript:code_toggle()">here</a>.''')

In [5]:
original_data_path = "/data/WatChMaL/data/IWCD_mPMT_Short_e-mu-gamma_E0to1000MeV_digihits.h5"
f = h5py.File(original_data_path, "r")

labels = np.array(f['labels'])

## Load dataset

In [6]:
# Import test events from h5 file
"""
# No filtered indices I don't think
filtered_index_file = "/fast_scratch/WatChMaL/data/IWCD_fulltank_300_pe_idxs.npz"
filtered_indices = np.load(filtered_index_file, allow_pickle=True)
test_filtered_indices = filtered_indices['test_idxs']

"""

original_data_path = "/data/WatChMaL/data/IWCD_mPMT_Short_e-mu-gamma_E0to1000MeV_digihits.h5"
f = h5py.File(original_data_path, "r")

print(f.keys())

hdf5_hit_pmt = f["hit_pmt"]
hdf5_hit_time = f["hit_time"]
hdf5_hit_charge = f["hit_charge"]


hit_pmt = np.memmap(original_data_path, mode="r", shape=hdf5_hit_pmt.shape,
                                    offset=hdf5_hit_pmt.id.get_offset(), dtype=hdf5_hit_pmt.dtype)

hit_time = np.memmap(original_data_path, mode="r", shape=hdf5_hit_time.shape,
                                    offset=hdf5_hit_time.id.get_offset(), dtype=hdf5_hit_time.dtype)

hit_charge = np.memmap(original_data_path, mode="r", shape=hdf5_hit_charge.shape,
                                    offset=hdf5_hit_charge.id.get_offset(), dtype=hdf5_hit_charge.dtype)

angles     = np.array(f['angles'])
energies   = np.array(f['energies'])
positions  = np.array(f['positions'])
labels     = np.array(f['labels'])
root_files = np.array(f['root_files'])

#original_radius = [np.sqrt(original_positions[i,0,0]**2 + original_positions[i,0,2]**2) for i in range(original_positions.shape[0])]


<KeysViewHDF5 ['angles', 'energies', 'event_hits_index', 'event_ids', 'hit_charge', 'hit_pmt', 'hit_time', 'labels', 'positions', 'root_files', 'veto', 'veto2']>


In [9]:
# Set up indices
indices = np.array(range(len(labels)))
# Set up root file set
root_file_set = list(set(root_files))

In [10]:
# Set up dict of file indices
file_dict = {}
for file in root_file_set:
    file_dict[file] = []
print("Dict set")

for idx, root_file in enumerate(root_files):
    file_dict[root_file].append(idx)
print("Done")

Dict set
Done


In [11]:
# Get files associated with each particle type
e_indices     = indices[np.where(labels == 0)]
e_root_file_set = list(set(root_files[e_indices]))

mu_indices    = indices[np.where(labels == 1)]
mu_root_file_set = list(set(root_files[mu_indices]))

gamma_indices = indices[np.where(labels == 2)]
gamma_root_file_set = list(set(root_files[gamma_indices]))

In [35]:
# Define indices retrieval function
def get_indices_for_files(file_names):
    all_indices = []
    for file_name in file_names:
        all_indices.extend(file_dict[file_name])
    return np.array(all_indices)
        

In [37]:
mu_test_files, mu_val_files, mu_train_files = mu_root_file_set[0:400], mu_root_file_set[400:500], mu_root_file_set[500:]

mu_test_set, mu_val_set, mu_train_set = get_indices_for_files(mu_test_files), get_indices_for_files(mu_val_files), get_indices_for_files(mu_train_files)

print(mu_test_set)

[ 165418  165419  165420 ... 5794864 5794865 5794866]


In [40]:
gamma_test_files, gamma_val_files, gamma_train_files = gamma_root_file_set[0:400], gamma_root_file_set[400:500], gamma_root_file_set[500:]

gamma_test_set, gamma_val_set, gamma_train_set = get_indices_for_files(gamma_test_files), get_indices_for_files(gamma_val_files), get_indices_for_files(gamma_train_files)

print(gamma_test_set)

[19118259 19118260 19118261 ... 19721096 19721097 19721098]


In [39]:
e_test_files, e_val_files, e_train_files = e_root_file_set[0:400], e_root_file_set[400:500], e_root_file_set[500:]

e_test_set, e_val_set, e_train_set = get_indices_for_files(e_test_files), get_indices_for_files(e_val_files), get_indices_for_files(e_train_files)

print(e_test_set)

[15599514 15599515 15599516 ... 13539179 13539180 13539181]


In [47]:
# Verify that indices match
all_e_indices = np.concatenate((e_test_set, e_val_set, e_train_set))
print(set(labels[all_e_indices]))

all_gamma_indices = np.concatenate((gamma_test_set, gamma_val_set, gamma_train_set))
print(set(labels[all_gamma_indices]))

all_mu_indices = np.concatenate((mu_test_set, mu_val_set, mu_train_set))
print(set(labels[all_mu_indices]))

{0}
{2}
{1}
20683927


In [48]:
# Verify that all events are uniquely accounted for
all_collected_indices = np.concatenate((e_test_set, e_val_set, e_train_set, gamma_test_set, gamma_val_set, gamma_train_set, mu_test_set, mu_val_set, mu_train_set))

print(len(labels))
print(len(all_collected_indices))
print(len(set(all_collected_indices)))

20683927
20683927
20683927


In [49]:
train_idxs = np.concatenate((e_train_set, mu_train_set, gamma_train_set))
val_idxs   = np.concatenate((e_val_set, mu_val_set, gamma_val_set))
test_idxs  = np.concatenate((e_test_set, mu_val_set, gamma_train_set))

In [None]:
np.savez('representative_indices.npz', train_idxs=train_idxs, val_idxs=val_idxs, test_idxs=test_idxs)

In [None]:
#np.savez('equal_indices.npz', train_idxs=train_idxs, val_idxs=val_idxs, test_idxs=test_idxs)