## NB to create the DECO dataset (from the deconvolution segmentation net output)

In [1]:
import glob
import sys
import pandas as pd
import tables as tb
import numpy as np
from invisible_cities.io.dst_io import df_writer

In [2]:
import networkx as nx
from scipy.spatial import KDTree
from sklearn.decomposition import PCA

In [3]:
def create_graph(df, max_distance, coords):
    '''
    Creates a graph using a KDTree for efficient distance computations.
    '''
    nodes = [tuple(x) for x in df[coords].to_numpy()]

    graph = nx.Graph()
    graph.add_nodes_from(nodes)
    
    # Use KDTree for fast neighbor search
    kdtree = KDTree(nodes)
    pairs = kdtree.query_pairs(r=max_distance + sys.float_info.epsilon) # add epsilon to include the max_distance value itself
    
    # Add edges for nodes within max_distance
    for i, j in pairs:
        graph.add_edge(nodes[i], nodes[j])
    return graph

In [4]:
def get_track_extent(voxels):
    # Apply PCA to find principal axis and projections
    pca = PCA(n_components=1)
    proj = pca.fit_transform(voxels)
    return proj.max() - proj.min()

In [5]:
def optimal_threshold_all_components(event, lowth, highth, precision = 6, min_vox_track = 5, extent_tol = 0.8):
    '''
    min_vox_track is the min number of voxels that a group of voxels has to have in an event to be taken into account
    with this, we make sure all the small "important" glumps get better results, and the spurious (which are typically 1 voxel) get deleted   
    '''

    # Step 1: Get the initial connected components (as lists of node identifiers)
    initial_G = create_graph(event, np.sqrt(3), ['xbin', 'ybin', 'zbin'])
    initial_components = [group for group in list(nx.connected_components(initial_G)) if len(group) > min_vox_track]  # List of sets, where tracks must have a minimum of voxels to be taken into account as an "important" track

    final_graph = None

    while highth - lowth > 10**(-precision):
        mid = (highth + lowth) / 2
        ev = event[event.class_1 > mid]

        if len(ev) <= 1:
            highth = mid
            continue

        graph = create_graph(ev, np.sqrt(3), ['xbin', 'ybin', 'zbin'])

        all_connected = True

        for comp in initial_components:
            # this makes sure one by one that all of them are "connected"
            subgraph = graph.subgraph(comp)
            if len(subgraph) == 0:
                all_connected = False
                break

            if not nx.is_connected(subgraph):
                all_connected = False
                break
            
        
        init_extent = get_track_extent(list(initial_G))
        deco_extent = get_track_extent(list(graph))
        extent_ok = (deco_extent / init_extent) >= extent_tol

        if all_connected & extent_ok:
            lowth = mid
            final_graph = graph  # save last valid graph
        else:
            highth = mid

    return round(lowth, precision), final_graph, round(deco_extent / init_extent, precision)

In [6]:
prec = 6
min_vox_track = 10
extent_tol = 0.85

min_thr = 0
max_thr = 1

In [7]:
filetype = 'train'
original_file = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/HE_calib/4bar/trains/soph_deco/dataset_4bar_PORT_1a_label_scn_{}.h5'.format(filetype)

path = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/HE_calib/4bar/trains/soph_deco/train_B/'
pred_file = path + 'pred_file_{}*.h5'.format(filetype)

out_file = path + 'dataset_4bar_PORT_1a_label_scn_DECO_{}.h5'.format(filetype)

files = sorted(glob.glob(pred_file), key = lambda x: int(x.split('_')[-1].split('.')[0]))

In [22]:
files[7:]

['/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/HE_calib/4bar/trains/soph_deco/train_B/pred_file_train_7.h5']

In [23]:
evinf_df = pd.DataFrame([])
events_info = pd.read_hdf(original_file, 'DATASET/EventsInfo')
bins_info = pd.read_hdf(original_file, 'DATASET/BinsInfo')

for f in files[7:]:
    pred = pd.read_hdf(f, 'DATASET/VoxelsPred')
    for dat_id, event in pred.groupby('dataset_id'):
        if dat_id % 100 == 0:
            print(dat_id)
        thr, _, red = optimal_threshold_all_components(event, min_thr, max_thr, precision = prec, min_vox_track = min_vox_track, extent_tol=extent_tol)
        evinf_df = evinf_df.append(pd.DataFrame({'dataset_id':[dat_id], 'threshold': [thr], 'reduction':[red]}))
        thr_event = event[event.class_1 > thr]
        thr_event = thr_event.assign(energy=lambda df: df['energy'] / df['energy'].sum())
        thr_event['binclass'] = events_info[events_info.dataset_id == dat_id].binclass.values[0]
        thr_event = thr_event.rename(columns={'label':'decolabel'})
        thr_event = thr_event[['dataset_id', 'binclass', 'xbin', 'ybin', 'zbin', 'energy', 'decolabel', 'class_0', 'class_1']]
        with tb.open_file(out_file, 'a') as h5out:
            df_writer(h5out, thr_event, 'DATASET', 'Voxels', columns_to_index = ['dataset_id'])
        
#mergear con events info
assert len(events_info) == len(evinf_df)
events_info = events_info.merge(evinf_df, on = 'dataset_id')
#guardar events info con esta nueva info y bins info tal cual
with tb.open_file(out_file, 'a') as h5out:
    df_writer(h5out, bins_info, 'DATASET', 'BinsInfo')
    df_writer(h5out, events_info, 'DATASET', 'EventsInfo', columns_to_index=['dataset_id'], str_col_length=128)

105000
105100
105200
105300
105400
105500
105600
105700
105800
105900
106000
106100
106200
106300
106400
106500
106600
106700
106800
106900
107000
107100
107200
107300
107400
107500
107600
107700
107800
107900
108000
108100
108200
108300
108400
108500
108600
108700
108800
108900
109000
109100
109200
109300
109400
109500
109600
109700
109800
109900
110000
110100
110200
110300
110400
110500
110600
110700
110800
110900
111000
111100
111200
111300
111400
111500
111600
111700
111800
111900
112000
112100
112200
112300
112400
112500
112600
112700
112800
112900
113000
113100
113200
113300
113400
113500
113600
113700
113800
113900
114000
114100
114200
114300
114400
114500
114600
114700
114800
114900
115000
115100
115200
115300
115400
115500
115600
115700
115800
115900
116000
116100
116200
116300
116400
116500
116600
116700
116800
116900
117000
117100
117200
117300
117400
117500
117600
117700
117800
117900
118000
118100
118200
118300
118400
118500
118600
118700
118800
118900
119000
119100
119200

AssertionError: 

In [19]:
evinf_df

Unnamed: 0,dataset_id,threshold,reduction
0,0,0.444375,0.888990
0,1,0.121932,1.040006
0,2,0.376039,0.890630
0,3,0.331419,0.820157
0,4,0.404675,0.165961
...,...,...,...
0,104995,0.177602,0.864382
0,104996,0.493998,0.941009
0,104997,0.331453,0.966743
0,104998,0.196425,0.872359
