## Preparing the CNN input

In [70]:
import uproot
import awkward as ak
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import numpy as np
import os

* Dataframe definition:

In [71]:
class ND280_dataset:
    def __init__(self, path):
        #tree_hattracks = uproot.open(path)['hatdigits']
        tree_hattracks = uproot.open(path)['hatdigits']
        
        df = tree_hattracks.arrays()
        tree = ak.to_dataframe(df)
        keys = tree.index.get_level_values('entry').unique()
        
        self.tree = tree
        self.keys = keys
        
    def __getitem__(self, i):
        key = self.keys[i]
        return self.tree.loc[key]
    
    def __len__(self):
        return len(self.keys)

**Step 1**  
Define path_1 and path_2 dataframes and assign 'label' column with the information of the type of particles involved (0 for protons and 1 for muons).

- path_1 (protons):

In [72]:
path_1="/data/neutrinos/common/casado/T2K/HAT-Reco/treemaker_one_proton.root"

In [73]:
ds = ND280_dataset(path_1)

In [74]:
df = pd.DataFrame(ds.tree)

In [75]:
df_unique_entries = df.groupby('entry').first()

In [76]:
df.groupby('entry').first()

Unnamed: 0_level_0,event,hat,plate,fem,fec,asic,channel,time,nsamples,adc,row,col,y,z,qmax,tmax,fwhm
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0,0,0,3,0,1,3,360,26,269,16,8,971.594971,-1419.595947,293,370,5
1,0,0,0,3,0,1,4,293,49,258,16,7,971.594971,-1430.875977,813,311,14
2,0,0,0,3,0,1,5,294,52,242,16,6,971.594971,-1442.156006,530,311,17
3,0,0,0,3,0,1,12,304,34,247,17,8,981.784973,-1419.595947,295,322,12
4,0,0,0,3,0,1,13,294,48,252,17,7,981.784973,-1430.875977,448,312,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1985017,9999,1,1,0,0,4,63,333,55,237,29,21,-427.934998,-2535.956055,314,352,23
1985018,9999,1,1,0,0,4,70,363,95,268,28,18,-438.125000,-2569.795898,299,385,53
1985019,9999,1,1,0,0,4,71,330,67,246,28,19,-438.125000,-2558.515869,717,352,22
1985020,9999,1,1,0,0,4,72,329,55,252,28,20,-438.125000,-2547.236084,902,347,15


In [77]:
print(len(df.groupby('event').first())-1)

9935


In [78]:
result_array = []
len_array = []
event_array = []

for event_value in range(9999):
    df_sel = df_unique_entries[df_unique_entries['event'] == event_value]
    
    data_list = []  
    
    for fem_value in range(8):
        df_sel_fem = df_sel[df_sel['fem'] == fem_value]
        num_inputs = len(df_sel_fem)

        if num_inputs != 0:
            mask_qmax = df_sel_fem['qmax'] != 0
            mask_tmax = df_sel_fem['tmax'] != 0

            qmax_values = df_sel_fem['qmax'][mask_qmax]
            tmax_values = df_sel_fem['tmax'][mask_tmax]
            
            # Apply row and col transformations based on fem_value
            row_offset = 0
            col_offset = 0
            if fem_value == 0:
                row_offset += 32
                col_offset += 0
            elif fem_value == 1:
                row_offset += 32
                col_offset += 36
            elif fem_value == 2:
                row_offset += 32
                col_offset += 72
            elif fem_value == 3:
                row_offset += 32
                col_offset += 108
            elif fem_value == 5:
                col_offset += 36
            elif fem_value == 6:
                col_offset += 72
            elif fem_value == 7:
                col_offset += 108

            row_values = df_sel_fem['row'][mask_qmax] + row_offset
            col_values = df_sel_fem['col'][mask_qmax] + col_offset
            
            for qmax, tmax, row, col in zip(qmax_values, tmax_values, row_values, col_values):
                data_list.append([row, col, qmax, tmax])
            
    result_array.append(data_list)
    len_array.append(len(data_list))
    event_array.append(event_value)

df_result_1 = pd.DataFrame({
    'event': event_array,
    'nhits': len_array,
    '[row, col, qmax, tmax]': result_array})

df_result_1
df_result_1 = df_result_1[df_result_1['nhits'] != 0]

In [79]:
df_result_1['label'] = 0

In [80]:
df_result_1

Unnamed: 0,event,nhits,"[row, col, qmax, tmax]",label
0,0,200,"[[56, 60, 291, 343], [56, 59, 631, 302], [56, ...",0
1,1,342,"[[59, 54, 4095, 279], [59, 55, 3812, 279], [59...",0
2,2,215,"[[56, 22, 295, 322], [56, 21, 444, 322], [56, ...",0
3,3,198,"[[56, 35, 414, 328], [57, 35, 479, 328], [58, ...",0
4,4,27,"[[59, 38, 384, 351], [59, 39, 1041, 348], [59,...",0
...,...,...,...,...
9993,9993,70,"[[56, 62, 1018, 339], [56, 61, 291, 360], [57,...",0
9994,9994,326,"[[56, 66, 389, 340], [56, 66, 1036, 453], [56,...",0
9995,9995,187,"[[56, 27, 823, 284], [56, 28, 531, 288], [57, ...",0
9996,9996,206,"[[58, 47, 369, 304], [58, 48, 286, 297], [59, ...",0


- path_2 (muons):

In [81]:
path_2= "/data/neutrinos/common/casado/T2K/HAT-Reco/treemaker_one_muon.root"

In [82]:
ds = ND280_dataset(path_2)

In [83]:
df = pd.DataFrame(ds.tree)

In [84]:
df_unique_entries = df.groupby('entry').first()

In [85]:
df.groupby('entry').first()

Unnamed: 0_level_0,event,hat,plate,fem,fec,asic,channel,time,nsamples,adc,row,col,y,z,qmax,tmax,fwhm
entry,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,0,0,1,2,0,3,41,284,47,247,0,35,808.554993,-1536.036011,476,301,14
1,0,0,1,2,0,3,42,284,46,257,0,34,808.554993,-1547.315918,378,303,13
2,0,0,1,2,0,3,50,283,48,250,1,35,818.744995,-1536.036011,621,299,14
3,0,0,1,2,0,3,51,301,26,280,1,34,818.744995,-1547.315918,294,311,14
4,0,0,1,2,0,3,51,331,26,247,1,34,818.744995,-1547.315918,286,341,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1165465,5623,1,1,1,1,0,64,298,60,251,30,13,-417.744995,-2205.196045,354,313,22
1165466,5623,1,1,1,1,0,65,296,49,254,30,14,-417.744995,-2193.916016,657,315,14
1165467,5623,1,1,1,1,0,67,303,49,254,30,15,-417.744995,-2182.635986,298,313,21
1165468,5623,1,1,1,1,0,75,296,48,247,31,14,-407.554993,-2193.916016,675,314,15


In [86]:
print(len(df.groupby('event').first())-1)

5613


In [87]:
result_array = []
len_array = []
event_array = []

for event_value in range(9999):
    df_sel = df_unique_entries[df_unique_entries['event'] == event_value]
    
    data_list = []  
    
    for fem_value in range(8):
        df_sel_fem = df_sel[df_sel['fem'] == fem_value]
        num_inputs = len(df_sel_fem)

        if num_inputs != 0:
            mask_qmax = df_sel_fem['qmax'] != 0
            mask_tmax = df_sel_fem['tmax'] != 0

            qmax_values = df_sel_fem['qmax'][mask_qmax]
            tmax_values = df_sel_fem['tmax'][mask_tmax]
            
            # Apply row and col transformations based on fem_value
            row_offset = 0
            col_offset = 0
            if fem_value == 0:
                row_offset += 32
                col_offset += 0
            elif fem_value == 1:
                row_offset += 32
                col_offset += 36
            elif fem_value == 2:
                row_offset += 32
                col_offset += 72
            elif fem_value == 3:
                row_offset += 32
                col_offset += 108
            elif fem_value == 5:
                col_offset += 36
            elif fem_value == 6:
                col_offset += 72
            elif fem_value == 7:
                col_offset += 108

            row_values = df_sel_fem['row'][mask_qmax] + row_offset
            col_values = df_sel_fem['col'][mask_qmax] + col_offset
            
            for qmax, tmax, row, col in zip(qmax_values, tmax_values, row_values, col_values):
                data_list.append([row, col, qmax, tmax])
            
    result_array.append(data_list)
    len_array.append(len(data_list))
    event_array.append(event_value)

df_result_2 = pd.DataFrame({
    'event': event_array,
    'nhits': len_array,
    '[row, col, qmax, tmax]': result_array})

df_result_2
df_result_2 = df_result_2[df_result_2['nhits'] != 0]

In [88]:
df_result_2['label'] = 1

In [89]:
df_result_2

Unnamed: 0,event,nhits,"[row, col, qmax, tmax]",label
0,0,209,"[[56, 45, 292, 317], [56, 46, 1102, 301], [56,...",1
1,1,213,"[[59, 54, 449, 354], [58, 54, 350, 357], [63, ...",1
2,2,179,"[[56, 36, 315, 350], [57, 37, 288, 359], [57, ...",1
3,3,126,"[[55, 53, 289, 365], [54, 53, 335, 340], [56, ...",1
4,4,210,"[[56, 55, 589, 341], [56, 54, 337, 344], [57, ...",1
...,...,...,...,...
5619,5619,144,"[[59, 54, 588, 388], [58, 54, 423, 388], [57, ...",1
5620,5620,216,"[[59, 31, 310, 355], [59, 30, 352, 353], [58, ...",1
5621,5621,193,"[[56, 64, 290, 318], [57, 64, 452, 317], [57, ...",1
5622,5622,210,"[[59, 51, 436, 318], [59, 50, 699, 316], [59, ...",1


**Step 2**  
Merge both dataframes into one

In [90]:
df_both_paths=pd.concat([df_result_1, df_result_2])

In [91]:
df_both_paths.reset_index(drop=True, inplace=True)

In [92]:
df_both_paths

Unnamed: 0,event,nhits,"[row, col, qmax, tmax]",label
0,0,200,"[[56, 60, 291, 343], [56, 59, 631, 302], [56, ...",0
1,1,342,"[[59, 54, 4095, 279], [59, 55, 3812, 279], [59...",0
2,2,215,"[[56, 22, 295, 322], [56, 21, 444, 322], [56, ...",0
3,3,198,"[[56, 35, 414, 328], [57, 35, 479, 328], [58, ...",0
4,4,27,"[[59, 38, 384, 351], [59, 39, 1041, 348], [59,...",0
...,...,...,...,...
15544,5619,144,"[[59, 54, 588, 388], [58, 54, 423, 388], [57, ...",1
15545,5620,216,"[[59, 31, 310, 355], [59, 30, 352, 353], [58, ...",1
15546,5621,193,"[[56, 64, 290, 318], [57, 64, 452, 317], [57, ...",1
15547,5622,210,"[[59, 51, 436, 318], [59, 50, 699, 316], [59, ...",1


**Step 3**   
Randomize the inputs entries

In [93]:
input_paths=df_both_paths.sample(frac=1)

In [94]:
input_paths.reset_index(drop=True, inplace=True)

In [95]:
input_paths

Unnamed: 0,event,nhits,"[row, col, qmax, tmax]",label
0,6121,403,"[[59, 35, 865, 358], [59, 34, 307, 374], [58, ...",0
1,1105,204,"[[59, 55, 462, 347], [59, 56, 1262, 346], [59,...",1
2,9545,195,"[[56, 63, 599, 335], [57, 63, 710, 335], [57, ...",0
3,4968,220,"[[56, 48, 313, 346], [56, 49, 403, 347], [56, ...",1
4,6548,21,"[[57, 59, 346, 350], [58, 60, 345, 348], [58, ...",0
...,...,...,...,...
15544,4161,357,"[[32, 41, 356, 83], [32, 42, 1891, 83], [32, 4...",0
15545,6566,245,"[[56, 47, 575, 284], [56, 48, 298, 298], [57, ...",0
15546,851,31,"[[53, 62, 305, 282], [54, 62, 516, 281], [55, ...",0
15547,1389,207,"[[51, 53, 446, 277], [51, 52, 297, 280], [54, ...",1


Save Dataframe:

In [96]:
path_pwd='/data/neutrinos/common/mrodrigu/CNN_discriminate_1-2_particles/input.csv'

In [97]:
input_paths.to_csv(path_pwd, index=False)