## NB of the functions (construction and test) to transform the MC labelled events into graphs

In [1]:
import os
import copy
import random
import itertools

import numpy  as np
import pandas as pd
import tables as tb
import networkx as nx
import os.path as osp

from glob import glob

import invisible_cities.io.dst_io as dio

import torch
from torch_geometric.data import Data, Dataset
from torch_geometric.loader import DataLoader
from torch_geometric.data.makedirs import makedirs
from torch_geometric.nn import GCNConv
from torch.nn import BatchNorm1d, CrossEntropyLoss
import torch.nn.functional as F

In [28]:
# File number substituted into the path templates below.
i = 1
# Path templates ({n} is the file number): labelled voxel input and graph output.
file_nexus = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/13bar/0nubb/label/prod/nexus_label_{n}_0nubb.h5'
file_graph = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/13bar/0nubb/graph_nn/prod/nexus_graph_nn_{n}_0nubb.pt'

# Load the 'MCVoxels' table of the 'DATASET' group from the selected file.
df = dio.load_dst(file_nexus.format(n=i), 'DATASET', 'MCVoxels')

Function to create graphs with Networkx

In [29]:
def create_graph(df, max_distance, coords):
    '''
    Build a networkx graph whose nodes are the coordinate tuples found in a dataframe,
    linking every pair of nodes that lies within a maximum Euclidean distance.

    Args:
        df: DATAFRAME
            Holds (at least) the spatial columns named in `coords`.

        max_distance: FLOAT
            Two nodes are joined by an edge when their distance is <= this value.

        coords: LIST OF STR
            Names of the df columns that carry the coordinates.

    RETURNS:
        graph: NETWORKX GRAPH
            Graph with one node per coordinate tuple; every edge stores its length
            in the `distance` attribute.
    '''
    graph = nx.Graph()
    # Each row of coordinates becomes one (hashable) tuple node
    graph.add_nodes_from([tuple(row) for row in df[coords].to_numpy()])

    # Visit every unordered pair of nodes once and connect the close ones
    for node_a, node_b in itertools.combinations(graph.nodes(), 2):
        separation = np.linalg.norm(np.array(node_a) - np.array(node_b))
        if separation <= max_distance:
            graph.add_edge(node_a, node_b, distance = separation)
    return graph

Functions to transform our data into data for a graph neural network

In [30]:
def edge_index(event, 
               max_distance = np.sqrt(3), 
               norm_features = True,
               ener_name = 'ener', 
               coord_names = ['xbin', 'ybin', 'zbin'], 
               directed = False, 
               fully_connected = False):
    ''' 
    Builds the edge tensors of one event.

    Args:
        event: DATAFRAME
            One row per node; must contain the `coord_names` columns and the `ener_name` column.
        max_distance: FLOAT
            Pairs separated by at most this distance are connected (ignored if `fully_connected`).
        norm_features: BOOL
            If True, the energy is divided by the total event energy before computing the gradient.
        ener_name: STR
            Name of the energy column.
        coord_names: LIST OF STR
            Names of the coordinate columns.
        directed: BOOL
            If True each pair appears once (i < j); otherwise both (i, j) and (j, i) are listed.
        fully_connected: BOOL
            If True every pair is connected regardless of distance.

    RETURNS:
        edges: tensor [2, E] with the positions (0..N-1, row order of `event`) of the connected nodes.
        edge_features: tensor [E, 2] holding (distance, |energy difference| / distance) per edge.
        edge_weights: tensor [E] holding 1 / distance per edge.

    Nodes are addressed by their row position, so the result does not depend on the DataFrame index.
    Two nodes at identical coordinates give a zero distance, hence inf/nan features and weights.
    '''
    # Positional numpy views: looked up once instead of once per pair, and independent of the index labels
    coord = event[coord_names].to_numpy()
    ener  = event[ener_name].to_numpy()
    ener  = ener / sum(ener) if norm_features else ener

    edges, edge_features, edge_weights = [], [], []
    node_comb = itertools.combinations if directed else itertools.permutations
    for i, j in node_comb(range(len(coord)), 2):
        dis = np.linalg.norm(coord[i] - coord[j])
        #append info for all edges if fully_connected, or if not, only the edges for the closest nodes
        if fully_connected or dis <= max_distance:
            edges.append([i, j])
            edge_features.append([dis, abs(ener[i] - ener[j]) / dis])
            edge_weights.append(1 / dis)

    # reshape(-1, 2) keeps the documented [2, E] / [E, 2] layout even when there are no edges (E = 0)
    edges         = torch.tensor(edges, dtype = torch.long).reshape(-1, 2).T
    edge_features = torch.tensor(edge_features).reshape(-1, 2)
    edge_weights  = torch.tensor(edge_weights)
    return edges, edge_features, edge_weights


In [31]:
# Quick check on event 10: only the shape of the first returned tensor (the edge index) is displayed.
edge_index(df[df.dataset_id == 10], coord_names = ['x', 'y', 'z'], directed = False, fully_connected=True)[0].shape

torch.Size([2, 12210])

In [32]:
def graphData(event, 
              data_id, 
              features = ['energy'], 
              label_n = ['segclass'], 
              norm_features = True, 
              max_distance = np.sqrt(3), 
              ener_name = 'energy', 
              coord_names = ['xbin', 'ybin', 'zbin'], 
              directed = False, 
              fully_connected = False, 
              simplify_segclass = False):
    '''
    Converts one event into a torch_geometric `Data` graph.

    Args:
        event: DATAFRAME
            Rows of a single event; needs the `features`, `label_n`, `coord_names`,
            `ener_name` and `binclass` columns.
        data_id:
            Identifier stored in the graph as `dataset_id`.
        features: LIST OF STR
            Columns used as node features.
        label_n: LIST OF STR
            Column(s) holding the per-node segmentation label.
        norm_features: BOOL
            If True, each feature column is divided by its sum over the event; the flag is
            also forwarded to `edge_index`.
        max_distance, ener_name, coord_names, directed, fully_connected:
            Forwarded to `edge_index`.
        simplify_segclass: BOOL
            If True, labels 4, 5, 6, 7 are remapped to 1, 2, 3, 4 before the shift below.

    RETURNS:
        graph_data: `Data` with x, edge_index, edge_attr, edge_weight, y (labels shifted by -1),
        num_nodes, coords, dataset_id and binclass.
    '''
    # Re-index a copy (0..N-1) instead of modifying the caller's dataframe in place
    event = event.reset_index(drop = True)
    edges, edge_features, edge_weights = edge_index(event, 
                                                    max_distance=max_distance, 
                                                    norm_features = norm_features,
                                                    ener_name=ener_name, 
                                                    coord_names=coord_names, 
                                                    directed=directed, 
                                                    fully_connected=fully_connected)
    #nodes features, for now just the energy; the node itself is defined by its position
    features = event[features]
    features = features / features.sum() if norm_features else features
    nodes = torch.tensor(features.values)
    #nodes segmentation label
    seg = event[label_n].values
    if simplify_segclass:
        label_map = {1:1, 2:2, 3:3, 4:1, 5:2, 6:3, 7:4}
        # seg is 2-D (one column per label name), so map element by element and restore the shape;
        # looking up whole rows would use unhashable arrays as dictionary keys
        seg = np.array([label_map[int(s)] for s in seg.ravel()], dtype = seg.dtype).reshape(seg.shape)
    #we can try to add also the transformation just to have track + blob (+ ghost)
    #shifting already the label below!!
    label = torch.tensor(seg - 1)
    coords = torch.tensor(event[coord_names].values)
    # The first binclass value of the event is stored as the graph-level attribute
    bincl = event.binclass.unique()[0]
    graph_data = Data(x = nodes, edge_index = edges, edge_attr = edge_features, edge_weight = edge_weights, y = label, num_nodes = len(nodes), coords = coords, dataset_id = data_id, binclass = bincl)
    return graph_data

In [33]:
# Build the graph of event 10 with two node features and all node pairs connected, then display it.
data_obj = graphData(df[df.dataset_id == 10], 10, features = ['ener', 'nhits'], norm_features = True, ener_name = 'ener', coord_names = ['x', 'y', 'z'], directed = False, fully_connected=True)

data_obj

Data(x=[111, 2], edge_index=[2, 12210], edge_attr=[12210, 2], y=[111, 1], edge_weight=[12210], num_nodes=111, coords=[111, 3], dataset_id=10, binclass=1)

In [34]:
def graphDataset(file, 
                 group = 'DATASET', 
                 table = 'BeershebaVoxels', 
                 id = 'dataset_id', 
                 features = ['energy'], 
                 label_n = ['segclass'], 
                 norm_features = True, 
                 ener_name = 'energy', 
                 max_distance = np.sqrt(3), 
                 coord_names = ['xbin', 'ybin', 'zbin'], 
                 directed = False, 
                 fully_connected = False, 
                 simplify_segclass = False):
    '''
    Reads one table of a file and turns every event into a graph.

    The table `table` of group `group` is loaded with `dio.load_dst`, grouped by the `id`
    column, and each group is passed to `graphData` together with the remaining options.
    Graphs whose edge index is empty are left out.

    RETURNS:
        dataset: LIST with one `graphData` result per kept event, in group order.
    '''
    # Options handed unchanged to graphData for every event
    graph_options = dict(features          = features,
                         label_n           = label_n,
                         norm_features     = norm_features,
                         max_distance      = max_distance,
                         ener_name         = ener_name,
                         fully_connected   = fully_connected,
                         coord_names       = coord_names,
                         directed          = directed,
                         simplify_segclass = simplify_segclass)

    voxels = dio.load_dst(file, group, table)
    # graphData takes care of resetting the index of each event
    graphs = (graphData(event, dat_id, **graph_options) for dat_id, event in voxels.groupby(id))
    # events without any graph connection are discarded
    return [graph for graph in graphs if graph.edge_index.numel() != 0]

In [35]:
# Path template of the labelled nexus files; '{}' is filled with the file number.
file = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/13bar/0nubb/label/prod/nexus_label_{}_0nubb.h5'
#dataset_1 = graphDataset(file.format('1'), table = 'MCVoxels', features = ['ener', 'nhits'], ener_name = 'ener', coord_names = ['x', 'y', 'z'], directed = False, fully_connected = True)
#dataset_2 = graphDataset(file.format('2'))
#dataset_3 = graphDataset(file.format('3'))

We can read the stored tensors and then group them into batches. The batch joins all the nodes, edge_index, label, etc... and will have a batch list of the length of the nodes, with corresponding indexes to recognize the different graphs of course. In the case of the edges, they will continue to have two lists of positions for the nodes that have an edge.

In [10]:
# class MyDataset(Dataset):
#     def __init__(self, root, tag = '0nubb', transform=None, pre_transform=None, pre_filter=None, directed = False, simplify_segclass = False):
#         self.sort = lambda x: int(x.split('_')[-2])
#         self.tag = tag
#         self.directed = directed
#         self.simplify_segclass = simplify_segclass
#         super().__init__(root, transform, pre_transform, pre_filter)
        
#     @property
#     def raw_file_names(self):
#         ''' 
#         Returns a list of the raw files in order (supossing they are beersheba labelled files that have the structure beersheba_label_N_tag.h5)
#         '''
#         rfiles = [i.split('/')[-1] for i in glob(self.raw_dir + '/*_{}.h5'.format(self.tag))]
#         return sorted(rfiles, key = self.sort)

#     @property
#     def processed_file_names(self):
#         '''
#         Returns a list of the processed files in order (supossing they are stored tensors with the structure data_N.pt)
#         '''
#         pfiles = [i.split('/')[-1] for i in glob(self.processed_dir + '/data_*_{}.pt'.format(self.tag))]
#         return sorted(pfiles, key = self.sort)
    
#     def process(self):
#         makedirs(self.processed_dir)
#         already_processed = [self.sort(i) for i in self.processed_file_names]
#         for raw_path in self.raw_paths:
#             idx = self.sort(raw_path)
#             if np.isin(idx, already_processed):
#                 #to avoid processing already processed files
#                 continue
#             data = graphDataset(raw_path, directed=self.directed, simplify_segclass=self.simplify_segclass)

#             torch.save(data, osp.join(self.processed_dir, f'data_{idx}_{self.tag}.pt'))
        

#     def len(self):
#         return len(self.processed_file_names)

#     def get(self, idx):
#         data = torch.load(osp.join(self.processed_dir, f'data_{idx}_{self.tag}.pt'))
#         return data

#     def join(self):
#         #print('Joining ', self.processed_file_names)
#         dataset = []
#         for processed_path in self.processed_paths:
#             dataset += torch.load(processed_path)
#         return dataset
    

Here we create a dataset file(s)

In [11]:
# file_path = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/labelled_data/0nubb/554mm_voxels/'
# tag = file_path.split('/')[-3]
# Dataset(file_path, '0nubb', directed = False).process() #this action creates the processed files

## Test creating and saving files with graphData 

We want to find out where the tensors are promoted to float64, so that they can stay float32 throughout.

In [2]:
import sys
import gzip
import torch
import numpy as np
import itertools
from   torch_geometric.data import Data
import invisible_cities.io.dst_io as dio
sys.path.append("/home/usc/ie/mpm/NEXT_graphs/") # go to parent dir
from NEXT_graphNN.utils.data_loader import graphDataset, graphData, edge_index

In [3]:
# Data-type tag used to build the input and output file names.
dt = '1eroi'
file_in = '/home/usc/ie/mpm/prueba/nexus_label_1_{}.h5'.format(dt)
fileout = '/home/usc/ie/mpm/prueba/nexus_gnn_1_{}_float32.pt'.format(dt)

# Location of the voxel table inside the input file.
group = 'DATASET'
table = 'MCVoxels'

# Column names: event id, energy, node features, labels and coordinates.
id_name     = 'dataset_id'
ener_name   = 'ener'
features    = ['ener', 'nhits']
label_n     = ['segclass']
coord_names = ['x', 'y', 'z']

# Graph-building options.
norm_features = True
max_distance  = np.sqrt(3)

directed          = False
fully_connected   = True
simplify_segclass = False

# Extracts the file number: the second-to-last '_'-separated token of the base name
# (e.g. 'nexus_label_1_1eroi.h5' -> 1).
get_file_number = lambda filename: int(filename.split("/")[-1].split("_")[-2])

dat_id = 0


In [4]:
# dataset = graphDataset(file_in, 
#                            group             = group, 
#                            table             = table,
#                            id_name           = id_name, 
#                            feature_n         = features, 
#                            label_n           = label_n, 
#                            norm_features     = norm_features,
#                            max_distance      = max_distance, 
#                            ener_name         = ener_name,
#                            coord_names       = coord_names, 
#                            directed          = directed, 
#                            fully_connected   = fully_connected, 
#                            simplify_segclass = simplify_segclass,
#                            get_fnum_function = get_file_number, 
#                            torch_dtype       = torch.float)


# with gzip.open(fileout + '.gz', 'wb') as fout:
#     torch.save(dataset, fout)
# torch.save(dataset, fileout)

## Create dataset from several processed files

With the graphDataset function I've processed each individual file. Now the goal is to read all the files I need to build a complete dataset, shuffle the graphs, divide them into train, validation and test sets, and save them again.

In [87]:
import gzip

def load_graph_data(fname):
    '''
    Loads graphs stored with `torch.save`.

    Args:
        fname: STR / PATH-LIKE, or LIST / TUPLE of them
            A single path returns whatever `torch.load` gives for that file.
            Several paths return one flat list with the contents of every file, in order.

    RAISES:
        TypeError if `fname` is none of the supported types.

    NOTE: `torch.load` unpickles the file contents, so only load files from trusted sources.
    '''
    if isinstance(fname, (str, os.PathLike)):
        return torch.load(fname)
    if isinstance(fname, (list, tuple)):
        return [graph for path in fname for graph in torch.load(path)]
    # Fail with a clear message instead of reaching an unassigned variable
    raise TypeError(f'fname must be a path or a list/tuple of paths, got {type(fname).__name__}')

def load_graph_data_compressed(fname):
    '''
    Loads graphs stored with `torch.save` inside gzip-compressed files.

    Args:
        fname: STR / PATH-LIKE, or LIST / TUPLE of them
            Every file must contain an iterable of graphs; the result is always one flat list.
            When several paths are given, each path is printed before it is read.

    RAISES:
        TypeError if `fname` is none of the supported types (instead of silently returning []).

    NOTE: `torch.load` unpickles the file contents, so only load files from trusted sources.
    '''
    if isinstance(fname, (str, os.PathLike)):
        paths, show_progress = [fname], False
    elif isinstance(fname, (list, tuple)):
        paths, show_progress = fname, True
    else:
        raise TypeError(f'fname must be a path or a list/tuple of paths, got {type(fname).__name__}')

    dataset = []
    for path in paths:
        if show_progress:
            print(path)
        with gzip.open(path, 'rb') as f:
            dataset.extend(torch.load(f))
    return dataset

In [4]:
# Path template of the processed graph files: {p} pressure, {dt} data type, {f} file name or glob pattern.
basedir = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/{p}/{dt}/graph_nn/prod/{f}'

In [5]:
p = '13bar'
#dt = '*'
f = '*.pt.gz'
# approximate number of events per file
nevents_per_file_0nubb = 400
nevents_per_file_1eroi = 50

# number of events wanted per data type
wanted_nevents_per_dt = 2000
# number of files of each data type needed to reach that number of events
nfiles_0nubb = int(wanted_nevents_per_dt / nevents_per_file_0nubb)
nfiles_1eroi = int(wanted_nevents_per_dt / nevents_per_file_1eroi)

# files selected for each data type: sorted by (4th path component from the end, file number), then truncated
files_0nubb = sorted(glob(basedir.format(p = p, dt = '0nubb', f = f)), key = lambda x: (x.split('/')[-4], int(x.split('_')[-2])))[:nfiles_0nubb]
files_1eroi = sorted(glob(basedir.format(p = p, dt = '1eroi', f = f)), key = lambda x: (x.split('/')[-4], int(x.split('_')[-2])))[:nfiles_1eroi]

# all selected files, i.e. roughly twice the wanted number of events per data type
files = files_0nubb + files_1eroi

In [8]:
# Display the selected file list.
files

['/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/13bar/0nubb/graph_nn/prod/nexus_graph_nn_1_0nubb.pt.gz',
 '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/13bar/0nubb/graph_nn/prod/nexus_graph_nn_2_0nubb.pt.gz',
 '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/13bar/0nubb/graph_nn/prod/nexus_graph_nn_3_0nubb.pt.gz',
 '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/13bar/0nubb/graph_nn/prod/nexus_graph_nn_4_0nubb.pt.gz',
 '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/13bar/0nubb/graph_nn/prod/nexus_graph_nn_5_0nubb.pt.gz']

In [4]:
import torch
from torch.utils.data import random_split

# Path template of the split dataset file; {p} is the pressure tag.
fileout = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/data/pressure_topology/{p}/dataset_{p}_graph_nn_4kevents.pt.gz'
# dataset = load_graph_data_compressed(files)
# dataset_len = len(dataset)

# # Define the sizes for training, validation, and test sets
# train_size = int(0.8 * dataset_len)
# val_size = int(0.1 * dataset_len)
# test_size = dataset_len - train_size - val_size

# assert train_size + val_size + test_size == dataset_len
# # Split the dataset
# split_dataset = random_split(dataset, [train_size, val_size, test_size])


# # with gzip.open(fileout.format(p = p), 'wb') as fout:
# #     torch.save(split_dataset, fout)

# torch.save(split_dataset, fileout.format(p = p))

In [5]:
# NOTE(review): the file name ends in '.gz' but it is read with plain torch.load (no gzip);
# presumably it was written uncompressed — confirm before changing how it is saved.
dataset = torch.load(fileout.format(p = p))

In [33]:
from torch_geometric.loader import DataLoader

# The loaded object unpacks into three parts, taken here as train, validation and test sets.
train_dataset, val_dataset, test_dataset = dataset
batch_size = 10
# One DataLoader per set; only the training loader shuffles
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [34]:
# Print only the first batch; `batch` stays defined for the cells below.
for batch in train_loader:
    print(batch)
    break

DataBatch(x=[1526, 2], edge_index=[2, 233728], edge_attr=[233728, 2], y=[1526, 1], edge_weight=[233728], num_nodes=1526, coords=[1526, 3], dataset_id=[10], binclass=[10], fnum=[10], batch=[1526], ptr=[11])


In [35]:
# Number of entries of batch.batch equal to 0, i.e. assigned to the first graph of the batch.
sum(batch.batch == 0)

tensor(163)

In [36]:
# Per-graph binclass values collected in the batch.
batch.binclass

[1, 0, 0, 0, 1, 0, 0, 0, 1, 1]

## Creating dataset from mixed hdf file

This is needed because the input is now a single big file, so the loop has to run over events and split them into output files, instead of looping over many small files.

In [11]:
# Input: the mixed voxel file. Output: directory where the processed graph files are written.
file_voxel_mix = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/labelled_data/train_data_files/mixer_voxels_fid_norm.h5'
process_file_voxel_mix = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/labelled_data/train_data_files/process_mixer_voxels_fid_norm/'

In [12]:
def graphProcess(file, 
                 outfile,
                 nevents_per_file = 200,
                 table = 'MCVoxels', 
                 id = 'dataset_id', 
                 features = ['ener'], 
                 label_n = ['segclass'], 
                 max_distance = np.sqrt(3), 
                 coord_names = ['x', 'y', 'z'], 
                 directed = False, 
                 simplify_segclass = False):
    '''
    Converts the events of one HDF table into graphs and saves them in chunks.

    The table is read with `pd.read_hdf`, grouped by the `id` column and every group is
    converted with `graphData`. Whenever `(dat_id + 1)` is a multiple of `nevents_per_file`
    the accumulated graphs are written to `outfile/data_{first_id}_{last_id}.pt`; graphs
    left over after the last event go to one final file named the same way.

    Args:
        file: STR, input HDF file.
        outfile: STR, output directory (it must already exist).
        nevents_per_file: INT, chunk size in terms of event ids.
        table, id: table name and event-id column.
        features, label_n, max_distance, coord_names, directed, simplify_segclass:
            forwarded to `graphData`.

    Nothing is written if the table has no events.

    NOTE(review): the chunking assumes the ids are consecutive integers starting at 0 — confirm.
    NOTE(review): `ener_name`, `norm_features` and `fully_connected` are not forwarded, so
    `graphData` uses its own defaults for them; verify they match the table being processed.
    '''
    df = pd.read_hdf(file, table)
    dataset = []
    chunk_first_id = None  # id of the first event stored in the chunk being filled
    for dat_id, event in df.groupby(id):
        graph_data = graphData(event, dat_id, features=features, label_n=label_n, max_distance=max_distance, coord_names=coord_names, directed = directed, simplify_segclass = simplify_segclass)
        if chunk_first_id is None:
            chunk_first_id = dat_id
        dataset.append(graph_data)

        if (dat_id + 1) % nevents_per_file == 0:
            torch.save(dataset, osp.join(outfile, f'data_{chunk_first_id}_{dat_id}.pt'))
            dataset, chunk_first_id = [], None

    # Save the remainder only if there is one: an unconditional save would overwrite the
    # last full chunk with an empty list when the events fill the chunks exactly
    if dataset:
        torch.save(dataset, osp.join(outfile, f'data_{chunk_first_id}_{dat_id}.pt'))
    
    #return dataset

In [13]:
#graphProcess(file_voxel_mix, process_file_voxel_mix, nevents_per_file=1000)

In [15]:
# Processed chunk files sorted by the second-to-last '_'-separated token (the first id in 'data_{first}_{last}.pt').
files = sorted(glob(process_file_voxel_mix + 'data_*'), key = lambda x: int(x.split('_')[-2]))

In [16]:
# Concatenate the graphs of every chunk file into one list.
all_graphs = load_graph_data(files)

In [19]:
#torch.save(all_graphs, osp.join(process_file_voxel_mix, f'all_proc_mix_vox_fid_norm.pt'))

In [43]:
from invisible_cities.types.ic_types import AutoNameEnumBase
from enum import auto

In [40]:
# Kind of label to work with. In weights_loss, Classification reads the per-graph
# `binclass` value and Segmentation reads the per-node labels `y`.
class LabelType(AutoNameEnumBase):
    Classification = auto()
    Segmentation   = auto()

In [114]:
def weights_loss(fname, label_type, nclass = 3, nevents = None):
    '''
    Computes normalised inverse-frequency class weights from stored graphs.

    Args:
        fname: path or list of paths, passed to `load_graph_data`.
        label_type: LabelType
            Segmentation counts the node labels `graph.y`; Classification counts `graph.binclass`.
        nclass: INT
            Minimum number of classes for the segmentation counts (unused for classification).
        nevents: INT or None
            Only the first `nevents` graphs are used (None uses all of them).

    RETURNS:
        numpy array with one weight per class, proportional to 1 / count and summing to 1.
        Classes that never appear get weight 0 instead of turning the result into inf/nan.

    RAISES:
        ValueError if `label_type` is not a known LabelType or no label was counted.
    '''
    dataset = load_graph_data(fname)[:nevents]
    if label_type == LabelType.Segmentation:
        counts = sum([np.bincount(graph.y.numpy().flatten(), minlength=nclass) for graph in dataset])
    elif label_type == LabelType.Classification:
        counts = np.bincount([graph.binclass for graph in dataset])
    else:
        raise ValueError(f'Unknown label type: {label_type!r}')

    counts = np.atleast_1d(np.asarray(counts, dtype = float))
    if counts.sum() == 0:
        raise ValueError('No labels found: cannot compute class weights')
    # Invert only the populated classes; empty ones keep weight 0
    inv_freq = np.divide(1.0, counts, out = np.zeros_like(counts), where = counts > 0)
    return inv_freq / sum(inv_freq)

In [115]:
# Segmentation class weights for the current chunk files.
weights_loss(files, LabelType.Segmentation)

array([0.36001924, 0.07829171, 0.56168905])

In [116]:
# Classification class weights for the current chunk files.
weights_loss(files, LabelType.Classification)

array([0.26362139, 0.73637861])

For my current dataset, the segmentation loss weights are array([0.36001924, 0.07829171, 0.56168905])

and the classification loss weights are array([0.26362139, 0.73637861])

In [None]:
from torch_geometric.loader import DataLoader


In [117]:
def create_idx_split(dataset, train_perc, seed = None):
    '''
    Divides the indices of the whole dataset into train, validation and test groups. A fraction
    `train_perc` (the majority) goes to training and the remainder is divided equally between
    validation and test.

    Args:
        dataset: any object with a length; only `len(dataset)` is used.
        train_perc: FLOAT, fraction of the indices assigned to training.
        seed: INT or None
            If given, the shuffle uses its own `random.Random(seed)` so the split is reproducible
            and the global random state is left untouched. If None, the global `random` state is used.

    RETURNS:
        idx_split: DICT with keys 'train', 'valid', 'test'; each value is a sorted tensor of indices.
    '''
    indices = np.arange(len(dataset))
    valid_perc = (1 - train_perc) / 2
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(indices)

    # The cut points are computed on len + 1, as in the original implementation
    scaled_len = len(indices) + 1
    train_end = int(scaled_len * train_perc)
    valid_end = int(scaled_len * (train_perc + valid_perc))

    train_data = torch.tensor(np.sort(indices[:train_end]))
    valid_data = torch.tensor(np.sort(indices[train_end:valid_end]))
    test_data = torch.tensor(np.sort(indices[valid_end:]))
    idx_split = {'train':train_data, 'valid':valid_data, 'test':test_data}
    return idx_split

In [26]:
def split_dataset(dataset, train_perc):
    '''
    Cuts the dataset, in its current order, into train, validation and test slices.
    The first `train_perc` fraction is the train slice; what is left is shared equally
    between validation and test. No shuffling is done.
    '''
    total = len(dataset)
    # Cut points between train/validation and validation/test
    train_end = int(total * train_perc)
    valid_end = int(total * (train_perc + (1 - train_perc) / 2))
    return dataset[:train_end], dataset[train_end:valid_end], dataset[valid_end:]

In [27]:
# Split in order (no shuffling): train fraction 0.9, the rest shared between validation and test.
tr, va, te = split_dataset(all_graphs, 0.9)

In [48]:
# torch.save(tr, osp.join(process_file_voxel_mix, f'train_graphs.pt'))
# torch.save(va, osp.join(process_file_voxel_mix, f'valid_graphs.pt'))
# torch.save(te, osp.join(process_file_voxel_mix, f'test_graphs.pt'))

In [37]:
# mcvox = pd.read_hdf(file_voxel_mix, 'MCVoxels')
# evinf = pd.read_hdf(file_voxel_mix, 'EventsInfo')

In [58]:
# datapath = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/labelled_data/train_data_files/'

# ids = [g.dataset_id for g in te]
# selected_mcvox = mcvox[np.isin(mcvox.dataset_id, ids)]
# selected_evinf = evinf[np.isin(evinf.dataset_id, ids)]

# selected_mcvox.to_hdf(datapath + 'test_data.h5', 'MCVoxels')
# selected_evinf.to_hdf(datapath + 'test_data.h5', 'EventsInfo')

In [203]:
from torch.utils.tensorboard import SummaryWriter

In [None]:
def save_checkpoint(state, filename='checkpoint.pth.tar'):
    '''
    Writes `state` to `filename` with `torch.save`.
    '''
    torch.save(state, filename)

In [None]:
def train_net(*,
              nepoch,
              train_data,
              valid_data,
              train_batch_size,
              valid_batch_size,
              net,
              device,
              optimizer,
              criterion,
              label_type,
              nclass,
              model_uses_batch,
              checkpoint_dir,
              tensorboard_dir,
              num_workers,
              use_cuda = True):
    """
        Trains the net nepoch times and saves the model anytime the validation loss decreases.

        Every epoch runs `train_one_epoch` and `valid_one_epoch` (both defined outside this cell),
        writes the losses to tensorboard and, depending on `label_type`, either one IoU value per
        class (Segmentation) or the accuracy (Classification). A checkpoint with the net and
        optimizer state dicts is written to `checkpoint_dir` whenever the validation loss is lower
        than the best one seen so far.

        All arguments are keyword-only. `use_cuda` is accepted but not used in this function.

        NOTE(review): the validation loader also shuffles and drops the last incomplete batch,
        and always uses a single worker — confirm this is intended.
    """

    loader_train = DataLoader(train_data,
                            batch_size = train_batch_size,
                            shuffle = True,
                            num_workers = num_workers,
                            drop_last = True,
                            pin_memory = False)
    loader_valid = DataLoader(valid_data,
                            batch_size = valid_batch_size,
                            shuffle = True,
                            num_workers = 1,
                            drop_last = True,
                            pin_memory = False)

    start_loss = np.inf
    writer = SummaryWriter(tensorboard_dir)
    # try/finally so the writer is closed even if an epoch raises
    try:
        for i in range(nepoch):
            train_loss, train_met = train_one_epoch(i, net, loader_train, device, optimizer, criterion, label_type, nclass = nclass, model_uses_batch = model_uses_batch)
            valid_loss, valid_met = valid_one_epoch(net, loader_valid, device, criterion, label_type, nclass = nclass, model_uses_batch = model_uses_batch)

            if valid_loss < start_loss:
                save_checkpoint({'state_dict': net.state_dict(),
                                 'optimizer': optimizer.state_dict()}, f'{checkpoint_dir}/net_checkpoint_{i}.pth.tar')
                start_loss = valid_loss

            writer.add_scalar('loss/train', train_loss, i)
            writer.add_scalar('loss/valid', valid_loss, i)
            if label_type == LabelType.Segmentation:
                for k, iou in enumerate(train_met):
                    writer.add_scalar(f'iou/train_{k}class', iou, i)
                for k, iou in enumerate(valid_met):
                    writer.add_scalar(f'iou/valid_{k}class', iou, i)
            elif label_type == LabelType.Classification:
                writer.add_scalar('acc/train', train_met, i)
                writer.add_scalar('acc/valid', valid_met, i)
            writer.flush()
    finally:
        writer.close()

We check for one file that the elaboration of graphs with Networkx and with the NN notation agree

In [9]:

# Small loop to check that my graph creator (the old one) and the new graph representation are the same (more or less)
# NOTE(review): `Dataset` as imported in this notebook is the torch_geometric base class (the custom
# MyDataset class is commented out) and `file_path` is only assigned in a cell further down, so this
# cell presumably does not run on a fresh kernel — confirm.
dataset_ = Dataset(file_path, '0nubb').get(1)
for i in df.dataset_id.unique():
    event = df[df.dataset_id == i]
    event = event.reset_index(drop = True)
    #need to reset index as the edge index representation works with index from 0 to N (number of nodes in the graph)
    G = create_graph(event, np.sqrt(3), ['xbin', 'ybin', 'zbin'])
    data = dataset_[i]
    n_nodes = data.x.shape[0]
    n_edges = data.edge_index.shape[1]
    assert n_nodes == len(G.nodes)
    assert n_edges == 2 * len(G.edges) 
    #this factor 2 is due to the fact that the networkx graphs know if a graph is directed or not, but with the tensor representation we have to include both directions
    #for each edge to indicate they are not directed

Small function to visualize graph with label colours

In [13]:
def graph_draw(event, color_dct = {1:'b', 2:'y', 3:'r', 4:'b', 5:'y', 6:'r', 7:'g'}):
    '''
    Draws the networkx graph of an event, colouring each node by its `segclass` label
    through `color_dct`. The graph is built from the 'xbin', 'ybin', 'zbin' columns with a
    maximum connection distance of sqrt(3).
    '''
    bin_columns = ['xbin', 'ybin', 'zbin']
    event_graph = create_graph(event, np.sqrt(3), bin_columns)
    # One colour per row of the event, looked up from its segmentation class
    colours = event.segclass.apply(lambda label: color_dct[label]).values
    nx.draw(event_graph, node_color = colours, node_shape = '.')
#graph_draw(event)

We also create a function for idx split for a certain dataset (group of processed files)

In [12]:
# NOTE(review): this repeats the definition of create_idx_split that appears earlier in the
# notebook; whichever cell runs last silently replaces the other. Consider keeping only one.
def create_idx_split(dataset, train_perc):
    '''
    Shuffles the indices 0..len(dataset)-1 and divides them into train, validation and test:
    a fraction `train_perc` for training and the rest shared equally between the other two.
    Returns a dict with keys 'train', 'valid', 'test' holding sorted index tensors.
    '''
    indices = np.arange(len(dataset))
    valid_perc = (1 - train_perc) / 2
    random.shuffle(indices)
    train_data = torch.tensor(np.sort(indices[:int((len(indices)+1)*train_perc)])) # first train_perc fraction of the shuffled indices
    valid_data = torch.tensor(np.sort(indices[int((len(indices)+1)*train_perc):int((len(indices)+1)*(train_perc + valid_perc))]))
    test_data = torch.tensor(np.sort(indices[int((len(indices)+1)*(train_perc + valid_perc)):]))
    idx_split = {'train':train_data, 'valid':valid_data, 'test':test_data}
    return idx_split


Example of how the idx split would be used

In [18]:
# Creation of the dataset, index split and data loaders for each case
# NOTE(review): `.join()` is a method of the commented-out MyDataset class, while `Dataset` as imported
# in this notebook is the torch_geometric base class — presumably this cell needs MyDataset; confirm.
file_path = '/mnt/lustre/scratch/nlsas/home/usc/ie/mpm/NEXT100/labelled_data/0nubb/554mm_voxels/'

dataset = Dataset(file_path, '0nubb').join()

# 80% of the indices for training; the rest is shared between validation and test
idx_split = create_idx_split(dataset, 0.8)
#idx_split
train_loader = DataLoader([dataset[i] for i in idx_split['train']], batch_size=50, shuffle=True, num_workers=0)
#valid_loader = DataLoader([dataset[i] for i in idx_split['valid']], batch_size=50, shuffle=False, num_workers=0)
#test_loader = DataLoader([dataset[i] for i in idx_split['test']], batch_size=50, shuffle=False, num_workers=0)

Implement a function to calculate + correct the weights for each class

In [19]:
def weight_loss(file_names, correct = False):
    '''
    Computes normalised inverse-frequency weights of the `segclass` labels found in the
    'BeershebaVoxels' table ('DATASET' group) of the given files.

    Args:
        file_names: iterable of paths read with `dio.load_dst`.
        correct: FLOAT or False
            If truthy, the last class gets exactly this weight and the other classes are
            rescaled so that all the weights still sum to 1.

    RETURNS:
        numpy array with one weight per class (class k is stored at position k - 1).

    RAISES:
        ValueError if `file_names` is empty.
    '''
    seg_per_file = [dio.load_dst(f, 'DATASET', 'BeershebaVoxels').segclass for f in file_names]
    if not seg_per_file:
        raise ValueError('file_names is empty: no labels to compute the weights from')
    # One concat for all the files (Series.append was removed in pandas 2.0 and copied on every call)
    seg = pd.concat(seg_per_file, ignore_index = True)

    freq = np.bincount(seg - 1, minlength=max(seg))
    inv_freq = 1. / freq
    inv_freq = inv_freq / sum(inv_freq)
    if correct:
        #correct assigns to the ghost class the desired inverse freq and redistributes the rest
        redistr = inv_freq[:-1] * (1 - correct) / sum(inv_freq[:-1])
        inv_freq = np.append(redistr, correct)
    return inv_freq
    

In [20]:
# Class weights from every raw .h5 file, fixing the weight of the last class to 0.1.
files_for_weights = glob(file_path + 'raw/*.h5')
inv_freq = weight_loss(files_for_weights, correct = 0.1)
inv_freq

array([0.37765031, 0.02843595, 0.20542302, 0.23333288, 0.01188655,
       0.04327129, 0.1       ])