In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch import nn, optim
import time
from torch.utils.data import Dataset, DataLoader
import torch

# Generate files with inputs

This notebook used to generate graphs and store them as *.npz files

This notebook is working from the docker

In [2]:
from trackml.dataset import load_dataset
PATH_TO_DATA = "/home/data/train_sample_full"
#PATH_TO_DATA = "/home/data/train_sample_single"

## input selection:

Generate inputs from the file using preprocess code:

In [3]:
from collections import namedtuple
Graph = namedtuple('Graph', ['X', 'Is', 'y'])
from preprocess import preprocess
def graph_to_sparse(graph):
    return dict(X=graph.X, y=graph.y, Is=graph.Is)
def get_size(graph):
    size = 0
    for fld in graph._fields:
        size += getattr(graph, fld).nbytes
    return size

Select input features, and $\eta$ range:

In [4]:
feature_names = ['x', 'y', 'z', 'phi', 'eta', 'r']
#eta_range = np.array([-1.5, -0.75, -0.5,-0.25, 0.0, 0.25, 0.5, 0.75, 1.5 ])
eta_range = np.array([-1.5, 1.5 ])

Set the folder with the output data for training:

In [5]:
PATH='/home/data/graph_full_6var_250MeV_150mmz0_1eta/'
!rm -f /home/data/graph_full_6var_250MeV_150mmz0_1eta/*npz
!mkdir /home/data/graph_full_6var_250MeV_150mmz0_1eta

mkdir: cannot create directory ‘/home/data/graph_full_6var_250MeV_150mmz0_1eta’: File exists


Since using only the barrel, set `getLayer` function that for each hit will set it layer number (total 10 layers)

In [6]:
def getLayer(volume_id, layer_id):
    if volume_id==8:
        return layer_id//2
    elif volume_id==13:
        return layer_id//2 + 4
    elif volume_id==17:
        return layer_id//2 + 8
    else:
        return -1   

Produce the inputs:

In [7]:
data = load_dataset(PATH_TO_DATA, parts=['hits', 'cells', 'truth', 'particles'])
keys = ['hit_id','x','y','z','particle_id','volume_id','layer_id']
#loop over all files in the folder
for event_id, hits, cells, truth, particles in data:
    
    print('evaluate and store event',event_id,' with',hits.shape[0],'hits')
    
    r = np.sqrt(hits['x']**2 + hits['y']**2)
    theta = np.arctan2(r,hits['z'])
    hits['eta'] = -np.log(np.tan(0.5*theta))
    hits['layer'] = hits.apply(lambda x: getLayer(x['volume_id'],x['layer_id']), axis=1)

    #filter event - remove noise, use only barrel, keep hits associated to tracks with more than 10 hits
    new_hits = hits.merge(truth[['hit_id','particle_id']], on='hit_id').copy()
    
    group_hits = new_hits.groupby(by=['particle_id'])
    new_hits = group_hits.filter(lambda x: x['layer'].min() > 0)
    new_hits = new_hits.loc[new_hits['particle_id']>0]
    print('remove noise and tracks outside the barrel: ',new_hits.shape[0])
    
    #filter hits to be within the eta region:
    eta_cut = (-1.5,1.5);
    new_hits = (new_hits.loc[(new_hits['eta']>eta_cut[0]) & (new_hits['eta']<=eta_cut[1])])
    print('eta cut ' ,new_hits.shape[0])
    
    group_hits = new_hits.groupby(by=['particle_id'])
    track_idx = group_hits.indices
    new_hits = pd.concat([group_hits.get_group(pid).assign(nhits=len(idx)) for pid, idx in track_idx.items()])
    new_hits = new_hits.loc[(new_hits['nhits']>9)]
    print('n_hits cut' ,new_hits.shape[0])

    #reprocess the event using the compiled "preprocess" function, compute full graph for entire event (don't split)
    print('call reprocess')
    list_y, list_X, list_Is, list_hits_id, list_labels = preprocess(new_hits.copy(), eta_range, feature_names)
    print('done!')
    i = 0
    for y, X, Is, hits_id, labels in zip(list_y, list_X, list_Is, list_hits_id, list_labels):
        #store the inputs
        print(X.shape)
        y = y.astype(np.float32)
        Is = Is.values
        graph = Graph(X,Is,y)
        filename = PATH+'/myGraph_event_%d_eta%d.npz'%(event_id,i); i = i+1
        np.savez(filename, **graph_to_sparse(graph))
        print('graph of size',get_size(graph)/(1024*1024),'MB with ',X.shape[0],'nodes and ',Is.shape[0],'edges \nsaved in location',filename)


evaluate and store event 21000  with 125576 hits
remove noise and tracks outside the barrel:  31347
eta cut  28554
n_hits cut 19058
call reprocess
done!
(19058, 6)
graph of size 11.242362976074219 MB with  19058 nodes and  566554 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21000_eta0.npz
evaluate and store event 21001  with 120844 hits
remove noise and tracks outside the barrel:  30091
eta cut  27300
n_hits cut 18408
call reprocess
done!
(18408, 6)
graph of size 10.566875457763672 MB with  18408 nodes and  531919 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21001_eta0.npz
evaluate and store event 21002  with 116176 hits
remove noise and tracks outside the barrel:  28613
eta cut  26222
n_hits cut 17561
call reprocess
done!
(17561, 6)
graph of size 9.474090576171875 MB with  17561 nodes and  475642 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21002_eta0.npz
evalu

evaluate and store event 21025  with 105855 hits
remove noise and tracks outside the barrel:  27081
eta cut  24797
n_hits cut 16695
call reprocess
done!
(16695, 6)
graph of size 8.436641693115234 MB with  16695 nodes and  422289 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21025_eta0.npz
evaluate and store event 21026  with 102441 hits
remove noise and tracks outside the barrel:  24176
eta cut  22006
n_hits cut 14737
call reprocess
done!
(14737, 6)
graph of size 6.909492492675781 MB with  14737 nodes and  344572 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21026_eta0.npz
evaluate and store event 21027  with 107595 hits
remove noise and tracks outside the barrel:  25962
eta cut  23678
n_hits cut 15574
call reprocess
done!
(15574, 6)
graph of size 7.542930603027344 MB with  15574 nodes and  376778 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21027_eta0.npz
evaluat

evaluate and store event 21050  with 122573 hits
remove noise and tracks outside the barrel:  30571
eta cut  27837
n_hits cut 18405
call reprocess
done!
(18405, 6)
graph of size 10.519847869873047 MB with  18405 nodes and  529457 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21050_eta0.npz
evaluate and store event 21051  with 122509 hits
remove noise and tracks outside the barrel:  30896
eta cut  28096
n_hits cut 18612
call reprocess
done!
(18612, 6)
graph of size 10.702159881591797 MB with  18612 nodes and  538767 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21051_eta0.npz
evaluate and store event 21052  with 111546 hits
remove noise and tracks outside the barrel:  27380
eta cut  24932
n_hits cut 16132
call reprocess
done!
(16132, 6)
graph of size 8.119285583496094 MB with  16132 nodes and  406326 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21052_eta0.npz
evalu

evaluate and store event 21075  with 103548 hits
remove noise and tracks outside the barrel:  25738
eta cut  23437
n_hits cut 15503
call reprocess
done!
(15503, 6)
graph of size 7.359039306640625 MB with  15503 nodes and  367222 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21075_eta0.npz
evaluate and store event 21076  with 108410 hits
remove noise and tracks outside the barrel:  26547
eta cut  24274
n_hits cut 16096
call reprocess
done!
(16096, 6)
graph of size 8.079341888427734 MB with  16096 nodes and  404275 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21076_eta0.npz
evaluate and store event 21077  with 94151 hits
remove noise and tracks outside the barrel:  21621
eta cut  19464
n_hits cut 12705
call reprocess
done!
(12705, 6)
graph of size 5.1599884033203125 MB with  12705 nodes and  255286 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21077_eta0.npz
evaluat

evaluate and store event 21100  with 104867 hits
remove noise and tracks outside the barrel:  25310
eta cut  23004
n_hits cut 14856
call reprocess
done!
(14856, 6)
graph of size 6.944812774658203 MB with  14856 nodes and  346281 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21100_eta0.npz
evaluate and store event 21101  with 84162 hits
remove noise and tracks outside the barrel:  20218
eta cut  18098
n_hits cut 11976
call reprocess
done!
(11976, 6)
graph of size 4.570240020751953 MB with  11976 nodes and  225241 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21101_eta0.npz
evaluate and store event 21102  with 109857 hits
remove noise and tracks outside the barrel:  26552
eta cut  24132
n_hits cut 15928
call reprocess
done!
(15928, 6)
graph of size 7.975914001464844 MB with  15928 nodes and  399054 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21102_eta0.npz
evaluate

evaluate and store event 21125  with 116164 hits
remove noise and tracks outside the barrel:  28479
eta cut  25816
n_hits cut 17063
call reprocess
done!
(17063, 6)
graph of size 8.906604766845703 MB with  17063 nodes and  446487 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21125_eta0.npz
evaluate and store event 21126  with 109042 hits
remove noise and tracks outside the barrel:  27151
eta cut  24756
n_hits cut 16405
call reprocess
done!
(16405, 6)
graph of size 8.516883850097656 MB with  16405 nodes and  426844 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21126_eta0.npz
evaluate and store event 21127  with 120242 hits
remove noise and tracks outside the barrel:  28690
eta cut  26071
n_hits cut 17256
call reprocess
done!
(17256, 6)
graph of size 9.170440673828125 MB with  17256 nodes and  460088 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21127_eta0.npz
evaluat

evaluate and store event 21150  with 115871 hits
remove noise and tracks outside the barrel:  28421
eta cut  25642
n_hits cut 16574
call reprocess
done!
(16574, 6)
graph of size 8.476119995117188 MB with  16574 nodes and  424504 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21150_eta0.npz
evaluate and store event 21151  with 102762 hits
remove noise and tracks outside the barrel:  25148
eta cut  22710
n_hits cut 14584
call reprocess
done!
(14584, 6)
graph of size 6.699710845947266 MB with  14584 nodes and  333757 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21151_eta0.npz
evaluate and store event 21152  with 115920 hits
remove noise and tracks outside the barrel:  29051
eta cut  26272
n_hits cut 17290
call reprocess
done!
(17290, 6)
graph of size 9.296283721923828 MB with  17290 nodes and  466645 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21152_eta0.npz
evaluat

evaluate and store event 21175  with 110987 hits
remove noise and tracks outside the barrel:  28547
eta cut  26089
n_hits cut 17762
call reprocess
done!
(17762, 6)
graph of size 9.764030456542969 MB with  17762 nodes and  490602 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21175_eta0.npz
evaluate and store event 21176  with 139267 hits
remove noise and tracks outside the barrel:  33958
eta cut  30806
n_hits cut 20636
call reprocess
done!
(20636, 6)
graph of size 13.043846130371094 MB with  20636 nodes and  659110 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21176_eta0.npz
evaluate and store event 21177  with 89927 hits
remove noise and tracks outside the barrel:  21183
eta cut  19293
n_hits cut 12784
call reprocess
done!
(12784, 6)
graph of size 5.256858825683594 MB with  12784 nodes and  260270 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21177_eta0.npz
evaluat

evaluate and store event 21200  with 123004 hits
remove noise and tracks outside the barrel:  30589
eta cut  27906
n_hits cut 18246
call reprocess
done!
(18246, 6)
graph of size 10.426544189453125 MB with  18246 nodes and  524756 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21200_eta0.npz
evaluate and store event 21201  with 126839 hits
remove noise and tracks outside the barrel:  32491
eta cut  29708
n_hits cut 20081
call reprocess
done!
(20081, 6)
graph of size 12.302898406982422 MB with  20081 nodes and  620929 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21201_eta0.npz
evaluate and store event 21202  with 101283 hits
remove noise and tracks outside the barrel:  25072
eta cut  22487
n_hits cut 14294
call reprocess
done!
(14294, 6)
graph of size 6.364971160888672 MB with  14294 nodes and  316555 edges 
saved in location /home/data/graph_full_6var_250MeV_150mmz0_1eta//myGraph_event_21202_eta0.npz
evalu

KeyboardInterrupt: 

# Create dummy data for validations:

In [None]:
import pandas as pd
import numpy as np
edges = pd.DataFrame(data=np.column_stack(([1,1,2,3,4,4,5,5], [4,5,5,5,6,7,7,8],[1,0,1,1,1,0,0,1])),
              columns=["index_1", "index_2", "weight"])
edges.index_1 = edges.index_1 - 1
edges.index_2 = edges.index_2 - 1
X = np.random.rand(8,5)

In [None]:
graph = Graph(X,edges[['index_1','index_2']],edges.weight.values)
filename = '/home/data/train_graphs/myGraph_event_%d.npz'%999999
np.savez(filename, **graph_to_sparse(graph))    

Load the data and validate the code you are using:

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch import nn, optim
import time
from torch.utils.data import Dataset, DataLoader
import torch

In [None]:
from collections import namedtuple
Graph = namedtuple('Graph', ['X', 'Is', 'y'])

def load_graph(filename):
    #print('loading',filename)
    with np.load(filename) as f:
        return sparse_to_graph(**dict(f.items()))
def sparse_to_graph(X, Is, y):
    return Graph(X, Is, y)

class trackDataLoader(Dataset):
    def __init__(self, filenames, n_samples=None):
        self.filenames = (
            filenames[:n_samples] if n_samples is not None else filenames)

    def __getitem__(self, index):
        return load_graph(self.filenames[index])

    def __len__(self):
        return len(self.filenames)
    
def collate_fn(graph):
        
        g = graph[0]
        
        # for CPU
        batch_target = torch.FloatTensor(g.y)
        
        n_hits = g.X.shape[0]
        n_edges = g.Is.shape[0]
        print('n_hits',n_hits)
        print('n_edges',n_edges)
        Ri = torch.sparse.FloatTensor(torch.LongTensor([g.Is[:,1],np.arange(n_edges)]),
                                      torch.ones(n_edges),
                                      torch.Size([n_hits,n_edges]))
        Ro = torch.sparse.FloatTensor(torch.LongTensor([g.Is[:,0],np.arange(n_edges)]),
                                      torch.ones(n_edges),
                                      torch.Size([n_hits,n_edges]))
        batch_inputs = [torch.FloatTensor(g.X),
                        torch.LongTensor(g.Is),
                        Ro, Ri]
        
        # for GPU
        #batch_target = torch.cuda.FloatTensor(g.y)
        #batch_inputs = [torch.cuda.FloatTensor(g.X),
        #                torch.cuda.LongTensor(g.Is),
        #                Ro.to(device), Ri.to(device)]
    
        
        return batch_inputs, batch_target

In [None]:
test_dataset = trackDataLoader([filename]) #use 1 file
print('lloaded - ',filename)
test_loader = DataLoader(test_dataset, collate_fn=collate_fn, num_workers = 0)

In [None]:
test_pred, test_target = next(iter(test_loader))
X, Is, Ro, Ri = test_pred
e = np.random.rand(8)

In [None]:
(Ri.mm(E))

In [None]:
Ri.transpose(0,1).to_dense()

In [None]:
        eXo = e[:,None]*X[Is[:,1]]
        eXi = e[:,None]*X[Is[:,0]]
        xo = Ro.mm(eXo)
        xi = Ri.mm(eXi)
        dxo = (xo - X)[Is[:,1]]
        dxi = (X - xi)[Is[:,0]]
        XEo = torch.cat([dxo,torch.sqrt(dxo[:,0]**2+dxo[:,1]**2)[:,None]], dim=1)
        XEi = torch.cat([dxi,torch.sqrt(dxi[:,0]**2+dxi[:,1]**2)[:,None]], dim=1)
        
        return self.network(torch.cat([XEo, XEi], dim=-1)).squeeze(-1)        


In [None]:
    y, X, Is, hits_id, labels = list_y[0].astype(np.float32), list_X[0], list_Is[0].values, list_hits_id[0], list_labels[0]
    graph = Graph(X,Is,y)
    filename = '/home/data/train_graphs/myGraph_event_%d.npz'%event_id

In [None]:
torch.Size([11,15])

In [None]:
weights = np.array([0.5,0.8,0.6,0.4,0.2,0.3])
weights

In [None]:
weights[weights>0.5]=1
weights[weights<=0.5]=0

In [None]:
weights
