### Loading docker image

If you are running on a public cluster, first download the data to `INPUTFOLDER/train_sample_full`, then follow the steps below:

* start docker
```bash
INPUT_DATA=INPUTFOLDER
docker run -it -p 8888:8888 --rm -v $(pwd):/home/code -v $INPUT_DATA:/home/data estradevictorantoine/trackml:1.0
jupyter notebook --ip 0.0.0.0 --no-browser --allow-root
```
* open a tunnel
In a new shell, open an SSH tunnel to the cluster:
```bash
ssh -N -f -L localhost:7008:localhost:8888 address.of.cluster
```

* open the notebook in Chrome:
go to http://localhost:7008/; the notebook will be in the `code` folder


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from torch import nn, optim
import time
from torch.utils.data import Dataset, DataLoader
import torch

# Generate files with inputs

This notebook is used to generate graphs and store them as `*.npz` files.

This notebook is meant to be run inside the docker container.

In [None]:
from trackml.dataset import load_dataset
# Folder holding the events to convert; it is handed to load_dataset below.
# /home/data is where the docker command at the top of this notebook mounts
# the input data folder.
PATH_TO_DATA = "/home/data/train_sample_full"
# Alternative input folder (swap with the line above to use it):
#PATH_TO_DATA = "/home/data/train_sample_single"

## input selection:

Generate inputs from the file using preprocess code:

In [None]:
from collections import namedtuple
# Container for one graph as written to disk. The save loop below reports
# X.shape[0] as the number of nodes and Is.shape[0] as the number of edges;
# y is presumably the per-edge label array -- confirm against preprocess.
Graph = namedtuple('Graph', ['X', 'Is', 'y'])
from preprocess import preprocess
def graph_to_sparse(graph):
    """Return the graph's ``X``, ``y`` and ``Is`` fields as a plain dict.

    The keys are the field names, so the result can be passed straight to
    ``np.savez`` as keyword arguments.
    """
    return {
        'X': graph.X,
        'y': graph.y,
        'Is': graph.Is,
    }
def get_size(graph):
    """Return the total size in bytes of all fields of ``graph``.

    ``graph`` must be a namedtuple whose fields all expose ``nbytes``
    (e.g. numpy arrays); the per-field sizes are simply added up.
    """
    return sum(getattr(graph, name).nbytes for name in graph._fields)

Select input features, and $\eta$ range:

In [None]:
# Names of the per-hit features handed to preprocess for every event.
feature_names = ['x', 'y', 'z', 'phi', 'eta', 'r']
# Alternative eta boundaries with several bins instead of a single range:
#eta_range = np.array([-1.5, -0.75, -0.5,-0.25, 0.0, 0.25, 0.5, 0.75, 1.5 ])
# eta boundaries handed to preprocess; presumably each consecutive pair of
# values defines one eta slice and hence one output graph -- confirm in preprocess.
eta_range = np.array([-1.5, 1.5 ])

Set the folder with the output data for training:

In [None]:
PATH='/home/data/graph_full_6var_250MeV_150mmz0_1eta/'
!rm -f /home/data/graph_full_6var_250MeV_150mmz0_1eta/*npz
!mkdir /home/data/graph_full_6var_250MeV_150mmz0_1eta

Since only the barrel is used, define a `getLayer` function that assigns each hit its layer number (10 layers in total):

In [None]:
def getLayer(volume_id, layer_id):
    """Translate a (volume_id, layer_id) pair into a single layer number.

    Volumes 8, 13 and 17 are numbered consecutively: ``layer_id // 2`` is
    shifted by 0, 4 and 8 respectively. Any other volume yields -1.
    """
    # Anything outside the three handled volumes is flagged with -1.
    if volume_id not in (8, 13, 17):
        return -1

    half = layer_id // 2
    if volume_id == 8:
        return half
    return half + (4 if volume_id == 13 else 8)

Produce the inputs:

In [None]:
# Iterate over every event found in PATH_TO_DATA; each item carries the event
# id plus the hits, cells, truth and particles tables of that event.
data = load_dataset(PATH_TO_DATA, parts=['hits', 'cells', 'truth', 'particles'])
# NOTE(review): `keys` is not used in this cell; kept in case another cell reads it.
keys = ['hit_id','x','y','z','particle_id','volume_id','layer_id']
# loop over all events in the folder
for event_id, hits, cells, truth, particles in data:

    print('evaluate and store event',event_id,' with',hits.shape[0],'hits')

    # Pseudorapidity of every hit: eta = -ln(tan(theta/2)), with theta the
    # polar angle computed from the transverse radius r and z.
    r = np.sqrt(hits['x']**2 + hits['y']**2)
    theta = np.arctan2(r,hits['z'])
    hits['eta'] = -np.log(np.tan(0.5*theta))
    # Layer number per hit (getLayer returns -1 outside volumes 8/13/17).
    # NOTE(review): this row-wise apply is slow; a vectorized version is
    # possible but may change the column dtype -- confirm what preprocess expects.
    hits['layer'] = hits.apply(lambda row: getLayer(row['volume_id'],row['layer_id']), axis=1)

    # filter event - remove noise, use only barrel, keep hits associated to
    # tracks with at least 10 hits
    new_hits = hits.merge(truth[['hit_id','particle_id']], on='hit_id').copy()

    # Keep a particle only if all of its hits have a positive layer number,
    # then drop the hits whose particle_id is not positive.
    group_hits = new_hits.groupby('particle_id')
    new_hits = group_hits.filter(lambda grp: grp['layer'].min() > 0)
    new_hits = new_hits.loc[new_hits['particle_id']>0]
    print('remove noise and tracks outside the barrel: ',new_hits.shape[0])

    # filter hits to be within the eta region:
    # NOTE(review): hard-coded here; matches the outer bounds of eta_range as
    # currently defined -- keep the two in sync if eta_range changes.
    eta_cut = (-1.5,1.5)
    new_hits = (new_hits.loc[(new_hits['eta']>eta_cut[0]) & (new_hits['eta']<=eta_cut[1])])
    print('eta cut ' ,new_hits.shape[0])

    if new_hits.empty:
        # Without any group, pd.concat below would raise ValueError.
        print('no hits left after the eta cut, skipping event',event_id)
        continue

    # Attach to every hit the number of hits of its particle (counted after
    # the cuts above) and keep only particles with at least 10 hits.
    # A scalar groupby key keeps the group names scalar, which is what
    # get_group receives here.
    group_hits = new_hits.groupby('particle_id')
    track_idx = group_hits.indices
    new_hits = pd.concat([group_hits.get_group(pid).assign(nhits=len(idx)) for pid, idx in track_idx.items()])
    new_hits = new_hits.loc[(new_hits['nhits']>9)]
    print('n_hits cut' ,new_hits.shape[0])

    # reprocess the event using the compiled "preprocess" function, compute
    # full graph for entire event (don't split)
    print('call reprocess')
    list_y, list_X, list_Is, list_hits_id, list_labels = preprocess(new_hits.copy(), eta_range, feature_names)
    print('done!')
    # preprocess returns parallel lists; each position is saved as one graph
    # file, and its index goes into the file name.
    for eta_idx, (y, X, Is, hits_id, labels) in enumerate(zip(list_y, list_X, list_Is, list_hits_id, list_labels)):
        # store the inputs
        print(X.shape)
        y = y.astype(np.float32)
        # Is exposes .values (pandas-like); store the underlying array.
        Is = Is.values
        graph = Graph(X,Is,y)
        filename = PATH+'/myGraph_event_%d_eta%d.npz'%(event_id,eta_idx)
        np.savez(filename, **graph_to_sparse(graph))
        print('graph of size',get_size(graph)/(1024*1024),'MB with ',X.shape[0],'nodes and ',Is.shape[0],'edges \nsaved in location',filename)
