### Reconstruct all events using four GPUs and the CRS2 spherical optimizer running in batch mode

In [1]:
import pickle
import math
import time
import pkg_resources

from multiprocessing import Process, Pool
from itertools import repeat

import numpy as np
import pandas as pd

from freedom.llh_service.llh_service import LLHService

import sys
sys.path.append('../../examples/reco/')
import reco

In [2]:
# Load the test events from a pickle file.
# NOTE(review): this is an absolute, user-specific path, so the notebook only
# runs on a machine where this file exists.
# NOTE(review): pickle.load can execute arbitrary code; only use trusted files.
with open('/home/atfienberg/freedomDataCopy/public_for_aaron/test_events.pkl', 'rb') as f:
    events = pickle.load(f)

In [3]:
# Resolve the allowed-DOM table shipped inside the installed `freedom` package,
# then load it. `ndoms` is forwarded to the LLH services through service_conf.
allowed_doms_path = pkg_resources.resource_filename(
    'freedom', 'resources/allowed_DOMs.npy'
)
allowed_DOMs = np.load(allowed_doms_path)
ndoms = len(allowed_DOMs)

### Specify the number of live points

In [4]:
# Number of live points; passed as the last argument to reco.fit_events below.
n_live_points = 97

### Define allowed search ranges and initial box limits

In [5]:
# Half-width of the initial box in x and y; the z box is twice as wide.
init_pos_range = 50

time_range = (-1000, 0)

log_energy_range = [0, 1.7]

# One [low, high] row per hypothesis parameter (8 rows in total).
# NOTE(review): row order presumably matches x, y, z, time, azimuth, zenith,
# cascade energy, track energy -- confirm against reco.fit_events.
xy_box = [-init_pos_range, init_pos_range]
z_box = [-2*init_pos_range, 2*init_pos_range]
time_box = [time_range[0], time_range[1]]
log_energy_box = [log_energy_range[0], log_energy_range[1]]

init_range = np.array([
    xy_box,
    xy_box,
    z_box,
    time_box,
    [0, 2*math.pi],
    [0, math.pi],
    log_energy_box,
    log_energy_box,
])

# define limits of search range
# Written as one [low, high] pair per parameter, then transposed so the final
# array has shape (2, 8): row 0 holds the lower limits, row 1 the upper limits.
search_limit_pairs = [
    [-200, 200],
    [-250, 250],
    [-800, -200],
    [8000, 11000],
    [0, 2*math.pi],
    [0, math.pi],
    [5, 400],
    [0.1, 400],
]
param_search_limits = np.array(search_limit_pairs).T

### LLH service configuration

In [6]:
# Configuration handed to every LLH service process (see reco.start_service).
# The presence of a "domnet_file" key vs. a "chargenet_file" key decides which
# per-event data is attached to the events further down.
service_conf = dict(
    poll_timeout=1,
    flush_period=1,
    n_hypo_params=8,
    n_hit_features=9,
    n_evt_features=2,
    batch_size=dict(
        n_hypos=200,
        n_observations=6000,
    ),
    send_hwm=10000,
    recv_hwm=10000,
    # alternative network files, kept for reference:
    # hitnet_file="/home/atfienberg/freedomDataCopy/public_for_aaron/HitNet_ranger_30_Jul_2020-15h49/epoch_32_model.hdf5",
    # domnet_file="/home/atfienberg/freedomDataCopy/public_for_aaron/DOMNet_reduced_22_Jul_2020-15h18/epoch_30_model.hdf5",
    hitnet_file="/home/atfienberg/freedomDataCopy/public_for_aaron/HitNet_ranger_14_Jul_2020-08h03/epoch_32_model.hdf5",
    chargenet_file="ChargeNet_nChannels_22_May_2020-11h05/epoch_400_model.hdf5",
    ndoms=ndoms,
)

In [7]:
# Attach the 'hit_data' and 'evt_data' entries each event needs, based on the
# networks configured in service_conf. Only domnet and chargenet are supported
# for now. The events are modified in place.
trim_hit_features = service_conf['n_hit_features'] == 8
uses_domnet = 'domnet_file' in service_conf

for event in events:
    hits = event['hits']
    # an 8-feature hitnet only takes the first 8 hit columns
    event['hit_data'] = hits[:, :8] if trim_hit_features else hits

    # domnet takes per-DOM data restricted to the allowed DOMs;
    # otherwise (chargenet) the event's total charge is used
    event['evt_data'] = (
        event['doms'][allowed_DOMs] if uses_domnet else event['total_charge']
    )

### Build four services, one per GPU

In [8]:
# Number of GPUs; one LLH service process is started per GPU index below.
n_gpus = 4

In [9]:
# Each service gets its own request and control IPC endpoint: the base address
# with the GPU index appended.
base_req = "ipc:///tmp/atfrecotestreq"
base_ctrl = "ipc:///tmp/atfrecotestctrl"

req_addrs = [f'{base_req}{i}' for i in range(n_gpus)]
ctrl_addrs = [f'{base_ctrl}{i}' for i in range(n_gpus)]

In [10]:
# Show the generated endpoints, request addresses first, one list per line.
print(req_addrs, ctrl_addrs, sep='\n')

['ipc:///tmp/atfrecotestreq0', 'ipc:///tmp/atfrecotestreq1', 'ipc:///tmp/atfrecotestreq2', 'ipc:///tmp/atfrecotestreq3']
['ipc:///tmp/atfrecotestctrl0', 'ipc:///tmp/atfrecotestctrl1', 'ipc:///tmp/atfrecotestctrl2', 'ipc:///tmp/atfrecotestctrl3']


In [11]:
# Create one service process per GPU index, each bound to its own control and
# request address, then start them. The handles are kept in `procs` so the
# processes can be joined when the services are shut down.
procs = [
    Process(
        target=reco.start_service,
        args=(service_conf, ctrl_addrs[i], req_addrs[i], i),
    )
    for i in range(n_gpus)
]
for proc in procs:
    proc.start()

starting service work loop for gpu 3...
starting service work loop for gpu 2...
starting service work loop for gpu 0...
starting service work loop for gpu 1...
Received die command... flushing and exiting
cleaning up
Received die command... flushing and exiting
cleaning up
Received die command... flushing and exiting
cleaning up
Received die command... flushing and exiting
cleaning up


### Fit a single event

In [12]:
%%time
# Fit only the first event as a quick check before the full pooled run.
# NOTE(review): the second argument (0) is presumably the GPU/service index;
# the pooled run below passes gpu_inds in this position -- confirm in reco.fit_events.
test_out = reco.fit_events(events[:1], 0, ctrl_addrs, init_range, param_search_limits, n_live_points)

CPU times: user 404 ms, sys: 44 ms, total: 448 ms
Wall time: 8.02 s


In [13]:
# Inspect the output for the single fitted event. Element 0 is the optimizer
# result (indexed by 'fun', 'n_calls', 'nit').
# NOTE(review): elements 1 and 2 are presumably the LLH at the true and retro
# parameters -- confirm against reco.fit_events.
first_evt_out = test_out[0]
opt_result = first_evt_out[0]

for llh_value in (opt_result['fun'], first_evt_out[1], first_evt_out[2]):
    print(llh_value)
print('---')
for counter_key in ('n_calls', 'nit'):
    print(opt_result[counter_key])

-26.782982
-20.648993
-19.15583
---
3239
165


In [14]:
# Split the events to reconstruct into `pool_size` chunks, one per pool worker.
# events_to_process = len(events)
events_to_process = 400
pool_size = 200

# Round up so that pool_size chunks are always enough to hold every selected event.
evts_per_proc = math.ceil(events_to_process / pool_size)

# Restrict to the first `events_to_process` events *before* chunking. Slicing
# `events` directly would run past the requested count whenever
# events_to_process is not a multiple of pool_size, because the ceil-sized
# chunks then cover more than events_to_process events.
selected_events = events[:events_to_process]
evt_splits = [selected_events[i*evts_per_proc:(i+1)*evts_per_proc] for i in range(pool_size)]

# Sanity check: number of events that will be processed. Trailing chunks may be
# empty when there are fewer events than pool_size * evts_per_proc.
print(sum(len(l) for l in evt_splits))

400


In [15]:
# Round-robin assignment of a GPU index (0 .. n_gpus-1) to each of the pool_size splits.
gpu_inds = np.arange(pool_size) % n_gpus

In [16]:
%%time
start = time.time()
# reconstruct with a worker pool; one LLH client per worker
with Pool(pool_size) as p:
    outs = p.starmap(reco.fit_events, zip(evt_splits, gpu_inds, 
                                          repeat(ctrl_addrs), repeat(init_range), 
                                          repeat(param_search_limits), repeat(n_live_points)))
delta = time.time() - start

CPU times: user 540 ms, sys: 3.02 s, total: 3.56 s
Wall time: 47.2 s


In [17]:
# delta is in seconds (difference of two time.time() readings); report it in minutes.
print(f'measured time: {delta/60:.1f} minutes')

measured time: 0.8 minutes


In [18]:
# Total number of per-event results returned across all workers.
print(sum(len(out) for out in outs))

400


In [19]:
# Number of entries in the first event's 'params'.
# NOTE(review): n_params is not referenced by any other cell shown here.
n_params = len(events[0]['params'])

In [20]:
# Flatten the per-worker result lists into one list of per-event results.
# A comprehension is linear in the number of results, whereas sum(..., [])
# builds a new list on every addition and is quadratic.
all_outs = [evt_out for worker_outs in outs for evt_out in worker_outs]

Note: the following timing is from running on four Titan X GPUs in parallel

In [21]:
# Accumulate the optimizer counters over all events in a single pass.
# Element 0 of each per-event output is the optimizer result.
total_calls = 0
total_iters = 0
for evt_out in all_outs:
    opt_result = evt_out[0]
    total_calls += opt_result['n_calls']
    total_iters += opt_result['nit']

# delta is the wall-clock time of the pooled run in seconds, so the per-call
# and per-iteration figures are averages over the whole parallel run.
print(f'{total_calls} total calls')
time_per_call = delta/total_calls
print(f'{total_iters} total iters')
time_per_iter = delta/total_iters

print(f'{total_calls/len(all_outs):.1f} calls per event')
print(f'{time_per_call*1e6:.2f} us per call')

print(f'{total_iters/len(all_outs):.1f} iters per event')
print(f'{time_per_iter*1e6:.2f} us per iter')

1995865 total calls
99162 total iters
4989.7 calls per event
23.66 us per call
247.9 iters per event
476.16 us per iter


In [22]:
# Names for the eight fitted parameters, passed to reco.build_summary_df
# together with the flattened per-event results.
par_names = [
    'x',
    'y',
    'z',
    'time',
    'azimuth',
    'zenith',
    'cascade energy',
    'track energy',
]
df = reco.build_summary_df(all_outs, par_names)

In [23]:
# Select the events whose free-fit LLH is below the retro LLH plus a tolerance
# of 10, and report what fraction of all events they make up.
free_fit_ok_mask = df.free_fit_llh < df.retro_p_llh + 10
free_f_better = df.loc[free_fit_ok_mask]
frac = len(free_f_better)/len(df)
print(f'free fit better frac: {frac:.2f}')

free fit better frac: 0.99


In [24]:
# Events whose free-fit LLH is at least 5 above the retro LLH.
bad_fit_mask = df.free_fit_llh >= df.retro_p_llh + 5
bad = df.loc[bad_fit_mask]

In [25]:
# Show the first few rows of the `bad` selection.
bad.head()

Unnamed: 0,evt_idx,free_fit_llh,true_p_llh,retro_p_llh,n_calls,n_iters,x,y,z,time,azimuth,zenith,cascade energy,track energy
59,59,-21.73916,-32.207302,-33.14716,4268,208,182.947175,-248.250557,-692.4535,9475.536676,5.517159,2.35487,5.076019,0.415987
228,228,-31.02685,-30.454,-36.645184,5536,265,194.586054,55.769015,-460.530232,9468.533814,6.178632,1.535882,5.02372,23.896223
296,296,10000000000.0,-37.760033,-41.822254,97,0,132.899733,28.010456,-221.850347,12109.470241,5.194571,2.622121,11.121643,3.063659
305,305,-260.6486,-269.864471,-288.172119,5768,286,19.979787,-52.767462,-384.441221,9785.034272,2.32915,2.33322,5.07283,12.536853
322,322,-35.87344,-37.282921,-44.327633,3996,196,-149.103619,-68.301022,-429.982624,9312.129662,3.159588,1.286287,6.333071,30.480493


In [26]:
# Save the summary DataFrame as a pickle in the current working directory.
df.to_pickle('./test_out.pkl')

In [27]:
# kill all the services
# Send "die" to each service's control address, then wait for that service
# process to exit before moving on to the next one.
import zmq

zmq_ctx = zmq.Context.instance()
for proc, ctrl_addr in zip(procs, ctrl_addrs):
    with zmq_ctx.socket(zmq.REQ) as ctrl_sock:
        ctrl_sock.connect(ctrl_addr)
        ctrl_sock.send_string("die")
        # join while the socket is still open; it is closed on leaving the block
        proc.join()