## Notebook for loading events from two lists of root files with NanoEventsFactory

Sorts events array from one list and searches events using array from second list

In [1]:
import awkward as ak
import numpy as np
import time
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema
import sys


fname = "step1_NANO_NoClusterThreshold_ButOnlyGenMatchBelow5GeV_54.root"
fname = 'root://cmsxrootd.fnal.gov//store/mc/RunIISummer19UL17NanoAOD/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/JMECustomTuples_106X_mc2017_realistic_v6-v1/280000/0CEC4EFC-9CBD-B64C-8721-29D0CBB1F0AA.root'
ext = 'root://cmsxrootd.fnal.gov//'


In [2]:
toc_notebook = time.time()
delta = -1

In [3]:
def printTime(delta):
    
    print('Time: {0:0.0f} m {1:0.2f} s'.format(delta // 60, delta % 60))
    
def progressBar(width, count):
    toolbar_width = width

    # setup toolbar
    if(count==1): 
        sys.stdout.write("[{}]".format(" " * toolbar_width))
        sys.stdout.flush()
        
        
    else :    
        for i in range(width+1):
            sys.stdout.flush()
            sys.stdout.write('\b')
            

        sys.stdout.write('='*count + ' '*(width-count) + ']')
        sys.stdout.flush()


        if (count==width): sys.stdout.write("]\n")

### Get lists of files

In [4]:
# file = open('filenames.txt').readlines()
file_flatPU = open('filenames_flatPU.txt').readlines()
file_epsilonPU = open('filenames_epsilonPU.txt').readlines()

# filenames = ['root://cmsxrootd.fnal.gov/' + f.split()[0] for f in file]
filenames_flatPU = ['root://cmsxrootd.fnal.gov/' + f.split()[0] for f in file_flatPU]
filenames_epsilonPU = ['root://cmsxrootd.fnal.gov/' + f.split()[0] for f in file_epsilonPU]

### Shorten filenames list to only 5 files for testing

skipping 4th file in filenames_epsilonPU because xrootd was not loading it


In [5]:
filenames1 = filenames_flatPU[0:5]

filenames2 = filenames_epsilonPU[0:4]
filenames2.append(filenames_epsilonPU[5])

In [8]:
events_flatPU = []
events_epsilonPU = []

toc = time.time()
print('N files: {0:.0f}'.format(len(filenames1)))
count = 0
for file in filenames1:
    
    events_flatPU.append( np.array(NanoEventsFactory.from_root(file, schemaclass=NanoAODSchema).events().event,
                                   dtype='<U64' ).flatten())
    
    count+=1
    progressBar(len(filenames1), count )

tic = time.time()
printTime(tic-toc)

N files: 5
[=====]]
Time: 0 m 25.29 s


### Convert list of event arrays to awkward array

In [9]:
toc = time.time()

ak_arr1 = ak.Array(events_flatPU)
del events_flatPU

tic = time.time()
printTime(tic-toc)

Time: 0 m 6.59 s


### Flatten awkward array and convert to numpy array

In [10]:


toc = time.time()

ak_flat1 = ak.flatten(ak_arr1)
del ak_arr1

np_arr1 = np.array(ak_flat1)
del ak_flat1

tic = time.time()

printTime(tic-toc)



Time: 1 m 4.21 s


In [12]:
events_epsilonPU = []
toc = time.time()
print('N files: {0:.0f}'.format(len(filenames2)))
count = 0
for file in filenames2:
    
    events_epsilonPU.append( np.array(NanoEventsFactory.from_root(file, schemaclass=NanoAODSchema).events().event,
                                   dtype='<U64' ).flatten())
    
    count+=1
    progressBar(len(filenames2), count )

tic = time.time()
printTime(tic-toc)

N files: 5
[=====]]
Time: 0 m 19.37 s


In [13]:
toc = time.time()

ak_arr2 = ak.Array(events_epsilonPU)
del events_epsilonPU

tic = time.time()
printTime(tic-toc)

Time: 0 m 3.51 s


In [14]:
toc = time.time()

ak_flat2 = ak.flatten(ak_arr2)
del ak_arr2


np_arr2 = np.array(ak_flat2)
del ak_flat2

tic = time.time()

printTime(tic-toc)

Time: 0 m 34.86 s


### Pad smaller events array

In [15]:
if(np_arr1.shape[0] > np_arr2.shape[0]):
    diff = np_arr1.shape[0] - np_arr2.shape[0]
    np_arr_pad = np.pad(np_arr2, (diff, 0),  constant_values=(-1))
    np_arr = np_arr1
#     del np_arr1
#     del np_arr2

    print('flatPU')
    print(np_arr.shape[0])
    print('epsilonPU')
    print(np_arr_pad.shape[0])
    
else:
    diff = np_arr2.shape[0] - np_arr1.shape[0]
    np_arr_pad = np.pad(np_arr1, (diff, 0),  constant_values=(-1))
    np_arr = np_arr2
#     del np_arr1
#     del np_arr2

    print('flatPU')
    print(np_arr_pad.shape[0])
    print('epsilonPU')
    print(np_arr.shape[0])

flatPU
3792100
epsilonPU
3792100


In [16]:
print(np_arr_pad.shape[0])
print(np_arr.shape[0])

3792100
3792100


### Sort 1st array

In [17]:
toc = time.time()

ind1 = np.argsort(np_arr)

tic = time.time()
print('Time: {0:0.2f} s'.format(tic-toc))

Time: 0.48 s


### Search sorted array with 2nd array

In [18]:
toc = time.time()

ind2 = np.searchsorted(np_arr[ind1], np_arr_pad)

tic = time.time()
print('Time: {0:0.2f} s'.format(tic-toc))

Time: 0.36 s


In [19]:
sorted_ind = ind1[ind2]

In [20]:
del ind1
del ind2

In [21]:
print('sorted array 1')
print(np_arr[sorted_ind])
print('matched array to array 1')
print(np_arr_pad)

sorted array 1
['1' '1' '1' ... '195820' '195820' '195820']
matched array to array 1
['-1' '-1' '-1' ... '19581997' '19581993' '19581931']


In [22]:
matchedEvents = (np_arr[sorted_ind] == np_arr_pad)

print('matched events')
print(np_arr[matchedEvents])
print()
print('shape')
print(np_arr[matchedEvents].shape[0])
print()
print('percentage matched events')
print('{0:0.2f}%'.format((np_arr[matchedEvents].shape[0] / np_arr.shape[0])*100))


matched events
['7741911' '7741903' '7741901' ... '15654186' '15654195' '15654198']

shape
106500

percentage matched events
2.81%


In [23]:
if(delta < 0):
    tic_notebook = time.time()
delta = tic_notebook-toc_notebook
print('Time: {0:0.2f} m {1:0.2f} s'.format(delta // 60, delta % 60))

Time: 2.00 m 35.06 s


Notebook runs in ~3 mins for 5 files