In [None]:
import uproot
import awkward as ak

import numpy as np

import json

import time

# Download the files

## Lumi file

https://opendata.cern.ch/record/14220

https://opendata.cern.ch/record/14220/files/Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt

## Data file

https://opendata.cern.ch/record/30522

I grabbed this one because it's smaller and easier to prototype with.

4E3A26DE-E53B-A844-8048-36376617AE8D.root

Grabbed this one for testing with larger files

3B20EB8F-4FD1-D041-9513-1A82351756E1.root



In [None]:
#!wget https://opendata.cern.ch/record/14220/files/Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt

In [None]:
# Uncomment to download NanoAOD files

# Small file (~60 MB)
#!wget --no-check-certificate https://eospublic.cern.ch//eos/opendata/cms/Run2016G/DoubleMuon/NANOAOD/UL2016_MiniAODv2_NanoAODv9-v2/2430000/4E3A26DE-E53B-A844-8048-36376617AE8D.root

# Big file (~1 GB)
#!wget --no-check-certificate https://eospublic.cern.ch//eos/opendata/cms/Run2016G/DoubleMuon/NANOAOD/UL2016_MiniAODv2_NanoAODv9-v2/2430000/3B20EB8F-4FD1-D041-9513-1A82351756E1.root


In [None]:
# Read in the JSON file with the good luminosity sections.
# Awkward parses it into a record with one field per run number (the field
# names are strings); each field holds that run's list of [first, last]
# luminosity-block ranges.
# The `with` block closes the file handle as soon as parsing is finished.
with open('Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt', 'rb') as lumi_json:
    good_luminosity_sections = ak.from_json(lumi_json)

# Example: the good luminosity-block ranges for a single run
good_luminosity_sections['273158']

In [None]:
# Small file for testing
f = uproot.open('4E3A26DE-E53B-A844-8048-36376617AE8D.root')

# Larger file
#f = uproot.open('3B20EB8F-4FD1-D041-9513-1A82351756E1.root')

# 'Events' is the tree that the rest of the notebook reads from (as `t`)
t = f['Events']

# Slow test

Here is a way to build the mask using a very explicit but slow approach with standard Python loops.

In [None]:
# Pull out the number of entries
nevents = t.num_entries
print(f"nevents: {nevents}\n")

# Get the run and luminosity block info from the TTree
# (`run` and `lumiBlock` are reused by the slow loop and the explanation cells)
run = t['run'].array()
lumiBlock = t['luminosityBlock'].array()

# Show the first ten values of each branch, then check that the two
# arrays have the same length
print(run[0:10])
print(lumiBlock[0:10])
print()
print(len(run), len(lumiBlock))

In [None]:
# Loop over the entries in the TTree and see if they are
# in the luminosity file

# Time it
start = time.time()

# This is the mask we will create: one flag per event, False until the
# event's (run, luminosity block) pair is found inside a good range
mask_slow_test = np.zeros(nevents, dtype=bool)

# Runs listed in the luminosity file (the field names are strings)
certified_runs = set(good_luminosity_sections.fields)

for idx,(r,lb) in enumerate(zip(run,lumiBlock)):

    if idx%10000==0:
        print(f"{idx} out of {nevents}")

    # r is an integer but appears as a string in the Awkward array
    # of good luminosity sections
    run_key = str(r)

    # A run that is missing from the luminosity file has no good blocks at
    # all: leave the event masked out instead of failing on the missing field
    if run_key not in certified_runs:
        continue

    for good_block in good_luminosity_sections[run_key]:
        # Each good block is a [first, last] pair, inclusive at both ends
        if lb>=good_block[0] and lb<=good_block[1]:
            mask_slow_test[idx] = True
            # One matching range is enough, no need to test the others
            break

print(f"Time to run: {time.time() - start:.4f} seconds")

mask_slow_test

In [None]:
# Here's how I would use the mask on an array:
# indexing with the boolean mask keeps only the events flagged True
t['Muon_charge'].array()[mask_slow_test]

# Faster

Here we've built a function to handle the masking using cool tricks in `awkward`.

In [None]:
def build_lumi_mask(lumifile, tree, verbose=False):
    """Build a per-event boolean mask of events in good luminosity sections.

    Parameters
    ----------
    lumifile : str or path-like
        Name/path of the JSON luminosity file. It maps each run number
        (as a string key) to a list of ``[first, last]`` luminosity-block
        ranges; both ends are treated as inclusive.
    tree : object
        Tree providing ``num_entries`` and the branches ``'run'`` and
        ``'luminosityBlock'`` through ``tree[name].array()``. Assumed to be
        a NanoAOD Events tree.
    verbose : bool, optional
        If True, print the intermediate arrays.

    Returns
    -------
    ak.Array
        One boolean per event: True when the event's luminosity block lies
        inside one of the good ranges listed for the event's run. Events
        whose run does not appear in the luminosity file are False.
    """
    # Parse the lumi file; the `with` block closes the handle afterwards
    with open(lumifile, 'rb') as lumi_fh:
        good_luminosity_sections = ak.from_json(lumi_fh)

    # Pull out the good runs as integers (the field names are strings)
    run_fields = good_luminosity_sections.fields
    good_runs = np.array(run_fields).astype(int)

    # Get the good blocks as one awkward array, in the same order as good_runs
    all_good_blocks = ak.Array([good_luminosity_sections[field] for field in run_fields])

    # Assume that tree is a NanoAOD Events tree
    nevents = tree.num_entries
    if verbose:
        print(f"nevents: {nevents}")
        print()
        print("All good runs")
        print(good_runs)
        print()
        print("All good blocks")
        print(all_good_blocks)
        print()

    # Get the runs and luminosity blocks from the tree that was passed in
    # (not from a notebook-level global)
    run = tree['run'].array()
    lumiBlock = tree['luminosityBlock'].array()

    if verbose:
        print("Runs from the tree")
        print(run)
        print()
        print("Luminosity blocks from the tree")
        print(lumiBlock)
        print()

    # With no good runs at all, nothing can pass the selection
    if len(good_runs) == 0:
        return ak.Array(np.zeros(len(run), dtype=bool))

    # Find index of values in arr2 if those values appear in arr1
    # (-1 marks a value of arr2 that does not appear in arr1)
    def find_indices(arr1, arr2):
        index_map = {value: index for index, value in enumerate(arr1)}
        return [index_map.get(value, -1) for value in arr2]

    # Get the indices that say where the good runs are in the lumi file
    # for the runs that appear in the tree
    good_runs_indices = np.asarray(find_indices(good_runs, run), dtype=np.int64)

    # A -1 used directly as an index would silently pick the LAST run of the
    # lumi file. Remember which events have a known run, index with a safe
    # placeholder (0) for the others, and veto them at the end.
    run_is_known = good_runs_indices >= 0
    safe_indices = np.where(run_is_known, good_runs_indices, 0)

    # For each event, calculate the difference between the luminosity block for that event
    # and the good luminosity blocks for that run for that event
    diff = lumiBlock - all_good_blocks[safe_indices]

    if verbose:
        print("difference between event lumi blocks and the good lumi blocks")
        print(diff)
        print()

    # If the lumi block lies strictly between the two edges of a good range,
    # one difference is positive and the other negative, so their product is
    # negative. If it sits exactly on an edge, one difference is zero and so
    # is the product. Outside the range both differences have the same sign
    # and the product is positive.
    #
    # So product <= 0 means "inside the range, edges included".
    prod_diff = ak.prod(diff, axis=2)

    if verbose:
        print("product of the differences")
        print(prod_diff)
        print()

    # An event is good if it falls in any good range of its own run
    mask = ak.any(prod_diff<=0, axis=1) & run_is_known

    return mask

In [None]:
# Use the function to build the mask

# Wall-clock timing, to compare with the explicit loop above
start = time.time()

mask_fast_test = build_lumi_mask('Cert_271036-284044_13TeV_Legacy2016_Collisions16_JSON.txt', t)#, verbose=True)

print(f"Time to run: {time.time() - start:.4f} seconds")

In [None]:
# Check the fast mask against the slower, naive way of doing things

# ak.num(..., axis=0) is the length of the outermost axis, i.e. the number
# of events. The two masked counts should be equal if the masks agree.
num = ak.num(t['Electron_eta'].array(), axis=0)
print(f"no mask:   {num}")

num = ak.num(t['Electron_eta'].array()[mask_slow_test], axis=0)
print(f"slow mask: {num}")

num = ak.num(t['Electron_eta'].array()[mask_fast_test], axis=0)
print(f"fast mask: {num}")

In [None]:
# That mask can be used on individual awkward arrays

# AFAIK, you can't just apply the mask to the TTree object, but
# you can load the TTree into memory as an awkward array and then
# mask the entire thing.

# It can take a while just to load it into memory!
# Maybe 2 min for 1 million events

# Load the TTree into memory as an awkward array

start = time.time()

tree_array = t.arrays()
# NOTE(review): this bare expression is not the last line of the cell,
# so it does not display anything
tree_array

print(f"Time to run: {time.time() - start:.4f} seconds")

In [None]:
# Then you can just mask the tree:
# apply the event mask first, then pick a single branch from what is left
tree_array[mask_fast_test]['Jet_area']

In [None]:
# Keep the masked version of the whole tree under its own name
masked_tree_array = tree_array[mask_fast_test]

masked_tree_array

# Explanation of the fast code

Here's how the masking function works. There might be better ways to do this, but this is how I approached it. 

In [None]:
# Here's what we read in from the json file of the good runs and lumi blocks:
# one field per run, each holding that run's list of good lumi-block ranges.
good_luminosity_sections

In [None]:
# Get the good runs from the lumi file

# The field names are the run numbers as strings; convert them to integers
# so they can be compared with the run numbers stored in the TTree
good_runs = np.array(good_luminosity_sections.fields).astype(int)
good_runs

In [None]:
# Gather the good lumi-block ranges of every run, in the same order as the
# fields of the record, and pack them into a single awkward array.
# This is quick to build and turns "ranges for run number i" into a plain
# index lookup.
all_good_blocks = ak.Array(
    [good_luminosity_sections[run_key] for run_key in good_luminosity_sections.fields]
)

# For example
all_good_blocks[11]

In [None]:
# A helper function!

# ChatGPT helped me with this!

# Find index of values in arr2 if those values appear in arr1

def find_indices(arr1, arr2):
    """Return, for every value of arr2, its position in arr1.

    A value of arr2 that does not occur in arr1 gets -1. If a value occurs
    several times in arr1, the position of its last occurrence is used.
    The result is a plain list with one entry per element of arr2.
    """
    # Map each value of arr1 to its position; later duplicates overwrite
    # earlier ones
    position_of = {}
    for position, element in enumerate(arr1):
        position_of[element] = position

    # Look every value of arr2 up, falling back to -1 when it is unknown
    positions = []
    for element in arr2:
        positions.append(position_of.get(element, -1))
    return positions



In [None]:
# Pass in the good runs from the json file and the
# runs from the TTree
result = find_indices(good_runs, run)

# Result is the indices of where to find the runs in the list of good lumi runs/blocks.
# We can pass in all the indices to get the good blocks.

# There is one index per event, so only show the first few
print(result[:10])
print()
print(len(run))
print(len(result))
print()

# Cross-check a single event: looking its index up in good_runs should give
# back the run number stored in the TTree for that event.
# (An index of -1 means the run is not in the lumi file; the lookup then
# returns the last good run and the two numbers will not match.)
print(result[10])
print(run[10])

print(good_runs[result[10]])
#lumi_runs

In [None]:
# Pass in all the indices to get the good info from the lumi block
# for each run in the TTree (NanoAOD file)
# NOTE(review): an index of -1 (run not found in the lumi file) selects the
# last entry of good_runs here, not "nothing"
good_runs[result]

In [None]:
# We can pass in the result indices to get the list of good lumi blocks
# for each event in the TTree (NanoAOD file)
# NOTE(review): as above, a -1 index picks the last run's blocks
all_good_blocks[result]



In [None]:
# If I take the difference between each event's lumiBlock
# and the list of good blocks for that run, I will get one positive
# and one negative number if the block is within the range
# (or a zero if it sits exactly on one of the edges).

diff = lumiBlock - all_good_blocks[result]

diff[0:5]

In [None]:
# If I take the product of those numbers I will get a negative
# number if they have different signs

# axis=2 multiplies the two differences of each [first, last] pair
prod_diff = ak.prod(diff, axis=2)

prod_diff[0:5]

In [None]:
# Strictly negative products only; the mask built in the next cell uses <= 0
# so that a lumi block equal to a range edge also counts
prod_diff<0

In [None]:
# I can use ak.any to see which events have a product that is less than
# or equal to 0 for at least one of the run's good ranges,
# which means it is a good run/lumiBlock!

mask2 = ak.any(prod_diff<=0, axis=1)

mask2

In [None]:
# Number of events before masking (length of the outermost axis)
ak.num(t['Electron_eta'].array(), axis=0)

In [None]:
# Number of events left after applying mask2
ak.num(t['Electron_eta'].array()[mask2], axis=0)

In [None]:
# The masked branch itself
t['Electron_eta'].array()[mask2]

# Tests

This is where I prototyped stuff. 

In [None]:
# Small hand-made stand-in for the lumi file:
# run number -> list of [first, last] lumi-block ranges (run 30 has none)
mock_lumi_info = {
    10: [[1, 100]],
    20: [[2, 50], [500, 942]],
    30: [],
    40: [[200, 500], [520, 594], [720, 890]],
}

mock_lumi_info

In [None]:
# Use a dedicated name for the mock run number so this prototype does not
# overwrite the per-event `run` array read from the TTree, which earlier
# cells need when they are re-run
mock_run = 40
#lp = 550 # lumi period
lp = 800

# One row per good range of the mock run: [first, last]
blocks = np.array(mock_lumi_info[mock_run])

print(blocks)
print()

# How far lp is above each range's first block / below each range's last block
dl0 = lp - blocks.T[0]
dl1 = blocks.T[1] - lp

print(dl0)
print(dl1)
print()

prod = dl0*dl1
# lp is inside a range when it is neither below the first block nor above
# the last one. The comparison is inclusive so the edges count as inside,
# matching the >= / <= test used for the real data.
test = (dl1>=0) & (dl0>=0)

print(dl0>0)

print(prod)
print(test)

In [None]:
# Mock per-event run numbers (repeats allowed) for the lookup prototypes
runs = np.array([10, 10, 10, 30, 40, 40])

print(runs)

In [None]:
#keys = np.array(mock_lumi_info.keys())
# The run numbers of the mock table, as a dict view (not a list or array)
keys = mock_lumi_info.keys()

print(keys)
print(type(keys))


#blocks = keys.tolist().index(runs)
#print(blocks)

In [None]:
# Turn the dict view into a numpy array so it can be compared element-wise
x = np.array(list(keys))

print(x)
print(type(x))

# Position(s) of run 40 among the keys; np.where returns a tuple of index arrays
idx = np.where(x==40)

print(idx)

In [None]:
# Same lookup with awkward instead of numpy
# NOTE(review): assumes ak.Array accepts a dict view directly - confirm
x = ak.Array(keys)

ak.where(x==30)

In [None]:
#ak.local_index(x, runs)

In [None]:
# Show the documentation of list.index.
# help() is plain Python, so the cell also works outside IPython
help(list.index)

In [None]:
# Show the documentation of ak.index.
# help() is plain Python, so the cell also works outside IPython
help(ak.index)

In [None]:
# Shorter alias for ak.index, used in the next cell
akindex = ak.index

In [None]:
# Inspect the Index attribute of ak.index
akindex.Index

In [None]:
# From ChatGPT

def find_indices(arr1, arr2):
    """Look up each value of arr2 in arr1 and return the positions as a list.

    Values of arr2 that are missing from arr1 are reported as -1. When arr1
    contains a value more than once, its last position is returned.
    """
    # value -> position in arr1 (a repeated value keeps its last position)
    lookup = {}
    for slot, key in enumerate(arr1):
        lookup[key] = slot

    # One result per element of arr2, in order
    return list(map(lambda key: lookup.get(key, -1), arr2))



In [None]:
# Small worked example: arr1 plays the role of the good runs and
# arr2 the per-event runs
arr1 = np.array([10, 40, 50, 60])
arr2 = np.array([40, 40, 40, 60, 50])

# One position in arr1 for every element of arr2
result = find_indices(arr1, arr2)
print(result)