In [1]:
from ROOT import TFile
from root_numpy import root2array, root2rec, tree2array
import numpy as np
import pandas as pd

Welcome to JupyROOT 6.10/09


In [193]:
def get_size(filepath):
    print "skimming " + filepath
    f = TFile.Open(filepath)
    size = f.Get("ZZTree/candTree").GetEntries()
    f.Close()

    return size

In [207]:
# this simulates a single ROOT file that is actually distributed over multiple "physical" ROOT trees. From each file in the list,
# only the portion between start_fraction and end_fraction is considered
class FileCollection:
    def __init__(self, files, start_fraction, end_fraction):
        self.files = files
        self.start_fraction = start_fraction
        self.end_fraction = end_fraction

        # the number of entries in each file, and the local start- and endpositions
        self.lengths = [get_size(file_path) for file_path in files]
        self.minpos = [int(length * start_fraction) for length in self.lengths]
        self.maxpos = [int(length * end_fraction) for length in self.lengths]
        
        self.used_lengths = [end - beginning for (end, beginning) in zip(self.maxpos, self.minpos)]
                
        self.total_length = sum(self.lengths)
        self.used_length = sum(self.used_lengths)
        
        print "collection set up: " + str(len(files)) + " files, " + str(self.total_length) + " entries in total, " + str(self.used_length) + " of which will be used"
        
    def get_length(self):
        return self.used_length
    
    # returns some data from this file collection
    def get_data(self, branches, start_index, end_index):
        # now need to translate between a global index, and a filepath and its corresponding local index
        return 0
        
    def transform_index(self, global_index):
        if global_index >= self.get_length():
            raise IndexError("global index out of range")
        
        # first determine which file in the list is needed to read this index
        max_local_indices = np.array(self.used_lengths) - 1  # all works by 0-indexing
        cum_lengths = np.cumsum(self.used_lengths)
        
        needed_file = 0
        while global_index > cum_lengths[needed_file] - 1:
            needed_file += 1
            
        # then determine the corresponding local index within this file
        local_minpos = np.append(0, cum_lengths)
                
        local_index = global_index - local_minpos[needed_file]
        
        # up to now, all these indices are relative w.r.t. the used slice in each file. the beginning of these slices can be shifted w.r.t. the beginning of the file itself
        local_index += self.minpos[needed_file]
            
        return self.files[needed_file], local_index
    
    def transform_index_range(self, global_start_index, global_end_index):
        if global_start_index >= self.get_length() or global_end_index >= self.get_length():
            raise IndexError("global index out of range")
        if global_end_index < global_start_index:
            raise IndexError("end ought to come after beginning")
            
        local_coords = [self.transform_index(global_index) for global_index in range(global_start_index, global_end_index)]
        needed_files = set([local_coord[0] for local_coord in local_coords])
        
        retval = []
        # now look at each needed file in turn and determine the relevant index range in this local file
        for needed_file in needed_files:
            needed_local_indices = [local_coord[1] for local_coord in local_coords if local_coord[0] == needed_file ]
            needed_min_index = min(needed_local_indices)
            needed_max_index = max(needed_local_indices)
            
            retval += (needed_file, needed_min_index, needed_max_index)
        
        return retval

In [208]:
inpath = "/data_CMS/cms/wind/CJLST_NTuples/"
filename = "/ZZ4lAnalysis.root"
filelist = ["ggH125", "VBFH125", "WplusH125"]

In [209]:
pathlist = [inpath + cur_file + filename for cur_file in filelist]

In [210]:
coll = FileCollection(pathlist, 0.1, 0.2)

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
skimming /data_CMS/cms/wind/CJLST_NTuples/WplusH125/ZZ4lAnalysis.root
collection set up: 3 files, 205204 entries in total, 20520 of which will be used


In [216]:
coll.transform_index_range(1000, 20000)

['/data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root',
 12048,
 22095,
 '/data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root',
 6232,
 12463,
 '/data_CMS/cms/wind/CJLST_NTuples/WplusH125/ZZ4lAnalysis.root',
 3240,
 5959]