In [1]:
import json
import multiprocessing
import os
import time

import energyflow as ef
from energyflow.datasets import mod
import numpy as np

import utils

In [2]:
# Name of the dataset collection to process. It is reused further down as a
# component of the cache subdirectory and as the stem of the JSON data file.
cname = 'CMS2011AJets'
# Collection description looked up in `mod.COLLECTIONS`; the functions below
# iterate over its values and read each value's 'subdatasets' entry.
collection = mod.COLLECTIONS[cname]

In [12]:
def hash_file(arg):
    """Return the hash of a single cached dataset file.

    The inputs are packed into one argument so the function can be handed
    directly to ``Pool.map``.

    Args:
        arg: a pair ``(filename, (cache_dir, subdir, algorithm))``.

    Returns:
        Whatever ``ef.utils.data_utils._hash_file`` returns for the resolved
        file path with the requested ``algorithm``.
    """
    filename, options = arg
    cache_dir, subdir, algorithm = options

    data_utils = ef.utils.data_utils
    # Resolve the on-disk location; the second positional argument and
    # `file_hash` are both passed as None.
    located = data_utils._get_filepath(filename, None, cache_dir,
                                       cache_subdir=subdir, file_hash=None)
    return data_utils._hash_file(located, algorithm=algorithm)

def make_hash_dict(collection, cname, cache_dir='~/.energyflow', compressed=True, algorithm='md5'):
    """Hash every file of every subdataset in a collection.

    Args:
        collection: mapping whose values each carry a 'subdatasets' entry, an
            iterable of ``(name, nfiles, record)`` triples.
        cname: collection name; used as a component of the cache subdirectory
            ``datasets/<cname>/<subdataset name>``.
        cache_dir: cache directory forwarded to ``hash_file``.
        compressed: if true, filenames carry the ``_compressed`` suffix.
        algorithm: hash algorithm name forwarded to ``hash_file``.

    Returns:
        dict mapping each filename (``<name>_<i><suffix>.h5``) to the value
        returned by ``hash_file`` for it.
    """
    suffix = '_compressed' if compressed else ''

    # Results stay local and are exposed only through the return value, so the
    # function no longer writes `hashes`/`name` into the module namespace.
    hashes = {}

    # A single worker pool serves all subdatasets instead of being created and
    # torn down once per subdataset.
    with multiprocessing.Pool() as pool:
        for info in collection.values():
            for name, nfiles, _record in info['subdatasets']:
                start = time.time()
                subdir = os.path.join('datasets', cname, name)

                # Options shared by every file of this subdataset.
                opts = (cache_dir, subdir, algorithm)
                filenames = ['{}_{}{}.h5'.format(name, i, suffix) for i in range(nfiles)]
                tasks = [(filename, opts) for filename in filenames]

                # pool.map preserves input order, so results pair up with filenames.
                hashes.update(zip(filenames, pool.map(hash_file, tasks)))

                print('Done with {} in {:.3f}s'.format(name, time.time() - start))

    return hashes

def get_total_weights(collection, cname, cache_dir='~/.energyflow', compressed=True):
    """Sum the event weights of every subdataset in a collection.

    Args:
        collection: mapping whose values each carry a 'subdatasets' entry, an
            iterable of ``(name, nfiles, record)`` triples.
        cname: collection name; used as a component of the cache subdirectory
            ``datasets/<cname>/<subdataset name>``.
        cache_dir: cache directory forwarded to ``_get_filepath``.
        compressed: if true, filenames carry the ``_compressed`` suffix.

    Returns:
        dict mapping each subdataset name to the sum of ``.weights`` over all
        of its files, always as a plain Python ``float`` (0.0 for a subdataset
        with no files).
    """
    suffix = '_compressed' if compressed else ''
    weights = {}

    for info in collection.values():
        for name, nfiles, _record in info['subdatasets']:
            start = time.time()
            subdir = os.path.join('datasets', cname, name)

            total_weight = 0.
            for i in range(nfiles):
                filename = '{}_{}{}.h5'.format(name, i, suffix)
                filepath = ef.utils.data_utils._get_filepath(filename, None, cache_dir,
                                                             cache_subdir=subdir, file_hash=None)

                # Only `.weights` is read here, so the dataset is opened with
                # store_pfcs/store_gens switched off. A distinct name is used
                # so the subdataset loop variable is not rebound.
                file_dataset = mod.MODDataset(filepath, store_pfcs=False, store_gens=False)

                # Convert to a built-in float so the running total never
                # becomes a NumPy scalar (keeps the result type uniform and
                # JSON-serializable regardless of the weights' dtype).
                total_weight += float(np.sum(file_dataset.weights))

            weights[name] = total_weight

            print('Done with {} in {:.3f}s'.format(name, time.time() - start))

    return weights

In [14]:
# Stage 1: hash every file of every subdataset in the collection.
print('Hashes')
hashes = make_hash_dict(collection=collection, cname=cname)

# Stage 2: total event weight per subdataset. The leading newline leaves a
# blank line between the two progress logs.
print('\nWeights')
weights = get_total_weights(collection=collection, cname=cname)

Hashes
Done with CMS_Jet300_pT375-infGeV in 0.598s
Done with SIM170_Jet300_pT375-infGeV in 0.400s
Done with SIM300_Jet300_pT375-infGeV in 0.789s
Done with SIM470_Jet300_pT375-infGeV in 1.395s
Done with SIM600_Jet300_pT375-infGeV in 1.550s
Done with SIM800_Jet300_pT375-infGeV in 1.556s
Done with SIM1000_Jet300_pT375-infGeV in 0.965s
Done with SIM1400_Jet300_pT375-infGeV in 0.953s
Done with SIM1800_Jet300_pT375-infGeV in 0.851s
Done with GEN170_pT375-infGeV in 0.353s
Done with GEN300_pT375-infGeV in 0.551s
Done with GEN470_pT375-infGeV in 1.255s
Done with GEN600_pT375-infGeV in 1.364s
Done with GEN800_pT375-infGeV in 1.360s
Done with GEN1000_pT375-infGeV in 0.856s
Done with GEN1400_pT375-infGeV in 0.863s
Done with GEN1800_pT375-infGeV in 0.671s

Weights
Done with CMS_Jet300_pT375-infGeV in 1.800s
Done with SIM170_Jet300_pT375-infGeV in 0.021s
Done with SIM300_Jet300_pT375-infGeV in 3.760s
Done with SIM470_Jet300_pT375-infGeV in 11.672s
Done with SIM600_Jet300_pT375-infGeV in 12.623s
Done

In [42]:
#data = {'md5_hashes': hashes, 'total_weights': weights}
#f = np.load('../../../../../EnergyFlow/energyflow/data/ReweightingFactors.npz')
#data.update({k: v.tolist() for k,v in f.items()})
#with open('/home/pkomiske/Dropbox/Research/EnergyFlow/energyflow/data/{}.json'.format(cname), 'w') as f:
#    json.dump(data, f)

In [46]:
# Directory holding the EnergyFlow JSON data files. It defaults to the path
# this notebook was originally run with; set the ENERGYFLOW_DATA_DIR
# environment variable (introduced here, not an EnergyFlow setting) to run the
# cell on a machine where that path does not exist.
data_dir = os.environ.get('ENERGYFLOW_DATA_DIR',
                          '/home/pkomiske/Dropbox/Research/EnergyFlow/energyflow/data')

# Read back the per-collection JSON file and show which entries it contains.
with open(os.path.join(data_dir, '{}.json'.format(cname)), 'r') as f:
    data = json.load(f)
data.keys()

dict_keys(['md5_hashes', 'total_weights', 'kfactor_x', 'kfactor_y', 'npv_hist_ratios', 'residual_factor'])