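Notebook to collect the generation-level information of the MC samples: cross section, generator-filter efficiency and ntuplizer (trigger and candidate) efficiencies.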

### Import

In [1]:
import sys, os, re, yaml, pickle
import commands
from glob import glob
sys.path.append('../lib')

import time, datetime

In [2]:
import signal

class TimeoutError(Exception):
    """Raised by `timeout` when the guarded block runs longer than allowed."""
    pass

class timeout:
    """Context manager that interrupts its body after `seconds` seconds.

    It arms SIGALRM on entry and raises `TimeoutError(error_message)` from the
    signal handler if the alarm fires. On exit the alarm is cancelled and the
    SIGALRM handler that was installed before entering is put back, so the
    guard leaves no handler behind.

    Parameters:
        seconds: delay passed to `signal.alarm`.
        error_message: message carried by the raised `TimeoutError`.
    """
    def __init__(self, seconds=1, error_message='Timeout'):
        self.seconds = seconds
        self.error_message = error_message
        # Handler found on entry; restored in __exit__
        self._previous_handler = None
    def handle_timeout(self, signum, frame):
        raise TimeoutError(self.error_message)
    def __enter__(self):
        # signal.signal returns the handler it replaces
        self._previous_handler = signal.signal(signal.SIGALRM, self.handle_timeout)
        signal.alarm(self.seconds)
    def __exit__(self, type, value, traceback):
        # Cancel the pending alarm first so it cannot fire while restoring
        signal.alarm(0)
        # None means the previous handler was not installed from Python and
        # cannot be passed back to signal.signal
        if self._previous_handler is not None:
            signal.signal(signal.SIGALRM, self._previous_handler)

# with timeout(seconds=1):
#     try:
#         time.sleep(2)
#     except TimeoutError:
#         print 'Got it'

In [3]:
import numpy as np
from scipy.stats import mode
import matplotlib.pyplot as plt
from prettytable import PrettyTable
import humanfriendly
from progressBar import ProgressBar

In [4]:
import uproot as ur
import ROOT as rt
# Set ROOT's global ignore level to kError (presumably hides info/warning printouts -- confirm)
rt.gErrorIgnoreLevel = rt.kError
# Apply the ERROR threshold to the RooFit message service as well
rt.RooMsgService.instance().setGlobalKillBelow(rt.RooFit.ERROR)
import root_numpy as rtnp

Welcome to JupyROOT 6.12/07


In [5]:
# load FWLite C++ libraries
rt.gSystem.Load("libFWCoreFWLite.so");
rt.gSystem.Load("libDataFormatsFWLite.so");
# NOTE(review): presumably has to be called before the FWLite python wrappers are used -- confirm
rt.FWLiteEnabler.enable()

# load FWlite python libraries
# Lumis: used below to iterate over the luminosity blocks of a file
# Handle: typed accessor used below to fetch products from each luminosity block
from DataFormats.FWLite import Lumis
from DataFormats.FWLite import Handle

# Inputs

In [6]:
# Base directory used to rebuild the MINIAOD location of a sample part when the
# path listed in the samples file matches nothing on disk
site_loc_conf = '/mnt/hadoop/store/user/ocerri'

In [7]:
# YAML catalogue of the samples; its 'samples' entry maps a sample name to the
# sample description ('parts' and 'dataset' are the fields read in this notebook)
sampleFile = '/storage/user/ocerri/work/CMSSW_10_2_3/src/ntuplizer/BPH_RDntuplizer/production/samples.yml'
# NOTE(review): yaml.load can build arbitrary Python objects. It is kept here
# because the catalogue is a locally maintained file; switch to yaml.safe_load
# if the file can ever come from an untrusted source.
# The `with` block closes the file as soon as it has been parsed.
with open(sampleFile) as sampleStream:
    samples = yaml.load(sampleStream)['samples']

In [8]:
class Bauble(object):
    """Plain container for the information collected about one MC sample.

    Parameters:
        in_sample: name of the sample; must be a key of the global `samples`
            catalogue.
        candLoc: base directory under which the ntuples of the samples live.
        candDir: name of the ntuples sub-directory inside the sample directory.

    Raises:
        ValueError: if `in_sample` is not listed in `samples`.

    Further attributes are attached to the instance by the cells that use it.
    """
    def __init__(self, in_sample,
                 candLoc='/storage/user/ocerri/BPhysics/data/cmsMC_private/',
                 candDir='ntuples_B2DstMu'):
        if in_sample not in samples:
            # A bare `raise` here has no active exception to re-raise and only
            # produces an unrelated error; report the actual problem instead.
            raise ValueError('Unknown sample "{}": not listed in the samples catalogue'.format(in_sample))
        self.sample = in_sample
        self.candLoc = candLoc
        self.candDir = candDir

In [9]:
# Label of the set of processes handled in this run
setName = 'SignalRegion'

# One Bauble per process, keyed by a short process tag
outDic = {}
for procTag, sampleName in [('mu', 'B0_MuNuDmst_PU20'),
                            ('tau', 'B0_TauNuDmst_PU20'),
                            ('Hc', 'B0_DmstHc_PU20'),
                            ('Dstst', 'Bp_MuNuDstst_PU20')]:
    outDic[procTag] = Bauble(sampleName)

In [10]:
# setName = 'SidePT'
# outDic = {}

# outDic['JPsiKst'] = Bauble('B0_JpsiKst_PU0')

Gather the files

In [11]:
# For every sample: collect the directories holding its MINIAOD files and
# locate the directory of its ntuples.
for d in outDic.values():
    d.MINIAOD_dirs = []
    for part in samples[d.sample]['parts']:
        aux = glob(part)
        if len(aux) > 0:
            # The part exists as listed: keep its directory.
            # It must be wrapped in a list, because `list += str` would
            # append one entry per character of the path.
            aux = [os.path.dirname(part)]
        else:
            # Otherwise look for it under site_loc_conf.
            # NOTE(review): the [:-38] slice presumably strips a fixed-length
            # suffix of the part name -- confirm against samples.yml.
            aux = glob(site_loc_conf + part[:-38].replace('ocerri-','') + '/*/*')
        d.MINIAOD_dirs += aux

    d.full_name = samples[d.sample]['dataset']
    # First match is used; raises IndexError if the ntuples directory does not exist
    d.ntuples_dir = glob(os.path.join(d.candLoc, d.full_name, d.candDir))[0]

# Efficiency

In [12]:
def getEff(k,N):
    """Return the efficiency k/N and its binomial uncertainty.

    Parameters:
        k: number of selected entries.
        N: total number of entries.

    Returns:
        A list [eff, eff_err] with eff = k/N and eff_err = sqrt(eff*(1-eff)/N).
    """
    eff = k/float(N)
    variance = eff*(1-eff)/N
    eff_err = np.sqrt(variance)
    return [eff, eff_err]

In [13]:
# Products read from each luminosity block: name -> [Handle, label],
# where the label is the tuple passed to lumi.getByLabel together with the Handle
handle = {}
for prodName, cppType, inputTag in [
        ('genFilter', 'GenFilterInfo', ('genFilterEfficiencyProducer', '', 'SIM')),
        ('genProduct', 'GenLumiInfoProduct', ('generator', '', 'SIM')),
    ]:
    handle[prodName] = [Handle(cppType), inputTag]

In [None]:
N_max = 5000

for name, d in outDic.iteritems():
    print '\n\n--> ' + name
    N_gen = 0
    N_cuts = 0
    xsec = []
    xsec_err = []
    
    fileList = []
    for directory in d.MINIAOD_dirs:
        fileList += glob(directory + '/out_MINIAODSIM_*.root')
    if N_max > 0 and N_max < len(fileList):
        fileList = np.random.choice(fileList, N_max)
    
    print 'Analizing', len(fileList), 'logs'
    pb = ProgressBar(maxEntry=len(fileList))
    skippedFiles = []
    for i_j, fileName in enumerate(fileList):
        pb.show(i_j)
        
        with timeout(seconds=3):
            try:
                for lumi in Lumis(fileName):
                    prods = {}
                    for k,v in handle.iteritems():
                        lumi.getByLabel(v[1], v[0])
                        prods[k] = v[0].product()
                    N_cuts += prods['genFilter'].numEventsPassed()
                    N_gen += prods['genFilter'].numEventsTotal()
                    xs = prods['genProduct'].getProcessInfos()[0].lheXSec()
                    xsec.append(xs.value())
                    xsec_err.append(xs.error())
            except TimeoutError:
                skippedFiles.append(fileName)
    print 'Skipped {} files for timeout'.format(len(skippedFiles))
    
    xsec = np.array(xsec)
    xsec_err = np.array(xsec_err)
    s2 = np.square(xsec_err)
    num = np.sum(xsec/s2)
    den = np.sum(1./s2)
    xsec = 1e3*num/den
    xsec_err = 1e3*np.sqrt(1/den)
    print 'Xsec: {:1.4e} +/- {:1.4e} fb ({:1.1e})'.format(xsec, xsec_err, xsec_err/xsec)
    d.xsec = [xsec, xsec_err]
    
    e, de = getEff(N_cuts, N_gen)
    print 'eff generator: {:1.3e} +/- {:1.3e} ({:1.1e})'.format(e,de, de/e)
    d.effGEN = [e, de]
    d.nTotGen = N_gen

    if not os.path.isdir(d.ntuples_dir):
        continue
    cand_out_list = glob(os.path.join(d.ntuples_dir,'out/job*.out'))
    N_analyzed = 0
    N_trg = 0
    N_cand = 0
    print 'Analyzing {} ntuplizer jobs'.format(len(cand_out_list))
    pb = ProgressBar(maxEntry=len(cand_out_list))
    for ic, cand_out in enumerate(cand_out_list):
        pb.show(ic)
        step5_log_lines = open(cand_out).readlines()
        eff_ln = []
        for line in reversed(step5_log_lines):
            if 'efficiency:' in line:
                eff_ln.append(line)

        aux = re.search('[0-9]+/[0-9]+', eff_ln[1]).group(0)
        aux = aux.split('/')
        N_analyzed += int(aux[1])
        N_trg += int(aux[0])
        
        aux = re.search(': [0-9]+/', eff_ln[0]).group(0)
        N_cand += int(aux[2:-1])
    
    e, de = getEff(N_trg, N_analyzed)
    d.effCAND_trg = e, de
    print 'eff candidates (trigger): {:1.3e} +/- {:1.3e} ({:1.1e})'.format(e,de, de/e)
    
    e, de = getEff(N_cand, N_trg)
    d.effCAND_cand = e, de
    print 'eff candidates (cand): {:1.3e} +/- {:1.3e} ({:1.1e})'.format(e,de, de/e)
    
    e, de = getEff(N_cand, N_analyzed)
    d.effCAND = e, de
    print 'eff candidates: {:1.3e} +/- {:1.3e} ({:1.1e})'.format(e,de, de/e)



--> mu
Analizing 5000 logs
[####################]  100% - Tot. time: 1735.1 s
Skipped 50 files for timeout
Xsec: 1.4799e+11 +/- 2.5726e+06 fb (1.7e-05)
eff generator: 7.005e-03 +/- 3.000e-06 (4.3e-04)
Analyzing 784 ntuplizer jobs
[####################]  100% - Tot. time: 12.3 s
eff candidates (trigger): 2.406e-01 +/- 6.819e-05 (2.8e-04)
eff candidates (cand): 9.106e-02 +/- 9.357e-05 (1.0e-03)
eff candidates: 2.191e-02 +/- 2.335e-05 (1.1e-03)


--> tau
Analizing 5000 logs
[##------------------]  13% - ETA: 14.6 min 

In [None]:
# Write the numbers collected for each process to <candLoc>/<full_name>/effMC.yaml
for n, d in outDic.iteritems():
    # Cast to builtin int/float so that yaml writes plain numbers
    dump_dic = {'nTotGen': int(d.nTotGen)}
    for k in ['xsec', 'effGEN', 'effCAND', 'effCAND_trg', 'effCAND_cand']:
        # Each attribute holds a (value, uncertainty) pair
        aux = getattr(d, k)
        dump_dic[k] = [float(aux[0]), float(aux[1])]
    outdir = os.path.join(d.candLoc, d.full_name)
    if not os.path.isdir(outdir):
        os.makedirs(outdir)
    with open(os.path.join(outdir,'effMC.yaml'), 'w') as dumpF:
        # Write through the handle opened above (`f` is not defined in this notebook)
        dumpF.write(yaml.dump(dump_dic, default_flow_style=False, default_style=''))