Utility notebook to skim candidates

# Import

In [1]:
import sys, os, pickle, time
from glob import glob
sys.path.append('../lib')
sys.path.append('../analysis')
from multiprocessing import Pool

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import ROOT as rt
rt.gErrorIgnoreLevel = rt.kError
rt.RooMsgService.instance().setGlobalKillBelow(rt.RooFit.ERROR)
import root_numpy as rtnp

from analysis_utilities import drawOnCMSCanvas, getEff, DSetLoader
from histo_utilities import create_TH1D, create_TH2D, std_color_list, SetMaxToMaxHist

from progressBar import ProgressBar
from B02DstMu_selection import candidate_selection, candidateSelection_stringList, candidateSelection_nameList

import CMS_lumi, tdrstyle
tdrstyle.setTDRStyle()
CMS_lumi.writeExtraText = 1


CMS_lumi.extraText = "     Simulation Internal"

donotdelete = []

Welcome to JupyROOT 6.12/07


# Create histograms file

In [3]:
file_loc = {}

In [4]:
MCdataSet = {
# 'mu_0': DSetLoader('B0_MuNuDmst_PU0', candDir='ntuples_probeB2DstMu'),
'p_mu_c0': DSetLoader('p_B0_MuNuDst_PUc0', candDir='ntuples_probeB2DstMu'),
'p_tau_c0': DSetLoader('p_B0_TauNuDst_PUc0', candDir='ntuples_probeB2DstMu')
}
for n, d in MCdataSet.iteritems():
    file_loc[n] = os.path.join(d.ntuples_dir, 'out_CAND_*.root')

In [5]:
RDloc = '../data/cmsRD/ParkingBPH*/'
file_loc['dataprobeB2DstMu'] = RDloc+'*2018*probeB2DstMu_200315_CAND.root'

In [6]:
def makeSelection(inputs):
    tag, filepath, leafs_names, idxInt, skipCut, serial = inputs
    N_accepted_cand = []
    N_accepted_tot = 0
    
    tree = rt.TChain('outA/Tevts')
    lastIdxDisc = -1
    for fn in glob(filepath):
        tree.Add(fn)
        if tree.GetEntries() + lastIdxDisc < idxInt[0]:
            lastIdxDisc += tree.GetEntries()
            tree = rt.TChain('outA/Tevts')
        elif tree.GetEntries() + lastIdxDisc > idxInt[1]:
            break
    
    nDiscEvts = lastIdxDisc + 1
            
        
    if serial:
        pb = ProgressBar(maxEntry=idxInt[1]+1)
    else:
        perc = int((idxInt[1]-idxInt[0])*0.3)
    
    output = np.zeros((idxInt[1]-idxInt[0]+1, len(leafs_names)))
    for i_ev, ev in enumerate(tree):
        i_ev += nDiscEvts
        if i_ev < idxInt[0]:
            continue
        if i_ev > idxInt[1]:
            break
            
        if serial:
            pb.show(i_ev-idxInt[0])
        elif (i_ev-idxInt[0]) % perc == 0:
            print tag, ': {:.0f}%'.format(100*(i_ev+1-idxInt[0])/(idxInt[1]-idxInt[0]))

        for j in range(ev.pval_piK.size()):
            ev_output = []
            if not skipCut == 'all':
                if not candidate_selection(j, ev, skipCut):
                    continue
            
            idx_st = 0
            for jjj in range(j):
                idx_st += int(ev.nTksAdd[jjj])

            N_lowMassAddTks = 0
            idx_stop = int(idx_st + ev.nTksAdd[j])
            for jj in range(idx_st, idx_stop):
                if ev.tksAdd_massVis[jj] < 5.28 and ev.tksAdd_cos_PV[jj]>0.95:
                    N_lowMassAddTks += 1
            

            aux = (ev.q2_D0pismu[j], ev.Est_mu_D0pismu[j], ev.M2_miss_D0pismu[j],
                   ev.mu_pt[j], ev.mu_eta[j], ev.mu_phi[j], 
                   ev.mu_sigdxy_PV[j], ev.mu_dca_vtxDst[j],
                   ev.B_D0pismu_pt[j], ev.B_D0pismu_eta[j], ev.B_D0pismu_phi[j],
                   ev.Dst_refitD0pismu_pt[j], ev.Dst_refitD0pismu_eta[j], ev.Dst_refitD0pismu_phi[j],
                   ev.D0_refitD0pismu_pt[j], ev.D0_refitD0pismu_eta[j], ev.D0_refitD0pismu_phi[j],
                   ev.pi_pt[j], ev.pi_eta[j], ev.pi_phi[j], ev.sigdxy_pi_PV[j],
                   ev.K_pt[j], ev.K_eta[j], ev.K_phi[j], ev.sigdxy_K_PV[j],
                   ev.pval_piK[j], ev.sigdxy_vtxD0_PV[j],
                   ev.pis_pt[j], ev.pis_eta[j], ev.pis_phi[j], ev.sigdxy_pis_PV[j],
                   ev.pval_D0pis[j],
                   ev.mass_piK[j], ev.mass_D0pis[j], ev.mass_D0pismu[j],
                   ev.pval_D0pismu[j], ev.cos_D0pismu_PV[j], ev.cosT_D0pismu_PV[j],
                   N_lowMassAddTks,
                   ev.N_vertexes
                  )
            if not 'data' in n:
                aux += (ev.MC_q2, ev.MC_Est_mu, ev.MC_M2_miss,
                        ev.MC_B_pt, ev.MC_B_eta, ev.MC_B_phi,
                        ev.MC_Dst_pt, ev.MC_Dst_eta, ev.MC_Dst_phi,
                        ev.MC_mu_pt, ev.MC_mu_eta, ev.MC_mu_phi, ev.MC_mu_IP,
                        ev.MC_idxCand == j
                       )
            if 'mu' in n or 'tau' in n:
                aux += (ev.wh_CLNCentral,
                        ev.wh_CLNR0Down, ev.wh_CLNR0Up,
                        ev.wh_CLNR1Down, ev.wh_CLNR1Up,
                        ev.wh_CLNR2Down, ev.wh_CLNR2Up,
                        ev.wh_CLNRhoSqDown, ev.wh_CLNRhoSqUp,
                       )
        
            ev_output.append(aux)
        
        N_acc = len(ev_output)
        idx = 0
        if N_acc > 1:
            if 'data' in n:
                idx = np.random.randint(len(ev_output))
            else:
                #Get matched can preferably
                varIdx = leafs_names.index('MC_idxMatch')
                goodIdx = np.nonzero([o[varIdx] for o in ev_output])[0]
                if goodIdx.shape[0] > 0:
                    auxIdx = np.random.randint(goodIdx.shape[0])
                    idx = goodIdx[auxIdx]
                else:
                    idx = np.random.randint(len(ev_output))

        if N_acc > 0:
            output[N_accepted_tot] = ev_output[idx]
            N_accepted_tot += 1
            N_accepted_cand.append(N_acc)

    output = output[:N_accepted_tot]
    if not serial:
        print tag, ': done'
    return [output, N_accepted_cand]

In [7]:
def create_dSet(n, filepath, skipCut=None, maxEntries=1e15):  
    print n
    if 'data' in n:
        fskimmed_name = '../data/cmsRD/skimmed/probeB2DstMu_' + filepath[-16:-10]
        if not skipCut is None:
            fskimmed_name += '_skip'+str(skipCut)
        fskimmed_name += '.root'
    else:
        d = os.path.dirname(filepath) + '/skimmed/'
        if not os.path.isdir(d):
            os.makedirs(d)
        fskimmed_name = d + 'selTree'
        if not skipCut is None:
            fskimmed_name += '_skip'+str(skipCut)
        fskimmed_name += '.root'
    logfile = fskimmed_name.replace('.root', '.log')
    if os.path.isfile(fskimmed_name) and not n in recreate:
        print 'Already present'
    else:
        tree = rt.TChain('outA/Tevts')
        for fn in glob(filepath):
            tree.Add(fn)
        N_cand_in = min(maxEntries, tree.GetEntries())
        print n, ': Total number of candidate events =', N_cand_in
            
        leafs_names = ['q2', 'Est_mu', 'M2_miss',
                       'mu_pt', 'mu_eta', 'mu_phi', 
                       'mu_sigdxy', 'mu_dca_vtxDst',
                       'B_pt', 'B_eta', 'B_phi',
                       'Dst_pt', 'Dst_eta', 'Dst_phi',
                       'D0_pt', 'D0_eta', 'D0_phi',
                       'pi_pt', 'pi_eta', 'pi_phi', 'pi_IP',
                       'K_pt', 'K_eta', 'K_phi', 'K_IP',
                       'pval_piK', 'sigdxy_vtxD0_PV',
                       'pis_pt', 'pis_eta', 'pis_phi', 'pis_IP',
                       'pval_D0pis',
                       'mass_piK', 'mass_D0pis', 'mass_D0pismu',
                       'pval_D0pismu', 'cos_D0pismu_PV', 'cosT_D0pismu_PV',
                       'N_lowMassAddTks',
                       'N_vtx'
                      ]
        if not 'data' in n:
            leafs_names += ['MC_q2', 'MC_Est_mu', 'MC_M2_miss',
                            'MC_B_pt', 'MC_B_eta', 'MC_B_phi',
                            'MC_Dst_pt', 'MC_Dst_eta', 'MC_Dst_phi',
                            'MC_mu_pt', 'MC_mu_eta', 'MC_mu_phi', 'MC_mu_IP',
                            'MC_idxMatch'
                           ]
        if 'mu' in n or 'tau' in n:
            leafs_names += ['wh_CLNCentral', 
                            'wh_CLNR0Down', 'wh_CLNR0Up', 
                            'wh_CLNR1Down', 'wh_CLNR1Up', 
                            'wh_CLNR2Down', 'wh_CLNR2Up', 
                            'wh_CLNRhoSqDown', 'wh_CLNRhoSqUp']
            
        if N_cand_in < 40000:
            output, N_accepted_cand = makeSelection(['', filepath, leafs_names, 
                                                     [0, N_cand_in-1], skipCut, True])
        else:
            pdiv = list(np.arange(0, N_cand_in, 20000))
            if not pdiv[-1] == N_cand_in: 
                pdiv.append(N_cand_in)
            print 'Will be divided into ' + str(len(pdiv)-1) + ' jobs'
            inputs = []
            for i in range(1, len(pdiv)):
                corr = 0
                if i == 1:
                    corr = -1
                inputs.append([str(i), filepath, leafs_names, [pdiv[i-1]+1+corr, pdiv[i]], skipCut, False])
            print ' '
            
            start = time.time()
            p = Pool(min(25,len(inputs)))
            outputs = p.map(makeSelection, inputs)
            output = np.concatenate(tuple([o[0] for o in outputs]))
            N_accepted_cand = []
            for o in outputs: N_accepted_cand += o[1]
            print 'Total time: {:.1f} min'.format((time.time()-start)/60.)
                
        
        dset = pd.DataFrame(output, columns=leafs_names)
        if not os.path.isdir(os.path.dirname(fskimmed_name)):
            os.makedirs(os.path.dirname(fskimmed_name))
        rtnp.array2root(dset.to_records(), fskimmed_name, treename='Tevts', mode='RECREATE')
        
        with open(logfile, 'w') as f:
            ln = 'Number of candidates per events\n{'
            ln += ', '.join(['{}:{}'.format(i, N_accepted_cand.count(i)) for i in range(1, np.max(N_accepted_cand)+1)])
            ln += '}\n'
            f.write(ln)
            f.write('N_analyzed: '+str(N_cand_in)+'\n')
            f.write('N_accepted: '+str(dset.shape[0])+'\n')
            e = getEff(dset.shape[0], N_cand_in)
            f.write('Eff: {:.3f} +/- {:.3f} %'.format(1e2*e[0], 1e2*e[1])+'\n')
         
    os.system('echo '+logfile+';cat '+logfile + ';echo ')

In [8]:
recreate = []#file_loc.keys()

# for n, fp in file_loc.iteritems():
#     create_dSet(n, fp, 'all')

for n, fp in file_loc.iteritems():
        create_dSet(n, fp)

dataprobeB2DstMu
dataprobeB2DstMu : Total number of candidate events = 60667
Will be divided into 4 jobs
 
1 : 0%
2 : 0%
3 : 0%
1 : 30%
4 : 0%
4 : 30%
4 : 59%
4 : 89%
4 : done
2 : 30%
3 : 30%
1 : 60%
2 : 59%
3 : 59%
1 : 90%
1 : done
2 : 89%
3 : 89%
2 : done
3 : done
Total time: 0.2 min
p_tau_c0
Already present
p_mu_c0
Already present
../data/cmsRD/skimmed/probeB2DstMu_200315.log
Number of candidates per events
{1:18161}
N_analyzed: 60667
N_accepted: 18161
Eff: 29.936 +/- 0.186 %

/storage/user/ocerri/BPhysics/data/cmsMC_private/BP_Probe_B0_TauNuDmst_Tag-B_MuNuDst_Hardbbbar_evtgen_ISGW2_PUc0_10-2-3/ntuples_probeB2DstMu/skimmed/selTree.log
Number of candidates per events
{1:15037}
N_analyzed: 25224
N_accepted: 15037
Eff: 59.614 +/- 0.309 %

/storage/user/ocerri/BPhysics/data/cmsMC_private/BP_Probe_B0_MuNuDmst_Tag-B_MuNuDst_Hardbbbar_evtgen_ISGW2_PUc0_10-2-3/ntuples_probeB2DstMu/skimmed/selTree.log
Number of candidates per events
{1:34488}
N_analyzed: 54235
N_accepted: 34488
Eff: 63.590 +

# Analyze selection efficiencies

In [9]:
def getTree(n, filepath, skipCut=None, maxEntries=1e15):  
    print n, skipCut
    if 'data' in n:
        pass
    else:
        d = os.path.dirname(filepath) + '/skimmed/'
        fskimmed_name = d + 'selTree'
        if not skipCut is None:
            fskimmed_name += '_skip'+str(skipCut)
        fskimmed_name += '.root'
    if os.path.isfile(fskimmed_name):
        t = rt.TChain('Tevts')
        t.Add(fskimmed_name)
        return t
    else: return None

## Get efficiency per cut

In [10]:
T = {}
for n, fp in file_loc.iteritems():
    aux = getTree(n, fp, 'all', maxEntries=1e9)
    if not aux is None:
        T[n] = aux
    else:
        raise

dataprobeB2DstMu all


UnboundLocalError: local variable 'fskimmed_name' referenced before assignment

In [None]:
allCuts = ' && '.join(candidateSelection_stringList)
eff = {}
for n, t in T.iteritems():
    print n
    eff[n] = np.zeros((len(candidateSelection_stringList)+1,2))
    Ntot = float(t.GetEntries())
    Nsel = t.GetEntries(allCuts)
    eff[n][0] = getEff(Nsel, Ntot)
    pb = ProgressBar(maxEntry=len(candidateSelection_stringList))
    for ic, c in enumerate(candidateSelection_stringList):
        pb.show(ic)
        eff[n][ic+1] = getEff(t.GetEntries(c), Ntot)

for n in T.keys():
    y = np.array(eff[n])
    x = np.arange(y.shape[0])
    plt.errorbar(x, y[:, 0], y[:,1], lw=0, elinewidth=5, label=n)

plt.rcParams.update({'font.size': 20})
plt.xlabel('Cut')
plt.ylabel('Efficiency')
plt.legend(loc='best', numpoints=1)
plt.xticks(range(len(candidateSelection_nameList)+1), ['all']+candidateSelection_nameList, rotation=80)
plt.ylim(0.6,1.05)
plt.xlim(-1, len(candidateSelection_nameList)+1)
plt.grid()
plt.gcf().set_size_inches(10, 6)