In [2]:
%load_ext autoreload
%autoreload 2

import os
import time
import glob
import re
import pandas as pd
from functools import reduce
from klepto.archives import dir_archive

import numpy as np
from tqdm.auto import tqdm
import coffea.processor as processor
from coffea.processor.accumulator import AccumulatorABC
from coffea.analysis_objects import JaggedCandidateArray
from coffea.btag_tools import BTagScaleFactor
from coffea import hist
import pandas as pd
import uproot_methods
import uproot
import awkward
import copy

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

from Tools.config_helpers import *
from Tools.helpers import mergeArray, mt, get_scheduler_address

from Tools.objects import Collections
from Tools.cutflow import Cutflow

# This just tells matplotlib not to open any
# interactive windows.
matplotlib.use('Agg')

In [3]:
def pad_and_flatten(val): 
    try:
        return val.pad(1, clip=True).fillna(0.).flatten()#.reshape(-1, 1)
    except AttributeError:
        return val.flatten()

#os.environ['KERAS_BACKEND'] = 'theano'
#from keras.models import load_model

In [4]:
import sys
sys.setrecursionlimit(10000)
print(sys.getrecursionlimit())

10000


In [5]:
from Tools.WH_objects import *
from Tools.WH_scalefactors import LeptonSF
from Tools.WH_deepAK8 import getWTagSF

In [6]:
#global variables... to avoid making bugs!!!
processesList = ['DYJets', 'TTJets', 'ttW', 'ttZ', 'Data']
linesList= ['triggers', 'filters',  'dielectron = 1', 'opposite sign','70 < dielectron mass < 110',
           'lead electron pt > 25', 'sublead electron pt > 15',
            'MET>250', 'N_fatjet>1', 
           'min_dphiFatJetMet4', 'dphiDiFatJet', 'njet veto']
#            'htag > 0', 'minmth>200', '90 < h sd mass > 150',
plotDir = '/home/users/mbryson/public_html/dump/WH/WH_2017/4_1_21/'
year = 2017

In [7]:
class analysisProcessor(processor.ProcessorABC):
    """Processor used for running the analysis"""
    def __init__(self):
        
        ## load b-tag SFs
        #self.btag_sf = BTagScaleFactor(os.path.expandvars("$TWHOME/data/DeepCSV_102XSF_V1.btag.csv.gz", "reshape")

        ## load the NN
        #self.model = load_model('../ML/data/training.h5')
        #self.stds  = pd.read_json('../ML/data/stds.json').squeeze()
        #self.means = pd.read_json('../ML/data/means.json').squeeze()
        
        # we can use a large number of bins and rebin later
        dataset_axis        = hist.Cat("dataset",   "Primary dataset")
        pt_axis             = hist.Bin("pt",        r"$p_{T}$ (GeV)", 1000, 0, 1000)
        p_axis              = hist.Bin("p",         r"$p$ (GeV)", 1000, 0, 2500)
        ht_axis             = hist.Bin("ht",        r"$H_{T}$ (GeV)", 500, 0, 5000)
        mass_axis           = hist.Bin("mass",      r"Mass (GeV)", 1000, 0, 2000)
        eta_axis            = hist.Bin("eta",       r"$\eta$", 60, -5.5, 5.5)
        phi_axis            = hist.Bin("phi",       r"$\phi$", 40, -4, 4)
        delta_axis          = hist.Bin("delta",     r"$\delta$", 100,0,10 )
        multiplicity_axis   = hist.Bin("multiplicity",         r"N", 20, -0.5, 19.5)
        norm_axis           = hist.Bin("norm",         r"N", 25, 0, 1)

        self._accumulator = processor.dict_accumulator({
            "met":                     hist.Hist("Counts", dataset_axis, pt_axis),
            "met-electron":                hist.Hist("Counts", dataset_axis, pt_axis),
            "met_SR":                  hist.Hist("Counts", dataset_axis, pt_axis),
            "nW":                      hist.Hist("Counts", dataset_axis, multiplicity_axis),
            "nH":                      hist.Hist("Counts", dataset_axis, multiplicity_axis),
            "leade_pt":               hist.Hist("Counts", dataset_axis, pt_axis),
            "leade_eta":              hist.Hist("Counts", dataset_axis, eta_axis),
            "leade_phi":              hist.Hist("Counts", dataset_axis, phi_axis),
            "subleade_pt":            hist.Hist("Counts", dataset_axis, pt_axis),
            "subleade_eta":           hist.Hist("Counts", dataset_axis, eta_axis),
            "subleade_phi":           hist.Hist("Counts", dataset_axis, phi_axis),
            "ee_deltaPhi":             hist.Hist("Counts", dataset_axis, delta_axis),
            "ee_deltaR":               hist.Hist("Counts", dataset_axis, delta_axis),
            "ee_mass":                 hist.Hist("Counts", dataset_axis, mass_axis),
            "ee_pt":                   hist.Hist("Counts", dataset_axis, pt_axis),
            "ee_eta":                  hist.Hist("Counts", dataset_axis, eta_axis),
            "ee_phi":                  hist.Hist("Counts", dataset_axis, phi_axis),
            "N_AK4" :                  hist.Hist("Counts", dataset_axis, multiplicity_axis),
            "N_AK8" :                  hist.Hist("Counts", dataset_axis, multiplicity_axis),
            "leadAK4_pt" :             hist.Hist("Counts", dataset_axis, pt_axis),            
            "leadAK4_eta" :            hist.Hist("Counts", dataset_axis, eta_axis),
            "leadAK4_phi" :            hist.Hist("Counts", dataset_axis, phi_axis),
            "subleadAK4_pt" :          hist.Hist("Counts", dataset_axis, pt_axis),            
            "subleadAK4_eta" :         hist.Hist("Counts", dataset_axis, eta_axis),
            "subleadAK4_phi" :         hist.Hist("Counts", dataset_axis, phi_axis),
            "leadAK8_pt" :             hist.Hist("Counts", dataset_axis, pt_axis),            
            "leadAK8_eta" :            hist.Hist("Counts", dataset_axis, eta_axis),            
            "leadAK8_phi" :            hist.Hist("Counts", dataset_axis, phi_axis), 
            "subleadAK8_pt" :          hist.Hist("Counts", dataset_axis, pt_axis),            
            "subleadAK8_eta" :         hist.Hist("Counts", dataset_axis, eta_axis),            
            "subleadAK8_phi" :         hist.Hist("Counts", dataset_axis, phi_axis),
            "min_dphiFatJetMet4":      hist.Hist("Counts", dataset_axis, delta_axis),
            "dphiDiFatJet":            hist.Hist("Counts", dataset_axis, delta_axis),
            "min_drFatJetLeadElectron":    hist.Hist("Counts", dataset_axis, delta_axis),
            "min_drFatJetSubLeadElectron": hist.Hist("Counts", dataset_axis, delta_axis),
             
            'TTJets':           processor.defaultdict_accumulator(int),
            'DYJets':           processor.defaultdict_accumulator(int),
            'ttW':              processor.defaultdict_accumulator(int),
            'ttZ':              processor.defaultdict_accumulator(int),
            'Data':             processor.defaultdict_accumulator(int),
            'totalEvents':      processor.defaultdict_accumulator(int),
        })

    @property
    def accumulator(self):
        return self._accumulator

    def process(self, df):
        """
        Processing function. This is where the actual analysis happens.
        """
        output = self.accumulator.identity()
        dataset = df["dataset"]
        cfg = loadConfig()
        
        ############## MET ##############
        met = JaggedCandidateArray.candidatesfromcounts(
            (df['MET_pt']>=0)*1,
            pt = df["MET_pt"],
            eta = df["MET_phi"]*0,
            phi = df["MET_phi"],
            mass = df["MET_phi"]*0
        )
        
        
        ############## LOAD OBJECTS ############## 
        
        muon     = getMuons(df, WP='veto')
        electron = getElectrons(df, WP='veto')
        tau      = getTaus(df)
        isotrack = getIsoTracks(df)
        fatjet   = getFatJets(df)
        jet      = getJets(df)
        
        triggers = getDielectronTriggers(df, year=year, dataset=dataset)
        filters  = getFilters(df, year=year, dataset=dataset)
        
        sf = LeptonSF(year=year)
        leptonSF = sf.get(electron, muon)

        ############## LEPTONS ############## 
    
        electron = electron[electron.pt.argsort(ascending=False)]
        
        leade = electron[:,:1]
        subleade = electron[:,1:2]
        ee = electron.choose(2)
        newmet = ee.cross(met)
        
        ee_deltaPhi = np.arccos(np.cos(ee.i0.phi-ee.i1.phi))
        ee_deltaR = ee.i0.p4.delta_r(ee.i1.p4)
        
        muon = muon[muon.pt.argsort(ascending=False)]
                
        leadm = muon[:,:1]
        subleadm = muon[:,1:2]
        mm = muon.choose(2)
        #newmet = mm.cross(met)
        
        mm_deltaPhi = np.arccos(np.cos(mm.i0.phi-mm.i1.phi))
        mm_deltaR = mm.i0.p4.delta_r(mm.i1.p4)

        

        
        ############## FATJETS ##############
        
        fatjet       = fatjet[~fatjet.match(muon, deltaRCut=0.8)] # remove jets that overlap with muons
        fatjet       = fatjet[~fatjet.match(electron, deltaRCut=0.8)]
        
        high_pt_fatjet = fatjet[fatjet.pt.argsort(ascending=False)][:,:2]
        leadfatjet = high_pt_fatjet[high_pt_fatjet.pt.argmax()]
        subleadfatjet = high_pt_fatjet[high_pt_fatjet.pt.argmin()]
        
        leadingFatJets = fatjet[:,:2]
        difatjet = leadingFatJets.choose(2)
        dphiDiFatJet = np.arccos(np.cos(difatjet.i0.phi-difatjet.i1.phi))
        
        htag = fatjet[((fatjet.pt > 200) & (fatjet.deepTagMD_HbbvsQCD > 0.8365))]
        htag_hard = fatjet[((fatjet.pt > 300) & (fatjet.deepTagMD_HbbvsQCD > 0.8365))]
        
        lead_htag = htag[htag.pt.argmax()]
        
        wtag = fatjet[((fatjet.pt > 200) & (fatjet.deepTagMD_HbbvsQCD < 0.8365) & (fatjet.deepTag_WvsQCD > 0.918))]
        wtag_hard = fatjet[((fatjet.pt > 300) & (fatjet.deepTagMD_HbbvsQCD < 0.8365) & (fatjet.deepTag_WvsQCD > 0.918))]
        
        lead_wtag = wtag[wtag.pt.argmax()]
        
        wh = lead_htag.cross(lead_wtag)
        wh_deltaPhi = np.arccos(wh.i0.phi - wh.i1.phi)
        wh_deltaR = wh.i0.p4.delta_r(wh.i1.p4)
        
        ############## JETS ##############

        skimjet   = jet[(jet.pt>30) & (abs(jet.eta)<2.4)]
        jet       = jet[(jet.pt>30) & (jet.jetId>1) & (abs(jet.eta)<2.4)]
        jet       = jet[~jet.match(muon, deltaRCut=0.4)] # remove jets that overlap with muons
        jet       = jet[~jet.match(electron, deltaRCut=0.4)]
#        jet       = jet[~jet.match(fatjet, deltaRCut=1.2)]# remove jets that overlap with electrons
        jet       = jet[jet.pt.argsort(ascending=False)] # sort the jets
        btag      = jet[(jet.btagDeepB>0.4184)]
        light     = jet[(jet.btagDeepB<0.4184)]
        extrajet  = jet[~jet.match(fatjet, deltaRCut=0.8)] # remove AK4 jets that overlap with AK8 jets
        
        ## Get leading jets
        
        jet = jet[jet.pt.argsort(ascending=False)]
        leadjet = jet[:,:1]
        subleadjet = jet[:,1:2]
        
        ## Get the leading b-jets
        high_score_btag = jet[jet.btagDeepB.argsort(ascending=False)][:,:2]

        leading_b      = btag[btag.pt.argmax()]
        
        bb = high_score_btag.choose(2)
        bb_deltaPhi = np.arccos(np.cos(bb.i0.phi-bb.i1.phi))
        bb_deltaR = bb.i0.p4.delta_r(bb.i1.p4)

        ############## OTHER VARIABLES ##############
        
        ht = jet.pt.sum()
                
        leadingJets = jet[:,:2]
        dijet = leadingJets.choose(2)
        dphiDiJet = np.arccos(np.cos(dijet.i0.phi-dijet.i1.phi))

        min_dphiFatJetMet4 = np.arccos(np.cos(fatjet[:,:4].phi-newmet.phi.min())).min()


        min_drfatjetleadmuon = (np.sqrt((fatjet[:,:4].eta - leadm.eta.min())** 2 + np.arccos(np.cos(fatjet[:,:4].phi-leadm.phi.min()))** 2)).min()
        min_drfatjetsubleadmuon = (np.sqrt((fatjet[:,:4].eta - subleadm.eta.min())** 2 + np.arccos(np.cos(fatjet[:,:4].phi-subleadm.phi.min()))** 2)).min()

        min_drfatjetleadelectron = (np.sqrt((fatjet[:,:4].eta - leade.eta.min())** 2 + np.arccos(np.cos(fatjet[:,:4].phi-leade.phi.min()))** 2)).min()
        min_drfatjetsubleadelectron = (np.sqrt((fatjet[:,:4].eta - subleade.eta.min())** 2 + np.arccos(np.cos(fatjet[:,:4].phi-subleade.phi.min()))** 2)).min()

        
        mth = mt(htag.pt, htag.phi, newmet.pt.min(), newmet.phi.min())

        ############## SELECTIONS ##############
        
        dilep_sel = ((electron.counts+muon.counts)==2)
        dilep_sf_sel = dilep_sel & ((electron.counts ==2 )|(muon.counts == 2))
        dilep_of_sel = dilep_sel & ((electron.counts ==1 )|(muon.counts == 1))
        
        dimuon_sel = dilep_sel & (muon.counts == 2)
        dielectron_sel = dilep_sel & (electron.counts == 2)
        
        ak4_sel = (jet.counts > 1)
        ak8_sel = (fatjet.counts > 0)
        
        dimuonmass_sel = (abs(mm.mass-90) < 20).any()
        dimuonpt_sel = (mm.pt > 200).any()
        leadmuon_sel = (leadm.pt > 25).any()
        subleadmuon_sel = (subleadm.pt > 15).any()
        #os_sel = ((muon[muon.pdg < 0].counts == 1) & (muon[muon.pdg > 0].counts == 1))

        dielectronmass_sel = (abs(ee.mass-90) < 20).any()
        dielectronpt_sel = (ee.pt > 200).any()
        leadelectron_sel = (leade.pt > 25).any()
        subleadelectron_sel = (subleade.pt > 15).any()
        os_sel = ((electron[electron.pdg < 0].counts == 1) & (electron[electron.pdg > 0].counts == 1))

        
        SR_sel = triggers & filters & ((met.pt >250).any()) &(skimjet.counts >1) 
        SR_sel &= (electron.counts == 0) & (muon.counts == 0)  
        SR_sel &= (tau.counts == 0) & (isotrack.counts == 0) & (fatjet.counts > 1)  
        SR_sel &= (min_dphiFatJetMet4>0.5)& ((dphiDiFatJet<2.5).all())  
        SR_sel &= (htag.counts>0) & (wtag.counts>0) & (mth.min()>200)  
        SR_sel &= (abs(htag.msoftdrop-125<25).any()) & (abs(wtag.msoftdrop-80<30).any())  
        SR_sel &= (extrajet.counts<2)
        
        ############## CUTFLOW ##############
        
#         hlt_mu17 = df["HLT_Mu17_TrkIsoVVL_Mu8_TrkIsoVVL"]
#         hlt_mu17_dz = df["HLT_Mu17_TrkIsoVVL_Mu8_TrkIsoVVL_DZ"]
#         hlt_mu50 = df["HLT_Mu50"]
#         hlt_mu55 = df["HLT_Mu55"]
#         dimuon_tsel = ((hlt_mu17 == 1) | (hlt_mu17_dz == 1) | (hlt_mu50 == 1) | (hlt_mu55 ==1))
        
        output['totalEvents']['all'] += len(df['weight'])
        processes = processesList
        weight = np.ones(len(df['weight'])) if dataset=='Data' else df['weight']*leptonSF*df['puWeight']
        lumis       = {2016: 36., 2017: 41.5, 2018: 60.}
        cfg['lumi'] = 1 if dataset=='Data' else lumis[year]
        fullweight  = weight*cfg['lumi']

        cutflow = Cutflow(output, df, cfg, processes, weight=weight)
        
        cutflow.addRow( 'triggers',   (triggers) )         
        cutflow.addRow( 'filters',   (filters) ) 
        cutflow.addRow('dielectron = 1', (dielectron_sel))
        cutflow.addRow('opposite sign', (os_sel))        
        cutflow.addRow('70 < dielectron mass < 110', (dielectronmass_sel))
        cutflow.addRow('lead electron pt > 25', (leadelectron_sel))
        cutflow.addRow('sublead electron pt > 15', (subleadelectron_sel))
        cutflow.addRow( 'MET>250',     (newmet.pt>250).any() )
        cutflow.addRow( 'N_fatjet>1',      (fatjet.counts>1) )
        cutflow.addRow( 'min_dphiFatJetMet4', (min_dphiFatJetMet4>0.5))
        cutflow.addRow( 'dphiDiFatJet', (dphiDiFatJet<2.5).all() ) 
        cutflow.addRow( 'njet veto',     (extrajet.counts<2))

#        cutflow.addRow('dimuon pt > 200', (dimuonpt_sel))       
#         cutflow.addRow( 'stitch',   (stitchVar ==1) )
#         cutflow.addRow( 'skim',   ((met_pt>200) & (skimjet.counts>1)) )
#         cutflow.addRow( 'Exactly 1 e or mu',   ((electron.counts+muon.counts)==1) )
# 
        
        baseline = copy.deepcopy(cutflow.selection)

       ############## HISTOGRAMS ##############
        output['met'].fill(dataset=dataset,pt=met[baseline].pt.flatten(),weight=fullweight[baseline])
        output['met_SR'].fill(dataset=dataset,pt=met[SR_sel].pt.flatten(),weight=fullweight[SR_sel])
        output['met-electron'].fill(dataset=dataset,pt=newmet[baseline].pt.flatten(),weight=fullweight[baseline])
        output['nW'].fill(dataset=dataset, multiplicity=wtag[baseline].counts, weight=fullweight[baseline]) 
        output['nH'].fill(dataset=dataset, multiplicity=htag[baseline].counts, weight=fullweight[baseline])     
        output['ee_mass'].fill(dataset=dataset,mass=ee[baseline].mass.flatten(),weight=fullweight[baseline])
        output['ee_pt'].fill(dataset=dataset,pt=ee[baseline].pt.flatten(),weight=fullweight[baseline])
        output['ee_eta'].fill(dataset=dataset,eta=ee[baseline].eta.flatten(),weight=fullweight[baseline])
        output['ee_phi'].fill(dataset=dataset,phi=ee[baseline].phi.flatten(),weight=fullweight[baseline])
        output['leade_pt'].fill(dataset=dataset,pt=leade[baseline].pt.flatten(),weight=fullweight[baseline])
        output['leade_eta'].fill(dataset=dataset,eta=leade[baseline].eta.flatten(),weight=fullweight[baseline])
        output['leade_phi'].fill(dataset=dataset,phi=leade[baseline].phi.flatten(),weight=fullweight[baseline])
        output['subleade_pt'].fill(dataset=dataset,pt=subleade[baseline].pt.flatten(),weight=fullweight[baseline])
        output['subleade_eta'].fill(dataset=dataset,eta=subleade[baseline].eta.flatten(),weight=fullweight[baseline])
        output['subleade_phi'].fill(dataset=dataset,phi=subleade[baseline].phi.flatten(),weight=fullweight[baseline])
        output['ee_deltaPhi'].fill(dataset=dataset, delta=ee_deltaPhi[baseline].flatten(), weight=fullweight[baseline])
        output['ee_deltaR'].fill(dataset=dataset, delta=ee_deltaR[baseline].flatten(), weight=fullweight[baseline])
        output['N_AK4'].fill(dataset=dataset, multiplicity=jet[baseline].counts, weight=fullweight[baseline])
        output['N_AK8'].fill(dataset=dataset, multiplicity=fatjet[baseline].counts, weight=fullweight[baseline])       
        output['leadAK4_pt'].fill(dataset=dataset, pt=leadjet[baseline & ak4_sel].pt.flatten(), weight=fullweight[baseline & ak4_sel])
        output['leadAK4_eta'].fill(dataset=dataset, eta=leadjet[baseline & ak4_sel].eta.flatten(), weight=fullweight[baseline & ak4_sel])
        output['leadAK4_phi'].fill(dataset=dataset, phi=leadjet[baseline & ak4_sel].phi.flatten(), weight=fullweight[baseline & ak4_sel])
        output['subleadAK4_pt'].fill(dataset=dataset, pt=subleadjet[baseline & ak4_sel].pt.flatten(), weight=fullweight[baseline & ak4_sel])
        output['subleadAK4_eta'].fill(dataset=dataset, eta=subleadjet[baseline & ak4_sel].eta.flatten(), weight=fullweight[baseline & ak4_sel])
        output['subleadAK4_phi'].fill(dataset=dataset, phi=subleadjet[baseline & ak4_sel].phi.flatten(), weight=fullweight[baseline & ak4_sel])
        output['leadAK8_pt'].fill(dataset=dataset, pt=leadfatjet[baseline & ak8_sel].pt.flatten(), weight=fullweight[baseline & ak8_sel])       
        output['leadAK8_eta'].fill(dataset=dataset, eta=leadfatjet[baseline & ak8_sel].eta.flatten(), weight=fullweight[baseline & ak8_sel])       
        output['leadAK8_phi'].fill(dataset=dataset, phi=leadfatjet[baseline & ak8_sel].phi.flatten(), weight=fullweight[baseline & ak8_sel])       
        output['subleadAK8_pt'].fill(dataset=dataset, pt=subleadfatjet[baseline & ak8_sel].pt.flatten(), weight=fullweight[baseline & ak8_sel])       
        output['subleadAK8_eta'].fill(dataset=dataset, eta=subleadfatjet[baseline & ak8_sel].eta.flatten(), weight=fullweight[baseline & ak8_sel])       
        output['subleadAK8_phi'].fill(dataset=dataset, phi=subleadfatjet[baseline & ak8_sel].phi.flatten(), weight=fullweight[baseline & ak8_sel])       
        output['min_dphiFatJetMet4'].fill(dataset=dataset, delta=min_dphiFatJetMet4[baseline].flatten(), weight=fullweight[baseline])
        output['dphiDiFatJet'].fill(dataset=dataset, delta=dphiDiFatJet[(baseline & (fatjet.counts>1))].min().flatten(), weight=fullweight[baseline & (fatjet.counts>1)])
        output['min_drFatJetLeadElectron'].fill(dataset=dataset, delta=min_drfatjetleadelectron[baseline].flatten(), weight=fullweight[baseline])
        output['min_drFatJetSubLeadElectron'].fill(dataset=dataset, delta=min_drfatjetsubleadelectron[baseline].flatten(), weight=fullweight[baseline])

        return output

    def postprocess(self, accumulator):
        return accumulator

In [8]:
runLocal = True

if not runLocal:
    # Get the scheduler from the dask_cluster notebook
    from dask.distributed import Client, progress

    c = Client('tcp://169.228.130.5:27879')

    ## for dask
    exe_args = {
        'client': c,
        #'savemetrics': True,
    }
    exe = processor.dask_executor
    
else:
    ## for local
    exe_args = {
        'workers': 16,
        'function_args': {'flatten': False}
    }
    exe = processor.futures_executor

if not runLocal:
    print(c)

In [None]:
overwrite = True
small = False

tag = 'v0.2.4'

from Tools.dilep_samples import * #fileset_2016, fileset_2016_small

if year == 2016:
    fileset_year = fileset_dilep_2016
    fileset_year_small = fileset_dilep_2016_small
elif year == 2017:
    fileset_year = fileset_dilep_2017
    fileset_year_small = fileset_dilep_2017_small
elif year == 2018:
    fileset_year = fileset_dilep_2018
    fileset_year_small = fileset_dilep_2018_small


fileset_dilep = {
    'DYJets': fileset_year['DY_Tune'],
    'ZNuNu': fileset_year['ZNuNu'],
    'TTJets': fileset_year['TTJets'] ,
    'ttW': fileset_year['TTW'],
    'ttZ': fileset_year['TTZ'],
    'Data': fileset_year['DoubleEG'],
}


fileset_dilep_small = {
    'DYJets': fileset_dilep['DYJets'][:2],
    'ZNuNu': fileset_dilep['ZNuNu'][:2],
    'TTJets': fileset_dilep['TTJets'][:2],
    'ttW': fileset_dilep['ttW'][:2],
    'ttZ': fileset_dilep['ttZ'][:2],
    'Data': fileset_dilep['Data'][:2],
}

# load the config and the cache
cfg = loadConfig()

cacheName = 'Dilep_%s_small'%year if small else 'DiLep_%s'%year
print(cacheName)

# histograms
histograms = []

# initialize cache
cache = dir_archive(os.path.join(os.path.expandvars(cfg['caches']['base']), cacheName), serialized=True)
if not overwrite:
    cache.load()

if cfg == cache.get('cfg') and histograms == cache.get('histograms') and cache.get('simple_output'):
    output = cache.get('simple_output')

else:
    # Run the processor
    if small:
        fileset = fileset_dilep_small
        exe_args['workers'] = 4
    else:
        fileset = fileset_dilep
        exe_args['workers'] = 16
    
        
    output = processor.run_uproot_job(fileset,
                                      treename='Events',
                                      processor_instance=analysisProcessor(),
                                      executor=exe,
                                      executor_args=exe_args,
                                      #chunksize=250000,
                                      chunksize=100000,
                                     )
    cache['fileset']        = fileset
    cache['cfg']            = cfg
    cache['histograms']     = histograms
    cache['simple_output']  = output
    cache.dump()

DiLep_2017





  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **k

In [None]:
# Cutflow
from Tools.helpers import getCutFlowTable

datapt = 'Data'
puritypt = 'DYJets'


processes = processesList
lines     = ['entry']
lines    += linesList
df        = getCutFlowTable(output, processes=processes, lines=lines, significantFigures=4,
                           signal=datapt, purity =puritypt)
df

In [None]:
# Efficiencies
df = getCutFlowTable(output, processes=processes, lines=lines, significantFigures=3, absolute=False)
df

In [None]:
output['met'].sum('pt').values()

In [None]:
output['met_SR'].sum('pt').values()

In [None]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from matplotlib.colors import LogNorm

In [None]:
from plots.helpers import *
bins = {
    'met_axis':                  {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r"$E_{T}^{miss}$ (GeV)", 20, 0, 500)},
    'met-electron_axis':             {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r"$E_{T}^{miss}$ - dimuon $p_T$ (GeV)", 28, 0, 700)},
    'met_coarse':                {'axis': 'pt',   'overflow':'over',  'bins': hist.Bin('pt', r'$p_{T}^{miss}\ (GeV)$', np.array([250,400,600])) },
    'nW_axis':                   {'axis': 'multiplicity',  'overflow':'over',  'bins': hist.Bin('multiplicity', r'$nWtag', 4, -0.5, 3.5)},
    'nH_axis':                   {'axis': 'multiplicity',  'overflow':'over',  'bins': hist.Bin('multiplicity', r'$nHtag', 4, -0.5, 3.5)},
    'ee_mass_axis':              {'axis':'mass',   'overflow': 'over', 'bins': hist.Bin("mass",  r"Dielectron mass (GeV)", 25, 50, 150)},
    'ee_pt_axis':                {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r"Dielectron $p_{T}$ (GeV)",40, 0, 1000)},
    'ee_eta_axis':               {'axis':'eta',   'overflow': 'over', 'bins': hist.Bin("eta",       r"Dielectron $\eta$", 30, -5.5, 5.5)},
    'ee_phi_axis':               {'axis':'phi',   'overflow': 'over', 'bins': hist.Bin("phi",       r"Dielectron $\phi$", 20, -4, 4)},
    'leade_pt_axis':            {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r" Lead Electron $p_{T}$ (GeV)", 32, 0, 800)},
    'leade_eta_axis':           {'axis':'eta',   'overflow': 'over', 'bins': hist.Bin("eta",       r"Lead Electron $\eta$", 30, -5.5, 5.5)},
    'leade_phi_axis':           {'axis':'phi',   'overflow': 'over', 'bins': hist.Bin("phi",       r"Lead Electron $\phi$", 20, -4, 4)},
    'subleade_pt_axis':         {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r" Sublead Electron $p_{T}$ (GeV)", 24, 0, 600)},
    'subleade_eta_axis':        {'axis':'eta',   'overflow': 'over', 'bins': hist.Bin("eta",       r"Sublead Electron $\eta$", 30, -5.5, 5.5)},
    'subleade_phi_axis':        {'axis':'phi',   'overflow': 'over', 'bins': hist.Bin("phi",       r"Sublead Electron $\phi$", 20, -4, 4)},
    'leadAK4_pt_axis':           {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r" Lead AK4 $p_{T}$ (GeV)", 32, 0, 800)},
    'leadAK4_eta_axis':          {'axis':'eta',   'overflow': 'over', 'bins': hist.Bin("eta",       r"Lead AK4 $\eta$", 30, -5.5, 5.5)},
    'leadAK4_phi_axis':          {'axis':'phi',   'overflow': 'over', 'bins': hist.Bin("phi",       r"Lead AK4 $\phi$", 20, -4, 4)},
    'subleadAK4_pt_axis':        {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r" Sublead AK4 $p_{T}$ (GeV)", 24, 0, 600)},
    'subleadAK4_eta_axis':       {'axis':'eta',   'overflow': 'over', 'bins': hist.Bin("eta",       r"Sublead AK4 $\eta$", 30, -5.5, 5.5)},
    'subleadAK4_phi_axis':       {'axis':'phi',   'overflow': 'over', 'bins': hist.Bin("phi",       r"Sublead AK4 $\phi$", 20, -4, 4)},
    'leadAK8_pt_axis':           {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r" Lead AK8 $p_{T}$ (GeV)", 32, 0, 800)},
    'leadAK8_eta_axis':          {'axis':'eta',   'overflow': 'over', 'bins': hist.Bin("eta",       r"Lead AK8 $\eta$", 30, -5.5, 5.5)},
    'leadAK8_phi_axis':          {'axis':'phi',   'overflow': 'over', 'bins': hist.Bin("phi",       r"Lead AK8 $\phi$", 20, -4, 4)},
    'subleadAK8_pt_axis':        {'axis':'pt',   'overflow': 'over', 'bins': hist.Bin("pt",  r" Sublead AK8 $p_{T}$ (GeV)", 24, 0, 600)},
    'subleadAK8_eta_axis':       {'axis':'eta',   'overflow': 'over', 'bins': hist.Bin("eta",       r"Sublead AK8 $\eta$", 30, -5.5, 5.5)},
    'subleadAK8_phi_axis':       {'axis':'phi',   'overflow': 'over', 'bins': hist.Bin("phi",       r"Sublead AK8 $\phi$", 20, -4, 4)},
    'N_AK4_axis':                {'axis': 'multiplicity',  'overflow':'over',  'bins': hist.Bin('multiplicity', r'$N_{AK4 jet}$', 6, -0.5, 5.5)},
    'N_AK8_axis':                {'axis': 'multiplicity',  'overflow':'over',  'bins': hist.Bin('multiplicity', r'$N_{AK8 jet}$', 5, -0.5, 4.5)},
    'ee_deltaPhi_axis':          {'axis': 'delta', 'overflow':'over',  'bins': hist.Bin('delta', r'Dielectron $\Delta \phi $', 30, 0, 3)},
    'ee_deltaR_axis':            {'axis': 'delta', 'overflow':'over',  'bins': hist.Bin('delta', r'Dielectron $\Delta R $', 10, 0, 5)},
    'min_dphiFatJetMet4_axis':   {'axis': 'delta', 'overflow':'over',  'bins': hist.Bin('delta', r'min $\Delta \phi $ (fatjet(4),MET)', 25, 0, 5)},
    'dphiDiFatJet_axis':         {'axis': 'delta', 'overflow':'over',  'bins': hist.Bin('delta', r' $\Delta \phi $ difatjet', 25, 0, 5)},
    'min_drFatJetLeadElectron_axis':   {'axis': 'delta', 'overflow':'over',  'bins': hist.Bin('delta', r'min $\Delta R $ (fatjet(4), Lead Electron)', 25, 0, 5)},
    'min_drFatJetSubLeadElectron_axis':   {'axis': 'delta', 'overflow':'over',  'bins': hist.Bin('delta', r'min $\Delta R $ (fatjet(4), Sublead Electron)', 25, 0, 5)},
   }

In [None]:
#Plotting aesthetics

lineopts = {
    'color': 'r',
    'linewidth': '3'}

data_err_opts = {
    'linestyle': 'none',
    'marker': '_',
    'markersize': 10.,
    'color': 'r',
    'elinewidth': 1}

data_err_opts_rat = {
    'linestyle': 'none',
    'marker': '.',
    'markersize': 10.,
    'color': 'k',
    'elinewidth': 1}

fillopts2 = {
    'edgecolor': (0,0,0,0.3),
    'facecolor': [('#989C94'),('#6A0136'),('#FF5714'),('#FFCA3A')]  
}

In [None]:
from plots.helpers import *

def saveFig( fig, ax, rax, path, name, scale='linear', shape=False, y_max=-1 ):
    outdir = os.path.join(path,scale)
    finalizePlotDir(outdir)
    ax.set_yscale(scale)
    ax.set_ylabel('Events')

    if scale == 'linear':
        if y_max<0: #or True:
            pass
        else:
            ax.set_ylim(0, 1 if shape else 1.2*y_max)
    else:
        if y_max<0 and not shape:
            pass
        else:
            ax.set_ylim(0.000005 if shape else 0.05, 3 if shape else 300*y_max)

    handles, labels = ax.get_legend_handles_labels()
    new_labels = []
    for handle, label in zip(handles, labels):
        #print (handle, label)
        try:
            new_labels.append(my_labels[label])
            if not label=='pseudodata':
                handle.set_color(colors[label])
        except:
            pass

    if rax:
        plt.subplots_adjust(hspace=0)
        rax.set_ylabel('Obs./Pred.')
        rax.set_ylim(0,2)

    ax.legend(title='',ncol=2,handles=handles, labels=new_labels, frameon=False)

    fig.text(0., 0.995, '$\\bf{CMS}$', fontsize=20,  horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )
    fig.text(0.15, 1., '$\\it{Preliminary}$', fontsize=14, horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )
    fig.text(0.8, 1., '13 TeV', fontsize=14, horizontalalignment='left', verticalalignment='bottom', transform=ax.transAxes )

    fig.savefig(os.path.join(outdir, "{}.pdf".format(name)))
    fig.savefig(os.path.join(outdir, "{}.png".format(name)))
    #ax.clear()

In [None]:
#Histos I want to save
histos = [["ee_mass", "ee_mass_axis"],
        ["ee_pt", "ee_pt_axis"],
        ["ee_eta", "ee_eta_axis"],
        ["ee_phi", "ee_phi_axis"],
        ["leade_pt", "leade_pt_axis"],
        ["leade_eta", "leade_eta_axis"],
        ["leade_phi", "leade_phi_axis"],
        ["subleade_pt", "subleade_pt_axis"],
        ["subleade_eta", "subleade_eta_axis"],
        ["subleade_phi", "subleade_phi_axis"],
        ["leadAK4_pt", "leadAK4_pt_axis"],
        ["leadAK4_eta", "leadAK4_eta_axis"],
        ["leadAK4_phi", "leadAK4_phi_axis"],
        ["subleadAK4_pt", "subleadAK4_pt_axis"],
        ["subleadAK4_eta", "subleadAK4_eta_axis"],
        ["subleadAK4_phi", "subleadAK4_phi_axis"],
        ["leadAK8_pt", "leadAK8_pt_axis"],
        ["leadAK8_eta", "leadAK8_eta_axis"],
        ["leadAK8_phi", "leadAK8_phi_axis"],
        ["subleadAK8_pt", "subleadAK8_pt_axis"],
        ["subleadAK8_eta", "subleadAK8_eta_axis"],
        ["subleadAK8_phi", "subleadAK8_phi_axis"],
        ["N_AK4", "N_AK4_axis"],
        ["N_AK8", "N_AK8_axis"],
        ["ee_deltaPhi", "ee_deltaPhi_axis"],
        ["ee_deltaR", "ee_deltaR_axis"],
        ["met", "met_axis"],
        ["met-electron", "met_coarse"],
        ["nW", "nW_axis"],
        ["nH", "nH_axis"],
        ["min_dphiFatJetMet4", "min_dphiFatJetMet4_axis"],
        ["dphiDiFatJet", "dphiDiFatJet_axis"],
        ["min_drFatJetLeadElectron", "min_drFatJetLeadElectron_axis"],
        ["min_drFatJetSubLeadElectron", "min_drFatJetSubLeadElectron_axis"]]

In [None]:
# Make some of the plots


finalizePlotDir(plotDir)


for plot in histos:

    name = plot[0]
    binName = plot[1]
    histogram = output[name]

    axis = bins[binName]['axis']
    histogram = histogram.rebin(axis, bins[binName]['bins'])

    y_max = histogram.sum("dataset").values(overflow='all')[()].max()
    y_over = histogram.sum("dataset").values(overflow='all')[()][-1]

    import re

    bkg = re.compile('(?!Data)')
    
    background = histogram[bkg]
    data = histogram['Data']

    #fig, ax = plt.subplots(1,1,figsize=(7,7))
    fig, (ax, rax) = plt.subplots(nrows=2,ncols=1, figsize=(7,7),
        gridspec_kw={"height_ratios": (3, 1)}, sharex=True)
    
    # get axes
    hist.plot1d(background, overlay="dataset", ax=ax, stack=True, 
                overflow=bins[binName]['overflow'], clear=False, 
                fill_opts=fill_opts, error_opts=error_opts,
                order=['ttZ', 'ttW','TTJets', 'DYJets'])
#     hist.plot1d(background, overlay="dataset", ax=ax, stack=True, 
#                  overflow=bins[binName]['overflow'], clear=False, fill_opts=fillopts2) 
#                  #error_opts=error_opts)# order=['DYJets','TTJets', 'ttW', 'ttZ']) #error_opts??
    hist.plot1d(data, overlay="dataset", ax=ax, stack=False, 
                overflow=bins[binName]['overflow'], error_opts=data_err_opts_rat, 
                clear=False)
    hist.plotratio(num=data.sum('dataset'), denom=background.sum('dataset'), ax=rax,
                   error_opts = data_err_opts_rat, denom_fill_opts={}, guide_opts={}, 
                   unc='num', overflow = 'over')

    for l in ['log', 'linear']:
        saveFig(fig, ax, rax, plotDir, name, scale=l, shape=False, y_max=y_max)


# ESTIMATE

In [None]:
from uncertainties import ufloat
## This can be either validation region or signal region

# First, add the years together using the add function of the histograms
# Then, use uncertainties.ufloat to do proper error propagation

def get_np_ufloat(vals):
    #print (vals)
    return (np.array([ ufloat(vals[0][i], np.sqrt(vals[1][i])) for i in range(len(vals[0]))  ]))
#    return (ufloat(vals[0][i], np.sqrt(vals[1][i])) for i in range(len(vals[0])))

In [None]:
hist_SR = output['met_SR'].copy()
hist_CR = output['met'].copy()
axis = bins['met_coarse']['axis']
hist_SR = hist_SR.rebin(axis, bins['met_coarse']['bins'])
hist_CR = hist_CR.rebin(axis, bins['met_coarse']['bins'])


In [108]:
bkgonly = re.compile('(?!(Data|ZNuNu))')
CR_bkg  = get_np_ufloat( hist_CR[bkgonly].sum('dataset').values(overflow='over', sumw2=True)[()] )
CR_data = get_np_ufloat( hist_CR['Data'].sum('dataset').values(overflow='over', sumw2=True)[()] )
CR_ZNN   = get_np_ufloat( hist_CR['ZNuNu'].sum('dataset').values(overflow='over', sumw2=True)[()] )
SR_ZNN   = get_np_ufloat(hist_SR['ZNuNu'].sum('dataset').values(overflow='over', sumw2=True)[()] )

In [109]:
transfer_factor = SR_ZNN/CR_bkg
estimate = CR_data*transfer_factor

In [110]:
print(year)
print("***********************************")
print("Control Region")
print("SR MC ZNuNu Yield: ", SR_ZNN)
print("CR data Yield: ", CR_data)
print("CR MC ZNuNu Yield: ", CR_ZNN)
print("CR MC Bkgd Yield: ", CR_bkg)
print("CR TF: ", transfer_factor)
print("CR Estimate: ", estimate)

2016
***********************************
Control Region
SR MC ZNuNu Yield:  [2.140684384503402+/-0.26966407532591985
 1.5847062514803838+/-0.2575663014143749
 0.45555149993742816+/-0.15114613153151935]
CR data Yield:  [7.0+/-2.6457513110645907 5.0+/-2.23606797749979 1.0+/-1.0]
CR MC ZNuNu Yield:  [0.0+/-0 0.0+/-0 0.0+/-0]
CR MC Bkgd Yield:  [6.261681539595884+/-0.8764824723252315
 1.139717943238793+/-0.36203300329983146
 0.0026331101544201374+/-0.00263311015825438]
CR TF:  [0.3418705296599222+/-0.0643787188798924
 1.3904372225438915+/-0.4961333186501291
 173.00890324421295+/-182.28298116861288]
CR Estimate:  [2.3930937076194554+/-1.0105516147807023
 6.952186112719457+/-3.977472199280294
 173.00890324421295+/-251.31487386440602]
