# PocketCoffea: a configuration layer for CMS analysis with Coffea

In [1]:
import os
import pocket_coffea
from pocket_coffea.utils.dataset import build_datasets
from pocket_coffea.utils.run import IterativeRunner, FuturesRunner
from pocket_coffea.utils.configurator import Configurator
from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor
from pocket_coffea.lib.cut_functions import get_nObj_min, get_HLTsel
from pocket_coffea.parameters.cuts import passthrough
from pocket_coffea.parameters.histograms import muon_hists
print(f"# CPUs = {os.cpu_count()}")

# CPUs = 72


## Define Coffea processor

In [2]:
import awkward as ak
from coffea.analysis_tools import PackedSelection
from pocket_coffea.workflows.base import BaseProcessorABC
from pocket_coffea.lib.objects import (
    lepton_selection,
    get_dilepton,
)


class ZmumuBaseProcessor(BaseProcessorABC):
    '''
    Demo processor derived from BaseProcessorABC.

    It defines the skimming, object-preselection and object-counting steps
    so that a selected muon collection ("MuonGood") and its per-event
    multiplicity ("nMuonGood") are attached to the events.
    '''

    def __init__(self, cfg: Configurator):
        '''Forward the configuration object to the base-class constructor.'''
        super().__init__(cfg)

    def load_metadata_extra(self):
        '''
        Add a unit "genWeight" field to MC events.

        The CMS opendata NanoAOD file used in this demo has no 'genWeight'
        branch, so for MC it is set to 1 for every event.
        '''
        if not self._isMC:
            return
        unit_weights = ak.ones_like(self.events.event, dtype=float)
        self.events["genWeight"] = unit_weights

    def skim_events(self):
        '''
        Apply all skim functions and keep the events passing every one.

        Each function in self._skim contributes one named mask to a
        PackedSelection stored in self._skim_masks. After filtering, the
        surviving event count is written to the "skim" cutflow entry of the
        current dataset and self.has_events is set accordingly.
        '''
        selection = PackedSelection()
        self._skim_masks = selection

        for skim_cut in self._skim:
            # Evaluate the skim function on the current events
            passed = skim_cut.get_mask(
                self.events,
                processor_params=self.params,
                year=self._year,
                sample=self._sample,
                isMC=self._isMC,
            )
            selection.add(skim_cut.id, passed)

        # Keep only the events satisfying all registered skim masks
        keep = selection.all(*selection.names)
        self.events = self.events[keep]

        n_kept = self.nevents
        self.nEvents_after_skim = n_kept
        self.output["cutflow"]["skim"][self._dataset] = n_kept
        self.has_events = n_kept > 0

    def get_shape_variations(self):
        '''
        Dummy generator for shape variations: yields only "nominal".
        '''
        yield "nominal"

    def apply_object_preselection(self, variation):
        '''
        Build the selected muon collection "MuonGood".

        Only muons are handled here; no electron, combined-lepton or
        dilepton ("ll") collection is built by this processor. The
        `variation` argument is not used.
        '''
        good_muons = lepton_selection(self.events, "Muon", self.params)
        self.events["MuonGood"] = good_muons

    def count_objects(self, variation):
        '''
        Store the number of "MuonGood" objects per event as "nMuonGood".
        The `variation` argument is not used.
        '''
        n_good_muons = ak.num(self.events.MuonGood)
        self.events["nMuonGood"] = n_good_muons

In [3]:
ZmumuBaseProcessor

__main__.ZmumuBaseProcessor

## Define parameters

In [4]:
from pocket_coffea.parameters import defaults

# All parameter and dataset paths below are resolved relative to the
# directory the notebook is run from.
localdir = os.getcwd()
default_parameters = defaults.get_default_parameters()

# Merge the local parameter files with the default parameters.
parameters = defaults.merge_parameters_from_files(
    default_parameters,
    f"{localdir}/params/event_flags.yaml",
    f"{localdir}/params/lumi.yaml",
    f"{localdir}/params/object_preselection_semileptonic.yaml",
    f"{localdir}/params/triggers.yaml",
    f"{localdir}/params/plotting_style.yaml",
    update=True,
)

cfg = Configurator(
    parameters=parameters,
    datasets={
        "jsons": [
            f"{localdir}/datasets/DYJetsToLL.json",
            # f"{localdir}/datasets/DATA_SingleMuon.json",
        ],
        "filter": {
            "samples": ["DYJetsToLL"],
            "samples_exclude": [],
            "year": ["2012"],
        },
    },
    workflow=ZmumuBaseProcessor,
    # NOTE(review): jet-based skim/preselection cuts in a muon-only workflow
    # look inherited from another analysis setup; confirm they are intended.
    skim=[
        get_nObj_min(3, 15.0, "Jet"),
        get_HLTsel(primaryDatasets=["SingleMuon"]),
    ],
    preselections=[get_nObj_min(4, 30.0, "Jet")],
    categories={
        "inclusive": [passthrough],
    },
    weights={
        "common": {
            "inclusive": [
                "genWeight",
                "lumi",
                "XS",
                # "pileup",
                # "sf_ele_reco", "sf_ele_id",
                # "sf_mu_id", "sf_mu_iso",
            ],
            "bycategory": {},
        },
        "bysample": {},
    },
    # No weight or shape variations are requested in this demo.
    variations={
        "weights": {
            "common": {
                "inclusive": [],  # "pileup"
                "bycategory": {},
            },
            "bysample": {},
        },
        "shape": {
            "common": {
                "inclusive": [],
            },
        },
    },
    variables={
        **muon_hists(coll="MuonGood"),
        **muon_hists(coll="MuonGood", pos=0),
    },
)
# Presumably restricts each dataset to a single file; confirm against
# Configurator.filter_dataset.
cfg.filter_dataset(nfiles=1)
output_dir = "output"
# cfg.save_config(output_dir)

In [5]:
# Options handed to the runner. The runner created below is an
# IterativeRunner with architecture="local".
run_options = {
    "executor": "local",
    "env": "conda",
    "workers": 1,
    "scaleout": 32,
    "worker_image": "/cvmfs/unpacked.cern.ch/gitlab-registry.cern.ch/cms-analysis/general/pocketcoffea:lxplus-cc7-latest",
    "queue": "standard",
    "walltime": "02:00:00",
    "mem_per_worker": "4GB",
    "disk_per_worker": "1GB",
    "exclusive": False,
    "chunk": 200000,
    "retries": 50,
    "treereduction": 20,
    "adapt": False,
}
runner = IterativeRunner(
    architecture="local",
    run_options=run_options,
    output_dir=output_dir,
    loglevel="INFO",
)

cp: ‘/t3home/mmarcheg/.x509up_u718’ and ‘/t3home/mmarcheg/.x509up_u718’ are the same file


In [6]:
# Single-dataset fileset: one local ROOT file plus the metadata describing it.
# "isMC" is given as the string "True", not a boolean.
filesets = {
    "DYJetsToLL_2012": {
        "metadata": {
            "sample": "DYJetsToLL",
            "year": "2012",
            "isMC": "True",
            "xsec": 6077.22,
        },
        "files": [f"{localdir}/datasets/DYJetsToLL.root"],
    },
}

In [7]:
# Run the configured processor instance over the fileset defined above.
runner.run(filesets, cfg.processor_instance, full=True)

[INFO    ] Working on samples: ['DYJetsToLL_2012']


Output()

Output()

KeyboardInterrupt: 

In [None]:
# List the contents of the "output" directory (the output_dir given to the runner).
os.listdir("output")

## Produce Data/MC plots

In [None]:
from coffea.util import load
from pocket_coffea.utils.plot_utils import PlotManager
# Load the output file from the "output" directory with coffea's `load`.
o = load("output/output_all.coffea")
# Display the top-level keys of the loaded output.
o.keys()

In [None]:
# Build a PlotManager from the histograms stored under o["variables"], the
# datasets metadata saved in the output, and the plotting style taken from the
# merged parameters; plots are directed to the "plots" directory.
plotter = PlotManager(
    variables=o["variables"].keys(),
    hist_objs=o["variables"],
    datasets_metadata=o['datasets_metadata'],
    plot_dir="plots",
    style_cfg=parameters['plotting_style'],
    only_cat=None,
    workers=8,
    log=False,
    density=False,
    save=True
)
# Produce the Data/MC plots for all variables.
plotter.plot_datamc_all(syst=True, spliteras=False)

## Fetch the dataset from DAS and rucio

In [None]:
# Build the dataset files from the definitions in
# "datasets/datasets_definitions.json".
build_datasets(
    "datasets/datasets_definitions.json",
    overwrite=True,
    # Optional arguments left unset in this cell. The `self.` references appear
    # to be copied from a class context and are not defined at notebook level:
    #local_prefix=self.local_prefix,
    #whitelist_sites=self.whitelist_sites,
    #blacklist_sites=self.blacklist_sites,
    #regex_sites=self.regex_sites,
    parallelize=4,
)

In [None]:
import 