# PocketCoffea: a configuration layer for CMS analysis with Coffea

In [1]:
import os

from coffea.util import load
import pocket_coffea
from pocket_coffea.utils.dataset import build_datasets
from pocket_coffea.utils.run import FuturesRunner
from pocket_coffea.utils.configurator import Configurator
from pocket_coffea.workflows.tthbb_base_processor import ttHbbBaseProcessor
from pocket_coffea.lib.cut_functions import get_nObj_min, get_HLTsel
from pocket_coffea.parameters import defaults
from pocket_coffea.parameters.cuts import passthrough
from pocket_coffea.parameters.histograms import muon_hists
# Report the installed PocketCoffea version and the CPU count of this host,
# one item per line, so the outputs below can be traced back to the environment.
print(
    f"PocketCoffea version = {pocket_coffea.__version__}",
    f"# CPUs = {os.cpu_count()}",
    sep="\n",
)

PocketCoffea version = 0.1.0
# CPUs = 72


## Define parameters

In [9]:
# Analysis parameters: start from the PocketCoffea defaults and merge the
# analysis-specific YAML files found under ./params on top of them.
# `defaults` comes from the import cell at the top of the notebook.
localdir = os.getcwd()
default_parameters = defaults.get_default_parameters()

# YAML files merged into the defaults, in the order they are passed.
parameter_files = [
    f"{localdir}/params/object_preselection_semileptonic.yaml",
    f"{localdir}/params/triggers.yaml",
    # f"{localdir}/params/plotting_style.yaml",  # currently disabled
]
parameters = defaults.merge_parameters_from_files(
    default_parameters,
    *parameter_files,
    update=True,
)

# Dataset selection: the JSON definition files to read and the samples / years
# to keep from them.
dataset_selection = {
    "jsons": [
        f"{localdir}/datasets/backgrounds_MC_ttbar.json",
        f"{localdir}/datasets/DATA_SingleMuon.json",
    ],
    "filter": {
        "samples": ["TTToSemiLeptonic", "TTTo2L2Nu", "DATA_SingleMuon"],
        "samples_exclude": [],
        "year": ["2018"],
    },
}

# Weights listed under "common"/"inclusive"; no per-category or per-sample
# additions are configured.
event_weights = {
    "common": {
        "inclusive": [
            "genWeight", "lumi", "XS",
            "pileup",
            "sf_ele_reco", "sf_ele_id",
            "sf_mu_id", "sf_mu_iso",
        ],
        "bycategory": {},
    },
    "bysample": {},
}

# Variations: only the "pileup" weight is listed; the shape list is empty.
systematic_variations = {
    "weights": {
        "common": {
            "inclusive": ["pileup"],
            "bycategory": {},
        },
        "bysample": {},
    },
    "shape": {
        "common": {
            "inclusive": [],
        },
    },
}

# Assemble the full analysis configuration.
# NOTE(review): get_nObj_min(n, x, "Jet") presumably requires at least n jets
# above a pT threshold x — confirm against pocket_coffea.lib.cut_functions.
cfg = Configurator(
    parameters=parameters,
    datasets=dataset_selection,
    workflow=ttHbbBaseProcessor,
    skim=[
        get_nObj_min(3, 15., "Jet"),
        get_HLTsel(primaryDatasets=["SingleMuon"]),
    ],
    preselections=[get_nObj_min(4, 30., "Jet")],
    categories={"inclusive": [passthrough]},
    weights=event_weights,
    variations=systematic_variations,
    # Histograms for the "MuonGood" collection, once without `pos` and once
    # with pos=0.
    variables={
        **muon_hists(coll="MuonGood"),
        **muon_hists(coll="MuonGood", pos=0),
    },
)
# Directory that receives the saved configuration; later cells pass the same
# name to the runner.
output_dir = "output"

# NOTE(review): nfiles=1 presumably limits each dataset to a single file for a
# quick test run — confirm against Configurator.filter_dataset.
cfg.filter_dataset(nfiles=1)

# Write the configuration into output_dir.
cfg.save_config(output_dir)

Saving config file to output/config.json


In [3]:
# Options handed to the runner. Memory and disk sizes carry their unit in the
# string itself ("4GB", "1GB").
run_options = dict(
    executor="local",
    env="conda",
    workers=1,
    scaleout=32,
    worker_image="/cvmfs/unpacked.cern.ch/gitlab-registry.cern.ch/cms-analysis/general/pocketcoffea:lxplus-cc7-latest",
    queue="standard",
    walltime="02:00:00",
    mem_per_worker="4GB",
    disk_per_worker="1GB",
    exclusive=False,
    chunk=200000,
    retries=50,
    treereduction=20,
    adapt=False,
)
# Create the runner for the "local" architecture; it receives the options
# dictionary and the output_dir that the configuration was saved to.
runner = FuturesRunner(
    architecture="local",
    run_options=run_options,
    output_dir=output_dir,
    loglevel="INFO",
)

cp: ‘/t3home/mmarcheg/.x509up_u718’ and ‘/t3home/mmarcheg/.x509up_u718’ are the same file


In [15]:
# Hand-written fileset with a single DYJetsToLL entry.
# All metadata values, including "isMC" and "xsec", are kept as strings.
# NOTE(review): the visible cells never read this variable — runner.run is
# given cfg.filesets instead.
dy_metadata = {
    "sample": "DYJetsToLL",
    "year": "2012",
    "isMC": "True",
    "xsec": "6077.22",
}
filesets = {
    "DYJetsToLL_2012": {
        "metadata": dy_metadata,
        "files": [f"{localdir}/datasets/DYJetsToLL.root"],
    },
}

In [5]:
# Run the configured processor instance over the filesets held by cfg.
runner.run(cfg.filesets, cfg.processor_instance, full=True)

Output()

Output()

[0;37m[INFO    ] Working on samples: ['TTTo2L2Nu_2018', 'TTToSemiLeptonic_2018', 'DATA_SingleMuon_2018_EraA', 'DATA_SingleMuon_2018_EraB', 'DATA_SingleMuon_2018_EraC', 'DATA_SingleMuon_2018_EraD'][0m


Output()

Saving output to output/output_all.coffea


In [10]:
# List the files in the output directory. The path comes from output_dir so
# this cell follows the configured location, and the listing is sorted because
# os.listdir returns entries in arbitrary order.
sorted(os.listdir(output_dir))

['parameters_dump.yaml',
 'configurator.pkl',
 'output_all.coffea',
 'config.json']

In [8]:
# Load output_all.coffea from the output directory and show its top-level keys.
# The path is built from output_dir so it stays in sync with the directory
# passed to the runner; `load` comes from the import cell at the top.
# NOTE(review): load presumably deserialises arbitrary Python objects — only
# open files produced by a trusted run; confirm against the coffea docs.
o = load(os.path.join(output_dir, "output_all.coffea"))
o.keys()

dict_keys(['sum_genweights', 'sumw', 'cutflow', 'variables', 'columns', 'processing_metadata', 'datasets_metadata'])

## Fetch the datasets from DAS and Rucio

In [None]:
# Build the datasets from the definitions file (path relative to the current
# working directory).
# Optional keywords not passed here: local_prefix, whitelist_sites,
# blacklist_sites, regex_sites.
# NOTE(review): overwrite=True presumably replaces dataset files that already
# exist — confirm before running against curated lists.
dataset_definitions = "datasets/datasets_definitions.json"
build_datasets(dataset_definitions, overwrite=True, parallelize=4)