In [119]:
import awkward as ak
import numpy as np
import time
import coffea
import uproot
import hist
import vector
print("awkward version ", ak.__version__)
print("coffea version ", coffea.__version__)
from coffea import util, processor
from coffea.nanoevents import NanoEventsFactory, NanoAODSchema, BaseSchema
from collections import defaultdict
import pickle
from distributed.diagnostics.plugin import UploadDirectory
import os

from tqdm.notebook import tqdm, trange
import time
import glob
import datetime

awkward version  1.10.3
coffea version  0.7.21


In [120]:
from smp_utils import *
from cms_utils import *
from jet_output_lib import *
from jet_output import *

In [121]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [122]:
# f = uproot.open("/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/260000/18D0A087-30BD-FE4E-B447-5F493C2D2794.root")
# f["Events"].keys()

In [123]:
def is_rootcompat(a):
    """Tell whether ``a`` is a flat array or a singly-jagged array of primitives.

    The decision is made purely from ``ak.type(a)``: the outer type must be an
    ``ArrayType`` whose content is either a ``PrimitiveType`` directly, or a
    ``ListType`` that wraps a ``PrimitiveType``. Everything else gives False.
    """
    array_type = ak.type(a)
    if not isinstance(array_type, ak._ext.ArrayType):
        return False

    content_type = array_type.type
    if isinstance(content_type, ak._ext.PrimitiveType):
        return True

    # Exactly one level of jaggedness is accepted, and only over primitives.
    return isinstance(content_type, ak._ext.ListType) and isinstance(
        content_type.type, ak._ext.PrimitiveType
    )


def uproot_writeable(events):
    """Reduce ``events`` to a dict of branches that uproot can write compactly.

    For every top-level field of ``events``:

    * if the field itself has sub-fields, only the sub-fields accepted by
      ``is_rootcompat`` are kept and they are zipped back together;
    * otherwise the field is kept as is.

    Every kept array has its parameters removed and is packed.
    Returns a dict keyed by the top-level field name.
    """
    writeable = {}
    for branch_name in events.fields:
        branch = events[branch_name]

        # Plain (field-less) branch: keep it directly.
        if not branch.fields:
            writeable[branch_name] = ak.packed(ak.without_parameters(branch))
            continue

        # Record-like branch: keep only the columns ROOT can represent.
        compatible_columns = {}
        for column_name in branch.fields:
            column = branch[column_name]
            if is_rootcompat(column):
                compatible_columns[column_name] = ak.packed(
                    ak.without_parameters(column)
                )
        writeable[branch_name] = ak.zip(compatible_columns)
    return writeable

The gen-lepton pT cut is 40 for ee and 29 for mm (the value of `ptcut_m` set in the next cell; this note previously said 20). The jet pT cut is 200.

In [124]:
# Minimum pt thresholds read by event_skimmer for gen-level dressed leptons:
# ptcut_e is applied in the ee selection, ptcut_m in the mm selection.
# (units presumably GeV -- TODO confirm)
ptcut_e = 40
ptcut_m = 29


In [125]:

def event_skimmer(filename, outFileLocation):
    """Skim one input file down to gen-level dilepton events and write them out.

    An event is kept when it has exactly two gen dressed electrons
    (``|pdgId| == 11``) or exactly two gen dressed muons (``|pdgId| == 13``),
    and every gen dressed lepton in the event passes ``|eta| < 2.5`` together
    with ``pt > ptcut_e`` (ee case) or ``pt > ptcut_m`` (mm case).

    Parameters
    ----------
    filename : str
        Input ROOT file, handed to ``NanoEventsFactory.from_root``.
    outFileLocation : str
        Directory the skim is written into. A trailing slash is optional.

    Returns
    -------
    None
        The skim is written to ``skimmed_mc<timestamp>.root`` inside
        ``outFileLocation`` as an ``Events`` tree.
    """
    events = NanoEventsFactory.from_root(
        filename,
        metadata={"dataset": "SomeDataset"},
    ).events()

    gen_leptons = events.GenDressedLepton
    abs_pdg_id = np.abs(gen_leptons.pdgId)
    isGenElectron = abs_pdg_id == 11
    isGenMuon = abs_pdg_id == 13

    # NOTE(review): the pt and eta requirements below are evaluated over every
    # GenDressedLepton in the event, not only the two leptons of the selected
    # flavour -- confirm this is the intended selection.
    all_in_eta = ak.all(np.abs(gen_leptons.eta) < 2.5, axis=1)

    twoGen_ee = (
        (ak.sum(isGenElectron, axis=1) == 2)
        & ak.all(gen_leptons.pt > ptcut_e, axis=1)
        & all_in_eta
    )
    twoGen_mm = (
        (ak.sum(isGenMuon, axis=1) == 2)
        & ak.all(gen_leptons.pt > ptcut_m, axis=1)
        & all_in_eta
    )

    events = events[twoGen_ee | twoGen_mm]

    # Microseconds are part of the stamp so that two files finished within the
    # same second cannot be given the same name (uproot.recreate would silently
    # overwrite the earlier skim).
    now = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")

    # os.path.join gives the same path as plain concatenation when
    # outFileLocation ends with a separator, and still lands inside the
    # directory when it does not.
    out_path = os.path.join(outFileLocation, "skimmed_mc" + now + ".root")

    with uproot.recreate(out_path) as fout:
        fout["Events"] = uproot_writeable(events)

    

In [126]:
def remove_old_files(era, binn):
    """Delete every ``*.root`` file found directly under ``folderset[era][binn]``.

    ``folderset[era][binn]`` is used as a plain string prefix for the glob
    pattern. The confirmation message is printed whether or not any file
    matched.
    """
    pattern = folderset[era][binn] + "*.root"
    for stale_path in glob.glob(pattern):
        os.remove(stale_path)
    print("Removed old files")

In [127]:
# filename = "/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/260000/18D0A087-30BD-FE4E-B447-5F493C2D2794.root"
# event_skimmer(filename)

In [128]:
# fileset = {}
# filedir = "samples/"
# prependstr = "/mnt/data/cms"
# eras_mc = [
#         'UL16NanoAODv9', 
#         'UL17NanoAODv9', 
#         'UL18NanoAODv9'
#     ]

# dy_mc_filestr = "DYJetsToLL_M-50_HT_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8_%s_files.txt"

# for era in eras_mc: 
#     filename = filedir + dy_mc_filestr % (era)
#     print(filename)
#     with open(filename) as f:
#         dy_mc_files = [prependstr + i.rstrip() for i in f.readlines() if i[0] != "#" ]
#         fileset[era] = dy_mc_files

In [129]:
# for era in eras_mc:
#     for file in tqdm(fileset[era]):
#         print(file)
#         event_skimmer(file, outFileLocation)

In [130]:
# Input catalogue: fileset[era][HT bin] -> list of input file paths.
fileset = {}
filedir = "samples/"          # directory holding the per-era / per-bin list files
prependstr = "/mnt/data/cms"  # prefix put in front of every path read from a list file

# NOTE(review): `years` is not referenced by this cell.
years = ["2016/","2017/","2018/"]

eras_mc = [
        'UL16NanoAODv9', 
        'UL17NanoAODv9', 
        'UL18NanoAODv9'
    ]

# HT bins; one "<bin>.txt" list file is read per era and bin below.
bins = [
    "100to200",
    "1200to2500",
    "200to400",
    "2500toInf",
    "400to600",
    "600to800",
    "70to100",
    "800to1200"
]

# NOTE(review): `dy_mc_filestr` is not referenced by this cell.
dy_mc_filestr = "DYJetsToLL_M-50_HT_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8_%s_files.txt"

for era in eras_mc:
    fileset[era] = {}
    for binn in bins:
        filename = filedir + era + "/"+ binn + ".txt"
        with open(filename) as f:
            # Blank lines are skipped: with the prefix added they would turn
            # into the bogus entry "/mnt/data/cms". Lines whose first
            # non-blank character is '#' are treated as comments.
            dy_mc_files = [
                prependstr + entry
                for entry in (line.strip() for line in f)
                if entry and not entry.startswith("#")
            ]
        fileset[era][binn] = dy_mc_files

In [131]:
# Per-era text file listing one dataset folder per line.
folderstr = "DYJetsToLL_M-50_HT_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8_%s_folders.txt"
# eras_mc = [

#         'UL16NanoAODv9'

#     ]
# bins = [
#     "800to1200"
# ]

# Output catalogue: folderset[era][HT bin] -> '<prefix><dataset folder>skimmed/'.
folderset = {}
for era in eras_mc:
    folderset[era] = {}
    filename = filedir+folderstr%(era)
    # The folder list is the same for every bin of an era, so read it once
    # instead of reopening the file for each bin (and each input file).
    with open(filename) as f:
        folder_lines = f.readlines()
    for i,binn in enumerate(bins):
        # Folders are matched to bins purely by position: line i of the file
        # is taken for bins[i].
        # NOTE(review): nothing checks that line i really names bin `binn`.
        folder = folder_lines[i].rstrip()
        folderset[era][binn] = prependstr+folder+'skimmed/'

In [132]:
# Display the era -> HT bin -> output folder mapping held in `folderset`.
folderset

{'UL16NanoAODv9': {'100to200': '/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/',
  '1200to2500': '/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-1200to2500_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/',
  '200to400': '/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-200to400_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/',
  '2500toInf': '/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-2500toInf_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/',
  '400to600': '/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-400to600_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/',
  '600to800': '/mnt/data/cms/sto

In [None]:
# eras_mc = [

#         'UL16NanoAODv9'

#     ]
# bins = [
#     "200to400"
# ]

# Skim every input file of every era / HT bin into that bin's output folder.
for era in eras_mc:
    for binn in bins:
        # Delete the *.root files already present in this bin's output folder
        # before writing new ones.
        remove_old_files(era,binn)
        for i,file in enumerate(tqdm(fileset[era][binn])):
            progress_line = folderset[era][binn] + "    " + str(i)
            # The line is appended to logs.txt before the skim starts, so the
            # last line of the log is the file that was being processed.
            with open("logs.txt", "a") as f:
                f.write(progress_line + "\n")
            print(progress_line)
            event_skimmer(file, folderset[era][binn] )

Removed old files


  0%|          | 0/43 [00:00<?, ?it/s]

/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/    0
/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/    1
/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/    2


## Code for creating the 'skimmed' folders 

In [48]:
# Per-era text file listing one dataset folder per line.
folderstr = "DYJetsToLL_M-50_HT_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8_%s_folders.txt"
# NOTE(review): oschmod is only referenced by the commented-out
# directory-creation lines at the bottom of this cell.
import oschmod
# NOTE(review): this rebinds the notebook-wide `eras_mc` and `bins`; any cell
# run after this one iterates over these values.
eras_mc = [

        'UL16NanoAODv9'

    ]
# bins = [
#     "800to1200"
# ]
bins = [
    "100to200",
    "1200to2500",
    "200to400",
    "2500toInf",
    "400to600",
    "600to800",
    "70to100",
    "800to1200"
]
for era in eras_mc:
    filename = filedir+folderstr%(era)
    print(filename)
    # Read the folder list once per era rather than once per bin.
    with open(filename) as f:
        folder_lines = f.readlines()
    for i,binn in enumerate(bins):
        # Line i of the folders file is taken for bins[i].
        folder = folder_lines[i].rstrip()
        print(prependstr+folder)

        # Directory creation is currently disabled; this cell only prints
        # the dataset folders.
        # directory = prependstr+folder + "/skimmed"
        # if not os.path.exists(directory):
        #     os.makedirs(directory)
        # oschmod.set_mode(directory, "a+rw")   

samples/DYJetsToLL_M-50_HT_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8_UL16NanoAODv9_folders.txt
/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/
/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-1200to2500_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/
/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-200to400_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/
/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-2500toInf_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/
/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-400to600_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/
/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-600to800_Tu

In [None]:
# Display the entry of `fileset` for era UL16NanoAODv9, bin 800to1200.
fileset['UL16NanoAODv9']['800to1200']

In [85]:
def remove_old_files(era, binn):
    """Delete every ``*.root`` file found directly under ``folderset[era][binn]``.

    The folder string is used as a plain prefix of the glob pattern, and the
    confirmation message is printed even when nothing matched.

    NOTE(review): this redefines ``remove_old_files`` from an earlier cell
    with the same behaviour; consider keeping only one definition.
    """
    root_pattern = folderset[era][binn] + "*.root"
    matches = glob.glob(root_pattern)
    for old_file in matches:
        os.remove(old_file)
    print("Removed old files")

In [80]:
# Display the glob pattern that remove_old_files builds for this era / bin.
folderset["UL16NanoAODv9"]["100to200"] + "*.root"

'/mnt/data/cms/store/mc/RunIISummer20UL16NanoAODv9/DYJetsToLL_M-50_HT-100to200_TuneCP5_PSweights_13TeV-madgraphMLM-pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v2/skimmed/*.root'