In [None]:
import glob
import os

import awkward as ak
import matplotlib.pyplot as plt
import mplhep as cms
import numpy as np
import pandas as pd
import uproot

import utils.plotting as plotting
import utils.tools as tools

# Global plotting setup: mplhep's "CMS" style sheet, and 7 x 7 figures by default
cms.style.use("CMS")
plt.rcParams.update({"figure.figsize": (7, 7)})

In [None]:
# Which representation of the events to start from:
#   'nano'    -> nanoAOD ROOT files
#   'parquet' -> awkward arrays
#   'hdf5'    -> pandas dataframes
inputFormat = 'nano'

sigName = "zmu"
bkgName = "zb"

rootDir = "/Volumes/home/met24_nano/"
# Expand "~" once, here: os.path.exists() does not understand "~", so readers
# that test the raw string for existence (e.g. pandas.read_hdf) would not find
# files that were written through the unexpanded path.
writeDir = os.path.expanduser("~/work/data/")

# One label per L1 MET configuration; every per-configuration list below
# follows this order.
l1METLabels = ['L1MET_PUMon_HFZSoff', 'L1MET_PUMon_HFZSon', 'L1MET_PUMoff_HFZSon']

# nanoAOD sample directories (relative to rootDir), in the order of l1METLabels
sigDirs = ["zmuMET/240307_100248/0000", "zmuMET_HFZS/240307_100338/0000", "zmuMET_noPUM_HFZS/240307_171843/0000"]
bkgDirs = ["zbMET/240307_100127/0000", "zbMET_HFZS/240307_100159/0000", "zbMET_noPUM_HFZS/240307_171019/0000"]

# glob returns matches in arbitrary order; sort so the file lists are reproducible
sigFiles = [sorted(glob.glob(os.path.join(rootDir, sampleDir, "nano_*.root"))) for sampleDir in sigDirs]
bkgFiles = [sorted(glob.glob(os.path.join(rootDir, sampleDir, "nano_*.root"))) for sampleDir in bkgDirs]

# parquet files holding the awkward arrays, one per configuration
awkSigFiles = [os.path.join(writeDir, sigName + label + ".parq") for label in l1METLabels]
awkBkgFiles = [os.path.join(writeDir, bkgName + label + ".parq") for label in l1METLabels]

# HDF5 files holding the pandas dataframes, one per configuration
sig_hdf5s = [os.path.join(writeDir, sigName + label + ".hdf5") for label in l1METLabels]
bkg_hdf5s = [os.path.join(writeDir, bkgName + label + ".hdf5") for label in l1METLabels]

# --- branch selection (passed to tools.getBranches) -------------------------
# use this for jet/eg/tau
inputs = list()
nObj = 0

# df struct
keepStruct = False

useEmu, useMP = True, False

# input sums to use (case sensitive, see branches.py)
inputSums = ['methf', 'ntt']

# --- working point -----------------------------------------------------------
# L1 MET rate (Hz) at which the thresholds are evaluated
fixedRate = 2000

# drop events with saturated MET when True
filterSaturated = False

# --- containers filled by the cells below ------------------------------------
# signal / background events, one entry per set of input files
sigs, bkgs = [], []

# the same, as pandas dataframes
sig_dfs, bkg_dfs = [], []

# L1 MET thresholds, one per set of input files
thresholds = []

In [None]:
# Load the signal and background events for every L1 MET configuration.
# The lists are rebuilt from scratch on every run: appending to the lists from
# the configuration cell would stack stale arrays in front of the new ones when
# this cell is re-run (or re-run after switching inputFormat), and the zips in
# the cells below would then silently pick up the stale entries.
sigs, bkgs = [], []

if inputFormat == 'nano':

    # get the (sum) branches to retrieve from nanoAOD
    branches = tools.getBranches(inputs, useEmu, useMP)

    # NOTE(review): the last argument is the matching parquet path; presumably
    # getArrays also caches the arrays there -- confirm in utils.tools
    sigs = [tools.getArrays(sigFile, branches, len(sigFile), awkSigFile)
            for sigFile, awkSigFile in zip(sigFiles, awkSigFiles)]
    bkgs = [tools.getArrays(bkgFile, branches, len(bkgFile), awkBkgFile)
            for bkgFile, awkBkgFile in zip(bkgFiles, awkBkgFiles)]

elif inputFormat == 'parquet':

    sigs = [ak.from_parquet(awkSigFile) for awkSigFile in awkSigFiles]
    bkgs = [ak.from_parquet(awkBkgFile) for awkBkgFile in awkBkgFiles]

# for inputFormat == 'hdf5' nothing is loaded here; the dataframes are read
# directly from the HDF5 files further down

In [None]:
def _l1METFrame(events, label):
    """Return the flattened 'methf' sum pt of `events` as a one-column DataFrame named `label`."""
    l1MET = ak.flatten(tools.getSum(events, 'methf')['EtSum_pt'])
    return pd.DataFrame(ak.to_list(l1MET), columns=[label])


# Convert the awkward arrays into flat pandas dataframes and store one HDF5
# file per sample and L1 MET configuration (the configuration label is the key).
if inputFormat in ['nano', 'parquet']:

    for sig, sig_hdf5, l1METLabel in zip(sigs, sig_hdf5s, l1METLabels):
        # get the puppiMETs
        puppiMET, puppiMETNoMu = tools.getPUPPIMET(sig)
        puppiMET_df = pd.DataFrame(ak.to_list(puppiMET['PuppiMET_pt']), columns=['PuppiMET'])
        puppiMETNoMu_df = pd.DataFrame(ak.to_list(puppiMETNoMu['PuppiMET_pt']), columns=['PuppiMETNoMu'])
        # get the l1METs
        l1MET_df = _l1METFrame(sig, l1METLabel)
        # save to dataframe
        # NOTE(review): concat(axis=1) aligns on the row index, so this assumes
        # the flattened L1 MET has exactly one entry per event -- confirm
        sig_df = pd.concat([l1MET_df, puppiMET_df, puppiMETNoMu_df], axis=1)
        # `key` is passed by keyword: positional use is deprecated in pandas 2.1
        sig_df.to_hdf(sig_hdf5, key=l1METLabel, mode='w')

    for bkg, bkg_hdf5, l1METLabel in zip(bkgs, bkg_hdf5s, l1METLabels):
        # background only needs the L1 MET
        _l1METFrame(bkg, l1METLabel).to_hdf(bkg_hdf5, key=l1METLabel, mode='w')

    
# Read the dataframes back from HDF5, one per L1 MET configuration.
# The lists are rebuilt (not appended to) so re-running this cell is idempotent.
sig_dfs = [pd.read_hdf(sig_hdf5, key=l1METLabel) for sig_hdf5, l1METLabel in zip(sig_hdf5s, l1METLabels)]
bkg_dfs = [pd.read_hdf(bkg_hdf5, key=l1METLabel) for bkg_hdf5, l1METLabel in zip(bkg_hdf5s, l1METLabels)]


# filter out events with large/saturated L1 MET from signal for training
# Each dataframe is filtered on its own L1 MET column and the filtered frames
# replace the entries of sig_dfs (rebinding a loop variable would leave the
# list untouched).
if filterSaturated:
    sig_dfs = [sig_df[sig_df[l1METLabel] < 1000] for sig_df, l1METLabel in zip(sig_dfs, l1METLabels)]

In [None]:
# MET distributions of the signal samples: the PuppiMET and PuppiMETNoMu
# columns and the L1 MET column of every configuration, on one set of axes

metHistOpts = dict(bins=100, range=[0, 200], histtype='step')

for l1METLabel, sig_df in zip(l1METLabels, sig_dfs):
    # log=True on this call puts the shared y-axis on a log scale
    plt.hist(sig_df['PuppiMET'], log=True, label="PUPPI MET", **metHistOpts)
    plt.hist(sig_df['PuppiMETNoMu'], label="PUPPI MET NoMu", **metHistOpts)
    plt.hist(sig_df[l1METLabel], label=l1METLabel, **metHistOpts)

plt.legend(fontsize=16)

In [None]:
# MET resolution: per-event difference between the L1 MET and PuppiMETNoMu,
# one histogram per configuration
for l1METLabel, sig_df in zip(l1METLabels, sig_dfs):
    metDiff = sig_df[l1METLabel] - sig_df['PuppiMETNoMu']
    plt.hist(metDiff, bins=80, range=[-100, 100], label=f"{l1METLabel} Diff")

plt.legend()

In [None]:
# rate plots must be in bins of GeV
# (named rateRange so the builtin range() is not shadowed for the rest of the
# kernel session; as many bins as the upper edge gives 1 GeV wide bins)
rateRange = [0, 200]
bins = rateRange[1]

# start from an empty list so re-running this cell does not leave stale
# thresholds in front of the new ones
thresholds = []

for bkg_df, l1METLabel in zip(bkg_dfs, l1METLabels):
    l1MET = bkg_df[l1METLabel]
    # per-event weight that turns event counts into a rate
    # NOTE(review): presumably 40 MHz scaled by the fraction 2452/3564 and
    # normalised to the number of background events -- confirm the constants
    rateScale = 40000000*(2452/3564)/len(l1MET)
    # cumulative=-1 accumulates from the high edge downwards, so each bin holds
    # the weighted count of events at or above that bin
    rateHist = plt.hist(l1MET, bins=bins, range=rateRange, histtype='step', label=l1METLabel, cumulative=-1, log=True, weights=np.full(len(l1MET), rateScale))
    # rateHist[0] holds the bin contents
    thresholds.append(plotting.getThreshForRate(rateHist[0], bins, fixedRate))

plt.legend()

In [None]:
# MET efficiency of each configuration at its threshold from the rate cell,
# evaluated against PuppiMETNoMu
refLineStyle = dict(linestyle='--', color='black')

for l1METLabel, threshold, sig_df in zip(l1METLabels, thresholds, sig_dfs):
    l1MET, refMET = sig_df[l1METLabel], sig_df['PuppiMETNoMu']
    # NOTE(review): meaning of the trailing 10 and 500 is defined in
    # utils.plotting.efficiency -- confirm there
    eff_data, xvals = plotting.efficiency(l1MET, refMET, threshold, 10, 500)
    plt.scatter(xvals, eff_data, label=" > ".join([l1METLabel, str(threshold)]))

# dashed reference line at 0.95
plt.axhline(0.95, **refLineStyle)
plt.legend(fontsize=12)