In [None]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq

import os
import matplotlib.pyplot as plt
# from coffea.nanoevents import NanoEventsFactory, BaseSchema
# import uproot
# import awkward as ak
# import hist
# from plotting import * 
from tqdm import tqdm
# from agc_schema import AGCSchema

# import mplhep as hep
# plt.style.use([hep.style.CMS])

In [None]:
# Map from a process label to the list of sample names that make it up.
# get_paths() joins each sample name onto a base directory, so every entry is
# expected to match a directory name on disk.
pmap = {}

# QCD multijet, binned in HT.
pmap['QCD'] = [
    'QCD_HT-100to200', 'QCD_HT-200to400', 'QCD_HT-400to600',
    'QCD_HT-600to800', 'QCD_HT-800to1000', 'QCD_HT-1000to1200',
    'QCD_HT-1200to1500', 'QCD_HT-1500to2000', 'QCD_HT-2000'
    ]

pmap['Zjets_had_1j'] = [
    'Zto2Q-2Jets_PTQQ-100to200_1J', 'Zto2Q-2Jets_PTQQ-200to400_1J',
    'Zto2Q-2Jets_PTQQ-400to600_1J', 'Zto2Q-2Jets_PTQQ-600_1J'
    ]

pmap['Zjets_had_2j'] = [
    'Zto2Q-2Jets_PTQQ-100to200_2J', 'Zto2Q-2Jets_PTQQ-200to400_2J',
    'Zto2Q-2Jets_PTQQ-400to600_2J', 'Zto2Q-2Jets_PTQQ-600_2J'
    ]

# The W 1-jet list must use the 'Wto2Q' samples (same naming pattern as
# 'Wjets_had_2j'); listing 'Zto2Q' names here would alias the Z 1-jet samples.
# NOTE(review): confirm these directory names exist on disk.
pmap['Wjets_had_1j'] = [
    'Wto2Q-2Jets_PTQQ-100to200_1J', 'Wto2Q-2Jets_PTQQ-200to400_1J',
    'Wto2Q-2Jets_PTQQ-400to600_1J', 'Wto2Q-2Jets_PTQQ-600_1J'
    ]

pmap['Wjets_had_2j'] = [
    'Wto2Q-2Jets_PTQQ-100to200_2J', 'Wto2Q-2Jets_PTQQ-200to400_2J',
    'Wto2Q-2Jets_PTQQ-400to600_2J', 'Wto2Q-2Jets_PTQQ-600_2J'
    ]

# Every H->bb sample in one list (same samples as ggF + VBF + VH + ttH below).
pmap['Hbb_all'] = [
    'ggZH_Hto2B_Zto2L_M-125', 'ggZH_Hto2B_Zto2Nu_M-125', 'ggZH_Hto2B_Zto2Q_M-125',
    'GluGluHto2B_M-125', 'ttHto2B_M-125', 'VBFHto2B_M-125',
    'WminusH_Hto2B_Wto2Q_M-125', 'WminusH_Hto2B_WtoLNu_M-125',
    'WplusH_Hto2B_Wto2Q_M-125', 'WplusH_Hto2B_WtoLNu_M-125',
    'ZH_Hto2B_Zto2L_M-125', 'ZH_Hto2B_Zto2Nu_M-125', 'ZH_Hto2B_Zto2Q_M-125'
    ]

pmap['VV'] = ['WW', 'WZ', 'ZZ']

# H->bb split by the production-mode label used in the sample names.
pmap['ggF'] = ['GluGluHto2B_M-125']

pmap['VBF'] = ['VBFHto2B_M-125']

pmap['WH'] = ['WminusH_Hto2B_Wto2Q_M-125', 'WplusH_Hto2B_Wto2Q_M-125',
              'WminusH_Hto2B_WtoLNu_M-125', 'WplusH_Hto2B_WtoLNu_M-125']

pmap['ZH'] = ['ZH_Hto2B_Zto2Q_M-125', 'ZH_Hto2B_Zto2L_M-125', 'ZH_Hto2B_Zto2Nu_M-125',
              'ggZH_Hto2B_Zto2Q_M-125', 'ggZH_Hto2B_Zto2L_M-125', 'ggZH_Hto2B_Zto2Nu_M-125']

pmap['VH'] = pmap['WH'] + pmap['ZH']

pmap['ttH'] = ['ttHto2B_M-125']

pmap['ttbar'] = ['TTto4Q', 'TTto2L2Nu', 'TTtoLNu2Q']

pmap['Jet_data'] = ['JetMET_Run2023Cv1', 'JetMET_Run2023Cv2',
                    'JetMET_Run2023Cv3', 'JetMET_Run2023Cv4',]

# Combined background: each W/Z jet-multiplicity list appears exactly once.
# NOTE(review): 'Zjets_had_2j' is included for symmetry with the W lists;
# drop it again if its omission was deliberate.
pmap['background'] = (
    pmap['QCD']
    + pmap['Wjets_had_2j'] + pmap['Wjets_had_1j']
    + pmap['Zjets_had_1j'] + pmap['Zjets_had_2j']
    + pmap['VV'] + pmap['ttbar']
)

print(pmap['VH'])

In [None]:
def tuple_to_bytes(data_tuple):
    """Serialise a ``(str, int)`` pair into a single ``bytes`` value.

    The string is UTF-8 encoded and followed by the integer written as a
    4-byte big-endian unsigned value.

    Raises
    ------
    OverflowError
        If the integer is negative or does not fit in 4 bytes.
    """
    text, number = data_tuple
    # Fixed 4-byte width; widen here if larger integers are needed.
    return text.encode('utf-8') + number.to_bytes(4, byteorder='big')

def get_paths(data_path, proc = 'QCD', deep=False):
    """Return the parquet locations for every sample of a process.

    Parameters
    ----------
    data_path : str
        Base directory containing one sub-directory per sample.
    proc : str
        Key into the module-level ``pmap`` selecting the samples.
    deep : bool
        If False (default), return the ``<data_path>/<sample>/parquet``
        directories themselves. If True, return every entry found inside
        those directories (as listed by ``os.listdir``).

    Returns
    -------
    list of str
        Always a plain list, also for ``deep=True`` and for a process with
        no samples (empty list).

    Raises
    ------
    KeyError
        If ``proc`` is not a key of ``pmap``.
    OSError
        With ``deep=True``, if a parquet directory cannot be listed.
    """
    parquet_parents = [os.path.join(data_path, sample, 'parquet') for sample in pmap[proc]]

    if not deep:
        return parquet_parents

    # Flatten the contents of all parquet directories into one list.
    return [
        os.path.join(parent, entry)
        for parent in parquet_parents
        for entry in os.listdir(parent)
    ]

def get_proc_field(file_path, field = 'GenVMass'):
    """Load one field from a parquet file.

    The whole file is read with ``pd.read_parquet`` and the requested field is
    then selected from the resulting frame; no column projection is done at
    read time.

    Parameters
    ----------
    file_path : str
        Path passed to ``pd.read_parquet``.
    field : str
        Column label to select.

    Returns
    -------
    pandas.Series or pandas.DataFrame
        ``events[field]``. NOTE(review): with multi-level columns this is
        presumably a frame holding all sub-columns of ``field`` -- confirm
        against the actual file schema.

    Raises
    ------
    KeyError
        If ``field`` is not among the columns of the file.
    """
    events = pd.read_parquet(file_path)

    # Fail with a clear message instead of falling through to an unbound local.
    if field not in events.columns:
        raise KeyError(f"Field {field} not found in {file_path}")

    return events[field]
    

def fill_hist(process, field, full=True, data_dir='/uscms/home/bweiss/nobackup/hbb/2023'):
    """Collect one field from every parquet file of a process.

    Parameters
    ----------
    process : str
        Process label, forwarded to ``get_paths``.
    field : str
        Field name, forwarded to ``get_proc_field`` for every file.
    full : bool
        Currently unused; kept so existing calls keep working.
    data_dir : str
        Folder containing all samples. Defaults to the original hard-coded
        location.

    Returns
    -------
    numpy.ndarray or None
        The per-file values concatenated along axis 0, in directory order then
        ``os.listdir`` order. Always a NumPy array, also when only a single
        file is found, so callers can index it uniformly. ``None`` if no files
        are found at all.
    """
    dirs = get_paths(data_dir, process)

    # Gather per-file arrays and concatenate once at the end, rather than
    # re-copying the growing array for every file.
    chunks = []
    for d in tqdm(dirs, desc="Processing "+str(process)+' '+str(field)):
        for file in os.listdir(d):
            path = os.path.join(d, file)
            chunks.append(get_proc_field(path, field).values)

    if not chunks:
        return None
    return np.concatenate(chunks, axis=0)

# print(get_dirs(data_dir, proc, field = 'Zjets_had_1j'))

In [None]:
# Base directory that holds one sub-directory per sample.
path = '/uscms/home/bweiss/nobackup/hbb/2023'
# Per-sample parquet directories for the QCD process.
dpaths = get_paths(path, 'QCD')

# NOTE: an earlier experiment in this cell tried to read only the columns of a
# single field (via pq.read_schema and the `columns=` argument of
# pd.read_parquet); it is not used, so whole files are read by fill_hist.

# Collect the 'ak8FatJetPt' field from every QCD parquet file.
x = fill_hist('QCD', 'ak8FatJetPt')

In [None]:
# Quick sanity check of the array loaded in the previous cell: shape first.
print(x.shape)

# Bin edges for the histogram call that is currently disabled below.
bins = np.linspace(300, 2000, num=10)
# plt.hist(x[x>-999], bins=bins)

# First 20 values passing the x > -999 selection
# (presumably -999 marks padded / missing entries -- TODO confirm).
print(x[x > -999][:20])
# print(x[:20,:])

In [None]:
# Print the fields available for a given process, together with the number of
# columns that belong to each field.

proc = 'ggF'
main_dir = '/uscms/home/bweiss/nobackup/hbb/2023'
g = get_paths(main_dir, proc)
# Only the first sample of the process is inspected.
df = pd.read_parquet(g[0])

# c[0] takes the first element of each column label.
# NOTE(review): assumes column labels are tuples like (field, index) -- confirm.
# np.unique returns the sorted distinct names and their counts in one pass.
fields, sizes = np.unique(np.array([c[0] for c in df.columns]), return_counts=True)

# Widen the pandas display limits so the full table is printed.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)


print(pd.DataFrame({'Field':fields , 'Size':sizes}))


In [None]:
# Processes to overlay in every panel; each must be a key of `pmap`.
procs = [
        'ggF', 'VBF', 'VH',
        # 'QCD'
        ]

# One panel per entry (2x2 grid, filled in flat order).
# NOTE(review): 'ak8FatJetParTPXcc' is listed twice, so the first two panels
# show the same variable -- confirm which field was intended for the first slot.
fields = [
        # 'GenbPt', 'GenbEta', 'GenbPhi'
        'ak8FatJetParTPXcc', 'ak8FatJetParTPXcc','ak8FatJetParTPXgg', 'ak8FatJetParTPXqq'
    ]
# y-axis scale per panel, applied to fields that match neither 'Pt' nor 'soft'.
scale= ['log','log','log','log']
# One colour per process.
colors = ['red', 'blue', 'green', 'black']

fig, ax = plt.subplots(2,2, figsize = (12,12))

ax = ax.flat

for j, proc in enumerate(procs):
    # Each fill_hist call re-reads every parquet file of the process, so keep
    # what was already loaded for this process and reuse it for repeated fields.
    loaded = {}
    for i, field in enumerate(fields):
        if field not in loaded:
            loaded[field] = fill_hist(proc,field)[:,:2]
        var = loaded[field]

        # Pick binning and y-scale from the field name.
        if 'Pt' in field:
            bins = np.linspace(200,1000, 10)
            yscale = 'log'
        elif 'soft' in field:
            bins = np.linspace(0,300, 10)
            yscale = None
        else:
            bins = np.linspace(0,1,10)
            yscale = scale[i]

        # Only the first column (the "leading" entry) is plotted.
        ax[i].hist(var[:,0], histtype='step', density=True, bins=bins,
                   label = proc, color = colors[j], linewidth=2)
        ax[i].set(xlabel = 'Leading '+field)
        if yscale:
            ax[i].set(yscale=yscale)

# Build one figure-level legend: every panel carries the same per-process
# entries, so the first len(procs) handles are enough.
nl = len(procs)
lines_labels = [a.get_legend_handles_labels() for a in fig.axes]
lines, labels = [sum(lol, []) for lol in zip(*lines_labels)]
lgd = fig.legend(lines[:nl], labels[:nl],
                 # bbox_to_anchor=(1.05, 0.5),
                 loc='upper right',
                 ncol=3
                )
fig.supylabel('Events (Normalized)')
fig.suptitle('Input Variable Scan', ha='left', x=0.01)

# Single layout pass once all artists exist.
plt.tight_layout()
