In [None]:
import numpy as np
# import awkward as ak
# import dask
import json
# from coffea import processor
# from coffea.analysis_tools import Weights, PackedSelection

import pandas as pd
import pyarrow.parquet as pq
from tqdm.auto import tqdm
import os
import xgboost as xgb
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
import hist
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

le = LabelEncoder()

import mplhep as hep
plt.style.use([hep.style.CMS])

In [None]:
# Alternative sample map:
# pmap = json.loads(Path('samples_noQCD2000.json').read_text())

# pmap maps a process name to the list of dataset names that belong to it
# (it is indexed as pmap[proc] by get_paths).
pmap = json.loads(Path('samples.json').read_text())

print(pmap.keys())
    

In [None]:
# '''processing functions'''
'''Cut definitions'''
def minjetkin(df):
    """Keep only the events whose leading fat jet passes the kinematic cuts.

    The selection requires, on the ``FatJet0_*`` columns of ``df``:

    * ``450 <= FatJet0_pt < 1200``
    * ``40 <= FatJet0_msd < 201``
    * ``|FatJet0_eta| < 2.5``

    The mask is built directly on ``df`` so that it always has one entry per
    row of the frame it is applied to. (A looser pre-selection of pt > 200 and
    |eta| < 2.5 is implied by the cuts above, so it is not applied separately.)

    :param df: DataFrame with ``FatJet0_pt``, ``FatJet0_msd`` and
        ``FatJet0_eta`` columns.
    :return: A new DataFrame holding the selected rows of ``df``; the original
        index labels are preserved and ``df`` itself is not modified.
    """
    fatjet_pt = df['FatJet0_pt'].to_numpy()
    fatjet_msd = df['FatJet0_msd'].to_numpy()
    fatjet_eta = df['FatJet0_eta'].to_numpy()

    # 1-D boolean mask, one entry per row of df.
    passes = (
        (fatjet_pt >= 450)
        & (fatjet_pt < 1200)
        & (fatjet_msd >= 40.)
        & (fatjet_msd < 201.)
        & (np.abs(fatjet_eta) < 2.5)
    )
    return df[passes]
    
def get_paths(year, data_path, proc = 'QCD', deep=False):
    """Return the parquet locations for every dataset of a process.

    For each dataset name ``p`` in the module-level ``pmap[proc]`` the parquet
    directory is ``<data_path>/<year>/<p>/parquet/signal-all``.

    :param year: Year sub-directory name (a string, e.g. ``'2023'``).
    :param data_path: Root directory that contains the per-year directories.
    :param proc: Key into ``pmap`` selecting the process.
    :param deep: If False, return the per-dataset parquet directories.
        If True, return the path of every entry found inside them.
    :return: A list of path strings. The result is always a list, also when
        ``deep`` is True (empty if the process has no datasets).
    :raises KeyError: If ``proc`` is not a key of ``pmap``.
    """
    parquet_parents = [
        os.path.join(data_path, year, p, 'parquet', 'signal-all')
        for p in pmap[proc]
    ]
    if not deep:
        return parquet_parents

    # Flatten the directory listings in dataset order.
    return [
        os.path.join(parent, entry)
        for parent in parquet_parents
        for entry in os.listdir(parent)
    ]

def mode2category(mode):
    """One-hot encode a mode name against the order ['ggF', 'VBF', 'VH'].

    :param mode: One of ``'ggF'``, ``'VBF'`` or ``'VH'``.
    :return: Integer numpy array of length 3 with a 1 at the position of
        ``mode`` and 0 elsewhere.
    :raises ValueError: If ``mode`` is not one of the known names.
    """
    cats = np.array(['ggF', 'VBF', 'VH'])
    if mode in cats:
        return (cats == mode).astype(int)
    raise ValueError(f'Decay mode {mode} not in {cats}')


# Quick sanity check of the encoding.
print(mode2category('VBF'))
        
    
    
def process_single(df,
                   cuts=False,
                   save_fields=None,
                   signal=False,
                   category='QCD', #category order: ['ggF', 'VBF', 'VH']
               ):
    """Select columns of one input frame and tag the events with labels.

    :param df: Input DataFrame; it is never modified.
    :param cuts: If True, apply ``minjetkin`` before selecting columns.
    :param save_fields: Column names to keep. Defaults to
        ``['weight', 'FatJet0_pt']``.
    :param signal: If truthy, events get ``isSignal = 1`` and ``category`` set
        to this value; otherwise ``isSignal = 0`` and ``category = 'QCD'``.
    :param category: Currently unused; kept so existing calls keep working.
    :return: A new DataFrame with the selected columns plus ``isSignal`` and
        ``category``.
    """
    if save_fields is None:
        save_fields = ['weight', 'FatJet0_pt']
    elif isinstance(save_fields, str):
        save_fields = [save_fields]

    selected = minjetkin(df) if cuts else df
        #add more cuts here

    # Explicit copy: the label columns below must be written to an independent
    # frame, not to a slice of `selected`.
    X = selected[list(save_fields)].copy()

    # Use the row count directly so 'weight' does not have to be saved.
    n_events = len(X)
    if signal:
        X['isSignal'] = np.ones(n_events, dtype=int)
        X['category'] = [signal] * n_events
    else:
        X['isSignal'] = np.zeros(n_events, dtype=int)
        X['category'] = ['QCD'] * n_events
    return X

def get_sum_genweights(data_dir: Path, dataset: str) -> float:
    """
    Get the sum of genweights for a given dataset.

    Every ``*.pkl`` file in ``<data_dir>/<dataset>/pickles`` is loaded. Each
    file is expected to hold a dict whose values contain a ``"sumw"`` dict;
    the first value of that ``"sumw"`` dict is added to the total for every
    top-level key.

    If no pickle file exists, or any file cannot be read or has an unexpected
    layout, a warning is issued and 1 is returned so that callers dividing by
    the result leave the weights unchanged.

    :param data_dir: The directory where the datasets are stored.
    :param dataset: The name of the dataset to get the genweights for.
    :return: The sum of genweights for the dataset, or 1 on failure.
    """
    # Imported here so the function does not depend on another cell having
    # imported the module first.
    import warnings

    pickle_dir = Path(data_dir) / dataset / "pickles"
    total_sumw = 0
    try:
        pickle_files = sorted(pickle_dir.glob("*.pkl"))
        if not pickle_files:
            raise FileNotFoundError(f"no pickle files found in {pickle_dir}")
        for pickle_file in pickle_files:
            # NOTE: pickle.load can execute arbitrary code; only point this at
            # trusted files.
            with pickle_file.open("rb") as file:
                out_dict = pickle.load(file)
            # Accumulate per key so an empty dict cannot re-use a stale value
            # from a previously read file.
            for key in out_dict:
                total_sumw += next(iter(out_dict[key]["sumw"].values()))
    except Exception as err:  # deliberate best-effort fallback
        # Printed as well as warned: warnings may be filtered out globally.
        print(f"Could not load genweights for {dataset}: {err!r}")
        warnings.warn(
            f"Error loading genweights for dataset: {dataset}. Skipping.",
            category=UserWarning,
            stacklevel=2,
        )
        total_sumw = 1

    # print(f"Total sum of weights for all pickles for {dataset}: {total_sumw}")
    return total_sumw

def accumulator(proc, isSignal=False, shallow=False, path=None): #perform data accumulation for a particular process
    """Read, cut, label and reweight all parquet files of one process.

    For every subset (dataset directory) of the process, each parquet file is
    read, passed through ``process_single`` with the kinematic cuts enabled,
    and the results are concatenated. Each subset then receives:

    * ``sumW``         - sum of generator weights from ``get_sum_genweights``
    * ``weight_final`` - ``abs(weight) / sumW``
    * ``MC_name``      - the dataset name, taken as the name of the directory
      two levels above the subset path

    :param proc: Process name; used to look up the subsets via ``get_paths``
        when ``path`` is None, and in the progress-bar label.
    :param isSignal: Forwarded to ``process_single`` as ``signal``.
    :param shallow: If falsy, all files are used. Otherwise at most
        ``int(shallow)`` files per subset are read (``True`` means one file).
    :param path: Optional file or directory to read instead of the default
        location. A directory is expanded to its entries.
    :return: One DataFrame with all selected events, or None if nothing was
        read.
    """
    if path is None:
        data_dir = '/uscms/home/bweiss/nobackup/hbb/'
        dirs = get_paths('2023', data_dir, proc)
    elif os.path.isfile(path):
        dirs = [path]
    else:
        # os.listdir returns bare names; join them so the checks below do not
        # depend on the current working directory.
        dirs = [os.path.join(path, entry) for entry in os.listdir(path)]

    excluded_cols = ['MET', #'FatJet0_pt', 'FatJet0_msd', 'FatJet0_pnetMass', 'FatJet0_pnetTXbb'
                    ]

    per_subset = []
    for d in tqdm(dirs, desc="Processing "+str(proc)): #runs through subsets of a process
        if os.path.isfile(d):
            file_paths = [d]
        else:
            file_paths = [os.path.join(d, file) for file in os.listdir(d)]
        if shallow:
            file_paths = file_paths[:int(shallow)]

        frames = []
        for file_path in file_paths: #runs through files in subset
            df = pd.read_parquet(file_path)
            save_cols = [c for c in df.columns
                         if (c not in excluded_cols) and ('Gen' not in c)]
            #apply cuts, save select columns, add isSignal column
            frames.append(process_single(df, cuts=True,
                                         save_fields=save_cols,
                                         signal=isSignal))
        if not frames:
            # Nothing to read in this subset; skip it instead of failing.
            continue
        dataset = pd.concat(frames, axis=0, ignore_index=True)

        #reweight events by the sum of weights in the MC dataset
        # NOTE(review): assumes d looks like <dataset>/parquet/signal-all, as
        # built by get_paths - confirm for paths passed in via `path`.
        this_dataset = Path(d).parent.parent.name
        print(this_dataset)
        sumW = get_sum_genweights(Path('/uscms/home/bweiss/nobackup/hbb/2023'), this_dataset)
        dataset['sumW'] = np.ones_like(dataset['weight'])*sumW
        dataset['weight_final'] = abs(dataset['weight'])/sumW
        print(f'sum of all weights in {d} is {sumW}')
        dataset['MC_name'] = this_dataset
        per_subset.append(dataset)

    if not per_subset:
        return None
    # Single concatenation instead of growing the frame inside the loop.
    return pd.concat(per_subset, axis=0, ignore_index=True)

In [None]:
# Load the trained classifier from its saved JSON file.
model = xgb.XGBClassifier()
model.load_model('MultiClassBDT_23Oct25.json')

# Columns dropped from the frame before it is passed to model.predict_proba.
omit_cols = (
    ['isSignal', 'weight', 'weight_final']
    + ['category', 'FatJet0_pt', 'FatJet0_msd', 'FatJet0_msdmatched']
    + ['MC_name', 'sumW', 'genWeight']
    + ['FatJet0_pnetMass', 'FatJet0_pnetTXbb', 'FatJet0_pnetTXgg']
    + ['FatJet0_pnetTXcc', 'FatJet0_pnetTXqq', 'FatJet0_pnetXbbXcc', 'FatJet0_pnetTQCD']
)

In [None]:
''' Accumulate data, BDT predict it, store BDT score and some other vars'''
import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

MC_cats = ['VBF', 'VH', 'ggF', 'ttH']
MC_cats += ['QCD','Wto2Q','Zto2Q','VV','TopFS']

# samples = ['QCD']
# isSignal = ['QCD']

# Make sure the output directory exists before the first file is written.
out_dir = './BDT_predictions'
os.makedirs(out_dir, exist_ok=True)

for s in MC_cats:
    # The category name is passed as isSignal, so every sample is labelled with
    # its own name in the 'category' column.
    X = accumulator(s, isSignal=s, shallow=False)
    # Only the 20% split is evaluated.
    # NOTE(review): presumably this reproduces the training-time split
    # (random_state=42) - confirm against the training notebook.
    _, X_test = train_test_split(X, test_size=0.2, random_state=42, shuffle=True, #stratify = X['isSignal']
                                 )
    # reset_index returns a new frame; drop=True avoids adding an 'index'
    # column to the frame.
    X_test = X_test.reset_index(drop=True)
    Y_predict = model.predict_proba(X_test.drop(omit_cols, axis=1))
    print(Y_predict)
    # Store the most probable class together with the bookkeeping columns.
    Y = pd.DataFrame({
        'BDT_cat': np.argmax(Y_predict, axis=1),
        'MC_cat': s,
        'MC_name': X_test['MC_name'].values,
        'weight_final': X_test['weight_final'].values,
        'FatJet0_msd': X_test['FatJet0_msd'].values,
    })
    Y.to_parquet(os.path.join(out_dir, f'{s}.parquet'))

# Y and X_test hold the results of the last category processed.
print(Y)
print(X_test['MC_name'])
    



In [None]:
# Show the first 20 rows of the most recent prediction table.
print(Y.head(20))

In [None]:

def get_yield(y, BDT_cat = 'VH', MC_cat = 'TopFS'):
    """Compute the weighted yield and event count of one (BDT, MC) category.

    :param y: DataFrame with ``BDT_cat`` (class index), ``MC_cat`` (sample
        category name) and ``weight_final`` columns.
    :param BDT_cat: Predicted class name: ``'VBF'``, ``'VH'`` or ``'ggF'``.
    :param MC_cat: Sample category name to select.
    :return: Tuple ``(sum of weight_final, number of events)`` for the rows
        whose predicted class is ``BDT_cat`` and whose ``MC_cat`` matches.
    :raises KeyError: If ``BDT_cat`` is not a known class name.
    """
    class_index = {'VBF':0, 'VH':1, 'ggF':2}[BDT_cat]

    selected = (y['BDT_cat'] == class_index) & (y['MC_cat'] == MC_cat)

    weighted_yield = np.sum(y.loc[selected, 'weight_final'])
    n_events = np.sum(selected)
    return weighted_yield, n_events

# print(get_yield(Y, BDT_cat = 'VH', MC_cat = 'TopFS'))
MC_cats = ['VBF', 'VH', 'ggF', 'ttH']
MC_cats += ['QCD','Wto2Q','Zto2Q','VV','TopFS']

# BDT class name -> class index. Defined at module level because the mapping
# inside get_yield is local to that function; the two must stay identical.
BDT_cat_dict = {'VBF':0, 'VH':1, 'ggF':2}

# Build a table with one row per BDT class and one column per MC category,
# filled with the weighted yields read back from the prediction files.
pred_path = './BDT_predictions'
yield_table = pd.DataFrame(index = list(BDT_cat_dict))
for df_file in sorted(os.listdir(pred_path)):
    # Skip anything that is not a prediction file.
    if not df_file.endswith('.parquet'):
        continue
    df = pd.read_parquet(os.path.join(pred_path, df_file))
    if df.empty:
        continue
    # Each file holds a single MC category; read it positionally from the
    # first row.
    MC_cat = df['MC_cat'].iloc[0]
    print(MC_cat)
    yield_table[MC_cat] = [
        get_yield(df, BDT_cat = cat, MC_cat = MC_cat)[0]
        for cat in BDT_cat_dict
    ]
print(yield_table)
yield_table.to_csv('yield_table.csv')
    # cat_index = le.transform([category])
        
    # pred_mask = X['CB_cat'] == category
    # X_pred = X[pred_mask]
    # use_weights = int(use_weights)
    # yields = np.zeros((4,2))
    # bottom = 0
    # for cat in range(4):
    #     truecat = le.inverse_transform([cat])[0]
    #     cat_mask = X_pred['true_cat'] == truecat
    #     yields[cat][:] = [np.sum(cat_mask), 
    #                       np.sum(X_pred['weight_final'][cat_mask].values, axis = 0)]
    # sumw = np.sum(yields[:,1])
    # yields[:,1] = yields[:,1]/sumw
    # for c in range(4):
    # print("----------------------------------------")
    # print(f'Predicted {category} purity: {yields[cat_index,1]})')
    # print(f'contains {yields[0][0]} VBF, {yields[1][0]} VH, {yields[2][0]} ggF')
    # print(f'with yields: {yields[0][1]} VBF, {yields[1][1]} VH, {yields[2][1]} ggF')

# fig, ax = plt.subplots(1,1)
# cutBased_purity(ax, CutBased_events, category = 'ggF', use_weights=True)

# fig, axes  = plt.subplots(1,3, figsize = (10,6))
# categories = ['VBF', 'VH', 'ggF', 'ttH']
# for i in range(3):
#     a = cutBased_purity(axes[i], CutBased_events, use_weights = True, category = categories[i])
# fig.legend(categories, ncol=4, loc = 'lower center', bbox_to_anchor=(0.5, -0.07))
# fig.suptitle('Cut based purity', x=0.5, y=0.92)
# fig.tight_layout()