In [None]:
import numpy as np
# import awkward as ak
# import dask
import json
# from coffea import processor
# from coffea.analysis_tools import Weights, PackedSelection

import pandas as pd
import pyarrow.parquet as pq
from tqdm.auto import tqdm
import os
import xgboost as xgb
import matplotlib.pyplot as plt
from pathlib import Path
import pickle
import hist


# import mplhep as hep
# plt.style.use([hep.style.CMS])

In [None]:
# with open('samples_noQCD2000.json', 'r') as file:
# Process name -> list of sample directory names; get_paths looks processes up in this mapping.
with open('samples.json', 'r') as samples_file:
    pmap = json.load(samples_file)

In [None]:
# '''processing functions'''
'''Cut definitions'''
def minjetkin(df):
    """Keep only the events whose leading fat jet passes the minimum kinematic cuts.

    Required columns: ``FatJet0_pt``, ``FatJet0_msd`` and ``FatJet0_eta``.

    An event is kept when all of the following hold:
      * 450 <= FatJet0_pt < 1200
      * 40 <= FatJet0_msd < 201
      * |FatJet0_eta| < 2.5

    :param df: DataFrame with one row per event.
    :return: The rows of ``df`` passing every cut (all columns kept, original index kept).
    """
    fatjet_pt = df['FatJet0_pt']
    fatjet_msd = df['FatJet0_msd']
    fatjet_abs_eta = df['FatJet0_eta'].abs()

    # The mask is built on `df` itself so that it always has one entry per row of `df`.
    # The earlier loose preselection (pt > 200, |eta| < 2.5) is implied by the cuts
    # below, so it does not need to be applied separately.
    passes_cuts = (
        (fatjet_pt >= 450)
        & (fatjet_pt < 1200)
        & (fatjet_msd >= 40.)
        & (fatjet_msd < 201.)
        & (fatjet_abs_eta < 2.5)
    )

    # Positional 1-D boolean mask: no index alignment is involved.
    return df[passes_cuts.to_numpy()]
    
def get_paths(year, data_path, proc = 'QCD', deep=False):
    """Build the parquet locations for every sample of a process.

    For each sample name listed under ``pmap[proc]`` the directory
    ``<data_path>/<year>/<sample>/parquet/signal-all`` is formed.

    :param year: Year sub-directory name (e.g. ``'2023'``).
    :param data_path: Root directory holding the per-year sample folders.
    :param proc: Key into the module-level ``pmap`` mapping.
    :param deep: If False, return the ``signal-all`` directories themselves;
        if True, return the path of every entry inside those directories.
    :return: A list of path strings (always a list, possibly empty).
    """
    parquet_parents = [
        os.path.join(data_path, year, p, 'parquet', 'signal-all') for p in pmap[proc]
    ]
    if not deep:
        return parquet_parents

    # Sorted so the file order does not depend on the filesystem's listing order.
    return [
        os.path.join(parent, file)
        for parent in parquet_parents
        for file in sorted(os.listdir(parent))
    ]
    
def process_single(df, 
                   cuts=False,
                   save_fields = ['weight','FatJet0_pt'],
                   signal = False,
               ):    
    """Select events and columns from one input frame and tag them with a label.

    :param df: Input DataFrame with one row per event; it is not modified.
    :param cuts: If True, apply ``minjetkin`` before selecting columns.
    :param save_fields: Columns to keep in the output (never mutated here).
    :param signal: Falsy for background; otherwise the value is used as the
        category label of a signal sample.
    :return: A new DataFrame holding ``save_fields`` plus ``isSignal``
        (1 for signal, 0 for background) and ``category`` (``signal`` for
        signal, ``'QCD'`` for background).
    """
    selected = minjetkin(df) if cuts else df
    #add more cuts here

    # Explicit copy of just the kept columns: the label columns are then added to an
    # independent frame rather than to a slice of the input.
    X = selected[save_fields].copy()

    X['isSignal'] = 1 if signal else 0
    X['category'] = signal if signal else 'QCD'
    return X

def get_sum_genweights(data_dir: Path, dataset: str) -> float:
    """
    Get the sum of genweights for a given dataset.

    Every ``*.pkl`` file in ``<data_dir>/<dataset>/pickles`` is loaded. For each file the
    first value of the ``"sumw"`` mapping of the last top-level key is added to the total.
    NOTE(review): only the last top-level key of each pickle contributes; confirm that
    the pickles hold a single key.

    If anything goes wrong (no pickle files, unreadable file, unexpected layout) a
    ``UserWarning`` is emitted and 1 is returned, so callers dividing by the result
    leave the weights unchanged.

    :param data_dir: The directory where the datasets are stored.
    :param dataset: The name of the dataset to get the genweights for.
    :return: The sum of genweights for the dataset, or 1 on failure.
    """
    # Local import: the notebook only imports `warnings` in a later cell.
    import warnings

    pickle_dir = Path(data_dir) / dataset / "pickles"
    # Sorted so the processing (and printed) order is reproducible.
    pickle_files = sorted(pickle_dir.glob("*.pkl"))

    total_sumw = 0
    try:
        if not pickle_files:
            raise FileNotFoundError(f"no *.pkl files found in {pickle_dir}")

        for pickle_file in pickle_files:
            # SECURITY: pickle.load can execute arbitrary code; only use trusted files.
            with pickle_file.open("rb") as file:
                out_dict = pickle.load(file)

            # Reset per file so an empty dict cannot silently reuse the previous file's value.
            file_sumw = None
            for key in out_dict:
                file_sumw = next(iter(out_dict[key]["sumw"].values()))
            if file_sumw is None:
                raise ValueError(f"{pickle_file} contains no entries")
            total_sumw += file_sumw
        print(pickle_file)
    except Exception as err:
        warnings.warn(
            f"Error loading genweights for dataset: {dataset}. Skipping. Cause: {err!r}",
            category=UserWarning,
            stacklevel=2,
        )
        total_sumw = 1

    # print(f"Total sum of weights for all pickles for {dataset}: {total_sumw}")
    return total_sumw

def accumulator(proc, isSignal=False, shallow=False, path=None,
                data_dir='/uscms/home/bweiss/nobackup/hbb/', year='2023'):
    """Read, cut, label and normalise all events of one process.

    For every sample directory of ``proc`` (from ``get_paths``), or for ``path`` when
    given, each parquet file is read and passed through
    ``process_single(..., cuts=True)``. All columns are kept except ``'MET'`` and any
    column whose name contains ``'Gen'``. Per sample, two columns are then added:
    ``weight_final`` (``|weight|`` divided by the sample's sum of generator weights)
    and ``MC_name`` (the sample directory name).

    :param proc: Process key understood by ``get_paths``; also used in the progress label.
    :param isSignal: Forwarded to ``process_single`` as ``signal``.
    :param shallow: Falsy to read every file; otherwise the maximum number of files
        read per sample directory (``True`` reads one).
    :param path: Optional single parquet file, or a directory whose entries are used
        instead of the ``get_paths`` result.
    :param data_dir: Root directory of the samples.
    :param year: Year sub-directory under ``data_dir``.
    :return: One DataFrame with all selected events, or None if nothing was read.
    """
    if path is None:
        dirs = get_paths(year, data_dir, proc)
    elif os.path.isfile(path):
        dirs = [path]
    else:
        # Join with `path`: bare listdir names are not valid paths on their own.
        dirs = [os.path.join(path, entry) for entry in sorted(os.listdir(path))]

    excluded_cols = ['MET']
    datasets = []
    for d in tqdm(dirs, desc="Processing "+str(proc)): #runs through subsets of a process
        if os.path.isfile(d):
            file_paths = [d]
            parquet_dir = Path(d).parent
        else:
            file_paths = [os.path.join(d, file) for file in sorted(os.listdir(d))]
            parquet_dir = Path(d)
        if shallow:
            file_paths = file_paths[:int(shallow)]

        frames = []
        for file_path in file_paths: #runs through files in subset
            df = pd.read_parquet(file_path)
            save_cols = [c for c in df.columns
                         if (c not in excluded_cols) and ('Gen' not in c)]
            #apply cuts, save select columns, add isSignal column
            frames.append(process_single(df, cuts=True,
                                         save_fields=save_cols,
                                         signal=isSignal))
        if not frames:
            # Nothing to normalise for an empty directory.
            continue
        dataset = pd.concat(frames, axis=0, ignore_index=True)
        del frames

        #reweight events by the sum of weights of the MC dataset
        # Layout is <sample>/parquet/signal-all, so the sample name is two levels up.
        this_dataset = parquet_dir.parent.parent.name
        print(this_dataset)
        sumW = get_sum_genweights(Path(data_dir) / year, this_dataset)
        dataset['weight_final'] = abs(dataset['weight'])/sumW
        print(f'sum of all weights in {d} is {sumW}')
        dataset['MC_name'] = this_dataset
        datasets.append(dataset)

    if not datasets:
        return None
    return pd.concat(datasets, axis=0, ignore_index=True)

def df2Dmatrix(X):
    """Convert the final DataFrame into an ``xgb.DMatrix``.

    The label is taken from ``X['isSignal']`` and the per-event weight from
    ``X['weight_noxsec']``; both columns are removed from the feature matrix so the
    label and the weight are not offered to the trees as inputs. Values equal to
    -9999 are treated as missing.

    NOTE(review): any other non-feature column still present in ``X`` is passed
    through as a feature; drop those before calling.

    :param X: DataFrame containing the feature columns plus ``isSignal`` and ``weight_noxsec``.
    :return: The DMatrix built from ``X``.
    """
    label_col, weight_col = 'isSignal', 'weight_noxsec'
    features = X.drop(columns=[label_col, weight_col])
    dmatrix = xgb.DMatrix(features, label=X[label_col], missing=-9999, weight=X[weight_col])
    return dmatrix

In [None]:
def fill_hist(year, process, field, cut = True, full=True):
    """Collect the values of one field over every file of a process.

    Each entry of every sample directory returned by ``get_paths(year, data_dir, process)``
    is passed to ``get_proc_field(path, field, cut=cut)`` and the results are joined
    along axis 0.
    NOTE(review): ``get_proc_field`` is not defined in this notebook as shown; it must
    be available before this function is called.

    :param year: Year sub-directory name.
    :param process: Process key understood by ``get_paths``.
    :param field: Name of the field to collect.
    :param cut: Forwarded to ``get_proc_field``.
    :param full: Currently unused.
    :return: None when there are no files; the single result when there is exactly
        one file; otherwise the results concatenated along axis 0.
    """
    data_dir = '/uscms/home/bweiss/nobackup/hbb/' #folder containing all samples
    #collect all field data for the process
    dirs = get_paths(year, data_dir, process)

    chunks = []
    for d in tqdm(dirs, desc="Processing "+str(process)+' '+str(field)):
        for file in os.listdir(d):
            path = os.path.join(d, file)
            chunks.append(get_proc_field(path, field, cut=cut))

    if not chunks:
        return None
    if len(chunks) == 1:
        return chunks[0]
    # One concatenation at the end instead of re-copying the array for every file.
    return np.concatenate(chunks, axis=0)

In [None]:
# Open a single signal parquet file and preview its columns and the first weights.
path = '/uscms/home/bweiss/nobackup/hbb/2023/VBFHto2B_M-125_dipoleRecoilOn/parquet/signal-all/part0.parquet'

df = pd.read_parquet(path)
for preview in (df.columns, df['weight'].head()):
    print(preview)

# data = accumulator('VH', isSignal='test', shallow=100, path = path)

# print(data.columns)
# print('nJet: ', data['nFatJet'].head())
# print(type(data['FatJet1_pt']), data['FatJet1_pt'].head())

# for c in proc_data.columns:
#     print(c, type(data[c]), type(data[c][10]))

# proc_data = data

In [None]:
# Cross-check: sum of generator weights for the first sample listed under 'VH'.
first_vh_dir = get_paths('2023', '/uscms/home/bweiss/nobackup/hbb/', 'VH')[0]
print(first_vh_dir)

d = Path(first_vh_dir)
# The sample name sits two directory levels above the signal-all folder.
dataset = d.parent.parent.name
print(dataset)

data_dir = '/uscms/home/bweiss/nobackup/hbb/2023'
sumW = get_sum_genweights(Path(data_dir), dataset)
print(sumW)



In [None]:
''' Accumulate data, prepare it, save to mega DF '''
import warnings
# NOTE(review): this silences every warning, including the UserWarning that
# get_sum_genweights emits when it falls back to a sum of weights of 1.
warnings.filterwarnings('ignore') 

samples = ['VBF', 'VH', 'ggF', 'QCD'] #processes to acquire
isSignal = ['VBF', 'VH', 'ggF', False] #category label per process; False marks background

# Gather one frame per process, then concatenate once at the end.
proc_frames = []
for s, signal_label in zip(samples, isSignal):
    proc_data = accumulator(s, isSignal=signal_label, shallow=False)
    if proc_data is not None:
        proc_frames.append(proc_data)
    del proc_data

X = pd.concat(proc_frames, axis = 0, ignore_index=True) if proc_frames else None
del proc_frames



In [None]:
# Sanity check of the combined frame: label, category, source sample and normalised weight.
print(X[['isSignal', 'category', 'MC_name', 'weight_final']])

In [None]:
# Split the events of the QCD samples by originating MC sample, so that FatJet0_pt can
# be drawn as one stacked histogram with one component per sample.
MC_names = pmap['QCD']
print(MC_names)

cs = ['FatJet0_pt', 'FatJet1_pt', 'Jet0_pt', 'Jet1_pt']  # only FatJet0_pt is stacked below

hist_stacks = []     # one array of FatJet0_pt values per MC sample
weights_stacks = []  # matching arrays of weight_final
for MC_name in MC_names:
    mask = X['MC_name'] == MC_name
    hist_stacks.append(X.loc[mask, 'FatJet0_pt'].to_numpy())
    weights_stacks.append(X.loc[mask, 'weight_final'].to_numpy())

In [None]:
# Left: stacked, weighted FatJet0_pt spectrum of the QCD samples.
# Right: distribution of weight_final for each category.
print(len(hist_stacks))
fig, ax = plt.subplots(1, 2, figsize = (9,5))
bins = np.linspace(400,1500, 100)

# One label per stacked component, so the legend entries are tied to the right patches.
ax[0].hist(hist_stacks, histtype='barstacked',
        bins=bins, linewidth=2, weights = weights_stacks, label = MC_names)
ax[0].set(yscale = 'log', xlabel = 'FatJet0_pt')
ax[1].set(yscale = 'log', xlabel = 'weights_final')

samples = ['VBF', 'VH', 'ggF', 'QCD']

for s in samples:
    mask = X['category'] == s
    Ws = X['weight_final'][mask]
    print(f'{s} has yield:{Ws.sum()}')
    ax[1].hist(Ws, stacked = False, histtype='step',
         linewidth=2, label = s)

ax[0].legend(loc = 'upper right')
ax[1].legend(loc = 'upper right')

plt.suptitle('QCD Jet Pt reweighting')
fig.tight_layout()

plt.savefig('QCD_Jet_Pt_reweighting3.png')

In [None]:
# Pairwise |delta eta| between the Jet0..Jet3 eta columns.
ak4eta_cols = ['Jet0_eta', 'Jet1_eta', 'Jet2_eta','Jet3_eta']
jj_pairs = [(0,1), (0,2), (0,3), (1,2), (1,3), (2,3)]

# Copy so the new jj_* columns are not written back into X.
ak4_etas = X[ak4eta_cols].copy()

for jj in jj_pairs:
    jet1_eta, jet2_eta = ak4_etas[ak4eta_cols[jj[0]]], ak4_etas[ak4eta_cols[jj[1]]]
    # NOTE(review): if absent jets are stored as a sentinel value rather than NaN,
    # the difference is meaningless for those rows -- confirm the encoding.
    ak4_etas[f'jj_{jj}'] = (jet1_eta - jet2_eta).abs()

# One column per jet pair.
dEta_jj_all = ak4_etas[[f'jj_{jj}' for jj in jj_pairs]]


In [None]:
# Hold out 20% of the events as a test set (fixed seed, shuffled).
from sklearn.model_selection import train_test_split

print(X.columns)

# stratify=X['isSignal'] is intentionally left disabled.
split_options = dict(test_size=0.2, random_state=42, shuffle=True)
X_train, X_test = train_test_split(X, **split_options)

In [None]:
# Define the BDT model
# import xgboost as xgb

# see https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
# for detailed explanations of parameters




In [None]:
# Columns never given to the BDT as inputs: the label, the weights, the bookkeeping
# columns and the listed FatJet0 columns.
omit_cols = [
    'isSignal', 'weight', 'weight_final',
    'category', 'FatJet0_pt', 'FatJet0_msd',
    'FatJet0_pnetMass', 'FatJet0_pnetTXbb', 'MC_name',
]

Y_train, Y_test = X_train['isSignal'], X_test['isSignal']
W_train, W_test = X_train['weight_final'], X_test['weight_final']

# Ratio of summed background weight to summed signal weight in the training set.
background_sumw = sum(W_train[X_train['isSignal'] == 0])
signal_sumw = sum(W_train[X_train['isSignal'] == 1])
pos_weight = background_sumw / signal_sumw
print(pos_weight)

# see https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.XGBClassifier
bdt_params = dict(
    n_estimators=100,            # number of boosting rounds (i.e. number of decision trees)
    max_depth=8,                 # max depth of each decision tree
    learning_rate=0.1,
    early_stopping_rounds=20,    # how many rounds to wait to see if the loss is going down
    missing=np.nan,
    scale_pos_weight=pos_weight,
)
model = xgb.XGBClassifier(**bdt_params)

# Feature matrices: data must not include the label column or the weights.
features_train = X_train.drop(omit_cols, axis=1)
features_test = X_test.drop(omit_cols, axis=1)

# xgboost uses the last eval set for early stopping:
# https://xgboost.readthedocs.io/en/stable/python/python_intro.html#early-stopping
# NOTE(review): no sample_weight_eval_set is passed, so the eval sets are scored unweighted.
trained_model = model.fit(
    features_train,
    Y_train,
    sample_weight=W_train,
    eval_set=[(features_train, Y_train), (features_test, Y_test)],  # sets for which to save the loss
    verbose=True,
)

In [None]:
from sklearn.metrics import roc_curve, auc

# Loss curves recorded during training for the two eval sets (train first, test second).
evals_result = trained_model.evals_result()
fig = plt.figure(figsize=(5, 4))
for eval_key, label in zip(("validation_0", "validation_1"), ("Train", "Test")):
    plt.plot(evals_result[eval_key]["logloss"], label=label, linewidth=2)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.show()


# Plot ROC: one curve per signal category, each against the QCD events.
Y_predict = model.predict_proba(X_test.drop(omit_cols, axis=1))

fig, ax = plt.subplots(1, 1, figsize=(5, 4))

samples = [False, 'VBF', 'ggF',  'VH']
is_qcd = X_test['category'] == 'QCD'
for s in samples[1:]:
    category_mask = ((X_test['category'] == s) | is_qcd)
    selected = X_test[category_mask]
    fpr, tpr, thresholds = roc_curve(
        selected['isSignal'].astype(int),
        Y_predict[category_mask, 1],
        sample_weight=selected['weight_final'],
        pos_label=1,
    )
    roc_auc = auc(fpr, tpr)

    ax.plot(fpr, tpr, lw=2, label=f"{s} auc = {roc_auc:.3f}")

ax.plot([0, 1], [0, 1], linestyle="--", lw=2, color="k", label="random chance")
ax.set_xlim([0, 1.0])
ax.set_ylim([0, 1.0])
ax.set_xlabel("false positive rate")
ax.set_ylabel("true positive rate")
ax.set_title("receiver operating curve")
ax.legend(loc="lower right")
plt.show()

In [None]:
# BDT score distributions on the test set: QCD as a step histogram, the signal
# categories stacked on top of each other. Counts are unweighted.
fig, ax = plt.subplots(1, 1, figsize = (5,4))
samples = ['QCD', 'VBF', 'ggF',  'VH', ]
colors = ['black', [0.6,1,0.6], [0,0.8,0], [0,0.3,0]]
print(Y_predict[:,1].shape)
print(X_test['weight'].shape)

bins = np.linspace(0,1,10)

# One column of signal-class scores per signal category. The columns have different
# lengths, so the shorter ones are NaN-padded by the concat.
signals = pd.concat(
    [
        pd.Series(Y_predict[(X_test['category'] == s).to_numpy(), 1], name=s)
        for s in samples[1:]
    ],
    axis=1,
)

QCD_mask = (X_test['category'] == 'QCD').to_numpy()
ax.hist(Y_predict[QCD_mask, 1], histtype='step', color = colors[0], label = samples[0],
        stacked=False, linewidth=3, bins=bins)
ax.hist(signals,
        label = list(signals.columns), stacked=True, linewidth=3, bins=bins)

ax.set_yscale('log')
# Use the labels given to hist() so every legend entry is tied to its own histogram.
ax.legend()
ax.set_ylabel('Events')
ax.set_xlabel('BDT score')
ax.set_title('S/B classifer scores by channel')

In [None]:
# import shap

In [None]:
# Horizontal bar chart of the trained model's feature importances, largest on top.
plt.figure(figsize=(9,18))

features = trained_model.get_booster().feature_names
importance = trained_model.feature_importances_
print(importance)

fi = pd.DataFrame({'features': features, 'importance': importance})
# sort_values returns a new frame, so the result has to be assigned for the sort to apply.
fi = fi.sort_values(by = 'importance', ascending=False).reset_index(drop=True)

y=range(len(fi))
plt.barh(y, fi['importance'])
plt.yticks(y, labels=fi['features'])
# barh draws row 0 at the bottom; flip the axis so the first (largest) row is on top.
plt.gca().invert_yaxis()
plt.xlabel('feature importance')

plt.show()

In [None]:
# Inspect the Jet3_pt column and the value/type stored at one specific row label.
# NOTE(review): label 7201776 is hard-coded and raises KeyError if that row is not in X_train.
jet3_pt = X_train['Jet3_pt']
print(jet3_pt)
probe_value = jet3_pt.loc[7201776]
print(probe_value)
print(type(probe_value))