In [2]:
import sys
#sys.path.append('/home/mdd424/CARL_tthbb/')
#sys.path.append('/home/mdd424/downloads/carl-torch')

import uproot
import numpy as np
import math
import json
import bisect
import os
import pickle
import logging
from tqdm.notebook import tqdm

import torch
from torch import nn
from torch import sigmoid
from torch.utils.data import DataLoader
#from torchsummary import summary

from sklearn import preprocessing
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KernelDensity

from scipy.spatial import distance
import awkward as ak

import matplotlib as mpl
default_backend = mpl.get_backend()
print(default_backend)
import matplotlib.pyplot as plt
#import imageio

module://matplotlib_inline.backend_inline


In [2]:
# Re-select the backend that was active when matplotlib was first imported
# (captured in `default_backend` by the import cell).
mpl.use(default_backend)

In [3]:
# Use the first CUDA device when torch can see one; otherwise run on the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
DEVICE

device(type='cpu')

In [4]:
# Branch names read from the input "Events" tree, four per object collection.
jet_features = [
    "Jet_pt",
    "Jet_eta",
    "Jet_phi",
    "Jet_mass",
]
electron_features = [
    "Electron_pt",
    "Electron_eta",
    "Electron_phi",
    "Electron_mass",
]
muon_features = [
    "Muon_pt",
    "Muon_eta",
    "Muon_phi",
    "Muon_mass",
]

In [5]:
# All object branches in a fixed order: jets first, then electrons, then muons.
features = [*jet_features, *electron_features, *muon_features]

In [6]:
# Per-event weight branches; the dataset-building cells multiply them together
# into a single "weight_mc_combined" value per event.
# Alternative weight list kept for reference (inactive):
#weight_features = ["weight_mc", "weight_pileup", "weight_leptonSF", "weight_jvt", "weight_bTagSF_DL1r_Continuous"]
weight_features = ["genWeight", "btagWeight_CSVV2"]

In [7]:
# Luminosities in pb^-1
# NOTE(review): both dicts hold the same three values under different keys
# (a year-like label vs. a run-number-like label). In the visible part of the
# notebook they are referenced only from commented-out scale-factor formulas;
# confirm whether they are still needed.
luminosities = {'2015': 36207.66, '2017': 44307.4, '2018': 58450.1}
luminosities_by_run = {'9364': 36207.66, '10201': 44307.4, '10724': 58450.1}

In [8]:
# Parse the catalogue of input files (process -> variation -> list of file records).
with open("nanoaod_inputs.json", 'r') as f:
    file_dict = json.loads(f.read())

In [10]:
# Gather the path of every nominal ttbar file, report how many there are,
# then randomise their order in place and show the new first entry.
all_nominal_files = []
for file_entry in file_dict["ttbar"]["nominal"]["files"]:
    all_nominal_files.append(file_entry["path"])
print("{} {}".format(len(all_nominal_files), all_nominal_files[0]))
np.random.shuffle(all_nominal_files)
all_nominal_files[0]

243 https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_00000_0000.root


'https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19981_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext4-v1_80000_0005.root'

In [11]:
# Gather the path of every parton-shower-variation ttbar file, report how many
# there are, then randomise their order in place and show the new first entry.
all_variation_files = []
for file_entry in file_dict["ttbar"]["PS_var"]["files"]:
    all_variation_files.append(file_entry["path"])
print("{} {}".format(len(all_variation_files), all_variation_files[0]))
np.random.shuffle(all_variation_files)
all_variation_files[0]

15 https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneEE5C_13TeV-powheg-herwigpp/cmsopendata2015_ttbar_19999_PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1_10000_0000.root


'https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneEE5C_13TeV-powheg-herwigpp/cmsopendata2015_ttbar_19999_PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1_10000_0004.root'

In [12]:
"""nominal_Nevents = 0
variation_Nevents = 0
for filename in all_nominal_files:
    dataset = uproot.open(filename)["Events"].arrays(jet_features[0])
    nominal_Nevents += len(dataset)
for filename in all_variation_files:
    dataset = uproot.open(filename)["Events"].arrays(jet_features[0])
    variation_Nevents += len(dataset)
max_data_index = min([nominal_Nevents, variation_Nevents])
max_data_index"""

'nominal_Nevents = 0\nvariation_Nevents = 0\nfor filename in all_nominal_files:\n    dataset = uproot.open(filename)["Events"].arrays(jet_features[0])\n    nominal_Nevents += len(dataset)\nfor filename in all_variation_files:\n    dataset = uproot.open(filename)["Events"].arrays(jet_features[0])\n    variation_Nevents += len(dataset)\nmax_data_index = min([nominal_Nevents, variation_Nevents])\nmax_data_index'

In [3]:
# Read the cached dataset metadata: the number of usable events and the padding
# length for each object collection.
data_metadata_dict = dict()
with open("carl_data_metadata.json", 'r') as f:
    data_metadata_dict = json.loads(f.read())
    max_data_index, max_jet_size, max_electron_size, max_muon_size = (
        data_metadata_dict["max_data_index"],
        data_metadata_dict["max_jet_size"],
        data_metadata_dict["max_electron_size"],
        data_metadata_dict["max_muon_size"],
    )

In [14]:
# Fractions of the usable events assigned to each split.
train_frac = 0.6
val_frac = 0.2
test_frac = 0.2

# Cumulative, exclusive upper bounds on the running event index:
# events [0, max_train_index) go to training, [max_train_index, max_val_index)
# to validation and [max_val_index, max_test_index) to testing.
max_train_index = int(train_frac * max_data_index)
max_val_index = max_train_index + int(val_frac * max_data_index)
max_test_index = max_val_index + int(test_frac * max_data_index)

# The order of the input files decides which events land in which split, so it
# must be reproducible. Sort first (earlier cells already shuffled the lists
# with the unseeded global RNG), then shuffle with a fixed-seed generator.
SPLIT_SEED = 42
_split_rng = np.random.default_rng(SPLIT_SEED)
for _file_list in (all_nominal_files, all_variation_files):
    _file_list.sort()
    _split_rng.shuffle(_file_list)
print(max_train_index, max_val_index, max_test_index)

11602238 15469650 19337062


In [15]:
def get_max_sizes(filename):
    """Return the largest per-event jet, electron and muon counts in one file.

    Parameters
    ----------
    filename : str
        Path or URL of a ROOT file containing an ``Events`` tree.

    Returns
    -------
    list
        ``[max_jet_size, max_electron_size, max_muon_size]``, measured on the
        first branch of ``jet_features``, ``electron_features`` and
        ``muon_features`` respectively. A tree with no events yields ``0`` for
        every collection instead of raising.
    """
    size_branches = [jet_features[0], electron_features[0], muon_features[0]]
    # The arrays are fully materialised by .arrays(), so the (possibly remote)
    # file handle can be released as soon as the read is done.
    with uproot.open(filename) as datafile:
        dataset = datafile["Events"].arrays(size_branches)
    return [max(map(len, dataset[branch]), default=0) for branch in size_branches]

In [16]:
#max_jet_size = 0
#for filename in all_nominal_files + all_variation_files:
#    dataset = uproot.open(filename)["Events"].arrays(jet_features)
#    max_jet_size = max([max(map(len, dataset[jet_features[0]])), max_jet_size]) 

In [17]:
# Show the jet padding length currently in effect; uncomment the override
# below to replace the value loaded from carl_data_metadata.json.
#max_jet_size = 20
max_jet_size

64

In [18]:
def fill_or_extend_tree(datafile, tree_dict, treename="Events"):
    """Append ``tree_dict`` to a tree in ``datafile``, creating the tree if needed.

    Parameters
    ----------
    datafile
        Mapping-like output file supporting ``.get(name)``, item access and
        item assignment.
    tree_dict
        Data for the tree; assigned as-is when the tree is created and passed
        to the tree's ``extend`` method otherwise.
    treename : str
        Name of the tree inside ``datafile`` (default ``"Events"``).

    Returns
    -------
    None
    """
    tree_exists = datafile.get(treename) is not None
    if tree_exists:
        datafile[treename].extend(tree_dict)
    else:
        datafile[treename] = tree_dict
    return None

In [19]:
def build_data_dict(features, arrays, split_index=None, split_low=False, split_high=False):
    """Pair branch names with the columns of ``arrays``, optionally sliced.

    Parameters
    ----------
    features : sequence of str
        Names to use as dictionary keys, in the order of the columns that
        ``ak.unzip`` returns for ``arrays``.
    arrays
        Record array whose first axis is sliced when a split is requested.
    split_index : int or None
        Position of the cut along the first axis.
    split_low : bool
        When ``True`` (and ``split_high`` is not), keep ``arrays[split_index:]``,
        i.e. the entries from the cut onwards.
    split_high : bool
        When ``True``, keep ``arrays[:split_index]``, i.e. the entries before
        the cut. Takes precedence over ``split_low``.

    Returns
    -------
    dict
        ``{feature_name: column}`` for the selected entries.
    """
    if split_high is True:
        # split_index is the upper bound of the kept range
        selected = arrays[:split_index]
    elif split_low is True:
        # split_index is the lower bound of the kept range
        selected = arrays[split_index:]
    elif split_low is False and split_high is False:
        selected = arrays
    return dict(zip(features, ak.unzip(selected)))

# Section 1: Create a standard dataset with zero padding

## Generate the nominal datasets

In [31]:
# Zero-pad the nominal ttbar events to fixed-length object collections and
# stream them, chunk by chunk, into the training / validation / testing files.
chunk_size = 100000

# Constant factor applied to every combined event weight. It stands in for a
# per-file normalisation whose inputs are not defined in this notebook:
#   luminosities_by_run[run_number] * NOMINAL_XSECTIONS[dsid] / NOMINAL_NORMALIZATIONS[dsid]
_scale_factor = 3378 * 1 / 1

# Number of events consumed so far, counted across all input files.
current_index = 0

# Context managers guarantee the output files are closed (and therefore
# complete on disk) before any later cell re-opens them for reading.
with uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_nominal_training_data.root") as df_train, \
     uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_nominal_validation_data.root") as df_val, \
     uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_nominal_testing_data.root") as df_test:
    # (first index, one-past-last index, output file) for each split, expressed
    # in terms of the running event index.
    split_targets = (
        (0, max_train_index, df_train),
        (max_train_index, max_val_index, df_val),
        (max_val_index, max_test_index, df_test),
    )

    for filename in all_nominal_files:
        print("Loading file: {}".format(filename))
        with uproot.open(filename) as nominal_file:
            nominal_dataset = nominal_file["Events"]
            # Entry count comes from the tree metadata; no branch data is read.
            filesize = int(nominal_dataset.num_entries)
            print(filesize)
            for entry_start in range(0, filesize, chunk_size):
                entry_stop = min(entry_start + chunk_size, filesize)
                jet_arr = nominal_dataset.arrays(jet_features, entry_start=entry_start, entry_stop=entry_stop)
                electron_arr = nominal_dataset.arrays(electron_features, entry_start=entry_start, entry_stop=entry_stop)
                muon_arr = nominal_dataset.arrays(muon_features, entry_start=entry_start, entry_stop=entry_stop)
                weight_arr = nominal_dataset.arrays(weight_features, entry_start=entry_start, entry_stop=entry_stop)

                # Combined weight per event: product of all weight branches.
                _weights = ak.concatenate(ak.unzip(weight_arr[weight_features][:, np.newaxis]), axis=1).to_numpy().prod(axis=1)

                # Pad (or clip) every event to a fixed number of objects and
                # replace the padding placeholders with zeros.
                nominal_padded_jets = ak.fill_none(ak.pad_none(jet_arr[jet_features], max_jet_size, clip=True), 0, axis=1)
                nominal_padded_electrons = ak.fill_none(ak.pad_none(electron_arr[electron_features], max_electron_size, clip=True), 0, axis=1)
                nominal_padded_muons = ak.fill_none(ak.pad_none(muon_arr[muon_features], max_muon_size, clip=True), 0, axis=1)
                padded_collections = (
                    (jet_features, nominal_padded_jets),
                    (electron_features, nominal_padded_electrons),
                    (muon_features, nominal_padded_muons),
                )

                current_data_size = len(_weights)

                # Intersect this chunk, which covers running indices
                # [current_index, current_index + current_data_size), with each
                # split range and write the non-empty overlaps. Events beyond
                # max_test_index overlap nothing and are dropped.
                for first_index, stop_index, out_file in split_targets:
                    lo = max(first_index - current_index, 0)
                    hi = min(stop_index - current_index, current_data_size)
                    if lo >= hi:
                        continue
                    data_dict = {}
                    for branch_names, padded in padded_collections:
                        data_dict.update(build_data_dict(branch_names, padded[lo:hi]))
                    data_dict["weight_mc_combined"] = _weights[lo:hi] * _scale_factor
                    fill_or_extend_tree(out_file, data_dict)

                current_index += current_data_size
                if current_index >= max_test_index:
                    break
        if current_index >= max_test_index:
            break
print("Finished!")

Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19981_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext4-v1_40001_0000.root
1189800
Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19981_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext4-v1_40000_0007.root
1181800
Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_20000_0007.root
612355
Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1_76X_mcRun2_asymptotic_v12_ext3-v1_60000_0006.root
1175424
Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneCUETP8M1_13TeV-powheg-pythia8/cmsopendata2015_ttbar_19980_PU25nsData2015v1

## Generate the alternative datasets

In [32]:
# Zero-pad the parton-shower-variation ttbar events to fixed-length object
# collections and stream them, chunk by chunk, into the training / validation /
# testing files.
chunk_size = 100000

# Constant factor applied to every combined event weight. It stands in for a
# per-file normalisation whose inputs are not defined in this notebook:
#   luminosities_by_run[run_number] * VARIATION_XSECTIONS[dsid] / VARIATION_NORMALIZATIONS[dsid]
_scale_factor = 3378 * 1 / 1

# Number of events consumed so far, counted across all input files.
current_index = 0

# Context managers guarantee the output files are closed (and therefore
# complete on disk) before any later cell re-opens them for reading.
with uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_PS_var_training_data.root") as df_train, \
     uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_PS_var_validation_data.root") as df_val, \
     uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_PS_var_testing_data.root") as df_test:
    # (first index, one-past-last index, output file) for each split, expressed
    # in terms of the running event index.
    split_targets = (
        (0, max_train_index, df_train),
        (max_train_index, max_val_index, df_val),
        (max_val_index, max_test_index, df_test),
    )

    for filename in all_variation_files:
        print("Loading file: {}".format(filename))
        with uproot.open(filename) as variation_file:
            variation_dataset = variation_file["Events"]
            # Entry count comes from the tree metadata; no branch data is read.
            filesize = int(variation_dataset.num_entries)
            print(filesize)
            for entry_start in range(0, filesize, chunk_size):
                entry_stop = min(entry_start + chunk_size, filesize)
                jet_arr = variation_dataset.arrays(jet_features, entry_start=entry_start, entry_stop=entry_stop)
                electron_arr = variation_dataset.arrays(electron_features, entry_start=entry_start, entry_stop=entry_stop)
                muon_arr = variation_dataset.arrays(muon_features, entry_start=entry_start, entry_stop=entry_stop)
                weight_arr = variation_dataset.arrays(weight_features, entry_start=entry_start, entry_stop=entry_stop)

                # Combined weight per event: product of all weight branches.
                _weights = ak.concatenate(ak.unzip(weight_arr[weight_features][:, np.newaxis]), axis=1).to_numpy().prod(axis=1)

                # Pad (or clip) every event to a fixed number of objects and
                # replace the padding placeholders with zeros.
                variation_padded_jets = ak.fill_none(ak.pad_none(jet_arr[jet_features], max_jet_size, clip=True), 0, axis=1)
                variation_padded_electrons = ak.fill_none(ak.pad_none(electron_arr[electron_features], max_electron_size, clip=True), 0, axis=1)
                variation_padded_muons = ak.fill_none(ak.pad_none(muon_arr[muon_features], max_muon_size, clip=True), 0, axis=1)
                padded_collections = (
                    (jet_features, variation_padded_jets),
                    (electron_features, variation_padded_electrons),
                    (muon_features, variation_padded_muons),
                )

                current_data_size = len(_weights)

                # Intersect this chunk, which covers running indices
                # [current_index, current_index + current_data_size), with each
                # split range and write the non-empty overlaps. Events beyond
                # max_test_index overlap nothing and are dropped.
                for first_index, stop_index, out_file in split_targets:
                    lo = max(first_index - current_index, 0)
                    hi = min(stop_index - current_index, current_data_size)
                    if lo >= hi:
                        continue
                    data_dict = {}
                    for branch_names, padded in padded_collections:
                        data_dict.update(build_data_dict(branch_names, padded[lo:hi]))
                    data_dict["weight_mc_combined"] = _weights[lo:hi] * _scale_factor
                    fill_or_extend_tree(out_file, data_dict)

                current_index += current_data_size
                if current_index >= max_test_index:
                    break
        if current_index >= max_test_index:
            break
print("Finished!")

Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneEE5C_13TeV-powheg-herwigpp/cmsopendata2015_ttbar_19999_PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1_10000_0006.root
1347632
Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneEE5C_13TeV-powheg-herwigpp/cmsopendata2015_ttbar_19999_PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1_10000_0013.root
1339381
Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneEE5C_13TeV-powheg-herwigpp/cmsopendata2015_ttbar_19999_PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1_10000_0007.root
1383023
Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneEE5C_13TeV-powheg-herwigpp/cmsopendata2015_ttbar_19999_PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1_10000_0004.root
1309020
Loading file: https://xrootd-local.unl.edu:1094//store/user/AGC/nanoAOD/TT_TuneEE5C_13TeV-powheg-herwigpp/cmsopendata2015_ttbar_19999_PU25nsData2015v1_76X_mcRun2_asymptotic_v12-v1_1000

## Now create simplified datasets for training the CARL models

In [27]:
# One scalar column name per (feature, object slot): all slots of the first jet
# feature, then all slots of the second, ..., followed by the electron and muon
# features in the same layout. Each collection has exactly four features.
extended_features = [
    "{}{}".format(feature_name, slot)
    for feature_group, n_slots in (
        (jet_features, max_jet_size),
        (electron_features, max_electron_size),
        (muon_features, max_muon_size),
    )
    for feature_name in feature_group[:4]
    for slot in range(n_slots)
]

In [28]:
# Open the "Events" trees of the padded nominal and PS-variation training files;
# the CARL export cells below read from these two handles.
nominal_tree = uproot.open("/data/mdrnevich/AGC/CMS_ttbar_nominal_training_data.root")["Events"]
alternative_tree = uproot.open("/data/mdrnevich/AGC/CMS_ttbar_PS_var_training_data.root")["Events"]

In [29]:
# Flatten the padded nominal training set into one scalar branch per
# (feature, object slot) and write it, with mean-normalised weights, for CARL.
chunk_size = 100000

# Entry count comes from the tree metadata; no branch data is read.
filesize = int(nominal_tree.num_entries)
print(filesize)

# Mean event weight over the whole sample, used to normalise every chunk.
weight_mean = ak.unzip(nominal_tree.arrays(["weight_mc_combined"]))[0].to_numpy().mean()

# The context manager closes the output file once the last chunk is written.
with uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_nominal_training_data_CARL.root") as df:
    for entry_start in range(0, filesize, chunk_size):
        entry_stop = min(entry_start + chunk_size, filesize)
        # Concatenate the padded collections side by side: one row per event,
        # one column per (feature, object slot), in `features` order.
        X_array = ak.concatenate(ak.unzip(nominal_tree.arrays(features, entry_start=entry_start, entry_stop=entry_stop)), axis=1).to_numpy().astype(np.float32)
        if X_array.shape[1] != len(extended_features):
            # A mismatch would silently attach the wrong name to each column.
            raise ValueError("Expected {} flattened feature columns, found {}".format(len(extended_features), X_array.shape[1]))
        weight_array = ak.unzip(nominal_tree.arrays(["weight_mc_combined"], entry_start=entry_start, entry_stop=entry_stop))[0].to_numpy()
        # Normalize by the mean for training stability (out of place, so the
        # buffer returned by to_numpy() is never modified).
        weight_array = weight_array / weight_mean

        data_dict = {feat: X_array[:, ix].ravel() for ix, feat in enumerate(extended_features)}
        data_dict["weight_mc_combined"] = weight_array
        fill_or_extend_tree(df, data_dict)

11602238


In [30]:
# Flatten the padded PS-variation training set into one scalar branch per
# (feature, object slot) and write it, with mean-normalised weights, for CARL.
chunk_size = 100000

# Entry count comes from the tree metadata; no branch data is read.
filesize = int(alternative_tree.num_entries)
print(filesize)

# Mean event weight over the whole sample, used to normalise every chunk.
weight_mean = ak.unzip(alternative_tree.arrays(["weight_mc_combined"]))[0].to_numpy().mean()

# The context manager closes the output file once the last chunk is written.
with uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_PS_var_training_data_CARL.root") as df:
    for entry_start in range(0, filesize, chunk_size):
        entry_stop = min(entry_start + chunk_size, filesize)
        # Concatenate the padded collections side by side: one row per event,
        # one column per (feature, object slot), in `features` order.
        X_array = ak.concatenate(ak.unzip(alternative_tree.arrays(features, entry_start=entry_start, entry_stop=entry_stop)), axis=1).to_numpy().astype(np.float32)
        if X_array.shape[1] != len(extended_features):
            # A mismatch would silently attach the wrong name to each column.
            raise ValueError("Expected {} flattened feature columns, found {}".format(len(extended_features), X_array.shape[1]))
        weight_array = ak.unzip(alternative_tree.arrays(["weight_mc_combined"], entry_start=entry_start, entry_stop=entry_stop))[0].to_numpy()
        # Normalize by the mean for training stability (out of place, so the
        # buffer returned by to_numpy() is never modified).
        weight_array = weight_array / weight_mean

        data_dict = {feat: X_array[:, ix].ravel() for ix, feat in enumerate(extended_features)}
        data_dict["weight_mc_combined"] = weight_array
        fill_or_extend_tree(df, data_dict)

11602238


# Section 2: Making a DeepSets Dataset

Note: the cells in this section are best run as separate scripts rather than from inside the notebook.

In [None]:
# Build variable-length (unpadded) per-event object sets for the nominal ttbar
# sample and stream them into the DeepSets training / validation / testing files.
chunk_size = 100000

# Constant factor applied to every combined event weight. It stands in for a
# per-file normalisation whose inputs are not defined in this notebook:
#   luminosities_by_run[run_number] * NOMINAL_XSECTIONS[dsid] / NOMINAL_NORMALIZATIONS[dsid]
_scale_factor = 3378 * 1 / 1

# Names of the three output branches holding the flattened object sets.
nominal_features = ["jet_4vec", "electron_4vec", "muon_4vec"]

# Number of events consumed so far, counted across all input files.
current_index = 0

# Context managers guarantee the output files are closed once writing is done.
with uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_nominal_DeepSets_training_data.root") as df_train, \
     uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_nominal_DeepSets_validation_data.root") as df_val, \
     uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_nominal_DeepSets_testing_data.root") as df_test:
    # (first index, one-past-last index, output file) for each split, expressed
    # in terms of the running event index.
    split_targets = (
        (0, max_train_index, df_train),
        (max_train_index, max_val_index, df_val),
        (max_val_index, max_test_index, df_test),
    )

    for filename in all_nominal_files:
        print("Loading file: {}".format(filename))
        with uproot.open(filename) as nominal_file:
            nominal_dataset = nominal_file["Events"]
            # Entry count comes from the tree metadata; no branch data is read.
            filesize = int(nominal_dataset.num_entries)
            print(filesize)
            for entry_start in tqdm(range(0, filesize, chunk_size)):
                entry_stop = min(entry_start + chunk_size, filesize)
                jet_arr = nominal_dataset.arrays(jet_features, entry_start=entry_start, entry_stop=entry_stop)
                electron_arr = nominal_dataset.arrays(electron_features, entry_start=entry_start, entry_stop=entry_stop)
                muon_arr = nominal_dataset.arrays(muon_features, entry_start=entry_start, entry_stop=entry_stop)
                weight_arr = nominal_dataset.arrays(weight_features, entry_start=entry_start, entry_stop=entry_stop)

                # Combined weight per event: product of all weight branches.
                _weights = ak.concatenate(ak.unzip(weight_arr[weight_features][:, np.newaxis]), axis=1).to_numpy().prod(axis=1)

                current_data_size = len(_weights)

                # Field projections do not depend on the event index, so take
                # them once per chunk instead of once per event.
                jet_records = jet_arr[jet_features]
                electron_records = electron_arr[electron_features]
                muon_records = muon_arr[muon_features]

                # For each event, stack the feature columns side by side and
                # flatten row-major, so all features of the first object come
                # first, then all features of the second, and so on.
                jet_sets = []
                electron_sets = []
                muon_sets = []
                for j in range(current_data_size):
                    jet_sets.append(
                        np.concatenate([x.to_numpy()[:, np.newaxis] for x in ak.unzip(jet_records[j])], axis=1).flatten()
                    )
                    electron_sets.append(
                        np.concatenate([x.to_numpy()[:, np.newaxis] for x in ak.unzip(electron_records[j])], axis=1).flatten()
                    )
                    muon_sets.append(
                        np.concatenate([x.to_numpy()[:, np.newaxis] for x in ak.unzip(muon_records[j])], axis=1).flatten()
                    )

                nominal_arr = ak.Array({"jet_4vec": jet_sets,
                                        "electron_4vec": electron_sets,
                                        "muon_4vec": muon_sets})

                # Intersect this chunk, which covers running indices
                # [current_index, current_index + current_data_size), with each
                # split range and write the non-empty overlaps. Events beyond
                # max_test_index overlap nothing and are dropped.
                for first_index, stop_index, out_file in split_targets:
                    lo = max(first_index - current_index, 0)
                    hi = min(stop_index - current_index, current_data_size)
                    if lo >= hi:
                        continue
                    data_dict = build_data_dict(nominal_features, nominal_arr[lo:hi])
                    data_dict["weight_mc_combined"] = _weights[lo:hi] * _scale_factor
                    fill_or_extend_tree(out_file, data_dict)

                current_index += current_data_size
                if current_index >= max_test_index:
                    break
        if current_index >= max_test_index:
            break
print("Finished!")

In [None]:
# Build the train / validation / test ROOT files for the PS-variation sample.
#
# Every file in `all_variation_files` is read in chunks of `chunk_size` events.
# For each event the jet, electron and muon features are packed into one flat
# array per object type, and the product of the `weight_features` branches is
# scaled by `_scale_factor`. Events are numbered globally in reading order
# (`current_index`) and routed as follows:
#   [0, max_train_index)             -> training file
#   [max_train_index, max_val_index) -> validation file
#   [max_val_index, max_test_index)  -> testing file
# Reading stops as soon as `max_test_index` events have been consumed.


def _flatten_four_vectors(records):
    """Pack every event of ``records`` into one flat NumPy array.

    For each event, every field of the record becomes a column and the columns
    are concatenated side by side, so the flattened result is ordered object
    by object, with the fields of one object adjacent (in record-field order).

    Parameters
    ----------
    records : ak.Array
        Per-event collections of records (e.g. the jet branches of one chunk).

    Returns
    -------
    list of numpy.ndarray
        One 1-D array per event; an event without objects yields an empty array.
    """
    return [
        np.concatenate(
            [field.to_numpy()[:, np.newaxis] for field in ak.unzip(records[event_index])],
            axis=1,
        ).flatten()
        for event_index in range(len(records))
    ]


chunk_size = 100000

variation_features = ["jet_4vec", "electron_4vec", "muon_4vec"]
# All branches needed for one chunk, so each chunk is fetched with a single read.
input_branches = jet_features + electron_features + muon_features + weight_features

# NOTE(review): hard-coded placeholder normalisation. The expression this
# replaced was
#   luminosities_by_run[run_number] * VARIATION_XSECTIONS[dsid] / VARIATION_NORMALIZATIONS[dsid]
# (luminosity * cross section / per-DSID total weighted events); restore it
# once those per-file lookups are available.
_scale_factor = 3378 * 1 / 1

current_index = 0
# The context managers guarantee the output files are closed (and therefore
# complete on disk) even if the loop raises. `df_train`, `df_val` and `df_test`
# stay bound after the cell but refer to closed files.
with uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_PS_var_DeepSets_training_data.root") as df_train, \
        uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_PS_var_DeepSets_validation_data.root") as df_val, \
        uproot.recreate("/data/mdrnevich/AGC/CMS_ttbar_PS_var_DeepSets_testing_data.root") as df_test:
    # (output file, first global event index, one-past-last global event index).
    # Assumes max_train_index <= max_val_index <= max_test_index.
    split_ranges = [
        (df_train, 0, max_train_index),
        (df_val, max_train_index, max_val_index),
        (df_test, max_val_index, max_test_index),
    ]

    for filename in all_variation_files:
        print("Loading file: {}".format(filename))
        # Load the data; the input file is closed again once it has been processed.
        with uproot.open(filename) as variation_file:
            variation_dataset = variation_file["Events"]
            # Entry count comes from the tree metadata; no branch has to be read for it.
            filesize = int(variation_dataset.num_entries)
            print(filesize)
            n_chunks = int(np.ceil(filesize / chunk_size))
            for chunk_index in tqdm(range(n_chunks)):
                chunk_arr = variation_dataset.arrays(
                    input_branches,
                    entry_start=int(chunk_index * chunk_size),
                    entry_stop=int((chunk_index + 1) * chunk_size),
                )
                jet_arr = chunk_arr[jet_features]
                electron_arr = chunk_arr[electron_features]
                muon_arr = chunk_arr[muon_features]
                weight_arr = chunk_arr[weight_features]

                # Extract the combined weight array: per-event product of all weight branches.
                _weights = ak.concatenate(ak.unzip(weight_arr[weight_features][:, np.newaxis]), axis=1).to_numpy().prod(axis=1)

                current_data_size = len(_weights)

                variation_arr = ak.Array({"jet_4vec": _flatten_four_vectors(jet_arr),
                                          "electron_4vec": _flatten_four_vectors(electron_arr),
                                          "muon_4vec": _flatten_four_vectors(muon_arr)})

                # Route the chunk by intersecting its global index range
                # [current_index, chunk_stop) with every split range. This also
                # handles a chunk that straddles more than one boundary, and
                # never writes an empty slice. Events at or beyond
                # max_test_index fall in no range and are ignored.
                chunk_stop = current_index + current_data_size
                for output_file, range_start, range_stop in split_ranges:
                    low = max(current_index, range_start) - current_index
                    high = min(chunk_stop, range_stop) - current_index
                    if high <= low:
                        continue
                    data_dict = build_data_dict(variation_features, variation_arr[low:high])
                    data_dict["weight_mc_combined"] = _weights[low:high] * _scale_factor
                    fill_or_extend_tree(output_file, data_dict)

                current_index += current_data_size
                if current_index >= max_test_index:
                    break
        if current_index >= max_test_index:
            break
print("Finished!")