In [1]:
import subprocess
import numpy as np

command = ["cernopendata-client", "get-file-locations", "--recid", "63168", "--protocol", "xrootd"]
result = subprocess.run(command, capture_output=True, text=True)
filenames = result.stdout.splitlines()
filenames_bkg = np.array(filenames)

command = ["cernopendata-client", "get-file-locations", "--recid", "33703", "--protocol", "xrootd"]
result = subprocess.run(command, capture_output=True, text=True)
filenames = result.stdout.splitlines()
filenames_sig = np.array(filenames)

# Print the NumPy array
print(filenames_bkg)
print(filenames_sig)

['root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/130000/1C569D85-60AE-7D43-B42B-6D6FD1D66CC7.root'
 'root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/130000/2EDCC683-1B4B-614B-BEB7-D80BBC20AD8E.root'
 'root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/270000/19E8D842-3175-1449-AF6C-FD9C69D12724.root'
 'root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/270000/3957434B-7E09-3B4C-8329-FD44D82C7DB7.root'
 'root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asy

In [2]:
import uproot
import matplotlib.pylab as plt
import awkward as ak
import numpy as np
from glob import glob

treename = "Events"
branch_dict = {
                "Muon": ["pt", "eta", "phi"],
                "Electron": ["pt", "eta", "phi"],
                "FatJet": ["pt", "eta", "phi", "lsf3", "msoftdrop", "nConstituents"],
                # "MET": ["pt", "phi"]
            }

# make list of branches to read from the dictionary above
branch_names = []
for obj, var in branch_dict.items(): 
    branch_names += [obj + "_" + v for v in var]

infiles_sig = filenames_sig[0:5] # Lets use all the signal files

data_sig = uproot.concatenate({fname:"Events" for fname in infiles_sig}, 
                              branch_names, 
                              how = "zip",
                              library = "ak",
                              timeout=90,
                             )

# infiles_bkg = ["root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/130000/2EDCC683-1B4B-614B-BEB7-D80BBC20AD8E.root","root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/270000/19E8D842-3175-1449-AF6C-FD9C69D12724.root","root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/270000/3957434B-7E09-3B4C-8329-FD44D82C7DB7.root","root://eospublic.cern.ch//eos/opendata/cms/mc/RunIISummer20UL16NanoAODv9/QCD_Pt-15to7000_TuneCP5_Flat2018_13TeV_pythia8/NANOAODSIM/106X_mcRun2_asymptotic_v17-v1/270000/397D1673-167A-CF46-9E79-D7069D9AC359.root"]
infiles_bkg = filenames_bkg[4:8] #Let's use only 4 of the background files, one of the first files seems to not be accessible with xrootd so I take entry 4-8
data_bkg = uproot.concatenate({fname:"Events" for fname in infiles_bkg}, 
                              branch_names, 
                              how = "zip",
                              library = "ak",
                              timeout=90,
                             )


# # Here is an example of how you can open a single file rith awkward and regex for the branch expression :

# file_bkg = uproot.open(infiles_bkg[0])
# data_bkg = file_bkg["Events"].arrays(
#     filter_name = "/(Muon|Electron|FatJet|MET)_(pt|eta|phi|sumEt)/", 
#     how = "zip"
# )

In [3]:
def getPadNParr(events, obj, n_pad, fields, cuts = None, name = None, pad_val = 0):
    '''
    This function filter objects and pads them to a certain length with a given value
    '''
    
    objects = events[obj]
    
    if not name: name = obj
    
    pad_arrs = []
    var_names = []
        
    # padding with nones
    pad_arr = ak.pad_none(objects, n_pad, clip=True)
    
    # combining to numpy
    for i in range(n_pad):

        for var in fields:
            pad_arrs += [ak.to_numpy( ak.fill_none(pad_arr[var][:,i], pad_val) )]
            var_names.append( "{}_{}_{}".format(name, i, var) )
            
    return np.stack(pad_arrs), var_names

def formatData(data, objects, verbosity = 0):
    '''
    This function concatenates the padded arrays for different objects.
    It is controlled via a dictionary as defined above
    '''
    
    # this will be filled by all required objects
    dataList = [] 
    varList = []
    
    for obj in objects: 
        print(obj)
        dat, names = getPadNParr(data, obj["key"], obj["n_obj"], obj["fields"], obj["cuts"] if "cuts" in obj else None, obj["name"] )
        dataList.append(dat)
        varList += names
        
    if verbosity > 0:
        print("The input variables are the following:")
        print(varList)
                
    # combining and returning (and transforming back so events are along the first axis...)
    return np.concatenate(dataList, axis = 0).T, varList

In [4]:
 objects = [
    # {"name" : "MET", "key" : "MET", "fields" : ["pt", "phi"], "n_obj" : 1 },
    {"name" : "FatJet", "key" : "FatJet", "fields" : ["pt", "eta", "phi", "lsf3", "msoftdrop", "nConstituents"], "n_obj" : 6},
    {"name" : "Electron", "key" : "Electron", "fields" : ["pt", "eta", "phi"], "n_obj" : 4},
    {"name" : "Muon", "key" : "Muon", "fields" : ["pt", "eta", "phi"], "n_obj" : 4}
]
    
x_sig, var_names = formatData(data_sig, objects, verbosity = 99) 
x_bkg, var_names = formatData(data_bkg, objects, verbosity = 0) 

{'name': 'FatJet', 'key': 'FatJet', 'fields': ['pt', 'eta', 'phi', 'lsf3', 'msoftdrop', 'nConstituents'], 'n_obj': 6}
{'name': 'Electron', 'key': 'Electron', 'fields': ['pt', 'eta', 'phi'], 'n_obj': 4}
{'name': 'Muon', 'key': 'Muon', 'fields': ['pt', 'eta', 'phi'], 'n_obj': 4}
The input variables are the following:
['FatJet_0_pt', 'FatJet_0_eta', 'FatJet_0_phi', 'FatJet_0_lsf3', 'FatJet_0_msoftdrop', 'FatJet_0_nConstituents', 'FatJet_1_pt', 'FatJet_1_eta', 'FatJet_1_phi', 'FatJet_1_lsf3', 'FatJet_1_msoftdrop', 'FatJet_1_nConstituents', 'FatJet_2_pt', 'FatJet_2_eta', 'FatJet_2_phi', 'FatJet_2_lsf3', 'FatJet_2_msoftdrop', 'FatJet_2_nConstituents', 'FatJet_3_pt', 'FatJet_3_eta', 'FatJet_3_phi', 'FatJet_3_lsf3', 'FatJet_3_msoftdrop', 'FatJet_3_nConstituents', 'FatJet_4_pt', 'FatJet_4_eta', 'FatJet_4_phi', 'FatJet_4_lsf3', 'FatJet_4_msoftdrop', 'FatJet_4_nConstituents', 'FatJet_5_pt', 'FatJet_5_eta', 'FatJet_5_phi', 'FatJet_5_lsf3', 'FatJet_5_msoftdrop', 'FatJet_5_nConstituents', 'Electron_

In [5]:
import h5py
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
_ = scaler.fit(x_bkg)
x_bkg_scaled = scaler.transform(x_bkg)
x_sig_scaled = scaler.transform(x_sig)
    
        
# define training, test and validation datasets
X_train_bkg, X_test_bkg = train_test_split(x_bkg_scaled, test_size=0.2, shuffle=True)

X_train_sig, X_test_sig = train_test_split(x_sig_scaled, test_size=0.2, shuffle=True)


print("Training bkg data shape = ",X_train_bkg.shape)    
with h5py.File('bkg_dataset_classifier.h5', 'w') as h5f:
    h5f.create_dataset('X_train', data = X_train_bkg)
    h5f.create_dataset('X_test', data = X_test_bkg)

print("Training sig data shape = ",X_train_sig.shape)    
with h5py.File('signal_dataset_classifier.h5', 'w') as h5f2:
    h5f2.create_dataset('X_train', data = X_train_sig)
    h5f2.create_dataset('X_test', data = X_test_sig)       


Training bkg data shape =  (1800000, 60)
Training sig data shape =  (53600, 60)


In [1]:
# construct mixed s+b training and test set
X_train = np.concatenate((X_train_bkg, X_train_sig), axis=0)
X_test = np.concatenate((X_test_bkg, X_test_sig), axis=0)

Y_train = np.concatenate(
    (np.zeros(X_train_bkg.shape[0]), np.ones(X_train_sig.shape[0])),
    axis=0
)
Y_test = np.concatenate(
    (np.zeros(X_test_bkg.shape[0]), np.ones(X_test_sig.shape[0])),
    axis=0
)

NameError: name 'np' is not defined