In [1]:
import os, sys
import numpy as np
import uproot
import pandas as pd

In [2]:
#Global parameters:
indir = '../../input_trees/'
jobname = 'tree_2LSSinclusive_baseline_Sept17'
#jobname = 'tree_2016postVFPUL_2LSSmm_Sept23'

modeldict = {
    'QCD-VLLD_mu-classifier':'nnscore_qcd_vlldmu'
}

In [3]:
#Given a TFile, read its branches into a dataframe.
def read_file_into_df(filepath, truth=None):

    filename = filepath.split('/')[-1]
    sample = filename.split("_")[1]
    subsample = "_".join(filename.split("_")[2:])
    
    #Exceptions
    if 'QCD_' in filename or 'VLL' in filename:
        sample = filename.split("_")[1]+"_"+filename.split("_")[2]
        subsample = filename.split("_")[3].split(".")[0]
        
    if subsample.endswith(".root"): subsample = subsample[:-5]

    tfile = uproot.open(filepath)
    
    ttree = tfile['myEvents']
    branches = ttree.keys()
    awkarray = ttree.arrays(branches)
    df = pd.DataFrame(awkarray.to_list())
    if truth: df['truth'] = truth
    df['sample'] = sample
    df['subsample'] = subsample

    return df

def ApplyMinMax(X, min_filename, max_filename):
    # Load min values from the file
    minval = np.loadtxt(min_filename)
    
    # Load max values from the file
    maxval = np.loadtxt(max_filename)
    
    #print('Min from txt: ', minval)
    #print('Max from txt: ', maxval)
    
    # Calculate the difference
    diff = maxval - minval
    normed_X = X.copy()    
    # Scale the data only for non-constant columns
    nonconst = np.where(diff != 0)[0]
    normed_X[:, nonconst] = 2 * ((X[:, nonconst] - minval[nonconst]) / diff[nonconst]) - 1.0
    
    return normed_X

def write_df_into_file(df, filepath):

    if df.empty:
        with uproot.recreate(filepath) as file:
            file['myEvents'] = {}

    else:
        df_drop = df.drop(columns=['sample', 'subsample'])        
        data_dict = df_drop.to_dict('list')
        with uproot.recreate(filepath) as file:
            file['myEvents'] = data_dict

print('Functions loaded.')

Functions loaded.


In [4]:
%%time

import tensorflow as tf
import scipy.sparse as sparse #for numpy.array - pd.dataframe column conversion
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve,auc
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

list_of_files = os.listdir(os.path.join(indir, jobname))
#train_var = ['njet', 'nbjet', 'dilep_mt', 'dilep_dR', 'HTMETllpt', 'STfrac', 'dphi_metdilep', 'dphi_metlep_max', 'dphi_metlep_min']
train_var = ['njet', 'dilep_mt', 'dilep_dR', 'dilep_dphi', 'HTMETllpt', 'HT', 'STfrac', 'dphi_metlep0', 'dphi_metdilep', 'dphi_metlep_max', 'dphi_metlep_min']
# The list of training variables has to match with the trainning part.

for f in list_of_files: 

    #if 'VLLD_mu' not in f: continue 

    #Step1: Prepare the dataframe
    print(f'Loading file: {f}')
    filepath = os.path.join(indir, jobname, f)
    sample = filepath.split("_")[1]
    subsample = "_".join(filepath.split("_")[2:])
    outdir = f'../../input_trees_modified/{jobname}'
    #outdir = 'testdump'
    os.makedirs(outdir, exist_ok=True)    
    outfile = os.path.join(outdir, f)
    
    #if os.path.exists(outfile): continue
    
    df = read_file_into_df(filepath)
    if df.empty : 
        print(f"\033[0;31mWarning: Empty file written: {outfile}\033[0m\n")
        
        continue

    #Step2: Turn it into X matrix, y will be predicted by the models:
    X= df[train_var].values
    
    #Step3 Load the model and evaulate:
    for modelname, scorename in modeldict.items():
        model_filename = f'{modelname}/model_{modelname}.h5'
        min_filename = f'{modelname}/scaling_parameters_min.txt'
        max_filename = f'{modelname}/scaling_parameters_max.txt'
        X = ApplyMinMax(X, min_filename, max_filename)
        print('X is scaled with min-max values.')
        
        mymodel = tf.keras.models.load_model(model_filename)
        mymodel.load_weights(model_filename)
        print(f'Model loaded: {modelname}')

        #print(X)
        y= mymodel.predict(X)
        #print(y)
        df[scorename] = y

        break #model

    write_df_into_file(df, os.path.join(outdir, f))
    print(f'\033[1;32mFile written: {outfile}\033[0m\n')

    #break #file

2024-09-24 19:31:22.841379: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading file: tree_DYJetsToLL_M10to50.root
X is scaled with min-max values.
Model loaded: QCD-VLLD_mu-classifier


2024-09-24 19:31:24.927096: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


[1;32mFile written: ../../input_trees_modified/tree_2LSSinclusive_baseline_Sept17/tree_DYJetsToLL_M10to50.root[0m

Loading file: tree_DYJetsToLL_M50.root
X is scaled with min-max values.
Model loaded: QCD-VLLD_mu-classifier
[1;32mFile written: ../../input_trees_modified/tree_2LSSinclusive_baseline_Sept17/tree_DYJetsToLL_M50.root[0m

Loading file: tree_EGamma_EGamma_A.root
X is scaled with min-max values.
Model loaded: QCD-VLLD_mu-classifier
[1;32mFile written: ../../input_trees_modified/tree_2LSSinclusive_baseline_Sept17/tree_EGamma_EGamma_A.root[0m

Loading file: tree_EGamma_EGamma_B.root
X is scaled with min-max values.
Model loaded: QCD-VLLD_mu-classifier
[1;32mFile written: ../../input_trees_modified/tree_2LSSinclusive_baseline_Sept17/tree_EGamma_EGamma_B.root[0m

Loading file: tree_EGamma_EGamma_C.root
X is scaled with min-max values.
Model loaded: QCD-VLLD_mu-classifier
[1;32mFile written: ../../input_trees_modified/tree_2LSSinclusive_baseline_Sept17/tree_EGamma_EGamma_C