# Python based analysis

This notebook takes trees as input and writes another set of trees as output.<br>
The events in each tree are filtered in between.

In [1]:
import os, sys
import numpy as np
import uproot
import pandas as pd

In [2]:
# --- Global parameters ---
# All job folders (input and output) sit under this base directory.
basedir = '../input_trees_modified'

# Job to read from.
# Alternative value: 'tree_2LSSinclusive_baseline_Sept17_evalSept30'
injob = 'tree_2LSS_2018UL_baseline_Oct16_evalOct21'

# Job to write to.
# Alternative value: 'tree_2LSS_Baseline_Inclusive_Sept30'
outjob = 'tree_2LSS_topCR_Oct24'

# Channel selection is currently disabled. Codes noted so far: 0 = mumu, 3 = ee.
# channelno = 0
# channelno = 3

# Resolve both job folders in one go, then make sure the output one exists.
indir, outdir = (os.path.join(basedir, job) for job in (injob, outjob))
os.makedirs(outdir, exist_ok=True)

In [3]:
def read_file_into_df(filepath):
    """Load every branch of the 'myEvents' tree of a file into a DataFrame.

    Parameters
    ----------
    filepath : str
        Path passed to ``uproot.open``.

    Returns
    -------
    pandas.DataFrame
        Built from the list of records obtained from the tree's arrays, with
        one column per key reported by the tree. If that list is empty, the
        resulting frame has neither rows nor columns.
    """
    # Open the file through a context manager so the handle is released as
    # soon as the data is in memory. This function is called once per input
    # file, so unclosed handles would otherwise accumulate.
    with uproot.open(filepath) as tfile:
        ttree = tfile['myEvents']
        branches = ttree.keys()
        awkarray = ttree.arrays(branches)
        # Build the frame while the file is still open, so nothing returned
        # from here depends on the handle.
        df = pd.DataFrame(awkarray.to_list())

    #Not filtering by channels right now.
    #if 'channel' in df.columns: df = df[df['channel'] == channelno]
    return df

def write_df_into_file(df, filepath):
    """Store a DataFrame as the 'myEvents' tree of a newly created file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to write; every column becomes one entry of the tree payload.
    filepath : str
        Destination path passed to ``uproot.recreate``.

    A frame without rows is written as zero-length NumPy arrays that keep
    each column's dtype, and a red warning is printed. Otherwise the columns
    are written as plain Python lists.
    """
    has_rows = not df.empty
    if has_rows:
        columns_payload = df.to_dict('list')
    else:
        # No rows: keep the column names and dtypes via empty typed arrays.
        columns_payload = {}
        for name in df.columns:
            columns_payload[name] = np.array([], dtype=df[name].dtype)
        print(f"\033[0;31mWarning: Writing empty file: {filepath}\033[0m\n")

    with uproot.recreate(filepath) as rootfile:
        rootfile['myEvents'] = columns_payload
    
# Visible confirmation that the helper definitions in this cell were executed.
print('Functions loaded.')

Functions loaded.


In [4]:
%%time

# Only ROOT files are processed, in sorted order, so that stray directory
# entries are not handed to the reader and the log order is reproducible.
list_of_files = sorted(name for name in os.listdir(indir) if name.endswith('.root'))
list_empty_input = []
list_empty_output = []

# Thresholds on the QCD neural-network score that define the working points.
nn_cut_qcd = 0.30      # score below this -> flag_qcd
nn_cut_search = 0.70   # score above this -> flag_searchregion

# Put any cuts here on dataframe as you like.
# The selections are plain query strings and do not depend on the file being
# processed, so they are defined once before the loop.
qcdcr = 'flag_qcd and 0.02<lep0_iso<0.15 and lep0_sip3d>5'
qcdvr = 'not flag_qcd and not flag_searchregion and 0.02<lep0_iso<0.15 and lep0_sip3d>5'
DY_enhanced = '76<dilep_mass<106  and dilep_ptratio > 0.7'

qcdveto = 'ST > 150'
drellyan_veto = 'channel == 3 and 76 < dilep_mass < 106'
search_region = f'flag_searchregion and {qcdveto}'

top_enhanced = f'{search_region} and nbjet>0'
topcr = f'{top_enhanced} and HT>200'

sig_enhanced1 = f'{search_region} and nbjet == 0'
sig_enhanced2 = f'{sig_enhanced1} and lep0_sip3d<5 and lep1_sip3d<10'

# Selection actually applied to every file in this run.
event_selection = topcr

for f in list_of_files:

    # Optional debugging filters, e.g.:
    #if 'SingleMuon' not in f: continue
    #if 'WJets' not in f: continue

    #Step1: Prepare the dataframe
    print(f'\nLoading file: {f}')
    filepath = os.path.join(indir, f)
    sample = f.split("_")[1]
    subsample = "_".join(f.split("_")[2:])

    outfile = os.path.join(outdir, f)
    #if os.path.exists(outfile): continue

    df = read_file_into_df(filepath)
    if df.empty:
        print(f"\033[0;31mWarning: Empty input file: {filepath}\033[0m\n")
        list_empty_input.append(filepath)
        # A frame built from zero records has no columns, so the column
        # look-ups below would raise KeyError and abort the whole loop.
        # Record the file and move on; no output file is written for it.
        continue

    #Creating working points on the dataframe using the NN.
    # Channels 0/1 are evaluated with the 'vlldmu' score, channels 2/3 with
    # the 'vlldele' score.
    uses_mu_score = df['channel'].isin([0, 1])
    uses_ele_score = df['channel'].isin([2, 3])
    score_mu = df['nnscore_qcd_vlldmu_200_800']
    score_ele = df['nnscore_qcd_vlldele_200_800']

    df['flag_qcd'] = (
        (uses_mu_score & (score_mu < nn_cut_qcd)) |
        (uses_ele_score & (score_ele < nn_cut_qcd))
    )

    df['flag_searchregion'] = (
        (uses_mu_score & (score_mu > nn_cut_search)) |
        (uses_ele_score & (score_ele > nn_cut_search))
    )

    # Use df.query with the combined condition
    df_filtered = df.query(event_selection)

    if df_filtered.empty: list_empty_output.append(f'{sample}_{subsample}')

    nbefore = len(df)
    nafter = len(df_filtered)

    # Percentage of events surviving the selection (guarded against 0 input rows).
    frac = 0
    if nbefore != 0: frac = nafter*100/nbefore

    #display(df_filtered)

    write_df_into_file(df_filtered, outfile)
    print(f'\033[1;32mFile written: {outfile}\033[0m ({nbefore}->{nafter}, {frac:.2f}%)\n')
    #break #file

print(f'\n\033[1;33mSummary:\033[0m')
print(f'\n\033[33mEmpty input: {list_empty_input}\033[0m')
print(f'\n\033[31mEmpty output: {list_empty_output}\033[0m\n')


Loading file: tree_DYJetsToLL_M10to50.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_topCR_Oct24/tree_DYJetsToLL_M10to50.root[0m (257->1, 0.39%)


Loading file: tree_DYJetsToLL_M50.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_topCR_Oct24/tree_DYJetsToLL_M50.root[0m (116185->158, 0.14%)


Loading file: tree_EGamma_EGamma_A.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_topCR_Oct24/tree_EGamma_EGamma_A.root[0m (65324->442, 0.68%)


Loading file: tree_EGamma_EGamma_B.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_topCR_Oct24/tree_EGamma_EGamma_B.root[0m (33016->198, 0.60%)


Loading file: tree_EGamma_EGamma_C.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_topCR_Oct24/tree_EGamma_EGamma_C.root[0m (32905->208, 0.63%)


Loading file: tree_EGamma_EGamma_D.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_topCR_Oct24/tree_EGamma_EGamma_D.root[0m (150874->947, 0.63%)


Loading file: tree_Higgs_bbH_HToZZTo4L.root

### Convert the filtered trees into histograms

In [5]:
%%time

# Set to True to launch the histogram extraction from this notebook;
# otherwise the command is only printed so it can be run by hand.
run_here = False

channel_ = 'mm'
jobname_ = outjob
# Name of the histogram dump: the output job name with 'tree' replaced by
# 'hist', followed by the channel tag.
dump_ = outjob.replace('tree', 'hist')+'_'+channel_
command = f'python3 extractHistFromTree.py --jobname {jobname_} --dump {dump_} --channel {channel_}'
print(command)

if run_here:
    # os.system hands back the command's status; a non-zero value means it
    # did not finish successfully, so report it instead of ignoring it.
    status = os.system(command)
    if status != 0:
        print(f'\033[0;31mWarning: command returned non-zero status {status}\033[0m')
else: print('Run it yourself:')

print('Done!')

python3 extractHistFromTree.py --jobname tree_2LSS_topCR_Oct24 --dump hist_2LSS_topCR_Oct24_mm --channel mm
Run it yourself:
Done!
CPU times: user 0 ns, sys: 222 μs, total: 222 μs
Wall time: 213 μs
