# Python based analysis

This script takes a set of trees as input and writes another set of trees as output.<br>
The trees are filtered in between.

In [1]:
import os, sys
import numpy as np
import uproot
import pandas as pd

In [2]:
#Global parameters
# Names defined here are read by the cells further down.

# Directory that holds one sub-directory of trees per job.
basedir = '../input_trees_modified'
#injob  = 'tree_2LSS_2016preVFP_baseline_Nov08'
# Sub-directory of basedir the input trees are read from.
injob  = 'tree_2LSS_2018UL_baseline_Nov12'
# Sub-directory of basedir the filtered trees are written to.
# NOTE(review): the name says "lowHTcr", but the filtering cell currently selects
# `sig_enhanced1` and the saved cell output was written under
# 'tree_2LSS_2018UL_SE1_Nov17' -- confirm the name matches the selection before running.
outjob = 'tree_2LSS_2018UL_lowHTcr_Nov18'
#channelno = 0 #mumu
#channelno = 3 #ee
# Passed to extractHistFromTree.py through its --campaign option.
campaign = '2018_UL'

indir = os.path.join(basedir, injob)
outdir = os.path.join(basedir, outjob)
# Create the output directory up front; an already existing directory is reused.
os.makedirs(outdir, exist_ok=True) 

In [3]:
def read_file_into_df(filepath):
    """Load the 'myEvents' tree of a ROOT file into a pandas DataFrame.

    Parameters
    ----------
    filepath : str
        Path of the ROOT file to read.

    Returns
    -------
    pandas.DataFrame
        One row per tree entry, built from all branches of the tree.
        NOTE(review): for a tree without entries the record list is empty, so
        the returned frame presumably has no columns at all -- callers should
        check `df.empty` before accessing columns.
    """
    # Read everything inside a context manager so the file handle is released
    # as soon as the arrays are in memory, instead of leaking one open file
    # per input processed in the loop.
    with uproot.open(filepath) as tfile:
        ttree = tfile['myEvents']
        branches = ttree.keys()
        awkarray = ttree.arrays(branches)

    df = pd.DataFrame(awkarray.to_list())

    #Not filtering by channels right now.
    #if 'channel' in df.columns: df = df[df['channel'] == channelno]
    return df

def write_df_into_file(df, filepath):
    """Store a DataFrame as the 'myEvents' tree of a newly (re)created ROOT file.

    Parameters
    ----------
    df : pandas.DataFrame
        Events to write; every column becomes one entry of the written mapping.
    filepath : str
        Destination path; the file is recreated on every call.

    An empty frame is still written: each column is stored as a zero-length
    array carrying the column's dtype, and a warning is printed.
    """
    if not df.empty:
        branches = df.to_dict('list')
    else:
        # Zero-length arrays keep the dtype of each column in the empty output.
        branches = {}
        for column in df.columns:
            branches[column] = np.array([], dtype=df[column].dtype)
        print(f"\033[0;31mWarning: Writing empty file: {filepath}\033[0m\n")

    with uproot.recreate(filepath) as rootfile:
        rootfile['myEvents'] = branches
    
# Printed once this cell has executed, i.e. after the helper definitions above ran.
print('Functions loaded.')

Functions loaded.


In [4]:
%%time

# Filter every file found in `indir` with the chosen event selection and write
# the surviving events to a file of the same name in `outdir`.
list_of_files = os.listdir(indir)
list_empty_input = []   # paths of input files whose tree has no entries
list_empty_output = []  # '<sample>_<subsample>' labels left with no event after the selection

for f in list_of_files:

    #if 'SingleMuon' not in f: continue

    #Step1: Prepare the dataframe
    print(f'\nLoading file: {f}')
    filepath = os.path.join(indir, f)
    # The name is split on '_': field 1 is the sample, the rest is the subsample
    # (the subsample therefore keeps the file extension).
    sample = f.split("_")[1]
    subsample = "_".join(f.split("_")[2:])

    outfile = os.path.join(outdir, f)
    #if os.path.exists(outfile): continue
    #if 'WJets' not in f: continue

    df = read_file_into_df(filepath)
    if df.empty:
        print(f"\033[0;31mWarning: Empty input file: {filepath}\033[0m\n")
        list_empty_input.append(filepath)
        # Nothing to select, and the column look-ups below (df['channel'], ...)
        # would presumably raise KeyError on a frame built from zero records,
        # so record the file and move on. No output file is written for it.
        continue

    #Creating working points on the dataframe using the NN.
    # Channels 0/1 are cut on the 'vlldmu' score, channels 2/3 on the 'vlldele' score.
    sel_mu = df['channel'].isin([0, 1])
    sel_ele = df['channel'].isin([2, 3])
    score_mu = df['nnscore_qcd_vlldmu_200_800']
    score_ele = df['nnscore_qcd_vlldele_200_800']

    df['flag_qcd'] = (
        (sel_mu & (score_mu < 0.30)) |
        (sel_ele & (score_ele < 0.30))
    )

    # Fix: the lower bound for channels 2/3 previously tested the muon score
    # (copy-paste slip); both bounds now use the score of the matching channel.
    df['flag_topregion'] = (
        (sel_mu & (score_mu > 0.50) & (score_mu < 0.70)) |
        (sel_ele & (score_ele > 0.50) & (score_ele < 0.70))
    )

    df['flag_searchregion'] = (
        (sel_mu & (score_mu > 0.70)) |
        (sel_ele & (score_ele > 0.70))
    )

    # Put any cuts here on dataframe as you like
    # Define conditions as strings (evaluated later by df.query)
    dy_veto = 'not (channel == 3 and 76 < dilep_mass < 106)'

    #Controlling QCD:
    qcdcr = f'{dy_veto} and flag_qcd and 0.02<lep0_iso<0.15 and lep0_sip3d>5'
    qcdvr = f'{dy_veto} and flag_qcd and 0.02<lep0_iso<0.15 and lep0_sip3d<5'

    #Controlling Drell-Yan:
    dycr = '76<dilep_mass<106  and dilep_ptratio > 0.7'

    #Controlling TTbar:
    tight_sip3d = 'lep0_sip3d<5 and lep1_sip3d<10'
    top_cr = f'flag_topregion and {tight_sip3d} and {dy_veto} and HT>100'
    top_vr = f'flag_searchregion and {tight_sip3d} and {dy_veto} and HT>100 and nbjet>0'

    lowHT_cr = f'flag_topregion and {tight_sip3d} and {dy_veto} and HT<100'

    #Validation of all the corrections:

    #Signal regions:
    sig_enhanced1 = 'flag_searchregion and nbjet == 0 and HT>100'
    #sig_enhanced2 = f'{sig_enhanced1} and lep0_sip3d<5 and lep1_sip3d<10'

    #------------------------------
    # Final event selection:
    # Pick one of the condition strings defined above.
    event_selection = sig_enhanced1
    #------------------------------

    # Use df.query with the combined condition
    df_filtered = df.query(event_selection)

    if df_filtered.empty: list_empty_output.append(f'{sample}_{subsample}')

    nbefore = len(df)
    nafter = len(df_filtered)

    # Percentage of events kept; guarded against a frame without rows.
    frac = nafter*100/nbefore if nbefore != 0 else 0

    #display(df_filtered)

    write_df_into_file(df_filtered, outfile)
    print(f'\033[1;32mFile written: {outfile}\033[0m ({nbefore}->{nafter}, {frac:.2f}%)\n')
    #break #file

print(f'\n\033[1;33mSummary:\033[0m')
print(f'\n\033[33mEmpty input: {list_empty_input}\033[0m')
print(f'\n\033[31mEmpty output: {list_empty_output}\033[0m\n')


Loading file: tree_DYJetsToLL_M10to50.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_2018UL_SE1_Nov17/tree_DYJetsToLL_M10to50.root[0m (257->4, 1.56%)


Loading file: tree_DYJetsToLL_M50.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_2018UL_SE1_Nov17/tree_DYJetsToLL_M50.root[0m (116185->3945, 3.40%)


Loading file: tree_EGamma_EGamma_A.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_2018UL_SE1_Nov17/tree_EGamma_EGamma_A.root[0m (65324->1966, 3.01%)


Loading file: tree_EGamma_EGamma_B.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_2018UL_SE1_Nov17/tree_EGamma_EGamma_B.root[0m (33016->974, 2.95%)


Loading file: tree_EGamma_EGamma_C.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_2018UL_SE1_Nov17/tree_EGamma_EGamma_C.root[0m (32905->973, 2.96%)


Loading file: tree_EGamma_EGamma_D.root
[1;32mFile written: ../input_trees_modified/tree_2LSS_2018UL_SE1_Nov17/tree_EGamma_EGamma_D.root[0m (150874->4357, 2.89%)


Loading fi

### Convert them into histograms

In [5]:
%%time

# Build one extractHistFromTree.py command per channel.
# run_here = True  -> the commands are executed from this notebook.
# run_here = False -> the commands are only printed so they can be run by hand.
run_here = False

channels = ['mm', 'me', 'em', 'ee']
failed_channels = []  # channels whose command returned a non-zero exit status

for channel_ in channels:
    jobname_ = outjob
    campaign_ = campaign
    # Histogram dump name: the tree job name with 'tree' replaced by 'hist', plus the channel.
    dump_ = outjob.replace('tree', 'hist')+'_'+channel_
    command = f'python3 extractHistFromTree.py --jobname {jobname_} --dump {dump_} --channel {channel_} --campaign {campaign_}'

    if run_here:
        print(f'\033[93m\nRunning ..\n{command}\033[0m\n')
        # os.system returns the exit status of the command; anything non-zero
        # is reported instead of being silently ignored.
        status = os.system(command)
        if status != 0:
            failed_channels.append(channel_)
            print(f'\033[0;31mWarning: command for channel {channel_} exited with status {status}\033[0m\n')
    else:
        # Announce first, then show the command, so the output reads in order.
        print(f'\033[93m\nRun it yourself:\n{command}\033[0m\n')

if failed_channels:
    print(f'\033[0;31mFailed channels: {failed_channels}\033[0m')
print('Done!')

[93m
Runnning .. 
python3 extractHistFromTree.py --jobname tree_2LSS_2018UL_SE1_Nov17 --dump hist_2LSS_2018UL_SE1_Nov17_mm --channel mm --campaign 2018_UL[0m

Run it yourself:
[93m
Runnning .. 
python3 extractHistFromTree.py --jobname tree_2LSS_2018UL_SE1_Nov17 --dump hist_2LSS_2018UL_SE1_Nov17_me --channel me --campaign 2018_UL[0m

Run it yourself:
[93m
Runnning .. 
python3 extractHistFromTree.py --jobname tree_2LSS_2018UL_SE1_Nov17 --dump hist_2LSS_2018UL_SE1_Nov17_em --channel em --campaign 2018_UL[0m

Run it yourself:
[93m
Runnning .. 
python3 extractHistFromTree.py --jobname tree_2LSS_2018UL_SE1_Nov17 --dump hist_2LSS_2018UL_SE1_Nov17_ee --channel ee --campaign 2018_UL[0m

Run it yourself:
Done!
CPU times: user 163 μs, sys: 12 μs, total: 175 μs
Wall time: 166 μs
