# Python based analysis

This script takes trees as inputs and writes another set of trees as outputs.<br>
The trees are filtered in between.

In [1]:
import os, sys
import numpy as np
import uproot
import pandas as pd

In [2]:
# Global parameters: everything a reader may want to tune lives in this cell.

campaign = '2018_UL'
basedir = '../input_files/trees_modified'

# Name of the job whose trees are read, and of the job the filtered trees are written to.
injob, outjob = 'tree_2018UL_baseline_Dec30', 'tree_2018UL_sr_Dec30'

# Channel number is currently left unset (both options stay commented out).
#channelno = 0 #mumu
#channelno = 3 #ee

# Input and output directories sit side by side under basedir.
indir, outdir = (os.path.join(basedir, job) for job in (injob, outjob))

# Make sure the output directory exists; an already existing one is fine.
os.makedirs(outdir, exist_ok=True)

In [3]:
def read_file_into_df(filepath):
    """Read the 'myEvents' tree of a ROOT file into a pandas DataFrame.

    Parameters
    ----------
    filepath : str
        Path of the ROOT file to open.

    Returns
    -------
    pandas.DataFrame
        One row per tree entry, one column per branch. When the tree has no
        entries the frame is empty but still carries one column per branch,
        so that column look-ups downstream do not raise KeyError.
    """
    # The context manager closes the file handle as soon as the arrays are in
    # memory; otherwise one handle per input file stays open during the loop.
    with uproot.open(filepath) as tfile:
        ttree = tfile['myEvents']
        branches = ttree.keys()
        records = ttree.arrays(branches).to_list()

        if records:
            df = pd.DataFrame(records)
        else:
            # A DataFrame built from an empty record list has no columns at all.
            # Rebuild the schema from per-branch NumPy arrays instead.
            # NOTE(review): assumes flat branches, for which library='np' yields
            # typed empty arrays -- confirm for trees with jagged branches.
            empty_arrays = ttree.arrays(branches, library='np')
            df = pd.DataFrame({name: empty_arrays[name] for name in branches})

    #Not filtering by channels right now.
    #if 'channel' in df.columns: df = df[df['channel'] == channelno]
    return df

def write_df_into_file(df, filepath):
    """Write a DataFrame to `filepath` as a ROOT file holding one tree, 'myEvents'.

    The file is (re)created on every call. Each DataFrame column becomes one
    entry of the dictionary assigned to 'myEvents'. An empty frame is written
    as zero-length arrays that keep each column's dtype, and a red warning is
    printed.
    """
    has_content = not df.empty

    if has_content:
        branches = df.to_dict('list')
    else:
        # Zero-length arrays typed like the corresponding DataFrame columns.
        branches = {}
        for name in df.columns:
            branches[name] = np.array([], dtype=df[name].dtype)
        print(f"\033[0;31mWarning: Writing empty file: {filepath}\033[0m\n")

    with uproot.recreate(filepath) as rootfile:
        rootfile['myEvents'] = branches

print('Functions loaded.')

Functions loaded.


In [4]:
%%time

# Process the input trees one file at a time:
#   read -> add NN working-point flags -> apply the event selection -> write.
# Sorted so the processing order (and the log) is reproducible; restricted to
# ROOT files so stray entries in the directory are not handed to uproot.
list_of_files = sorted(name for name in os.listdir(indir) if name.endswith('.root'))
list_empty_input = []   # paths of input files whose tree has no entries
list_empty_output = []  # '<sample>_<subsample>' labels with no event left after selection

for f in list_of_files:

    # Optional quick filters while debugging, e.g.:
    #if 'SingleMuon' not in f: continue
    #if 'WJets' not in f: continue

    #Step1: Prepare the dataframe
    print(f'\nLoading file: {f}')
    filepath = os.path.join(indir, f)
    # File names are split on '_': second token is the sample, the rest the subsample.
    sample = f.split("_")[1]
    subsample = "_".join(f.split("_")[2:])

    outfile = os.path.join(outdir, f)
    #if os.path.exists(outfile): continue

    df = read_file_into_df(filepath)
    if df.empty:
        print(f"\033[0;31mWarning: Empty input file: {filepath}\033[0m\n")
        list_empty_input.append(filepath)
        # An empty frame may come back without any columns; the column
        # look-ups below would then raise KeyError, so skip such a file.
        if 'channel' not in df.columns:
            print(f"\033[0;31mSkipping {f}: empty input without a 'channel' column.\033[0m\n")
            continue

    #Step2: Create working points on the dataframe using the NN.
    # Channels {0, 1} are scored with the 'vlldmu' network and {2, 3} with the
    # 'vlldele' network (presumably muon- and electron-led channels -- confirm).
    is_mu  = df['channel'].isin([0, 1])
    is_ele = df['channel'].isin([2, 3])
    nn_mu  = df['nnscore_qcd_vlldmu_200_800']
    nn_ele = df['nnscore_qcd_vlldele_200_800']

    df['flag_qcd'] = (
        (is_mu  & (nn_mu  < 0.30)) |
        (is_ele & (nn_ele < 0.30))
    )

    # Both bounds of the electron-channel window must use the electron score;
    # the lower bound used to test the muon score by mistake.
    df['flag_topregion'] = (
        (is_mu  & (nn_mu  > 0.50) & (nn_mu  < 0.70)) |
        (is_ele & (nn_ele > 0.50) & (nn_ele < 0.70))
    )

    df['flag_searchregion'] = (
        (is_mu  & (nn_mu  > 0.70)) |
        (is_ele & (nn_ele > 0.70))
    )

    #Step3: Define the candidate selections as df.query strings.
    # Only `event_selection` below is applied; the others are kept so the
    # selection can be switched by editing a single line.
    dy_veto = 'not (channel == 3 and 76 < dilep_mass < 106)'

    # Controlling QCD:
    qcdcr = f'{dy_veto} and flag_qcd and 0.02<lep0_iso<0.15 and lep0_sip3d>5'
    qcdvr = f'{dy_veto} and flag_qcd and 0.02<lep0_iso<0.15 and lep0_sip3d<5'

    # Controlling Drell-Yan:
    dycr  = '76<dilep_mass<106  and dilep_ptratio > 0.7'

    tight_sip3d =  'lep0_sip3d<5 and lep1_sip3d<10'
    qcd_veto = f'{tight_sip3d} and {dy_veto} and not flag_qcd and HT>50'

    # Controlling TTbar:
    top_cr = f'{qcd_veto} and flag_topregion'
    top_vr = f'{qcd_veto} and flag_searchregion and nbjet>0'

    # Signal region:
    sig_region = f'{qcd_veto} and flag_searchregion and nbjet == 0'

    #------------------------------
    # Final event selection:
    event_selection = sig_region
    #------------------------------

    #Step4: Apply the selection and write the result.
    df_filtered = df.query(event_selection)

    if df_filtered.empty: list_empty_output.append(f'{sample}_{subsample}')

    nbefore = len(df)
    nafter = len(df_filtered)

    # Percentage of events surviving; guarded against division by zero.
    frac = 0
    if nbefore != 0: frac = nafter*100/nbefore

    write_df_into_file(df_filtered, outfile)
    print(f'\033[1;32mFile written: {outfile}\033[0m ({nbefore}->{nafter}, {frac:.2f}%)\n')
    #break #file

print(f'\n\033[1;33mSummary:\033[0m')
print(f'\n\033[33mEmpty input: {list_empty_input}\033[0m')
print(f'\n\033[31mEmpty output: {list_empty_output}\033[0m\n')


Loading file: tree_DYJetsToLL_M10to50.root
[1;32mFile written: ../input_files/trees_modified/tree_2018UL_sr_Dec30/tree_DYJetsToLL_M10to50.root[0m (257->7, 2.72%)


Loading file: tree_DYJetsToLL_M50.root
[1;32mFile written: ../input_files/trees_modified/tree_2018UL_sr_Dec30/tree_DYJetsToLL_M50.root[0m (116185->686, 0.59%)


Loading file: tree_EGamma_EGamma_A.root
[1;32mFile written: ../input_files/trees_modified/tree_2018UL_sr_Dec30/tree_EGamma_EGamma_A.root[0m (65324->956, 1.46%)


Loading file: tree_EGamma_EGamma_B.root
[1;32mFile written: ../input_files/trees_modified/tree_2018UL_sr_Dec30/tree_EGamma_EGamma_B.root[0m (33016->461, 1.40%)


Loading file: tree_EGamma_EGamma_C.root
[1;32mFile written: ../input_files/trees_modified/tree_2018UL_sr_Dec30/tree_EGamma_EGamma_C.root[0m (32905->493, 1.50%)


Loading file: tree_EGamma_EGamma_D.root
[1;32mFile written: ../input_files/trees_modified/tree_2018UL_sr_Dec30/tree_EGamma_EGamma_D.root[0m (150874->2192, 1.45%)


Loading file

### Convert them into histograms

In [5]:
%%time

# Build (and optionally run) one ROOT command per channel that turns the
# filtered trees of `outjob` into histograms via extractHistsFromTrees.C.
# With run_here = False the commands are only printed, to be run by hand.
run_here = False

channels = ['mm', 'me', 'em', 'ee']
failed_channels = []  # channels whose ROOT command returned a non-zero status

for channel_ in channels:
    jobname_ = outjob
    campaign_ = campaign
    # Histogram dump name: the output job name with 'tree' replaced by 'hist', plus the channel.
    dump_ = outjob.replace('tree', 'hist')+'_'+channel_
    #command = f'python3 extractHistsFromTrees.py --jobname {jobname_} --dump {dump_} --channel {channel_} --campaign {campaign_}'
    arguments = f'extractHistsFromTrees.C("{jobname_}", "{dump_}", "{campaign_}", "{channel_}")'
    command = f"root -q -b -l '{arguments}'"
    print(f'\033[93m\nRunnning .. \n{command}\033[0m\n')

    if run_here:
        # os.system returns the command's exit status; remember failures
        # instead of silently reporting success at the end.
        status = os.system(command)
        if status != 0: failed_channels.append(channel_)
    else: print('Run it yourself:')

if failed_channels:
    print(f'\033[0;31mWarning: command failed for channels: {failed_channels}\033[0m')
print('Done!')

[93m
Runnning .. 
root -q -b -l 'extractHistsFromTrees.C("tree_2018UL_sr_Dec30", "hist_2018UL_sr_Dec30_mm", "2018_UL", "mm")'[0m

Run it yourself:
[93m
Runnning .. 
root -q -b -l 'extractHistsFromTrees.C("tree_2018UL_sr_Dec30", "hist_2018UL_sr_Dec30_me", "2018_UL", "me")'[0m

Run it yourself:
[93m
Runnning .. 
root -q -b -l 'extractHistsFromTrees.C("tree_2018UL_sr_Dec30", "hist_2018UL_sr_Dec30_em", "2018_UL", "em")'[0m

Run it yourself:
[93m
Runnning .. 
root -q -b -l 'extractHistsFromTrees.C("tree_2018UL_sr_Dec30", "hist_2018UL_sr_Dec30_ee", "2018_UL", "ee")'[0m

Run it yourself:
Done!
CPU times: user 134 μs, sys: 0 ns, total: 134 μs
Wall time: 128 μs
