# Reading into histograms

In [1]:
import os, sys
import json
import numpy as np
import pandas as pd
import ROOT

print('Modules loaded.')

Welcome to JupyROOT 6.26/10
Modules loaded.


In [2]:
# Input bookkeeping: the JSON maps sample -> subsample -> luminosity value
# (that is how the reading loop further down consumes it).
jsonfile2018 = '../../InputJsons/lumidata_2018.json'
with open(jsonfile2018, 'r') as infile:
    filedict = json.load(infile)

# Which histogram to read, from which job output(s), and how to rebin it.
jobnames = ["hist_2LSS_SE2_Oct03_mm"]
histname = 'nnscore_qcd_vlld_combined'
rebin    = 5

# Sample names that are treated as signal rather than background.
signals  = ['VLLD_ele', 'VLLD_mu']

In [3]:
# Signal bookkeeping, one (mass, xsec, ngen) row per mass point and flavour.
# NOTE(review): units are not stated in this notebook (mass presumably in GeV,
# xsec presumably in pb) -- confirm against the sample database.
_signal_points = {
    'VLLD_ele': [
        (100,  16.9,     110871),
        (200,  1.36,     73730),
        (300,  0.291,    24753),
        (400,  0.0907,   24491),
        (600,  0.0149,   24611),
        (800,  0.00347,  23680),
        (1000, 0.000971, 24286),
    ],
    'VLLD_mu': [
        (100,  16.9,     111926),
        (200,  1.36,     73908),
        (300,  0.291,    25022),
        (400,  0.0907,   24299),
        (600,  0.0149,   24890),
        (800,  0.00347,  24763),
    ],
}

# sigdict[flavour]['M<mass>'] -> {'mass', 'xsec', 'ngen'}; the reading loop later
# attaches 'yields', 'errors' and 'integrals' to each entry.
sigdict = {
    flavour: {
        f'M{mass}': {'mass': mass, 'xsec': xsec, 'ngen': ngen}
        for mass, xsec, ngen in rows
    }
    for flavour, rows in _signal_points.items()
}

# Filled by the reading loop: bkgdict[sample][subsample] -> yields/errors/integrals.
bkgdict = {}

def set_last_bin_as_overflow(hst):
    """Fold the overflow into the last bin and the underflow into the first bin.

    The histogram is modified in place. For each edge bin the content becomes
    ``edge + flow`` and the error becomes the quadrature sum of the two bin
    *errors*. (Adding the flow bin's content in quadrature, as was done before,
    overstates the uncertainty: for an unweighted bin the error is the square
    root of the content, not the content itself.)

    The under/overflow bins themselves are left untouched, so calling this
    twice on the same histogram adds them twice.

    Parameters
    ----------
    hst : object with the TH1-style accessors ``GetNbinsX``, ``GetBinContent``,
        ``GetBinError``, ``SetBinContent`` and ``SetBinError``. Bin 0 is read as
        the underflow and bin ``GetNbinsX() + 1`` as the overflow.
    """
    def _fold(edge_bin, flow_bin):
        # Read both values before writing anything back to the edge bin.
        merged_content = hst.GetBinContent(edge_bin) + hst.GetBinContent(flow_bin)
        merged_error   = (hst.GetBinError(edge_bin)**2 + hst.GetBinError(flow_bin)**2)**0.5
        hst.SetBinContent(edge_bin, merged_content)
        hst.SetBinError(edge_bin, merged_error)

    last_bin = hst.GetNbinsX()
    _fold(last_bin, last_bin + 1)  # overflow  -> last bin
    _fold(1, 0)                    # underflow -> first bin

print('Functions loaded.')

Functions loaded.


In [4]:
# Integrated luminosity that every sample is normalised to.
# NOTE(review): presumably the 2018 dataset in pb^-1 -- confirm the unit matches the lumi JSON.
target_lumi = 59800

for sample, subs in filedict.items():
    if sample not in bkgdict: bkgdict[sample] = {}

    # SingleMuon / EGamma entries are not read here (presumably collision data);
    # they keep the empty bkgdict entry created above.
    if 'SingleMuon' in sample or 'EGamma' in sample: continue

    for subsample, lumi in subs.items():
        if subsample not in bkgdict[sample]: bkgdict[sample][subsample] = {}

        # Per-bin values are appended job after job, so with several entries in
        # `jobnames` these lists hold the jobs' bins back to back.
        yields = []
        errors = []
        integrals = []

        for job in jobnames:
            filepath = os.path.join('../input_hists', job, f'hst_{sample}_{subsample}.root')
            if not os.path.exists(filepath): continue

            tfile = ROOT.TFile(filepath)
            try:
                hist = tfile.Get(histname)
                # PyROOT returns a null (falsy) object when the key is missing or
                # the file is unreadable; calling methods on it would fail.
                if not hist:
                    print(f"Warning: histogram '{histname}' not found in {filepath}; skipping this file.")
                    continue

                set_last_bin_as_overflow(hist)

                # Signals: equivalent luminosity = generated events / cross-section.
                # Everything else uses the value from the JSON.
                if sample in signals:
                    sample_lumi = sigdict[sample][subsample]['ngen']/sigdict[sample][subsample]['xsec']
                else:
                    sample_lumi = lumi

                hist.Scale(target_lumi/sample_lumi)
                hist.Rebin(rebin)
                integrals.append(hist.Integral())

                # Bins 1..N only; under/overflow were folded into the edge bins above.
                for ibin in range(1, hist.GetNbinsX() + 1):
                    yields.append(hist.GetBinContent(ibin))
                    errors.append(hist.GetBinError(ibin))
            finally:
                # The histogram is owned by the file, so close only after reading it,
                # and close even if something above raised.
                tfile.Close()

        if 'VLL' not in sample:
            bkgdict[sample][subsample]['yields'] = yields
            bkgdict[sample][subsample]['errors'] = errors
            bkgdict[sample][subsample]['integrals'] = integrals

        if sample in signals:
            if sample not in sigdict:            sigdict[sample] = {}
            if subsample not in sigdict[sample]: sigdict[sample][subsample] = {}
            sigdict[sample][subsample]['yields'] = yields
            sigdict[sample][subsample]['errors'] = errors
            sigdict[sample][subsample]['integrals'] = integrals
            # Spot-check of one signal point.
            if sample == 'VLLD_mu' and subsample=='M100': print(yields, errors, integrals)
        

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.604369163513184, 7.5355706214904785, 8.121240615844727, 16.020751953125, 35.872188568115234, 63.90382766723633, 157.65457153320312, 106.15678405761719, 148.5453338623047, 140.87515258789062, 212.5577850341797, 195.47897338867188, 191.01226806640625, 187.77584838867188, 186.9493408203125, 251.87643432617188, 216.72866821289062, 256.46636962890625, 321.199951171875, 240.88897705078125, 264.70977783203125, 215.229736328125, 293.0583801269531, 262.9051513671875, 230.9696502685547, 162.87310791015625, 210.67849731445312, 216.32093811035156, 105.29950714111328] [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.604369208022463, 7.535570499957379, 8.121240503760431, 11.384601015956521, 16.136382406841793, 22.604668296493273, 35.34644720992406, 28.479790523490554, 34.27403530355917, 33.32769853849903, 40.363257468520395, 38.59216143910314, 38.42231197968886, 38.471461283933465, 37.65473526200857, 43.416359857584965, 40.5086716079

### Extracting background yields

In [5]:
# '<sample>_<subsample>' names that are left out of the background sum.
skip_samples = ['WWZ_WWZJetsTo4L2Nu', 'WJetsNLO_Inclusive', 'WpWp_WpWpJJQCD', 'WGamma_Inclusive']

combined_bkg_yield = None
combined_bkg_error = None

# Sum every non-signal entry of bkgdict bin by bin.
for sample, subs in bkgdict.items():
    if 'VLL' in sample: continue

    for subsample, val in subs.items():

        # Entries that were never filled by the reading loop
        if 'yields' not in val or 'errors' not in val:
            print(f"Warning: 'yields' or 'errors' not found for sample {sample}, subsample {subsample}.")
            continue

        if sample+'_'+subsample in skip_samples:
            print(f'Skipping background: {sample} {subsample}')
            continue

        yields = np.array(val['yields'])
        errors = np.array(val['errors'])

        # No input file was found for this subsample
        if yields.size == 0 or errors.size == 0: continue

        # The first non-empty background fixes the binning
        if combined_bkg_yield is None:
            combined_bkg_yield = np.zeros_like(yields)
            combined_bkg_error = np.zeros_like(errors)

        # Fail with the offending sample named instead of a bare broadcasting error
        if yields.shape != combined_bkg_yield.shape or errors.shape != combined_bkg_error.shape:
            raise ValueError(
                f'Binning mismatch for background {sample} {subsample}: '
                f'got {yields.shape[0]} bins, expected {combined_bkg_yield.shape[0]}.'
            )

        # Yields add linearly, errors in quadrature
        combined_bkg_yield += yields
        combined_bkg_error = np.sqrt(combined_bkg_error**2 + errors**2)

# Without this guard the table below dies with an AttributeError on None.
if combined_bkg_yield is None:
    raise RuntimeError('No background yields were found in bkgdict; check the input histograms.')

# `combined_bkg_yield` / `combined_bkg_error` now hold the binwise totals
print('\n'+'-'*100)
print(f"{'bin':<7} {'nBkg':<7} {'nBkgErr'}")
for i in range(combined_bkg_yield.shape[0]):
    print(f'{i+1:<7} {combined_bkg_yield[i]:<7.2f} {combined_bkg_error[i]:.3f}')

Skipping background: WWZ WWZJetsTo4L2Nu
Skipping background: WJetsNLO Inclusive
Skipping background: WpWp WpWpJJQCD
Skipping background: WGamma Inclusive

----------------------------------------------------------------------------------------------------
bin     nBkg    nBkgErr
1       0.00    0.001
2       0.00    0.000
3       0.06    0.043
4       0.08    0.056
5       0.14    0.072
6       0.18    0.083
7       1.19    0.928
8       1.62    1.127
9       1.91    1.276
10      4.02    2.048
11      2.32    0.286
12      4.95    0.705
13      8.16    0.730
14      15.08   2.023
15      62.24   32.785
16      38.96   6.431
17      51.64   5.293
18      77.01   6.990
19      83.96   5.892
20      106.26  6.850
21      142.09  14.000
22      178.41  32.554
23      342.65  155.505
24      146.25  8.289
25      148.68  13.197
26      152.77  9.559
27      155.94  8.853
28      150.70  9.262
29      193.03  32.671
30      147.29  8.732
31      200.59  33.021
32      160.08  8.145
33      

### Extracting data yields (set equal to the total background for now)

In [6]:
# Placeholder for observed data: a copy of the summed background, so that an
# in-place change to one array cannot silently alter the other.
combined_data = combined_bkg_yield.copy()

# Preparing dataframe that holds yields

In [7]:
def prepare_df(sigdict, combined_data, combined_bkg_yield, combined_bkg_error):
    """Build one table of per-bin yields for every signal point.

    Parameters
    ----------
    sigdict : dict
        ``sigdict[sample][subsample]`` must carry a ``'yields'`` sequence (one
        value per bin). Signal points without it are skipped with a warning, so
        that callers can detect them as missing from the result.
    combined_data : sequence of float
        Observed yield per bin; must be at least as long as ``combined_bkg_yield``.
    combined_bkg_yield, combined_bkg_error : sequence of float
        Total background yield and its error per bin. The number of bins is
        taken from ``combined_bkg_yield``.

    Returns
    -------
    dict
        ``result[sample][subsample]`` is a DataFrame with the columns
        ``bin`` (1-based), ``signal``, ``nObs``, ``nBkg``, ``bkg_err``,
        ``S/sqrtB`` and ``deltaB`` (= 1 + bkg_err/nBkg), sorted by decreasing
        ``S/sqrtB`` and restricted to bins with ``nBkg > 0.1`` and
        ``deltaB < 1.10``. A few signal points additionally require a minimum
        ``S/sqrtB`` (see ``min_significance`` below).

    Notes
    -----
    Signal yields that are exactly zero are replaced by 1e-7. Signal errors are
    not used.
    """
    columns = ['bin', 'signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB', 'deltaB']

    # Minimum S/sqrt(B) for a bin to be kept; points not listed get no such cut.
    min_significance = {
        'VLLD_mu_M100': 1,
        'VLLD_mu_M200': 0.1,
        'VLLD_mu_M300': 0.1,
        'VLLD_mu_M400': 0.01,
        'VLLD_mu_M600': 0.001,
        'VLLD_mu_M800': 0.0001,
    }

    nbins = len(combined_bkg_yield)
    df_hierarchy = {}

    for sample, subs in sigdict.items():
        df_hierarchy[sample] = {}

        for subsample, val in subs.items():
            if 'yields' not in val:
                print(f"Warning: 'yields' not found for signal {sample} {subsample}; skipping.")
                continue

            # Float copy, so the zero replacement below neither truncates on an
            # integer array nor touches the caller's list.
            yields = np.array(val['yields'], dtype=float)
            yields[yields == 0] = 1e-7

            records = []
            for ibin in range(nbins):
                # Bins beyond the signal histogram count as empty
                sig  = yields[ibin] if ibin < len(yields) else 0
                bkg  = combined_bkg_yield[ibin]
                dbkg = combined_bkg_error[ibin] if ibin < len(combined_bkg_error) else 0

                # Significance and relative background error; both stay 0 for
                # bins without background (those fail the nBkg cut below anyway).
                stob = 0
                relative_dbkg = 0
                if bkg != 0:
                    stob = sig / np.sqrt(bkg)
                    relative_dbkg = dbkg / bkg

                records.append({
                    'bin': ibin + 1,
                    'signal': sig,
                    'nObs': combined_data[ibin],
                    'nBkg': bkg,
                    'bkg_err': dbkg,
                    'S/sqrtB': stob,
                    'deltaB': 1 + relative_dbkg,
                })

            # Build the frame once; `columns` keeps it well-formed with zero bins.
            df = pd.DataFrame(records, columns=columns)
            df = df.sort_values(by='S/sqrtB', ascending=False).reset_index(drop=True)

            keep = (df['nBkg'] > 0.1) & (df['deltaB'] < 1.10)
            threshold = min_significance.get(sample + '_' + subsample)
            if threshold is not None:
                keep = keep & (df['S/sqrtB'] > threshold)

            df_hierarchy[sample][subsample] = df.loc[keep].reset_index(drop=True)

    return df_hierarchy

In [8]:
# Build the yield tables; `df` is a nested dict, df[sample][subsample] -> DataFrame.
df = prepare_df(sigdict, combined_data, combined_bkg_yield, combined_bkg_error)
# Spot-check one signal point.
display(df['VLLD_mu']['M600'])

Unnamed: 0,bin,signal,nObs,nBkg,bkg_err,S/sqrtB,deltaB
0,40,1.765579,167.530924,167.530924,6.738142,0.136408,1.04022
1,39,0.586693,193.235768,193.235768,5.137792,0.042205,1.026588
2,37,0.291319,193.772971,193.772971,9.299646,0.020928,1.047992
3,33,0.2401,161.349823,161.349823,7.940673,0.018902,1.049214
4,35,0.240907,181.569012,181.569012,13.900176,0.017878,1.076556
5,38,0.223901,203.790762,203.790762,9.21441,0.015684,1.045215
6,36,0.158996,183.813541,183.813541,9.421964,0.011727,1.051258
7,34,0.142982,167.198069,167.198069,8.299252,0.011058,1.049637
8,28,0.132026,150.700014,150.700014,9.261721,0.010755,1.061458
9,26,0.063981,152.766496,152.766496,9.558797,0.005177,1.062571


In [9]:
# Tab-separated dump of the selected bins for one signal point.
# The first column is the 1-based row position in the filtered table, not the
# histogram bin number stored in the 'bin' column.
shown_columns = ['signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB', 'deltaB']

print(f"{'nbin':<7} {'signal':<7} {'nObs':<7} {'nBkg':<7} {'bkgErr':<7} {'S/sqrtB':<7} {'deltaB'}")
for position, entry in df['VLLD_mu']['M100'].iterrows():
    cells = [f"{position+1}"] + [f"{entry[name]:.2f}" for name in shown_columns]
    print("\t".join(cells))

nbin    signal  nObs    nBkg    bkgErr  S/sqrtB deltaB
1	321.20	147.29	147.29	8.73	26.47	1.06
2	293.06	167.20	167.20	8.30	22.66	1.05
3	264.71	160.08	160.08	8.15	20.92	1.05
4	251.88	155.94	155.94	8.85	20.17	1.06
5	262.91	181.57	181.57	13.90	19.51	1.08
6	157.65	77.01	77.01	6.99	17.97	1.09
7	216.73	150.70	150.70	9.26	17.65	1.06
8	230.97	183.81	183.81	9.42	17.04	1.05
9	215.23	161.35	161.35	7.94	16.94	1.05
10	191.01	146.25	146.25	8.29	15.80	1.06
11	216.32	193.24	193.24	5.14	15.56	1.03
12	187.78	148.68	148.68	13.20	15.40	1.09
13	186.95	152.77	152.77	9.56	15.13	1.06
14	210.68	203.79	203.79	9.21	14.76	1.05
15	148.55	106.26	106.26	6.85	14.41	1.06
16	140.88	142.09	142.09	14.00	11.82	1.10
17	162.87	193.77	193.77	9.30	11.70	1.05
18	106.16	83.96	83.96	5.89	11.59	1.07
19	105.30	167.53	167.53	6.74	8.14	1.04
20	7.54	8.16	8.16	0.73	2.64	1.09


## Writing into text files

In [10]:
def write_df_to_file(df, filename):
    """Dump a yield table to ``filename`` as fixed-width text.

    One header line is followed by one line per row of ``df``. Every column is
    left-aligned in a 10-character field: ``bin`` as an integer, ``deltaB``
    with four decimals and the remaining columns with two decimals.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns bin, signal, nObs, nBkg, bkg_err, S/sqrtB, deltaB.
    filename : str
        Output path; an existing file is overwritten.
    """
    column_names = ('bin', 'signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB', 'deltaB')
    two_decimal_columns = column_names[1:-1]

    with open(filename, 'w') as out:
        out.write(''.join(f"{name:<10}" for name in column_names) + '\n')

        for _, record in df.iterrows():
            cells = [f"{int(record['bin']):<10}"]
            cells.extend(f"{float(record[name]):<10.2f}" for name in two_decimal_columns)
            cells.append(f"{float(record['deltaB']):<10.4f}")
            out.write(''.join(cells) + '\n')

def write_datacard(df, datacard):
    """Write a counting-experiment datacard with one channel per row of ``df``.

    Row ``k`` (1-based position after the index is reset) becomes channel
    ``bin<k>`` with two processes, ``sig`` (process id ``-k``) and ``bkg``
    (process id ``k``). One ``lnN`` nuisance ``xs<k>`` is written per channel;
    it acts only on that channel's background, with size ``deltaB``.
    NOTE(review): the imax/jmax/kmax layout looks like the CMS Combine datacard
    format -- confirm that per-channel process ids are what the tool expects.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``signal``, ``nObs``, ``nBkg`` and ``deltaB``.
    datacard : str
        Output path; the parent directory must already exist.

    Notes
    -----
    * Rates are written with six significant digits. Two fixed decimals would
      turn small signal rates (e.g. 0.004) into 0.00.
    * ``nObs`` is rounded to the nearest integer rather than truncated, so
      non-integer placeholder data is not biased low.
    * If ``df`` has no rows, a warning is printed and no file is written.
    """
    df = df.reset_index(drop=True)
    num_bins = len(df)  # one channel per row

    if num_bins == 0:
        print(f'Warning: Zero bins detected! SKipping file {datacard}')
        return

    def table_row(label, cells):
        # 16-character label column followed by tab-separated cells
        return f"{label:<16}" + "\t".join(cells) + "\n"

    channels = [f"bin{k}" for k in range(1, num_bins + 1)]
    observed = [str(int(round(float(n)))) for n in df['nObs']]

    # The process table has two columns (sig, bkg) per channel.
    channel_pairs, process_names, process_ids, rates = [], [], [], []
    for k, (sig, bkg) in enumerate(zip(df['signal'], df['nBkg']), start=1):
        channel_pairs += [f"bin{k}", f"bin{k}"]
        process_names += ["sig", "bkg"]
        process_ids   += [f"{-1*k}", f"{k}"]
        rates         += [f"{sig:.6g}", f"{bkg:.6g}"]

    with open(datacard, 'w') as f:
        # Header
        f.write(f"imax {num_bins}                          # number of channels\n")
        f.write(f"jmax 1                           # number of backgrounds\n")
        f.write(f"kmax {num_bins}                          # number of nuisance parameters\n")
        f.write("------------\n")

        # Observations
        f.write(table_row('bin', channels))
        f.write(table_row('observation', observed))
        f.write("------------\n")

        # Expected rates per channel and process
        f.write(table_row('bin', channel_pairs))
        f.write(table_row('process', process_names))
        f.write(table_row('process', process_ids))
        f.write(table_row('rate', rates))
        f.write("------------\n")

        # One nuisance per channel: '-' everywhere except that channel's bkg column
        for k, delta_b in enumerate(df['deltaB']):
            cells = ["-"] * (2 * num_bins)
            cells[2 * k + 1] = f"{delta_b:.5f}"
            f.write(f"xs{k + 1:<6}lnN\t" + "\t".join(cells) + "\n")

    print(f'Wrote file: {datacard}')

print('Funtions loaded.')

Funtions loaded.


In [11]:
# Write one fixed-width yield table per signal point.
os.makedirs('yields', exist_ok=True)  # once, not once per signal point

not_found = []

for sample, subs in sigdict.items():
    for subsample in subs:
        yieldfile = f"yields/yields_{sample}_{subsample}.txt"

        # Only signal points for which a table was built can be written
        if sample in df and subsample in df[sample]:
            write_df_to_file(df[sample][subsample], yieldfile)
            print(f'Wrote text file: {yieldfile}')
        else:
            not_found.append(f'{sample}_{subsample}')

# Warn only when something is actually missing
if not_found:
    print('\nWarning: Information for the following samples are not found.')
    print(not_found)

Wrote text file: yields/yields_VLLD_ele_M100.txt
Wrote text file: yields/yields_VLLD_ele_M200.txt
Wrote text file: yields/yields_VLLD_ele_M300.txt
Wrote text file: yields/yields_VLLD_ele_M400.txt
Wrote text file: yields/yields_VLLD_ele_M600.txt
Wrote text file: yields/yields_VLLD_ele_M800.txt
Wrote text file: yields/yields_VLLD_ele_M1000.txt
Wrote text file: yields/yields_VLLD_mu_M100.txt
Wrote text file: yields/yields_VLLD_mu_M200.txt
Wrote text file: yields/yields_VLLD_mu_M300.txt
Wrote text file: yields/yields_VLLD_mu_M400.txt
Wrote text file: yields/yields_VLLD_mu_M600.txt
Wrote text file: yields/yields_VLLD_mu_M800.txt

[]


In [12]:
# Write one datacard per signal point.
os.makedirs('datacards', exist_ok=True)  # once, not once per signal point

not_found = []

for sample, subs in sigdict.items():
    for subsample in subs:
        datacard = f"datacards/datacard_{sample}_{subsample}.txt"

        # Only signal points for which a table was built can be written
        if sample in df and subsample in df[sample]:
            write_datacard(df[sample][subsample], datacard)
        else:
            not_found.append(f'{sample}_{subsample}')

# Warn only when something is actually missing
if not_found:
    print('\nWarning: Information for the following samples are not found.')
    print(not_found)

Wrote file: datacards/datacard_VLLD_ele_M100.txt
Wrote file: datacards/datacard_VLLD_ele_M200.txt
Wrote file: datacards/datacard_VLLD_ele_M300.txt
Wrote file: datacards/datacard_VLLD_ele_M400.txt
Wrote file: datacards/datacard_VLLD_ele_M600.txt
Wrote file: datacards/datacard_VLLD_ele_M800.txt
Wrote file: datacards/datacard_VLLD_ele_M1000.txt
Wrote file: datacards/datacard_VLLD_mu_M100.txt
Wrote file: datacards/datacard_VLLD_mu_M200.txt
Wrote file: datacards/datacard_VLLD_mu_M300.txt
Wrote file: datacards/datacard_VLLD_mu_M400.txt
Wrote file: datacards/datacard_VLLD_mu_M600.txt
Wrote file: datacards/datacard_VLLD_mu_M800.txt

[]
