# Reading histograms into yield dictionaries

In [1]:
import os, sys
import json
import numpy as np
import pandas as pd
import ROOT

print('Modules loaded.')

Welcome to JupyROOT 6.26/10
Modules loaded.


In [2]:
# --- Inputs ------------------------------------------------------------------
# JSON file mapping sample -> {subsample: lumi}; the lumi value is later used
# as the denominator of the histogram scale factor.
jsonfile2018 = '../../InputJsons/lumidata_2018.json'
with open(jsonfile2018, 'r') as infile:
    filedict = json.load(infile)

# --- Histogram selection -----------------------------------------------------
jobnames = ["hist_2LSS_SE2_Oct03_mm"]    # sub-directories of ../input_hists to read
histname = 'nnscore_qcd_vlld_combined'   # key of the histogram inside each ROOT file
rebin = 5                                # number of adjacent bins merged by hist.Rebin

# Signal sample names (not referenced by the cells visible in this notebook).
signals = ['VLLD_ele', 'VLLD_mu']

In [3]:
# Per-mass-point metadata for the signal samples, keyed as
# sigdict[sample][subsample] -> {'mass', 'xsec', 'ngen'}.
# NOTE(review): presumably mass is in GeV and xsec in pb -- confirm with the
# sample bookkeeping. The histogram-reading cell later adds 'yields', 'errors'
# and 'integrals' to these entries.
sigdict = {
    'VLLD_ele': {
        'M100':  dict(mass=100,  xsec=16.9,     ngen=110871),
        'M200':  dict(mass=200,  xsec=1.36,     ngen=73730),
        'M300':  dict(mass=300,  xsec=0.291,    ngen=24753),
        'M400':  dict(mass=400,  xsec=0.0907,   ngen=24491),
        'M600':  dict(mass=600,  xsec=0.0149,   ngen=24611),
        'M800':  dict(mass=800,  xsec=0.00347,  ngen=23680),
        'M1000': dict(mass=1000, xsec=0.000971, ngen=24286),
    },
    'VLLD_mu': {
        'M100': dict(mass=100, xsec=16.9,    ngen=111926),
        'M200': dict(mass=200, xsec=1.36,    ngen=73908),
        'M300': dict(mass=300, xsec=0.291,   ngen=25022),
        'M400': dict(mass=400, xsec=0.0907,  ngen=24299),
        'M600': dict(mass=600, xsec=0.0149,  ngen=24890),
        'M800': dict(mass=800, xsec=0.00347, ngen=24763),
    },
}

# Filled by the histogram-reading cell: bkgdict[sample][subsample] -> yields/errors/integrals.
bkgdict = dict()

def set_last_bin_as_overflow(hst):
    """Fold the overflow and underflow bins into the last and first regular bins.

    The histogram is modified in place. For each edge, the out-of-range bin's
    content is added to the neighbouring regular bin and the two bin errors are
    combined in quadrature.

    Parameters
    ----------
    hst : histogram object
        Must provide GetNbinsX, GetBinContent, GetBinError, SetBinContent and
        SetBinError (e.g. a ROOT TH1). Bin 0 is treated as the underflow bin
        and bin GetNbinsX() + 1 as the overflow bin.

    Notes
    -----
    The out-of-range bins themselves are left untouched, so their content is
    still present in bins 0 and N+1 after the call.
    """
    def _fold(regular_bin, outside_bin):
        # Contents add linearly; uncertainties add in quadrature. The
        # uncertainty of the out-of-range bin must come from GetBinError, not
        # from its content: for weighted histograms the two differ.
        merged_content = hst.GetBinContent(regular_bin) + hst.GetBinContent(outside_bin)
        merged_error = (hst.GetBinError(regular_bin)**2 + hst.GetBinError(outside_bin)**2)**0.5
        hst.SetBinContent(regular_bin, merged_content)
        hst.SetBinError(regular_bin, merged_error)

    lastBin = hst.GetNbinsX()

    # Overflow first, then underflow (same order as before; this matters only
    # for a single-bin histogram where bin 1 is also the last bin).
    _fold(lastBin, lastBin + 1)
    _fold(1, 0)

print('Functions loaded.')

Functions loaded.


In [4]:
# Numerator of the per-subsample scale factor applied to every histogram.
# NOTE(review): presumably the 2018 target integrated luminosity in pb^-1 -- confirm.
TARGET_LUMI = 59800

for sample, subs in filedict.items():
    if sample not in bkgdict: bkgdict[sample] = {}

    # These samples are never read (NOTE(review): presumably the collision-data
    # streams -- confirm). The decision depends only on the sample name, so it
    # is taken once per sample instead of once per subsample.
    if 'SingleMuon' in sample or 'EGamma' in sample: continue

    is_signal = 'VLL' in sample

    for subsample, lumi in subs.items():
        if subsample not in bkgdict[sample]: bkgdict[sample][subsample] = {}

        # Bin contents / bin errors / integrals. They are appended job after
        # job, so with more than one entry in `jobnames` the lists hold the
        # bins of every job back to back.
        yields = []
        errors = []
        integrals = []

        # Step 1: open the histogram and read the yield and error of each bin.
        for job in jobnames:

            filepath = os.path.join('../input_hists', job, f'hst_{sample}_{subsample}.root')
            if not os.path.exists(filepath): continue

            tfile = ROOT.TFile(filepath)
            if tfile.IsZombie():
                # The file exists but could not be opened as a ROOT file.
                print(f'Warning: could not open {filepath}; skipping.')
                continue

            try:
                hist = tfile.Get(histname)
                if not hist:
                    # Get() yields a null object when the key is absent.
                    print(f"Warning: histogram '{histname}' not found in {filepath}; skipping.")
                    continue

                set_last_bin_as_overflow(hist)
                hist.Scale(TARGET_LUMI/lumi)

                hist.Rebin(rebin)
                integrals.append(hist.Integral())

                # Regular bins only (1..nbins); `ibin` avoids shadowing the builtin `bin`.
                for ibin in range(1, hist.GetNbinsX() + 1):
                    yields.append(hist.GetBinContent(ibin))
                    errors.append(hist.GetBinError(ibin))
            finally:
                # Always release the file, also when a histogram is skipped or an error is raised.
                tfile.Close()

        # Step 2: store the result with the signal or the background bookkeeping.
        if is_signal:
            if sample not in sigdict:            sigdict[sample] = {}
            if subsample not in sigdict[sample]: sigdict[sample][subsample] = {}
            sigdict[sample][subsample]['yields'] = yields
            sigdict[sample][subsample]['errors'] = errors
            sigdict[sample][subsample]['integrals'] = integrals
            if sample == 'VLLD_mu' and subsample=='M100': print(yields, errors, integrals)
        else:
            bkgdict[sample][subsample]['yields'] = yields
            bkgdict[sample][subsample]['errors'] = errors
            bkgdict[sample][subsample]['integrals'] = integrals
        

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.604369163513184, 7.5355706214904785, 8.121240615844727, 16.020751953125, 35.8721923828125, 63.90382766723633, 157.65457153320312, 106.15678405761719, 148.54534912109375, 140.87515258789062, 212.55780029296875, 195.47897338867188, 191.01226806640625, 187.77586364746094, 186.9493408203125, 251.87644958496094, 216.72866821289062, 256.46636962890625, 321.1999816894531, 240.8889923095703, 264.70977783203125, 215.229736328125, 293.0583801269531, 262.9051513671875, 230.96966552734375, 162.87310791015625, 210.67849731445312, 216.32095336914062, 105.29951477050781] [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 7.604369479786646, 7.535570769262841, 8.121240793996508, 11.38460142281825, 16.136382983522207, 22.604669104336637, 35.34644847313183, 28.47979154129845, 34.274036528441194, 33.32769972956097, 40.363258911018434, 38.59216281830593, 38.422313352821575, 38.471462658822674, 37.654736607709715, 43.41636140919447, 40.508673055

### Extracting background yields

In [5]:
# Bin-wise sum of all background yields, and the bin-wise quadrature sum of
# their errors. Both stay None until the first non-empty subsample is seen.
combined_bkg_yield = None
combined_bkg_error = None

for sample, subs in bkgdict.items():
    if 'VLL' in sample: continue   # signal samples are not part of the background sum

    for subsample, val in subs.items():

        if 'yields' not in val or 'errors' not in val:
            print(f"Warning: 'yields' or 'errors' not found for sample {sample}, subsample {subsample}.")
            continue

        yields = np.array(val['yields'], dtype=float)
        errors = np.array(val['errors'], dtype=float)

        # Subsamples for which no histogram was read carry empty lists.
        if yields.size == 0 or errors.size == 0: continue

        # The first non-empty subsample fixes the number of bins.
        if combined_bkg_yield is None:
            combined_bkg_yield = np.zeros_like(yields)
            combined_bkg_error = np.zeros_like(errors)

        combined_bkg_yield += yields                                              # linear sum
        combined_bkg_error = np.sqrt(combined_bkg_error**2 + errors**2)           # quadrature sum

# Without this guard the table below fails with an opaque AttributeError on None.
if combined_bkg_yield is None:
    raise RuntimeError('No background yields were found in bkgdict; '
                       'check that the input histogram files exist and contain the requested histogram.')

print('\n'+'-'*100)
print('bin\t\tnBkg\tnBkgErr')
for i in range(combined_bkg_yield.shape[0]):
    print(f'bin {i+1} \t {combined_bkg_yield[i]:>10.2f} \t {combined_bkg_error[i]:.2f}')


----------------------------------------------------------------------------------------------------
bin		nBkg	nBkgErr
bin 1 	       0.00 	 0.00
bin 2 	       0.00 	 0.00
bin 3 	       0.06 	 0.04
bin 4 	       0.08 	 0.06
bin 5 	       0.14 	 0.07
bin 6 	       0.18 	 0.08
bin 7 	       1.20 	 0.93
bin 8 	       1.64 	 1.13
bin 9 	       1.91 	 1.28
bin 10 	       4.27 	 2.21
bin 11 	       2.16 	 0.27
bin 12 	       4.87 	 0.70
bin 13 	       7.97 	 0.72
bin 14 	      15.07 	 2.18
bin 15 	      60.92 	 32.74
bin 16 	      38.67 	 6.47
bin 17 	      46.88 	 3.77
bin 18 	      69.32 	 5.21
bin 19 	      80.38 	 5.12
bin 20 	      99.56 	 5.41
bin 21 	     133.80 	 12.09
bin 22 	     163.23 	 31.87
bin 23 	     336.85 	 155.47
bin 24 	     137.37 	 6.86
bin 25 	     140.61 	 11.25
bin 26 	     145.45 	 8.71
bin 27 	     145.75 	 7.34
bin 28 	     141.75 	 8.15
bin 29 	     176.36 	 31.94
bin 30 	     140.12 	 7.89
bin 31 	     186.64 	 32.48
bin 32 	     150.05 	 6.74
bin 33 	     149.

### Extracting data yields (Setting it to background for now)

In [6]:
# Placeholder: the "observed" yields are set to the summed background yields.
# This binds a second name to the same array object (no copy is made), so an
# in-place change to one is visible through the other.
combined_data = combined_bkg_yield

# Preparing dataframe that holds yields

In [7]:
def prepare_df(sigdict, combined_data, combined_bkg_yield, combined_bkg_error,
               only_sample='VLLD_mu', only_subsample='M100'):
    """Build one per-bin yield table for each selected signal subsample.

    Parameters
    ----------
    sigdict : dict
        sigdict[sample][subsample] is a dict; entries with a 'yields' list are
        turned into tables, entries without one are reported and skipped.
    combined_data : sequence of float
        Observed yield per bin (column 'nObs'); indexed 0..nbins-1.
    combined_bkg_yield : sequence of float
        Total background yield per bin; its length defines the number of bins.
    combined_bkg_error : sequence of float
        Total background error per bin; bins beyond its length get error 0.
    only_sample, only_subsample : str or None, optional
        Restrict the output to this sample / subsample. The defaults reproduce
        the previous hard-coded selection ('VLLD_mu', 'M100'); pass None to
        process every sample and/or subsample.

    Returns
    -------
    dict
        result[sample][subsample] -> pandas.DataFrame with columns
        'bin', 'signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB', 'deltaB',
        sorted by 'S/sqrtB' in descending order with a fresh 0..n-1 index.
        Every sample of `sigdict` has a key; the value is an empty dict when
        nothing was selected for it.
    """
    columns = ['bin', 'signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB', 'deltaB']
    nbins = len(combined_bkg_yield)  # Set nbins based on the size of combined_bkg_yield

    df_hierarchy = {}

    for sample, subs in sigdict.items():
        df_hierarchy[sample] = {}  # every sample gets a key, even if nothing is selected
        if only_sample is not None and sample != only_sample: continue

        for subsample, val in subs.items():
            if only_subsample is not None and subsample != only_subsample: continue

            if 'yields' not in val:
                print(f"Warning: 'yields' not found for sample {sample}, subsample {subsample}; skipped.")
                continue

            # Work on a float copy so that the caller's list is not modified and
            # the small floor below is not truncated if the input holds integers.
            yields = np.array(val['yields'], dtype=float)

            # If yields are zero, set a small value
            yields[yields == 0] = 1e-7

            # Collect plain records and build the DataFrame once; growing a
            # frame with pd.concat per row is quadratic in the number of bins.
            records = []
            for ibin in range(nbins):
                sig  = yields[ibin] if ibin < len(yields) else 0

                bkg  = combined_bkg_yield[ibin]
                dbkg = combined_bkg_error[ibin] if ibin < len(combined_bkg_error) else 0

                # Bins with zero background are kept, with S/sqrtB = 0 and deltaB = 1.
                stob = 0
                relative_dbkg = 0
                if bkg != 0:
                    stob = sig / np.sqrt(bkg)
                    relative_dbkg = dbkg/bkg

                records.append({
                    'bin': ibin + 1,
                    'signal': sig,
                    'nObs': combined_data[ibin],
                    'nBkg': bkg,
                    'bkg_err': dbkg,
                    'S/sqrtB': stob,
                    'deltaB': 1 + relative_dbkg,
                })

            # Passing `columns` keeps the frame well-formed (and sortable) even with zero bins.
            df = pd.DataFrame(records, columns=columns)
            df = df.sort_values(by='S/sqrtB', ascending=False).reset_index(drop=True)

            df_hierarchy[sample][subsample] = df

    return df_hierarchy

In [8]:
# NOTE: despite its name, `df` is the nested dict returned by prepare_df
# (sample -> subsample -> DataFrame), not a single DataFrame.
df = prepare_df(sigdict, combined_data, combined_bkg_yield, combined_bkg_error)
display(df['VLLD_mu']['M100'])

Unnamed: 0,bin,signal,nObs,nBkg,bkg_err,S/sqrtB,deltaB
0,30,321.2,140.12239,140.12239,7.885643,27.1345,1.056277
1,34,293.0584,153.102022,153.102022,6.219892,23.68447,1.040626
2,32,264.7098,150.048256,150.048256,6.743564,21.60999,1.044943
3,27,251.8764,145.752803,145.752803,7.335051,20.86311,1.050325
4,35,262.9052,163.974478,163.974478,11.135817,20.53104,1.067912
5,29,256.4664,176.36253,176.36253,31.942395,19.312,1.181118
6,18,157.6546,69.323124,69.323124,5.208406,18.9351,1.075132
7,28,216.7287,141.750588,141.750588,8.150636,18.20346,1.0575
8,36,230.9697,164.268796,164.268796,6.516016,18.02094,1.039667
9,31,240.889,186.642256,186.642256,32.482846,17.63242,1.174038


In [9]:
# Tab-separated dump of the VLLD_mu / M100 table: the row index followed by
# each listed column rendered with two decimals.
columns_2f = ('signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB', 'deltaB')
for index, row in df['VLLD_mu']['M100'].iterrows():
    fields = [f"{index}"] + [f"{row[col]:.2f}" for col in columns_2f]
    print("\t".join(fields))

0	321.20	140.12	140.12	7.89	27.13	1.06
1	293.06	153.10	153.10	6.22	23.68	1.04
2	264.71	150.05	150.05	6.74	21.61	1.04
3	251.88	145.75	145.75	7.34	20.86	1.05
4	262.91	163.97	163.97	11.14	20.53	1.07
5	256.47	176.36	176.36	31.94	19.31	1.18
6	157.65	69.32	69.32	5.21	18.94	1.08
7	216.73	141.75	141.75	8.15	18.20	1.06
8	230.97	164.27	164.27	6.52	18.02	1.04
9	240.89	186.64	186.64	32.48	17.63	1.17
10	215.23	149.40	149.40	5.96	17.61	1.04
11	212.56	163.23	163.23	31.87	16.64	1.20
12	191.01	137.37	137.37	6.86	16.30	1.05
13	216.32	185.20	185.20	4.77	15.90	1.03
14	187.78	140.61	140.61	11.25	15.84	1.08
15	210.68	182.86	182.86	5.97	15.58	1.03
16	186.95	145.45	145.45	8.71	15.50	1.06
17	148.55	99.56	99.56	5.41	14.89	1.05
18	162.87	173.05	173.05	6.15	12.38	1.04
19	140.88	133.80	133.80	12.09	12.18	1.09
20	106.16	80.38	80.38	5.12	11.84	1.06
21	195.48	336.85	336.85	155.47	10.65	1.46
22	63.90	46.88	46.88	3.77	9.33	1.08
23	105.30	154.28	154.28	4.61	8.48	1.03
24	35.87	38.67	38.67	6.47	5.77	1.17
25	7.60	4.87	4.87

## Writing into text files

In [10]:
def write_df_to_file(df, filename):
    """Write a yield table to `filename` as fixed-width text.

    The first line is a header with the column names, each left-aligned in a
    10-character field. Every DataFrame row follows on its own line: 'bin' as
    an integer, 'deltaB' with four decimals and all other columns with two
    decimals, each left-aligned in a 10-character field.
    """
    # (column name, conversion applied to the cell, format spec of the field)
    layout = (
        ('bin',     int,   '<10'),
        ('signal',  float, '<10.2f'),
        ('nObs',    float, '<10.2f'),
        ('nBkg',    float, '<10.2f'),
        ('bkg_err', float, '<10.2f'),
        ('S/sqrtB', float, '<10.2f'),
        ('deltaB',  float, '<10.4f'),
    )
    header = ''.join(f"{name:<10}" for name, _, _ in layout)

    with open(filename, 'w') as f:
        f.write(header + '\n')
        for _, row in df.iterrows():
            cells = [format(convert(row[name]), spec) for name, convert, spec in layout]
            f.write(''.join(cells) + '\n')

def _format_rate(value):
    """Format a rate with two decimals, unless that would round a non-zero value to zero.

    Very small non-zero rates (such as the 1e-7 floor used for empty signal
    bins) are written in scientific notation so they stay non-zero in the file.
    """
    text = f"{value:.2f}"
    if value != 0 and float(text) == 0:
        text = f"{value:.3e}"
    return text


def write_datacard(df, datacard):
    """Write a counting-experiment datacard with one channel per row of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per channel with columns 'nObs', 'signal', 'nBkg' and 'deltaB'.
        Rows are used in their current order and read by position, so the
        frame's index labels do not matter.
    datacard : str
        Path of the text file to write (overwritten if it exists).

    Layout
    ------
    Channels are named bin1..binN by row position. Each channel has two
    processes, 'sig' and 'bkg', and one lnN nuisance (xs1..xsN) that acts only
    on that channel's background with value `deltaB`.
    """
    num_bins = len(df)  # Total number of bins
    bin_names = [f"bin{i + 1}" for i in range(num_bins)]

    # Positional access: independent of the DataFrame's index labels.
    observed   = df['nObs'].to_numpy()
    signal     = df['signal'].to_numpy()
    background = df['nBkg'].to_numpy()
    delta_b    = df['deltaB'].to_numpy()

    with open(datacard, 'w') as f:
        #header information
        f.write(f"imax {num_bins}                          # number of channels\n")
        f.write(f"jmax 1                           # number of backgrounds\n")
        f.write(f"kmax {num_bins}                          # number of nuisance parameters\n")
        f.write("-" * 12 + "\n")

        # Observation block: one column per channel.
        f.write(f"{'bin':<16}" + "\t".join(bin_names) + "\n")
        f.write(f"{'observation':<16}" + "\t".join(f"{value:.2f}" for value in observed) + "\n")
        f.write("-" * 12 + "\n")

        # Expectation block: one column per (channel, process) pair, so every
        # channel name appears once for 'sig' and once for 'bkg'. This keeps the
        # 'bin' line the same length as the 'process' and 'rate' lines.
        per_process_bins = [name for name in bin_names for _ in range(2)]
        f.write(f"{'bin':<16}" + "\t".join(per_process_bins) + "\n")

        #Process section
        f.write(f"{'process':<16}" + "\t" + "\t".join(["sig", "bkg"] * num_bins) + "\n")
        process_values = []
        for i in range(1, num_bins + 1):
            process_values.append(f"-{i}")  # Signal
            process_values.append(f"{i}")   # Background
        f.write(f"{'process':<16}" + "\t".join(process_values) + "\n")

        #Rate section
        rate_values = []
        for i in range(num_bins):
            rate_values.append(_format_rate(signal[i]))      # Signal rate
            rate_values.append(_format_rate(background[i]))  # Background rate
        f.write(f"{'rate':<16}" + "\t".join(rate_values) + "\n")
        f.write("-" * 12 + "\n")

        # Nuisances: xs<i> affects only the background column of channel i.
        for i in range(num_bins):
            values = ["-"] * (2 * num_bins)
            values[2 * i + 1] = f"{delta_b[i]:.5f}"
            f.write(f"xs{i + 1}     lnN\t" + "\t".join(values) + "\n")

print('Funtions loaded.')

Funtions loaded.


In [11]:
# Write one fixed-width yield table per signal point for which a DataFrame exists.
not_found = []

# The output directory does not depend on the loop, so it is created once.
os.makedirs('yields', exist_ok=True)

for sample, subs in sigdict.items():
    for subsample in subs:
        yieldfile = f"yields/yields_{sample}_{subsample}.txt"

        # Check if the DataFrame for the current sample and subsample exists
        if sample in df and subsample in df[sample]:
            write_df_to_file(df[sample][subsample], yieldfile)
            print(f'Wrote text file: {yieldfile}')
        else:
            not_found.append(f'{sample}_{subsample}')

# Only warn when something is actually missing.
if not_found:
    print('\nWarning: Information for the following samples are not found.')
    print(not_found)

Wrote text file: yields/yields_VLLD_mu_M100.txt

['VLLD_ele_M100', 'VLLD_ele_M200', 'VLLD_ele_M300', 'VLLD_ele_M400', 'VLLD_ele_M600', 'VLLD_ele_M800', 'VLLD_ele_M1000', 'VLLD_mu_M200', 'VLLD_mu_M300', 'VLLD_mu_M400', 'VLLD_mu_M600', 'VLLD_mu_M800', 'VLLS_ele_M100', 'VLLS_ele_M125', 'VLLS_ele_M150', 'VLLS_ele_M200', 'VLLS_ele_M250', 'VLLS_ele_M300', 'VLLS_ele_M350', 'VLLS_ele_M400', 'VLLS_ele_M450', 'VLLS_ele_M500', 'VLLS_ele_M750', 'VLLS_ele_M1000', 'VLLS_mu_M100', 'VLLS_mu_M125', 'VLLS_mu_M150', 'VLLS_mu_M200', 'VLLS_mu_M250', 'VLLS_mu_M300', 'VLLS_mu_M400', 'VLLS_mu_M450', 'VLLS_mu_M500', 'VLLS_mu_M750', 'VLLS_mu_M1000', 'VLLS_tau_M100', 'VLLS_tau_M125', 'VLLS_tau_M150', 'VLLS_tau_M200', 'VLLS_tau_M250', 'VLLS_tau_M300', 'VLLS_tau_M350', 'VLLS_tau_M400']


In [12]:
# Write one datacard per signal point for which a DataFrame exists.
not_found = []

# The output directory does not depend on the loop, so it is created once.
os.makedirs('datacards', exist_ok=True)

for sample, subs in sigdict.items():
    for subsample in subs:
        datacard = f"datacards/datacard_{sample}_{subsample}.txt"

        # Check if the DataFrame for the current sample and subsample exists
        if sample in df and subsample in df[sample]:
            write_datacard(df[sample][subsample], datacard)
            print(f"Wrote datacard: {datacard}")
        else:
            not_found.append(f'{sample}_{subsample}')

# Only warn when something is actually missing.
if not_found:
    print('\nWarning: Information for the following samples are not found.')
    print(not_found)

Wrote datacard: datacards/datacard_VLLD_mu_M100.txt

['VLLD_ele_M100', 'VLLD_ele_M200', 'VLLD_ele_M300', 'VLLD_ele_M400', 'VLLD_ele_M600', 'VLLD_ele_M800', 'VLLD_ele_M1000', 'VLLD_mu_M200', 'VLLD_mu_M300', 'VLLD_mu_M400', 'VLLD_mu_M600', 'VLLD_mu_M800', 'VLLS_ele_M100', 'VLLS_ele_M125', 'VLLS_ele_M150', 'VLLS_ele_M200', 'VLLS_ele_M250', 'VLLS_ele_M300', 'VLLS_ele_M350', 'VLLS_ele_M400', 'VLLS_ele_M450', 'VLLS_ele_M500', 'VLLS_ele_M750', 'VLLS_ele_M1000', 'VLLS_mu_M100', 'VLLS_mu_M125', 'VLLS_mu_M150', 'VLLS_mu_M200', 'VLLS_mu_M250', 'VLLS_mu_M300', 'VLLS_mu_M400', 'VLLS_mu_M450', 'VLLS_mu_M500', 'VLLS_mu_M750', 'VLLS_mu_M1000', 'VLLS_tau_M100', 'VLLS_tau_M125', 'VLLS_tau_M150', 'VLLS_tau_M200', 'VLLS_tau_M250', 'VLLS_tau_M300', 'VLLS_tau_M350', 'VLLS_tau_M400']
