# Reading into histograms

In [1]:
import os, sys
import json
import numpy as np
import pandas as pd
import ROOT

print('Modules loaded.')

Welcome to JupyROOT 6.26/10
Modules loaded.


In [2]:
# --- Input bookkeeping -------------------------------------------------------
# JSON file read into `filedict`; later cells iterate it as
# sample -> subsample -> lumi.
jsonfile2018 = '../../InputJsons/lumidata_2018.json'
with open(jsonfile2018,'r') as infile:
    filedict = json.load(infile)

# --- Histogram selection -----------------------------------------------------
histname = 'nnscore_qcd_vlld_combined'   # object fetched from every input ROOT file
jobnames = ["hist_2LSS_SE2_Oct03_mm"]    # sub-directories of ../input_hists to read
rebin = 10                               # factor handed to hist.Rebin()

# --- Signal choice and output naming -----------------------------------------
signals = ['VLLD_mu']   # samples treated as signal; everything without 'VLL' is background
tag = 'scaled_mm'       # appears in the output folder and file names

In [3]:
# Signal bookkeeping, keyed as sigdict[sample]['M<mass>'].
# Each row below is (mass, xsec, ngen, scale):
#   - ngen / xsec is used later as the equivalent luminosity of the sample,
#   - scale divides the signal yield when the yield tables are built.
# NOTE(review): units of mass and xsec are not stated in this notebook
# (presumably GeV and pb) - confirm before reuse.
sigdict = {
    flavour: {
        f'M{mass}': {'mass': mass, 'xsec': xsec, 'ngen': ngen, 'scale': scale}
        for mass, xsec, ngen, scale in rows
    }
    for flavour, rows in {
        'VLLD_ele': [
            (100,  16.9,     110871, 1),
            (200,  1.36,     73730,  1),
            (300,  0.291,    24753,  1),
            (400,  0.0907,   24491,  1),
            (600,  0.0149,   24611,  1),
            (800,  0.00347,  23680,  1),
            (1000, 0.000971, 24286,  1),
        ],
        'VLLD_mu': [
            (100, 16.9,    111926, 50),
            (200, 1.36,    73908,  1),
            (300, 0.291,   25022,  1),
            (400, 0.0907,  24299,  1),
            (600, 0.0149,  24890,  1),
            (800, 0.00347, 24763,  1),
        ],
    }.items()
}

# Filled by the histogram-reading cell: bkgdict[sample][subsample] -> yields/errors/integrals.
bkgdict = {}

def set_last_bin_as_overflow(hst):
    """Fold the overflow and underflow bins into the last and first regular bins.

    The overflow content (bin ``GetNbinsX() + 1``) is added to the last bin and
    the underflow content (bin ``0``) is added to the first bin. The bin errors
    are combined in quadrature with the *errors* of the overflow / underflow
    bins.

    The histogram is modified in place and nothing is returned. The overflow and
    underflow bins themselves are left untouched, so calling this twice on the
    same histogram adds their content twice.

    Parameters
    ----------
    hst : histogram-like object
        Must provide ``GetNbinsX``, ``GetBinContent``, ``GetBinError``,
        ``SetBinContent`` and ``SetBinError`` (e.g. a ROOT ``TH1``).
    """
    lastBin = hst.GetNbinsX()

    # (regular edge bin, neighbouring out-of-range bin): overflow first, then underflow.
    for edge_bin, outside_bin in ((lastBin, lastBin + 1), (1, 0)):
        content         = hst.GetBinContent(edge_bin)
        error           = hst.GetBinError(edge_bin)
        outside_content = hst.GetBinContent(outside_bin)
        # Use the uncertainty of the out-of-range bin, not its content, in the quadrature sum.
        outside_error   = hst.GetBinError(outside_bin)

        hst.SetBinContent(edge_bin, content + outside_content)
        hst.SetBinError(edge_bin, (error**2 + outside_error**2)**0.5)

print('Functions loaded.')

Functions loaded.


In [4]:
# Read `histname` for every sample/subsample listed in `filedict` and store the
# per-bin yields, errors and the integral:
#   - samples without 'VLL' in their name go to bkgdict[sample][subsample],
#   - samples listed in `signals` go to sigdict[sample][subsample].
for sample, subs in filedict.items():
    if sample not in bkgdict: bkgdict[sample] = {}

    for subsample, lumi in subs.items():
        # The SingleMuon and EGamma entries are not read here.
        if 'SingleMuon' in sample or 'EGamma' in sample: continue
        if subsample not in bkgdict[sample]: bkgdict[sample][subsample] = {}

        yields = []
        errors = []
        integrals = []

        # NOTE(review): with more than one entry in `jobnames` the bins of each job are
        # appended one after another (not summed bin by bin) - confirm this is intended.
        for job in jobnames:

            input_dir = os.path.join('../input_hists', job)
            filename = f'hst_{sample}_{subsample}.root'
            filepath = os.path.join(input_dir, filename)
            if not os.path.exists(filepath): continue

            tfile = ROOT.TFile(filepath)
            hist = tfile.Get(histname)

            # TFile.Get gives back a null object when the key is missing; skip that file
            # (and close it) instead of failing on the first histogram method call.
            if not hist:
                print(f'Warning: histogram {histname} not found in {filepath}')
                tfile.Close()
                continue

            set_last_bin_as_overflow(hist)

            # For signal, replace the JSON lumi by the equivalent luminosity ngen / xsec.
            if sample in signals:
                lumi = sigdict[sample][subsample]['ngen']/sigdict[sample][subsample]['xsec']

            # 59800 is presumably the target integrated luminosity (same units as `lumi`) - TODO confirm.
            hist.Scale(59800/lumi)

            hist.Rebin(rebin)
            integrals.append(hist.Integral())

            # Regular bins only (1..N); `ibin` avoids shadowing the builtin `bin`.
            for ibin in range(1, hist.GetNbinsX() + 1):
                yields.append(hist.GetBinContent(ibin))
                errors.append(hist.GetBinError(ibin))

            tfile.Close()

        if 'VLL' not in sample:
            bkgdict[sample][subsample]['yields'] = yields
            bkgdict[sample][subsample]['errors'] = errors
            bkgdict[sample][subsample]['integrals'] = integrals

        if sample in signals:
            if sample not in sigdict:            sigdict[sample] = {}
            if subsample not in sigdict[sample]: sigdict[sample][subsample] = {}
            sigdict[sample][subsample]['yields'] = yields
            sigdict[sample][subsample]['errors'] = errors
            sigdict[sample][subsample]['integrals'] = integrals
            # Quick sanity print for one mass point.
            if sample == 'VLLD_mu' and subsample=='M100': print(yields, errors, integrals)
        

[0.0, 0.0, 0.0, 0.0, 0.0, 7.604369163513184, 15.656810760498047, 51.892940521240234, 221.5583953857422, 254.70211791992188, 353.43292236328125, 386.4912414550781, 374.7251892089844, 468.6051025390625, 577.6663208007812, 505.5987548828125, 508.2881164550781, 493.8747863769531, 373.5516052246094, 321.6204528808594] [0.0, 0.0, 0.0, 0.0, 0.0, 7.604369208022463, 11.0787801711085, 19.7482145388476, 41.95643406151715, 44.5624052790188, 52.34432197806308, 54.457588841284014, 53.832447659131596, 59.37956532954974, 67.02220411683332, 62.03849015817213, 62.84738291787642, 62.071569276726024, 54.2528806583707, 50.95392574806199] [4915.2691259384155]


### Extracting background yields

In [5]:
# Sum all background subsamples bin by bin: yields add linearly, errors add in quadrature.
combined_bkg_yield = None
combined_bkg_error = None

# '<sample>_<subsample>' names left out of the total background (defined once, outside the loops).
skip_samples = ['WWZ_WWZJetsTo4L2Nu', 'WJetsNLO_Inclusive', 'WpWp_WpWpJJQCD', 'WGamma_Inclusive']

# Loop through the samples and subsamples in bkgdict
for sample, subs in bkgdict.items():
    if 'VLL' in sample: continue

    for subsample, val in subs.items():

        # Check if 'yields' and 'errors' keys exist in the current subsample
        if 'yields' not in val or 'errors' not in val:
            print(f"Warning: 'yields' or 'errors' not found for sample {sample}, subsample {subsample}.")
            continue

        if sample+'_'+subsample in skip_samples:
            print(f'Skipping background: {sample} {subsample}')
            continue

        # Get the yields and errors for the current subsample
        yields = np.array(val['yields'])
        errors = np.array(val['errors'])

        # Subsamples whose input file was not found have empty lists.
        if yields.size == 0 or errors.size == 0: continue

        # Initialize the combined arrays from the first non-empty subsample
        if combined_bkg_yield is None:
            combined_bkg_yield = np.zeros_like(yields)
            combined_bkg_error = np.zeros_like(errors)

        # Add yields normally
        combined_bkg_yield += yields

        # Add errors in quadrature
        combined_bkg_error = np.sqrt(combined_bkg_error**2 + errors**2)

# Fail with a clear message instead of an AttributeError on None further down.
if combined_bkg_yield is None:
    raise RuntimeError('No background yields were found; check the input histogram files and jobnames.')

# Now, `combined_bkg_yield` contains the total yields binwise
# And `combined_bkg_error` contains the total errors binwise
print('\n'+'-'*100)
print(f"{'bin':<7} {'nBkg':<7} {'nBkgErr'}")
for i in range(combined_bkg_yield.shape[0]):
    print(f'{i+1:<7} {combined_bkg_yield[i]:<7.2f} {combined_bkg_error[i]:.3f}')

Skipping background: WWZ WWZJetsTo4L2Nu
Skipping background: WJetsNLO Inclusive
Skipping background: WpWp WpWpJJQCD
Skipping background: WGamma Inclusive

----------------------------------------------------------------------------------------------------
bin     nBkg    nBkgErr
1       0.00    0.001
2       0.13    0.071
3       0.33    0.110
4       2.81    1.460
5       5.93    2.413
6       7.27    0.761
7       23.24   2.151
8       101.20  33.410
9       128.65  8.768
10      190.22  9.035
11      320.50  35.436
12      488.89  155.726
13      301.45  16.295
14      306.64  12.812
15      340.33  33.818
16      360.66  34.011
17      328.55  11.486
18      365.38  16.793
19      397.56  13.092
20      360.77  8.473


### Extracting data yields (set equal to the background yields for now)

In [6]:
# Placeholder for the observed yields: the total background is used in their place.
# This binds `combined_data` to the same array object as `combined_bkg_yield`
# (no copy is made), so an in-place change to one is visible through the other.
combined_data = combined_bkg_yield

# Preparing dataframe that holds yields

In [7]:
def prepare_df(sigdict, combined_data, combined_bkg_yield, combined_bkg_error, scalesignal=1, signal_names=None):
    """Build one per-bin yield table for every signal mass point.

    Parameters
    ----------
    sigdict : dict
        ``sigdict[sample][subsample]`` must hold ``'yields'`` and ``'errors'``
        (per-bin sequences); ``'scale'`` is optional and defaults to 1.
        Subsamples without ``'yields'``/``'errors'`` are skipped.
    combined_data : sequence
        Observed yield per bin (indexed like ``combined_bkg_yield``).
    combined_bkg_yield, combined_bkg_error : sequence
        Total background yield and its error per bin. The number of bins is
        taken from ``combined_bkg_yield``.
    scalesignal : float, optional
        Accepted for backward compatibility; currently not applied.
    signal_names : iterable of str, optional
        Samples to process. When omitted, the notebook-level ``signals`` list
        is used, as before.

    Returns
    -------
    dict
        ``result[sample][subsample]`` is a DataFrame with columns ``bin``,
        ``signal``, ``nObs``, ``nBkg``, ``bkg_err``, ``S/sqrtB`` and ``deltaB``,
        sorted by decreasing ``S/sqrtB`` and restricted to bins with
        ``deltaB < 1.10``. Samples not selected map to an empty dict.
    """
    if signal_names is None:
        signal_names = signals  # fall back to the notebook-level selection

    columns = ['bin', 'signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB', 'deltaB']
    nbins = len(combined_bkg_yield)  # Set nbins based on the size of combined_bkg_yield

    # Dictionary to store dataframes for each sample and subsample
    df_hierarchy = {}

    for sample, subs in sigdict.items():
        df_hierarchy[sample] = {}  # Initialize a dictionary for each sample
        if sample not in signal_names: continue

        for subsample, val in subs.items():
            # Mass points that were never filled are left out; the writer cells
            # report them as "not found".
            if 'yields' not in val or 'errors' not in val: continue

            # Float dtype so that the 1e-7 placeholder below is not truncated to 0.
            yields = np.array(val['yields'], dtype=float)

            # If yields are zero, set a small value
            yields[yields == 0] = 1e-7

            signal_scale = val.get('scale', 1)

            # Collect plain records and build the DataFrame once (no per-row concat).
            records = []
            for ibin in range(nbins):
                sig  = yields[ibin] if ibin < len(yields) else 0
                bkg  = combined_bkg_yield[ibin]
                dbkg = combined_bkg_error[ibin] if ibin < len(combined_bkg_error) else 0

                # Bins with zero background are kept, with S/sqrtB = 0 and deltaB = 1.
                stob = 0
                relative_dbkg = 0
                if bkg != 0:
                    stob = sig / np.sqrt(bkg)
                    relative_dbkg = dbkg/bkg

                # NOTE(review): 'S/sqrtB' uses the unscaled signal while the 'signal'
                # column is divided by the per-mass-point scale - confirm this is intended.
                records.append({
                    'bin': ibin + 1,
                    'signal': sig / signal_scale,
                    'nObs': combined_data[ibin],
                    'nBkg': bkg,
                    'bkg_err': dbkg,
                    'S/sqrtB': stob,
                    'deltaB': 1 + relative_dbkg,
                })

            df = pd.DataFrame(records, columns=columns)

            # Sort by sensitivity, then keep bins whose relative background error is below 10%.
            df = df.sort_values(by='S/sqrtB', ascending=False).reset_index(drop=True)
            df = df.loc[df['deltaB'] < 1.10].reset_index(drop=True)

            # Store the DataFrame in the dictionary
            df_hierarchy[sample][subsample] = df

    return df_hierarchy

In [8]:
# Build the yield tables for every selected signal mass point.
# `scale` is forwarded as `scalesignal`; prepare_df accepts it but does not apply it.
scale = 0.01
df = prepare_df(
    sigdict,
    combined_data,
    combined_bkg_yield,
    combined_bkg_error,
    scalesignal=scale,
)

# Inspect one mass point.
display(df['VLLD_mu']['M800'])

Unnamed: 0,bin,signal,nObs,nBkg,bkg_err,S/sqrtB,deltaB
0,20,43.413019,360.766683,360.766683,8.473456,0.02285634,1.023487
1,19,11.345303,397.563738,397.563738,13.091553,0.005690006,1.032929
2,18,9.083211,365.382555,365.382555,16.792507,0.004751881,1.045959
3,17,7.130785,328.547888,328.547888,11.48616,0.003934033,1.03496
4,14,5.917874,306.643638,306.643638,12.812433,0.003379471,1.041783
5,15,6.121801,340.326766,340.326766,33.81799,0.003318418,1.099369
6,16,4.463984,360.664078,360.664078,34.010652,0.002350559,1.0943
7,13,1.566084,301.451255,301.451255,16.294924,0.0009020001,1.054055
8,10,0.774828,190.219543,190.219543,9.03546,0.0005617947,1.0475
9,7,1e-05,23.241538,23.241538,2.150899,2.074281e-08,1.092545


In [9]:
# Print totals and the per-row table for one mass point as a quick cross-check.
df_test = df['VLLD_mu']['M800']

total_sig = df_test['signal'].sum()
total_bkg = df_test['nBkg'].sum()
print(f'total sig = {total_sig:.2f}')
print(f'total bkg = {total_bkg:.2f}')

print(f"{'nbin':<7} {'signal':<7} {'nObs':<7} {'nBkg':<7} {'bkgErr':<7} {'S/sqrtB':<7} {'deltaB'}")
for index, row in df_test.iterrows():
    # First field is the 1-based row position in the sorted table, not the histogram bin.
    cells = [f"{index+1}"]
    cells += [f"{row[col]:.2f}" for col in ('signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB', 'deltaB')]
    print("\t".join(cells))

total sig = 89.82
total bkg = 3103.46
nbin    signal  nObs    nBkg    bkgErr  S/sqrtB deltaB
1	43.41	360.77	360.77	8.47	0.02	1.02
2	11.35	397.56	397.56	13.09	0.01	1.03
3	9.08	365.38	365.38	16.79	0.00	1.05
4	7.13	328.55	328.55	11.49	0.00	1.03
5	5.92	306.64	306.64	12.81	0.00	1.04
6	6.12	340.33	340.33	33.82	0.00	1.10
7	4.46	360.66	360.66	34.01	0.00	1.09
8	1.57	301.45	301.45	16.29	0.00	1.05
9	0.77	190.22	190.22	9.04	0.00	1.05
10	0.00	23.24	23.24	2.15	0.00	1.09
11	0.00	128.65	128.65	8.77	0.00	1.07


## Writing into text files

In [10]:
def write_df_to_file(df, filename):
    """Write a yield table to `filename` as fixed-width text.

    Every column is left-aligned in a 10-character field. `bin` is written as
    an integer, `deltaB` with four decimals and all other columns with two
    decimals. An existing file is overwritten.
    """
    two_decimal_columns = ('signal', 'nObs', 'nBkg', 'bkg_err', 'S/sqrtB')
    all_columns = ('bin',) + two_decimal_columns + ('deltaB',)

    with open(filename, 'w') as out:
        # Header row
        out.write(''.join(f"{name:<10}" for name in all_columns) + '\n')

        # One line per table row
        for _, entry in df.iterrows():
            fields = [f"{int(entry['bin']):<10}"]
            fields.extend(f"{float(entry[name]):<10.2f}" for name in two_decimal_columns)
            fields.append(f"{float(entry['deltaB']):<10.4f}")
            out.write(''.join(fields) + '\n')

def write_datacard(df, datacard):
    """Write a counting-experiment datacard with one channel per row of `df`.

    Rows are renumbered bin1..binN in table order. Each channel has one
    `sig` and one `bkg` process; rates come from the `signal` and `nBkg`
    columns, the observation is `nObs` truncated to an integer, and each
    channel gets its own `lnN` nuisance acting on the background with value
    `deltaB`. If `df` has no rows a warning is printed and no file is written.
    """
    df = df.reset_index(drop=True)
    num_bins = len(df)  # Total number of bins

    if num_bins == 0:
        print(f'Warning: Zero bins detected! SKipping file {datacard}')
        return

    rule = "------------\n"
    positions = range(num_bins)

    with open(datacard, 'w') as f:

        def put_row(label, cells):
            # Label padded to 16 characters, then tab-separated cells.
            f.write(f"{label:<16}")
            f.write("\t".join(cells) + "\n")

        # Header information
        f.write(f"imax {num_bins}                          # number of channels\n")
        f.write(f"jmax 1                           # number of backgrounds\n")
        f.write(f"kmax {num_bins}                          # number of nuisance parameters\n")
        f.write(rule)

        # Channels and observations
        put_row('bin', (f"bin{k + 1}" for k in positions))
        put_row('observation', (f"{int(df['nObs'].iat[k])}" for k in positions))
        f.write(rule)

        # Two columns (sig, bkg) per channel
        put_row('bin', (f"bin{k + 1}" for k in positions for _ in range(2)))
        put_row('process', (name for _ in positions for name in ("sig", "bkg")))
        put_row('process', (f"{sign * (k + 1)}" for k in positions for sign in (-1, 1)))
        put_row('rate', (f"{df[col].at[k]:.2f}" for k in positions for col in ('signal', 'nBkg')))
        f.write(rule)

        # One lnN nuisance per channel: "-" everywhere except that channel's background column
        for i in positions:
            cells = ["-"] * (2 * num_bins)
            cells[2 * i + 1] = f"{df['deltaB'][i]:.5f}"
            f.write(f"xs{i + 1:<6}lnN\t" + "\t".join(cells) + "\n")

    print(f'Wrote file: {datacard}')

print('Funtions loaded.')

Funtions loaded.


In [11]:
#write_datacard(df['VLLD_mu']['M800'], f'test_scale{scale}.txt')

Wrote file: test_scale0.01.txt


In [12]:
# Write one fixed-width yield table per signal mass point that has a prepared DataFrame.
not_found = []

# The output folder depends only on `tag`, so create it once up front.
outfolder = f"yields/{tag}"
os.makedirs(outfolder, exist_ok=True)

for sample, subs in sigdict.items():
    for subsample in subs:
        yieldfile = f"{outfolder}/yields_{sample}_{subsample}_{tag}.txt"

        # Check if the DataFrame for the current sample and subsample exists
        if sample in df and subsample in df[sample]:
            write_df_to_file(df[sample][subsample], yieldfile)
            print(f'Wrote text file: {yieldfile}')
        else: not_found.append(f'{sample}_{subsample}')

# Only warn when something is actually missing.
if not_found:
    print('\nWarning: Information for the following samples are not found.')
    print(not_found)

Wrote text file: yields/scaled_mm/yields_VLLD_mu_M100_scaled_mm.txt
Wrote text file: yields/scaled_mm/yields_VLLD_mu_M200_scaled_mm.txt
Wrote text file: yields/scaled_mm/yields_VLLD_mu_M300_scaled_mm.txt
Wrote text file: yields/scaled_mm/yields_VLLD_mu_M400_scaled_mm.txt
Wrote text file: yields/scaled_mm/yields_VLLD_mu_M600_scaled_mm.txt
Wrote text file: yields/scaled_mm/yields_VLLD_mu_M800_scaled_mm.txt

['VLLD_ele_M100', 'VLLD_ele_M200', 'VLLD_ele_M300', 'VLLD_ele_M400', 'VLLD_ele_M600', 'VLLD_ele_M800', 'VLLD_ele_M1000']


In [13]:
# Write one datacard per signal mass point that has a prepared DataFrame.
not_found = []

# The output folder depends only on `tag`, so create it once up front.
outfolder = f"datacards/{tag}"
os.makedirs(outfolder, exist_ok=True)

for sample, subs in sigdict.items():
    for subsample in subs:
        datacard = f"{outfolder}/datacard_{sample}_{subsample}_{tag}.txt"

        # Check if the DataFrame for the current sample and subsample exists
        if sample in df and subsample in df[sample]:
            write_datacard(df[sample][subsample], datacard)
        else: not_found.append(f'{sample}_{subsample}')

# Only warn when something is actually missing.
if not_found:
    print('\nWarning: Information for the following samples are not found.')
    print(not_found)


['VLLD_ele_M100', 'VLLD_ele_M200', 'VLLD_ele_M300', 'VLLD_ele_M400', 'VLLD_ele_M600', 'VLLD_ele_M800', 'VLLD_ele_M1000']
