# Preparing datacards for CMS-Combine

This notebook reads text files containing yield information in the following format and turns them into pandas dataframes.
```
bin >> sig >> obs >> exp >> experr >> S/sqrtB >> dbkg
```


Each text file contains one model, for one specific signal region in one specific campaign. The dataframes from all signal regions and campaigns can be combined.

### Reading text files into a dictionary

In [1]:
import os, sys
import json
import numpy as np
import pandas as pd
import ROOT

print('Modules loaded.')

Welcome to JupyROOT 6.26/10
Modules loaded.


In [2]:
# Global settings shared by the cells of this notebook.

# Signal model to probe. return_dict() keeps the sigdict entries whose name
# occurs in this string.
signame = "VLLD_mu"

# Jobs to join: each name is a sub-directory of yield files that gets read and
# concatenated. The commented-out lines select a single job instead.
#jobs = ['yields_2018UL_SE1_mm']
#jobs = ['yields_2018UL_SE1_me']
#jobs = ['yields_2018UL_SE1_em']
jobs = ['yields_2018UL_SE1_mm', 'yields_2018UL_SE1_me', 'yields_2018UL_SE1_em']

# Name of the output folder that is created under datacards/.
outputname = f'datacards_{signame}_2018UL_SE1_combined'

# Signal samples, keyed by model name and then by mass-point label.
# The model name and the mass-point label are used to build the yield-file and
# datacard file names.
# NOTE(review): none of the visible cells read the 'mass', 'xsec', 'ngen' or
# 'scale' values -- only the dictionary keys are used. Units are presumably GeV
# for 'mass' and pb for 'xsec'; TODO confirm.
# NOTE(review): VLLD_mu M100 is the only entry with 'scale' != 1, and VLLD_mu
# has no M1000 entry while VLLD_ele does -- confirm both are intentional.
sigdict = {
    'VLLD_ele': {
        'M100': {'mass': 100, 'xsec': 16.9,       'ngen': 110871, 'scale':1},
        'M200': {'mass': 200, 'xsec': 1.36,       'ngen': 73730 , 'scale':1},
        'M300': {'mass': 300, 'xsec': 0.291,      'ngen': 24753 , 'scale':1},
        'M400': {'mass': 400, 'xsec': 0.0907,     'ngen': 24491 , 'scale':1},
        'M600': {'mass': 600, 'xsec': 0.0149,     'ngen': 24611 , 'scale':1},
        'M800': {'mass': 800, 'xsec': 0.00347,    'ngen': 23680 , 'scale':1},
        'M1000': {'mass': 1000, 'xsec': 0.000971, 'ngen': 24286 , 'scale':1}
    },
    'VLLD_mu': {
        'M100': {'mass': 100, 'xsec': 16.9,    'ngen': 111926, 'scale':50},
        'M200': {'mass': 200, 'xsec': 1.36,    'ngen': 73908,  'scale':1},
        'M300': {'mass': 300, 'xsec': 0.291,   'ngen': 25022,  'scale':1},
        'M400': {'mass': 400, 'xsec': 0.0907,  'ngen': 24299 , 'scale':1},
        'M600': {'mass': 600, 'xsec': 0.0149,  'ngen': 24890,  'scale':1},
        'M800': {'mass': 800, 'xsec': 0.00347, 'ngen': 24763,  'scale':1}
    }
}

print('Global settings loaded.')

Global settings loaded.


In [3]:
def return_dict(jobname, signame):
    """Load the yield tables of one job into nested dictionaries.

    For every entry of the module-level ``sigdict`` whose name occurs in
    ``signame``, the file
    ``../StackMaker/signalyields/<jobname>/yield_<sample>_<subsample>.txt``
    is parsed as whitespace-separated columns named
    bin, sig, obs, exp, experr, S/sqrtB, dbkg.

    Parameters
    ----------
    jobname : str
        Sub-directory (below the base input directory) holding the yield files.
    signame : str or container of str
        Signal selection; a sample is kept when ``sample in signame`` is true.

    Returns
    -------
    dict
        ``{sample: {subsample: {column: numpy array}}}``. Every selected sample
        gets an entry, possibly empty. A missing file prints a warning and a
        file that fails to load prints an error; neither raises.
    """
    column_names = ['bin', 'sig', 'obs', 'exp', 'experr', 'S/sqrtB', 'dbkg']
    indir = os.path.join('../StackMaker/signalyields/', jobname)
    df = {}

    for sample, subdict in sigdict.items():
        if sample in signame:
            per_sample = df.setdefault(sample, {})

            for subsample in subdict:
                filepath = os.path.join(indir, f'yield_{sample}_{subsample}.txt')

                if os.path.exists(filepath):
                    try:
                        # Whitespace-separated table without a header row.
                        temp_df = pd.read_csv(filepath, sep=r'\s+', names=column_names)

                        # Start every column from an empty (float) array so the
                        # appended values end up with the same dtype as before.
                        arrays = per_sample.setdefault(
                            subsample,
                            {name: np.array([]) for name in column_names},
                        )
                        for col in temp_df.columns:
                            arrays[col] = np.append(arrays[col], temp_df[col].values)

                    except Exception as e:
                        print(f'\033[31mError loading file {filepath}: {e}\033[0m')
                else:
                    print(f'\033[33mWarning: File not found: {filepath}\033[0m')

    return df


In [4]:
# Merge the yields of all requested jobs into a single nested dictionary:
# datadict[sample][subsample][column] -> numpy array, concatenated job by job.
datadict = {}

for jobname in jobs:
    print(f'Reading job: {jobname}')

    dict_job = return_dict(jobname, signame)

    for sample, subdict in dict_job.items():
        merged_sample = datadict.setdefault(sample, {})

        for subsample, columns in subdict.items():
            # First time this mass point is seen: start from empty arrays.
            merged_columns = merged_sample.setdefault(
                subsample,
                {
                    name: np.array([])
                    for name in ('bin', 'sig', 'obs', 'exp', 'experr', 'S/sqrtB', 'dbkg')
                },
            )

            for col in columns:
                merged_columns[col] = np.append(merged_columns[col], columns[col])

print('Data collection complete!')

Reading job: yields_2018UL_SE1_mm
Reading job: yields_2018UL_SE1_me
Reading job: yields_2018UL_SE1_em
Data collection complete!


## Preparing datacard from the dictionary

In [5]:
def write_datacard(df, datacard):
    df = df.reset_index(drop=True)
    num_bins = len(df)  # Total number of bins
    
    if num_bins == 0:
        print(f'Warning: Zero bins detected! Skipping file {datacard}')
        return
    
    with open(datacard, 'w') as f:
        # Header information
        f.write(f"imax {num_bins}                          # number of channels\n")
        f.write(f"jmax 1                           # number of backgrounds\n")
        f.write(f"kmax {num_bins}                          # number of nuisance parameters\n")
        f.write("------------\n")
        
        # Bin section
        f.write(f"{'bin':<16}")
        line = ""
        for i in range(num_bins): line += f"bin{i + 1}\t"
        line = line[:-1]
        f.write(line + "\n")
        
        # Observation section
        f.write(f"{'observation':<16}")
        line = ""
        for i in range(num_bins): line += f"{int(df['obs'].iat[i])}\t"
        line = line[:-1]
        f.write(line + "\n")
        f.write("------------\n")

        # Bin-Bin section
        f.write(f"{'bin':<16}")
        line = ""
        for i in range(num_bins): line += f"bin{i + 1}\tbin{i + 1}\t"
        line = line[:-1]
        f.write(line + "\n")
        
        # Process section
        f.write(f"{'process':<16}")
        line = ""
        for i in range(num_bins): line += "sig\tbkg\t"
        line = line[:-1]
        f.write(line + "\n")

        # Process ID section:
        f.write(f"{'process':<16}")
        line = ""
        for i in range(num_bins): line += f"{-1*(i + 1)}\t{(i + 1)}\t"
        line = line[:-1]
        f.write(line + "\n")

        # Rate section
        f.write(f"{'rate':<16}")
        line = ""
        for i in range(num_bins): line += f"{df['sig'].iat[i]:.2f}\t{df['exp'].iat[i]:.2f}\t"
        line = line[:-1]
        f.write(line + "\n")
        f.write("------------\n")

        # Uncertainty:
        for i in range(num_bins):
            uncertainty_line = f"xs{i + 1:<6}lnN\t"
            values = []
            for j in range(num_bins):
                if j == i: # Diagonal element
                    values.append("-")  # Signal uncertainty
                    uncertainty_value = df['dbkg'].iat[i]
                    values.append(f"{uncertainty_value:.5f}") # Background uncertainty
                else:
                    values.append("-") # Signal uncertainty
                    values.append("-") # Background uncertainty
            uncertainty_line += "\t".join(values)
            f.write(uncertainty_line + "\n")
            
    print(f'Wrote file: {datacard}')


In [6]:
# Write one datacard per mass point that has yields loaded in datadict.
outdir = f'datacards/{outputname}'
os.makedirs(outdir, exist_ok=True)

count = 0
for sample, subs in sigdict.items():
    # Samples without loaded yields contribute nothing.
    loaded = datadict.get(sample, {})

    for subsample in subs:
        if subsample in loaded:
            count += 1

            # Bin number and observed count are stored as floats; cast them back.
            sample_df = pd.DataFrame(loaded[subsample]).astype({'bin': int, 'obs': int})

            # Most sensitive bins first; this order defines the datacard channels.
            sample_df_sorted = sample_df.sort_values(by='S/sqrtB', ascending=False)

            # Only preview the first four tables to keep the output short.
            if count <= 4:
                print(f"\nDataFrame for {sample}_{subsample}:")
                display(sample_df_sorted)

            datacard_path = os.path.join(outdir, f'datacard_{sample}_{subsample}.txt')
            write_datacard(sample_df_sorted, datacard_path)

print('Done!')


DataFrame for VLLD_mu_M100:


Unnamed: 0,bin,sig,obs,exp,experr,S/sqrtB,dbkg
3,4,413.81,623,623.849,8.63235,16.5677,1.01384
17,4,368.492,953,953.215,14.5895,11.9353,1.01531
4,5,143.045,285,285.907,5.36517,8.45981,1.01877
2,3,94.239,156,156.407,4.15204,7.53534,1.02655
18,5,150.998,499,499.221,11.9818,6.75812,1.024
5,6,52.7757,88,88.0511,2.36653,5.62428,1.02688
16,3,90.2128,289,289.874,8.46979,5.29863,1.02922
6,7,30.1574,51,51.8055,2.20294,4.18992,1.04252
19,6,43.5749,190,190.591,9.48952,3.15635,1.04979
11,5,44.8469,237,237.226,7.7096,2.91173,1.0325


Wrote file: datacards/datacards_VLLD_mu_2018UL_SE1_combined/datacard_VLLD_mu_M100.txt

DataFrame for VLLD_mu_M200:


Unnamed: 0,bin,sig,obs,exp,experr,S/sqrtB,dbkg
6,7,38.8642,51,51.8055,2.20294,5.3996,1.04252
13,7,25.2846,52,52.7873,2.78106,3.4801,1.05268
5,6,31.1269,88,88.0511,2.36653,3.31718,1.02688
4,5,47.1071,285,285.907,5.36517,2.78595,1.01877
3,4,60.2664,623,623.849,8.63235,2.41288,1.01384
12,6,21.371,80,80.8938,3.68821,2.37612,1.04559
11,5,24.8027,237,237.226,7.7096,1.61034,1.0325
9,3,11.0961,109,109.862,5.17164,1.05863,1.04707
1,2,5.06131,22,22.9272,1.38723,1.05703,1.06051
0,1,3.69554,12,12.4779,0.809931,1.04618,1.06491


Wrote file: datacards/datacards_VLLD_mu_2018UL_SE1_combined/datacard_VLLD_mu_M200.txt

DataFrame for VLLD_mu_M300:


Unnamed: 0,bin,sig,obs,exp,experr,S/sqrtB,dbkg
6,7,17.318,51,51.8055,2.20294,2.40608,1.04252
13,7,12.7925,52,52.7873,2.78106,1.76072,1.05268
12,6,7.84418,80,80.8938,3.68821,0.872147,1.04559
11,5,13.0528,237,237.226,7.7096,0.847467,1.0325
5,6,7.93794,88,88.0511,2.36653,0.845942,1.02688
4,5,14.1278,285,285.907,5.36517,0.835528,1.01877
3,4,16.0211,623,623.849,8.63235,0.641436,1.01384
10,4,9.77436,468,468.241,11.9696,0.451703,1.02556
0,1,1.36362,12,12.4779,0.809931,0.386032,1.06491
2,3,4.40969,156,156.407,4.15204,0.352598,1.02655


Wrote file: datacards/datacards_VLLD_mu_2018UL_SE1_combined/datacard_VLLD_mu_M300.txt

DataFrame for VLLD_mu_M400:


Unnamed: 0,bin,sig,obs,exp,experr,S/sqrtB,dbkg
6,7,8.15834,51,51.8055,2.20294,1.13348,1.04252
13,7,4.30442,52,52.7873,2.78106,0.592448,1.05268
12,6,3.76893,80,80.8938,3.68821,0.419045,1.04559
5,6,3.34331,88,88.0511,2.36653,0.356295,1.02688
0,1,1.25517,12,12.4779,0.809931,0.355332,1.06491
4,5,3.74397,285,285.907,5.36517,0.221421,1.01877
11,5,2.86737,237,237.226,7.7096,0.186167,1.0325
3,4,3.96916,623,623.849,8.63235,0.158913,1.01384
1,2,0.755527,22,22.9272,1.38723,0.157788,1.06051
2,3,1.12763,156,156.407,4.15204,0.090165,1.02655


Wrote file: datacards/datacards_VLLD_mu_2018UL_SE1_combined/datacard_VLLD_mu_M400.txt
Wrote file: datacards/datacards_VLLD_mu_2018UL_SE1_combined/datacard_VLLD_mu_M600.txt
Wrote file: datacards/datacards_VLLD_mu_2018UL_SE1_combined/datacard_VLLD_mu_M800.txt
Done!
