# Preparing datacards for CMS-Combine

This notebook reads text files containing yield information in the following format and turns them into pandas DataFrames.
```
bin >> sig >> obs >> exp >> experr >> S/sqrtB >> dbkg
```


Each text file contains one model, for one specific signal region in one specific campaign. The dataframes from all signal regions and campaigns can be combined.

### Reading text files into a dictionary

In [1]:
# Standard library. `os` is used for path handling and directory creation;
# `sys` and `json` are not referenced in the cells shown here.
import os, sys
import json
# Numerical / tabular tools used to load and reshape the yield tables.
import numpy as np
import pandas as pd
# PyROOT. Not referenced in the cells shown here -- TODO confirm whether it is still needed.
import ROOT

# Simple marker that the import cell ran to completion.
print('Modules loaded.')

Welcome to JupyROOT 6.26/10
Modules loaded.


In [2]:
# Global settings for the notebook.

# Signal model to probe; it is matched against the top-level keys of `sigdict`.
signame = "VLLD_mu"

# Jobs whose yield files are read and joined together.
jobs = ['hist_2018UL_sr_Dec30_mm']

# Name of the output folder that will hold the datacards.
outputname = f'datacards_{signame}_2018UL_test'

# One row per mass point: (mass, xsec, ngen, scale).
# The subsample key is derived from the mass as 'M<mass>'.
_point_keys = ('mass', 'xsec', 'ngen', 'scale')
_mass_points = {
    'VLLD_ele': [
        (100,  16.9,     110871, 1),
        (200,  1.36,     73730,  1),
        (300,  0.291,    24753,  1),
        (400,  0.0907,   24491,  1),
        (600,  0.0149,   24611,  1),
        (800,  0.00347,  23680,  1),
        (1000, 0.000971, 24286,  1),
    ],
    'VLLD_mu': [
        (100, 16.9,    111926, 10),
        (200, 1.36,    73908,  1),
        (300, 0.291,   25022,  1),
        (400, 0.0907,  24299,  1),
        (600, 0.0149,  24890,  1),
        (800, 0.00347, 24763,  1),
    ],
}

# Expand the rows into the nested {model: {subsample: {field: value}}} layout.
sigdict = {
    model: {f'M{row[0]}': dict(zip(_point_keys, row)) for row in rows}
    for model, rows in _mass_points.items()
}

print('Global settings loaded.')

Global settings loaded.


In [3]:
def return_dict(jobname, signame, baseinputdir='../StackMaker/signalyields/', samples=None):
    """Load the yield tables of one job into nested dictionaries of numpy arrays.

    For every sample of `samples` selected by `signame`, and every subsample
    of that sample, the file `<baseinputdir>/<jobname>/yield_<sample>_<subsample>.txt`
    is read as a whitespace-separated table with the columns
    bin, sig, obs, exp, experr, S/sqrtB, dbkg.

    Parameters
    ----------
    jobname : str
        Sub-directory of `baseinputdir` holding the yield files of one job.
    signame : str or container of str
        Selects the samples to load: a sample is kept when `sample in signame`
        is true (substring match for a string, membership for a list/set).
    baseinputdir : str, optional
        Directory containing one sub-directory per job.
    samples : dict, optional
        Mapping `{sample: {subsample: ...}}` to iterate over. Defaults to the
        notebook-level `sigdict`.

    Returns
    -------
    dict
        `{sample: {subsample: {column: numpy array}}}`. A selected sample is
        always present (possibly empty); a subsample is present only when its
        file exists and was loaded completely.
    """
    columns = ['bin', 'sig', 'obs', 'exp', 'experr', 'S/sqrtB', 'dbkg']

    if samples is None:
        samples = sigdict  # fall back to the table defined in the settings cell

    indir = os.path.join(baseinputdir, jobname)
    df = {}

    for sample, subdict in samples.items():
        # Containment test on purpose: `signame` may be a single name or a collection.
        if sample not in signame:
            continue

        df[sample] = {}

        for subsample in subdict:
            filename = f'yield_{sample}_{subsample}.txt'
            filepath = os.path.join(indir, filename)

            if not os.path.exists(filepath):
                print(f'\033[33mWarning: File not found: {filepath}\033[0m')
                continue

            try:
                # Read the file into a pandas DataFrame
                temp_df = pd.read_csv(
                    filepath,
                    sep=r'\s+',  # Split by one or more whitespace characters
                    names=columns,
                )
                # Appending to an empty (float) array keeps the original
                # behaviour of promoting numeric columns to float arrays.
                loaded = {col: np.append(np.array([]), temp_df[col].values) for col in columns}
            except Exception as e:
                # Best effort: report the broken file and move on to the next one.
                print(f'\033[31mError loading file {filepath}: {e}\033[0m')
                continue

            # Register the subsample only once all columns are available, so a
            # failure can never leave a half-filled entry behind.
            df[sample][subsample] = loaded

    return df

In [4]:
# Merge the dictionaries of all jobs: arrays belonging to the same
# (sample, subsample) pair are concatenated in the order the jobs are listed.
datadict = {}

for jobname in jobs:
    print(f'Reading job: {jobname}')

    dict_job = return_dict(jobname, signame)

    for sample, subdict in dict_job.items():
        merged_sample = datadict.setdefault(sample, {})

        for subsample, columns in subdict.items():
            # First occurrence of this subsample: start from empty arrays.
            merged = merged_sample.setdefault(
                subsample,
                {
                    name: np.array([])
                    for name in ('bin', 'sig', 'obs', 'exp', 'experr', 'S/sqrtB', 'dbkg')
                },
            )

            for col, values in columns.items():
                merged[col] = np.append(merged[col], values)

print('Data collection complete!')

Reading job: hist_2018UL_sr_Dec30_mm
Data collection complete!


## Preparing datacard from the dictionary

In [5]:
def write_datacard(df, datacard):
    """Write a counting-experiment datacard with one channel per row of `df`.

    Every row becomes one channel named `bin<k>`, where k is the 1-based
    position of the row in `df` (not the value of its 'bin' column). Each
    channel has a 'sig' and a 'bkg' process and one lnN nuisance `xs<k>`
    that acts on the background of that channel only.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'obs', 'sig', 'exp' and 'dbkg'.
    datacard : str
        Path of the text file to write.

    Returns
    -------
    None
        If `df` has no rows, a warning is printed and no file is written.

    Raises
    ------
    KeyError, ValueError
        If a required column is missing or a value cannot be formatted (for
        example NaN in 'obs'). All lines are built before the file is opened,
        so such a failure never leaves a truncated datacard on disk.
    """
    df = df.reset_index(drop=True)
    num_bins = len(df)  # Total number of bins

    if num_bins == 0:
        print(f'Warning: Zero bins detected! Skipping file {datacard}')
        return

    def labelled(label, cells):
        # Left-aligned 16-character label followed by tab-separated cells.
        return f"{label:<16}" + "\t".join(cells)

    separator = "------------"
    bin_ids = range(1, num_bins + 1)

    # Format every value up front; any bad value raises here, before writing.
    observations = [str(int(value)) for value in df['obs']]
    # NOTE(review): rates are rounded to two decimals, so very small yields
    # are written as 0.00 -- confirm this precision is sufficient.
    rates = []
    for sig, exp in zip(df['sig'], df['exp']):
        rates.extend((f"{sig:.2f}", f"{exp:.2f}"))
    bkg_uncertainties = [f"{value:.5f}" for value in df['dbkg']]

    lines = [
        # Header information
        f"imax {num_bins}                          # number of channels",
        "jmax 1                           # number of backgrounds",
        f"kmax {num_bins}                          # number of nuisance parameters",
        separator,
        # Bin and observation sections
        labelled('bin', [f"bin{k}" for k in bin_ids]),
        labelled('observation', observations),
        separator,
        # Two columns (signal, background) per channel
        labelled('bin', [f"bin{k}" for k in bin_ids for _ in range(2)]),
        labelled('process', ["sig", "bkg"] * num_bins),
        # NOTE(review): 'sig'/'bkg' receive a different ID in every bin
        # (-k / +k); confirm this is what Combine expects.
        labelled('process', [str(sign * k) for k in bin_ids for sign in (-1, 1)]),
        labelled('rate', rates),
        separator,
    ]

    # Uncertainties: one lnN nuisance per channel, set only on that channel's
    # background column; every other column is "-".
    for i in range(num_bins):
        cells = ["-"] * (2 * num_bins)
        cells[2 * i + 1] = bkg_uncertainties[i]
        lines.append(f"xs{i + 1:<6}lnN\t" + "\t".join(cells))

    with open(datacard, 'w') as f:
        f.write("\n".join(lines) + "\n")

    print(f'Wrote file: {datacard}')


In [6]:
# Write one datacard per (sample, subsample) pair that has yields in `datadict`.
outdir = f'datacards/{outputname}'
os.makedirs(outdir, exist_ok=True)

count = 0
for sample, subs in sigdict.items():
    if sample not in datadict:
        continue

    for subsample, val in subs.items():
        if subsample not in datadict[sample]:
            continue

        count += 1
        sampleyield = datadict[sample][subsample]

        # Integer bin/obs columns, then keep only bins with a non-negligible S/sqrtB.
        sample_df = (
            pd.DataFrame(sampleyield)
            .astype({'bin': int, 'obs': int})
            .loc[lambda frame: frame['S/sqrtB'] > 0.0001]
        )
        # Most sensitive bins first.
        sample_df_sorted = sample_df.sort_values(by='S/sqrtB', ascending=False)

        # Only preview the first four tables to keep the output short.
        if count < 5:
            print(f"\nDataFrame for {sample}_{subsample}:")
            display(sample_df_sorted)

        datacard_path = os.path.join(outdir, f'datacard_{sample}_{subsample}.txt')
        write_datacard(sample_df_sorted, datacard_path)

print('Done!')


DataFrame for VLLD_mu_M100:


Unnamed: 0,bin,sig,obs,exp,experr,S/sqrtB,dbkg
3,4,467.317,597,597.746,9.9635,19.1141,1.01667
4,5,283.269,347,347.348,6.98263,15.1991,1.0201
2,3,91.721,101,101.634,3.09115,9.09808,1.03041
6,7,57.1607,69,69.6194,2.704,6.85066,1.03884
5,6,70.6785,120,120.223,3.58042,6.44606,1.02978


Wrote file: datacards/datacards_VLLD_mu_2018UL_test/datacard_VLLD_mu_M100.txt

DataFrame for VLLD_mu_M200:


Unnamed: 0,bin,sig,obs,exp,experr,S/sqrtB,dbkg
6,7,56.6738,69,69.6194,2.704,6.79231,1.03884
4,5,83.4812,347,347.348,6.98263,4.47926,1.0201
5,6,48.8873,120,120.223,3.58042,4.45865,1.02978
3,4,90.4104,597,597.746,9.9635,3.69794,1.01667
0,1,3.74175,7,7.67507,0.586551,1.35062,1.07642
2,3,11.8126,101,101.634,3.09115,1.17173,1.03041
1,2,4.35518,15,15.7289,1.69254,1.09814,1.10761


Wrote file: datacards/datacards_VLLD_mu_2018UL_test/datacard_VLLD_mu_M200.txt

DataFrame for VLLD_mu_M300:


Unnamed: 0,bin,sig,obs,exp,experr,S/sqrtB,dbkg
6,7,25.5809,69,69.6194,2.704,3.06585,1.03884
5,6,14.5643,120,120.223,3.58042,1.3283,1.02978
4,5,16.8466,347,347.348,6.98263,0.903922,1.0201
3,4,17.6226,597,597.746,9.9635,0.720794,1.01667
2,3,5.083,101,101.634,3.09115,0.504198,1.03041
1,2,1.9856,15,15.7289,1.69254,0.50066,1.10761
0,1,1.36362,7,7.67507,0.586551,0.492212,1.07642


Wrote file: datacards/datacards_VLLD_mu_2018UL_test/datacard_VLLD_mu_M300.txt

DataFrame for VLLD_mu_M400:


Unnamed: 0,bin,sig,obs,exp,experr,S/sqrtB,dbkg
6,7,10.9863,69,69.6194,2.704,1.3167,1.03884
0,1,1.18808,7,7.67507,0.586551,0.428848,1.07642
5,6,3.69442,120,120.223,3.58042,0.336941,1.02978
4,5,4.93063,347,347.348,6.98263,0.264557,1.0201
1,2,0.790608,15,15.7289,1.69254,0.199348,1.10761
3,4,4.53721,597,597.746,9.9635,0.18558,1.01667
2,3,1.11748,101,101.634,3.09115,0.110846,1.03041


Wrote file: datacards/datacards_VLLD_mu_2018UL_test/datacard_VLLD_mu_M400.txt
Wrote file: datacards/datacards_VLLD_mu_2018UL_test/datacard_VLLD_mu_M600.txt
Wrote file: datacards/datacards_VLLD_mu_2018UL_test/datacard_VLLD_mu_M800.txt
Done!
