In [None]:
from pathlib import Path
import uproot
import pandas as pd
import sys
import numpy as np
import warnings
import matplotlib.pyplot as plt

# PID weights and uncertainties

Here we calculate the PID effects by comparing data and MC for certain benchmark channels. For this we use the [systematics framework](https://syscorrfw.readthedocs.io/en/latest/index.html).

First we will obtain the PID correction tables with their uncertainties, then we will apply the weights to the ntuples, and lastly we will calculate the systematics table.

## Correction tables

This code will only run on KEKCC.

In [None]:
from IPython.display import HTML 

# Path to scripts on KEKCC
sys_path = '/group/belle2/dataprod/Systematics/systematic_corrections_framework/scripts'

# Make the framework scripts importable; index 1 keeps sys.path[0] in front.
sys.path.insert(1, sys_path)

# The framework root directory is appended as well.
sys.path.append('/group/belle2/dataprod/Systematics/systematic_corrections_framework')

# Presumably these modules live in the directories added above (they are not
# standard packages) — confirm on KEKCC. In the cells shown here only `wm` is used.
import efficiency_table as et
import process_tables as pt
import show_db_content as sdb
import id_vs_misid_curve as roc
import weight_table as wm

In [None]:
# Data/MC ratio table for particle type "K" with the kaonID > 0.9 requirement,
# binned in track momentum and cos(theta).
# Original inline note on the particle type: "pi - efficiency max 5%".
momentum_edges = list(np.linspace(0.5, 4.5, 11))
cos_theta_edges = [-0.866, -0.682, -0.4226, -0.1045, 0.225, 0.5, 0.766, 0.8829, 0.9563]

ratio_cfg = dict(
    cut="kaonID > 0.9",
    particle_type="K",
    data_collection="proc13+prompt",
    mc_collection="MC15ri",
    track_variables=["p", "cosTheta"],
    precut="nCDCHits > 20",
    binning=[momentum_edges, cos_theta_edges],
)
efficiency = wm.produce_data_mc_ratio(**ratio_cfg)
efficiency_table_K = efficiency.create_weights()

# Show the produced weight table (a pandas DataFrame) as the cell output.
efficiency_table_K

In [None]:
# Data/MC ratio table for particle type "pi" with the same kaonID > 0.9
# requirement and binning as the kaon table.
# NOTE(review): presumably this is the pion -> kaon mis-identification rate;
# the original inline note reads "pi - efficiency max 5%" — confirm.
momentum_edges = list(np.linspace(0.5, 4.5, 11))
cos_theta_edges = [-0.866, -0.682, -0.4226, -0.1045, 0.225, 0.5, 0.766, 0.8829, 0.9563]

ratio_cfg = dict(
    cut="kaonID > 0.9",
    particle_type="pi",
    data_collection="proc13+prompt",
    mc_collection="MC15ri",
    track_variables=["p", "cosTheta"],
    precut="nCDCHits > 20",
    binning=[momentum_edges, cos_theta_edges],
)
efficiency = wm.produce_data_mc_ratio(**ratio_cfg)
efficiency_table_pi = efficiency.create_weights()

# Show the produced weight table (a pandas DataFrame) as the cell output.
efficiency_table_pi

In [None]:
# Persist both correction tables so that the following sections can be re-run
# without access to the systematics framework.
table_dir = Path('pid-tables')
table_dir.mkdir(parents=True, exist_ok=True)  # to_csv does not create directories

efficiency_table_K.to_csv(table_dir / 'efficiency_table_K.csv')
efficiency_table_pi.to_csv(table_dir / 'efficiency_table_pi.csv')

# pandas has no top-level `from_csv`; `read_csv` is the reader. `index_col=0`
# restores the index that `to_csv` wrote as the first column instead of
# turning it into an extra "Unnamed: 0" data column.
efficiency_table_K = pd.read_csv(table_dir / 'efficiency_table_K.csv', index_col=0)
efficiency_table_pi = pd.read_csv(table_dir / 'efficiency_table_pi.csv', index_col=0)

## Applying PID weights

This code will only run on KEKCC, as the corresponding samples are there. 

In [None]:
BASE = Path('/home/belle2/lorenzg/pyhf-tutorial/ntuples_reconstructed')

# Sample name -> ROOT file, relative to BASE. Every file is read through the
# 'Bsig' object path.
_sample_files = {
    'data_ssbar':   'MC15ri_b_Kpi0_generic_200fb_8/ssbar.root',
    'data_ccbar':   'MC15ri_b_Kpi0_generic_200fb_8/ccbar.root',
    'data_charged': 'MC15ri_b_Kpi0_generic_200fb_8/charged.root',
    'data_mixed':   'MC15ri_b_Kpi0_generic_200fb_8/mixed.root',
    'signal':       'MC15ri_b_Kpi0_signal_8/00.root',
    'signal_rmt':   'MC15ri_b_Kpi0_signal_rmT_4/sub00/grid_00000_job348357260_00.root',
    'ssbar':        'MC15ri_b_Kpi0_generic_200fb_9/ssbar.root',
    'ssbar_rmt':    'MC15ri_b_Kpi0_generic_200fb_9/ssbar_rmT.root',
    'ccbar':        'MC15ri_b_Kpi0_generic_200fb_9/ccbar.root',
    'ccbar_rmt':    'MC15ri_b_Kpi0_generic_200fb_9/ccbar_rmT.root',
    'charged':      'MC15ri_b_Kpi0_generic_200fb_9/charged.root',
    'charged_rmt':  'MC15ri_b_Kpi0_generic_200fb_9/charged_rmT.root',
    'mixed':        'MC15ri_b_Kpi0_generic_200fb_9/mixed.root',
    'mixed_rmt':    'MC15ri_b_Kpi0_generic_200fb_9/mixed_rmT.root',
    'misID':        'MC15ri_b_Kpi0_misID/misID.root',
    'misID_rmt':    'MC15ri_b_Kpi0_misID/misID_rmT.root',
}
# uproot-style {file path: object path} mapping for each sample.
ntuples = {name: {BASE / rel_path: 'Bsig'} for name, rel_path in _sample_files.items()}

# Branches read from the input ntuples.
cols = ['__production__', 'B_isSignal', 'B_deltaE', 'K_mcPDG',  'K_p', 'K_theta', 'B_R2', 'B_cosTBTO', 'B_mcErrors']
# Columns taken from the PID correction tables and attached to every candidate.
pid_cols = [
    'data_MC_ratio',
    'data_MC_uncertainty_stat_up', 'data_MC_uncertainty_stat_dn',
    'data_MC_uncertainty_sys_up', 'data_MC_uncertainty_sys_dn',
]

# List the branches available in the signal sample.
uproot.open(ntuples['signal']).keys()

In [None]:
def get_weight(p, th, eff, var):
    """Select the correction-table entries whose bin contains one track.

    Input:
        p   : track momentum, compared against the p_min / p_max columns
        th  : polar angle; its cosine is compared against the
              cosTheta_min / cosTheta_max columns
        eff : table with p_min, p_max, cosTheta_min, cosTheta_max columns
        var : column name (or list of names) to return
    Output:
        eff[var] restricted to the rows with p_min <= p < p_max and
        cosTheta_min <= cos(th) < cosTheta_max. The selection is empty when
        the track lies outside every bin.
    """
    cos_theta = np.cos(th)
    in_p_bin = (eff['p_min'] <= p) & (eff['p_max'] > p)
    in_cos_bin = (eff['cosTheta_min'] <= cos_theta) & (eff['cosTheta_max'] > cos_theta)
    return eff[in_p_bin & in_cos_bin][var]

def _lookup_pid_weights(p, theta, table, columns, default):
    """Vectorised bin lookup of PID table entries for many tracks at once.

    Input:
        p, theta : arrays of track momentum and polar angle
        table    : correction table with p_min, p_max, cosTheta_min,
                   cosTheta_max and the requested columns
        columns  : table columns to return
        default  : values used for tracks that fall into no table bin
    Output:
        float array of shape (len(p), len(columns)); row i belongs to track i.
        If several table rows match a track, the last matching row is used.
    """
    p = np.asarray(p, dtype=float)
    cos_theta = np.cos(np.asarray(theta, dtype=float))
    values = np.tile(np.asarray(default, dtype=float), (len(p), 1))
    edges = table[['p_min', 'p_max', 'cosTheta_min', 'cosTheta_max']].to_numpy(dtype=float)
    payload = table[list(columns)].to_numpy(dtype=float)
    # Loop over the (few) table bins rather than over the (many) tracks.
    for (p_lo, p_hi, c_lo, c_hi), row in zip(edges, payload):
        in_bin = (p >= p_lo) & (p < p_hi) & (cos_theta >= c_lo) & (cos_theta < c_hi)
        values[in_bin] = row
    return values


# Number of candidates whose K track is truth-matched to a kaon / pion,
# accumulated over all samples.
nK = 0
npi = 0

kPDG = 321
piPDG = 211

# Neutral entry (ratio 1, all uncertainties 0) for tracks that are neither
# kaons nor pions. It is also used for kaons/pions outside the table binning:
# previously such tracks produced an empty lookup that was silently dropped,
# shifting all following weights onto the wrong candidates.
# NOTE(review): confirm that a neutral weight is the desired treatment for
# out-of-range tracks.
neutral = [1] + [0] * (len(pid_cols) - 1)

# uproot.recreate does not create missing directories.
Path('ntuples').mkdir(parents=True, exist_ok=True)

for k, v in ntuples.items():
    df = uproot.concatenate(v, cols, library='pd')

    momentum = df['K_p'].to_numpy()
    theta = df['K_theta'].to_numpy()
    abs_pdg = np.abs(df['K_mcPDG'].to_numpy())
    is_K = abs_pdg == kPDG
    is_pi = abs_pdg == piPDG
    nK += int(is_K.sum())
    npi += int(is_pi.sum())

    # One row per candidate, filled positionally so rows can never get misaligned.
    weights = np.tile(np.asarray(neutral, dtype=float), (len(df), 1))
    weights[is_K] = _lookup_pid_weights(momentum[is_K], theta[is_K],
                                        efficiency_table_K, pid_cols, neutral)
    weights[is_pi] = _lookup_pid_weights(momentum[is_pi], theta[is_pi],
                                         efficiency_table_pi, pid_cols, neutral)

    for i, name in enumerate(pid_cols):
        df[name] = weights[:, i]

    # add total uncertainty: symmetrised stat. and sys. parts added in quadrature
    stat = (df[pid_cols[1]] + df[pid_cols[2]]) / 2.
    syst = (df[pid_cols[3]] + df[pid_cols[4]]) / 2.
    df['PID_total_uncertainty'] = np.sqrt(stat**2 + syst**2)

    # df.info() prints its summary itself and returns None.
    print(k)
    df.info()
    with uproot.recreate(f'ntuples/{k}.root') as file:
        file['B'] = df

In [None]:
# Number of processed candidates whose K track is truth-matched (|K_mcPDG|) to
# a kaon and to a pion, respectively, counted over all samples in the loop above.
print(nK, npi)

## Calculating systematic corrections

In [None]:
def getBin(df, var, bins):
    '''
        Determine the 1D fit-bin number of every entry.
        Input:
            df: input dataframe (anything indexable as df[var])
            var: name of the variable to bin
            bins: bin edges of the fitting binning
        Output:
           np.array of bin numbers as returned by np.digitize: values inside
           the binning get 1..len(bins)-1, values below the first edge get 0
           and values at or above the last edge get len(bins).

        Example usage:
           idx = getBin(df, 'B_deltaE', np.linspace(-0.4, 0.4, 21))
    '''
    values = df[var]
    return np.digitize(values, bins)


# Production number -> MC category.
_PRODUCTION_CATEGORIES = {
    25052: 'signal',
    24787: 'qqbar',
    24797: 'qqbar',
    24817: 'BBbar',
    24822: 'BBbar',
    26308: 'misID',
}


def identify_mc_category(prod_id):
    ''' Identify a MC category from a production number

        Input:
            prod_id : production number
        Output:
            'signal', 'qqbar', 'BBbar' or 'misID'; 'unknown' (together with a
            warning) for any other production number.
    '''
    category = _PRODUCTION_CATEGORIES.get(prod_id)
    if category is None:
        warnings.warn(f"{prod_id} category unknown.")
        return 'unknown'
    return category


# Element-wise version for arrays of production numbers. `otypes` is given
# explicitly so that empty inputs work (np.vectorize otherwise raises on
# size-0 input) and the function is not called an extra time on the first
# element just to probe the output type.
v_category = np.vectorize(identify_mc_category, otypes=[str])

class PIDstatErrorPropagator:
    '''
       Class to compute covariance matrix for PID stat. uncertainties.

       Every distinct uncertainty value gets one set of `nrep` log-normal toy
       factors; events sharing the same uncertainty value share the same toys
       and therefore fluctuate coherently.
    '''

    def __init__(self, df, bins, varStat, nrep=500, seed=None):
        '''
           Input variables:
              df   : mc sample. Should include all productions (signal + backgrounds)
              bins: bin edges of the fitting binning
              varStat : PID stat. error
              nrep : number of toy MC replica
              seed : optional seed for a dedicated random generator. With the
                     default (None) the global numpy random state is used, so
                     np.random.seed() still controls the toys.
        '''
        # NaN uncertainties are treated as 0 (i.e. no fluctuation).
        values = np.nan_to_num(np.asarray(df[varStat], dtype=float))
        keys = np.unique(values)  # sorted -> toy generation order is deterministic
        self.df = df
        self.bins = bins
        self.len = len(keys)
        self.varStat = varStat
        self.nrep = nrep
        print("len=", self.len)
        rng = np.random if seed is None else np.random.default_rng(seed)
        self.toys = dict()
        for key in keys:
            # The log-normal width is the uncertainty itself, capped at 1.
            self.toys[float(key)] = rng.lognormal(0., min(key, 1.), nrep)

    def getW(self):
        ''' Return array of toy weight factors for each event
            Output:
              out :  array ( len(df),  self.nrep ) of toy factors per event
        '''
        values = np.nan_to_num(np.asarray(self.df[self.varStat], dtype=float))
        uniq, inverse = np.unique(values, return_inverse=True)
        # One row of toys per distinct uncertainty value, then fan out to the
        # events with a single fancy-indexing step instead of a Python loop.
        table = np.empty((len(uniq), self.nrep))
        for j, key in enumerate(uniq):
            table[j] = self.toys[key]
        return table[inverse.reshape(-1)]

    def cov(self, var, prods=None):
        '''
            Compute covariance matrix for df and fit bins

            Input variables:
               var : name of the column that is binned with self.bins
               prods : event categories; defaults to
                       ['signal', 'qqbar', 'BBbar', 'misID']

            Output:
               av    : toy-averaged sum of toy factors for each bin of each
                       event category, flattened to length Ncategories x Nbins
               cov   : covariance matrix of these sums over the toys
               prods : categories used (useful if default value is used)
        '''
        if prods is None:
            # Fresh list per call: a shared mutable default would be handed
            # out to (and could be modified by) every caller.
            prods = ['signal', 'qqbar', 'BBbar', 'misID']
        prod = v_category(np.asarray(self.df['__production__']))
        toys = self.getW()
        bin_idx = getBin(self.df, var, self.bins)
        nbin = len(self.bins)-1
        # over categories and bins
        out = np.zeros((nbin*len(prods), self.nrep))
        for j, p in enumerate(prods):
            in_category = (prod == p)
            for i in range(nbin):
                # np.digitize numbers the in-range bins from 1
                out[j*nbin+i] = np.sum(toys[(bin_idx == i+1) & in_category], axis=0)

        av = np.average(out, axis=1)
        dd = out-av[:, np.newaxis]
        # Same sum over toys as the outer-product form, without the
        # (N, N, nrep) temporary.
        cov = dd.dot(dd.T)/self.nrep
        return av, cov, prods
    
def covToNui(cov):
    '''
    Perform eigen-decomposition of covariance matrix cov

    Input variables:
        cov : covariance matrix
    Output variables:
        vec : eigenvectors scaled by the square root of their eigenvalue, one
              per row, ordered with most significant being the last.
              Non-positive eigenvalues are set to zero before scaling.
    '''
    eigenvalues, eigenvectors = np.linalg.eigh(cov)
    # Anything that is not strictly positive is zeroed.
    eigenvalues[~(eigenvalues > 0)] = 0
    scale = np.diag(np.sqrt(eigenvalues))
    # error vectors, to use with "nuisance parameters"
    return eigenvectors.dot(scale).T

def truncateNui(vec, n):
    '''
    Keep only N most significant eigenvectors, cut the rest

    Input variables:
        vec : input eigenvectors, one per row, most significant last
        n   : number of eigenvectors to keep; values outside [0, len(vec)]
              are clamped to that range

    Output:
        dia  : uncorrelated part, to be added to stat. uncertainty in quadrature
        ovec : remaining correlated eigenvectors
    '''
    s = vec.shape[0]
    # Without clamping, n > s gives a negative slice start and silently moves
    # leading vectors into the uncorrelated part instead of keeping them all.
    n = min(max(n, 0), s)
    ovec = vec[s-n:]
    odia = vec[:s-n]
    dia = np.sqrt(np.sum(odia**2, axis=0))
    return dia, ovec

def check_array(x):
    '''Return True if x exposes a `shape` attribute (array-like), else False.'''
    # hasattr only absorbs the missing-attribute case, unlike a bare `except`
    # which would also swallow KeyboardInterrupt and unrelated errors.
    return hasattr(x, 'shape')

def dumpSyst(dia, ovec, cent, names, fname):
    '''
        Store results in a csv file.

        Input variables:
           dia  : array of uncorrelated uncertainties (or None)
           ovec : matrix of correlated eigenvectors (one per row), or a
                  single 1D vector
           cent : central values
           names: names of categories
           fname: output file name
        Output:
           creates a comma separated table "fname"
           column "type" has value "u" for uncorrelated and "c" for correlated uncertainty
           other columns encode bin/category info, e.g.  "signal_10"
           values are relative uncertainties in percent; entries whose
           central value is not positive are written as 0
    '''
    cent = np.asarray(cent, dtype=float)

    def _relative(values):
        # Divide only where the central value is positive, so empty bins give
        # 0 without the divide-by-zero / invalid-value runtime warnings.
        values = np.asarray(values, dtype=float)
        out = np.zeros(np.broadcast(values, cent).shape)
        np.divide(values, cent, out=out, where=cent > 0)
        return out

    def _format(row):
        return ",".join("{:6.3f}".format(x) for x in row)

    # Anything without a `shape` attribute (e.g. None) means "no uncorrelated part".
    has_dia = hasattr(dia, 'shape')
    rel_dia = _relative(dia) if has_dia else None
    rel_vec = _relative(ovec)

    # Number of value columns; shape[-1] also covers a single 1D vector.
    n_columns = len(dia) if has_dia else rel_vec.shape[-1]
    bins_per_name = n_columns // len(names)

    header = "type" + "".join(
        ",{}_{}".format(n, b+1) for n in names for b in range(bins_per_name))

    with open(fname, "w") as f:
        f.write(header+"\n")

        if has_dia:
            f.write("u," + _format(rel_dia*100) + "\n")
        if rel_vec.ndim > 1:
            for i, row in enumerate(rel_vec):
                f.write("c{},".format(i+1) + _format(row*100) + "\n")
        else:
            f.write("c," + _format(rel_vec*100) + "\n")


In [None]:
# Samples written by the weighting step (tree 'B' in ntuples/<name>.root).
productions = ['signal', 'ssbar', 'ccbar', 'charged', 'mixed', 'misID']
cols = ['__production__', 'B_isSignal', 'B_deltaE', 'K_mcPDG',  'K_p', 'K_theta', 'B_R2', 'B_cosTBTO', 'B_mcErrors']
files = [{f'ntuples/{sample}.root': 'B'} for sample in productions]

# Original branches plus the PID columns added by the weighting step
# (`pid_cols` comes from an earlier cell).
branches = cols + pid_cols + ['PID_total_uncertainty']

# NOTE(review): `all` shadows the Python builtin; the name is kept because a
# later cell passes it to PIDstatErrorPropagator.
all = uproot.concatenate(files, branches, library='pd')
all.head()

In [None]:
# Fit binning in B_deltaE: 20 equal-width bins between -0.4 and 0.4.
bins = np.linspace(-0.4, 0.4, 20 + 1)

# The toy replicas are drawn from numpy's global random state; fixing the seed
# makes the covariance matrix (and everything derived from it) reproducible
# under Restart & Run All.
np.random.seed(20230101)
pid = PIDstatErrorPropagator(all, bins, varStat='PID_total_uncertainty', nrep=500)

# sigAv: toy-averaged sums per (category, bin); sigCov: their covariance;
# pnames: the category names used.
sigAv, sigCov, pnames = pid.cov('B_deltaE')

In [None]:
# Decompose the PID covariance matrix into scaled eigenvectors (one per row,
# most significant last) that can serve as nuisance-parameter error vectors.
vec = covToNui(sigCov)

In [None]:
from mpl_toolkits.axes_grid1 import make_axes_locatable
_labels = {'signal': 'signal', 'qqbar': 'qqbar', 'BBbar': 'BBbar', 'misID': 'misID'}

# Number of leading eigenvectors kept as correlated nuisance parameters.
N = 5


def _correlation(cov):
    """Turn a covariance matrix into a correlation matrix.

    Entries involving a zero-variance bin are set to 0.
    """
    variances = np.diag(cov)
    norm = np.sqrt(variances[np.newaxis, :]*variances[:, np.newaxis])
    # The division is evaluated everywhere before np.where selects, so silence
    # the warnings raised for the zero-variance entries that are discarded.
    with np.errstate(divide='ignore', invalid='ignore'):
        return np.where(norm > 0, cov/norm, 0)


def _draw_correlation(fig, axis, corr, names, title):
    """Draw one correlation matrix with a colour bar and one tick per category."""
    im = axis.imshow(corr, vmin=-0.2, vmax=1, origin='lower')
    divider = make_axes_locatable(axis)
    cax = divider.append_axes("right", size="5%", pad=0.05)
    fig.colorbar(im, cax=cax)
    # One tick at the first bin of each category, derived from the matrix size
    # instead of hard-coding 4 categories x 20 bins.
    bins_per_category = corr.shape[0] // len(names)
    ticks = np.arange(0, bins_per_category*len(names), bins_per_category)
    axis.set_xticks(ticks)
    axis.set_xticklabels([_labels[p] for p in names], rotation=45)
    axis.set_yticks(ticks)
    axis.set_yticklabels([_labels[p] for p in names], rotation=0)
    axis.set_xlabel('Bin number')
    axis.set_ylabel('Bin number')
    axis.set_title(title)


fig, ax = plt.subplots(1,2,figsize=(12,5))

# Left: correlation matrix of the full toy covariance.
_draw_correlation(fig, ax[0], _correlation(sigCov), pnames, 'Full covariance')

# Right: covariance rebuilt from the N leading eigenvectors plus the
# remaining part treated as uncorrelated (diagonal).
d,v =truncateNui(vec,N)
cov2 = np.transpose(v).dot(v) +np.identity(len(d))*d**2
_draw_correlation(fig, ax[1], _correlation(cov2), pnames,
                  '{} leading eigenvectors + uncorrelated part'.format(N))

fig.tight_layout()
fig.patch.set_facecolor('white')

In [None]:
# Write the uncorrelated part (d) and the kept correlated eigenvectors (v),
# relative to the central values sigAv, to a CSV table.
# The 'pid-tables' directory must already exist: dumpSyst opens the file
# directly and does not create directories.
dumpSyst(d, v, sigAv, pnames, 'pid-tables/pid_systematics.csv')