In [1]:
import os
from pathlib import Path
import sys

import lightning
import numpy as np

# Root of the NTUHEPML-CWoLa checkout. Defaults to the original author's
# location; set the CWOLA_PROJECT_ROOT environment variable to run the
# notebook from a different checkout without editing this cell.
project_root = Path(os.environ.get('CWOLA_PROJECT_ROOT', '/home/yianchen/NTUHEPML-CWoLa'))

# Make the project importable. Guarded so that re-running this cell does not
# keep appending duplicate entries to sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))

from importnb import Notebook

# Inside the `Notebook()` context the import below is resolved by importnb,
# which is what allows `notebooks.main_tf` to be imported from this notebook.
with Notebook():
    from notebooks.main_tf import LitDataModule

# Print arrays in full (no line wrapping, no summarisation) with enough digits
# to compare values element by element.
np.set_printoptions(linewidth=np.inf, threshold=np.inf, precision=15, floatmode="unique")  # comment out to disable

In [2]:
'''Change the hyperparameters here'''
# --- run options -----------------------------------------------------------
data_mode = 'jet_flavor'  # 'jet_flavor' or 'supervised'
data_format = 'sequence'  # 'image' or 'sequence'
luminosity = 100          # change the luminosity here

# --- per-sample inputs (change the dataset paths here) ----------------------
# Each sample entry carries the dataset path, the matching selection-result
# file and a precomputed cross section.
signal_info = {
    'path': 'data/VBF_diphoton.h5',
    'cut_info': 'data/selection_results_VBF_quark_jet.npy',
    'cross_section': 4278.0,    # 4.278 * 1000, precomputed
}
background_info = {
    'path': 'data/GGF_diphoton.h5',
    'cut_info': 'data/selection_results_GGF_quark_jet.npy',
    'cross_section': 54670.0,   # 54.67 * 1000, precomputed
}

# --- bundle handed to the data module --------------------------------------
data_info = {
    'decay_channel': 'diphoton',
    'branching_ratio': 0.00227,   # branching ratio for H -> aa
    'signal': signal_info,
    'background': background_info,
}


# data setup
# Build the data module twice (with and without the decay products) and dump
# every train/valid/test split of each variant to its own directory.
for include_decay in (True, False):

    # Same seed before each construction, so both variants start from it.
    lightning.seed_everything(123)

    lit_data_module = LitDataModule(
        data_mode=data_mode,
        data_format=data_format,
        data_info=data_info,
        include_decay=include_decay,
        luminosity=luminosity,
        batch_size=512,
    )

    # Output directory: <project_root>/data/pc_photon/<w|wo>_photon-<format>
    if include_decay:
        prefix = 'w_photon'
    else:
        prefix = 'wo_photon'
    output = project_root / 'data' / 'pc_photon' / f"{prefix}-{data_format}"
    os.makedirs(output, exist_ok=True)

    # File name -> split to store; written in insertion order.
    splits_by_file = {
        'train_sig.npy': lit_data_module.train_sig,
        'train_bkg.npy': lit_data_module.train_bkg,
        'valid_sig.npy': lit_data_module.valid_sig,
        'valid_bkg.npy': lit_data_module.valid_bkg,
        'test_sig.npy': lit_data_module.test_sig,
        'test_bkg.npy': lit_data_module.test_bkg,
    }
    for file_name, split_data in splits_by_file.items():
        np.save(output / file_name, split_data.numpy())

Seed set to 123




  pt_mean = np.nanmean(pt, axis=-1, keepdims=True)
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,


[CWoLa-Log] [signal] two quark jet: sig region: 340 events
[CWoLa-Log] [signal] two quark jet: bkg region: 53 events
[CWoLa-Log] [background] two quark jet: sig region: 560 events
[CWoLa-Log] [background] two quark jet: bkg region: 562 events


Seed set to 123


[MCSimData Log] /home/yianchen/NTUHEPML-CWoLa/data/VBF_diphoton.h5-TOWER has purity 100.0000%
[MCSimData Log] /home/yianchen/NTUHEPML-CWoLa/data/VBF_diphoton.h5-TRACK has purity 0.0000%
[MCSimData Log] /home/yianchen/NTUHEPML-CWoLa/data/GGF_diphoton.h5-TOWER has purity 100.0000%
[MCSimData Log] /home/yianchen/NTUHEPML-CWoLa/data/GGF_diphoton.h5-TRACK has purity 0.0000%
[CWoLa-Log] [signal] two quark jet: sig region: 340 events
[CWoLa-Log] [signal] two quark jet: bkg region: 53 events
[CWoLa-Log] [background] two quark jet: sig region: 560 events
[CWoLa-Log] [background] two quark jet: bkg region: 562 events


In [3]:
# Cross-check: count the test samples for which the array exported by the
# TensorFlow pipeline (data/tf_photon) differs from the one saved from the
# PyTorch data module (data/pc_photon).
L = 100                   # luminosity tag in the TF export directory name
seed = 123                # seed tag in the TF export directory name
data_format = 'sequence'  # 'image' or 'sequence'

tf_mode = 'test'          # split loaded from the TF export
pc_mode = 'test'          # split loaded from the PyTorch export

# Sequence data is compared on its leading 400 rows and 3 columns only.
# NOTE(review): rows past 400 in the TF export appear to hold the diphoton
# entries (per the earlier debug prints) -- confirm against the TF pipeline.
n_rows = 400
n_cols = 3

# Element-wise tolerance: |tf - pc| <= atol + rtol * |pc|.
# rtol is numpy's allclose/isclose default, spelled out so it is not overlooked.
atol = 1e-7
rtol = 1e-5

for include_decay in [True, False]:

    photon_file = 'w_photon' if include_decay else 'wo_photon'
    photon_file += f"-{data_format}"

    # The directory name is built from L and seed, so the data that is loaded
    # always matches the tag printed in the summary line.
    tf_data = np.load(project_root / 'data' / 'tf_photon' / photon_file / f"L{L}_seed_{seed}" / f"X_{tf_mode}_pt_norm.npy")
    tf_data = tf_data.astype(np.float32)
    if data_format == 'image':
        tf_data = tf_data.transpose(0, 3, 1, 2)  # (N, H, W, C) -> (N, C, H, W)
    elif data_format == 'sequence':
        tf_data = np.nan_to_num(tf_data, nan=-1)

    # PyTorch export stores signal and background separately; stack them
    # signal-first along the sample axis.
    pc_path = project_root / 'data' / 'pc_photon' / photon_file
    pc_data = np.concatenate((np.load(pc_path / f"{pc_mode}_sig.npy"), np.load(pc_path / f"{pc_mode}_bkg.npy")), axis=0)
    pc_data = pc_data.astype(np.float32)
    if data_format == 'sequence':
        # Same NaN -> -1 replacement as on the TF side so NaN entries compare equal.
        pc_data = np.nan_to_num(pc_data, nan=-1)
        tf_data = tf_data[:, :n_rows, :n_cols]
        pc_data = pc_data[:, :n_rows, :n_cols]

    # Fail early with a clear message instead of an IndexError / broadcast
    # error (or a silently partial comparison) further down.
    assert tf_data.shape == pc_data.shape, f"Shape mismatch: {tf_data.shape} vs {pc_data.shape}"

    # A sample is mismatched if any of its elements is outside the tolerance.
    # This is the vectorised equivalent of calling np.allclose per sample.
    close = np.isclose(tf_data, pc_data, rtol=rtol, atol=atol)
    mismatched = ~close.all(axis=tuple(range(1, close.ndim)))
    counter = int(mismatched.sum())

    # To inspect individual mismatches, look at np.flatnonzero(mismatched) and
    # compare tf_data[i] with pc_data[i] for those indices.

    print(f"[L{L}][{photon_file}] Found {counter} mismatched samples out of {len(tf_data)}\n")

[L100][w_photon-sequence] Found 46 mismatched samples out of 20000

[L100][wo_photon-sequence] Found 59 mismatched samples out of 20000

