In [1]:
import uproot

In [2]:
import numpy as np

In [3]:
rng = np.random.default_rng()

In [4]:
filename1 = "df_nu_e_CC.root"
filename2 = "df_nu_tau_CC_hadronic.root"
train_file_name = "df_nu_CC_non-muonic_equal_train.root"
test_file_name = "df_nu_CC_non-muonic_equal_test.root"
treename = "df"

In [5]:
filename1 = "df_nu_mu_CC.root"
filename2 = "df_nu_tau_CC_muonic.root"
train_file_name = "df_nu_CC_muonic_equal_train.root"
test_file_name = "df_nu_CC_muonic_equal_test.root"
treename = "df"

In [6]:
tree1 = uproot.open(f"{filename1}:{treename}")

In [7]:
tree2 = uproot.open(f"{filename2}:{treename}")

In [8]:
train_frac = 0.8

In [9]:
train_file = uproot.recreate(train_file_name)
test_file = uproot.recreate(test_file_name)

In [10]:
target_dims = (3279, 116)
mufilter_dims = (3279, 68)

In [11]:
train_file.mktree(
    treename,
    {
        "X": (">f4", target_dims),
        "X_mufilter": (">f4", mufilter_dims),
        "start_x": ">f8",
        "start_y": ">f8",
        "start_z": ">f8",
        "nu_energy": ">f8",
        "hadron_energy": ">f8",
        "lepton_energy": ">f8",
        "energy_dep_target": ">f8",
        "energy_dep_mufilter": ">f8",
        "nu_flavour": ">i8",
        "is_cc": "bool",
    },
    title="Dataframe for CNN studies",
)

<WritableTree '/df' at 0x7fb3a00d5750>

In [12]:
test_file.mktree(
    treename,
    {
        "X": (">f4", target_dims),
        "X_mufilter": (">f4", mufilter_dims),
        "start_x": ">f8",
        "start_y": ">f8",
        "start_z": ">f8",
        "nu_energy": ">f8",
        "hadron_energy": ">f8",
        "lepton_energy": ">f8",
        "energy_dep_target": ">f8",
        "energy_dep_mufilter": ">f8",
        "nu_flavour": ">i8",
        "is_cc": "bool",
    },
    title="Dataframe for CNN studies",
)

<WritableTree '/df' at 0x7fb3981d4e50>

In [13]:
from tqdm import tqdm

In [14]:
prob = tree2.num_entries / (tree1.num_entries + tree2.num_entries)

In [15]:
tree2.num_entries / tree1.num_entries

0.039674751156596105

In [16]:
print(prob)

0.038160733549083066


In [17]:
from random import shuffle

In [18]:
gen1 = tree1.iterate(step_size="10MB", library="np")
gen2 = tree2.iterate(step_size="10MB", library="np")
t = tqdm(total=tree1.num_entries + tree2.num_entries)
for batch1, batch2 in zip(gen1, gen2):
    batch1_size = len(batch1["start_z"])
    batch2_size = len(batch2["start_z"])
    assert batch1_size == batch1_size
    batch_size = batch1_size + batch2_size
    batch_partition = rng.binomial(batch_size, train_frac)
    indexer = np.concatenate([np.zeros(batch1_size), np.ones(batch2_size)])
    shuffle(indexer)
    i1 = 0
    i2 = 0
    for index, sample in enumerate(indexer):
        batch = batch2 if sample else batch1
        i = i2 if sample else i1
        if index < batch_partition:
            train_file[treename].extend(
                {key: np.array([batch[key][i]]) for key in batch.keys()}
            )
        else:
            test_file[treename].extend(
                {key: np.array([batch[key][i]]) for key in batch.keys()}
            )
        if sample:
            i2 += 1
        else:
            i1 += 1
    t.update(batch_size)
t.close()

  9%|▉         | 9330/103824 [09:43<1:38:26, 16.00it/s]


In [19]:
print(train_file[treename].num_entries)
print(test_file[treename].num_entries)

7465
1865


In [20]:
train_file.close()
test_file.close()