In [1]:
import os
import json
from itertools import product
import numpy as np
import pandas as pd
from Gaugi import load as gload
from Gaugi import save as gsave
from kepler.pandas.readers import load as kload
from sklearn.model_selection import StratifiedKFold

Welcome to JupyROOT 6.16/00
Using all sub packages with ROOT dependence


In [2]:
# --- Configuration --------------------------------------------------------
# Bin indices to process; the processing loop visits every (et, eta) pair.
old_ets = np.arange(5)
old_etas = np.arange(5)
# Uncomment to restrict the run to a single bin:
#old_ets = np.array([4])
#old_etas = np.array([0])

# Column names for the et / eta variables (not referenced by the cells
# visible in this notebook; kept for downstream use).
et_key = 'L2Calo_et'
eta_key = 'L2Calo_eta'

# Dataset selection: exactly one `dataset = ...` line should be active.
# dataset = 'mc16_13TeV.302236_309995_341330.sgn.boosted_probes.WZ_llqq_plus_radion_ZZ_llqq_plus_ggH3000.merge.25bins.v2'
dataset = 'data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM1.bkg.VProbes_EGAM7.GRL_v97'
# dataset = 'data17_13TeV.AllPeriods.sgn.probes_lhvloose_EGAM1.bkg.vprobes_vlhvloose_EGAM7.GRL_v97.25bins'

# Input layout: ~/data/<dataset>/<dataset>_et{et}_eta{eta}.npz
homepath = os.path.expanduser('~')
datapath = os.path.join(homepath, 'data', dataset)
filepath = os.path.join(datapath, dataset + '_et{et}_eta{eta}.npz')

# Loader used for each bin file: 'gload' or 'kload'.
load_func = 'gload'

# When True, every row also receives a `test_fold` label from a seeded,
# shuffled 10-fold stratified split (fixed random_state => reproducible).
add_fold = True
if add_fold:
    crossval = StratifiedKFold(n_splits=10, random_state=512, shuffle=True)

# Output layout: ~/data/ided_<dataset>.parquet/ided_<dataset>.parquet_et{et}_eta{eta}.parquet
out_dataset = 'ided_' + dataset + '.parquet'
out_datapath = os.path.join(homepath, 'data', out_dataset)
out_filepath = os.path.join(out_datapath, out_dataset + '_et{et}_eta{eta}.parquet')
# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of `if not os.path.exists(...)`.
os.makedirs(out_datapath, exist_ok=True)

In [None]:
# For every (et, eta) bin: load the bin file into a DataFrame, give each row
# an id that is unique across all bins, optionally label each row with the
# cross-validation fold in which it is a test sample, and save as parquet.
new_id_start = 0  # id of the first row of the current bin
for et, eta in product(old_ets, old_etas):
    print(f'Processing et {et} eta {eta}')
    bin_path = filepath.format(et=et, eta=eta)
    if load_func == 'gload':
        data = gload(bin_path)
        data_df = pd.DataFrame(data['data'], columns=data['features'])
        target_df = pd.DataFrame(data['target'], columns=['target'])
        final_data_df = pd.concat([data_df, target_df], axis=1)
    elif load_func == 'kload':
        final_data_df = kload(bin_path)
    else:
        raise ValueError('Available load functions are gload and kload')

    n_rows = len(final_data_df)
    # Ids continue from where the previous bin stopped.
    final_data_df['id'] = np.arange(new_id_start, new_id_start + n_rows, dtype=np.uint64)
    new_id_start += n_rows
    if add_fold:
        # Stratify on the frame's own target column so that both loaders work;
        # previously this read `data`, which is never assigned on the kload path.
        # NOTE(review): assumes frames returned by kload also expose a 'target'
        # column -- confirm against the kepler reader.
        targets = final_data_df['target'].to_numpy()
        # Fill folds positionally in a plain array: the indices yielded by
        # split() are positions, not index labels, so `.loc` would only be
        # correct for a default RangeIndex.
        fold_of_row = np.full(n_rows, -1, dtype=np.int64)
        # StratifiedKFold only needs X for its length; a zeros placeholder
        # avoids passing the full feature matrix.
        splits = crossval.split(np.zeros(n_rows), targets)
        for test_fold, (_, test_index) in enumerate(splits):
            print(f'At fold {test_fold}')
            fold_of_row[test_index] = test_fold
        final_data_df['test_fold'] = fold_of_row
    print('Saving')
    final_data_df.to_parquet(out_filepath.format(et=et, eta=eta))

Processing et 0 eta 0
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 0 eta 1
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 0 eta 2
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 0 eta 3
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 0 eta 4
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 1 eta 0
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 1 eta 1
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 1 eta 2
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fo

In [None]:
# Write a JSON file mapping each column of `final_data_df` (the frame left
# over from the last processed bin) to its dtype name. The file is placed in
# the parent directory of `datapath` and named after the output dataset.
dataset_dir = os.path.dirname(datapath)
out_dataset_name = out_dataset.replace('.parquet', '')
schema_path = os.path.join(dataset_dir, out_dataset_name + '_schema.json')
column_dtypes = {column: str(dtype) for column, dtype in final_data_df.dtypes.items()}
with open(schema_path, 'w') as schema_file:
    json.dump(column_dtypes, schema_file, indent=4)