In [1]:
import os
from itertools import product
import numpy as np
import pandas as pd
from Gaugi import load as gload
from Gaugi import save as gsave
from sklearn.model_selection import StratifiedKFold

Welcome to JupyROOT 6.16/00
Using all sub packages with ROOT dependence


In [2]:
# (et, eta) bin indices to process. For a quick single-bin run, replace these
# with e.g. np.array([4]) and np.array([0]).
old_ets = np.arange(5)
old_etas = np.arange(5)
et_key = 'L2Calo_et'
eta_key = 'L2Calo_eta'

# Input: one .npz file per (et, eta) bin under ~/data/<dataset>/.
dataset = 'data17_13TeV.AllPeriods.sgn.probes_lhmedium_EGAM1.bkg.VProbes_EGAM7.GRL_v97'
homepath = os.path.expanduser('~')
datapath = os.path.join(homepath, 'data', dataset)
filepath = os.path.join(datapath, dataset + '_et{et}_eta{eta}.npz')

# Output: one parquet file per (et, eta) bin, all inside a single directory
# whose name itself ends in '.parquet'.
out_dataset = 'ided_' + dataset + '.parquet'
out_datapath = os.path.join(homepath, 'data', out_dataset)
out_filepath = os.path.join(out_datapath, out_dataset + '_et{et}_eta{eta}.parquet')

# exist_ok=True makes re-running this cell safe and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(out_datapath, exist_ok=True)

In [3]:
# Stratified 10-fold splitter; shuffling with a fixed seed keeps the fold
# assignment reproducible across runs.
crossval = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=512,
)

In [4]:
# Running offset so that the 'id' column is unique across all (et, eta) bins.
new_id_start = 0
for et, eta in product(old_ets, old_etas):
    print(f'Processing et {et} eta {eta}')
    data = gload(filepath.format(et=et, eta=eta))
    data_df = pd.DataFrame(data['data'], columns=data['features'])
    target_df = pd.DataFrame(data['target'], columns=['target'])
    final_data_df = pd.concat([data_df, target_df], axis=1)
    n_rows = len(final_data_df)

    # Each row lands in the test split of exactly one fold. Record that fold
    # number per row, indexing by position so the result does not depend on
    # the DataFrame's index labels. -1 marks "not yet assigned".
    fold_of_row = np.full(n_rows, -1, dtype=np.int64)
    splits = crossval.split(data['data'], data['target'])
    for test_fold, (_, test_index) in enumerate(splits):
        print(f'At fold {test_fold}')
        fold_of_row[test_index] = test_fold
    assert (fold_of_row >= 0).all(), 'some rows were not assigned to a test fold'

    final_data_df['test_fold'] = fold_of_row
    final_data_df['id'] = np.arange(new_id_start, new_id_start + n_rows)
    new_id_start += n_rows

    # The previous reset_index().rename(...) call was dropped: its result was
    # discarded, so it only copied the frame; 'id' is already set above.
    print('Saving')
    final_data_df.to_parquet(out_filepath.format(et=et, eta=eta))

Processing et 0 eta 0
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 0 eta 1
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 0 eta 2
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 0 eta 3
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 0 eta 4
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 1 eta 0
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 1 eta 1
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fold 7
At fold 8
At fold 9
Saving
Processing et 1 eta 2
At fold 0
At fold 1
At fold 2
At fold 3
At fold 4
At fold 5
At fold 6
At fo

In [6]:
# out_datapath is the directory holding the per-(et, eta) parquet files written
# above; reading the directory loads them back as a single DataFrame.
df = pd.read_parquet(out_datapath)

In [7]:
# Preview the first rows; the last three columns should be target, test_fold and id.
df.head()

Unnamed: 0,avgmu,L2Calo_ring_0,L2Calo_ring_1,L2Calo_ring_2,L2Calo_ring_3,L2Calo_ring_4,L2Calo_ring_5,L2Calo_ring_6,L2Calo_ring_7,L2Calo_ring_8,...,el_lhmedium,el_lhloose,el_lhvloose,T0HLTElectronT2CaloTight,T0HLTElectronT2CaloMedium,T0HLTElectronT2CaloLoose,T0HLTElectronT2CaloVLoose,target,test_fold,id
0,31.42,20.722918,145.061462,214.078415,-365.355927,-100.914551,-151.044815,-208.753479,75.000626,1570.550537,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1,6,0
1,31.42,725.026123,110.832283,303.516968,321.592499,-135.673965,1032.414673,561.357117,-160.497116,1205.812988,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1,4,1
2,31.299999,123.479851,218.375916,-450.4664,466.682343,-187.977219,-38.499943,-43.483982,-83.335388,801.987915,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1,4,2
3,31.299999,381.280792,-65.667053,-327.027191,-114.572136,-29.165712,3.222214,-24.147762,-162.795822,1827.555786,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1,4,3
4,31.299999,685.628784,149.687103,216.083435,327.475616,124.14698,139.680588,199.122116,135.836777,1642.526367,...,1.0,0.0,0.0,1.0,1.0,1.0,1.0,1,0,4


In [8]:
# (number of rows, number of columns) of the reloaded frame.
df.shape

(39863779, 157)

In [9]:
# Sanity check: True if any id value occurs more than once. Ids were generated
# with a running offset across bins, so this is expected to display False.
df['id'].duplicated().any()

False