In [10]:
import getpass
import os
import subprocess
import sys

import h5py

from golden_json import GoldenJson

In [2]:
# Permission for eos
# Obtain a Kerberos ticket by feeding the password to kinit on stdin.
# The password is passed through a pipe instead of being interpolated into a
# shell command, so shell metacharacters in it are never interpreted and it
# does not show up in any process argument list.
# The trailing newline mirrors what `echo` appended in the former shell pipeline.
# The cell's displayed value is kinit's exit status (0 on success).
subprocess.run(["kinit"], input=(getpass.getpass() + "\n").encode()).returncode

········


0

In [64]:
# Init variables

# Name of the dataset to merge. Edit this single assignment to switch datasets.
# Options used so far: "ZeroBias2016", "JetHT2016", "SingleMuon2016", "SinglePhoton2016"
dset_name = "SinglePhoton2016"

# Source directory of original h5 datasets
data_src = "/home/mantydze/data/{dset_name}/original".format(dset_name=dset_name)

# Destination file for the merged dataset
data_dst = "/home/mantydze/data/{dset_name}/single/{dset_name}.h5".format(dset_name=dset_name)

# Golden JSON location
golden_json_path = "../../data/Cert_271036-284044_13TeV_PromptReco_Collisions16_JSON.txt"

# Golden JSON lookup object; queried per row via gj.is_good(run, lumi_section)
gj = GoldenJson(golden_json_path)

In [65]:
def get_sub_dset(filename, run_col=2807, lumi_col=2808, label_col=2812):
    """
        Read sub dataset (_background, _signal)
        Apply Golden JSON (Overwrite existing label with one from GoldenJSON)
        Return dataset with new labels

        The file is read from the module-level `data_src` directory and the
        dataset inside it is looked up under the file name without its
        trailing ".h5". Labels come from the module-level `gj` object.

        Args:
            filename: name of an .h5 file inside `data_src`.
            run_col: column index holding the run number.
            lumi_col: column index holding the lumi section.
            label_col: column index holding the label that gets overwritten.

        Returns:
            In-memory copy of the sub dataset with the label column replaced.
            The source file is opened read-only and is not modified.
    """

    print(filename, flush=True)

    # Strip only a trailing ".h5"; str.replace would also remove it mid-name
    suffix = ".h5"
    sub_dset_name = filename[:-len(suffix)] if filename.endswith(suffix) else filename

    # Slicing with [:] loads the whole dataset into memory, so the file can be
    # closed before the relabelling loop runs
    with h5py.File(os.path.join(data_src, filename), "r") as sub_h5:
        sub_dset = sub_h5[sub_dset_name][:]

    rows_count = sub_dset.shape[0] # rows count
    print ("Rows: %d" % rows_count, flush=True)

    for row_index in range(rows_count):
        run_number = int(sub_dset[row_index, run_col])
        lumi_section = int(sub_dset[row_index, lumi_col])

        label_orig = int(sub_dset[row_index, label_col]) # inversed (1 - bad, 0 - good)
        label_gj = gj.is_good(run_number, lumi_section)

        # Assign new label
        sub_dset[row_index, label_col] = label_gj

        # The original label is stored inverted (1 - bad, 0 - good), so EQUAL
        # numeric values mean the two sources disagree about the row.
        # NOTE(review): this assumes gj.is_good returns 1/True for good - confirm.
        if label_orig == label_gj:
            print("%d) Run %d | LS %d | Orig %d | GoldenJSON %d" % (row_index, run_number, lumi_section, label_orig, label_gj))

    return sub_dset

In [66]:
%%time

with h5py.File(data_dst, "w") as main_h5:
    main_dset = None

    total_rows_count = 0
    # Iterate all datasets in a directory
    for filename in os.listdir(data_src):
        
        # Get dataset with good labels
        sub_dset = get_sub_dset(filename)
        sub_rows_count = sub_dset.shape[0] # rows count
        
        total_rows_count += sub_rows_count
        
        # Append sub dataset into final dataset
        if main_dset is None:
            main_dset = main_h5.create_dataset(dset_name, chunks=True, dtype="float64", 
                                               maxshape=(None, None), data=sub_dset[:], 
                                               compression="gzip", compression_opts=0)
        else:
            main_dset.resize(main_dset.shape[0] + sub_rows_count, axis=0)
            main_dset[-sub_rows_count:] = sub_dset[:]

    print("Total row count: %d" % total_rows_count)
    print("Main dset:")
    print(main_dset.shape)
    

SinglePhoton_C_background.h5
Rows: 474
111) Run 276095 | LS 1 | Orig 0 | GoldenJSON 0
112) Run 276095 | LS 2 | Orig 0 | GoldenJSON 0
113) Run 276095 | LS 3 | Orig 0 | GoldenJSON 0
114) Run 276095 | LS 4 | Orig 0 | GoldenJSON 0
115) Run 276095 | LS 5 | Orig 0 | GoldenJSON 0
168) Run 275833 | LS 116 | Orig 0 | GoldenJSON 0
186) Run 275922 | LS 4 | Orig 0 | GoldenJSON 0
187) Run 275922 | LS 5 | Orig 0 | GoldenJSON 0
188) Run 275922 | LS 6 | Orig 0 | GoldenJSON 0
201) Run 275911 | LS 355 | Orig 0 | GoldenJSON 0
210) Run 275757 | LS 108 | Orig 0 | GoldenJSON 0
211) Run 275757 | LS 109 | Orig 0 | GoldenJSON 0
212) Run 275757 | LS 110 | Orig 0 | GoldenJSON 0
213) Run 275757 | LS 111 | Orig 0 | GoldenJSON 0
214) Run 275757 | LS 112 | Orig 0 | GoldenJSON 0
215) Run 275757 | LS 113 | Orig 0 | GoldenJSON 0
216) Run 275757 | LS 114 | Orig 0 | GoldenJSON 0
217) Run 275757 | LS 115 | Orig 0 | GoldenJSON 0
218) Run 275757 | LS 116 | Orig 0 | GoldenJSON 0
219) Run 275757 | LS 117 | Orig 0 | GoldenJSON

Rows: 16480
2946) Run 278240 | LS 67 | Orig 0 | GoldenJSON 0
2947) Run 278240 | LS 68 | Orig 0 | GoldenJSON 0
2948) Run 278240 | LS 69 | Orig 0 | GoldenJSON 0
3345) Run 278240 | LS 65 | Orig 0 | GoldenJSON 0
3346) Run 278240 | LS 66 | Orig 0 | GoldenJSON 0
5488) Run 278308 | LS 1206 | Orig 0 | GoldenJSON 0
5489) Run 278308 | LS 1207 | Orig 0 | GoldenJSON 0
5490) Run 278308 | LS 1208 | Orig 0 | GoldenJSON 0
5491) Run 278308 | LS 1210 | Orig 0 | GoldenJSON 0
5494) Run 278308 | LS 1205 | Orig 0 | GoldenJSON 0
5495) Run 278308 | LS 1209 | Orig 0 | GoldenJSON 0
5547) Run 278308 | LS 1211 | Orig 0 | GoldenJSON 0
5548) Run 278308 | LS 1212 | Orig 0 | GoldenJSON 0
5549) Run 278308 | LS 1213 | Orig 0 | GoldenJSON 0
5550) Run 278308 | LS 1214 | Orig 0 | GoldenJSON 0
5551) Run 278308 | LS 1215 | Orig 0 | GoldenJSON 0
5552) Run 278308 | LS 1216 | Orig 0 | GoldenJSON 0
5707) Run 278308 | LS 1201 | Orig 0 | GoldenJSON 0
5708) Run 278308 | LS 1202 | Orig 0 | GoldenJSON 0
5709) Run 278308 | LS 1203 | 

SinglePhoton_H_background.h5
Rows: 1680
64) Run 282037 | LS 458 | Orig 0 | GoldenJSON 0
193) Run 282923 | LS 31 | Orig 0 | GoldenJSON 0
SinglePhoton_H_signal.h5
Rows: 22533
10516) Run 281707 | LS 983 | Orig 0 | GoldenJSON 0
10517) Run 281707 | LS 984 | Orig 0 | GoldenJSON 0
10527) Run 281707 | LS 985 | Orig 0 | GoldenJSON 0
10528) Run 281707 | LS 986 | Orig 0 | GoldenJSON 0
10529) Run 281707 | LS 987 | Orig 0 | GoldenJSON 0
10530) Run 281707 | LS 988 | Orig 0 | GoldenJSON 0
10531) Run 281707 | LS 989 | Orig 0 | GoldenJSON 0
10532) Run 281707 | LS 990 | Orig 0 | GoldenJSON 0
10724) Run 281707 | LS 991 | Orig 0 | GoldenJSON 0
10725) Run 281707 | LS 992 | Orig 0 | GoldenJSON 0
10726) Run 281707 | LS 993 | Orig 0 | GoldenJSON 0
10727) Run 281707 | LS 994 | Orig 0 | GoldenJSON 0
10728) Run 281707 | LS 995 | Orig 0 | GoldenJSON 0
10729) Run 281707 | LS 996 | Orig 0 | GoldenJSON 0
11054) Run 281707 | LS 997 | Orig 0 | GoldenJSON 0
11055) Run 281707 | LS 998 | Orig 0 | GoldenJSON 0
11056) Run 