In [1]:
import ROOT
import root_numpy as rnpy
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn
import pickle

Welcome to ROOTaaS 6.06/08


In [2]:
# Number of rows requested from each tree per read.
batch_size_jets = batch_size_tracks = 25
# Row offsets at which the next read of each tree starts.
read_pos_jets = read_pos_tracks = 0

In [3]:
# Read one batch of rows from the jet tree, starting at the current jet offset.
jet_records = rnpy.root2array(
    "/mnt/t3nfs01/data01/shome/jpata/btv/gc/TagVarExtractor/GCa08e5e237323/TT_TuneCUETP8M1_13TeV-powheg-pythia8/job_0_out.root",
    treename = "tagVars/ttree",
    start = read_pos_jets,
    stop = read_pos_jets + batch_size_jets)
d1 = pd.DataFrame(jet_records)

In [4]:
# Read one batch of rows from the track tree, starting at the current track offset.
track_records = rnpy.root2array(
    "/mnt/t3nfs01/data01/shome/jpata/btv/gc/TagVarExtractor/GCa08e5e237323/TT_TuneCUETP8M1_13TeV-powheg-pythia8/job_0_out.root",
    treename = "tagVars/ttree_track",
    start = read_pos_tracks,
    stop = read_pos_tracks + batch_size_tracks)
d2 = pd.DataFrame(track_records)

  warn_missing_tree)


In [5]:
# Jet index found on the final track row of the batch, minus one.
# NOTE(review): presumably the last jet is excluded because its tracks may be
# cut off by the batch boundary -- confirm.
last_tracks = int(d2['Track_jetIndex'].iloc[-1] - 1)

In [6]:
# Jet index found on the final jet row of the batch, minus one.
last_jet = int(d1['Jet_jetIndex'].iloc[-1] - 1)

In [7]:
# Advance both read offsets to just past the last row whose jet index equals
# last_tracks, so the next batch starts with the first jet not handled here.
jet_rows_done = d1.loc[d1['Jet_jetIndex'] == last_tracks].index
read_pos_jets += jet_rows_done[-1] + 1
track_rows_done = d2.loc[d2['Track_jetIndex'] == last_tracks].index
read_pos_tracks += track_rows_done[-1] + 1

In [8]:
# Give every jet its own empty list that will collect the jet's tracks.
# Built with a comprehension instead of the `pd.np` alias, which newer pandas
# releases no longer provide; each row gets a distinct list object.
d1['track_data'] = [[] for _ in range(len(d1.index))]

In [9]:
# iterate over the track list to join jets with the tracks belonging to them
track_columns = ["Track_pt", "Track_eta", "Track_phi", "Track_dxy", "Track_dz", "Track_IP", "Track_IP2D", "Track_length"]
for irow, row in d2.iterrows():
    # these are the track data of the current track
    # (`.values` replaces `as_matrix()`, which newer pandas no longer provides)
    tracks = row[track_columns].values
    jet_index = int(row["Track_jetIndex"])
    # Stop at the first track past last_tracks; this assumes the rows of d2
    # are ordered by jet index.
    if jet_index > last_tracks:
        break
    # index label of the first jet row carrying this jet index
    table_index = d1.loc[d1['Jet_jetIndex'] == jet_index].index[0]

    # append the tracks data to the matching jet in the main table
    d1['track_data'][table_index].append(tracks)

In [10]:
def get_max_tracks(data):
    """Return the length of the longest track list in data['track_data'].

    Returns 0 when there are no track lists or all of them are empty.
    """
    lengths = [len(track_list) for track_list in data['track_data']]
    # the leading 0 keeps max() defined for an empty input
    return max([0] + lengths)

In [11]:
def equalize_tracks(data, set_tracks, number_parameters=8):
    """Pad every non-empty track list in data['track_data'] up to set_tracks entries.

    The lists are modified in place; nothing is returned.

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a 'track_data' entry that iterates over Python lists of
        per-track arrays.
    set_tracks : int
        Target number of tracks per list.
    number_parameters : int, optional
        Length of each zero-filled padding array (default 8).

    Notes
    -----
    Empty lists are left empty, and lists that already hold set_tracks or more
    entries are left untouched (nothing is truncated).
    """
    # Work on the list objects themselves rather than looking them up again by
    # position: a positional counter is not a valid label once the index is no
    # longer 0..n-1.
    for cur in data['track_data']:
        # take only these that are non-empty track lists
        if len(cur) > 0:
            missing = set_tracks - len(cur)
            # a fresh array per slot, so padded entries never alias each other
            cur.extend(np.zeros(number_parameters, dtype=float) for _ in range(missing))

In [14]:
def create_track_columns(set_tracks, number_parameters):
    """Return the flat track column names 'T0', 'T1', ... in order.

    One name is produced per (track, parameter) slot, i.e.
    set_tracks * number_parameters names in total.
    """
    total_columns = set_tracks * number_parameters
    return ['T' + str(position) for position in range(total_columns)]

In [15]:
def create_track_table(data):
    """Flatten the per-jet track lists into one wide DataFrame.

    The number of tracks per jet and of parameters per track are taken from
    the first entry of data['track_data']; every other entry is expected to
    have the same layout (e.g. after padding with equalize_tracks).

    Parameters
    ----------
    data : mapping or DataFrame
        Must provide a 'track_data' entry that iterates over lists of
        per-track arrays.

    Returns
    -------
    pandas.DataFrame
        One row per entry of data['track_data'], columns named by
        create_track_columns, with a fresh default index.

    Raises
    ------
    ValueError
        If there are no track lists, or the first one is empty, so the
        layout cannot be determined.
    """
    # Take the first entry by position; a label lookup with [0] fails when
    # the index does not contain the label 0.
    first_tracks = next(iter(data['track_data']), None)
    if first_tracks is None or len(first_tracks) == 0:
        raise ValueError("cannot infer the track layout: the first track list is missing or empty")

    set_tracks = len(first_tracks)
    number_parameters = len(first_tracks[0])
    colnames = create_track_columns(set_tracks, number_parameters)

    # one flat row per jet: track 0 parameters, then track 1 parameters, ...
    tracks = [np.array(cur).flatten() for cur in data['track_data']]

    return pd.DataFrame(tracks, columns=colnames)

In [4]:
def create_track_list(table, set_tracks, number_parameters):
    """Rebuild the 3-D track array from the flat 'T<i>' columns of table.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain the columns named by
        create_track_columns(set_tracks, number_parameters).
    set_tracks : int
        Number of tracks stored per jet.
    number_parameters : int
        Number of values stored per track.

    Returns
    -------
    numpy.ndarray
        Array of shape (len(table), set_tracks, number_parameters).
    """
    number_jets = len(table)
    cols = create_track_columns(set_tracks, number_parameters)

    # extract raw matrix; `.loc` / `.values` replace `.ix` / `.as_matrix()`,
    # which newer pandas releases no longer provide
    tracks = table.loc[:, cols].values
    # spelling out set_tracks (instead of -1) keeps the reshape valid for an
    # empty table as well
    return tracks.reshape(number_jets, set_tracks, number_parameters)

In [12]:
# set_tracks: number of track slots each jet is padded to.
# number_parameters: values stored per track.
# NOTE(review): number_parameters presumably has to match the number of track
# columns selected when joining tracks to jets -- confirm.
set_tracks, number_parameters = 10, 8

In [22]:
# Pad the non-empty track lists of d1 in place up to set_tracks entries.
equalize_tracks(data=d1, set_tracks=set_tracks)

In [23]:
# Flatten the track lists of d1 into one wide table of 'T<i>' columns.
track_table = create_track_table(data=d1)

In [24]:
# Drop the last column of d1 by position (presumably the list-valued
# track_data column) and put the flat track columns next to the remaining
# jet columns. `.iloc` replaces `.ix`, which newer pandas no longer provides.
jet_columns = d1.iloc[:, :-1]
joined = pd.concat([jet_columns, track_table], axis = 1)

In [25]:
# Turn the flat track columns of the joined table back into a 3-D array.
tracks_reconstructed = create_track_list(
    table=joined,
    set_tracks=set_tracks,
    number_parameters=number_parameters)

In [42]:
# Keep only the jets whose jet index is below 30.
low_index_mask = d1['Jet_jetIndex'] < 30
matched = d1[low_index_mask]

In [131]:
def save_dataset(file, data):
    """Write data to the HDF5 file `file` under the key 'data' in table format.

    Parameters
    ----------
    file : str
        Path of the HDF5 store to open.
    data : pandas.DataFrame
        Object to store.
    """
    # The context manager closes the store even when put() raises, so the
    # file handle is never left open.
    with pd.HDFStore(file) as store:
        store.put('data', data, format = 'table')

In [517]:
# Persist the joined jet/track table.
save_dataset(file='./padded_test.h5', data=joined)

In [38]:
# Split the jets (with their matched tracks) into three lists by flavour:
# |flavour| == 5 -> jets_b, |flavour| == 4 -> jets_c, anything else -> jets_l.
jets_b, jets_l, jets_c = [], [], []
jets_by_abs_flavour = {5: jets_b, 4: jets_c}

for _, jet_row in d1.iterrows():
    # stop at the first jet past last_tracks
    if int(jet_row["Jet_jetIndex"]) > last_tracks:
        break

    flavour = int(jet_row["Jet_flavour"])

    # pick the list this jet belongs to, falling back to the light-jet list
    target_list = jets_by_abs_flavour.get(abs(flavour), jets_l)

    # record the jet as (pt, eta, phi, mass, flavour, track list)
    target_list.append((jet_row["Jet_pt"], jet_row["Jet_eta"], jet_row["Jet_phi"],
                        jet_row["Jet_mass"], flavour, jet_row["track_data"]))

In [7]:
def read_metadata(store):
    """Return the `metadata` attribute attached to the 'data' storer of store."""
    storer = store.get_storer('data')
    return storer.attrs.metadata

In [8]:
# Open the store read-only: only metadata is read here, and the default
# append mode would create an empty file if the path did not exist.
with pd.HDFStore('/shome/phwindis/data/matched/1.h5', mode='r') as store:
    metadata = read_metadata(store)
# number of track slots stored per jet in this file
number_tracks = metadata['number_tracks']

In [10]:
# Display the metadata dictionary read from the store.
metadata

{'number_jets': 218280, 'number_tracks': 33}

In [2]:
# Load the stored table. The key is named explicitly ('data', the same key
# the metadata is read from) so the read does not depend on the file holding
# exactly one object.
readin = pd.read_hdf('/shome/phwindis/data/matched/1.h5', key='data')

In [36]:
# Rebuild the 3-D track array from the table that was read back from disk.
tracks_reconstructed = create_track_list(
    table=readin,
    set_tracks=number_tracks,
    number_parameters=number_parameters)

In [30]:
# Boolean array: True where the absolute jet flavour equals 5.
is_flavour_5 = readin['Jet_flavour'].abs() == 5
flavours = np.array(is_flavour_5)

In [28]:
# Preview the labels as integers (multiplying by 1 turns booleans into 0/1).
np.multiply(np.array(flavours), 1)

array([1, 0, 0, ..., 1, 0, 1])

In [34]:
# Store the labels as integers (multiplying by 1 turns booleans into 0/1).
flavours = np.multiply(flavours, 1)

In [41]:
# Preview the labels as a single-column 2-D array; the result is only
# displayed, flavours itself is not reassigned.
flavours.reshape(-1, 1)

array([[1],
       [0],
       [0],
       ..., 
       [1],
       [0],
       [1]])

In [37]:
# Display the reconstructed track array.
tracks_reconstructed

array([[[  2.94921875e+00,  -3.15134138e-01,  -2.41892505e+00, ...,
           5.65006621e-02,   2.34374944e-02,   2.41832882e-01],
        [  1.00546875e+01,  -3.76293212e-01,  -2.76434135e+00, ...,
           1.14440946e-02,   1.13964770e-02,   7.37814903e-02],
        [  2.63867188e+00,  -4.74623859e-01,  -2.40915179e+00, ...,
           1.91296991e-02,   1.62890702e-02,   1.00551613e-01],
        ..., 
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00],
        [  0.00000000e+00,   0.00000000e+00,   0.00000000e+00, ...,
           0.00000000e+00,   0.00000000e+00,   0.00000000e+00]],

       [[  4.66406250e+00,  -4.32142094e-02,   2.62967229e+00, ...,
          -1.38634199e-03,  -1.36107870e-03,   1.38712360e-03],
        [  3.04882812e+00,  -9.04568657e-02,   2.60525823e+0

