In [51]:
import ROOT
import root_numpy as rnpy
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn
import pickle
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM

In [52]:
# build here the keras model
def RNN_classifier(n_units=32, n_features=8):
    """Build and compile a single-layer LSTM binary classifier.

    The network is an LSTM layer followed by one sigmoid output node, i.e. it
    is set up for a binary decision (b-jet / not b-jet).

    Parameters
    ----------
    n_units : int, optional
        Size of the LSTM layer. Defaults to 32.
    n_features : int, optional
        Number of values per sequence step, handed to the LSTM as
        ``input_dim``. Defaults to 8.

    Returns
    -------
    keras.models.Sequential
        The model, compiled with binary cross-entropy loss, the 'adam'
        optimizer and the 'accuracy' metric.
    """
    model = Sequential()

    # recurrent layer that consumes the sequence, n_features values per step
    model.add(LSTM(n_units, input_dim=n_features))

    # make an output layer with just 1 output -> for a binary classification problem: b-jet / not b-jet
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model

In [79]:
# number of entries requested per read from the jet tree and from the track tree
batch_size_jets, batch_size_tracks = 10, 50
# offsets into the two trees at which the next read starts
read_pos_jets = read_pos_tracks = 0
# chunks processed so far, and how many chunks to process in total
number_chunks, chunks_limit = 0, 1

In [80]:
# Both trees are read from the same ROOT file, so the path is named once.
# NOTE(review): this is an absolute, site-specific path -- consider moving it to a config cell.
input_file = "/mnt/t3nfs01/data01/shome/jpata/btv/gc/TagVarExtractor/GCa08e5e237323/TT_TuneCUETP8M1_13TeV-powheg-pythia8/job_0_out.root"

# per-track variables stored for every jet, in this order
track_columns = ["Track_pt", "Track_eta", "Track_phi", "Track_dxy", "Track_dz", "Track_IP", "Track_IP2D", "Track_length"]

while number_chunks < chunks_limit:
    number_chunks += 1

    # read in new chunk of jet and track data
    d1 = pd.DataFrame(rnpy.root2array(input_file, treename="tagVars/ttree",
                                      start=read_pos_jets, stop=read_pos_jets + batch_size_jets))
    d2 = pd.DataFrame(rnpy.root2array(input_file, treename="tagVars/ttree_track",
                                      start=read_pos_tracks, stop=read_pos_tracks + batch_size_tracks))

    # figure out where the next chunk should start so that we don't count any jets multiple times:
    # only jets up to the one *before* the jet of the last track read are handled in this chunk
    last_tracks = int(d2.tail(1)['Track_jetIndex'].iloc[0] - 1)
    last_jet = int(d1.tail(1)['Jet_jetIndex'].iloc[0] - 1)  # computed but not used below
    # NOTE(review): both lookups raise IndexError if jet `last_tracks` is not part of this chunk
    read_pos_jets += (d1.loc[d1['Jet_jetIndex'] == last_tracks].index[-1] + 1)
    read_pos_tracks += (d2.loc[d2['Track_jetIndex'] == last_tracks].index[-1] + 1)

    # add the track data to the jet list: one independent, initially empty list per jet
    d1['track_data'] = [[] for _ in range(len(d1.index))]

    # iterate over the track list to join jets with the tracks belonging to them
    for irow, row in d2.iterrows():
        # these are the track data of the current track:
        tracks = row[track_columns].values
        jet_index = int(row["Track_jetIndex"])
        if jet_index > last_tracks:
            # tracks of jets beyond `last_tracks` are left for the next chunk
            break
        table_index = d1.loc[d1['Jet_jetIndex'] == jet_index].index[0]

        # append the tracks data to the matching jet in the main table
        # (the stored list is mutated in place, nothing is re-assigned)
        d1.at[table_index, 'track_data'].append(tracks)

    # now divide the jets and put them in separate lists, according to their flavour
    # NOTE(review): the lists are re-created for every chunk, so they only hold the current chunk
    jets_b = []
    jets_l = []
    jets_c = []

    # iterate over the jet list, with already matched tracks
    for irow, row in d1.iterrows():
        jet_index = int(row["Jet_jetIndex"])
        if jet_index > last_tracks:
            break

        flavour = int(row["Jet_flavour"])

        # select the right list this jet belongs to
        if abs(flavour) == 5:
            jets = jets_b
        elif abs(flavour) == 4:
            jets = jets_c
        else:
            jets = jets_l

        # add the new jet to the list: (pt, eta, phi, mass, flavour, list of track arrays)
        jets.append((row["Jet_pt"], row["Jet_eta"], row["Jet_phi"], row["Jet_mass"], flavour, row["track_data"]))

    # now, have sorted jets in three lists, can use them directly for training!

In [55]:
# take the last tuple entry (the list of track arrays) of the first b-jet as a test sample
test = jets_b[0][-1]

In [87]:
# for every b-jet take its track list (last tuple entry) and order it by the
# first track variable (pt), hardest track first
b_tracks = [
    sorted(jet[-1], key=lambda trk: trk[0], reverse=True)
    for jet in jets_b
]

In [65]:
# pt-ordered copy of the test jet's tracks: the hardest track comes first
x = list(test)
x.sort(key=lambda trk: trk[0], reverse=True)

In [103]:
# target column for training: one row per b-jet, all set to 1
# (1 ... b-jets, 0 ... non-b-jets)
y_train = np.full(shape=(len(b_tracks), 1), fill_value=1, dtype=float)

In [69]:
# target for the single test jet: a (1, 1) array holding the label 1
by = np.array([[1]])

In [90]:
# quick look at the pt-sorted track lists of all b-jets (this prints the full content)
np.array(b_tracks)

array([ [array([ 12.0859375 ,   1.08896148,  -0.16641054,   0.02646352,
         0.01300435,   0.02760686,   0.02646351,   3.06698036], dtype=float32), array([ 10.984375  ,   1.06442451,  -0.18750662,   0.03318312,
         0.03779414,   0.04054669,   0.03318313,   1.70997262], dtype=float32), array([  7.34765625e+00,   1.11990726e+00,  -4.97098789e-02,
        -2.66015623e-02,   1.53442379e-03,   2.66169552e-02,
         2.66015567e-02,   3.82213205e-01], dtype=float32), array([  2.78320312e+00,   8.39381099e-01,  -3.69839042e-01,
        -2.19482416e-03,  -2.47070310e-03,  -2.83815456e-03,
        -2.19482300e-03,   1.27673065e-02], dtype=float32), array([ 2.5390625 ,  1.15927613, -0.06299405, -0.01549805, -0.04019531,
        0.0277005 ,  0.01549804,  0.37648013], dtype=float32), array([ 2.21289062,  1.11386454, -0.14570315,  0.01983399, -0.00496338,
       -0.02005092, -0.01983399,  0.16084765], dtype=float32)],
       [array([ 5.20703125,  0.5877865 , -1.1446619 ,  0.02435589,  0.

In [66]:
# Shape the pt-sorted test jet into a batch holding one sequence: (1, n_tracks, 8).
# The number of tracks is inferred (-1) instead of being fixed to 6, so a jet with any
# number of tracks can be used; the total size must still be a multiple of 8.
b = np.array(x).reshape(1, -1, 8)

In [45]:
# instantiate the compiled classifier returned by RNN_classifier()
model = RNN_classifier()

In [62]:
# fit on the single test jet `b` with target `by`, one sample per batch
model.fit(x=b, y=by, batch_size=1)

INFO (theano.gof.compilelock): Refreshing lock /mnt/t3nfs01/data01/shome/phwindis/.theano/compiledir_Linux-2.6-el6.x86_64-x86_64-with-redhat-6.6-Carbon-x86_64-2.7.12-64/lock_dir/lock


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7fe6ded18e90>

In [64]:
# evaluate the model on the same test jet it was fitted on
model.predict(x=b, batch_size=1)

INFO (theano.gof.compilelock): Refreshing lock /mnt/t3nfs01/data01/shome/phwindis/.theano/compiledir_Linux-2.6-el6.x86_64-x86_64-with-redhat-6.6-Carbon-x86_64-2.7.12-64/lock_dir/lock


array([[ 0.37535214]], dtype=float32)

In [67]:
# display the model input built from the test jet
b

array([[[  1.20859375e+01,   1.08896148e+00,  -1.66410536e-01,
           2.64635198e-02,   1.30043542e-02,   2.76068579e-02,
           2.64635142e-02,   3.06698036e+00],
        [  1.09843750e+01,   1.06442451e+00,  -1.87506616e-01,
           3.31831202e-02,   3.77941430e-02,   4.05466929e-02,
           3.31831314e-02,   1.70997262e+00],
        [  7.34765625e+00,   1.11990726e+00,  -4.97098789e-02,
          -2.66015623e-02,   1.53442379e-03,   2.66169552e-02,
           2.66015567e-02,   3.82213205e-01],
        [  2.78320312e+00,   8.39381099e-01,  -3.69839042e-01,
          -2.19482416e-03,  -2.47070310e-03,  -2.83815456e-03,
          -2.19482300e-03,   1.27673065e-02],
        [  2.53906250e+00,   1.15927613e+00,  -6.29940480e-02,
          -1.54980468e-02,  -4.01953124e-02,   2.77004987e-02,
           1.54980412e-02,   3.76480132e-01],
        [  2.21289062e+00,   1.11386454e+00,  -1.45703152e-01,
           1.98339857e-02,  -4.96337889e-03,  -2.00509205e-02,
          -1.9

In [99]:
# display the target used for the test jet
by

array([[1]])