In [4]:
import tensorflow as tf
from keras import backend as K
from keras import optimizers

In [6]:
from keras.layers import Dense, LSTM, Activation
from keras.engine.topology import Input
from keras.preprocessing.sequence import pad_sequences
import keras.engine.training

In [7]:
config = tf.ConfigProto(intra_op_parallelism_threads = 10, inter_op_parallelism_threads = 10, allow_soft_placement = True, device_count = {'CPU': 10})
session = tf.Session(config = config)
K.set_session(session)

In [8]:
from trainlib_development.FileCollection import FileCollection
from trainlib_development.Preprocessor import Preprocessor
from trainlib_development.PCAWhiteningPreprocessor import PCAWhiteningPreprocessor
from trainlib_development.RNNPreprocessor import RNNPreprocessor
from trainlib_development.ListPreprocessor import ListPreprocessor
import trainlib_development.cuts
import trainlib.cuts as cuts
from trainlib.utils import read_data
import numpy as np
import pandas as pd
import math

Welcome to JupyROOT 6.10/09


In [9]:
# slightly extended version of the numpy-internal one with same name, also handles the case when the entries in the dataframe are actually numpy arrays themselves
def as_matrix(df):
    return np.array(df.as_matrix().tolist())

In [10]:
# read some input data
fcoll = FileCollection({"/data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root" : cuts.no_cut}, 0.0, 1.0)
setup_data = read_data(fcoll, 0, 10, branches = ["PFMET", "nCleanedJetsPt30", "JetPt", "JetEta", "JetPhi"])
validation_data = read_data(fcoll, 400, 800, branches = ["PFMET", "nCleanedJetsPt30", "JetPt", "JetEta", "JetPhi"])

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 110483 of which will be used


In [11]:
# set up the preprocessor for the RNN
nonperiodic_columns = ["JetPt", "JetEta"]
periodic_columns = ["JetPhi"]
sorted_column = "JetPt"

pre_rnn = RNNPreprocessor(nonperiodic_columns, periodic_columns, sorted_column, cuts.no_cut, PCAWhiteningPreprocessor)

TypeError: __init__() takes exactly 5 arguments (4 given)

In [12]:
pre_list = ListPreprocessor("listtest", nonperiodic_columns, cuts.no_cut, PCAWhiteningPreprocessor)

In [13]:
pre_list.setup(setup_data)

10 remaining after the cuts
12 remaining after the cuts


In [14]:
processed_data = pre_list.process(setup_data)

In [21]:
processed_array = processed_data['listtest']

In [20]:
perm = np.random.permutation(10)

In [25]:
perm

array([2, 4, 1, 8, 7, 5, 9, 3, 6, 0])

In [26]:
processed_array[perm]

array([[[ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[-0.39138788,  2.20816612],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[-0.00496914,  0.22699654],
        [-0.88267016,  0.15451565],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[ 2.59364891,  0.60388923],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[-0.83751357, -0.01256469],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[-0.8887645 , -0.86318493],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[-0.27976915, -0.82395327],
        [-0.42282125,  1.22269213],
        [-0.50204527, -0.11804488],
        [ 0.    

In [24]:
processed_array

array([[[ 0.98708069, -1.65421212],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[-0.00496914,  0.22699654],
        [-0.88267016,  0.15451565],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[ 1.07879114,  0.06104446],
        [-0.44957966, -1.00534427],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[-0.39138788,  2.20816612],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[-0.8887645 , -0.86318493],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ]],

       [[ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.        ,  0.        ],
        [ 0.    

In [68]:
setup_data

Unnamed: 0,PFMET,nCleanedJetsPt30,JetPt,JetEta,JetPhi
0,40.490429,1,[53.6443],[-2.29227],[2.54869]
1,40.09692,1,"[36.2919, 21.1634]","[0.326643, -0.80063]","[2.83377, 1.15123]"
2,44.240479,0,[],[],[]
3,71.606529,1,"[55.005, 28.7824]","[1.21199, -2.61541]","[1.77548, -0.592928]"
4,21.410542,0,[29.372],[3.82245],[-2.58757]
5,21.15958,0,[21.1895],[-2.82564],[-1.86381]
6,28.228645,0,[],[],[]
7,34.765644,0,[21.9638],[-1.08136],[-2.47143]
8,54.827316,1,[81.062],[3.98602],[1.68111]
9,7.474819,1,"[31.6878, 28.9569, 27.7632]","[-2.06541, 1.83295, -0.914617]","[1.32394, 0.201612, 2.14285]"


In [59]:
# set up the preprocessor for the remaining fixed-size input variables
fixed_size_columns = ["PFMET", "nCleanedJetsPt30"]
pre_fixed = PCAWhiteningPreprocessor(fixed_size_columns, cuts.no_cut)

In [14]:
pre_rnn.setup(setup_data)

10 remaining after the cuts
12 remaining after the cuts


In [15]:
pre_fixed.setup(setup_data)

10 remaining after the cuts


In [18]:
processed_rnn = pre_rnn.process(validation_data)

In [26]:
processed_fixed = pre_fixed.process(validation_data)

In [27]:
np.mean(processed_fixed, axis = 0)

array([-0.27069429,  0.32918631])

In [28]:
np.std(processed_fixed, axis = 0)

array([ 1.07572128,  1.95916834])

In [74]:
final = as_matrix(processed_data)

In [96]:
final = np.swapaxes(final, 1, 2)

In [98]:
final

array([[[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[-0.58775389,  1.85188425, -1.40367889,  0.28947026],
        [-0.59278959,  1.81904542, -0.15680805,  1.41542077],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       ..., 
       [[-0.08938136, -0.91685706,  1.21690023,  0.

In [83]:
class CombinedModel:
    def __init__(self):
        self.model = None
        
    def build(self):
        in_layer = Input(shape = (None, 2))
        # number units = dimensionality of the output space
        lstm = LSTM(units = 16, return_sequences = False)(in_layer)
        out_layer = Dense(1, activation = 'tanh')(lstm)
        
        self.model = keras.engine.training.Model(in_layer, out_layer, name = 'lstm')

In [84]:
mod = CombinedModel()
mod.build()

In [2]:
processed_data

NameError: name 'processed_data' is not defined

In [85]:
mod.model.predict(processed_data)

array([[ 0.08889437],
       [ 0.11208226],
       [ 0.        ],
       [ 0.12064432],
       [ 0.09134383],
       [ 0.07961366],
       [ 0.        ],
       [ 0.08135621],
       [ 0.08613631],
       [ 0.20706847]], dtype=float32)