In [26]:
import tensorflow as tf
from keras import backend as K
from keras import optimizers

In [2]:
from keras.layers import Dense, LSTM, Activation
from keras.engine.topology import Input
from keras.preprocessing.sequence import pad_sequences
import keras.engine.training

In [3]:
config = tf.ConfigProto(intra_op_parallelism_threads = 10, inter_op_parallelism_threads = 10, allow_soft_placement = True, device_count = {'CPU': 10})
session = tf.Session(config = config)
K.set_session(session)

In [6]:
from trainlib.FileCollection import FileCollection
from trainlib.Preprocessor import Preprocessor
from trainlib.PCAWhiteningPreprocessor import PCAWhiteningPreprocessor
from trainlib.RNNPreprocessor import RNNPreprocessor
from trainlib.ListPreprocessor import ListPreprocessor
from trainlib.generator import Generator
import trainlib.cuts
import trainlib.cuts as cuts
from trainlib.utils import read_data
import numpy as np
import pandas as pd
import math

In [7]:
# slightly extended version of the numpy-internal one with same name, also handles the case when the entries in the dataframe are actually numpy arrays themselves
def as_matrix(df):
    return np.array(df.as_matrix().tolist())

In [8]:
# prepare the generator for tests
H1_stream = {"/data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root" : cuts.no_cut}
H0_stream = {"/data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root" : cuts.no_cut}

In [9]:
# read some input data
fcoll = FileCollection({"/data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root" : cuts.no_cut}, 0.0, 1.0)
setup_data = read_data(fcoll, 0, 10, branches = ["JetPt", "JetEta", "JetPhi"])
validation_data = read_data(fcoll, 400, 800, branches = ["JetPt", "JetEta", "JetPhi"])

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 110483 of which will be used


In [10]:
# set up the preprocessor for the RNN
nonperiodic_columns = ["JetPt", "JetEta"]
periodic_columns = ["JetPhi"]
sorted_column = "JetPt"

In [11]:
pre_rnn = RNNPreprocessor('RNN_test', nonperiodic_columns, periodic_columns, sorted_column, cuts.no_cut, PCAWhiteningPreprocessor)

In [12]:
setup_data

Unnamed: 0,JetPt,JetEta,JetPhi
0,[53.6443],[-2.29227],[2.54869]
1,"[36.2919, 21.1634]","[0.326643, -0.80063]","[2.83377, 1.15123]"
2,[],[],[]
3,"[55.005, 28.7824]","[1.21199, -2.61541]","[1.77548, -0.592928]"
4,[29.372],[3.82245],[-2.58757]
5,[21.1895],[-2.82564],[-1.86381]
6,[],[],[]
7,[21.9638],[-1.08136],[-2.47143]
8,[81.062],[3.98602],[1.68111]
9,"[31.6878, 28.9569, 27.7632]","[-2.06541, 1.83295, -0.914617]","[1.32394, 0.201612, 2.14285]"


In [103]:
setup_data.apply(sort_row, raw = True, axis = 1)

[array([ 53.64427948], dtype=float32) array([-2.29226828], dtype=float32)
 array([ 2.54869056], dtype=float32)]
(3,)
[[ 53.64427948]
 [ -2.29226828]
 [  2.54869056]]
[[ 53.64427948]
 [ -2.29226828]
 [  2.54869056]]
[array(53.64427947998047, dtype=float32), array(-2.2922682762145996, dtype=float32), array(2.5486905574798584, dtype=float32)]
<type 'list'>
[array([ 53.64427948], dtype=float32) array([-2.29226828], dtype=float32)
 array([ 2.54869056], dtype=float32)]
(3,)
[[ 53.64427948]
 [ -2.29226828]
 [  2.54869056]]
[[ 53.64427948]
 [ -2.29226828]
 [  2.54869056]]
[array(53.64427947998047, dtype=float32), array(-2.2922682762145996, dtype=float32), array(2.5486905574798584, dtype=float32)]
<type 'list'>


TypeError: only integer scalar arrays can be converted to a scalar index

In [102]:
# sort it according to the first row
def sort_row(row):
    print row
    print np.shape(row)
    row_arr = np.stack(row)
    print row_arr
    sorted_arr = row_arr[:,row_arr[0,:].argsort()]
    print sorted_arr
    
    sorted_arr_list = np.split(sorted_arr, np.size(sorted_arr, axis = 0), axis = 0)
    retval = map(np.squeeze, sorted_arr_list)
    
    print retval
    print type(retval)
    
    return np.ndarray([np.array([1]), np.array([2]), np.array([3])])

In [30]:
testarr = np.array([[1,3,2],[1,2,3]])

In [94]:
sort_row(testarr)

[[1 3 2]
 [1 2 3]]
(2, 3)
[[1 3 2]
 [1 2 3]]
[[1 2 3]
 [1 3 2]]
[array([1, 2, 3]), array([1, 3, 2])]
<type 'list'>


[array([1, 2, 3]), array([1, 3, 2])]

In [18]:
sortlist = lambda row: 

<function __main__.<lambda>>

In [15]:
testgen = Generator(H1_stream, H0_stream, nonperiodic_columns + periodic_columns, preprocessor = pre_rnn.process)
setup_len = testgen.setup_training_data()

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55241 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used


In [16]:
pre_rnn.setup_generator(testgen.raw_generator_scrambled(), len_setupdata = setup_len)

H1 contains 55241 entries
H0 contains 31160 entries
using the following chunk sizes: (55 / 31)
setting up list preprocessor on 86430 events
                                               JetPt  \
0                                          [45.3012]   
1                                          [111.147]   
2                                 [141.323, 50.4588]   
3                                            [69.31]   
4                                                 []   
5                                 [108.344, 82.3163]   
6                                          [84.7401]   
7                                 [40.9529, 20.7419]   
8                                                 []   
9                        [315.341, 78.5353, 27.9815]   
10                                         [21.9638]   
11                                [72.9257, 21.2938]   
12                       [31.6878, 28.9569, 27.7632]   
13                                                []   
14              [110

In [17]:
valgen = Generator(H1_stream, H0_stream, nonperiodic_columns + periodic_columns, preprocessor = pre_rnn.process)
valgen.setup_validation_data()

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55242 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used


86402

In [25]:
for data in valgen.preprocessed_generator():
    preprocessed_data = data
    break

NameError: name 'valgen' is not defined

In [24]:
processed_data = preprocessed_data[0]['RNN_test']

NameError: name 'preprocessed_data' is not defined

In [22]:
processed_data

array([[[-0.02233414, -0.76905417,  0.71278638, -1.22003782],
        [-0.64363503,  1.59466064,  0.22980963,  1.38311088],
        [-0.67002487, -1.75270104, -0.95927858,  1.05279624],
        ..., 
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[-0.08187895,  1.26891649, -1.38891745,  0.18818794],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        ..., 
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        ..., 
        [ 0.        ,  0

In [32]:
# test its whitening quality
flattened_data = processed_data[:,:,3].flatten()
quality_data = flattened_data[np.nonzero(flattened_data)]

In [33]:
np.mean(quality_data)

-0.08629901

In [34]:
np.std(quality_data)

1.0268894

In [10]:
pre_list = ListPreprocessor("listtest", nonperiodic_columns + periodic_columns, cuts.no_cut, PCAWhiteningPreprocessor)

In [15]:
# test the whitening quality:
data = preprocessed_data[0]['listtest']

In [16]:
np.shape(data)

(86, 9, 3)

In [17]:
data

array([[[-0.51350057,  0.7443307 ,  0.64557534],
        [-0.70693231,  0.50574839,  0.33523723],
        [-0.70693147,  1.48632395,  1.44796491],
        ..., 
        [ 0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ]],

       [[-0.04372877,  0.10874736,  1.48220909],
        [-0.47562236,  0.10941769, -0.52878791],
        [ 0.        ,  0.        ,  0.        ],
        ..., 
        [ 0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ]],

       [[-0.75590676,  0.89329928, -0.88921976],
        [ 0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ],
        ..., 
        [ 0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ]],

       ..., 
       [[ 0.        ,  0.        ,  0.        ],
        

In [18]:
first_col = data[:,:,2].flatten()
first_col_antipadded = first_col[np.nonzero(first_col)]

In [19]:
np.mean(first_col_antipadded)

0.14932205

In [20]:
np.std(first_col_antipadded)

1.0147306

In [21]:
H0_testdata

NameError: name 'H0_testdata' is not defined

In [12]:
pre_list.setup_generator

<bound method ListPreprocessor.setup_generator of <trainlib_development.ListPreprocessor.ListPreprocessor object at 0x7f4d8f228450>>

In [11]:
pre_list.setup(setup_data)

10 remaining after the cuts
12 remaining after the cuts
found a maximum list length in the setup data of 3: will pad or truncate to this length from now on


In [11]:
processed_data = pre_list.process(setup_data)

unpacked:        JetPt    JetEta    JetPhi
0  53.644279 -2.292268  2.548691
processed: [[ 53.64427948]
 [ -2.29226828]
 [  2.54869056]]
padded: [[ 53.64427948   0.           0.        ]
 [ -2.29226828   0.           0.        ]
 [  2.54869056   0.           0.        ]]
unpacked:        JetPt    JetEta    JetPhi
0  36.291904  0.326643  2.833765
1  21.163439 -0.800630  1.151233
processed: [[ 36.29190445  21.1634388 ]
 [  0.32664341  -0.80063021]
 [  2.83376527   1.15123296]]
padded: [[ 36.29190445  21.1634388    0.        ]
 [  0.32664341  -0.80063021   0.        ]
 [  2.83376527   1.15123296   0.        ]]
unpacked: Empty DataFrame
Columns: [JetPt, JetEta, JetPhi]
Index: []
processed: [[ 0.]
 [ 0.]
 [ 0.]]
padded: [[ 0.  0.  0.]
 [ 0.  0.  0.]
 [ 0.  0.  0.]]
unpacked:        JetPt    JetEta    JetPhi
0  55.005016  1.211992  1.775483
1  28.782440 -2.615411 -0.592928
processed: [[ 55.00501633  28.78244019]
 [  1.21199214  -2.6154108 ]
 [  1.77548277  -0.59292841]]
padded: [[ 55.00501633

In [13]:
processed_data['listtest']

array([[[ 53.64427948,  -2.29226828,   2.54869056],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]],

       [[ 36.29190445,   0.32664341,   2.83376527],
        [ 21.1634388 ,  -0.80063021,   1.15123296],
        [  0.        ,   0.        ,   0.        ]],

       [[  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]],

       [[ 55.00501633,   1.21199214,   1.77548277],
        [ 28.78244019,  -2.6154108 ,  -0.59292841],
        [  0.        ,   0.        ,   0.        ]],

       [[ 29.37203407,   3.82244658,  -2.58756518],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]],

       [[ 21.18945503,  -2.82564116,  -1.86380863],
        [  0.        ,   0.        ,   0.        ],
        [  0.        ,   0.        ,   0.        ]],

       [[  0.        ,   0.        ,   0.        ],


In [68]:
setup_data

Unnamed: 0,PFMET,nCleanedJetsPt30,JetPt,JetEta,JetPhi
0,40.490429,1,[53.6443],[-2.29227],[2.54869]
1,40.09692,1,"[36.2919, 21.1634]","[0.326643, -0.80063]","[2.83377, 1.15123]"
2,44.240479,0,[],[],[]
3,71.606529,1,"[55.005, 28.7824]","[1.21199, -2.61541]","[1.77548, -0.592928]"
4,21.410542,0,[29.372],[3.82245],[-2.58757]
5,21.15958,0,[21.1895],[-2.82564],[-1.86381]
6,28.228645,0,[],[],[]
7,34.765644,0,[21.9638],[-1.08136],[-2.47143]
8,54.827316,1,[81.062],[3.98602],[1.68111]
9,7.474819,1,"[31.6878, 28.9569, 27.7632]","[-2.06541, 1.83295, -0.914617]","[1.32394, 0.201612, 2.14285]"


In [59]:
# set up the preprocessor for the remaining fixed-size input variables
fixed_size_columns = ["PFMET", "nCleanedJetsPt30"]
pre_fixed = PCAWhiteningPreprocessor(fixed_size_columns, cuts.no_cut)

In [14]:
pre_rnn.setup(setup_data)

10 remaining after the cuts
12 remaining after the cuts


In [15]:
pre_fixed.setup(setup_data)

10 remaining after the cuts


In [18]:
processed_rnn = pre_rnn.process(validation_data)

In [26]:
processed_fixed = pre_fixed.process(validation_data)

In [27]:
np.mean(processed_fixed, axis = 0)

array([-0.27069429,  0.32918631])

In [28]:
np.std(processed_fixed, axis = 0)

array([ 1.07572128,  1.95916834])

In [74]:
final = as_matrix(processed_data)

In [96]:
final = np.swapaxes(final, 1, 2)

In [98]:
final

array([[[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[-0.58775389,  1.85188425, -1.40367889,  0.28947026],
        [-0.59278959,  1.81904542, -0.15680805,  1.41542077],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       ..., 
       [[-0.08938136, -0.91685706,  1.21690023,  0.

In [21]:
class CombinedModel:
    def __init__(self):
        self.model = None
        
    def build(self):
        in_layer = Input(shape = (None, 2))
        # number units = dimensionality of the output space
        lstm = LSTM(units = 16, return_sequences = False)(in_layer)
        out_layer = Dense(1, activation = 'tanh')(lstm)
        
        self.model = keras.engine.training.Model(in_layer, out_layer, name = 'lstm')

In [22]:
mod = CombinedModel()
mod.build()

In [23]:
processed_data

NameError: name 'processed_data' is not defined

In [85]:
mod.model.predict(processed_data)

array([[ 0.08889437],
       [ 0.11208226],
       [ 0.        ],
       [ 0.12064432],
       [ 0.09134383],
       [ 0.07961366],
       [ 0.        ],
       [ 0.08135621],
       [ 0.08613631],
       [ 0.20706847]], dtype=float32)