In [1]:
import tensorflow as tf
from keras import backend as K
from keras import optimizers

Using TensorFlow backend.


In [2]:
from keras.layers import Dense, LSTM, Activation
from keras.engine.topology import Input
from keras.preprocessing.sequence import pad_sequences
import keras.engine.training

In [3]:
# Create a TensorFlow session with ten intra-op and ten inter-op threads, soft device
# placement and a device count of {'CPU': 10}, then register it as the Keras session.
config = tf.ConfigProto(
    intra_op_parallelism_threads = 10,
    inter_op_parallelism_threads = 10,
    allow_soft_placement = True,
    device_count = {'CPU': 10}
)
session = tf.Session(config = config)
K.set_session(session)

In [4]:
from trainlib.FileCollection import FileCollection
from trainlib.Preprocessor import Preprocessor
from trainlib.PCAWhiteningPreprocessor import PCAWhiteningPreprocessor
from trainlib.RNNPreprocessor import RNNPreprocessor
from trainlib.ListPreprocessor import ListPreprocessor
from trainlib.generator import Generator
import trainlib.cuts
import trainlib.cuts as cuts
from trainlib.utils import read_data
import numpy as np
import pandas as pd
import math

Welcome to JupyROOT 6.10/09


In [5]:
def as_matrix(df):
    """Convert a DataFrame into a plain numpy array.

    Extends what pandas' own (deprecated) `DataFrame.as_matrix` did: when the cells of
    the frame are themselves equally-sized numpy arrays, round-tripping through nested
    Python lists lets numpy build one higher-dimensional numeric array instead of an
    object array of arrays.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding scalars or array-valued cells.

    Returns
    -------
    numpy.ndarray
        Array built from the nested-list representation of the frame's values.
    """
    # `.values` replaces `DataFrame.as_matrix()`, which was deprecated in pandas 0.23
    # and removed in pandas 1.0; both return the same underlying ndarray.
    return np.array(df.values.tolist())

In [6]:
# input streams for the generators used in the tests below:
# each dictionary maps a ROOT file to the cut that is applied to it
H1_stream = {
    "/data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root" : cuts.no_cut
}
H0_stream = {
    "/data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root" : cuts.no_cut
}

In [11]:
# read two chunks of the ggH sample with the same set of branches:
# a small one (arguments 0, 10) for inspection and a larger one (arguments 400, 800) for validation
fcoll = FileCollection(
    {"/data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root" : cuts.no_cut},
    0.0, 1.0
)
setup_data = read_data(
    fcoll, 0, 10,
    branches = ["JetPt", "JetEta", "JetPhi", "PFMET", "nCleanedJetsPt30"]
)
validation_data = read_data(
    fcoll, 400, 800,
    branches = ["JetPt", "JetEta", "JetPhi", "PFMET", "nCleanedJetsPt30"]
)

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 110483 of which will be used


In [8]:
# column configuration for the RNN preprocessor (passed to RNNPreprocessor further down):
# the list-type columns are split into nonperiodic and periodic ones, and one column
# is named as the column used for sorting
nonperiodic_columns = ["JetPt", "JetEta"]
periodic_columns = ["JetPhi"]
sorted_column = "JetPt"

In [27]:
# display the small setup sample read above (list-valued jet columns plus scalar columns)
setup_data

Unnamed: 0,JetPt,JetEta,JetPhi,PFMET,nCleanedJetsPt30
0,[53.6443],[-2.29227],[2.54869],40.490429,1
1,"[36.2919, 21.1634]","[0.326643, -0.80063]","[2.83377, 1.15123]",40.09692,1
2,[],[],[],44.240479,0
3,"[55.005, 28.7824]","[1.21199, -2.61541]","[1.77548, -0.592928]",71.606529,1
4,[29.372],[3.82245],[-2.58757],21.410542,0
5,[21.1895],[-2.82564],[-1.86381],21.15958,0
6,[],[],[],28.228645,0
7,[21.9638],[-1.08136],[-2.47143],34.765644,0
8,[81.062],[3.98602],[1.68111],54.827316,1
9,"[31.6878, 28.9569, 27.7632]","[-2.06541, 1.83295, -0.914617]","[1.32394, 0.201612, 2.14285]",7.474819,1


In [11]:
# training-data generator restricted to the list-type (jet) columns.
# `pre_rnn` is only created in a later cell, so its `process` method is resolved lazily,
# at call time, instead of when the generator is constructed. This keeps the cell
# runnable on a fresh kernel (Restart & Run All), where `pre_rnn` does not exist yet.
testgen = Generator(H1_stream, H0_stream, nonperiodic_columns + periodic_columns,
                    preprocessor = lambda *args, **kwargs: pre_rnn.process(*args, **kwargs))
# the returned length is later passed to the preprocessor as `len_setupdata`
setup_len = testgen.setup_training_data()

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55241 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used


In [12]:
# RNN preprocessor named 'RNN_test' for the jet columns, using PCA whitening as its base preprocessor type
pre_rnn = RNNPreprocessor(
    'RNN_test',
    nonperiodic_columns,
    periodic_columns,
    sorted_column,
    cuts.no_cut,
    PCAWhiteningPreprocessor
)

In [12]:
# set up the RNN preprocessor from the scrambled raw training data
pre_rnn.setup_generator(
    testgen.raw_generator_scrambled(),
    len_setupdata = setup_len
)

H1 contains 55241 entries
H0 contains 31160 entries
using the following chunk sizes: (55 / 31)
setting up list preprocessor on 86430 events
86430 remaining after the cuts
102669 remaining after the cuts
found a maximum list length in the setup data of 9: will pad or truncate to this length from now on


In [13]:
# validation-data generator over the same jet columns, preprocessed by the RNN preprocessor
valgen = Generator(
    H1_stream,
    H0_stream,
    nonperiodic_columns + periodic_columns,
    preprocessor = pre_rnn.process
)
valgen.setup_validation_data()

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55242 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used


86402

In [14]:
# take only the first chunk delivered by the preprocessed validation generator;
# next() raises StopIteration if the generator is empty instead of silently leaving
# `preprocessed_data` undefined (or stale from an earlier run)
preprocessed_data = next(iter(valgen.preprocessed_generator()))

H1 contains 55242 entries
H0 contains 31160 entries
using the following chunk sizes: (55 / 31)


In [15]:
# from the first element of the generator output, pick the entry stored under 'RNN_test'
# (the name given to the RNN preprocessor above)
processed_data = preprocessed_data[0]['RNN_test']

In [16]:
# display the preprocessed array; all-zero rows appear after the filled entries of each event
processed_data

array([[[ 1.29843354,  0.7812869 ,  0.4199647 , -1.35477984],
        [-0.80170888,  1.27705646, -0.53867084,  1.29122496],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        ..., 
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.01474688,  1.49829412,  1.35071456, -0.44997355],
        [-0.52562273, -0.29844102, -1.40878177,  0.11286414],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        ..., 
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ]],

       [[ 0.49698573,  0.21857388,  0.36759889, -1.37099385],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        [ 0.        ,  0.        ,  0.        ,  0.        ],
        ..., 
        [ 0.        ,  0

In [26]:
# check the whitening quality on the first feature:
# flatten it over all events and list positions, then keep only the entries that are not exactly zero
flattened_data = processed_data[:, :, 0].flatten()
quality_data = flattened_data[flattened_data != 0]

In [27]:
# mean of the nonzero entries of the first feature
quality_data.mean()

-0.019649608

In [28]:
# standard deviation of the nonzero entries of the first feature
quality_data.std()

0.93612128

In [12]:
# column configuration, now including the fixed-size (scalar) inputs
nonperiodic_columns = ["JetPt", "JetEta"]
periodic_columns = ["JetPhi"]
sorted_column = "JetPt"
fixed_size_columns = ["PFMET", "nCleanedJetsPt30"]

# raw (not preprocessed) training generator over list-type and fixed-size columns
testgen = Generator(
    H1_stream,
    H0_stream,
    nonperiodic_columns + periodic_columns + fixed_size_columns,
    preprocessor = None
)
setup_len = testgen.setup_training_data()

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55241 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used


In [9]:
# create the RNN preprocessor for the jet columns and set it up on the scrambled raw training data
pre_rnn = RNNPreprocessor(
    'RNN_test',
    nonperiodic_columns,
    periodic_columns,
    sorted_column,
    cuts.no_cut,
    PCAWhiteningPreprocessor
)
pre_rnn.setup_generator(
    testgen.raw_generator_scrambled(),
    len_setupdata = setup_len
)

H1 contains 55241 entries
H0 contains 31160 entries
using the following chunk sizes: (55 / 31)
setting up list preprocessor on 86430 events
86430 remaining after the cuts
102669 remaining after the cuts
found a maximum list length in the setup data of 9: will pad or truncate to this length from now on


In [10]:
# PCA-whitening preprocessor for the remaining fixed-size input variables,
# set up on the same scrambled raw training data
pre_fixed = PCAWhiteningPreprocessor(
    'fixed_test',
    fixed_size_columns,
    cuts.no_cut
)
pre_fixed.setup_generator(
    testgen.raw_generator_scrambled(),
    len_setupdata = setup_len
)

H1 contains 55241 entries
H0 contains 31160 entries
using the following chunk sizes: (55 / 31)
setting up PCA whitening on 86430 events
86430 remaining after the cuts


In [7]:
class CombinedModel:
    """Keras model with two inputs: a sequence input fed through an LSTM and a two-component scalar input.

    The LSTM branch is reduced to four values, concatenated with the scalar input and
    passed through two dense layers to a single output named 'target'.
    """

    def __init__(self):
        # filled by build()
        self.model = None

    def build(self):
        """Assemble the network and store the resulting Keras model in `self.model`."""
        # sequence branch: sequences of arbitrary length with 4 features per step
        jet_input = Input(shape = (None, 4), name = 'Jet')
        # units = dimensionality of the LSTM output; only the last output of the sequence is returned
        jet_encoding = LSTM(units = 16, return_sequences = False)(jet_input)
        jet_summary = Dense(4, activation = 'tanh')(jet_encoding)

        # scalar branch: two fixed-size inputs
        scalar_input = Input(shape = (2,), name = 'scalar_inputs')

        # merge both branches and pass them through two identical hidden layers
        hidden = keras.layers.concatenate([jet_summary, scalar_input])
        for _ in range(2):
            hidden = Dense(128, activation = 'tanh')(hidden)

        target = Dense(1, activation = 'tanh', name = 'target')(hidden)

        self.model = keras.engine.training.Model(
            inputs = [jet_input, scalar_input],
            outputs = [target],
            name = 'combined'
        )

In [8]:
class CombinedPreprocessor(Preprocessor):
    """Preprocessor combining one scalar preprocessor with one list preprocessor per list-input group.

    All sub-preprocessors share the same cut. `process` merges their outputs into one
    dictionary-like object (the scalar output is updated with each list output).

    Parameters
    ----------
    name : str
        Name of this combined preprocessor.
    scalar_inputs : list of str
        Columns handled by the scalar preprocessor.
    scalar_preprocessor_basetype : callable
        Called as `basetype('scalar_inputs', scalar_inputs, cuts)` to build the scalar
        preprocessor; also forwarded to every list preprocessor.
    list_inputs : dict
        Of the form `{input_group_name: ["input_1", ...], ...}`.
    list_preprocessor_basetype : callable
        Called as `basetype(group_name, nonperiodic_columns, periodic_columns,
        sorted_column, cuts, scalar_preprocessor_basetype)` for each group.
    cuts : callable
        Row-wise selection, applied via `DataFrame.apply(cuts, axis = 1)`.
    """

    def __init__(self, name, scalar_inputs, scalar_preprocessor_basetype, list_inputs, list_preprocessor_basetype, cuts):
        self.name = name
        self.scalar_inputs = scalar_inputs
        self.list_inputs = list_inputs # note: list inputs is of the form {input_group_name: ["input_1", ...], ...}
        self.cuts = cuts
        self.last_indices = None

        self.list_preprocessors = {}
        self.scalar_preprocessor = scalar_preprocessor_basetype('scalar_inputs', self.scalar_inputs, self.cuts)

        self.processed_columns = []

        # construct the list preprocessors from the passed dictionary of list inputs
        # (`.items()` and single-argument `print(...)` behave the same on Python 2 and 3)
        for group_name, input_columns in self.list_inputs.items():
            periodic_columns = []
            nonperiodic_columns = []
            sorted_column = None
            for input_column in input_columns:
                # listen for the keyword "Phi" to classify input variables as being periodic
                if "Phi" in input_column:
                    periodic_columns.append(input_column)
                else:
                    nonperiodic_columns.append(input_column)

                # listen for the keyword "Pt" to set the column as the one that is sorted
                # (if several columns match, the last one wins)
                if "Pt" in input_column:
                    sorted_column = input_column

            # str() keeps the message printable when no "Pt" column exists (sorted_column is None)
            print("for list input group '" + group_name + "': assigned periodic inputs " + str(periodic_columns) +
                  " and nonperiodic inputs " + str(nonperiodic_columns) + ", sorting according to " + str(sorted_column))

            list_pre = list_preprocessor_basetype(group_name, nonperiodic_columns, periodic_columns, sorted_column, self.cuts, scalar_preprocessor_basetype)
            self.list_preprocessors[group_name] = list_pre

            self.processed_columns += nonperiodic_columns
            self.processed_columns += periodic_columns

        self.processed_columns += self.scalar_inputs
        print("total processed columns: " + str(self.processed_columns))

    def setup_generator(self, datagen, len_setupdata):
        """Collect chunks from `datagen` until more than `len_setupdata` rows are gathered, then call `setup`.

        Parameters
        ----------
        datagen : iterable of pandas.DataFrame
            Source of raw data chunks.
        len_setupdata : int
            Row count after which no further chunks are pulled from `datagen`.
        """
        self.len_setupdata = len_setupdata
        extracted_data = []
        extracted_rows = 0

        for data in datagen:
            extracted_data.append(data)
            extracted_rows += len(data)

            if extracted_rows > self.len_setupdata:
                break

        print("setting up preprocessor on " + str(extracted_rows) + " events")

        input_data = pd.concat(extracted_data)
        input_data = input_data.reset_index(drop = True)

        self.setup(input_data)

    def setup(self, data):
        """Apply the cut and column selection to `data` and set up every sub-preprocessor on the result."""
        cut_data = self._rowcol_cut(data)
        print(cut_data.columns)
        # in turn, set up all the list-type preprocessors as well as the separate scalar one
        print("setting up scalar preprocessor")
        self.scalar_preprocessor.setup(cut_data)

        for group_name, pre in self.list_preprocessors.items():
            print("setting up list preprocessor for '" + group_name + "'")
            pre.setup(cut_data)

    def process(self, data):
        """Run all sub-preprocessors on `data` and return their merged output.

        The index of the rows surviving the cut is remembered and available through
        `get_last_indices`.
        """
        cut_data = self._rowcol_cut(data)
        self.last_indices = cut_data.index

        # call each preprocessor in turn, and in the end combine all their outputs into the final dictionary object
        retval = self.scalar_preprocessor.process(cut_data)

        for pre in self.list_preprocessors.values():
            list_output = pre.process(cut_data)
            retval.update(list_output)

        return retval

    def get_last_indices(self):
        """Return the index of the rows kept by the most recent `process` call (None before the first call)."""
        return self.last_indices

    def save(self, folder, filename):
        """Save the scalar preprocessor as "scalar_<filename>" and each list preprocessor as "list_<group>_<filename>"."""
        # save separately the scalar preprocessor as well as those for the list inputs
        self.scalar_preprocessor.save(folder, "scalar_" + filename)

        for group_name, pre in self.list_preprocessors.items():
            pre.save(folder, "list_" + group_name + "_" + filename)

    def load(self, folder, filename):
        """Load all sub-preprocessors back, using the same file naming scheme as `save`."""
        # load them back separately as well
        self.scalar_preprocessor.load(folder, "scalar_" + filename)

        # iterate over `list_preprocessors` (the attribute created in __init__);
        # the misspelled `list_preprocessor` does not exist and raised AttributeError
        for group_name, pre in self.list_preprocessors.items():
            pre.load(folder, "list_" + group_name + "_" + filename)

    def _rowcol_cut(self, data):
        """Keep only the rows passing `self.cuts` and only the columns in `self.processed_columns`."""
        data = data.loc[data.apply(self.cuts, axis = 1)]

        output_data = data.loc[:, self.processed_columns]

        return output_data

In [9]:
# scalar inputs and list-input groups handled by the combined preprocessor
scalar_inputs = ["PFMET", "nCleanedJetsPt30"]
list_inputs = {'Jet': ["JetPt", "JetEta", "JetPhi"]}

# PCA whitening for the scalar inputs, RNN preprocessor for each list-input group
pre = CombinedPreprocessor(
    'combined_test',
    scalar_inputs,
    PCAWhiteningPreprocessor,
    list_inputs,
    RNNPreprocessor,
    cuts.no_cut
)

PCA setup: ['PFMET', 'nCleanedJetsPt30']
for list input group 'Jet': assigned periodic inputs ['JetPhi'] and nonperiodic inputs ['JetPt', 'JetEta'], sorting according to JetPt
PCA setup: ['JetPt', 'JetEta', 'JetPhi_sin', 'JetPhi_cos']
total processed columns: ['JetPt', 'JetEta', 'JetPhi', 'PFMET', 'nCleanedJetsPt30']


In [10]:
# raw training generator over exactly the columns the combined preprocessor needs,
# used to set the preprocessor up on scrambled training data
testgen = Generator(
    H1_stream,
    H0_stream,
    pre.processed_columns,
    preprocessor = None
)
setup_len = testgen.setup_training_data()
pre.setup_generator(
    testgen.raw_generator_scrambled(),
    len_setupdata = setup_len
)

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55241 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used
H1 contains 55241 entries
H0 contains 31160 entries
using the following chunk sizes: (55 / 31)
setting up preprocessor on 86430 events
Index([u'JetPt', u'JetEta', u'JetPhi', u'PFMET', u'nCleanedJetsPt30'], dtype='object')
setting up scalar preprocessor
setting up PCA whitening on 86430 events
86430 remaining after the cuts
setting up list preprocessor for 'Jet'
List: Index([u'JetPt', u'JetEta', u'JetPhi_sin', u'JetPhi_cos'], dtype='object')
86430 remaining after the cuts
setting up PCA whitening on 102669 events
102669 remaining after the cuts
found a maximum list length in the setup data of 9: will pad or truncate to this length from now on


In [12]:
# run the combined preprocessor on the small setup sample and display the resulting dictionary
pre.process(setup_data)

{'Jet': array([[[-0.17588344, -1.00814199,  0.45338827, -1.33961868],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ]],
 
        [[-0.49884474,  0.15300953,  0.81654179, -1.16422951],
         [-0.78042543, -0.34356138, -1.23776233, -0.69459617],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.      

In [13]:
# instantiate the combined model and build its Keras network (stored in mod.model)
mod = CombinedModel()
mod.build()

In [14]:
# stochastic gradient descent with learning rate 0.01 and momentum 0.9
sgd = optimizers.SGD(lr = 0.01, momentum = 0.9)

In [15]:
# compile with mean-squared-error loss and the SGD optimizer defined above; also report accuracy
mod.model.compile(loss = "mean_squared_error", optimizer = sgd, metrics = ["accuracy"])

In [16]:
# training and validation generators, both preprocessed by the combined preprocessor
train_gen = Generator(
    H1_stream,
    H0_stream,
    pre.processed_columns,
    preprocessor = pre.process
)
training_len = train_gen.setup_training_data()

val_gen = Generator(
    H1_stream,
    H0_stream,
    pre.processed_columns,
    preprocessor = pre.process
)
validation_len = val_gen.setup_validation_data()

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55241 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55242 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used


In [17]:
# train for 5 epochs of 128 steps each, validating on 128 steps of the validation generator
mod.model.fit_generator(
    train_gen.preprocessed_generator(),
    steps_per_epoch = 128,
    epochs = 5,
    verbose = 2,
    validation_data = val_gen.preprocessed_generator(),
    validation_steps = 128
)

Epoch 1/5
H1 contains 55241 entries
H0 contains 31160 entries
using the following chunk sizes: (55 / 31)
H1 contains 55242 entries
H0 contains 31160 entries
using the following chunk sizes: (55 / 31)
99s - loss: 0.2055 - acc: 0.7147 - val_loss: 0.1888 - val_acc: 0.7227
Epoch 2/5
92s - loss: 0.1801 - acc: 0.7343 - val_loss: 0.1773 - val_acc: 0.7345
Epoch 3/5
93s - loss: 0.1749 - acc: 0.7366 - val_loss: 0.1741 - val_acc: 0.7319
Epoch 4/5
92s - loss: 0.1774 - acc: 0.7299 - val_loss: 0.1722 - val_acc: 0.7298
Epoch 5/5
93s - loss: 0.1712 - acc: 0.7353 - val_loss: 0.1680 - val_acc: 0.7407


<keras.callbacks.History at 0x7f34be470290>