In [1]:
from ROOT import TFile
from root_numpy import root2array, root2rec, tree2array
import array
import pandas as pd
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense, Activation
from keras.layers import LSTM
from keras.callbacks import EarlyStopping
from keras import losses, optimizers
from keras import backend as K
import math

Welcome to JupyROOT 6.10/09


Using TensorFlow backend.


In [2]:
config = tf.ConfigProto(intra_op_parallelism_threads=10, inter_op_parallelism_threads=10, \
                        allow_soft_placement=True, device_count = {'CPU': 10})
session = tf.Session(config = config)
K.set_session(session)

2018-03-16 19:19:56.862052: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.1 instructions, but these are available on your machine and could speed up CPU computations.
2018-03-16 19:19:56.862126: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use SSE4.2 instructions, but these are available on your machine and could speed up CPU computations.
2018-03-16 19:19:56.862144: W tensorflow/core/platform/cpu_feature_guard.cc:45] The TensorFlow library wasn't compiled to use AVX instructions, but these are available on your machine and could speed up CPU computations.


In [3]:
inpath = "/data_CMS/cms/tsculac/CJLST_NTuples/"
filename = "/ZZ4lAnalysis.root"
H1_files = ["VBFH125"]
H0_files = ["ggH125"]

In [4]:
def generate_training_data(H1_file, H0_file, branches, training_split = 0.5, chunks = 100):
    return datagen(H1_file, H0_file, branches, start_fraction = 0.0, end_fraction = training_split, chunks = chunks)

In [5]:
def generate_validation_data(H1_file, H0_file, branches, training_split = 0.5, chunks = 100):
    return datagen(H1_file, H0_file, branches, start_fraction = training_split, end_fraction = 1.0, chunks = chunks)

In [6]:
def datagen(H1_file, H0_file, branches, start_fraction, end_fraction, chunks = 100):
    H1_curpos = 0
    H0_curpos = 0
    
    # first determine the length of the actual data sets (important for respecting the training / testing split)
    f_H1 = TFile.Open(inpath + H1_file + filename)
    H1_length = f_H1.Get("ZZTree/candTree").GetEntries()
    f_H1.Close()
    
    f_H0 = TFile.Open(inpath + H0_file + filename)
    H0_length = f_H0.Get("ZZTree/candTree").GetEntries()
    f_H0.Close()
    
    H1_minpos = int(H1_length * start_fraction)
    H0_minpos = int(H0_length * start_fraction)
    
    H1_maxpos = int(H1_length * end_fraction)
    H0_maxpos = int(H0_length * end_fraction)
    
    print "H1 contains " + str(H1_length) + " entries, " + str(H1_maxpos) + " of which will be used"
    print "H0 contains " + str(H0_length) + " entries, " + str(H0_maxpos) + " of which will be used"

    H1_chunksize = int(H1_maxpos / chunks)
    H0_chunksize = int(H0_maxpos / chunks)
    
    print "using the following chunk sizes: " + "(" + str(H1_chunksize) + " / " + str(H0_chunksize) + ")"
    
    # deliver data forever, it will be the job of the early_stopping to actually terminate the training
    while True:
        # prepare next training data chunk set by drawing events randomly from the two files
        H1_data = pd.DataFrame(root2array(inpath + H0_file + filename, treename = "ZZTree/candTree", 
                    branches = branches, start = H1_curpos, stop = H1_curpos + H1_chunksize))

        H0_data = pd.DataFrame(root2array(inpath + H1_file + filename, treename = "ZZTree/candTree",
                    branches = branches, start = H0_curpos, stop = H0_curpos + H0_chunksize))
                        
        # update the starting position for the next chunk
        H1_curpos += H1_chunksize
        H0_curpos += H0_chunksize
        
        if H1_curpos > H1_maxpos:
            H1_curpos = H1_minpos
            
        if H0_curpos > H0_maxpos:
            H0_maxpos = H0_minpos

        # add the truth information
        H1_data["target"] = 1.0
        H0_data["target"] = 0.0

        data_chunk = pd.concat([H1_data, H0_data])

        # return a randomized signal + background sample
        training_data = data_chunk.sample(frac = 1)
        input_data = training_data[branches].as_matrix()
        target_data = training_data["target"].as_matrix()
        
        yield input_data, target_data

In [7]:
branches = ["PFMET", "nCleanedJetsPt30"]

In [8]:
# make a simple Keras model, just for Hello World purposes
model = Sequential()

In [9]:
model.add(Dense(32, input_shape=(2,)))
model.add(Dense(32))
model.add(Dense(32))
model.add(Dense(1))
model.add(Activation("relu"))

In [10]:
sgd = optimizers.SGD(lr = 0.1)
model.compile(loss = "mean_squared_error", optimizer = sgd, metrics = ["accuracy"])

In [11]:
train_gen = generate_training_data(H0_files[0], H1_files[0], branches, chunks = 100)
val_gen = generate_validation_data(H0_files[0], H1_files[0], branches, chunks = 100)

In [12]:
early_stop = EarlyStopping(monitor = 'val_loss',
                          patience = 10,
                          verbose = 1,
                          mode = 'auto')

In [13]:
ret = model.fit_generator(train_gen, steps_per_epoch = 128, epochs = 50, verbose = 2, validation_data = val_gen, validation_steps = 10, callbacks = [early_stop])

H1 contains 110483 entries, 55241 of which will be used
Epoch 1/50H0 contains 62320 entries, 31160 of which will be used

using the following chunk sizes: (552 / 311)
H1 contains 110483 entries, 110483 of which will be used
H0 contains 62320 entries, 62320 of which will be used
using the following chunk sizes: (1104 / 623)
10s - loss: 0.6396 - acc: 0.3604 - val_loss: 0.6393 - val_acc: 0.3607
Epoch 2/50
9s - loss: 0.6396 - acc: 0.3604 - val_loss: 0.6393 - val_acc: 0.3607
Epoch 3/50


KeyboardInterrupt: 