In [1]:
import pandas as pd
import numpy as np
from root_numpy import root2array, root2rec, tree2array
from trainlib.config import Config
from trainlib.Preprocessor import Preprocessor
from trainlib.generator import generate_training_data, generate_validation_data
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import pickle
from trainlib.PCAWhiteningPreprocessor import PCAWhiteningPreprocessor

Welcome to JupyROOT 6.10/09


In [2]:
# Input ntuples for the two classes handed to the data generators below.
# H1 is read from the VBFH125 directory, H0 from the ggH125 directory.
H1_data_files = [
    "/data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root",
]
H0_data_files = [
    "/data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root",
]

In [3]:
# Columns handed to the PCA-whitening preprocessor.
processed_columns = [
    "PFMET",
    "nCleanedJetsPt30",
    "nCleanedJetsPt30BTagged_bTagSF",
    "nExtraLep",
    "D_VBF2j_ggH_ME",
]


def cuts(row):
    """Selection predicate passed to the preprocessor.

    Parameters
    ----------
    row : mapping indexed by column name
        Must provide the key "nCleanedJetsPt30".

    Returns
    -------
    The result of ``row["nCleanedJetsPt30"] >= 2``.
    """
    # A named function instead of a lambda bound to a name: same call
    # behaviour, but it carries a real __name__ and a docstring.
    return row["nCleanedJetsPt30"] >= 2


# Preprocessor under test; it is fitted later via pre.setup(...).
pre = PCAWhiteningPreprocessor(processed_columns=processed_columns, cuts=cuts)

In [4]:
# Training-data generator with no preprocessor attached (preprocessor=None);
# it is used below as the input for fitting the preprocessor.
gen = generate_training_data(
    H1_data_files,
    H0_data_files,
    branches=Config.branches,
    preprocessor=None,
    training_split=0.5,
    as_matrix=False,
)

skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55241 of which will be used


In [5]:
# Fit the preprocessor on data drawn from the unprocessed training generator.
# NOTE(review): len_setupdata looks like a target event count (the logged
# number of events used is slightly above it) - confirm against the class.
n_setup_events = 20000
pre.setup(gen, len_setupdata=n_setup_events)

H1 contains 31160 entries
H0 contains 55241 entries
using the following chunk sizes: (31 / 55)
setting up PCA whitening on 20038 events


In [6]:
# Persist the fitted preprocessor so that a fresh instance can reload it.
pre_save_dir = "/home/llr/cms/wind/"
pre_save_name = "sample_pre.pkl"
pre.save(pre_save_dir, pre_save_name)

In [7]:
# Validation-data generator, again without a preprocessor attached, built
# with the same file lists and split fraction as the training generator.
valgen = generate_validation_data(
    H1_data_files,
    H0_data_files,
    branches=Config.branches,
    preprocessor=None,
    training_split=0.5,
    as_matrix=False,
)

skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55242 of which will be used


In [8]:
# Collect chunks from the validation generator until more than 1000 entries
# have been gathered; only the first element of each yielded item is kept.
testdata = []
testlength = 0
for data in valgen:
    chunk = data[0]
    testdata.append(chunk)
    testlength = testlength + len(chunk)
    if testlength > 1000:
        break

H1 contains 31160 entries
H0 contains 55242 entries
using the following chunk sizes: (31 / 55)


In [9]:
# Merge the collected validation chunks into a single object.
# The name `testdata` is rebound from a list of chunks to the concatenated
# result, so the guard makes re-running this cell a no-op instead of an error
# raised by pd.concat when it is handed an already-concatenated frame.
if isinstance(testdata, list):
    testdata = pd.concat(testdata)

In [10]:
# Round-trip check: create a second preprocessor with the same configuration
# and restore its state from the file written by pre.save(...).
pre2 = PCAWhiteningPreprocessor(
    processed_columns=processed_columns,
    cuts=cuts,
)
pre2.load("/home/llr/cms/wind/", "sample_pre.pkl")

In [11]:
# Run the reloaded preprocessor over the validation sample and keep the
# result as a plain NumPy array.
# `.values` replaces `.as_matrix()`, which is deprecated since pandas 0.23
# and removed in pandas 1.0; `.values` yields the same ndarray and is
# available on old and new pandas alike.
postdata = pre2.process(testdata).values

In [12]:
# Column-wise mean of the processed validation sample
# (presumably expected near 0 after whitening - compare with the output).
postdata.mean(axis=0)

array([ 0.13732255,  0.06829707,  0.01778643, -0.04054737,  0.03826833])

In [13]:
# Column-wise standard deviation of the processed validation sample
# (presumably expected near 1 after whitening - compare with the output).
postdata.std(axis=0)

array([ 1.05383876,  1.21147961,  1.03597695,  1.07936523,  1.29402369])

In [None]:
# Test the preprocessor in place, i.e. passed to the training-data generator as its `preprocessor` argument.

In [15]:
# Rebuild the training generator, this time with the reloaded preprocessor's
# `process` method plugged in and as_matrix left at its default.
gen = generate_training_data(
    H1_data_files,
    H0_data_files,
    branches=Config.branches,
    preprocessor=pre2.process,
    training_split=0.5,
)

skimming /data_CMS/cms/wind/CJLST_NTuples/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 31160 of which will be used
skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 55241 of which will be used


In [16]:
# Show only the first item yielded by the preprocessing generator.
for data in gen:
    # Call form of print: valid on Python 2 and Python 3, and for a single
    # argument it prints exactly what the old `print data` statement did.
    print(data)
    break

H1 contains 31160 entries
H0 contains 55241 entries
using the following chunk sizes: (31 / 55)
(array([[-0.18108239, -0.46363175,  0.11112687, -0.17775785, -0.05357878],
       [-0.92059497,  1.24016673,  0.4740526 , -0.29174219, -0.03154706],
       [ 0.05292049,  1.19178727,  0.42998214, -0.31867559, -0.01827212],
       [-0.06710771,  1.33407242, -1.16503891, -0.4945629 , -0.04397968],
       [-0.75816704,  1.36396709, -0.47061209,  3.66050724, -0.13623462],
       [-0.76943278, -0.37027093, -0.61506133, -0.24577642, -0.07292402],
       [ 1.19741737, -0.61260829,  0.99348644, -0.1101074 , -0.02057219],
       [-0.77611334, -0.44186361,  0.22966567, -0.15103868, -0.06031497],
       [ 1.05897281,  1.27507191, -1.18033402, -0.52172176, -0.02808769],
       [ 0.24239464, -0.52674847,  0.5858957 , -0.13416585, -0.04037536],
       [ 0.7170799 , -0.41965226, -0.35897678,  3.8110042 , -0.14059547],
       [-0.47549425, -0.37128739, -0.78793152, -0.27177543, -0.07131569],
       [-0.50322