In [1]:
from trainlib.FileCollection import FileCollection
from trainlib.Preprocessor import Preprocessor
from trainlib.PCAWhiteningPreprocessor import PCAWhiteningPreprocessor
from trainlib.ConfigFileUtils import ConfigFileUtils
import trainlib.cuts as cuts
import pandas as pd
import numpy as np
import re

Welcome to JupyROOT 6.10/09


In [2]:
# Branch names requested from the input files (allbranches is passed to coll.get_data further down).
candidate_branches = [
    "PFMET",
    "nCleanedJetsPt30",
    "nCleanedJetsPt30BTagged_bTagSF",
    "nExtraLep",
    "ZZMass",
    "nExtraZ",
    "Z1Mass",
    "Z2Mass",
    "Z1Pt",
    "Z2Pt",
    "ZZMassErr",
    "ZZPt",
    "ZZEta",
    "ZZPhi",
    "Z1Flav",
    "Z2Flav",
]
MELA_branches = [
    "D_VBF2j_ggH_ME",
    "D_VBF1j_ggH_ME",
    "D_WHh_ggH_ME",
    "D_ZHh_ggH_ME",
    "D_WHh_ZHh_ME",
    "D_VBF2j_WHh_ME",
    "D_VBF2j_ZHh_ME",
]
# full list: jet / lepton / extra-lepton columns first, then the two groups above, then four trailing columns
allbranches = (
    [
        "JetPt", "JetEta", "JetPhi",
        "LepPt", "LepEta", "LepPhi",
        "ExtraLepPt", "ExtraLepEta", "ExtraLepPhi",
    ]
    + candidate_branches
    + MELA_branches
    + [
        "LHEAssociatedParticleId",
        "GenAssocLep1Id",
        "GenAssocLep2Id",
        "training_weight",
    ]
)

In [3]:
class FlexiblePCAWhiteningPreprocessor(Preprocessor):
    """
    Preprocessor that builds a flat table of scalar columns from the input data and
    delegates the actual whitening to a PCAWhiteningPreprocessor.

    Columns are specified as strings of one of two forms:
      * "name"                    -- the column is taken as-is
      * "col[sorted_col|order]"   -- for every row, the entry of the list-valued column
                                     `col` at the position where `sorted_col` has its
                                     `order`-th largest value (order = 0 is the largest);
                                     rows with too few entries yield 0

    Periodic columns are replaced by their sine and cosine ("<name>_sin", "<name>_cos")
    before being passed on.
    """

    # entries extracted from "Jet*" columns are set to zero when the matching "JetPt" entry is below this value
    jet_pt_threshold = 30.0

    def __init__(self, name, nonperiodic_columns, periodic_columns, cuts, cuts_s = None):
        """
        name:                name of this preprocessor (also used for the base preprocessor)
        nonperiodic_columns: list of column specifications used directly
        periodic_columns:    list of column specifications that get sin / cos encoded
        cuts:                callable applied row-wise; rows for which it is truthy are kept
        cuts_s:              companion value of `cuts`, handed to ConfigFileUtils.serialize_lambda_s in to_config
        """
        self.name = name
        self.nonperiodic_columns = nonperiodic_columns
        self.periodic_columns = periodic_columns
        self.cuts = cuts
        self.cuts_s = cuts_s
        self.base_preprocessor = None
        self.processed_columns = None
        self.last_indices = None
        self.len_setupdata = None

    @classmethod
    def from_config(cls, config_section):
        """Build an (un-fitted) preprocessor from a configuration section."""
        # the section name carries '<' / '>' markers that are not part of the preprocessor name
        preprocessor_name = re.sub('[<>]', '', config_section.name)
        nonperiodic_columns = ConfigFileUtils.parse_list(config_section['nonperiodic_columns'], lambda x: x)
        periodic_columns = ConfigFileUtils.parse_list(config_section['periodic_columns'], lambda x: x)
        preprocessor_cuts, cuts_s = ConfigFileUtils.parse_lambda_s(config_section['preprocessor_cuts'])

        return cls(name = preprocessor_name, nonperiodic_columns = nonperiodic_columns,
                   periodic_columns = periodic_columns, cuts = preprocessor_cuts, cuts_s = cuts_s)

    def to_config(self, confhandler):
        """Write the settings of this preprocessor into a new section of `confhandler`."""
        # from_config strips the '<' marker again when reading the section back
        section_name = '<' + self.name
        confhandler.new_section(section_name)

        confhandler.set_field(section_name, 'preprocessor_type', 'FlexiblePCAWhiteningPreprocessor')
        confhandler.set_field(section_name, 'nonperiodic_columns', ConfigFileUtils.serialize_list(self.nonperiodic_columns, lambda x: x))
        confhandler.set_field(section_name, 'periodic_columns', ConfigFileUtils.serialize_list(self.periodic_columns, lambda x: x))
        confhandler.set_field(section_name, 'preprocessor_cuts', ConfigFileUtils.serialize_lambda_s(self.cuts, self.cuts_s))

    def setup_generator(self, datagen, len_setupdata):
        """
        Draw chunks from `datagen` until more than `len_setupdata` rows have been
        collected (or the generator is exhausted) and run setup() on them.
        """
        self.len_setupdata = len_setupdata

        # draw some data from the generator
        extracted_data = []
        extracted_rows = 0

        for data in datagen:
            extracted_data.append(data)
            # fixed: the counter used to be incremented under a misspelled (undefined) name
            extracted_rows += len(data)

            if extracted_rows > self.len_setupdata:
                break

        input_data = pd.concat(extracted_data)
        input_data = input_data.reset_index(drop = True)

        self.setup(input_data)

    def setup(self, data):
        """Fit the base PCAWhiteningPreprocessor on the rows of `data` that pass the cuts."""
        # first perform the row selection
        cut_data = self._row_cut(data)

        # then prepare the data and the base preprocessor
        prepared_data = self.prepare_data(cut_data)
        self.processed_columns = prepared_data.columns

        # the row selection has already been applied here, so the base preprocessor gets no cut of its own
        self.base_preprocessor = PCAWhiteningPreprocessor(self.name, self.processed_columns, cuts.no_cut, None)
        self.base_preprocessor.setup(prepared_data)

    def get_last_indices(self):
        """Return the index of the rows that survived the cuts in the most recent process() call."""
        return self.last_indices

    def process(self, data):
        """Apply cuts and column preparation to `data` and return the output of the base preprocessor."""
        cut_data = self._row_cut(data)
        self.last_indices = cut_data.index

        prepared_data = self.prepare_data(cut_data)

        return self.base_preprocessor.process(prepared_data)

    def save(self, folder, filename):
        """Save the state of the base preprocessor."""
        self.base_preprocessor.save(folder, filename)

    def load(self, folder, filename):
        """
        Load the state of the base preprocessor.

        NOTE(review): base_preprocessor is only created in setup(), so calling load() on an
        instance that has not been set up fails on None -- confirm the intended workflow.
        """
        self.base_preprocessor.load(folder, filename)

    # puts the data in a format such that PCAWhiteningPreprocessor can act on them
    def prepare_data(self, data):
        """
        Build the table of scalar columns described by nonperiodic_columns and
        periodic_columns from `data`.

        Returns an empty DataFrame if no columns are configured.
        """
        # fixed: the columns are now read from the `data` argument; the code used to refer
        # to a name `df` that only existed as a notebook global, ignoring the row cuts
        frames = []

        for column in self.nonperiodic_columns:
            frames.append(self.extract_column(data, column))

        for column in self.periodic_columns:
            frames.append(self.encode_periodic_column(self.extract_column(data, column)))

        if not frames:
            return pd.DataFrame()

        # single concatenation instead of growing a frame column by column
        return pd.concat(frames, axis = 1)

    def extract_order(self, df, sorted_column, columns, order):
        """
        For every row of `df`, find the position at which the list-valued column
        `sorted_column` has its `order`-th largest entry and return the entries of
        each column in `columns` at that position.

        The result has one column "<column>_<order>" per requested column; rows with
        `order` or fewer entries in `sorted_column` get 0.
        """
        def get_index(row):
            values = row[sorted_column]
            if order >= len(values):
                return -1
            else:
                # argsort is ascending, flipping it gives the positions in descending order
                return np.flipud(np.argsort(values))[order]

        # plain row-wise apply: the rows must arrive as Series, since they are indexed by column name
        index_column = pd.DataFrame(df.apply(get_index, axis = 1))
        index_column.columns = ["index"]
        df_temp = pd.concat([index_column, df], axis = 1)

        def get_element(row, column_name):
            if row["index"] == -1:
                return 0
            else:
                return row[column_name][row["index"]]

        extracted_cols = []
        for column in columns:
            extracted_col = pd.DataFrame(df_temp.apply(lambda row: get_element(row, column), axis = 1))
            extracted_col.columns = [column + "_" + str(order)]
            extracted_cols.append(extracted_col)

        if not extracted_cols:
            return pd.DataFrame()

        return pd.concat(extracted_cols, axis = 1)

    def extract_order_filtered(self, df, sorted_column, columns, order):
        """
        Same as extract_order(), but if any extracted column has "Jet" in its name,
        rows failing the jet-pt cut are set to 0.0.
        """
        extracted_raw = self.extract_order(df, sorted_column, columns, order)

        if any("Jet" in col for col in extracted_raw.columns):
            # apply the jet-pt cut
            # NOTE(review): the mask is built from the `order`-th jet ranked by "JetPt", which need
            # not be the jet selected above when sorted_column is not "JetPt" -- confirm this is intended
            mask_column = self.extract_order(df, "JetPt", ["JetPt"], order)
            mask = mask_column < self.jet_pt_threshold
            # .values instead of the deprecated as_matrix()
            extracted_raw[mask.values] = 0.0

        return extracted_raw

    def extract_column(self, df, colstring):
        """
        Return the single-column frame described by `colstring`, which is either a
        plain column name or of the form "col[sorted_col|order]".
        """
        if '[' in colstring and ']' in colstring:
            # "col[sorted_col|order]" splits into "col", "sorted_col|order" and a trailing remainder
            col, sorted_col_order, _ = re.split(r'[\[\]]', colstring)
            sorted_col, order = re.split(r'\|', sorted_col_order)
            cf = self.extract_order_filtered(df, sorted_col, [col], int(order))
        else:
            cf = df[[colstring]]
        return cf

    def encode_periodic_column(self, df):
        """Replace every column of `df` by its sine and cosine, named "<col>_sin" and "<col>_cos"."""
        sincol = df.apply(np.sin, raw = True)
        sincol.columns = [col + "_sin" for col in df.columns]
        coscol = df.apply(np.cos, raw = True)
        coscol.columns = [col + "_cos" for col in df.columns]

        return pd.concat([sincol, coscol], axis = 1)

    def _row_cut(self, data):
        """Return the rows of `data` for which self.cuts evaluates to True."""
        output_data = data.loc[data.apply(self.cuts, axis = 1)]
        return output_data

In [6]:
# Base directory of the input files; the trailing slash is needed because file names are appended by plain string concatenation.
MC_path = "/data_CMS/cms/wind/CJLST_NTuples_prepared/"

In [21]:
def j2cut(row):
    """Row selection: true for rows whose "nCleanedJetsPt30" entry is at least 2."""
    return row["nCleanedJetsPt30"] >= 2

In [66]:
# Set up a collection holding the VBFH125 file, mapped to the j2cut row selection, and read
# all branches in allbranches plus the JetJERUp / JetJERDown columns into df.
# NOTE(review): 0.0, 1.0 presumably select the fraction of the file to use, and 0, 62320 the
# entry range (62320 equals the total number of entries reported in the output) -- confirm against FileCollection.
coll = FileCollection({MC_path + "VBFH125/ZZ4lAnalysis.root": j2cut}, 0.0, 1.0)
df = coll.get_data(allbranches + ["JetJERUp", "JetJERDown"], 0, 62320)

skimming /data_CMS/cms/wind/CJLST_NTuples_prepared/VBFH125/ZZ4lAnalysis.root
collection set up: 1 files, 62320 entries in total, 62320 of which will be used


In [61]:
# Column specifications for FlexiblePCAWhiteningPreprocessor: either a plain column name, or
# "col[sorted_col|order]" = entry of the list-valued column `col` at the position where
# `sorted_col` has its order-th largest value (order 0 = largest).
nonperiodic_columns = ["ZZMass","JetEta[JetJERDown|1]", "JetJERDown[JetJERDown|1]"]
# these columns get replaced by their sine and cosine
periodic_columns = ["JetPhi[JetPt|0]", "ZZPhi"]

In [62]:
# preprocessor named "test" that keeps all rows (cuts.no_cut)
pre = FlexiblePCAWhiteningPreprocessor(
    name = "test",
    nonperiodic_columns = nonperiodic_columns,
    periodic_columns = periodic_columns,
    cuts = cuts.no_cut,
)

In [49]:
# fit the preprocessor on df: row cut, column preparation, then setup of the base PCA whitening
pre.setup(df)

PCA setup for 'test': Index([u'ZZMass', u'JetEta_1', u'JetJERDown_1', u'JetPhi_0_sin',
       u'JetPhi_0_cos', u'ZZPhi_sin', u'ZZPhi_cos'],
      dtype='object')
PCAWhiteningPreprocessor for stream 'test'
setting up PCA whitening on 156 events
156 remaining after the cuts


In [50]:
# run the fitted preprocessor on df; the result is indexed with "test"
# (presumably the stream name given to the preprocessor -- confirm against PCAWhiteningPreprocessor.process)
test = pre.process(df)["test"]

In [51]:
# inspect the prepared columns before they are handed to the base preprocessor
pre.prepare_data(df)

Unnamed: 0,ZZMass,JetEta_1,JetJERDown_1,JetPhi_0_sin,JetPhi_0_cos,ZZPhi_sin,ZZPhi_cos
0,127.398865,-4.135607,39.599533,0.286449,-0.958096,0.566552,0.824026
1,119.234932,2.796306,59.755253,0.958974,0.283493,-0.995944,0.089979
2,125.888557,-3.117151,90.827896,-0.258232,0.966083,0.136897,-0.990585
3,122.632553,3.486774,46.127247,-0.051374,-0.998679,0.136807,0.990598
4,126.975670,2.018586,61.272934,-0.896828,0.442380,0.935775,0.352598
5,123.719505,1.532946,109.908363,-0.244027,-0.969769,-0.088370,0.996088
6,117.013641,-2.653456,58.208000,0.999644,-0.026697,-0.876049,0.482222
7,126.353348,-1.440020,111.407310,0.918206,-0.396102,-0.984203,-0.177043
8,125.481903,0.268997,34.268375,-0.843232,-0.537550,0.478615,-0.878025
9,125.671066,-3.821916,64.962814,0.615630,0.788036,-0.220348,-0.975421


In [67]:
# raw list-valued jet columns, for comparison with the extracted values
df[["JetEta", "JetPt", "JetJERDown", "JetJERUp"]]

Unnamed: 0,JetEta,JetPt,JetJERDown,JetJERUp
0,"[-0.0974471, -4.13561]","[61.4677, 38.7829]","[61.3476, 39.5995]","[61.5879, 37.9662]"
1,"[2.4551, 2.79631]","[124.482, 60.5661]","[124.967, 59.7553]","[123.996, 61.377]"
2,"[1.86427, -3.11715]","[126.419, 91.9705]","[125.509, 90.8279]","[127.328, 93.1132]"
3,"[-0.989047, 3.48677]","[114.946, 46.3603]","[114.898, 46.1272]","[114.995, 46.5934]"
4,"[-2.70578, 2.01859, 2.81681]","[95.014, 61.1027, 26.5004]","[93.4564, 61.2729, 24.5328]","[96.5717, 60.9325, 28.4681]"
5,"[1.60626, 1.53295, -0.0386098]","[151.398, 110.382, 88.7523]","[149.239, 109.908, 88.6612]","[153.327, 110.855, 88.8433]"
6,"[2.67587, -2.65346, -1.0599, -2.73019, 1.43137]","[77.8489, 60.9083, 43.2174, 41.9124, 20.9111]","[77.9968, 58.208, 42.999, 40.4011, 20.8938]","[77.701, 63.6086, 43.4358, 43.4238, 20.9283]"
7,"[2.78669, -1.44002, -0.819337]","[256.678, 111.005, 22.5173]","[252.952, 111.407, 22.5256]","[260.404, 110.603, 22.509]"
8,"[1.54688, 0.268997, -3.62434]","[41.2479, 34.1485, 27.7341]","[41.2751, 34.2684, 28.0855]","[41.2207, 34.0285, 27.3827]"
9,"[1.34337, -3.82192, 0.350135]","[173.164, 62.9158, 39.2286]","[172.204, 64.9628, 39.2186]","[174.124, 61.3492, 39.2385]"


In [25]:
# display the current value of test
test

array([[-0.3638802 , -0.91109655, -1.74968031, ..., -1.42941275,
         0.44151793,  0.76544905],
       [-0.71783617,  1.63937521,  0.71406488, ...,  0.64675154,
         0.10465547,  2.64755519],
       [-0.45828798,  0.90849814, -0.47082078, ..., -0.84029849,
        -0.75687354,  0.98584085],
       ..., 
       [-0.57117187,  2.18712712, -1.56872704, ...,  0.62357936,
        -0.42115719,  0.10642452],
       [ 0.27881501,  1.24626137, -0.8303008 , ..., -1.12977196,
        -1.21918221, -0.04276018],
       [ 0.36519844,  0.15433412, -1.38707013, ...,  0.27247079,
        -0.43107047, -1.25666884]])

In [24]:
# display the ZZMass column of df as a single-column frame
df[["ZZMass"]]

Unnamed: 0,ZZMass
0,122.344070
1,123.417145
2,122.310593
3,123.935692
4,123.242645
5,124.532410
6,123.790184
7,125.737633
8,124.864723
9,122.803452


In [29]:
# scratch check: a list literal may span several lines
# print() with a single argument gives the same output under Python 2 and Python 3
for x in ["a", "b",
          "c"]:
    print(x)

a
b
c


In [41]:
# scratch list containing one empty string
# NOTE: this rebinds the name `test`, which earlier held the output of pre.process(df)
test = ["a", "b", "c", ""]

In [42]:
# number of falsy (here: empty) entries in test
sum(1 for name in test if not name)

1

In [43]:
# drop the falsy (here: empty) entries; written as a list comprehension so that the result
# is a list under both Python 2 and Python 3 (filter() is lazy under Python 3)
[name for name in test if name]

['a', 'b', 'c']