In [49]:
from trainlib.Preprocessor import Preprocessor
import trainlib.cuts as cuts
from trainlib.FileCollection import FileCollection
from trainlib.utils import read_data
from trainlib.PCAWhiteningPreprocessor import PCAWhiteningPreprocessor
from trainlib.ConfigFileUtils import ConfigFileUtils
import pandas as pd
import numpy as np
import re

In [50]:
class PNPPreprocessor(Preprocessor):
    """Preprocessor for a mix of periodic and non-periodic input columns.

    Every periodic column ``c`` is replaced by the two columns ``c_sin`` and
    ``c_cos`` (sine and cosine of the original values). The non-periodic
    columns are passed through unchanged. Row cuts and the column selection are
    applied in this class; the resulting frame is then handed to a wrapped
    preprocessor of type ``preprocessor_basetype``, which is therefore
    constructed with ``cuts.no_cut``.
    """

    def __init__(self, name, nonperiodic_columns, periodic_columns, preprocessor_cuts, preprocessor_basetype):
        """Build the preprocessor and the wrapped base preprocessor.

        Parameters:
            name: name of this preprocessor; also passed to the wrapped one.
            nonperiodic_columns: list of column names that are used as they are.
            periodic_columns: list of column names that are sin/cos-encoded.
            preprocessor_cuts: callable applied row-wise to the prepared data
                (via ``DataFrame.apply(..., axis = 1)``); its result is used as
                a boolean row mask.
            preprocessor_basetype: class that is called as
                ``preprocessor_basetype(name, processed_columns, cuts.no_cut)``.
        """
        self.name = name
        self.nonperiodic_columns = nonperiodic_columns
        self.periodic_columns = periodic_columns
        self.preprocessor_cuts = preprocessor_cuts

        # indices of the rows that survived the cut in the most recent call to
        # process(); None until process() has been called
        self.last_indices = None

        # each periodic column will be encoded with two values:
        self.periodic_columns_encoded = []
        for periodic_column in self.periodic_columns:
            self.periodic_columns_encoded.extend([periodic_column + "_sin", periodic_column + "_cos"])

        self.processed_columns = self.nonperiodic_columns + self.periodic_columns_encoded

        # single-argument print(...) behaves identically on Python 2 and 3
        print(self.processed_columns)

        # all cuts are already handled at this level (the topmost level)
        self.pre = preprocessor_basetype(self.name, self.processed_columns, cuts.no_cut)

    @classmethod
    def from_config(cls, config_section):
        """Construct a PNPPreprocessor from a configuration section.

        Reads the fields 'processed_nonperiodic_columns',
        'processed_periodic_columns', 'preprocessor_cuts' and
        'scalar_preprocessor_type' from ``config_section``; the preprocessor
        name is the section name with any '<' or '>' characters removed.

        Returns:
            The newly constructed instance.
        """
        preprocessor_name = re.sub('[<>]', '', config_section.name)
        nonperiodic_inputs = ConfigFileUtils.parse_list(config_section['processed_nonperiodic_columns'], lambda x: x.encode("ascii"))
        periodic_inputs = ConfigFileUtils.parse_list(config_section['processed_periodic_columns'], lambda x: x.encode("ascii"))
        preprocessor_cuts = ConfigFileUtils.parse_lambda(config_section['preprocessor_cuts'])

        # NOTE(security): eval() executes whatever expression the config field
        # contains -- only load configuration files from trusted sources.
        preprocessor_type = eval(config_section['scalar_preprocessor_type'])

        # the constructed object must be handed back to the caller
        return cls(name = preprocessor_name, nonperiodic_columns = nonperiodic_inputs, periodic_columns = periodic_inputs,
                   preprocessor_cuts = preprocessor_cuts, preprocessor_basetype = preprocessor_type)

    def to_config(self, confhandler):
        """Write the settings of this preprocessor into ``confhandler``.

        A new section named '<' + name is created and filled with the fields
        that from_config() reads back, plus 'preprocessor_type'.
        """
        section_name = '<' + self.name
        confhandler.new_section(section_name)

        confhandler.set_field(section_name, 'preprocessor_type', 'PNPPreprocessor')
        confhandler.set_field(section_name, 'processed_nonperiodic_columns', ConfigFileUtils.serialize_list(self.nonperiodic_columns, lambda x: x))
        confhandler.set_field(section_name, 'processed_periodic_columns', ConfigFileUtils.serialize_list(self.periodic_columns))
        confhandler.set_field(section_name, 'preprocessor_cuts', ConfigFileUtils.serialize_lambda(self.preprocessor_cuts))
        # only the class name of the wrapped preprocessor is stored
        confhandler.set_field(section_name, 'scalar_preprocessor_type', self.pre.__class__.__name__)

    def _prepare_data(self, data):
        """Return a frame holding the encoded periodic columns followed by the
        non-periodic columns of ``data`` (column-wise concatenation)."""
        periodic_data_encoded = [self._encode_angles(data, periodic_column)
                                 for periodic_column in self.periodic_columns]

        prepared_data = pd.concat(periodic_data_encoded + [data[self.nonperiodic_columns]], axis = 1)

        return prepared_data

    def _encode_angles(self, df, col):
        """Return a two-column frame ``col + "_sin"``, ``col + "_cos"`` holding
        the sine and cosine of ``df[col]``; the index of ``df[col]`` is kept."""
        angles = df[col]
        sin_name = col + "_sin"
        cos_name = col + "_cos"

        # NumPy ufuncs act on the whole Series at once; `columns` pins the
        # sin-before-cos order independently of dict ordering
        return pd.DataFrame({sin_name: np.sin(angles), cos_name: np.cos(angles)},
                            columns = [sin_name, cos_name])

    def setup_generator(self, datagen, len_setupdata):
        """Set up the preprocessor on data pulled from a generator.

        Chunks are taken from ``datagen`` until more than ``len_setupdata``
        rows have been collected (or the generator is exhausted); the chunks
        are concatenated, re-indexed from zero and passed to setup().

        Raises:
            ValueError: if ``datagen`` does not yield any chunk.
        """
        self.len_setupdata = len_setupdata
        extracted_data = []
        extracted_rows = 0

        for data in datagen:
            extracted_data.append(data)
            extracted_rows += len(data)

            if extracted_rows > self.len_setupdata:
                break

        if not extracted_data:
            # pd.concat would fail on an empty list with a far less helpful message
            raise ValueError("setup_generator: the data generator did not yield any data")

        print("setting up PNPPreprocessor on " + str(extracted_rows) + " events")

        input_data = pd.concat(extracted_data)
        input_data = input_data.reset_index(drop = True)

        self.setup(input_data)

    def setup(self, data):
        """Encode, cut and column-select ``data`` and set up the wrapped
        preprocessor on the result."""
        prepared_data = self._prepare_data(data)
        cut_data = self._rowcol_cut(prepared_data)

        self.pre.setup(cut_data)

    def process(self, data):
        """Encode, cut and column-select ``data`` and return the output of the
        wrapped preprocessor's process() on it.

        The index of the rows that survived the cut is remembered and can be
        retrieved with get_last_indices().
        """
        prepared_data = self._prepare_data(data)
        cut_data = self._rowcol_cut(prepared_data)
        self.last_indices = cut_data.index

        return self.pre.process(cut_data)

    def get_last_indices(self):
        """Return the index of the rows kept by the most recent process() call
        (None if process() has not been called yet)."""
        return self.last_indices

    def save(self, folder, filename):
        """Delegate saving to the wrapped preprocessor."""
        self.pre.save(folder, filename)

    def load(self, folder, filename):
        """Delegate loading to the wrapped preprocessor."""
        self.pre.load(folder, filename)

    def _rowcol_cut(self, data):
        """Keep the rows selected by ``preprocessor_cuts`` and the columns in
        ``processed_columns`` (in that order).

        ``data`` is expected to already contain the encoded columns, i.e. to be
        the output of _prepare_data().
        """
        # row selection: the cut is evaluated once per row
        data = data.loc[data.apply(self.preprocessor_cuts, axis = 1)]

        # column selection (also fixes the column order)
        output_data = data.loc[:, self.processed_columns]

        return output_data
    

In [29]:
# input streams for the generator tests: each dict maps a ROOT file path to a cut (here cuts.no_cut)
H1_stream = {
    "/data_CMS/cms/wind/CJLST_NTuples/WplusH125/ZZ4lAnalysis.root": cuts.no_cut,
}
H0_stream = {
    "/data_CMS/cms/wind/CJLST_NTuples/WminusH125/ZZ4lAnalysis.root": cuts.no_cut,
}

In [30]:
# branches that are treated as periodic (sin/cos-encoded by the preprocessor)
periodic_branches = [
    "helphi",
    "phistarZ1",
]

# branches that are used without any encoding
nonperiodic_branches = [
    "costhetastar",
    "helcosthetaZ1",
    "helcosthetaZ2",
    "ZZMass",
    "Z1Mass",
    "Z2Mass",
]

In [31]:
# read some input data to test it
fcoll = FileCollection({"/data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root" : cuts.no_cut}, 0.0, 1.0)

# both samples are read with the same columns: the periodic and the non-periodic branches
branches = periodic_branches + nonperiodic_branches
setup_data = read_data(fcoll, 0, 10, branches = branches)
validation_data = read_data(fcoll, 400, 800, branches = branches)

skimming /data_CMS/cms/wind/CJLST_NTuples/ggH125/ZZ4lAnalysis.root
collection set up: 1 files, 110483 entries in total, 110483 of which will be used


In [32]:
# inspect the raw rows read above (periodic branches first, then the non-periodic ones)
setup_data

Unnamed: 0,helphi,phistarZ1,costhetastar,helcosthetaZ1,helcosthetaZ2,ZZMass,Z1Mass,Z2Mass
0,0.189036,-0.903419,0.896658,0.01951,-0.070481,118.850044,97.210434,19.613565
1,2.382739,-0.940283,-0.054466,-0.985844,-0.314212,121.319794,89.475548,23.639349
2,-2.965919,-1.769761,-0.362188,0.700304,0.067684,126.025253,94.119347,14.938018
3,0.050613,-1.976605,-0.427963,0.576112,-0.967473,125.182335,92.499802,28.825556
4,-1.075787,1.381181,-0.165037,0.928623,0.511503,123.1082,87.41951,24.146938
5,1.167816,-1.831574,-0.930648,-0.286921,0.664325,123.341728,75.285919,22.240332
6,-3.05557,-0.544058,0.720742,-0.820293,-0.892987,125.868134,51.268044,44.458668
7,-1.857772,-1.775368,-0.608478,0.258011,0.71439,127.013412,90.610374,27.985203
8,-0.401952,-0.473919,-0.876926,0.199595,0.349395,117.682243,85.557762,28.025738
9,2.073953,2.079252,0.390263,-0.05,-0.718251,123.951996,88.337654,19.201677


In [51]:
# wrap a PCAWhiteningPreprocessor: it is given the non-periodic columns plus the
# sin/cos-encoded periodic ones; cuts.no_cut is passed as the row cut
pre = PNPPreprocessor("test", nonperiodic_branches, periodic_branches, cuts.no_cut, PCAWhiteningPreprocessor)

['costhetastar', 'helcosthetaZ1', 'helcosthetaZ2', 'ZZMass', 'Z1Mass', 'Z2Mass', 'helphi_sin', 'helphi_cos', 'phistarZ1_sin', 'phistarZ1_cos']
PCA setup for 'test': ['costhetastar', 'helcosthetaZ1', 'helcosthetaZ2', 'ZZMass', 'Z1Mass', 'Z2Mass', 'helphi_sin', 'helphi_cos', 'phistarZ1_sin', 'phistarZ1_cos']
PCAWhiteningPreprocessor for stream 'test'


In [46]:
# check the encoding step: every periodic branch is replaced by <branch>_sin and <branch>_cos
pre._prepare_data(setup_data)

Unnamed: 0,helphi_sin,helphi_cos,phistarZ1_sin,phistarZ1_cos,costhetastar,helcosthetaZ1,helcosthetaZ2,ZZMass,Z1Mass,Z2Mass
0,0.187912,0.982186,-0.785448,0.618928,0.896658,0.01951,-0.070481,118.850044,97.210434,19.613565
1,0.68809,-0.725625,-0.807725,0.589559,-0.054466,-0.985844,-0.314212,121.319794,89.475548,23.639349
2,-0.174771,-0.984609,-0.980272,-0.197655,-0.362188,0.700304,0.067684,126.025253,94.119347,14.938018
3,0.050591,0.998719,-0.918783,-0.394762,-0.427963,0.576112,-0.967473,125.182335,92.499802,28.825556
4,-0.879964,0.47504,0.982077,0.188481,-0.165037,0.928623,0.511503,123.1082,87.41951,24.146938
5,0.919896,0.392162,-0.96619,-0.257832,-0.930648,-0.286921,0.664325,123.341728,75.285919,22.240332
6,-0.085917,-0.996302,-0.517612,0.855615,0.720742,-0.820293,-0.892987,125.868134,51.268044,44.458668
7,-0.959104,-0.283053,-0.979148,-0.203148,-0.608478,0.258011,0.71439,127.013412,90.610374,27.985203
8,-0.391215,0.920299,-0.456376,0.889787,-0.876926,0.199595,0.349395,117.682243,85.557762,28.025738
9,0.876065,-0.482193,0.873498,-0.486828,0.390263,-0.05,-0.718251,123.951996,88.337654,19.201677


In [47]:
# NOTE: _rowcol_cut selects the *encoded* columns, so inside the class it is only called on the
# output of _prepare_data. Here it is called on the raw frame, which has no *_sin / *_cos
# columns -- that is why these columns come out empty (NaN) in the output below.
pre._rowcol_cut(setup_data)

Unnamed: 0,costhetastar,helcosthetaZ1,helcosthetaZ2,ZZMass,Z1Mass,Z2Mass,helphi_sin,helphi_cos,phistarZ1_sin,phistarZ1_cos
0,0.896658,0.01951,-0.070481,118.850044,97.210434,19.613565,,,,
1,-0.054466,-0.985844,-0.314212,121.319794,89.475548,23.639349,,,,
2,-0.362188,0.700304,0.067684,126.025253,94.119347,14.938018,,,,
3,-0.427963,0.576112,-0.967473,125.182335,92.499802,28.825556,,,,
4,-0.165037,0.928623,0.511503,123.1082,87.41951,24.146938,,,,
5,-0.930648,-0.286921,0.664325,123.341728,75.285919,22.240332,,,,
6,0.720742,-0.820293,-0.892987,125.868134,51.268044,44.458668,,,,
7,-0.608478,0.258011,0.71439,127.013412,90.610374,27.985203,,,,
8,-0.876926,0.199595,0.349395,117.682243,85.557762,28.025738,,,,
9,0.390263,-0.05,-0.718251,123.951996,88.337654,19.201677,,,,
