In [1]:
from trainlib.FileCollection import FileCollection
from trainlib.utils import read_data
from trainlib.PCAWhiteningPreprocessor import PCAWhiteningPreprocessor
from trainlib.config import Config
import pandas as pd

Welcome to JupyROOT 6.10/09


In [8]:
# Build the input collection from a single ROOT ntuple file.
# NOTE(review): the trailing 0.0 / 1.0 presumably select the fraction range of
# entries to use (the log below reports all 32401 entries in use) — confirm
# against FileCollection. The absolute cluster path ties this cell to one site.
fcoll = FileCollection(["/data_CMS/cms/wind/CJLST_NTuples/WplusH125/ZZ4lAnalysis.root"], 0.0, 1.0)

skimming /data_CMS/cms/wind/CJLST_NTuples/WplusH125/ZZ4lAnalysis.root
collection set up: 1 files, 32401 entries in total, 32401 of which will be used


In [3]:
def countNeutrinos(row):
    """Count the neutrinos among an event's LHE associated particles.

    `row` must support `row["LHEAssociatedParticleId"]`; the value is either
    None or an iterable of numeric particle codes. A code counts as a neutrino
    when its absolute value is 12, 14 or 16.

    Returns the number of matching codes, or 0 when the entry is None.
    """
    pdg_codes = row["LHEAssociatedParticleId"]
    if pdg_codes is None:
        return 0

    neutrino_codes = (12, 14, 16)
    return sum(1 for code in pdg_codes if abs(code) in neutrino_codes)

In [4]:
def countAssocLeptons(row):
    """Return how many of the two associated-lepton ID fields are non-zero.

    Reads `row["GenAssocLep1Id"]` and `row["GenAssocLep2Id"]` and counts the
    entries that compare `!= 0`, so the result is always 0, 1 or 2.

    This replaces an if/elif chain that enumerated all four zero/non-zero
    combinations and ended in an unreachable fallback `return 0`; the
    results are the same, but each field is now looked up only once.
    """
    lepton_id_keys = ("GenAssocLep1Id", "GenAssocLep2Id")
    return sum(1 for key in lepton_id_keys if row[key] != 0)

In [15]:
# Columns handed to the PCA-whitening preprocessor.
processed_columns = [
    "PFMET",
    "nCleanedJetsPt30",
    "nCleanedJetsPt30BTagged_bTagSF",
    "nExtraLep",
    "D_VBF2j_ggH_ME",
]

# Row-level selection: nCleanedJetsPt30 of at least 2, ZZMass above 120 and
# at least one non-zero associated-lepton ID.
cuts = lambda row: (
    row["nCleanedJetsPt30"] >= 2
    and row["ZZMass"] > 120.0
    and countAssocLeptons(row) >= 1
)

pre2 = PCAWhiteningPreprocessor(
    processed_columns=processed_columns,
    cuts=cuts,
)
# NOTE(review): presumably restores previously fitted state from a pickle
# file — only load files from a trusted source; confirm what load() overrides.
pre2.load("/home/llr/cms/wind/", "sample_pre.pkl")

In [16]:
# Read a slice of the collection (start = 0, stop = 100) into `indata`,
# restricted to the branches listed in Config.branches. The index printed
# further down runs from 0 to 99.
indata = read_data(fcoll, start = 0, stop = 100, branches = Config.branches)

In [17]:
# Show only the rows of `indata` for which `cuts` evaluates to True
# (`cuts` is applied row by row via axis = 1).
indata.loc[indata.apply(cuts, axis = 1)]

Unnamed: 0,PFMET,nCleanedJetsPt30,nCleanedJetsPt30BTagged_bTagSF,nExtraLep,ZZMass,LHEAssociatedParticleId,GenAssocLep1Id,GenAssocLep2Id,D_VBF2j_ggH_ME,D_VBF1j_ggH_ME,D_WHh_ggH_ME,D_ZHh_ggH_ME,D_WHh_ZHh_ME,D_VBF2j_WHh_ME,D_VBF2j_ZHh_ME
4,117.625648,3,0,0,124.248848,"[-15, 21, 21, 16]",-15,0,0.098514,0.220828,0.987014,0.960443,0.757899,0.001436,0.004481
7,54.169739,2,0,1,125.632874,"[-2, -11, 12, 21]",-11,0,0.035193,0.220958,0.0056,0.004085,0.578586,0.866264,0.898921
13,90.698059,3,0,0,144.862228,"[1, 21, 12, -11]",-11,0,0.047992,0.247933,0.640451,0.513259,0.628147,0.027522,0.045626
32,34.020985,2,0,0,126.305573,"[21, -15, 16, 21]",-15,0,0.072049,0.22139,0.99496,0.986424,0.730945,0.000393,0.001067
78,12.983487,2,0,0,120.013527,"[-15, 21, 21, 16]",-15,0,0.019024,0.226287,0.884929,0.624822,0.82199,0.002515,0.011511


In [18]:
# Run the loaded preprocessor over `indata` and keep only its "PCA_w_0" output.
# NOTE(review): the index of `outdata` printed further down does not match the
# rows selected by `cuts` in the preceding display (13 and 78 pass `cuts` but
# are absent here, while 0, 1, 2, ... are present) — presumably load() replaced
# the cuts passed to the constructor; confirm.
outdata = pre2.process(indata)["PCA_w_0"]

In [14]:
# Index of the raw input rows, for comparison with the processed output.
indata.index

Int64Index([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
            17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
            34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
            51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
            68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
            85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
           dtype='int64')

In [20]:
# Index of the rows returned by pre2.process(); it is a strict subset of
# indata.index, i.e. some input rows were dropped.
outdata.index

Int64Index([ 0,  1,  2,  3,  4,  5,  7,  8, 12, 16, 17, 20, 24, 25, 28, 30, 32,
            34, 35, 41, 48, 49, 52, 53, 61, 63, 65, 66, 67, 69, 71, 72, 73, 74,
            75, 76, 77, 81, 88, 92, 97, 98, 99],
           dtype='int64')

In [23]:
# Labels present in the input but absent from the preprocessor output,
# kept in input order and converted to a plain Python list.
missed_indices = indata.index[~indata.index.isin(outdata.index)].tolist()

In [27]:
# Placeholder series: the value 0.0 for every input row that is missing from
# the preprocessor output.
missed_outdata = pd.Series(0.0, index = missed_indices)

In [30]:
# Merge the real and placeholder values and sort by index; the result shown
# below has one entry per input row (Length: 100).
pd.concat([outdata, missed_outdata]).sort_index()

0     1.197417
1    -0.475494
2     1.058973
3    -0.776113
4     0.242395
5    -0.503223
6     0.000000
7     0.838380
8    -0.920595
9     0.000000
10    0.000000
11    0.000000
12   -0.758167
13    0.000000
14    0.000000
15    0.000000
16    0.717080
17    0.442967
18    0.000000
19    0.000000
20    0.401404
21    0.000000
22    0.000000
23    0.000000
24   -0.272584
25    1.442952
26    0.000000
27    0.000000
28    0.052920
29    0.000000
        ...   
70    0.000000
71   -0.640253
72   -1.200286
73    3.342651
74   -1.274001
75   -0.516456
76   -0.557556
77    1.105944
78    0.000000
79    0.000000
80    0.000000
81    0.070442
82    0.000000
83    0.000000
84    0.000000
85    0.000000
86    0.000000
87    0.000000
88   -0.104277
89    0.000000
90    0.000000
91    0.000000
92    1.739923
93    0.000000
94    0.000000
95    0.000000
96    0.000000
97    0.466456
98    0.673819
99   -1.259476
Length: 100, dtype: float64

In [31]:
# Inspect which input rows received the 0.0 placeholder.
missed_indices

[6,
 9,
 10,
 11,
 13,
 14,
 15,
 18,
 19,
 21,
 22,
 23,
 26,
 27,
 29,
 31,
 33,
 36,
 37,
 38,
 39,
 40,
 42,
 43,
 44,
 45,
 46,
 47,
 50,
 51,
 54,
 55,
 56,
 57,
 58,
 59,
 60,
 62,
 64,
 68,
 70,
 78,
 79,
 80,
 82,
 83,
 84,
 85,
 86,
 87,
 89,
 90,
 91,
 93,
 94,
 95,
 96]

In [6]:
# NOTE(review): leftover scratch cell (executed out of order as In [6]); no
# other cell shown here uses its result — candidate for removal.
abs(-5)

5