In [3]:
import uproot4
import pickle
import pandas as pd
import numpy as np

In [4]:
# unpickle trained model
bdt = pickle.load(open('classifier.pkl', 'rb'))

In [7]:
# Open root file and ttree

file = uproot4.open('test_sys_signal.root')
tree = file['Events']
tree.show()

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
Muon_pt_1            | float                    | AsDtype('>f4')
Muon_pt_2            | float                    | AsDtype('>f4')
Electron_pt_1        | float                    | AsDtype('>f4')
Electron_pt_2        | float                    | AsDtype('>f4')
Muon_pt_1_Up         | float                    | AsDtype('>f4')
Muon_pt_2_Up         | float                    | AsDtype('>f4')
Electron_pt_1_Up     | float                    | AsDtype('>f4')
Electron_pt_2_Up     | float                    | AsDtype('>f4')
Muon_pt_1_Down       | float                    | AsDtype('>f4')
Muon_pt_2_Down       | float                    | AsDtype('>f4')
Electron_pt_1_Down   | float                    | AsDtype('>f4')
Electron_pt_2_Down   | float                    | AsDtype('>f4')


Perform operations with Pandas

In [8]:
df = tree.arrays(library='pd')

In [9]:
df

Unnamed: 0,Muon_pt_1,Muon_pt_2,Electron_pt_1,Electron_pt_2,Muon_pt_1_Up,Muon_pt_2_Up,Electron_pt_1_Up,Electron_pt_2_Up,Muon_pt_1_Down,Muon_pt_2_Down,Electron_pt_1_Down,Electron_pt_2_Down
0,17.382437,13.238400,44.744175,58.942070,17.882437,13.738400,45.244175,59.442070,16.882437,12.738400,44.244175,58.442070
1,50.051613,41.024353,15.959146,5.622648,50.551613,41.524353,16.459146,6.122648,49.551613,40.524353,15.459146,5.122648
2,24.702271,41.462658,5.042018,7.994947,25.202271,41.962658,5.542018,8.494947,24.202271,40.962658,4.542018,7.494947
3,55.392330,32.569691,16.479095,14.080798,55.892330,33.069691,16.979095,14.580798,54.892330,32.069691,15.979095,13.580798
4,37.216637,54.203739,14.535382,17.012287,37.716637,54.703739,15.035382,17.512287,36.716637,53.703739,14.035382,16.512287
...,...,...,...,...,...,...,...,...,...,...,...,...
22833,15.948185,10.853278,34.428360,56.252823,16.448185,11.353278,34.928360,56.752823,15.448185,10.353278,33.928360,55.752823
22834,3.165293,7.369295,27.072086,28.327864,3.665293,7.869295,27.572086,28.827864,2.665293,6.869295,26.572086,27.827864
22835,33.588539,35.164211,14.531511,12.232145,34.088539,35.664211,15.031511,12.732145,33.088539,34.664211,14.031511,11.732145
22836,59.003090,21.311920,5.186389,14.721363,59.503090,21.811920,5.686389,15.221363,58.503090,20.811920,4.686389,14.221363


In [10]:
def perform_inference(df, clf, nominals, systematics, new_column):
    '''Given a classifier that takes n columns as input, recursively apply
    the classifier on the n columns specified by the combination of nominal
    with the elements in systematics
    '''
    model_features = clf.get_booster().feature_names
    for sys in systematics:
        columns = list(map(lambda pref: pref + sys, nominals))
        df[new_column + sys] = clf.predict(df.rename(
            columns=dict(zip(columns, model_features)), inplace=False)[model_features])
        df = df.rename(columns=dict(zip(model_features, columns)))
    return df

In [11]:
%%time

nominals = ['Muon_pt_1', 'Muon_pt_2', 'Electron_pt_1', 'Electron_pt_2']
systematics = ['', '_Up', '_Down']
new_column = 'Y'

df = perform_inference(df, bdt, nominals, systematics, new_column)

CPU times: user 4.67 s, sys: 20 ms, total: 4.69 s
Wall time: 107 ms


In [12]:
df

Unnamed: 0,Muon_pt_1,Muon_pt_2,Electron_pt_1,Electron_pt_2,Muon_pt_1_Up,Muon_pt_2_Up,Electron_pt_1_Up,Electron_pt_2_Up,Muon_pt_1_Down,Muon_pt_2_Down,Electron_pt_1_Down,Electron_pt_2_Down,Y,Y_Up,Y_Down
0,17.382437,13.238400,44.744175,58.942070,17.882437,13.738400,45.244175,59.442070,16.882437,12.738400,44.244175,58.442070,1.0,1.0,1.0
1,50.051613,41.024353,15.959146,5.622648,50.551613,41.524353,16.459146,6.122648,49.551613,40.524353,15.459146,5.122648,1.0,1.0,1.0
2,24.702271,41.462658,5.042018,7.994947,25.202271,41.962658,5.542018,8.494947,24.202271,40.962658,4.542018,7.494947,0.0,0.0,0.0
3,55.392330,32.569691,16.479095,14.080798,55.892330,33.069691,16.979095,14.580798,54.892330,32.069691,15.979095,13.580798,1.0,1.0,1.0
4,37.216637,54.203739,14.535382,17.012287,37.716637,54.703739,15.035382,17.512287,36.716637,53.703739,14.035382,16.512287,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22833,15.948185,10.853278,34.428360,56.252823,16.448185,11.353278,34.928360,56.752823,15.448185,10.353278,33.928360,55.752823,1.0,1.0,1.0
22834,3.165293,7.369295,27.072086,28.327864,3.665293,7.869295,27.572086,28.827864,2.665293,6.869295,26.572086,27.827864,0.0,0.0,0.0
22835,33.588539,35.164211,14.531511,12.232145,34.088539,35.664211,15.031511,12.732145,33.088539,34.664211,14.031511,11.732145,1.0,1.0,1.0
22836,59.003090,21.311920,5.186389,14.721363,59.503090,21.811920,5.686389,15.221363,58.503090,20.811920,4.686389,14.221363,1.0,1.0,0.0


However, the application of the model to subsets of columns in this case is run sequentially.

How does this behavior scale with larger datasets? 

We need to investigate how to parallelize them both locally and on a cluster

## Fetch big datasets

Study workcase in the assumption that datasets are made of max 2000 files with max 50000 events each (to verify after installing dbs)

## Try to implement systematics as rows

In [13]:
new_cols = [nominal + '_New' for nominal in nominals]
aliases = {}
for nominal in nominals:
    branches = [nominal + sys for sys in systematics]
    aliases['{}_New'.format(nominal)] = '[{}, {}, {}]'.format(*branches)

systs = tree.arrays(aliases.keys(), aliases=aliases, library='np')

for key, value in systs.items():
    systs[key] = np.array(value).T.flatten()

In [14]:
for col in systs.values():
    print(col.shape)

(68514,)
(68514,)
(68514,)
(68514,)


In [15]:
# Naming for rows
indexes = []
new_events = systs[list(systs.keys())[0]].shape[0]
original_events = int(new_events/len(systematics))
for sys in systematics:
    for ev in range(original_events):
        indexes.append('{}{}'.format(ev, sys))

In [16]:
df_new = pd.DataFrame.from_dict(systs)
df_new.index = indexes

In [17]:
df_new

Unnamed: 0,Muon_pt_1_New,Muon_pt_2_New,Electron_pt_1_New,Electron_pt_2_New
0,17.382437,13.238400,44.744175,58.942070
1,17.882437,13.738400,45.244175,59.442070
2,16.882437,12.738400,44.244175,58.442070
3,50.051613,41.024353,15.959146,5.622648
4,50.551613,41.524353,16.459146,6.122648
...,...,...,...,...
22833_Down,59.503090,21.811920,5.686389,15.221363
22834_Down,58.503090,20.811920,4.686389,14.221363
22835_Down,42.957253,40.313286,6.656038,8.904449
22836_Down,43.457253,40.813286,7.156038,9.404449


In [18]:
# Perform inference

model_features = bdt.get_booster().feature_names
columns = df_new.columns.values
df_new['Y_New'] = bdt.predict(df_new.rename(columns=dict(zip(columns, model_features)), inplace=False)[model_features])
df_new = df_new.rename(columns=dict(zip(model_features, columns)))

In [19]:
df_new

Unnamed: 0,Muon_pt_1_New,Muon_pt_2_New,Electron_pt_1_New,Electron_pt_2_New,Y_New
0,17.382437,13.238400,44.744175,58.942070,1.0
1,17.882437,13.738400,45.244175,59.442070,1.0
2,16.882437,12.738400,44.244175,58.442070,1.0
3,50.051613,41.024353,15.959146,5.622648,1.0
4,50.551613,41.524353,16.459146,6.122648,1.0
...,...,...,...,...,...
22833_Down,59.503090,21.811920,5.686389,15.221363,1.0
22834_Down,58.503090,20.811920,4.686389,14.221363,0.0
22835_Down,42.957253,40.313286,6.656038,8.904449,1.0
22836_Down,43.457253,40.813286,7.156038,9.404449,1.0
