# Awkward Array 4 Flashgg 

As a natural consequence of `study_flip_systematics.ipynb`, here we focus on data manipulation with **Awkward Array**, taking care to implement, in a clear and efficient way, all the operations that Flashgg is supposed to perform.

In [1]:
import uproot4
import awkward1 as ak
import numpy as np
import pickle

In [2]:
# Open the ROOT file and fetch its 'Events' TTree.

file = uproot4.open('test_sys_signal.root')
tree = file['Events']
# Print a table of the tree's branches (name, type name, interpretation).
tree.show()

name                 | typename                 | interpretation                
---------------------+--------------------------+-------------------------------
Muon_pt_1            | float                    | AsDtype('>f4')
Muon_pt_2            | float                    | AsDtype('>f4')
Electron_pt_1        | float                    | AsDtype('>f4')
Electron_pt_2        | float                    | AsDtype('>f4')
Muon_pt_1_Up         | float                    | AsDtype('>f4')
Muon_pt_2_Up         | float                    | AsDtype('>f4')
Electron_pt_1_Up     | float                    | AsDtype('>f4')
Electron_pt_2_Up     | float                    | AsDtype('>f4')
Muon_pt_1_Down       | float                    | AsDtype('>f4')
Muon_pt_2_Down       | float                    | AsDtype('>f4')
Electron_pt_1_Down   | float                    | AsDtype('>f4')
Electron_pt_2_Down   | float                    | AsDtype('>f4')
PV_x                 | float                    | AsDtype(

In [4]:
# Useful variables

# Branches that are also stored with the "_Up" and "_Down" suffixes in the tree.
variables_with_sys = [
    'Muon_pt_1',
    'Muon_pt_2',
    'Electron_pt_1',
    'Electron_pt_2',
]
# Branches looked up with the same suffixes but only found in their nominal form.
variables_without_sys = [
    'PV_x',
    'PV_y',
    'PV_z',
]
# Suffixes appended to a variable name; the empty suffix is the nominal value.
systematics = [
    '',
    '_Up',
    '_Down',
]

# Every variable to extract, those with systematics first.
variables = [*variables_with_sys, *variables_without_sys]

In [8]:
# Useful functions

def get_branches(file_name, tree_name):
    """Read all branches of a tree into a record-type Awkward array.

    Args:
        file_name: Path of the ROOT file to open.
        tree_name: Key of the tree inside that file.

    Returns:
        The value of ``tree.arrays(library='ak')`` for the requested tree.
    """
    # The arrays are fully read inside the ``with`` block, so the file can be
    # closed on exit instead of leaving the handle open for the whole session.
    with uproot4.open(file_name) as file:
        return file[tree_name].arrays(library='ak')
    
def get_systematics_record(event, var, systematics):
    """Collect the values of one variable under every available systematic.

    For each suffix in ``systematics`` the field named ``var + suffix`` is
    looked up in ``event.fields``; suffixes whose field does not exist are
    skipped.

    Returns a dictionary mapping a label to ``event[var + suffix]``. The label
    is "Nominal" for the empty suffix, otherwise the suffix with every
    underscore removed (e.g. "_Up" -> "Up").
    """
    record = {}
    for suffix in systematics:
        label = "Nominal" if suffix == "" else suffix.replace("_", "")
        field = "{}{}".format(var, suffix)
        if field in event.fields:
            record[label] = event[field]
    return record

def get_variables_record(event, variables, systematics):
    """Build the nested record of one event: variable -> systematic label -> value.

    Every name in ``variables`` becomes a key whose value is the dictionary
    returned by ``get_systematics_record`` for that name.
    """
    return {
        name: get_systematics_record(event, name, systematics)
        for name in variables
    }

def extract_and_manipulate(file_name, tree_name, variables, systematics):
    """Load a tree and reshape it into an Awkward array of nested records.

    The branches are read with ``get_branches``, then each event is converted
    with ``get_variables_record`` and the resulting dictionaries are wrapped
    in a single ``ak.Array``.
    """
    events = get_branches(file_name, tree_name)
    records = []
    for event in events:
        records.append(get_variables_record(event, variables, systematics))
    return ak.Array(records)

In [9]:
%%time
# Build the nested event array (variable -> systematic label -> value) from the 'Events' tree.
df = extract_and_manipulate('test_sys_signal.root', 'Events', variables, systematics)

CPU times: user 3.06 s, sys: 34.3 ms, total: 3.1 s
Wall time: 3.1 s


In [10]:
df.type

22838 * {"Muon_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Muon_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "PV_x": {"Nominal": float64}, "PV_y": {"Nominal": float64}, "PV_z": {"Nominal": float64}}

## Tagger

In [11]:
class Tagger:
    """Attach classifier predictions, one per systematic, to an event array.

    Attributes (stored unchanged from the constructor arguments):
        clf: Object whose ``predict`` method is called on a NumPy array.
        variables: Names of the fields used as classifier inputs.
        systematics: Keys used to index each variable of an event.
        prediction: Name of the field that receives the predictions.
    """

    def __init__(self, clf, variables, systematics, prediction):
        self.clf, self.variables = clf, variables
        self.systematics, self.prediction = systematics, prediction

    def predict(self, df):
        """Run the classifier once per systematic and store the results on ``df``.

        For each key in ``self.systematics`` a NumPy array is built with one row
        per event and one column per entry of ``self.variables``, and passed to
        ``self.clf.predict``. The predictions are then regrouped into one
        dictionary per event (systematic -> prediction) and assigned to
        ``df[self.prediction]``.

        Note that ``df`` itself receives the new field; the same object is
        returned.
        """
        def predict_for(variation):
            # One input row per event, one column per variable.
            rows = []
            for event in df:
                rows.append([event[name][variation] for name in self.variables])
            features = np.array(rows)
            return ak.from_numpy(self.clf.predict(features))

        per_variation = {}
        for variation in self.systematics:
            per_variation[variation] = predict_for(variation)

        # Turn the per-systematic columns into one record per event.
        per_event = []
        for index in range(len(df)):
            per_event.append(
                {variation: values[index] for variation, values in per_variation.items()}
            )
        df[self.prediction] = ak.Array(per_event)
        return df

In [12]:
# NOTE: unpickling can execute arbitrary code; only load classifier files from a trusted source.
# The context manager closes the file handle, which a bare open() call would leave open.
with open('classifier.pkl', 'rb') as classifier_file:
    bdt = pickle.load(classifier_file)

In [13]:
%%time
# Predict with the unpickled classifier once per systematic label, using only the
# variables that carry systematics as inputs; results go into the field 'Y'.
tagger = Tagger(bdt, variables_with_sys, ['Nominal', 'Up', 'Down'], 'Y')
# predict() assigns the new field on df itself and returns that same array.
df_pred = tagger.predict(df)

CPU times: user 11.4 s, sys: 70.3 ms, total: 11.4 s
Wall time: 6.54 s


In [14]:
df_pred.type

22838 * {"Muon_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Muon_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "PV_x": {"Nominal": float64}, "PV_y": {"Nominal": float64}, "PV_z": {"Nominal": float64}, "Y": {"Nominal": float64, "Up": float64, "Down": float64}}

## Selections

In [15]:
# Keep only the events whose nominal Muon_pt_1 is greater than 25 (boolean-mask selection).
sel_one = df_pred[df_pred['Muon_pt_1']['Nominal'] > 25]

In [16]:
sel_one.type

9270 * {"Muon_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Muon_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "PV_x": {"Nominal": float64}, "PV_y": {"Nominal": float64}, "PV_z": {"Nominal": float64}, "Y": {"Nominal": float64, "Up": float64, "Down": float64}}

In [19]:
# Combine two cuts with the element-wise & operator: nominal Muon_pt_1 > 25 and nominal Electron_pt_1 < 40.
# The parentheses are required because & binds more tightly than the comparisons.
sel_two = df_pred[(df_pred['Muon_pt_1']['Nominal'] > 25) & (df_pred['Electron_pt_1']['Nominal'] < 40)]

In [21]:
sel_two.type

8455 * {"Muon_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Muon_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "PV_x": {"Nominal": float64}, "PV_y": {"Nominal": float64}, "PV_z": {"Nominal": float64}, "Y": {"Nominal": float64, "Up": float64, "Down": float64}}

In [22]:
# Same two cuts (nominal Muon_pt_1 > 25 and nominal Electron_pt_1 < 40), combined with np.logical_and instead of &.
sel_two = df_pred[np.logical_and(df_pred['Muon_pt_1']['Nominal'] > 25, df_pred['Electron_pt_1']['Nominal'] < 40)]

In [23]:
sel_two.type

8455 * {"Muon_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Muon_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_1": {"Nominal": float64, "Up": float64, "Down": float64}, "Electron_pt_2": {"Nominal": float64, "Up": float64, "Down": float64}, "PV_x": {"Nominal": float64}, "PV_y": {"Nominal": float64}, "PV_z": {"Nominal": float64}, "Y": {"Nominal": float64, "Up": float64, "Down": float64}}