# Awkward Array 4 Flashgg 

As a natural consequence of `study_flip_systematics.ipynb`, here we focus on data manipulation with **Awkward Array**, taking care to implement in a clear and efficient way all the operations that Flashgg is supposed to perform.

In [None]:
import uproot
import awkward as ak
import numpy as np
import pickle

In [None]:
# Open the ROOT file and print a summary of the branches of its "Events" TTree

# A context manager releases the file handle as soon as the inspection is done;
# later cells re-open the file themselves when they need the data.
with uproot.open('test_sys_signal.root') as file:
    tree = file['Events']
    tree.show()

In [None]:
# Branch names and systematic suffixes shared by the cells below

# Variables expected to have one branch per systematic variation
variables_with_sys = [
    'Muon_pt_1',
    'Muon_pt_2',
    'Electron_pt_1',
    'Electron_pt_2',
]
# Variables expected to exist only as a single (nominal) branch
variables_without_sys = [
    'PV_x',
    'PV_y',
    'PV_z',
]
# Suffix appended to a variable name for each variation ('' is the nominal one)
systematics = [
    '',
    '_Up',
    '_Down',
]

# Full list of variables: the ones with systematics first, then the others
variables = list(variables_with_sys)
variables.extend(variables_without_sys)

In [None]:
# Useful functions

def get_branches(file_name, tree_name):
    """Read every branch of a tree into a record-type Awkward array.

    Parameters
    ----------
    file_name : path of the ROOT file, passed to ``uproot.open``.
    tree_name : key of the tree inside the file.

    Returns
    -------
    The result of ``tree.arrays(library='ak')``, i.e. all the branches of
    the tree "tree_name" inside "file_name".
    """
    # Read everything while the file is open, then let the context manager
    # close the handle instead of leaking it.
    with uproot.open(file_name) as root_file:
        return root_file[tree_name].arrays(library='ak')
    
def get_systematics_record(event, var, systematics):
    """Collect the values of one variable for every available systematic.

    For each suffix in ``systematics`` the field ``var + suffix`` is looked up
    in ``event.fields``; fields that do not exist are silently skipped.

    Parameters
    ----------
    event : object exposing ``fields`` and item access by field name
        (e.g. one row of a record-type Awkward array).
    var : base name of the variable.
    systematics : iterable of suffixes; the empty string is the nominal one.

    Returns
    -------
    dict mapping a label to ``event[var + suffix]``. The label is "Nominal"
    for the empty suffix, otherwise the suffix with underscores removed
    ("_Up" -> "Up").
    """
    record = {}
    for suffix in systematics:
        # The label is the suffix stripped of underscores, except for nominal
        label = "Nominal" if suffix == "" else suffix.replace("_", "")
        field = "{}{}".format(var, suffix)
        if field not in event.fields:
            continue
        record[label] = event[field]
    return record

def get_variables_record(event, variables, systematics):
    """Build the nested {variable: {systematic: value}} dictionary of an event.

    Parameters
    ----------
    event : the event to read, forwarded to ``get_systematics_record``.
    variables : iterable of base variable names; each becomes a key.
    systematics : iterable of systematic suffixes, forwarded unchanged.

    Returns
    -------
    dict whose keys are the variables and whose values are the dictionaries
    returned by ``get_systematics_record`` for that variable.
    """
    return {
        name: get_systematics_record(event, name, systematics)
        for name in variables
    }

def extract_and_manipulate(file_name, tree_name, variables, systematics):
    """Read a tree and rearrange it as an Awkward array of nested records.

    The branches are loaded with ``get_branches``; every event is then turned
    into a {variable: {systematic: value}} dictionary by
    ``get_variables_record`` and the list of dictionaries is wrapped in an
    ``ak.Array``.

    Parameters
    ----------
    file_name, tree_name : forwarded to ``get_branches``.
    variables, systematics : forwarded to ``get_variables_record``.
    """
    events = get_branches(file_name, tree_name)
    # NOTE: this is a Python-level loop, one iteration per event
    records = []
    for evt in events:
        records.append(get_variables_record(evt, variables, systematics))
    return ak.Array(records)

In [None]:
%%time

# Build the nested array: one record per event, indexed as df[variable][systematic label]
df = extract_and_manipulate('test_sys_signal.root', 'Events', variables, systematics)

In [None]:
# Inspect the type of the nested array built above
df.type

## Tagger

In [None]:
class Tagger:
    """Run a classifier on a nested array once per systematic variation.

    Attributes
    ----------
    clf : object exposing ``predict``; it is called with the transposed
        feature matrix.
    variables : names of the fields used as input features.
    systematics : labels used to index each variable (``df[var][label]``).
    prediction : name of the field in which the predictions are stored.
    """

    def __init__(self, clf, variables, systematics, prediction):
        self.clf, self.variables = clf, variables
        self.systematics, self.prediction = systematics, prediction

    def predict(self, df):
        """Add the predictions for every systematic to ``df`` and return it.

        ``df`` is modified in place: the field ``self.prediction`` is set to
        a record with one entry per systematic label.
        """
        predictions = {}
        for label in self.systematics:
            # One row per variable; transposed below before calling the classifier
            # (presumably giving one row per event -- TODO confirm that
            # df[var][label] is a flat array)
            features = np.array([df[var][label] for var in self.variables])
            scores = self.clf.predict(features.T)
            predictions[label] = ak.from_numpy(scores)

        df[self.prediction] = ak.Array(predictions)
        return df

In [None]:
# Load the pickled classifier, closing the file once it has been read.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
with open('classifier.pkl', 'rb') as clf_file:
    bdt = pickle.load(clf_file)

In [None]:
# The labels must match the keys produced by get_systematics_record
# ('' -> 'Nominal', '_Up' -> 'Up', '_Down' -> 'Down'); 'Y' is the output field name
tagger = Tagger(bdt, variables_with_sys, ['Nominal', 'Up', 'Down'], 'Y')

In [None]:
%%time

# predict() assigns the new field on df itself and returns it, so df_pred is df
df_pred = tagger.predict(df)

In [None]:
# Inspect the type after the prediction field has been added
df_pred.type

## Selections

In [None]:
# Boolean-mask selection: keep the entries whose nominal Muon_pt_1 is above 25
sel_one = df_pred[df_pred['Muon_pt_1']['Nominal'] > 25]

In [None]:
# Inspect the type of the selected array
sel_one.type

In [None]:
# Combine two cuts with the element-wise & operator
# (each comparison needs its own parentheses because & binds tighter than > and <)
sel_two = df_pred[(df_pred['Muon_pt_1']['Nominal'] > 25) & (df_pred['Electron_pt_1']['Nominal'] < 40)]

In [None]:
# Inspect the type of the array after both cuts
sel_two.type

In [None]:
# The same two cuts, combined with np.logical_and instead of the & operator
sel_two = df_pred[np.logical_and(df_pred['Muon_pt_1']['Nominal'] > 25, df_pred['Electron_pt_1']['Nominal'] < 40)]

In [None]:
# Inspect the type of the array selected with np.logical_and
sel_two.type

# Alternative: Awkward without structured array-like configuration

In [None]:
%%time

# Read all the branches as they are stored (flat fields such as 'Muon_pt_1_Up'),
# closing the file as soon as the arrays have been loaded
with uproot.open('test_sys_signal.root') as file:
    tree = file['Events']
    df = tree.arrays(library='ak')

In [None]:
# Inspect the type of the array read directly from the tree
df.type

In [None]:
class Tagger:
    """Run a classifier on suffix-named fields, once per systematic variation.

    Attributes
    ----------
    clf : object exposing ``predict``; it is called with the transposed
        feature matrix.
    variables : base names of the fields used as input features.
    systematics_dict : mapping whose values are the suffixes appended to the
        variable names and to the prediction name (the keys are not used).
    prediction : base name of the fields in which predictions are stored.
    """

    def __init__(self, clf, variables, systematics_dict, prediction):
        self.clf, self.variables = clf, variables
        self.systematics_dict, self.prediction = systematics_dict, prediction

    def predict(self, df):
        """Store ``clf.predict`` results in ``df`` for each suffix and return ``df``.

        ``df`` is modified in place: for every suffix the field
        ``self.prediction + suffix`` is set to the classifier output computed
        from the fields ``var + suffix``.
        """
        for suffix in self.systematics_dict.values():
            columns = [df[name + suffix] for name in self.variables]
            # One row per variable before the transpose
            inputs = np.array(columns).T
            df[self.prediction + suffix] = self.clf.predict(inputs)
        return df

In [None]:
# Map each systematic label to the suffix used in the branch names;
# the Tagger only reads the suffixes (the dictionary values)
systematics_dict = {'Nominal': '', 'Up': '_Up', 'Down': '_Down'}
tagger = Tagger(bdt, variables_with_sys, systematics_dict, 'Y')

In [None]:
%%time

# predict() adds the fields 'Y' + suffix to df itself and returns it, so df_pred is df
df_pred = tagger.predict(df)

In [None]:
# Inspect the type after the prediction fields have been added
df_pred.type