# Numpy Structured Arrays 4 Flashgg

As a natural follow-up to ```study_flip_systematics.ipynb``` (and as an alternative to ```awkward_flashgg.ipynb```), here we focus on data manipulation with **numpy** (structured arrays), taking care to implement, clearly and efficiently, all the operations that Flashgg is supposed to perform.

In [None]:
import numpy as np
import uproot
import pickle

In [None]:
# Open the ROOT file with uproot, fetch the 'Events' tree from it and call
# the tree's show() to inspect its content.
# `file` and `tree` stay available as notebook-level names.

file = uproot.open('test_sys_signal.root')
tree = file['Events']
tree.show()

In [None]:
# Useful variables

# Base names of the variables that come with systematic variations
variables_with_sys = [
    'Muon_pt_1',
    'Muon_pt_2',
    'Electron_pt_1',
    'Electron_pt_2',
]

# Variables for which only a single (nominal) value is read
variables_without_sys = [
    'PV_x',
    'PV_y',
    'PV_z',
]

# Suffixes appended to a base name to build the branch name of each variation
systematics = [
    '',
    '_Up',
    '_Down',
]

# Every base name, with-systematics ones first
variables = [*variables_with_sys, *variables_without_sys]

In [None]:
# Useful functions

# Record with one float32 field per entry of `systematics`: the empty suffix
# becomes the field 'Nominal', any other suffix loses its underscores
sys_type = np.dtype([
    (suffix.replace('_', '') if suffix != '' else 'Nominal', 'float32')
    for suffix in systematics
])

# Record with a single float32 field, used for variables without variations
nom_type = np.dtype([('Nominal', 'float32')])

# Layout of one table row: a `sys_type` record for each variable with
# systematics, followed by a `nom_type` record for each variable without
table_type = np.dtype(
    [(name, sys_type) for name in variables_with_sys]
    + [(name, nom_type) for name in variables_without_sys]
)

def get_branches(file_name, tree_name):
    """ Extract as a record type Awkward array the branches belonging to the tree "tree_name" inside "file_name".

    file_name: path handed to uproot.open
    tree_name: key of the tree inside the opened file

    Returns the result of tree.arrays(library='ak') for that tree.
    """
    # Open the file in a context manager so that its handle is released as
    # soon as the arrays have been read, instead of staying open indefinitely
    with uproot.open(file_name) as file:
        tree = file[tree_name]
        branches = tree.arrays(library='ak')
    return branches

def get_row_of_vars_with_sys(df, variables_with_sys, systematics, evt):
    """ Build, for a single event, one `sys_type` record per variable with systematics.

    df: container indexed as df[branch_name][evt]
    variables_with_sys: base names of the variables
    systematics: suffixes appended to each base name to get the branch names
    evt: index of the event to read

    Returns a list with one numpy record (dtype `sys_type`) per variable,
    in the same order as `variables_with_sys`.
    """
    row = []
    for base_name in variables_with_sys:
        # One value per variation, in the order given by `systematics`
        values = tuple(df[base_name + suffix][evt] for suffix in systematics)
        row.append(np.array(values, sys_type))
    return row

def get_row_of_vars_without_sys(df, variables_without_sys, evt):
    """ Build, for a single event, one `nom_type` record per variable without systematics.

    df: container indexed as df[branch_name][evt]
    variables_without_sys: names of the branches to read
    evt: index of the event to read

    Returns a list with one numpy record (dtype `nom_type`) per variable,
    in the same order as `variables_without_sys`.
    """
    return [
        np.array((df[name][evt],), nom_type)
        for name in variables_without_sys
    ]

def extract_and_manipulate(file_name, tree_name, variables_with_sys, variables_without_sys, systematics):
    """ Read a tree and repack the requested variables into a numpy structured array.

    file_name, tree_name: forwarded to get_branches
    variables_with_sys: base names of the variables read once per suffix
    variables_without_sys: names of the variables read once
    systematics: suffixes appended to the base names

    Returns a numpy array of dtype `table_type` with one element per event.
    """
    events = get_branches(file_name, tree_name)

    # Each row is a tuple of per-variable records: variables with
    # systematics first, then the ones without
    rows = [
        tuple(
            get_row_of_vars_with_sys(events, variables_with_sys, systematics, index)
            + get_row_of_vars_without_sys(events, variables_without_sys, index)
        )
        for index in range(len(events))
    ]

    return np.array(rows, dtype=table_type)
    

# The string below is never executed: it only keeps a compact variant of the
# table construction for reference (the trailing ';' is there so the notebook
# does not echo the string).
# NOTE(review): it refers to `branches`, `nominals_with_sys` and
# `nominals_without_sys`, none of which is defined in the visible cells --
# presumably earlier names for `input_df`, `variables_with_sys` and
# `variables_without_sys`; confirm before reusing it.
""" One line version

final = np.array([tuple([np.array(tuple([branches[name][i] for name in list(map(lambda suf: var + suf, systematics))]), sys_type) for var in nominals_with_sys]
                        + [np.array(tuple([branches[var][i]]), nom_type) for var in nominals_without_sys])
                  for i in range(len(branches))], dtype=table_type)
""";

In [None]:
%%time

# Build the structured table from the 'Events' tree of the test file
df = extract_and_manipulate(
    file_name='test_sys_signal.root',
    tree_name='Events',
    variables_with_sys=variables_with_sys,
    variables_without_sys=variables_without_sys,
    systematics=systematics,
)

In [None]:
df

## Tagger

In [None]:
class Tagger:
    """ Append the output of a classifier, evaluated once per systematic variation, to a structured array.

    clf: object exposing predict(X), where X is a 2D array with one row per
         event and one column per entry of `variables`
    variables: names of the fields of the input array used as features; each
               of them must contain a sub-field for every entry of `systematics`
    systematics: names of the sub-fields (variations) to evaluate
    prediction: name of the field added to the output array
    """

    def __init__(self, clf, variables, systematics, prediction):
        self.clf = clf
        self.variables = variables
        self.systematics = systematics
        self.prediction = prediction

    def predict(self, df):
        """ Return a copy of `df` with an extra field holding the predictions.

        df: numpy structured array containing at least the fields listed in
            `self.variables`

        Returns a new structured array with the same shape as `df`, all of
        its fields, plus a field named `self.prediction` made of one float32
        sub-field per entry of `self.systematics`.
        """
        # Record of the new column: one float32 per variation, named after
        # the variations this tagger was configured with
        prediction_type = np.dtype(
            [(variation, 'float32') for variation in self.systematics]
        )

        # Extend the input layout field by field (dtype.descr is not meant
        # to be fed back into np.dtype)
        new_dtype = np.dtype(
            [(name, df.dtype[name]) for name in df.dtype.names]
            + [(self.prediction, prediction_type)]
        )
        final_df = np.empty(df.shape, dtype=new_dtype)

        # Copy every field of the input, so that nothing in the output is
        # left uninitialized
        for name in df.dtype.names:
            final_df[name] = df[name]

        # Field access returns a view, so writing to it fills final_df
        prediction_col = final_df[self.prediction]
        for variation in self.systematics:
            # One row per event, one column per feature
            features = np.vstack(
                [df[var][variation] for var in self.variables]
            ).T
            prediction_col[variation] = self.clf.predict(features)

        return final_df

In [None]:
# NOTE: unpickling can execute arbitrary code, so 'classifier.pkl' must come
# from a trusted source.
# The context manager closes the file once the classifier has been loaded.
with open('classifier.pkl', 'rb') as classifier_file:
    bdt = pickle.load(classifier_file)

In [None]:
# Tagger evaluating `bdt` on the variables with systematics, once per
# variation, and storing the result in the field 'Y'
tagger = Tagger(
    clf=bdt,
    variables=variables_with_sys,
    systematics=['Nominal', 'Up', 'Down'],
    prediction='Y',
)

In [None]:
%%time

# `predict` returns a new structured array: the content of `df` plus the
# prediction field configured when `tagger` was created
df_pred = tagger.predict(df)

In [None]:
df_pred

## Selections

In [None]:
# Keep the events whose nominal 'Muon_pt_1' value is above 25
muon_pt_1_mask = df_pred['Muon_pt_1']['Nominal'] > 25
sel_one = df_pred[muon_pt_1_mask]

In [None]:
sel_one