In [1]:
import warnings
# Suppress every Python warning for the rest of the session; note that this also
# hides deprecation notices emitted by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
from trainlib.FileCollection import FileCollection
from trainlib.config import Config
import trainlib.cuts as cuts
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import copy
import re
from scipy import interpolate
import scipy.integrate as integrate

Welcome to JupyROOT 6.10/09


In [3]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error



In [4]:
# Scalar (one value per event) branches offered to every classifier.
# Not used at the moment: "costhetastar", "helphi", "helcosthetaZ1", "helcosthetaZ2",
# "phistarZ1", "phistarZ2", "xi", "xistar".
candidate_branches = [
    "PFMET",
    "nCleanedJetsPt30",
    "nCleanedJetsPt30BTagged_bTagSF",
    "nExtraLep",
    "ZZMass",
    "nExtraZ",
    "Z1Mass",
    "Z2Mass",
    "Z1Pt",
    "Z2Pt",
    "ZZMassErr",
    "ZZPt",
    "ZZEta",
    "ZZPhi",
    "Z1Flav",
    "Z2Flav",
]

# Optional "_ME" branches; the importance functions only offer them to categories
# whose name carries a matching jet-multiplicity tag.
MELA_branches = [
    "D_VBF2j_ggH_ME",
    "D_VBF1j_ggH_ME",
    "D_WHh_ggH_ME",
    "D_ZHh_ggH_ME",
    "D_WHh_ZHh_ME",
    "D_VBF2j_WHh_ME",
    "D_VBF2j_ZHh_ME",
]

# Prefixes of the list-valued branches that get unrolled into per-object columns
# ("Lep" is currently disabled).
list_branches = [
    "Jet",
    "ExtraLep",
]

# Pt thresholds paired position by position with list_branches (prepare_data zips
# the two lists, so the surplus third entry is ignored while "Lep" is disabled).
pt_limits = [
    30.0,
    0.0,
    0.0,
]

In [5]:
# Complete list of branches requested from the input files (passed as
# `read_branches` to get_data by the functions below).
allbranches = (
    ["JetPt", "JetEta", "JetPhi"]
    + ["LepPt", "LepEta", "LepPhi"]
    + ["ExtraLepPt", "ExtraLepEta", "ExtraLepPhi"]
    + candidate_branches
    + MELA_branches
    + ["LHEAssociatedParticleId", "GenAssocLep1Id", "GenAssocLep2Id", "training_weight"]
)

In [6]:
# Base directory of the input samples; the `collections` dictionary prepends it to
# every "<sample>/ZZ4lAnalysis.root" file name.
# NOTE(review): absolute, site-specific path -- adjust when running elsewhere.
#MC_path = "/data_CMS/cms/wind/CJLST_NTuples_randomizeda/"
MC_path = "/data_CMS/cms/wind/CJLST_NTuples/"

In [7]:
def WHhadr0j_cut(row):
    """Accept rows that pass cuts.WHhadr_cut and have nCleanedJetsPt30 equal to 0."""
    accepted = cuts.WHhadr_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] == 0
    return accepted

def WHhadr01j_cut(row):
    """Accept rows that pass cuts.WHhadr_cut and have nCleanedJetsPt30 equal to 0 or 1."""
    accepted = cuts.WHhadr_cut(row)
    if accepted:
        n_jets = row["nCleanedJetsPt30"]
        accepted = n_jets == 0 or n_jets == 1
    return accepted

def WHhadr1j_cut(row):
    """Accept rows that pass cuts.WHhadr_cut and have nCleanedJetsPt30 equal to 1."""
    accepted = cuts.WHhadr_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] == 1
    return accepted

def WHhadr2j_cut(row):
    """Accept rows that pass cuts.WHhadr_cut and have nCleanedJetsPt30 of at least 2."""
    accepted = cuts.WHhadr_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] >= 2
    return accepted

def ZHhadr0j_cut(row):
    """Accept rows that pass cuts.ZHhadr_cut and have nCleanedJetsPt30 equal to 0."""
    accepted = cuts.ZHhadr_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] == 0
    return accepted

def ZHhadr01j_cut(row):
    """Accept rows that pass cuts.ZHhadr_cut and have nCleanedJetsPt30 equal to 0 or 1."""
    accepted = cuts.ZHhadr_cut(row)
    if accepted:
        n_jets = row["nCleanedJetsPt30"]
        accepted = n_jets == 0 or n_jets == 1
    return accepted

def ZHhadr1j_cut(row):
    """Accept rows that pass cuts.ZHhadr_cut and have nCleanedJetsPt30 equal to 1."""
    accepted = cuts.ZHhadr_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] == 1
    return accepted

def ZHhadr2j_cut(row):
    """Accept rows that pass cuts.ZHhadr_cut and have nCleanedJetsPt30 of at least 2."""
    accepted = cuts.ZHhadr_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] >= 2
    return accepted

def mZZ0j_cut(row):
    """Accept rows that pass cuts.mZZ_cut and have nCleanedJetsPt30 equal to 0."""
    accepted = cuts.mZZ_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] == 0
    return accepted

def mZZ01j_cut(row):
    """Accept rows that pass cuts.mZZ_cut and have nCleanedJetsPt30 equal to 0 or 1."""
    accepted = cuts.mZZ_cut(row)
    if accepted:
        n_jets = row["nCleanedJetsPt30"]
        accepted = n_jets == 0 or n_jets == 1
    return accepted

def mZZ1j_cut(row):
    """Accept rows that pass cuts.mZZ_cut and have nCleanedJetsPt30 equal to 1."""
    accepted = cuts.mZZ_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] == 1
    return accepted

def mZZ2j_cut(row):
    """Accept rows that pass cuts.mZZ_cut and have nCleanedJetsPt30 of at least 2."""
    accepted = cuts.mZZ_cut(row)
    if accepted:
        accepted = row["nCleanedJetsPt30"] >= 2
    return accepted

In [8]:
# one input file per simulated sample
_vbf_file = MC_path + "VBFH125/ZZ4lAnalysis.root"
_ggh_file = MC_path + "ggH125/ZZ4lAnalysis.root"
_wplus_file = MC_path + "WplusH125/ZZ4lAnalysis.root"
_wminus_file = MC_path + "WminusH125/ZZ4lAnalysis.root"
_zh_file = MC_path + "ZH125/ZZ4lAnalysis.root"
_tth_file = MC_path + "ttH125/ZZ4lAnalysis.root"

# category name -> {input file: cut function applied to the rows of that file}
collections = {
    # VBFH125 sample
    "VBF2j": {_vbf_file: mZZ2j_cut},
    "VBF1j": {_vbf_file: mZZ1j_cut},
    "VBF0j": {_vbf_file: mZZ0j_cut},
    "VBF01j": {_vbf_file: mZZ01j_cut},
    "VBF": {_vbf_file: cuts.mZZ_cut},

    # ggH125 sample
    "ggH2j": {_ggh_file: mZZ2j_cut},
    "ggH1j": {_ggh_file: mZZ1j_cut},
    "ggH0j": {_ggh_file: mZZ0j_cut},
    "ggH01j": {_ggh_file: mZZ01j_cut},
    "ggH": {_ggh_file: cuts.mZZ_cut},

    # WplusH125 and WminusH125 samples, always used together
    "WHh2j": {_wplus_file: WHhadr2j_cut, _wminus_file: WHhadr2j_cut},
    "WHh1j": {_wplus_file: WHhadr1j_cut, _wminus_file: WHhadr1j_cut},
    "WHh0j": {_wplus_file: WHhadr0j_cut, _wminus_file: WHhadr0j_cut},
    "WHh": {_wplus_file: cuts.WHhadr_cut, _wminus_file: cuts.WHhadr_cut},
    "WHh01j": {_wplus_file: WHhadr01j_cut, _wminus_file: WHhadr01j_cut},
    "WHl": {_wplus_file: cuts.WHlept_cut, _wminus_file: cuts.WHlept_cut},

    # ZH125 sample
    "ZHh2j": {_zh_file: ZHhadr2j_cut},
    "ZHh1j": {_zh_file: ZHhadr1j_cut},
    "ZHh01j": {_zh_file: ZHhadr01j_cut},
    "ZHh0j": {_zh_file: ZHhadr0j_cut},
    "ZHh": {_zh_file: cuts.ZHhadr_cut},
    "ZHl": {_zh_file: cuts.ZHlept_cut},
    "ZHMET": {_zh_file: cuts.ZHMET_cut},

    # ttH125 sample
    "ttHh": {_tth_file: cuts.ttHhadr_cut},
    "ttHl": {_tth_file: cuts.ttHlept_cut},
}

In [9]:
# all the model combinations for which neural networks are currently trained;
# every name must be a key of `collections`
discriminant_pairs = [
    # jet-binned categories against ggH
    ("VBF2j", "ggH2j"), ("VBF1j", "ggH1j"), ("VBF0j", "ggH0j"),
    ("WHh2j", "ggH2j"), ("WHh1j", "ggH1j"), ("WHh0j", "ggH0j"),
    ("ZHh2j", "ggH2j"), ("ZHh1j", "ggH1j"), ("ZHh0j", "ggH0j"),
    # jet-binned categories against each other
    ("WHh2j", "ZHh2j"), ("WHh01j", "ZHh01j"),
    ("VBF2j", "WHh2j"), ("VBF1j", "WHh1j"), ("VBF0j", "WHh0j"),
    # the last pair used to read "ZHh0h", which is not a key of `collections`
    ("VBF2j", "ZHh2j"), ("VBF1j", "ZHh1j"), ("VBF0j", "ZHh0j"),
    # inclusive categories
    ("WHl", "ggH"), ("WHl", "VBF"), ("WHl", "WHh"), ("WHl", "ZHh"),
    ("WHl", "ZHl"), ("WHl", "ZHMET"), ("WHl", "ttHh"), ("WHl", "ttHl"),
    ("ZHh", "ZHl"), ("ZHh", "ZHMET"), ("ZHh", "ttHh"), ("ZHh", "ttHl"),
    ("ZHl", "ggH"), ("ZHl", "VBF"), ("ZHl", "WHh"), ("ZHl", "ZHMET"),
    ("ZHl", "ttHh"), ("ZHl", "ttHl"),
    ("ZHMET", "ggH"), ("ZHMET", "VBF"), ("ZHMET", "WHh"), ("ZHMET", "ttHh"),
    ("ZHMET", "ttHl"),
    ("ttHh", "ggH"), ("ttHh", "VBF"), ("ttHh", "WHh"), ("ttHh", "ttHl"),
    ("ttHl", "ggH"), ("ttHl", "VBF"), ("ttHl", "WHh"),
]

In [10]:
def extract_order(df, col_basename, sorted_column, columns, order):
    """
    Pick, for every row of `df`, the entry ranked `order` in a family of list-valued columns.

    The columns `col_basename + suffix` hold one sequence per row. Within each row the
    entries are ranked by descending value of `col_basename + sorted_column` (rank 0 is
    the largest), and the entry with rank `order` is read from each requested column.

    Arguments:
        df: DataFrame holding the list-valued columns.
        col_basename: common prefix of the column family, e.g. "Jet".
        sorted_column: suffix of the column that defines the ranking, e.g. "Pt".
        columns: suffixes of the columns to extract, e.g. ["Pt", "Eta", "Phi"].
        order: zero-based rank of the entry to extract.

    Returns:
        DataFrame with the index of `df` and one column named
        `col_basename + suffix + "_" + str(order)` per entry of `columns`. Rows whose
        sequences hold `order` or fewer entries are filled with 0.
    """
    def ranked_position(values):
        # position of the entry with the requested rank, or -1 if the row is too short
        if order >= len(values):
            return -1
        return np.flipud(np.argsort(values))[order]

    # Plain iteration over the column (instead of a row-wise DataFrame.transform with
    # raw=True) works the same for every pandas version and frame dtype layout.
    positions = [ranked_position(values) for values in df[col_basename + sorted_column]]

    names = []
    extracted = {}
    for column in columns:
        name = col_basename + column + "_" + str(order)
        names.append(name)
        extracted[name] = [0 if position == -1 else values[position]
                           for position, values in zip(positions, df[col_basename + column])]

    return pd.DataFrame(extracted, index = df.index, columns = names)

In [11]:
def prepare_data(df, col_basenames, sorted_column, columns, orders, pt_limits):
    """
    Unroll list-valued column families into fixed per-rank columns.

    For every prefix in `col_basenames` (paired position by position with `pt_limits`)
    and every rank in `orders`, extract_order is called; rows whose extracted
    "<prefix>Pt_<rank>" value lies below the paired limit are set to 0.0 in all columns
    of that extraction. The pieces are joined side by side.

    Returns:
        DataFrame containing the extracted columns of all prefixes and ranks.
    """
    combined = pd.DataFrame()
    for basename, threshold in zip(col_basenames, pt_limits):
        for rank in orders:
            piece = extract_order(df, basename, sorted_column, columns, rank)

            # blank the whole row of this piece when its Pt is below the threshold
            pt_name = basename + "Pt_" + str(rank)
            too_soft = piece[pt_name] < threshold
            piece[too_soft] = 0.0

            combined = pd.concat([combined, piece], axis = 1)

    return combined

In [12]:
def get_data(H1_coll, H0_coll, read_branches, input_branches, list_branches, pt_limits):
    """
    Load two collections and reduce them to the columns used as classifier inputs.

    Both collections are read through their get_data method (called with
    `read_branches`, 0.0 and 1.0). The list-valued branches named by `list_branches`
    are unrolled with prepare_data into Pt/Eta/Phi columns of the two highest-Pt
    objects, and each frame is restricted to `input_branches` plus those unrolled
    columns.

    Returns:
        (H1_df, H0_df), both with identical columns in identical order.
    """
    frames = [coll.get_data(read_branches, 0.0, 1.0) for coll in (H1_coll, H0_coll)]
    unrolled = [prepare_data(frame, list_branches, "Pt", ["Pt", "Eta", "Phi"], range(2), pt_limits)
                for frame in frames]

    # the H1 frame defines the names of the unrolled columns for both outputs
    selected = np.concatenate([input_branches, unrolled[0].columns])

    H1_df, H0_df = [pd.concat([frame, extra], axis = 1)[selected]
                    for frame, extra in zip(frames, unrolled)]
    return H1_df, H0_df

In [13]:
def get_data_dmatrix(H1_coll, H0_coll, read_branches, input_branches, list_branches, pt_limits):
    """
    Build the labelled xgboost DMatrix for a pair of collections.

    The frames returned by get_data are stacked with the H1 rows first; H1 rows are
    labelled 1 and H0 rows 0. The column names of the H1 frame become the feature
    names. The number and names of the input variables are printed.

    Returns:
        xgb.DMatrix holding the stacked data, the labels and the feature names.
    """
    H1_df, H0_df = get_data(H1_coll, H0_coll, read_branches, input_branches, list_branches, pt_limits)

    complete_input_branches = H1_df.columns
    # single-argument print(...) behaves identically under Python 2 and Python 3
    print("number of input variables: " + str(len(complete_input_branches)))
    print("final list of inputs: " + str(complete_input_branches))

    # `.values` replaces `.as_matrix()`, which newer pandas releases no longer provide
    H1_data = H1_df.values
    H0_data = H0_df.values

    target = np.concatenate([np.ones(np.shape(H1_data)[0]), np.zeros(np.shape(H0_data)[0])])
    data = np.concatenate([H1_data, H0_data])

    # pass a plain list: not every xgboost version accepts a pandas Index here
    dmatrix = xgb.DMatrix(data, label = target, feature_names = list(complete_input_branches))

    return dmatrix

In [14]:
def get_feature_correlation(source, corr_branches, mandatory_branches, optional_branches, list_branches, pt_limits):
    """
    Draw the correlation matrix of `corr_branches` for one category.

    Arguments:
        source: key of `collections` naming the category.
        corr_branches: list of column names whose pairwise correlations are drawn.
        mandatory_branches: branches that are always loaded.
        optional_branches: branches loaded only if they share a jet tag
            ("0j", "1j" or "2j") with `source`.
        list_branches, pt_limits: forwarded to get_data.

    Returns:
        The matplotlib figure showing the matrix (colour scale fixed to [-1, 1]).
    """
    # events in the fraction range 0.0 - 0.5 of this category
    coll = FileCollection(collections[source], 0.0, 0.5)

    # NOTE(review): the feature-importance functions select optional branches with a
    # different rule for "2j" categories (everything without "1j"); confirm which is intended.
    matching_tags = [tag for tag in ("0j", "1j", "2j") if tag in source]
    input_branches = list(mandatory_branches)
    for optional_branch in optional_branches:
        for tag in matching_tags:
            if tag in optional_branch:
                input_branches.append(optional_branch)

    # the same collection is passed for both slots; only the first frame is needed
    df = get_data(coll, coll, allbranches, input_branches, list_branches, pt_limits)[0]
    correlations = df[corr_branches].corr()

    fig = plt.figure(figsize=(10,15))
    ax = fig.add_subplot(111)
    fig.colorbar(ax.matshow(correlations, vmin = -1.0, vmax = 1.0, cmap = "RdBu"))

    # one major tick per matrix row / column
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    # the leading '' absorbs the first major tick -- presumably the one just outside
    # the matrix; TODO confirm
    tick_labels = [''] + corr_branches
    ax.set_yticklabels(tick_labels)
    ax.set_xticklabels(tick_labels, rotation = 'vertical')

    return fig

In [15]:
def get_interpolating_function(data, bins):
    """
    Turn a histogram into a function of a scalar `x` by linear interpolation.

    Arguments:
        data: bin contents, one value per bin.
        bins: bin edges, one more than there are bins.

    Returns:
        A function of one scalar argument. Strictly between the first and the last
        bin centre it returns the linear interpolation of the bin contents (taken at
        the bin centres); everywhere else it returns 0. Histograms with fewer than
        two bins leave no interval to interpolate on, so the returned function is 0
        everywhere.
    """
    edges = np.asarray(bins, dtype = float)
    bin_centers = 0.5 * (edges[:-1] + edges[1:])

    if len(bin_centers) < 2:
        # empty or single-bin histogram: interp1d cannot be built on it
        return lambda x: 0

    intf = interpolate.interp1d(bin_centers, data, kind = "linear")
    lower, upper = bin_centers[0], bin_centers[-1]

    return lambda x: intf(x) if lower < x < upper else 0

In [16]:
def get_binned_data(df, branch):
    """
    Histogram one column of `df`, weighted by "training_weight" and normalised to unit area.

    The bin width is 2 * IQR / n**0.33 (close to the Freedman-Diaconis rule), but never
    smaller than 0.005. The bins start at the smallest value and extend to at least
    the largest one.

    Arguments:
        df: DataFrame containing `branch` and a "training_weight" column.
        branch: name of the column to histogram.

    Returns:
        The (bin contents, bin edges) tuple produced by np.histogram.
    """
    # `.values` replaces `.as_matrix()`, which newer pandas releases no longer provide
    data = df[branch].values
    weights = df["training_weight"].values

    # set the bin width
    q75, q25 = np.percentile(data, [75, 25])
    bin_width = max(2 * (q75 - q25) / len(data)**0.33, 0.005)

    bins = np.arange(np.min(data), np.max(data) + bin_width, bin_width)

    # scale the weights such that the histogram integrates to one
    weights = weights / (np.sum(weights) * bin_width)

    return np.histogram(data, bins = bins, weights = weights)

In [62]:
def get_feature_importance_list_separation(disc_pair, mandatory_branches, optional_branches, list_branches, pt_limits):
    """
    Rank the input variables of a category pair by the separation of their distributions.

    For every input column the normalised histograms of the two categories are
    interpolated and the integral of (H1 + H0) * (H1 - H0)**2 over the common range
    is taken as separation. Columns for which either histogram has no bins get a
    separation of 0. The separations are printed and finally normalised to sum to
    one (skipped if they are all zero).

    Arguments:
        disc_pair: (H1_name, H0_name), both keys of `collections`.
        mandatory_branches: branches that are always used.
        optional_branches: branches used only if the jet tag in the category names
            permits it.
        list_branches, pt_limits: forwarded to get_data.

    Returns:
        (None, None, implist) with implist mapping column name -> normalised
        separation; the two leading None values mirror the return shape of
        get_feature_importance_list_BDT.
    """
    H1_name = disc_pair[0]
    H0_name = disc_pair[1]

    input_branches = [branch for branch in mandatory_branches]

    for optional_branch in optional_branches:
        if ("0j" in H1_name or "0j" in H0_name) and ("0j" in optional_branch):
            input_branches.append(optional_branch)
        elif ("1j" in H1_name or "1j" in H0_name) and ("1j" in optional_branch):
            input_branches.append(optional_branch)
        elif ("2j" in H1_name or "2j" in H0_name) and not ("1j" in optional_branch):
            input_branches.append(optional_branch)

    # needed to build the histograms
    input_branches.append("training_weight")

    H1_coll = FileCollection(collections[H1_name], 0.0, 0.5)
    H0_coll = FileCollection(collections[H0_name], 0.0, 0.5)

    H1_df, H0_df = get_data(H1_coll, H0_coll, allbranches, input_branches, list_branches, pt_limits)

    implist = {}
    for branch in H1_df.columns:
        # the weight column is not an input variable
        if "training_weight" in branch:
            continue

        data_H1, bins_H1 = get_binned_data(H1_df, branch)
        data_H0, bins_H0 = get_binned_data(H0_df, branch)

        # both histograms need at least one bin before they can be compared
        if len(data_H1) > 0 and len(data_H0) > 0:
            H1_func = get_interpolating_function(data_H1, bins_H1)
            H0_func = get_interpolating_function(data_H0, bins_H0)

            # compute the separation in this branch
            all_edges = np.concatenate([bins_H0, bins_H1])
            global_min = np.min(all_edges)
            global_max = np.max(all_edges)

            separation_func = lambda x: (H1_func(x) + H0_func(x)) * (H1_func(x) - H0_func(x))**2

            sep = integrate.quad(separation_func, global_min, global_max)[0]
        else:
            sep = 0.0

        print("separation for " + branch + " = " + str(sep))
        implist[branch] = sep

    # normalize the importance list (nothing to do if no variable separates at all)
    impsum = sum(implist.values())
    if impsum != 0:
        for key in list(implist):
            implist[key] /= impsum

    return None, None, implist

In [61]:
def get_feature_importance_list_BDT(disc_pair, mandatory_branches, optional_branches, list_branches, pt_limits):
    """
    Rank the input variables of a category pair by how often a BDT splits on them.

    BDTs with a maximum tree depth of 1 to 7 are trained on the event fraction
    0.0 - 0.5 of both categories and evaluated on the fraction 0.5 - 1.0. The feature
    scores (Booster.get_fscore) of the depth with the lowest validation RMSE are
    normalised to sum to one.

    Arguments:
        disc_pair: (H1_name, H0_name), both keys of `collections`.
        mandatory_branches: branches that are always offered to the BDT.
        optional_branches: branches offered only if the jet tag in the category names
            permits it.
        list_branches, pt_limits: forwarded to get_data_dmatrix.

    Returns:
        (best_params, feature_names, used_variables): the xgboost parameters of the
        best depth, the feature names of the training matrix, and a dict mapping each
        used variable to its normalised score.

    Raises:
        RuntimeError: if no tree depth yields a validation loss below the initial bound.
    """
    H1_name = disc_pair[0]
    H0_name = disc_pair[1]

    # first assemble the list of branches that can serve as input: it will *always* contain the mandatory branches,
    # and *can* contain some of the optional branches, if the name of the categories allows it.
    # The fully inclusive categories (i.e. those with NO "xxj" in their name) can not use MELA, since there may
    # be events with low number of jets contained.
    input_branches = [branch for branch in mandatory_branches]

    for optional_branch in optional_branches:
        if ("0j" in H1_name or "0j" in H0_name) and ("0j" in optional_branch):
            input_branches.append(optional_branch)
        elif ("1j" in H1_name or "1j" in H0_name) and ("1j" in optional_branch):
            input_branches.append(optional_branch)
        elif ("2j" in H1_name or "2j" in H0_name) and not ("1j" in optional_branch):
            input_branches.append(optional_branch)

    # get the training data for the BDT ...
    H1_coll_train = FileCollection(collections[H1_name], 0.0, 0.5)
    H0_coll_train = FileCollection(collections[H0_name], 0.0, 0.5)

    dtrain = get_data_dmatrix(H1_coll_train, H0_coll_train, allbranches, input_branches, list_branches, pt_limits)

    # ... and the validation data as well
    H1_coll_val = FileCollection(collections[H1_name], 0.5, 1.0)
    H0_coll_val = FileCollection(collections[H0_name], 0.5, 1.0)
    dval = get_data_dmatrix(H1_coll_val, H0_coll_val, allbranches, input_branches, list_branches, pt_limits)

    evallist = [(dtrain, 'train'), (dval, 'eval')]

    # perform the training
    # try different tree depths and choose the one that gives the best RMSE (i.e. avoid too deep trees to start with)
    params = {'eta': 0.01, 'silent': 1, 'gamma': 0.5, 'objective': 'binary:logistic'}
    params['nthread'] = 4
    params['eval_metric'] = 'rmse'
    max_num_rounds = 2000

    best_loss = 1e6
    best_imp = None
    best_params = None
    for tree_depth in range(1,8):
        params['max_depth'] = tree_depth

        bst = xgb.train(params, dtrain, max_num_rounds, evals = evallist, early_stopping_rounds = 10, verbose_eval = False)

        # NOTE(review): depending on the xgboost version, the booster returned after
        # early stopping may be the last rather than the best iteration -- confirm.
        pred = bst.predict(dval)
        cur_loss = np.sqrt(mean_squared_error(pred, dval.get_label()))
        cur_imp = bst.get_fscore()

        print("for max_depth = " + str(params['max_depth']) + ": loss = " + str(cur_loss))

        if cur_loss < best_loss:
            best_loss = cur_loss
            best_imp = copy.copy(cur_imp)
            # `params` is modified in every iteration, so keep a snapshot
            best_params = copy.copy(params)

    if best_imp is None:
        # e.g. every loss was NaN; fail with a readable message instead of an
        # attribute error on None further down
        raise RuntimeError("no tree depth gave a usable validation loss for " + str(disc_pair))

    # normalize the usage score w.r.t. the total score (i.e. sum of all individuals)
    score_sum = sum(best_imp.values())
    ranked_scores = sorted(best_imp.items(), key = lambda x: x[1], reverse = True)
    used_variables = {key: val / float(score_sum) for key, val in ranked_scores}
    return best_params, dtrain.feature_names, used_variables

In [19]:
def get_histogram(df, branch, label):
    """
    Draw the weighted, unit-area histogram of one column into the current matplotlib axes.

    Binning and normalisation follow get_binned_data: the bin width is
    2 * IQR / n**0.33 with a lower bound of 0.005, and the "training_weight" column
    is scaled such that the histogram integrates to one.

    Arguments:
        df: DataFrame containing `branch` and a "training_weight" column.
        branch: name of the column to draw.
        label: legend label of the histogram.

    Returns:
        The value returned by plt.hist.
    """
    # `.values` replaces `.as_matrix()`, which newer pandas releases no longer provide
    data = df[branch].values
    weights = df["training_weight"].values

    # set the bin width
    q75, q25 = np.percentile(data, [75, 25])
    bin_width = max(2 * (q75 - q25) / len(data)**0.33, 0.005)

    bins = np.arange(np.min(data), np.max(data) + bin_width, bin_width)

    # scale the weights such that the histogram integrates to one
    weights = weights / (np.sum(weights) * bin_width)

    return plt.hist(data, bins = bins, weights = weights, alpha = 0.5, label = label)

In [20]:
def plot_branch(disc_pair, branch, start_fraction = 0.0, end_fraction = 1.0):
    """
    Show the normalised distributions of `branch` for both categories of `disc_pair`.

    Arguments:
        disc_pair: (H1_name, H0_name), both keys of `collections`.
        branch: column to draw; may be an unrolled column such as "JetEta_0".
        start_fraction, end_fraction: forwarded to FileCollection for both categories.
    """
    names = (disc_pair[0], disc_pair[1])
    colls = [FileCollection(collections[name], start_fraction, end_fraction) for name in names]

    # all branches are both read and kept, so any of them can be plotted
    frames = get_data(colls[0], colls[1], allbranches, allbranches, list_branches, pt_limits)

    plt.figure()
    for frame, name in zip(frames, names):
        get_histogram(frame, branch, name)

    plt.legend(loc = 'upper right')
    plt.show()

In [21]:
def plot_variables(discs):
    """
    Train the ranking BDT for every pair in `discs` and plot the variable importances.

    Arguments:
        discs: non-empty list of (H1_name, H0_name) pairs, all keys of `collections`.

    Returns:
        (fig, sorted_implist): the figure with one column per pair and one row per
        variable, and the (variable, importance) list of the LAST pair in `discs`,
        sorted by decreasing importance.

    Raises:
        ValueError: if `discs` is empty.
    """
    if len(discs) == 0:
        raise ValueError("plot_variables needs at least one discriminant pair")

    frames = []
    for disc in discs:
        _, _, implist = get_feature_importance_list_BDT(disc, candidate_branches, MELA_branches, list_branches, pt_limits)

        # keep only the variables with a non-zero importance
        cutimplist = {key: val for key, val in implist.items() if val > 0.00}
        frames.append(pd.DataFrame(cutimplist, index = [len(frames)]))

    # one row per pair; variables a pair did not use get an importance of 0
    plotframe = pd.concat(frames).fillna(0.0)

    print(plotframe)
    print("number of pre-selected input variables = " + str(len(plotframe.columns)))

    # start the plotting
    parameters = plotframe.columns
    # `.values` replaces `.as_matrix()`, which newer pandas releases no longer provide
    plotdata = np.transpose(plotframe.values)

    plt.close('all')
    fig = plt.figure(figsize=(10,15))
    ax = fig.add_subplot(111)
    cax = ax.matshow(plotdata, cmap = 'Blues')

    # make axis labels: the jet tag moves from the category names to the end of the label
    disclabels = []
    for disc in discs:
        for tag in ("0j", "01j", "1j", "2j"):
            if tag in disc[0] or tag in disc[1]:
                disclabels.append('D_' + disc[0].replace(tag, '') + "_" + disc[1].replace(tag, '') + "_" + tag)
                break
        else:
            # NOTE(review): pairs without a jet tag are labelled "_2j" as well -- confirm this is intended
            disclabels.append('D_' + disc[0] + "_" + disc[1] + "_2j")

    disclabels = np.concatenate([[''], np.array(disclabels)])
    parameters = np.concatenate([[''], np.array(parameters)])

    ax.set_xticklabels(disclabels, rotation = 'vertical')
    ax.set_yticklabels(parameters)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    # sort the used variables (of the last pair) according to their importance
    sorted_implist = sorted(cutimplist.items(), key = lambda x: x[1], reverse = True)

    return fig, sorted_implist

In [None]:
# NOTE(review): with list_branches = ["Jet", "ExtraLep"], get_data only returns the
# requested input branches plus the unrolled Jet*/ExtraLep* columns, so "LepPt_0" and
# "LepPt_1" look unavailable here -- presumably "Lep" has to be re-enabled in
# list_branches first (this cell has no execution count); confirm.
corr1 = get_feature_correlation("WHh1j", ["LepPt_0", "LepPt_1"], candidate_branches, MELA_branches, list_branches, pt_limits)

In [None]:
# Display the open matplotlib figures (the correlation matrix created in the previous cell).
plt.show()

In [24]:
%%capture
# Rank the BDT input variables for VBF2j vs. ggH2j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF2j", "ggH2j")])

In [25]:
implist

[('D_VBF2j_ggH_ME', 0.0934156378600823),
 ('JetPt_0', 0.08994708994708994),
 ('JetPt_1', 0.07360376249265138),
 ('ZZPt', 0.059376837154614934),
 ('nCleanedJetsPt30', 0.04838330393885949),
 ('PFMET', 0.046325690770135215),
 ('D_WHh_ZHh_ME', 0.046325690770135215),
 ('Z2Pt', 0.0435626102292769),
 ('D_ZHh_ggH_ME', 0.042328042328042326),
 ('Z1Pt', 0.042034097589653145),
 ('JetEta_1', 0.03968253968253968),
 ('JetPhi_1', 0.03827160493827161),
 ('ZZEta', 0.036331569664902995),
 ('JetEta_0', 0.035155790711346266),
 ('D_VBF2j_WHh_ME', 0.03403880070546737),
 ('ZZMass', 0.03368606701940035),
 ('ZZPhi', 0.031099353321575544),
 ('JetPhi_0', 0.030393885949441504),
 ('Z1Mass', 0.029218106995884775),
 ('D_VBF2j_ZHh_ME', 0.027865961199294534),
 ('Z2Mass', 0.025279247501469725),
 ('ZZMassErr', 0.024926513815402705),
 ('D_WHh_ggH_ME', 0.024573780129335684),
 ('Z1Flav', 0.001763668430335097),
 ('Z2Flav', 0.0014697236919459142),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.0009406231628453851)]

In [26]:
%%capture
# Rank the BDT input variables for VBF1j vs. ggH1j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF1j", "ggH1j")])

In [27]:
implist

[('JetEta_0', 0.21530531324345759),
 ('ZZPt', 0.09448283675087799),
 ('D_VBF1j_ggH_ME', 0.09227370567576754),
 ('JetPt_0', 0.08468335787923417),
 ('ZZEta', 0.07584683357879234),
 ('PFMET', 0.07465730146142517),
 ('ZZMassErr', 0.05392545598731166),
 ('Z1Pt', 0.052509346323779314),
 ('Z2Pt', 0.043049733771383256),
 ('ZZMass', 0.0417469128809335),
 ('JetPhi_0', 0.04135040217514444),
 ('Z2Mass', 0.04101053585589668),
 ('Z1Mass', 0.04038744760394245),
 ('ZZPhi', 0.03460972017673049),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.00708054831766172),
 ('ExtraLepPt_0', 0.0030021524866885693),
 ('ExtraLepPhi_0', 0.0019825535289452814),
 ('Z2Flav', 0.0012461765039084626),
 ('Z1Flav', 0.0005664438654129376),
 ('ExtraLepEta_0', 0.00016993315962388127),
 ('nExtraLep', 0.00011328877308258752)]

In [31]:
%%capture
# Rank the BDT input variables for VBF0j vs. ggH0j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF0j", "ggH0j")])

In [32]:
implist

[('ZZPt', 0.6882453151618398),
 ('PFMET', 0.07722884724588303),
 ('Z1Pt', 0.06700738216922203),
 ('ZZPhi', 0.05281090289608177),
 ('ZZEta', 0.04713231118682567),
 ('Z2Pt', 0.020442930153321975),
 ('Z2Mass', 0.016467915956842702),
 ('ZZMass', 0.009653605905735378),
 ('nExtraLep', 0.0073821692220329355),
 ('ZZMassErr', 0.005110732538330494),
 ('ExtraLepEta_0', 0.004542873367404884),
 ('Z1Mass', 0.0034071550255536627),
 ('ExtraLepPt_0', 0.0005678591709256105)]

In [33]:
%%capture
# Rank the BDT input variables for WHh2j vs. ggH2j (printed output is captured by %%capture).
fig, implist = plot_variables([("WHh2j", "ggH2j")])

In [34]:
implist

[('JetPt_0', 0.0807279063014229),
 ('JetPt_1', 0.07763043267834673),
 ('D_WHh_ggH_ME', 0.07075791307714645),
 ('ZZPt', 0.06732165327654632),
 ('D_VBF2j_ggH_ME', 0.06451456780563353),
 ('JetEta_0', 0.05793243635659665),
 ('ZZEta', 0.0492207917916949),
 ('Z1Pt', 0.04176749588616784),
 ('PFMET', 0.04138031168328332),
 ('ZZPhi', 0.03949278869422128),
 ('JetPhi_0', 0.03881521633917336),
 ('ZZMass', 0.03818604200948601),
 ('ZZMassErr', 0.037653663730519794),
 ('D_WHh_ZHh_ME', 0.03465298615816475),
 ('nCleanedJetsPt30', 0.03450779208208305),
 ('JetEta_1', 0.03247507501693931),
 ('Z2Mass', 0.031507114509728006),
 ('Z2Pt', 0.029232407317781435),
 ('JetPhi_1', 0.02836124286129126),
 ('Z1Mass', 0.028312844835930693),
 ('D_ZHh_ggH_ME', 0.02424741070564321),
 ('D_VBF2j_ZHh_ME', 0.023860226502758687),
 ('D_VBF2j_WHh_ME', 0.020617558803600812),
 ('Z2Flav', 0.003484657825960701),
 ('Z1Flav', 0.0019843190397831768),
 ('ExtraLepPt_0', 0.0010163585325718711),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.0003387

In [35]:
%%capture
# Rank the BDT input variables for WHh1j vs. ggH1j (printed output is captured by %%capture).
fig, implist = plot_variables([("WHh1j", "ggH1j")])

In [36]:
implist

[('JetEta_0', 0.13644053459651498),
 ('ZZEta', 0.11850786668922349),
 ('JetPt_0', 0.09896802571476908),
 ('D_VBF1j_ggH_ME', 0.08340382338013873),
 ('ZZPt', 0.07793379574804038),
 ('PFMET', 0.06087520442113573),
 ('ZZMassErr', 0.058647718942085376),
 ('ZZPhi', 0.05292392714148762),
 ('Z2Mass', 0.0519652625049343),
 ('Z2Pt', 0.0519652625049343),
 ('ZZMass', 0.05182428241132352),
 ('JetPhi_0', 0.05145773416793549),
 ('Z1Pt', 0.04649523487283595),
 ('Z1Mass', 0.043985789206564034),
 ('Z2Flav', 0.004680539107877968),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.004116618733434839),
 ('ExtraLepPt_0', 0.0029041899283821122),
 ('Z1Flav', 0.002875993909659956),
 ('nExtraLep', 2.819601872215643e-05)]

In [37]:
%%capture
# Rank the BDT input variables for WHh0j vs. ggH0j (printed output is captured by %%capture).
fig, implist = plot_variables([("WHh0j", "ggH0j")])

In [38]:
implist

[('ZZPt', 0.48711484593837534),
 ('ZZEta', 0.13165266106442577),
 ('ZZMassErr', 0.08263305322128851),
 ('Z2Pt', 0.05546218487394958),
 ('Z2Mass', 0.05266106442577031),
 ('Z1Pt', 0.050140056022408966),
 ('PFMET', 0.046218487394957986),
 ('ZZMass', 0.031932773109243695),
 ('Z1Mass', 0.025210084033613446),
 ('ZZPhi', 0.02492997198879552),
 ('ExtraLepPt_0', 0.011204481792717087),
 ('ExtraLepEta_0', 0.0008403361344537816)]

In [39]:
%%capture
# Rank the BDT input variables for ZHh2j vs. ggH2j (printed output is captured by %%capture).
fig, implist = plot_variables([("ZHh2j", "ggH2j")])

In [40]:
implist

[('JetPt_0', 0.10333591931730023),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.09107835531419706),
 ('JetPt_1', 0.07757951900698215),
 ('D_VBF2j_ggH_ME', 0.07183863460046548),
 ('ZZPt', 0.07168347556245151),
 ('ZZEta', 0.07013188518231186),
 ('D_ZHh_ggH_ME', 0.06966640806826997),
 ('D_WHh_ggH_ME', 0.0660977501939488),
 ('JetEta_0', 0.05120248254460822),
 ('nCleanedJetsPt30', 0.04127230411171451),
 ('Z1Pt', 0.038789759503491075),
 ('D_VBF2j_WHh_ME', 0.03413498836307215),
 ('JetPhi_0', 0.02916989914662529),
 ('PFMET', 0.02591155934833204),
 ('D_WHh_ZHh_ME', 0.021722265321955005),
 ('JetEta_1', 0.020325833979829323),
 ('ZZPhi', 0.01970519782777347),
 ('ZZMass', 0.01768813033359193),
 ('Z1Mass', 0.01768813033359193),
 ('ZZMassErr', 0.015671062839410395),
 ('D_VBF2j_ZHh_ME', 0.01334367726920093),
 ('Z2Pt', 0.012102404965089216),
 ('JetPhi_1', 0.009464701318851822),
 ('Z2Mass', 0.008688906128782002),
 ('Z2Flav', 0.0017067494181536075)]

In [41]:
%%capture
# Rank the BDT input variables for ZHh1j vs. ggH1j (printed output is captured by %%capture).
fig, implist = plot_variables([("ZHh1j", "ggH1j")])

In [42]:
implist

[('ZZEta', 0.12485887665819927),
 ('JetEta_0', 0.12461191081004798),
 ('D_VBF1j_ggH_ME', 0.10277307366638443),
 ('JetPt_0', 0.09624611910810048),
 ('ZZPt', 0.07285492520462884),
 ('ZZPhi', 0.06593988145639289),
 ('PFMET', 0.05895427603725656),
 ('ZZMass', 0.0558848433530906),
 ('Z2Mass', 0.05486169912503528),
 ('ZZMassErr', 0.04784081287044877),
 ('JetPhi_0', 0.045441716059836296),
 ('Z2Pt', 0.04360711261642676),
 ('Z1Pt', 0.04304261924922382),
 ('Z1Mass', 0.04067880327406153),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.012912785774767146),
 ('Z2Flav', 0.004939316963025684),
 ('ExtraLepPt_0', 0.0026460626587637595),
 ('Z1Flav', 0.001905165114309907)]

In [43]:
%%capture
# Rank the BDT input variables for ZHh0j vs. ggH0j (printed output is captured by %%capture).
fig, implist = plot_variables([("ZHh0j", "ggH0j")])

In [44]:
implist

[('ZZPt', 0.3042638538107265),
 ('Z1Pt', 0.1209329965829743),
 ('ZZEta', 0.10117367404546129),
 ('Z2Mass', 0.0901797652651909),
 ('Z1Mass', 0.08423711187045016),
 ('ZZPhi', 0.08082008616847422),
 ('PFMET', 0.05571237557569455),
 ('Z2Pt', 0.048432625167137124),
 ('ZZMassErr', 0.03223889466646858),
 ('ZZMass', 0.03179319566186302),
 ('ExtraLepPt_0', 0.029861833308572276),
 ('ExtraLepEta_0', 0.014262368147377804),
 ('ExtraLepPhi_0', 0.004011291041450008),
 ('Z2Flav', 0.0016342296835537068),
 ('Z1Flav', 0.00044569900460555636)]

In [45]:
%%capture
# Rank the BDT input variables for WHh2j vs. ZHh2j (printed output is captured by %%capture).
fig, implist = plot_variables([("WHh2j", "ZHh2j")])

In [46]:
implist

[('D_WHh_ZHh_ME', 0.12430568856540893),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.11166443210113006),
 ('D_ZHh_ggH_ME', 0.06473855583221605),
 ('JetPhi_1', 0.06205707718827811),
 ('PFMET', 0.05190576517908447),
 ('nCleanedJetsPt30', 0.051522696801379046),
 ('JetEta_0', 0.044435931813828766),
 ('D_VBF2j_WHh_ME', 0.04386132924727064),
 ('ZZMass', 0.04117985060333269),
 ('JetPt_1', 0.04079678222562728),
 ('Z2Mass', 0.03485922237119326),
 ('ZZEta', 0.033518483049224286),
 ('ZZPt', 0.032560812104960736),
 ('Z1Pt', 0.030262401838728213),
 ('JetEta_1', 0.029879333461022794),
 ('JetPt_0', 0.029687799272170082),
 ('ZZMassErr', 0.02643171806167401),
 ('D_WHh_ggH_ME', 0.02585711549511588),
 ('D_VBF2j_ZHh_ME', 0.025282512928557748),
 ('Z2Pt', 0.02489944455085233),
 ('JetPhi_0', 0.023367171040030645),
 ('Z1Mass', 0.01647194024133308),
 ('D_VBF2j_ggH_ME', 0.010151312009193642),
 ('Z2Flav', 0.00995977782034093),
 ('ZZPhi', 0.00881057268722467),
 ('Z1Flav', 0.0015322735108216816)]

In [63]:
%%capture
# Rank the BDT input variables for WHh01j vs. ZHh01j (printed output is captured by %%capture).
# "01j" contains "1j" but not "0j", so these categories are offered the "1j" MELA branches.
fig, implist = plot_variables([("WHh01j", "ZHh01j")])

In [64]:
implist

[('nCleanedJetsPt30BTagged_bTagSF', 0.20610687022900764),
 ('ZZMass', 0.12213740458015267),
 ('ZZPt', 0.10347752332485156),
 ('ZZEta', 0.09245122985581),
 ('JetPt_0', 0.07633587786259542),
 ('D_VBF1j_ggH_ME', 0.06446140797285835),
 ('ZZPhi', 0.043256997455470736),
 ('PFMET', 0.04240882103477523),
 ('Z2Mass', 0.039016115351993216),
 ('JetEta_0', 0.0364715860899067),
 ('Z1Mass', 0.03307888040712468),
 ('Z2Pt', 0.031382527565733676),
 ('JetPhi_0', 0.029686174724342665),
 ('Z1Pt', 0.020356234096692113),
 ('ExtraLepEta_0', 0.019508057675996608),
 ('ZZMassErr', 0.016963528413910092),
 ('ExtraLepPt_0', 0.01272264631043257),
 ('ExtraLepPhi_0', 0.006785411365564037),
 ('Z1Flav', 0.0033927056827820186)]

In [65]:
%%capture
# Rank the BDT input variables for VBF2j vs. WHh2j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF2j", "WHh2j")])

In [66]:
implist

[('D_VBF2j_ggH_ME', 0.12462303366207515),
 ('nCleanedJetsPt30', 0.10864781155758416),
 ('D_WHh_ggH_ME', 0.07906104817018501),
 ('JetEta_0', 0.05917352677479827),
 ('ZZEta', 0.05265302795663868),
 ('JetPt_0', 0.0506968783111908),
 ('JetEta_1', 0.049474284782785885),
 ('D_WHh_ZHh_ME', 0.048496209960061945),
 ('D_VBF2j_ZHh_ME', 0.044665416904393185),
 ('ZZPt', 0.04238324231803733),
 ('JetPt_1', 0.03455864373624582),
 ('PFMET', 0.029016219740810173),
 ('ZZPhi', 0.028934713505583177),
 ('Z1Mass', 0.028038144918086233),
 ('D_VBF2j_WHh_ME', 0.02730458880104328),
 ('ZZMass', 0.0269785638601353),
 ('D_ZHh_ggH_ME', 0.025429945390822398),
 ('ZZMassErr', 0.02461488303855245),
 ('JetPhi_0', 0.024370364332871465),
 ('Z1Pt', 0.02428885809764447),
 ('Z2Mass', 0.022006683511288615),
 ('JetPhi_1', 0.018012877985165866),
 ('Z2Pt', 0.01776835927948488),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.004727361643165702),
 ('Z2Flav', 0.0017116309397668921),
 ('ExtraLepPt_0', 0.0016301247045398973),
 ('Z1Flav', 0.000

In [67]:
%%capture
# Rank the BDT input variables for VBF1j vs. WHh1j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF1j", "WHh1j")])

In [68]:
implist

[('JetEta_0', 0.1589121338912134),
 ('ZZEta', 0.13893305439330544),
 ('JetPt_0', 0.1002092050209205),
 ('D_VBF1j_ggH_ME', 0.06527196652719665),
 ('PFMET', 0.058744769874476986),
 ('ZZMassErr', 0.05784518828451883),
 ('Z2Mass', 0.05713389121338912),
 ('ZZPt', 0.055230125523012555),
 ('JetPhi_0', 0.05489539748953975),
 ('Z1Pt', 0.05179916317991632),
 ('Z2Pt', 0.051401673640167365),
 ('ZZPhi', 0.05069037656903766),
 ('ZZMass', 0.047740585774058576),
 ('Z1Mass', 0.04117154811715481),
 ('Z2Flav', 0.004435146443514644),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.0027615062761506275),
 ('Z1Flav', 0.0021548117154811717),
 ('ExtraLepPt_0', 0.0006694560669456067)]

In [69]:
%%capture
# Rank the BDT input variables for VBF0j vs. WHh0j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF0j", "WHh0j")])

In [70]:
implist

[('ZZEta', 0.23030303030303031),
 ('PFMET', 0.21363636363636362),
 ('ZZMassErr', 0.2015151515151515),
 ('Z2Pt', 0.11818181818181818),
 ('Z1Pt', 0.09242424242424242),
 ('ExtraLepPt_0', 0.04242424242424243),
 ('Z1Mass', 0.03484848484848485),
 ('ZZPhi', 0.02727272727272727),
 ('Z2Mass', 0.022727272727272728),
 ('ZZMass', 0.012121212121212121),
 ('ZZPt', 0.004545454545454545)]

In [71]:
%%capture
# Rank the BDT input variables for VBF2j vs. ZHh2j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF2j", "ZHh2j")])

In [72]:
implist

[('D_VBF2j_ggH_ME', 0.09419977592002068),
 ('nCleanedJetsPt30', 0.06683616306127725),
 ('ZZEta', 0.06295785572696716),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.05416702576919762),
 ('JetEta_0', 0.050116349220029305),
 ('PFMET', 0.04572093424114453),
 ('JetEta_1', 0.045419288115142634),
 ('D_ZHh_ggH_ME', 0.043566319055416705),
 ('D_VBF2j_WHh_ME', 0.04261828837369646),
 ('JetPt_1', 0.04210118072912178),
 ('JetPt_0', 0.039989657847108506),
 ('D_WHh_ggH_ME', 0.03964491941739205),
 ('JetPhi_1', 0.036973196587089543),
 ('D_WHh_ZHh_ME', 0.034818581401361716),
 ('Z1Pt', 0.03395673532707059),
 ('ZZMassErr', 0.03356890459363958),
 ('D_VBF2j_ZHh_ME', 0.03300870464535034),
 ('ZZMass', 0.032534689304490216),
 ('Z2Mass', 0.031931397052486425),
 ('ZZPhi', 0.030121520296475048),
 ('JetPhi_0', 0.028397828147892785),
 ('ZZPt', 0.028009997414461776),
 ('Z1Mass', 0.023743859346720676),
 ('Z2Pt', 0.022494182538998535),
 ('Z1Flav', 0.0019391536671550462),
 ('Z2Flav', 0.0011634922002930277)]

In [73]:
%%capture
# Rank the BDT input variables for VBF1j vs. ZHh1j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF1j", "ZHh1j")])

In [74]:
implist

[('JetEta_0', 0.16034299590226134),
 ('ZZEta', 0.15538523802296758),
 ('JetPt_0', 0.08979612485455557),
 ('PFMET', 0.07115394344108868),
 ('D_VBF1j_ggH_ME', 0.06189608944199929),
 ('ZZPt', 0.05688774219659028),
 ('ZZMassErr', 0.05602772297263115),
 ('Z2Pt', 0.05567359740982446),
 ('ZZPhi', 0.051778216218950776),
 ('ZZMass', 0.05104467041027976),
 ('Z2Mass', 0.04811048717559569),
 ('Z1Pt', 0.04026913542773309),
 ('JetPhi_0', 0.038802043810391054),
 ('Z1Mass', 0.03852380229675722),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.014468558708959377),
 ('Z2Flav', 0.005868366469368139),
 ('nExtraLep', 0.0022512267921282947),
 ('Z1Flav', 0.0015429756665148986),
 ('ExtraLepPt_0', 0.00015176809834572774),
 ('ExtraLepEta_0', 2.5294683057621287e-05)]

In [75]:
%%capture
# Rank the BDT input variables for VBF0j vs. ZHh0j (printed output is captured by %%capture).
fig, implist = plot_variables([("VBF0j", "ZHh0j")])

In [76]:
implist

[('ExtraLepPt_0', 0.20869565217391303),
 ('PFMET', 0.15072463768115943),
 ('Z1Pt', 0.13043478260869565),
 ('Z2Pt', 0.12173913043478261),
 ('ZZEta', 0.11884057971014493),
 ('Z1Mass', 0.08985507246376812),
 ('ZZPt', 0.0782608695652174),
 ('ZZMassErr', 0.06956521739130435),
 ('ZZPhi', 0.03188405797101449)]

In [None]:
%%capture
# Rank the BDT input variables for WHl vs. ggH; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("WHl", "ggH")])

In [None]:
implist

In [None]:
%%capture
# Rank the BDT input variables for WHl vs. VBF; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("WHl", "VBF")])

In [None]:
implist

In [None]:
%%capture
# Rank the BDT input variables for WHl vs. WHh; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("WHl", "WHh")])

In [None]:
implist

In [None]:
%%capture
# Rank the BDT input variables for WHl vs. ZHh; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("WHl", "ZHh")])

In [None]:
implist

In [None]:
%%capture
# Rank the BDT input variables for WHl vs. ZHl; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("WHl", "ZHl")])

In [None]:
implist

In [None]:
%%capture
# Rank the BDT input variables for WHl vs. ZHMET; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("WHl", "ZHMET")])

In [None]:
implist

In [None]:
%%capture
# Rank the BDT input variables for WHl vs. ttHh; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("WHl", "ttHh")])

In [None]:
implist

In [None]:
%%capture
# Rank the BDT input variables for WHl vs. ttHl; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("WHl", "ttHl")])

In [None]:
implist

In [None]:
%%capture
# Rank the BDT input variables for ZHh vs. ZHl; neither name carries a jet tag, so no MELA branch is offered.
fig, implist = plot_variables([("ZHh", "ZHl")])

In [None]:
implist

In [None]:
# Compare the "JetEta_0" column (unrolled from the Jet branches at rank 0) between
# WHh2j and ggH2j, using the full fraction range 0.0 - 1.0.
plot_branch(("WHh2j", "ggH2j"), "JetEta_0", 0.0, 1.0)