In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from trainlib.FileCollection import FileCollection
from trainlib.config import Config
from trainlib.ConfigFileHandler import ConfigFileHandler
from trainlib.ConfigFileUtils import ConfigFileUtils
import trainlib.cuts as cuts
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import pandas as pd
import copy
import re
from scipy import interpolate
import scipy.integrate as integrate
import pickle
import os

Welcome to JupyROOT 6.10/09


In [3]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error



In [4]:
# Branch configuration shared by the cells below.
# candidate_branches is handed to get_feature_importance_list_BDT as its `mandatory_branches`
# (see plot_variables); the commented-out line is an alternative variable set.
#candidate_branches = ["PFMET", "nCleanedJetsPt30", "nCleanedJetsPt30BTagged_bTagSF", "nExtraLep", "ZZMass", "nExtraZ", "Z1Mass", "Z2Mass", "Z1Pt", "Z2Pt", "ZZMassErr", "ZZPt", "ZZEta", "ZZPhi", "Z1Flav", "Z2Flav", "costhetastar", "helphi", "helcosthetaZ1", "helcosthetaZ2", "phistarZ1", "phistarZ2", "xi", "xistar"]
candidate_branches = ["PFMET", "nCleanedJetsPt30", "nCleanedJetsPt30BTagged_bTagSF", "nExtraLep", "ZZMass", "nExtraZ", "Z1Mass", "Z2Mass", "Z1Pt", "Z2Pt", "ZZMassErr", "ZZPt", "ZZEta", "ZZPhi", "Z1Flav", "Z2Flav", "D_VBF2j_ggH_ME", "D_VBF1j_ggH_ME", "D_WHh_ggH_ME", "D_ZHh_ggH_ME", "D_WHh_ZHh_ME", "D_VBF2j_WHh_ME", "D_VBF2j_ZHh_ME"]
#list_branches = ["Jet", "Lep", "ExtraLep"]
# Passed as `optional_branches` by plot_variables; empty, so no optional inputs are ever added.
MELA_branches = []
# Name prefixes of the list-valued branches that prepare_data unrolls into rank-ordered columns.
list_branches = ["Jet", "ExtraLep"]
# Pt thresholds, paired with list_branches via zip() in prepare_data; with two entries in
# list_branches the third value is never used.
pt_limits = [30.0, 0.0, 0.0]

In [5]:
# Complete list of branches requested from a FileCollection (passed as `read_branches` to get_data).
# "training_weight" must be present because get_histogram and get_binned_data read it.
allbranches = ["JetPt", "JetEta", "JetPhi", "LepPt", "LepEta", "LepPhi", "ExtraLepPt", "ExtraLepEta", "ExtraLepPhi"] + candidate_branches + MELA_branches + ["LHEAssociatedParticleId", "GenAssocLep1Id", "GenAssocLep2Id", "training_weight"]

In [6]:
# Base directory of the input ROOT files. It must end with "/" because the file paths in
# `collections` are built by plain string concatenation (MC_path + "VBFH125/...").
# NOTE(review): absolute, site-specific path — adjust when running elsewhere.
#MC_path = "/data_CMS/cms/wind/CJLST_NTuples_randomizeda/"
MC_path = "/data_CMS/cms/wind/CJLST_NTuples/"

In [7]:
# these are the cuts without any m4l restriction imposed
def WHhadr0j_cut(row):
    """Accept rows that pass cuts.WHhadr_cut and whose nCleanedJetsPt30 equals 0."""
    base_decision = cuts.WHhadr_cut(row)
    if not base_decision:
        # mirror `and`: hand back the falsy base decision unchanged
        return base_decision
    return row["nCleanedJetsPt30"] == 0

def WHhadr01j_cut(row):
    """Accept rows that pass cuts.WHhadr_cut and whose nCleanedJetsPt30 equals 0 or 1."""
    base_decision = cuts.WHhadr_cut(row)
    if not base_decision:
        # mirror `and`: hand back the falsy base decision unchanged
        return base_decision
    return row["nCleanedJetsPt30"] == 0 or row["nCleanedJetsPt30"] == 1

def WHhadr1j_cut(row):
    """Accept rows that pass cuts.WHhadr_cut and whose nCleanedJetsPt30 equals 1."""
    base_decision = cuts.WHhadr_cut(row)
    if not base_decision:
        # mirror `and`: hand back the falsy base decision unchanged
        return base_decision
    return row["nCleanedJetsPt30"] == 1

def WHhadr2j_cut(row):
    """Accept rows that pass cuts.WHhadr_cut and whose nCleanedJetsPt30 is at least 2."""
    base_decision = cuts.WHhadr_cut(row)
    if not base_decision:
        # mirror `and`: hand back the falsy base decision unchanged
        return base_decision
    return row["nCleanedJetsPt30"] >= 2

def ZHhadr0j_cut(row):
    """Accept rows that pass cuts.ZHhadr_cut and whose nCleanedJetsPt30 equals 0."""
    base_decision = cuts.ZHhadr_cut(row)
    if not base_decision:
        # mirror `and`: hand back the falsy base decision unchanged
        return base_decision
    return row["nCleanedJetsPt30"] == 0

def ZHhadr01j_cut(row):
    """Accept rows that pass cuts.ZHhadr_cut and whose nCleanedJetsPt30 equals 0 or 1."""
    base_decision = cuts.ZHhadr_cut(row)
    if not base_decision:
        # mirror `and`: hand back the falsy base decision unchanged
        return base_decision
    return row["nCleanedJetsPt30"] == 0 or row["nCleanedJetsPt30"] == 1

def ZHhadr1j_cut(row):
    """Accept rows that pass cuts.ZHhadr_cut and whose nCleanedJetsPt30 equals 1."""
    base_decision = cuts.ZHhadr_cut(row)
    if not base_decision:
        # mirror `and`: hand back the falsy base decision unchanged
        return base_decision
    return row["nCleanedJetsPt30"] == 1

def ZHhadr2j_cut(row):
    """Accept rows that pass cuts.ZHhadr_cut and whose nCleanedJetsPt30 is at least 2."""
    base_decision = cuts.ZHhadr_cut(row)
    if not base_decision:
        # mirror `and`: hand back the falsy base decision unchanged
        return base_decision
    return row["nCleanedJetsPt30"] >= 2

def mZZ0j_cut(row):
    """Accept rows whose nCleanedJetsPt30 equals 0 (no other requirement)."""
    n_jets = row["nCleanedJetsPt30"]
    return n_jets == 0

def mZZ01j_cut(row):
    """Accept rows whose nCleanedJetsPt30 equals 0 or 1 (no other requirement)."""
    has_no_jet = row["nCleanedJetsPt30"] == 0
    return has_no_jet or row["nCleanedJetsPt30"] == 1

def mZZ1j_cut(row):
    """Accept rows whose nCleanedJetsPt30 equals 1 (no other requirement)."""
    n_jets = row["nCleanedJetsPt30"]
    return n_jets == 1

def mZZ2j_cut(row):
    """Accept rows whose nCleanedJetsPt30 is at least 2 (no other requirement)."""
    n_jets = row["nCleanedJetsPt30"]
    return n_jets >= 2

In [8]:
# Category name -> {ROOT file path: cut function}. Each inner dict is handed unchanged to
# FileCollection (see get_feature_importance_list_BDT / plot_branch).
# NOTE(review): presumably FileCollection keeps only the events of each file for which the
# cut function returns a truthy value — confirm in trainlib.
# NOTE(review): the name shadows the standard-library module `collections`; harmless in the
# visible cells, which never import that module.
collections = {"VBF2j": {MC_path + "VBFH125/ZZ4lAnalysis.root": mZZ2j_cut},
            "VBF1j": {MC_path + "VBFH125/ZZ4lAnalysis.root": mZZ1j_cut},
            "VBF0j": {MC_path + "VBFH125/ZZ4lAnalysis.root": mZZ0j_cut},
            "VBF01j": {MC_path + "VBFH125/ZZ4lAnalysis.root": mZZ01j_cut},
            "VBF": {MC_path + "VBFH125/ZZ4lAnalysis.root": cuts.no_cut},
            "ggH2j": {MC_path + "ggH125/ZZ4lAnalysis.root": mZZ2j_cut},
            "ggH1j": {MC_path + "ggH125/ZZ4lAnalysis.root": mZZ1j_cut},
            "ggH0j": {MC_path + "ggH125/ZZ4lAnalysis.root": mZZ0j_cut},
            "ggH01j": {MC_path + "ggH125/ZZ4lAnalysis.root": mZZ01j_cut},
            "ggH" : {MC_path + "ggH125/ZZ4lAnalysis.root": cuts.no_cut},
            "WHh2j": {MC_path + "WplusH125/ZZ4lAnalysis.root": WHhadr2j_cut, MC_path + "WminusH125/ZZ4lAnalysis.root": WHhadr2j_cut},
            "WHh1j": {MC_path + "WplusH125/ZZ4lAnalysis.root": WHhadr1j_cut, MC_path + "WminusH125/ZZ4lAnalysis.root": WHhadr1j_cut},
            "WHh0j": {MC_path + "WplusH125/ZZ4lAnalysis.root": WHhadr0j_cut, MC_path + "WminusH125/ZZ4lAnalysis.root": WHhadr0j_cut},
            "WHh": {MC_path + "WplusH125/ZZ4lAnalysis.root": cuts.WHhadr_cut, MC_path + "WminusH125/ZZ4lAnalysis.root": cuts.WHhadr_cut},
            "WHh01j": {MC_path + "WplusH125/ZZ4lAnalysis.root": WHhadr01j_cut, MC_path + "WminusH125/ZZ4lAnalysis.root": WHhadr01j_cut},
            "WHl": {MC_path + "WplusH125/ZZ4lAnalysis.root": cuts.WHlept_cut, MC_path + "WminusH125/ZZ4lAnalysis.root": cuts.WHlept_cut},
            "ZHh2j": {MC_path + "ZH125/ZZ4lAnalysis.root": ZHhadr2j_cut},
            "ZHh1j": {MC_path + "ZH125/ZZ4lAnalysis.root": ZHhadr1j_cut},
            "ZHh01j": {MC_path + "ZH125/ZZ4lAnalysis.root": ZHhadr01j_cut},
            "ZHh0j": {MC_path + "ZH125/ZZ4lAnalysis.root": ZHhadr0j_cut},
            "ZHh": {MC_path + "ZH125/ZZ4lAnalysis.root": cuts.ZHhadr_cut},
            "ZHl": {MC_path + "ZH125/ZZ4lAnalysis.root": cuts.ZHlept_cut},
            "ttHh": {MC_path + "ttH125/ZZ4lAnalysis.root": cuts.ttHhadr_cut},
            "ttHl": {MC_path + "ttH125/ZZ4lAnalysis.root": cuts.ttHlept_cut},
            "ZHMET": {MC_path + "ZH125/ZZ4lAnalysis.root": cuts.ZHMET_cut}
          }

In [9]:
# all the model combinations for which neural networks are currently trained
# Each tuple is (H1 category, H0 category); both names are keys of `collections`.
# NOTE(review): this list is not referenced by name in the visible cells — the driver cells
# further down hard-code the same pairs, one per cell, in the same order.
discriminant_pairs = [("VBF", "ggH"), ("WHh", "ggH"), ("ZHh", "ggH"), ("WHh", "ZHh"), ("VBF", "WHh"),
                     ("VBF", "ZHh"), ("WHl", "ggH"), ("WHl", "VBF"), ("WHl", "WHh"), ("WHl", "ZHh"),
                     ("WHl", "ZHl"), ("WHl", "ZHMET"), ("WHl", "ttHh"), ("WHl", "ttHl"), ("ZHh", "ZHl"),
                     ("ZHh", "ZHMET"), ("ZHh", "ttHh"), ("ZHh", "ttHl"), ("ZHl", "ggH"), ("ZHl", "VBF"),
                     ("ZHl", "WHh"), ("ZHl", "ZHMET"), ("ZHl", "ttHh"), ("ZHl", "ttHl"), ("ZHMET", "ggH"),
                     ("ZHMET", "VBF"), ("ZHMET", "WHh"), ("ZHMET", "ttHh"), ("ZHMET", "ttHl"), ("ttHh", "ggH"),
                      ("ttHh", "VBF"), ("ttHh", "WHh"), ("ttHh", "ttHl"), ("ttHl", "ggH"), ("ttHl", "VBF"),
                     ("ttHl", "WHh")]

In [10]:
def extract_order(df, col_basename, sorted_column, columns, order):
    """
    For every row of `df`, pick one element out of the list-valued cells of the columns
    `col_basename + <column>` and return the picks as a new DataFrame.

    The element picked is the one at rank `order` (0 = largest) when the row's
    `col_basename + sorted_column` cell is ordered from largest to smallest. Rows whose
    cell holds `order` or fewer entries yield 0 for every requested column.

    Parameters:
        df: DataFrame whose addressed cells support len() and integer indexing.
        col_basename: common prefix of the addressed columns (e.g. "Jet").
        sorted_column: suffix of the column that defines the ranking (e.g. "Pt").
        columns: suffixes of the columns to extract from (e.g. ["Pt", "Eta", "Phi"]).
        order: rank of the element to extract.

    Returns:
        DataFrame with one column per entry of `columns`, named
        "<col_basename><column>(<col_basename>Pt|<order>)". The "Pt" inside the
        parentheses is hard-coded and does not follow `sorted_column`.
    """
    def get_index(row, order, col_basename, sorted_column):
        # the parameter `sorted_column` (a name suffix) is rebound here to the row's actual values
        sorted_column = row[col_basename + sorted_column]
        if order >= len(sorted_column):
            # not enough entries in this row: -1 is the "nothing to extract" marker
            return -1
        else:
            # argsort is ascending; flipping it gives positions from largest to smallest value
            return np.flipud(np.argsort(sorted_column))[order]
    
    # NOTE(review): `raw = True` is forwarded through DataFrame.transform; what the callback
    # receives as `row` depends on the installed pandas version — confirm before upgrading.
    index_column = pd.DataFrame(df.transform(lambda row: get_index(row, order, col_basename, sorted_column), axis = 1, raw = True))
    index_column.columns = ["index"]
    # put the per-row position next to the data so that get_element can look it up
    df_temp = pd.concat([index_column, df], axis = 1)
    
    def get_element(row, column_name):
        if row["index"] == -1:
            # rows without a rank-`order` entry are filled with 0
            return 0
        else:
            return row[column_name][row["index"]]
        
    extracted_cols = pd.DataFrame()
    for column in columns:
        extracted_col = pd.DataFrame(df_temp.transform(lambda row: get_element(row, col_basename + column), axis = 1, raw = True))
        extracted_col.columns = [col_basename + column + "(" + col_basename + "Pt|" + str(order) + ")"]
        extracted_cols = pd.concat([extracted_cols, extracted_col], axis = 1)
        
    return extracted_cols

In [11]:
def prepare_data(df, col_basenames, sorted_column, columns, orders, pt_limits):
    """
    Unroll the list-valued columns of `df` into rank-ordered scalar columns.

    For every (prefix, Pt threshold) pair taken from zip(col_basenames, pt_limits) and
    every rank in `orders`, extract_order is called; rows whose extracted
    "<prefix>Pt(<prefix>Pt|<rank>)" value lies below the threshold get all columns of
    that extraction set to 0.0. All extractions are joined side by side.

    Returns:
        DataFrame holding the columns of every extraction (empty if nothing was extracted).
    """
    # seed with an empty frame so the result is an empty DataFrame when nothing is extracted
    pieces = [pd.DataFrame()]

    for base, threshold in zip(col_basenames, pt_limits):
        pt_label_prefix = base + "Pt(" + base + "Pt|"
        for rank in orders:
            block = extract_order(df, base, sorted_column, columns, rank)
            below_threshold = block[pt_label_prefix + str(rank) + ")"] < threshold
            # zero the whole row of this extraction, not only its Pt entry
            block[below_threshold] = 0.0
            pieces.append(block)

    return pd.concat(pieces, axis = 1)

In [12]:
def get_data(H1_coll, H0_coll, read_branches, input_branches, list_branches, pt_limits):
    """
    Read both collections and return their input columns as two DataFrames.

    Each collection is read in full (fractions 0.0 to 1.0) for `read_branches`. The
    list-valued branches named by `list_branches` are unrolled by prepare_data (ranked by
    "Pt"; "Pt", "Eta" and "Phi" extracted for ranks 0..3) and appended. The result is then
    restricted to `input_branches` followed by the unrolled columns.

    Returns:
        (H1_df, H0_df) with identical column order.
    """
    unrolled_suffixes = ["Pt", "Eta", "Phi"]
    ranks = range(4)

    raw_frames = [coll.get_data(read_branches, 0.0, 1.0) for coll in (H1_coll, H0_coll)]
    unrolled_frames = [prepare_data(frame, list_branches, "Pt", unrolled_suffixes, ranks, pt_limits)
                       for frame in raw_frames]

    # the H1 frame defines which unrolled columns are kept for both hypotheses
    selected_columns = np.concatenate([input_branches, unrolled_frames[0].columns])

    H1_df, H0_df = [pd.concat([raw, unrolled], axis = 1)[selected_columns]
                    for raw, unrolled in zip(raw_frames, unrolled_frames)]

    return H1_df, H0_df

In [13]:
def get_data_dmatrix(H1_coll, H0_coll, read_branches, input_branches, list_branches, pt_limits):
    """
    Build a weighted, labelled xgboost DMatrix from the two collections.

    Data is obtained through get_data; H1 rows get label 1, H0 rows label 0. Every row of
    a class carries the same weight, 1 + (size of the other class) / (size of this class).

    Returns:
        xgb.DMatrix whose feature names are the columns returned by get_data.

    Raises:
        ZeroDivisionError: if either collection yields no rows.
    """
    H1_df, H0_df = get_data(H1_coll, H0_coll, read_branches, input_branches, list_branches, pt_limits)

    complete_input_branches = H1_df.columns
    # single-argument print(...) behaves the same under Python 2 and Python 3
    print("number of input variables: " + str(len(complete_input_branches)))
    print("final list of inputs: " + str(complete_input_branches))

    # try with the same weights as used later in the neural network training, to balance out some (very)
    # unbalanced datasets
    H1_class_weight = 1.0 + float(len(H0_df)) / float(len(H1_df))
    H0_class_weight = 1.0 + float(len(H1_df)) / float(len(H0_df))

    print("using class weights: " + str(H1_class_weight) + " (H1), " + str(H0_class_weight) + " (H0)")

    H1_weights = np.full(len(H1_df), H1_class_weight)
    H0_weights = np.full(len(H0_df), H0_class_weight)

    # DataFrame.as_matrix() was removed from pandas; .values returns the same ndarray
    H1_data = H1_df.values
    H0_data = H0_df.values
    H1_target = np.ones(np.shape(H1_data)[0])
    H0_target = np.zeros(np.shape(H0_data)[0])

    target = np.concatenate([H1_target, H0_target])
    data = np.concatenate([H1_data, H0_data])
    weights = np.concatenate([H1_weights, H0_weights])

    # hand xgboost a plain list of names rather than a pandas Index
    dmatrix = xgb.DMatrix(data, label = target, feature_names = list(complete_input_branches), weight = weights)

    return dmatrix

In [14]:
def get_feature_correlation(source, corr_branches, mandatory_branches, optional_branches, list_branches, pt_limits):
    """
    Draw the correlation matrix of `corr_branches` for the category `source`.

    The first half (fractions 0.0 to 0.5) of collections[source] is loaded with the
    mandatory branches plus every optional branch that shares a "0j", "1j" or "2j" tag
    with `source`. `corr_branches` must be a list, since it is concatenated with [''] to
    form the tick labels.

    Returns:
        The matplotlib figure holding the matrix plot and its colorbar.
    """
    coll = FileCollection(collections[source], 0.0, 0.5)

    input_branches = list(mandatory_branches)
    for optional_branch in optional_branches:
        # one append per matching tag, exactly like three independent `if` statements
        for tag in ("0j", "1j", "2j"):
            if tag in source and tag in optional_branch:
                input_branches.append(optional_branch)

    # the same collection is passed for both hypotheses; only the first frame is used
    loaded, _ = get_data(coll, coll, allbranches, input_branches, list_branches, pt_limits)
    correlations = loaded[corr_branches].corr()

    fig = plt.figure(figsize=(10,15))
    ax = fig.add_subplot(111)

    image = ax.matshow(correlations, vmin = -1.0, vmax = 1.0, cmap = "RdBu")
    fig.colorbar(image)

    # one tick per matrix row / column
    for axis in (ax.xaxis, ax.yaxis):
        axis.set_major_locator(ticker.MultipleLocator(1))

    tick_labels = [''] + corr_branches
    ax.set_yticklabels(tick_labels)
    ax.set_xticklabels(tick_labels, rotation = 'vertical')

    return fig

In [15]:
def get_interpolating_function(data, bins):
    """
    Turn binned values into a function of x by linear interpolation between bin centers.

    Parameters:
        data: one value per bin.
        bins: bin edges (one more entry than `data`).

    Returns:
        Callable f(x): the interpolated value for x strictly between the first and the
        last bin center, and 0 everywhere else (including exactly at those two centers).
    """
    centers = [np.mean([lower, upper]) for lower, upper in zip(bins[:-1], bins[1:])]
    linear = interpolate.interp1d(centers, data, kind = "linear")
    first_center, last_center = centers[0], centers[-1]

    def interpolated_function(x):
        if first_center < x < last_center:
            return linear(x)
        return 0

    return interpolated_function

In [16]:
def get_binned_data(df, branch):
    """
    Histogram df[branch], weighted by df["training_weight"] and normalized to unit area.

    The bin width is 2 * IQR / n**0.33 (n = number of rows), but never below 0.005.
    Bins start at the smallest value and are laid out with np.arange up to (largest
    value + one bin width).

    Returns:
        (hist, bin_edges) as returned by np.histogram.
    """
    # DataFrame/Series.as_matrix() was removed from pandas; .values returns the same ndarray
    data = df[branch].values
    weights = df["training_weight"].values

    # set the bin width
    q75, q25 = np.percentile(data, [75, 25])
    bin_width = max(2 * (q75 - q25) / len(data)**0.33, 0.005)

    data_max = np.max(data)
    data_min = np.min(data)
    bins = np.arange(data_min, data_max + bin_width, bin_width)

    # dividing by (total weight * bin width) makes the histogram integrate to 1
    weights = weights / (np.sum(weights) * bin_width)

    hist = np.histogram(data, bins = bins, weights = weights)
    return hist

In [17]:
def get_feature_importance_list_BDT(disc_pair, mandatory_branches, optional_branches, list_branches, pt_limits):
    """
    Train BDTs separating disc_pair[0] (H1) from disc_pair[1] (H0) and report how often
    each input variable is used by the best one.

    BDTs with max_depth 1..7 are trained on the first half (fractions 0.0 to 0.5) of both
    collections and evaluated on the second half (0.5 to 1.0); the depth with the lowest
    validation RMSE wins.

    Parameters:
        disc_pair: (H1 name, H0 name), both keys of `collections`.
        mandatory_branches: inputs that are always used.
        optional_branches: inputs that are only used for jet-binned categories (see below).
        list_branches, pt_limits: forwarded to get_data_dmatrix.

    Returns:
        (best_params, feature_names, used_variables), where used_variables maps each
        variable used by the best BDT to its f-score divided by the sum of all f-scores.
    """
    H1_name = disc_pair[0]
    H0_name = disc_pair[1]

    # first assemble the list of branches that can serve as input: it will *always* contain the mandatory branches,
    # and *can* contain some of the optional branches, if the name of the categories allows it
    input_branches = list(mandatory_branches)

    for optional_branch in optional_branches:
        # the first rule whose condition holds adds the branch, so it is added at most once
        if ("0j" in H1_name or "0j" in H0_name) and ("0j" in optional_branch):
            input_branches.append(optional_branch)
        elif ("1j" in H1_name or "1j" in H0_name) and ("1j" in optional_branch):
            input_branches.append(optional_branch)
        elif ("2j" in H1_name or "2j" in H0_name) and not ("1j" in optional_branch):
            input_branches.append(optional_branch)

        # the fully inclusive categories (i.e. those with NO "xxj" in their name, can not use MELA, since there may
        # be events with low number of jets contained)

    # get the training data for the BDT ...
    H1_coll_train = FileCollection(collections[H1_name], 0.0, 0.5)
    H0_coll_train = FileCollection(collections[H0_name], 0.0, 0.5)

    dtrain = get_data_dmatrix(H1_coll_train, H0_coll_train, allbranches, input_branches, list_branches, pt_limits)

    # ... and the validation data as well
    H1_coll_val = FileCollection(collections[H1_name], 0.5, 1.0)
    H0_coll_val = FileCollection(collections[H0_name], 0.5, 1.0)
    dval = get_data_dmatrix(H1_coll_val, H0_coll_val, allbranches, input_branches, list_branches, pt_limits)

    evallist = [(dtrain, 'train'), (dval, 'eval')]

    # perform the training
    # try different tree depths and choose the one that gives the best RMSE (i.e. avoid too deep trees to start with)

    params = {'eta': 0.01, 'silent': 1, 'gamma': 0.5, 'objective': 'binary:logistic'}
    params['nthread'] = 4
    params['eval_metric'] = 'rmse'
    max_num_rounds = 2000

    best_loss = 1e6
    best_imp = None
    best_params = None
    for tree_depth in range(1,8):
        params['max_depth'] = tree_depth

        bst = xgb.train(params, dtrain, max_num_rounds, evals = evallist, early_stopping_rounds = 10, verbose_eval = False)

        # NOTE(review): depending on the xgboost version, predict() may use every boosted
        # round rather than the best early-stopping iteration — confirm.
        pred = bst.predict(dval)
        cur_loss = np.sqrt(mean_squared_error(pred, dval.get_label()))
        cur_imp = bst.get_fscore()

        # single-argument print(...) behaves the same under Python 2 and Python 3
        print("for max_depth = " + str(params['max_depth']) + ": loss = " + str(cur_loss))

        if cur_loss < best_loss:
            best_loss = cur_loss
            best_imp = copy.copy(cur_imp)
            # copy, because `params` is mutated again in the next iteration
            best_params = copy.copy(params)

    # normalize the usage score w.r.t. the total score (i.e. sum of all individuals)
    # (.items()/.values() instead of the Python-2-only .iteritems())
    score_sum = sum(best_imp.values())
    used_variables = {key: val / float(score_sum) for key, val in sorted(best_imp.items(), key = lambda x: x[1], reverse = True)}
    return best_params, dtrain.feature_names, used_variables

In [18]:
def get_histogram(df, branch, label):
    """
    Draw a unit-area, weighted histogram of df[branch] on the current matplotlib axes.

    The binning is computed exactly as in get_binned_data: bin width 2 * IQR / n**0.33
    with a floor of 0.005, edges from np.arange starting at the smallest value.

    Parameters:
        df: DataFrame containing `branch` and "training_weight".
        branch: name of the column to histogram.
        label: legend label of the histogram.

    Returns:
        The return value of plt.hist (despite the local name, not a Figure).
    """
    # DataFrame/Series.as_matrix() was removed from pandas; .values returns the same ndarray
    data = df[branch].values
    weights = df["training_weight"].values

    # set the bin width
    q75, q25 = np.percentile(data, [75, 25])
    bin_width = max(2 * (q75 - q25) / len(data)**0.33, 0.005)

    data_max = np.max(data)
    data_min = np.min(data)
    bins = np.arange(data_min, data_max + bin_width, bin_width)

    # dividing by (total weight * bin width) makes the histogram integrate to 1
    weights = weights / (np.sum(weights) * bin_width)

    fig = plt.hist(data, bins = bins, weights = weights, alpha = 0.5, label = label)
    return fig

In [19]:
def plot_branch(disc_pair, branch, start_fraction = 0.0, end_fraction = 1.0):
    """
    Overlay the normalized distributions of `branch` for the two categories in `disc_pair`.

    Both collections are restricted to [start_fraction, end_fraction] and loaded through
    get_data with `allbranches` as both the read list and the input list; the module-level
    `list_branches` and `pt_limits` are used. A new figure is opened and shown.
    """
    H1_name, H0_name = disc_pair[0], disc_pair[1]

    H1_coll = FileCollection(collections[H1_name], start_fraction, end_fraction)
    H0_coll = FileCollection(collections[H0_name], start_fraction, end_fraction)

    H1_df, H0_df = get_data(H1_coll, H0_coll, allbranches, allbranches, list_branches, pt_limits)

    plt.figure()
    # H1 first, then H0, so the legend lists them in that order
    for frame, category in ((H1_df, H1_name), (H0_df, H0_name)):
        get_histogram(frame, branch, category)

    plt.legend(loc = 'upper right')
    plt.show()

In [20]:
def plot_variables(discs):
    """
    Train the BDTs for every pair in `discs` and plot the variable usage as a matrix.

    Uses the module-level `candidate_branches` (mandatory), `MELA_branches` (optional),
    `list_branches` and `pt_limits`.

    Parameters:
        discs: list of (H1 name, H0 name) tuples.

    Returns:
        (fig, sorted_implist): the matplotlib figure (variables on the y axis,
        discriminants on the x axis) and a list of (variable, normalized score) tuples in
        descending score order. The list describes only the LAST pair in `discs`.
    """
    plotframe = pd.DataFrame()

    for disc in discs:
        _, _, implist = get_feature_importance_list_BDT(disc, candidate_branches, MELA_branches, list_branches, pt_limits)

        # keep every variable with a nonzero score (no percentile cut is applied here)
        cutimplist = {key: val for key, val in implist.items() if val > 0.00}
        curframe = pd.DataFrame(cutimplist, index = [len(plotframe)])

        plotframe = pd.concat([plotframe, curframe])

    # variables not used by a given discriminant show up as NaN after the concat
    plotframe = plotframe.fillna(0.0)

    # single-argument print(...) behaves the same under Python 2 and Python 3
    print(plotframe)
    print("number of pre-selected input variables = " + str(len(plotframe.columns)))

    # start the plotting
    parameters = plotframe.columns
    # DataFrame.as_matrix() was removed from pandas; .values returns the same ndarray
    plotdata = np.transpose(plotframe.values)

    plt.close('all')
    fig = plt.figure(figsize=(10,15))
    ax = fig.add_subplot(111)
    cax = ax.matshow(plotdata, cmap = 'Blues')

    # make axis labels: strip the jet tag from both names and append it once at the end
    disclabels = []
    for disc in discs:
        for tag in ("0j", "01j", "1j", "2j"):
            if tag in disc[0] or tag in disc[1]:
                disclabels.append('D_' + re.sub(tag, '', disc[0]) + "_" + re.sub(tag, '', disc[1]) + "_" + tag)
                break
        else:
            # NOTE(review): inclusive pairs (no jet tag) also get the "_2j" suffix — confirm this is intended
            disclabels.append('D_' + disc[0] + "_" + disc[1] + "_2j")

    # the leading empty label accounts for the tick in front of the first row / column
    disclabels = np.concatenate([[''], np.array(disclabels)])
    parameters = np.concatenate([[''], np.array(parameters)])

    ax.set_xticklabels(disclabels, rotation = 'vertical')
    ax.set_yticklabels(parameters)

    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))

    # sort the used variables according to their importance
    sorted_implist = sorted(cutimplist.items(), key = lambda x: x[1], reverse = True)

    return fig, sorted_implist

In [21]:
def append_variables(confhandler, impdict, threshold_fscore):
    """
    Add a configuration section listing the input variables selected for one discriminant.

    A section named impdict["discriminant"] is created in `confhandler`. Every other key
    of `impdict` whose score val[0] exceeds `threshold_fscore` is written to
    "periodic_columns" if its name contains "phi" or "Phi", else to "nonperiodic_columns".

    Parameters:
        confhandler: object providing new_section(name) and get_section(name).
        impdict: {variable name: [score]} plus the entry "discriminant": section name.
        threshold_fscore: minimum score (exclusive) for a variable to be kept.
    """
    section_name = impdict["discriminant"]
    confhandler.new_section(section_name)
    cur_sec = confhandler.get_section(section_name)

    periodic_inputs = []
    nonperiodic_inputs = []
    for key, val in impdict.items():
        # Skip the metadata entry by value (`==`, not identity) and before touching val[0]:
        # its value is a string, and comparing a str with a float raises on Python 3.
        if key == "discriminant":
            continue
        if val[0] > threshold_fscore:
            if "phi" in key or "Phi" in key:
                periodic_inputs.append(key)
            else:
                nonperiodic_inputs.append(key)
    cur_sec["nonperiodic_columns"] = ConfigFileUtils.serialize_list(nonperiodic_inputs, lambda x: x)
    cur_sec["periodic_columns"] = ConfigFileUtils.serialize_list(periodic_inputs, lambda x: x)

In [22]:
def convert_varname(raw):
    """Return `raw` with every '(' replaced by '[' and every ')' replaced by ']'."""
    for old, new in (('(', '['), (')', ']')):
        raw = raw.replace(old, new)
    return raw

In [23]:
# Accumulator for the importance scores: the driver cells below append one row per
# discriminant (columns = variable names plus "discriminant").
df = pd.DataFrame()

In [24]:
# Output location of the generated input configuration.
# NOTE(review): absolute, site-specific path — adjust when running elsewhere.
out_dir = "/data_CMS/cms/wind/InputConfigurations/"
out_path = os.path.join(out_dir, "inclusive_99_fullmassrange.conf")
# Variables whose normalized importance score does not exceed this value are left out of
# the configuration (see append_variables).
threshold_fscore = 0.01

In [25]:
# Configuration under construction; append_variables adds one section per discriminant.
confhandler = ConfigFileHandler()

In [26]:
%%capture
# Train and rank the inputs for VBF vs. ggH. %%capture hides this cell's output,
# including the diagnostics printed during training.
fig, implist = plot_variables([("VBF", "ggH")])

In [27]:
# One-row record for this discriminant: {variable name: [score]} plus its label.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_VBF_ggH_ML"
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on old and new versions
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [28]:
# Write the variables scoring above threshold_fscore into the section named impdict["discriminant"].
append_variables(confhandler, impdict, threshold_fscore)

In [29]:
implist

[('ZZPt', 0.08877805486284289),
 ('JetEta(JetPt|0)', 0.083854878931703),
 ('JetPt(JetPt|0)', 0.06099267959134422),
 ('ZZEta', 0.057887539216474944),
 ('PFMET', 0.05447671144718848),
 ('Z1Pt', 0.04863647333279704),
 ('ZZPhi', 0.046561016812806694),
 ('D_VBF2j_ggH_ME', 0.04437293862118896),
 ('ZZMassErr', 0.043150189043520235),
 ('ZZMass', 0.041106910143994854),
 ('Z2Mass', 0.04060815702678787),
 ('Z2Pt', 0.040254203201673236),
 ('Z1Mass', 0.03835572359424021),
 ('JetPt(JetPt|1)', 0.033529080524495214),
 ('JetPhi(JetPt|0)', 0.03330383718124045),
 ('D_VBF1j_ggH_ME', 0.03019869680637117),
 ('JetEta(JetPt|1)', 0.02548467540825356),
 ('D_WHh_ZHh_ME', 0.022057758828734616),
 ('JetEta(JetPt|2)', 0.021961225967339716),
 ('JetPhi(JetPt|1)', 0.021269407127342932),
 ('D_WHh_ggH_ME', 0.020223634462231518),
 ('D_VBF2j_WHh_ME', 0.019773147775721985),
 ('D_ZHh_ggH_ME', 0.019563993242699702),
 ('JetPt(JetPt|2)', 0.017665513635266673),
 ('D_VBF2j_ZHh_ME', 0.01303193628831148),
 ('JetPhi(JetPt|2)', 0.008

In [30]:
%%capture
fig, implist = plot_variables([("WHh", "ggH")])

In [31]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHh_ggH_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [32]:
append_variables(confhandler, impdict, threshold_fscore)

In [33]:
implist

[('ZZMass', 0.10130758643024856),
 ('ZZPt', 0.08033762448384746),
 ('ZZEta', 0.07681564245810056),
 ('JetEta(JetPt|0)', 0.07246376811594203),
 ('JetPt(JetPt|0)', 0.06092624079021942),
 ('Z1Pt', 0.04961136750060724),
 ('ZZMassErr', 0.044652254878147515),
 ('PFMET', 0.043781880009715814),
 ('ZZPhi', 0.03957169459962756),
 ('Z2Pt', 0.035746093433730065),
 ('Z2Mass', 0.03445065176908752),
 ('Z1Mass', 0.033134968828434946),
 ('D_WHh_ggH_ME', 0.032952797344344587),
 ('D_VBF1j_ggH_ME', 0.030685774431220145),
 ('JetPt(JetPt|1)', 0.028762853210266375),
 ('JetPhi(JetPt|0)', 0.026920897093352766),
 ('JetEta(JetPt|2)', 0.025767144360780505),
 ('D_VBF2j_ggH_ME', 0.02471459800825844),
 ('JetPt(JetPt|2)', 0.021111650878471378),
 ('JetEta(JetPt|1)', 0.020727066634280623),
 ('D_WHh_ZHh_ME', 0.015788195287830945),
 ('D_ZHh_ggH_ME', 0.013946239170917335),
 ('D_VBF2j_WHh_ME', 0.01360213747874666),
 ('JetPhi(JetPt|1)', 0.012913934094405312),
 ('D_VBF2j_ZHh_ME', 0.011942352845923408),
 ('JetPhi(JetPt|2)', 0

In [34]:
%%capture
fig, implist = plot_variables([("ZHh", "ggH")])

In [35]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHh_ggH_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [36]:
append_variables(confhandler, impdict, threshold_fscore)

In [37]:
implist

[('ZZMass', 0.10272090756004609),
 ('ZZPt', 0.08461106679665574),
 ('JetEta(JetPt|0)', 0.06286744069248722),
 ('ZZEta', 0.06097669059647257),
 ('JetPt(JetPt|0)', 0.06014948742946616),
 ('ZZPhi', 0.05199562764040297),
 ('Z1Pt', 0.04842092824012526),
 ('Z2Mass', 0.044698513988596415),
 ('Z1Mass', 0.04443262725634435),
 ('PFMET', 0.04440308428609412),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.03731277142603917),
 ('ZZMassErr', 0.03654465419953322),
 ('D_ZHh_ggH_ME', 0.035126591627522234),
 ('D_VBF1j_ggH_ME', 0.032792696977754145),
 ('Z2Pt', 0.03004520074448285),
 ('JetPt(JetPt|1)', 0.028774853023723006),
 ('JetEta(JetPt|1)', 0.022748087092676297),
 ('D_VBF2j_ggH_ME', 0.02215722768767172),
 ('JetEta(JetPt|2)', 0.019734704127152943),
 ('JetPhi(JetPt|0)', 0.019143844722148366),
 ('JetPt(JetPt|2)', 0.015716860173121804),
 ('D_WHh_ggH_ME', 0.012614848296847765),
 ('JetPhi(JetPt|2)', 0.010989984933085173),
 ('D_WHh_ZHh_ME', 0.010369582557830364),
 ('D_VBF2j_WHh_ME', 0.009128777807320748),
 ('D_VBF2

In [38]:
%%capture
fig, implist = plot_variables([("WHh", "ZHh")])

In [39]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHh_ZHh_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [40]:
append_variables(confhandler, impdict, threshold_fscore)

In [41]:
implist

[('PFMET', 0.05654778887303852),
 ('D_WHh_ZHh_ME', 0.05609129814550642),
 ('ZZMassErr', 0.05352353780313837),
 ('JetPt(JetPt|0)', 0.047075606276747506),
 ('Z2Mass', 0.04553495007132668),
 ('JetEta(JetPt|0)', 0.044907275320970046),
 ('JetPhi(JetPt|1)', 0.04456490727532097),
 ('ZZMass', 0.04433666191155492),
 ('Z1Pt', 0.04365192582025677),
 ('JetPhi(JetPt|0)', 0.042738944365192585),
 ('Z1Mass', 0.04251069900142653),
 ('D_WHh_ggH_ME', 0.0385734664764622),
 ('ZZEta', 0.03754636233951498),
 ('JetEta(JetPt|1)', 0.035606276747503565),
 ('ZZPhi', 0.034293865905848785),
 ('JetPt(JetPt|1)', 0.03372325249643367),
 ('Z2Pt', 0.03320970042796006),
 ('D_ZHh_ggH_ME', 0.032924393723252496),
 ('ZZPt', 0.030813124108416547),
 ('JetPt(JetPt|2)', 0.02579172610556348),
 ('nCleanedJetsPt30BTagged_bTagSF', 0.025677603423680456),
 ('JetPhi(JetPt|2)', 0.024308131241084167),
 ('D_VBF2j_ggH_ME', 0.021854493580599144),
 ('D_VBF2j_WHh_ME', 0.019514978601997145),
 ('JetEta(JetPt|2)', 0.018716119828815977),
 ('D_VBF2

In [42]:
%%capture
fig, implist = plot_variables([("VBF", "WHh")])

In [43]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_VBF_WHh_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [44]:
append_variables(confhandler, impdict, threshold_fscore)

In [45]:
implist

[('JetEta(JetPt|0)', 0.10215739276788902),
 ('ZZEta', 0.07499872494517264),
 ('ZZMass', 0.07438669862804101),
 ('JetEta(JetPt|2)', 0.06477278522976487),
 ('D_VBF2j_ggH_ME', 0.05758147600346815),
 ('JetPt(JetPt|0)', 0.048044065894833475),
 ('JetEta(JetPt|1)', 0.03804763604835008),
 ('PFMET', 0.03774162288978426),
 ('D_WHh_ZHh_ME', 0.0342479726628245),
 ('ZZPt', 0.033431937573315654),
 ('Z2Pt', 0.03338093538022135),
 ('JetPt(JetPt|2)', 0.03322792880093844),
 ('D_WHh_ggH_ME', 0.03118784107716632),
 ('JetPhi(JetPt|0)', 0.0300402917325445),
 ('Z1Pt', 0.029861784056714438),
 ('ZZMassErr', 0.029683276380884378),
 ('Z2Mass', 0.029606773091242922),
 ('Z1Mass', 0.02690365685724486),
 ('ZZPhi', 0.026470138215943285),
 ('D_VBF1j_ggH_ME', 0.023231498954455043),
 ('JetPt(JetPt|1)', 0.021624929871984496),
 ('D_ZHh_ggH_ME', 0.02055388381700413),
 ('D_VBF2j_ZHh_ME', 0.016805222624572857),
 ('JetPhi(JetPt|1)', 0.014943642576630795),
 ('D_VBF2j_WHh_ME', 0.012215025246085582),
 ('JetPhi(JetPt|2)', 0.01139

In [46]:
%%capture
fig, implist = plot_variables([("VBF", "ZHh")])

KeyboardInterrupt: 

In [47]:
# NOTE(review): the preceding plot_variables([("VBF", "ZHh")]) cell ended with a
# KeyboardInterrupt, so `implist` still holds the ("VBF", "WHh") result at this point and
# would be stored under the D_VBF_ZHh_ML label. Re-run that cell to completion first.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_VBF_ZHh_ML"
# DataFrame.append was removed in pandas 2.0; pd.concat gives the same result on old and new versions
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [48]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

[('JetEta(JetPt|0)', 0.10215739276788902),
 ('ZZEta', 0.07499872494517264),
 ('ZZMass', 0.07438669862804101),
 ('JetEta(JetPt|2)', 0.06477278522976487),
 ('D_VBF2j_ggH_ME', 0.05758147600346815),
 ('JetPt(JetPt|0)', 0.048044065894833475),
 ('JetEta(JetPt|1)', 0.03804763604835008),
 ('PFMET', 0.03774162288978426),
 ('D_WHh_ZHh_ME', 0.0342479726628245),
 ('ZZPt', 0.033431937573315654),
 ('Z2Pt', 0.03338093538022135),
 ('JetPt(JetPt|2)', 0.03322792880093844),
 ('D_WHh_ggH_ME', 0.03118784107716632),
 ('JetPhi(JetPt|0)', 0.0300402917325445),
 ('Z1Pt', 0.029861784056714438),
 ('ZZMassErr', 0.029683276380884378),
 ('Z2Mass', 0.029606773091242922),
 ('Z1Mass', 0.02690365685724486),
 ('ZZPhi', 0.026470138215943285),
 ('D_VBF1j_ggH_ME', 0.023231498954455043),
 ('JetPt(JetPt|1)', 0.021624929871984496),
 ('D_ZHh_ggH_ME', 0.02055388381700413),
 ('D_VBF2j_ZHh_ME', 0.016805222624572857),
 ('JetPhi(JetPt|1)', 0.014943642576630795),
 ('D_VBF2j_WHh_ME', 0.012215025246085582),
 ('JetPhi(JetPt|2)', 0.01139

In [None]:
%%capture
fig, implist = plot_variables([("WHl", "ggH")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHl_ggH_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("WHl", "VBF")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHl_VBF_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("WHl", "WHh")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHl_WHh_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("WHl", "ZHh")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHl_ZHh_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("WHl", "ZHl")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHl_ZHl_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("WHl", "ZHMET")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHl_ZHMET_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("WHl", "ttHh")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHl_ttHh_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("WHl", "ttHl")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_WHl_ttHl_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("ZHh", "ZHl")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHh_ZHl_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("ZHh", "ZHMET")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHh_ZHMET_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("ZHh", "ttHh")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHh_ttHh_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("ZHh", "ttHl")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHh_ttHl_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("ZHl", "ggH")])

In [None]:
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHl_ggH_ML"
df = df.append(pd.DataFrame.from_dict(impdict))

In [None]:
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
implist

In [None]:
%%capture
fig, implist = plot_variables([("ZHl", "VBF")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHl_VBF_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHl_VBF_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHl", "VBF") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHl", "WHh") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHl", "WHh")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHl_WHh_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHl_WHh_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHl", "WHh") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHl", "ZHMET") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHl", "ZHMET")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHl_ZHMET_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHl_ZHMET_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHl", "ZHMET") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHl", "ttHh") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHl", "ttHh")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHl_ttHh_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHl_ttHh_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHl", "ttHh") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHl", "ttHl") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHl", "ttHl")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHl_ttHl_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHl_ttHl_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHl", "ttHl") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHMET", "ggH") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHMET", "ggH")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHMET_ggH_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHMET_ggH_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHMET", "ggH") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHMET", "VBF") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHMET", "VBF")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHMET_VBF_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHMET_VBF_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHMET", "VBF") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHMET", "WHh") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHMET", "WHh")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHMET_WHh_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHMET_WHh_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHMET", "WHh") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHMET", "ttHh") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHMET", "ttHh")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHMET_ttHh_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHMET_ttHh_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHMET", "ttHh") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ZHMET", "ttHl") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ZHMET", "ttHl")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ZHMET_ttHl_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ZHMET_ttHl_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ZHMET", "ttHl") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ttHh", "ggH") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ttHh", "ggH")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ttHh_ggH_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ttHh_ggH_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ttHh", "ggH") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ttHh", "VBF") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ttHh", "VBF")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ttHh_VBF_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ttHh_VBF_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ttHh", "VBF") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ttHh", "WHh") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ttHh", "WHh")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ttHh_WHh_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ttHh_WHh_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ttHh", "WHh") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ttHh", "ttHl") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ttHh", "ttHl")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ttHh_ttHl_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ttHh_ttHl_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ttHh", "ttHl") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ttHl", "ggH") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ttHl", "ggH")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ttHl_ggH_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ttHl_ggH_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ttHl", "ggH") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ttHl", "VBF") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ttHl", "VBF")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ttHl_VBF_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ttHl_VBF_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ttHl", "VBF") pair.
implist

In [None]:
%%capture
# Run plot_variables for the ("ttHl", "WHh") pair, keeping the returned figure and implist;
# the %%capture magic above suppresses this cell's output.
fig, implist = plot_variables([("ttHl", "WHh")])

In [None]:
# One-row table for this discriminant: each implist entry becomes a column named by
# convert_varname(entry[0]) and holding entry[1]; "discriminant" labels the row.
impdict = {convert_varname(entry[0]): [entry[1]] for entry in implist}
impdict["discriminant"] = "D_ttHl_WHh_ML"
# pd.concat instead of DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# NOTE: re-running this cell stacks a duplicate row for the same discriminant.
df = pd.concat([df, pd.DataFrame.from_dict(impdict)])

In [None]:
# Forward the D_ttHl_WHh_ML impdict and threshold_fscore to append_variables
# (presumably records the selected variables in confhandler — confirm in its definition).
append_variables(confhandler, impdict, threshold_fscore)

In [None]:
# Show the implist returned by plot_variables for the ("ttHl", "WHh") pair.
implist

In [None]:
# save the variable configuration
# NOTE(review): this writes to out_path, whereas the summary figure at the end of the
# notebook is saved under out_dir — confirm both names are defined upstream.
confhandler.save_configuration(out_path)

In [None]:
# The rows were stacked from one-row tables whose columns can differ, so a variable that
# is absent for a discriminant shows up as NaN; replace those gaps with 0.0.
df = df.fillna(0.0)

In [None]:
# Write df to a CSV file; the path is relative, so it lands in the current working directory.
df.to_csv("input_parameters_table_inclusive.csv")

In [None]:
# now plot the data contained in the table to have a global picture of the relevant input variables
# Every column except the "discriminant" label column is treated as an input variable.
variable_columns = [col for col in df.columns.tolist() if col != "discriminant"]
# Transposed so that rows are variables and columns are discriminants.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
variable_data = df[variable_columns].values.transpose()
# The leading '' is a placeholder entry that the plotting cell further down expects
# in front of the real labels.
datacol_labels = np.concatenate([[''], np.array(variable_columns)])

In [None]:
# Labels for the discriminant axis of the plot, preceded by the same '' placeholder
# that is prepended to datacol_labels.
# `.values` replaces `.as_matrix()`, which was deprecated in pandas 0.23 and removed in 1.0.
discriminant_labels = np.concatenate([[''], df["discriminant"].values])

In [None]:
fig = plt.figure(figsize = (15, 10))
ax = fig.add_subplot(111)
# Heat map of variable_data: one row per input variable, one column per discriminant.
cax = ax.matshow(variable_data, interpolation = 'nearest', cmap = 'Blues', vmin = np.min(variable_data), vmax = np.max(variable_data))
# Pin exactly one tick per matrix cell and label it directly. Both label arrays carry a
# leading '' placeholder (hence the -1 and the [1:]); the previous version attached labels
# without fixed tick positions and relied on MultipleLocator(1) emitting an off-screen
# tick at -1 to absorb that placeholder, which is fragile.
ax.set_xticks(np.arange(len(discriminant_labels) - 1))
ax.set_yticks(np.arange(len(datacol_labels) - 1))
ax.set_xticklabels(discriminant_labels[1:], rotation = 'vertical')
ax.set_yticklabels(datacol_labels[1:])

In [None]:
# Operate on the Figure object instead of pyplot's "current figure": with the inline
# backend a figure is closed when the cell that created it finishes, so plt.tight_layout()
# / plt.savefig() in a later cell would act on a new, empty figure.
fig.tight_layout()
fig.savefig(os.path.join(out_dir, "input_variables_inclusive_fullmassrange.pdf"))