In [1]:
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from collections import defaultdict
import matplotlib as mpl

# Notebook-wide matplotlib defaults: figure size, resolution and image origin.
mpl.rcParams.update({
    'figure.figsize': (6,6),
    'figure.dpi': 100,
    "image.origin": 'lower',
})

In [2]:
# Root directory holding the per-configuration numpy/pickle dumps.
base_dir = "/eos/user/m/mpresill/www/VBS/Numpy/"

# Input configurations to read, the selection (cut) to use and the dump version.
#plot_configs  = ["Full2016v6s5_v2","Full2017v6s5_v2","Full2018v6s5_v2"]
plot_configs  = ["DNN_2018_v6"]
#cut     = "boos_sig_mjjincl"
cut     = "Resolved_SR"
version = "v1"

# Every location follows <base_dir>/<config>/<cut>/samples/<version>.
# The first entry ("FullRun2") is where results are written; the remaining
# entries, one per plot configuration, are where the inputs are read from.
output_dir, *samples_dirs = (
    os.path.join(base_dir, config, cut, "samples/" + version)
    for config in ["FullRun2", *plot_configs]
)

In [3]:
# samples_dirs  # uncomment to display the resolved sample directories

In [4]:
# Training class assigned to every sample name: the signal sample gets 0,
# DY gets 1, top gets 2 and all remaining samples share class 3.
classes = {
    "VBS_ZV": 0,
    "DY": 1,
    "top": 2,
    "VBS_VV_QCD": 3, "WW": 3, "VBF-V": 3, "VVV": 3,
    "ggWW": 3, "Vg": 3, "VZ": 3, "WJets": 3,
}
signal = "VBS_ZV"
sample_names = ["WJets", "VBS_ZV","VBS_VV_QCD", "WW", "ggWW", "Vg", "DY", "top", "VZ", "VBF-V", "VVV"]

# Column names carrying the "<cut>_" prefix are stripped of it on load.
cut_prefix = cut + "_"

# Collect the parts of each sample first and concatenate once at the end;
# concatenating inside the loop re-copies the accumulated frame every time.
sample_parts = defaultdict(list)

for samples_dir in samples_dirs:
    # The data-taking year is inferred from the directory path; when several
    # years appear in the path the latest one wins, and when none appears no
    # "year" column is added.
    year = None
    for candidate_year in (2016, 2017, 2018):
        if str(candidate_year) in samples_dir:
            year = candidate_year

    # Sorted so that the row order of the concatenated frames is reproducible
    # (os.listdir returns entries in arbitrary order).
    for file in sorted(os.listdir(samples_dir)):
        path = os.path.join(samples_dir, file)
        if os.path.isdir(path) or file == "index.php":
            continue
        print(file)

        # Files are named "<sample>_part<N>.pkl".
        sname = file.split("_part")[0]
        if sname not in classes:
            # Fail before loading the file, and say which file is the culprit.
            raise KeyError(f"No class defined for sample '{sname}' (file {file})")

        # NOTE: pickle can execute arbitrary code while loading; only read
        # files that come from a trusted source.
        with open(path, "rb") as fh:
            s = pickle.load(fh)

        # Split on the full "<cut>_" prefix (and only once) so that a column
        # that merely contains the cut name does not raise IndexError.
        s = s.rename(
            columns=lambda c: c.split(cut_prefix, 1)[1]
            if isinstance(c, str) and cut_prefix in c
            else c
        )
        s["sample_name"] = sname
        s["class"] = classes[sname]
        s["signal"] = int(sname == signal)
        if year is not None:
            s["year"] = year

        sample_parts[sname].append(s)

# One DataFrame per sample name; single-part samples are kept as loaded,
# multi-part samples are concatenated with a fresh index.
samples = {
    name: parts[0] if len(parts) == 1 else pd.concat(parts, ignore_index=True)
    for name, parts in sample_parts.items()
}

DY_part1.pkl
DY_part2.pkl
DY_part3.pkl
VBF-V_part1.pkl
VBS_VV_QCD_part1.pkl
VBS_ZV_part1.pkl
VVV_part1.pkl
VZ_part1.pkl
Vg_part1.pkl
Vg_part2.pkl
WJets_part1.pkl
WW_part1.pkl
ggWW_part1.pkl
top_part1.pkl
top_part2.pkl


In [5]:
#for s, df in samples.items():
#    print(f"Sample name {s:10}, nsamples: {len(df):10},   XS total: {(df.weight_).sum()*137 :15}")

## Balancing

In [6]:
# All non-signal samples, concatenated in a fixed order into one frame with a
# fresh index.
background_names = [
    "top", "DY", "WJets", "VZ", "VVV",
    "VBS_VV_QCD", "WW", "ggWW", "Vg", "VBF-V",
]
background = pd.concat([samples[name] for name in background_names], ignore_index=True)

# NOTE: from here on `signal` is the signal DataFrame, no longer the sample
# name string used while loading.
# A copy is taken because later cells add a "weight_norm" column to `signal`;
# without the copy that column would also appear in samples["VBS_ZV"].
signal = samples["VBS_ZV"].copy()

In [7]:
# If the signal is going to be randomly sampled, the number of signal events
# is assumed to be the same as the number of background events. That
# assumption is absorbed into the background weight: the background rescale
# factor is multiplied by the background/signal event-count ratio.
random_sampling = False

### Normalization by bkg events

In [8]:
# Raw row-count ratio, background over signal.
ratio_neve_bkgsignal = len(background) / len(signal)
print("ratio n ev signal - bkg",ratio_neve_bkgsignal)

# Background weights divided by their mean: the normalised weights then sum
# to (approximately) the number of background rows.
bkg_weight_norm = background["weight_"] / background["weight_"].mean()
tot_ev_weighted_bkg = bkg_weight_norm.sum()
print("TOT bkg weighted events", tot_ev_weighted_bkg)

# Scale the signal so that its total normalised weight matches the background.
rescale_factor_sig = tot_ev_weighted_bkg / signal["weight_"].sum()
print("Rescale facor for signal", rescale_factor_sig)

signal["weight_norm"] = signal["weight_"] * rescale_factor_sig
background["weight_norm"] = bkg_weight_norm

print("Effective sig events: ", signal["weight_norm"].sum())
print("Effective bkg events: ", background["weight_norm"].sum())

ratio n ev signal - bkg 67.77438025926769
TOT bkg weighted events 11920158.000000011
Rescale facor for signal 1027603.1990748154
Effective sig events:  11920158.00000001
Effective bkg events:  11920158.000000011


In [15]:
# Total raw (un-normalised) background weight.
#background.weight_.mean()
background["weight_"].sum()

101678.25715853645

In [16]:
# Total raw (un-normalised) signal weight.
signal["weight_"].sum()

11.599961941274723

### Normalization by signal events

In [14]:
# Signal weights divided by their mean: the normalised weights then sum to
# (approximately) the number of signal rows.
sig_weight_norm = signal["weight_"] / signal["weight_"].mean()
tot_ev_weighted_sig = sig_weight_norm.sum()
print("TOT signal weighted events", tot_ev_weighted_sig)

# Target total for the background: equal to the signal total, or inflated by
# the background/signal row-count ratio when random sampling is enabled.
target_bkg_events = (
    (tot_ev_weighted_sig *ratio_neve_bkgsignal )
    if random_sampling
    else (tot_ev_weighted_sig )
)
rescale_factor_bkg = target_bkg_events / background["weight_"].sum()
print(rescale_factor_bkg)

signal["weight_norm"] = sig_weight_norm
background["weight_norm"] = background["weight_"] * rescale_factor_bkg

print("Effective sig events: ", signal["weight_norm"].sum())
print("Effective bkg events: ", background["weight_norm"].sum())

TOT signal weighted events 175879.99999999997
1.7297700109647667
Effective sig events:  175879.99999999997
Effective bkg events:  175879.99999999965


## Save signal and bkg samples

In [None]:
# Write the balanced background and signal frames to
# <output_dir>/for_training/ as pickle files.
# (`os` and `pickle` are imported in the first cell of the notebook.)
training_dir = os.path.join(output_dir, "for_training")
os.makedirs(training_dir, exist_ok=True)

for frame, filename in (
    (background, "background_balanced.pkl"),
    (signal, "signal_balanced.pkl"),
):
    # The context manager guarantees the file is flushed and closed even if
    # pickling fails part-way through.
    with open(os.path.join(training_dir, filename), "wb") as fh:
        pickle.dump(frame, fh)

In [None]:
# Display the column labels of the signal frame.
signal.columns

In [None]:
# Histogram of the normalised background weights in [0, 10], log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(background["weight_norm"], bins=100, range=(0, 10))
ax.set_yscale("log")

In [None]:
# Histogram of the normalised signal weights in [0, 10], log-scaled y axis.
fig, ax = plt.subplots()
ax.hist(signal["weight_norm"], bins=100, range=(0, 10))
ax.set_yscale("log")

The total number of events will be manually balanced

In [None]:
def plot_correlation_matrix(frame):
    """Draw the correlation matrix of the numeric columns of ``frame``.

    Non-numeric columns are dropped before computing the correlations, and
    the tick labels are taken from the correlation matrix itself so that they
    always line up with the plotted cells.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame whose numeric columns are correlated.

    Returns
    -------
    pandas.DataFrame
        The correlation matrix that was plotted.
    """
    corr = frame.select_dtypes(include="number").corr()
    labels = corr.columns
    positions = range(len(labels))

    fig = plt.figure(figsize=(19, 15))
    plt.matshow(corr, fignum=fig.number)
    plt.xticks(positions, labels, fontsize=12, rotation=45)
    plt.yticks(positions, labels, fontsize=12)
    cb = plt.colorbar()
    cb.ax.tick_params(labelsize=14)
    plt.title('Correlation Matrix', fontsize=16)
    return corr


# NOTE(review): `df` was not defined by any earlier cell, so this cell failed
# on a fresh kernel. The background frame is used here as a best guess;
# replace it with `signal` if that was the intended input. TODO confirm.
df = background
plot_correlation_matrix(df);