# How to rediscover the Higgs boson yourself!
This notebook uses ATLAS Open Data http://opendata.atlas.cern to show you the steps to rediscover the Higgs boson yourself!

The idea is that you add extra cuts (selection requirements on the events) to increase the ratio of signal ($H \rightarrow ZZ \rightarrow \ell\ell\ell\ell$) to background ($Z, t\bar{t}, ZZ \rightarrow \ell\ell\ell\ell$).

First, try to reduce the amount of $Z$ and $t\bar{t}$ background, since these are quite different to the signal.

Then, try to reduce the amount of $ZZ \rightarrow \ell\ell\ell\ell$ background, whilst keeping as much of the $H \rightarrow ZZ \rightarrow \ell\ell\ell\ell$ signal as possible.

The datasets used in this notebook have already been filtered to include at least 4 leptons per event, so that processing is quicker.

<CENTER><img src="HZZ_feynman.pdf" style="width:40%"></CENTER>

## First time setup
This first cell only needs to be run the first time you ever open this notebook. 

If you close jupyter and re-open on the same computer, you won't need to run this first cell again.

In [None]:
# One-time environment setup: upgrade pip, then install/upgrade the packages imported below.
# `sys.executable` makes pip run under the same Python interpreter as this kernel.
import sys
!{sys.executable} -m pip install --upgrade --user pip
# NOTE(review): uproot is installed without a version pin, while read_file() further down
# calls `mc.pandas.df(...)` — confirm the installed uproot release still provides that accessor.
!{sys.executable} -m pip install -U numpy pandas uproot matplotlib --user

## To set up every time
Cell -> Run All Below

This needs to be done every time you re-open this notebook.

In [None]:
import uproot
import pandas as pd
import time
import math
import numpy as np
import matplotlib.pyplot as plt
import glob

import infofile

In [None]:
# Integrated luminosity used by get_xsec_weight() to normalise the simulated samples.
# NOTE(review): presumably in pb^-1 (plot_data labels the plot with 1 fb^-1) — confirm
# against the cross-section units stored in infofile.
lumi = 1000

# Directory prefix under which get_data_from_files() looks for the input ROOT files.
tuple_path = "Input/"

# Keys of `samples` in the order plot_data() walks through them.
# plot_data() treats the 1st and 2nd non-data entries as background and the 3rd as signal.
stack_order = [
    'data',
    r'$Z,t\bar{t}$',
    'ZZ',
    r'$H \rightarrow ZZ \rightarrow \ell\ell\ell\ell$',
]

In [None]:
# Description of every sample group: the dictionary key doubles as the legend label,
# 'list' names the individual input files and 'color' is the histogram fill colour
# (the data entry has no colour because it is drawn as points by plot_data()).
samples = {}

# measured collision data
samples['data'] = {
    'list': ['DataEgamma', 'DataMuons'],
}

# Z and top-pair background
samples[r'$Z,t\bar{t}$'] = {
    'list': ['Zee', 'Zmumu', 'ttbar_lep'],
    'color': "#8700da",
}

# ZZ background
samples['ZZ'] = {
    'list': ['ZZ'],
    'color': "#f90000",
}

# Higgs signal
samples[r'$H \rightarrow ZZ \rightarrow \ell\ell\ell\ell$'] = {
    'list': ['ggH125_ZZ4lep', 'VBFH125_ZZ4lep'],
    'color': "#4faeff",
}

In [None]:
def get_data_from_files():
    """Read every input file listed in `samples` and return one DataFrame per sample.

    For each key of `samples`, the ROOT file of every entry in its 'list' is
    looked up under `tuple_path`, read with `read_file`, and the resulting
    frames are concatenated.

    Returns:
        dict mapping each key of `samples` to the concatenation of the
        DataFrames read for that sample.

    Raises:
        FileNotFoundError: if none of the files of a sample could be found.
    """

    data = {}

    for s in samples:
        print(s+':')
        frames = []
        for val in samples[s]['list']:
            # data files carry no dataset ID in their name, simulated ones do
            if s == 'data':
                prefix = "Data/skim."
            else:
                prefix = "MC/skim.mc_"+str(infofile.infos[val]["DSID"])+"."
            fileString = tuple_path+prefix+val+".root"
            print(fileString)
            # glob returns an empty list when nothing matches, so test the list
            # itself instead of indexing it (which would raise IndexError)
            matches = glob.glob(fileString,recursive=False)
            if matches:
                frames.append(read_file(matches[0],val))
            else:
                print("Error: "+val+" not found!")
        if not frames:
            # pd.concat([]) would fail with an unhelpful "No objects to concatenate"
            raise FileNotFoundError("No input files found for sample '"+s+"' under "+tuple_path)
        data[s] = pd.concat(frames)

    return data

In [None]:
def calc_mllll(lep_pts,lep_etas,lep_phis):
    """Return the invariant mass of the first four leptons, divided by 1000.

    Each lepton is treated as massless (its energy is set equal to its
    momentum magnitude). The momentum components are rebuilt from the
    transverse momentum, pseudorapidity and azimuthal angle.

    Args:
        lep_pts: indexable with the transverse momenta of at least 4 leptons.
        lep_etas: indexable with the matching pseudorapidities.
        lep_phis: indexable with the matching azimuthal angles (radians,
            since they are passed straight to math.cos / math.sin).

    Returns:
        sqrt(E^2 - px^2 - py^2 - pz^2) of the four-lepton system, divided by
        1000 (presumably converting MeV to GeV — confirm the input units).
    """
    sumE = 0.0
    sumpx = 0.0
    sumpy = 0.0
    sumpz = 0.0
    for k in range(4):
        # polar angle from pseudorapidity
        theta = 2*math.atan(math.exp(-lep_etas[k]))
        # total momentum from the transverse component
        p = lep_pts[k]/math.sin(theta)
        sumpz = sumpz + p*math.cos(theta)
        sumpx = sumpx + p*math.sin(theta)*math.cos(lep_phis[k])
        sumpy = sumpy + p*math.sin(theta)*math.sin(lep_phis[k])
        # massless lepton: energy equals momentum magnitude
        sumE = sumE + p
    mass_squared = sumE**2 - sumpz**2 - sumpx**2 - sumpy**2
    return math.sqrt(mass_squared)/1000.

In [None]:
def mllll_window(mllll):
    """Tell whether a four-lepton mass lies strictly between 120 and 130."""
    above_lower_edge = mllll > 120
    return above_lower_edge and mllll < 130

In [None]:
def calc_mll(lep_pts,lep_etas,lep_phis,i=0,j=1):
    """Return the invariant mass of the lepton pair (i, j), divided by 1000.

    Uses the massless-lepton formula
    m^2 = 2 * pt_i * pt_j * (cosh(eta_i - eta_j) - cos(phi_i - phi_j)).

    Args:
        lep_pts: indexable with the lepton transverse momenta.
        lep_etas: indexable with the lepton pseudorapidities.
        lep_phis: indexable with the lepton azimuthal angles (radians).
        i: index of the first lepton of the pair (default 0).
        j: index of the second lepton of the pair (default 1); choosing
            j == i simply yields 0.

    Returns:
        The pair invariant mass divided by 1000 (presumably MeV to GeV —
        confirm the input units).
    """
    # the original body referred to undefined names i and j; they are now parameters
    mll = 2*lep_pts[i]*lep_pts[j]
    cosh = math.cosh(lep_etas[i]-lep_etas[j])
    cos = math.cos(lep_phis[i]-lep_phis[j])
    mll *= ( cosh - cos )
    return math.sqrt(mll)/1000.

In [None]:
def calc_weight(mcWeight,scaleFactor_PILEUP,scaleFactor_ELE,
                scaleFactor_MUON, scaleFactor_TRIGGER):
    """Return the product of the generator weight and the four scale factors."""
    weight = mcWeight
    for factor in (scaleFactor_PILEUP, scaleFactor_ELE,
                   scaleFactor_MUON, scaleFactor_TRIGGER):
        weight = weight*factor
    return weight

In [None]:
def get_xsec_weight(totalWeight,sample):
    """Scale an event weight by the sample's luminosity normalisation.

    Args:
        totalWeight: event weight to be scaled.
        sample: key into `infofile.infos` whose "xsec", "sumw" and "red_eff"
            entries define the normalisation.

    Returns:
        totalWeight * (lumi * xsec) / (sumw * red_eff).
    """
    sample_info = infofile.infos[sample]
    numerator = lumi*sample_info["xsec"]
    denominator = sample_info["sumw"]*sample_info["red_eff"]
    return (numerator/denominator)*totalWeight

In [None]:
def plot_data(data):
    """Plot the four-lepton mass of data (points) on top of the stacked simulation.

    Also prints Signal/sqrt(Background) for events with 120 < mllll < 130 and
    saves the figure as "plot.pdf".

    Args:
        data: dict keyed like `stack_order`; every value is a DataFrame with an
            'mllll' column and, for the non-data entries, a 'totalWeight' column.

    Returns:
        None.
    """

    # 34 bins of width 5 from 80 to 250, and their centres
    bins = [80 + x*5 for x in range(35) ]
    data_x = [82.5 + x*5 for x in range(34) ]

    data_mllll = []
    data_mllll_errors = []

    mc_mllll = []
    mc_weights = []
    mc_colors = []
    mc_labels = []
    mc_in_mllll_window = []  # summed weight inside the mass window, one entry per MC sample

    for s in stack_order:
        if s == "data":
            data_mllll,_ = np.histogram(data[s].mllll.values, bins=bins)
            # Poisson uncertainty on the bin contents
            data_mllll_errors = np.sqrt(data_mllll)
        else:
            mllll_values = data[s].mllll.values
            weights = data[s].totalWeight.values
            mc_labels.append(s)
            mc_mllll.append(mllll_values)
            mc_colors.append(samples[s]['color'])
            mc_weights.append(weights)
            # boolean mask instead of a per-event Python loop
            in_window = (mllll_values > 120) & (mllll_values < 130)
            mc_in_mllll_window.append(float(np.sum(weights[in_window], dtype=np.float64)))

    # relies on stack_order: the first two MC entries are background, the third is signal
    HZZ_in_mllll_window = mc_in_mllll_window[2]
    background_in_mllll_window = mc_in_mllll_window[0]+mc_in_mllll_window[1]
    if background_in_mllll_window > 0:
        SoversqrtB = HZZ_in_mllll_window/math.sqrt(background_in_mllll_window)
        print('Signal/sqrt(Background) for 120<mllll<130 '+str(SoversqrtB))
    else:
        # tight cuts can leave no (or net-negative) background in the window;
        # report that instead of failing on the division / square root
        print('Signal/sqrt(Background) for 120<mllll<130 undefined (background = '+str(background_in_mllll_window)+')')

    # leave room for the error bar of the highest data point
    top = np.amax(data_mllll)+math.sqrt(np.amax(data_mllll))

    plt.hist(mc_mllll,bins=bins,weights=mc_weights,stacked=True,color=mc_colors, label=mc_labels)
    plt.errorbar( x=data_x, y=data_mllll, yerr=data_mllll_errors, fmt='ko', label='Data')

    plt.xlabel(r'$M_{\ell\ell\ell\ell}$ [GeV]',fontname='sans-serif',horizontalalignment='right',x=1.0,fontsize=11)

    plt.ylabel(r'Events',fontname='sans-serif',horizontalalignment='right',y=1.0,fontsize=11)
    #plt.yscale('log')
    plt.ylim(bottom=0,top=top)

    ax = plt.gca()
    plt.text(0.05,0.97,r'$\mathbf{{ATLAS}}$ Open Data',ha="left",va="top",family='sans-serif',transform=ax.transAxes,fontsize=13)
    plt.text(0.05,0.92,'for education only',ha="left",va="top",family='sans-serif',transform=ax.transAxes,style='italic',fontsize=8)
    plt.text(0.05,0.9,r'$\sqrt{s}=8\,\mathrm{TeV},\;\int L\,dt=1\,\mathrm{fb}^{-1}$',ha="left",va="top",family='sans-serif',transform=ax.transAxes)

    plt.legend()

    plt.savefig("plot.pdf")

    return

## Adding a cut
If you add a cut: Cell -> Run All Below

In [None]:
def read_file(path,sample):
    """Read one ROOT file into a DataFrame, weight it, apply the cuts and compute mllll.

    Args:
        path: location of the ROOT file to open.
        sample: name of the sample; used in the progress printout, to decide
            whether event weights are computed (skipped when the name contains
            'Data') and as the key handed to `get_xsec_weight`.

    Returns:
        DataFrame with one row per event surviving the cuts. It holds the
        lepton branches read from the "mini" tree, an 'mllll' column and, for
        non-data samples, a 'totalWeight' column.
    """
    start = time.time()
    print("\tProcessing: "+sample)
    # NOTE(review): `.pandas.df(..., flatten=False)` looks like the uproot 3 interface;
    # confirm the installed uproot version provides it.
    tree = uproot.open(path)["mini"]
    data = tree.pandas.df(["lep_n","lep_pt","lep_eta","lep_phi","lep_charge","lep_type","lep_etcone20","lep_trackd0pvunbiased","lep_tracksigd0pvunbiased", 
                           "mcWeight","scaleFactor_PILEUP","scaleFactor_ELE","scaleFactor_MUON", # add more variables here if you make cuts on them
                           "scaleFactor_TRIGGER"], flatten=False)

    nIn = len(data.index)

    # In every np.vectorize call below, `otypes` is given explicitly: without it
    # np.vectorize raises ValueError when it is handed zero events.
    if 'Data' not in sample:
        data['totalWeight'] = np.vectorize(calc_weight, otypes=[float])(data.mcWeight,data.scaleFactor_PILEUP,data.scaleFactor_ELE,data.scaleFactor_MUON,data.scaleFactor_TRIGGER)
        data['totalWeight'] = np.vectorize(get_xsec_weight, otypes=[float])(data.totalWeight,sample)

    # the individual weight components are no longer needed
    data.drop(["mcWeight","scaleFactor_PILEUP","scaleFactor_ELE","scaleFactor_MUON","scaleFactor_TRIGGER"], axis=1, inplace=True)

    # Each cut function returns True for events that should be REMOVED.

    # cut on minimum lepton pt
    #fail = data[ np.vectorize(cut_...)(data.lep_pt...
    #data.drop(fail, inplace=True)

    # cut on lepton etcone20
    #fail = data[ np.vectorize(cut_...)(data.lep_etcone20,data.lep_pt...
    #data.drop(fail, inplace=True)

    # cut on lepton d0
    #fail = data[ np.vectorize(cut_...)(data.lep_trackd0pvunbiased,data.lep_tracksigd0pvunbiased...
    #data.drop(fail, inplace=True)

    # cut on number of leptons
    fail = data[ np.vectorize(cut_n_lep, otypes=[bool])(data.lep_n) ].index
    data.drop(fail, inplace=True)

    # cut on lepton charge
    #fail = data[ np.vectorize(cut_lep_charge)(data.lep_charge) ].index
    #data.drop(fail, inplace=True)

    # cut on lepton type
    #fail = data[ np.vectorize(cut_lep_type)(data.lep_type) ].index
    #data.drop(fail, inplace=True)

    # cut on lepton pt
    #fail = data[ np.vectorize(cut_lep_pt)(data.lep_pt) ].index
    #data.drop(fail, inplace=True)

    # cut on deltaR
    #fail = data[ np.vectorize(cut_deltaR)(data.lep_eta,data.lep_phi...
    #data.drop(fail, inplace=True)

    # cut on minimum opposite-charge-same-type lepton pair invariant mass
    #fail = data[ np.vectorize(cut_mOCST)(data....

    # calculation of Z boson candidate 1 invariant mass
    #data['mZ1'] = np.vectorize(calc_mZ1)(data.lep_pt,data.lep_eta,data.lep_phi)

    # cut on mZ1
    #fail = data[ np.vectorize(cut_mZ1)(data.mZ1) ].index
    #data.drop(fail, inplace=True)

    # calculation of Z boson candidate 2 invariant mass
    #data['mZ2'] = np.vectorize(calc_mZ2)(data....

    # cut on mZ2
    #fail = data[ np.vectorize(cut_mZ2)(data.mZ2) ].index
    #data.drop(fail, inplace=True)

    # calculation of 4-lepton invariant mass
    data['mllll'] = np.vectorize(calc_mllll, otypes=[float])(data.lep_pt,data.lep_eta,data.lep_phi)

    #print(data)

    nOut = len(data.index)

    elapsed = time.time() - start
    print("\t\tTime taken: "+str(elapsed)+", nIn: "+str(nIn)+", nOut: "+str(nOut))

    return data

## Changing a cut

If you change a cut: Cell -> Run All Below

In [None]:
# cut on number of leptons
def cut_n_lep(lep_n):
    """Return True when the event has fewer than 4 leptons.

    read_file() drops every event for which this function returns True.
    """
    too_few_leptons = 4 > lep_n
    return too_few_leptons

# cut on lepton charge
#def cut_lep_charge(lep_charge):
#    return lep_charge[] ...

# cut on lepton type
#def cut_lep_type(lep_type):
#    return lep_type[] ...

# cut on lepton pt
#def cut_lep_pt(lep_pt):
# want 2nd highest pt lepton used to have lep_pt > 20000
# want 3rd highest pt lepton used to have lep_pt > 15000

# cut on minimum opposite-charge-same-type lepton pair invariant mass
#def cut_mOCST():
# want invariant mass of all opposite-charge-same-type lepton pairs to be > 5

# cut on invariant mass of Z boson candidate 1
#def cut_mZ1(mZ1):
# want invariant mass of same-type-opposite-charge lepton pair that's closest to Z mass (91 GeV) to be in range 50 < m < 106 GeV

# cut on invariant mass of Z boson candidate 2
#def cut_mZ2(mZ2):
# want invariant mass of remaining lepton pair that's closest to Z mass (91 GeV) to be in range 17.5 < m < 115 GeV
# advanced: vary the lower bound monotonically from 17.5 at mllll=120 to 50 at mllll=190, and keep it constant at 50 above mllll=190

# cut on deltaR
# want lepton to be separated from all other leptons by deltaR = math.sqrt(delta(lep_eta)**2 + delta(lep_phi)**2) > 0.2
# want both leptons within each opposite-charge-same-type pair to be separated by deltaR = math.sqrt(delta(lep_eta)**2 + delta(lep_phi)**2) > 0.1

# cut on minimum lepton pt
# want all muons used to have lep_pt > 6000
# want all electrons used to have lep_pt > 7000

# cut on maximum lepton etcone20
# want all muons used to have lep_etcone20/lep_pt < 0.3
# want all electrons used to have lep_etcone20/lep_pt < 0.2

# cut on maximum lepton d0
# want all muons used to have lep_trackd0pvunbiased/lep_tracksigd0pvunbiased < 3.5
# want all electrons used to have lep_trackd0pvunbiased/lep_tracksigd0pvunbiased < 6.5

In [None]:
# Driver cell: load every sample, draw the plot and report the total wall-clock time.
if __name__=="__main__":
    start = time.time()
    # reads and processes all files listed in `samples`
    data = get_data_from_files()
    # prints Signal/sqrt(Background) and saves the figure as "plot.pdf"
    plot_data(data)
    elapsed = time.time() - start
    print("Time taken: "+str(elapsed))