# How to rediscover the Z boson yourself!
This notebook uses ATLAS Open Data http://opendata.atlas.cern to show you the steps to rediscover the Z boson yourself!

The idea is that you add extra cuts to increase the ratio of signal ($Z \rightarrow e^{+}e^{-}$) to background ($Z \rightarrow \tau^{+}\tau^{-}$, $W$, single top, $t\bar{t}$, dibosons, low-mass Drell-Yan)

The datasets used in this notebook have already been filtered to include exactly 2 leptons per event, so that processing is quicker.

<CENTER><img src="Zee_feynman.pdf" style="width:40%"></CENTER>

## First time setup
This first cell only needs to be run the first time you open this notebook on your computer. 

If you close jupyter and re-open on the same computer, you won't need to run this first cell again.

If you re-open on binder, you will need to run this cell again.

If you get an error saying that uproot is not available, choose Kernel -> Restart & Run All.

In [11]:
import sys
!{sys.executable} -m pip install --upgrade --user pip
!{sys.executable} -m pip install -U numpy pandas uproot matplotlib --user

Requirement already up-to-date: pip in /Users/meirinevans/.local/lib/python3.7/site-packages (19.3.1)
Requirement already up-to-date: numpy in /Users/meirinevans/.local/lib/python3.7/site-packages (1.17.4)
Requirement already up-to-date: pandas in /Users/meirinevans/.local/lib/python3.7/site-packages (0.25.3)
Requirement already up-to-date: uproot in /Users/meirinevans/.local/lib/python3.7/site-packages (3.10.12)


Requirement already up-to-date: matplotlib in /Users/meirinevans/.local/lib/python3.7/site-packages (3.1.2)


## To set up every time
Cell -> Run All Below

to be done every time you re-open this notebook

In [12]:
import uproot
import pandas as pd
import time
import math
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import MultipleLocator # for minor ticks

import infofile

In [13]:
# Integrated luminosity used to normalise the simulation in get_xsec_weight.
# plot_data divides it by 1000 to quote the value in fb^-1, i.e. it is given in pb^-1.
lumi = 1000

# Fraction of the entries of every input file that read_file processes.
fraction = 0.3

# Location of the input ntuples (local directory by default, remote alternative below).
tuple_path = "Input/"
#tuple_path = "http://opendata.atlas.cern/release/samples/"

# Order in which plot_data stacks the simulated sample groups.
stack_order = ["W", "top", "Diboson", "Drell Yan", "Z"]

In [14]:
# Sample groups used in the analysis.
# Each group maps to:
#   'list'  : names of the individual samples; get_data_from_files builds the
#             file names from them and (for simulation) looks them up in infofile.infos
#   'color' : fill colour of the group in the stacked histogram (simulation only)
samples = {
    "data": {
        "list": ["DataEgamma"],
    },
    "W": {
        "list": [
            "WenuNoJetsBVeto", "WenuJetsBVeto", "WenuWithB",
            "WmunuNoJetsBVeto", "WmunuJetsBVeto", "WmunuWithB",
            "WtaunuNoJetsBVeto", "WtaunuJetsBVeto", "WtaunuWithB",
        ],
        "color": "#e55934",
    },
    "Z": {
        "list": ["Zee", "Ztautau"],
        "color": "#086788",
    },
    "top": {
        "list": [
            "ttbar_lep", "ttbar_had",
            "stop_schan", "stop_tchan_top", "stop_tchan_antitop", "stop_wtchan",
        ],
        "color": "#fde74c",
    },
    "Drell Yan": {
        "list": ["DYeeM08to15", "DYeeM15to40", "DYtautauM08to15", "DYtautauM15to40"],
        "color": "#5bc0eb",
    },
    "Diboson": {
        "list": ["WW", "WZ", "ZZ"],
        "color": "#fa7921",
    },
}

In [15]:
def get_data_from_files():
    """Read every sample listed in ``samples`` and group the results.

    For each group in ``samples`` the individual files are read with
    ``read_file`` and concatenated into one DataFrame.

    Returns
    -------
    dict
        Maps each key of ``samples`` to the concatenated DataFrame of its samples.

    Raises
    ------
    ValueError
        If none of the input files of a group could be found.
    """
    import os  # local import: only needed for the input-file existence check

    data = {}

    for s in samples:
        print(s+':')
        frames = []
        for val in samples[s]['list']:
            # data files carry no dataset ID; simulated files are prefixed with it
            if s == 'data':
                prefix = "Data/exactly2lep."
            else:
                prefix = "MC/exactly2lep.mc_"+str(infofile.infos[val]["DSID"])+"."
            fileString = tuple_path+prefix+val+".root"
            # Remote locations (e.g. http://...) cannot be checked on the local
            # file system, so they are handed to read_file as they are.
            is_remote = "://" in fileString
            if is_remote or os.path.isfile(fileString):
                frames.append(read_file(fileString,val))
            else:
                print("Error: "+val+" not found!")
        if not frames:
            # pd.concat would fail with a far less helpful message
            raise ValueError("No input files found for sample group '"+s+"' under "+tuple_path)
        data[s] = pd.concat(frames)

    return data

In [None]:
def calc_weight(mcWeight,scaleFactor_PILEUP,scaleFactor_ELE,
                scaleFactor_MUON, scaleFactor_TRIGGER):
    """Return the per-event weight: the generator weight times all scale factors."""
    event_weight = mcWeight
    # multiply in the same left-to-right order as a single chained product
    for scale_factor in (scaleFactor_PILEUP, scaleFactor_ELE,
                         scaleFactor_MUON, scaleFactor_TRIGGER):
        event_weight = event_weight*scale_factor
    return event_weight

In [None]:
def get_xsec_weight(totalWeight,sample):
    """Scale an event weight to the cross-section and luminosity of its sample.

    The normalisation is lumi * xsec / (sumw * red_eff), with xsec, sumw and
    red_eff taken from the ``infofile.infos`` entry of ``sample``.
    """
    sample_info = infofile.infos[sample]
    normalisation = (lumi*sample_info["xsec"])/(sample_info["sumw"]*sample_info["red_eff"])
    return normalisation*totalWeight

In [None]:
def calc_mll(lep_pts,lep_etas,lep_phis):
    """Return the invariant mass of the first two leptons, divided by 1000.

    Uses m^2 = 2 * pt0 * pt1 * (cosh(eta0 - eta1) - cos(phi0 - phi1)).
    The division by 1000 converts the unit of the transverse momenta
    (MeV according to the cut comments in this notebook) to GeV.
    """
    delta_eta = lep_etas[0]-lep_etas[1]
    delta_phi = lep_phis[0]-lep_phis[1]
    mass_squared = 2*lep_pts[0]*lep_pts[1]*( math.cosh(delta_eta) - math.cos(delta_phi) )
    return math.sqrt(mass_squared)/1000.

## Changing an already uncommented cut

If you change a cut: Cell -> Run All Below

A cut defined in the next cell only takes effect if the corresponding lines in `read_file` (the cell further below) are uncommented as well, so keep the two cells in sync.

In [None]:
# cut on number of leptons
def cut_n_lep(lep_n):
    """Return True (event is thrown away) unless there are exactly 2 leptons."""
    # "!=" means "not equal to"
    required_leptons = 2
    return lep_n != required_leptons

# cut on lepton charge
def cut_lep_charge(lep_charge):
    """Return True (event is thrown away) if the two lepton charges do not sum to 0."""
    # first lepton is [0], 2nd lepton is [1]
    total_charge = lep_charge[0] + lep_charge[1]
    return total_charge != 0

# cut on lepton type
def cut_lep_type(lep_type):
    """Return True (event is thrown away) unless both leptons are electrons."""
    # lep_type is 11 for an electron and 13 for a muon
    electron = 11
    return lep_type[0] != electron or lep_type[1] != electron

# cut on lepton pt
def cut_lep_pt(lep_pt):
    """Return True (event is thrown away) if either lepton has lep_pt below 25000 MeV."""
    pt_threshold = 25000
    return (lep_pt[0] < pt_threshold) or (lep_pt[1] < pt_threshold)

# cut on invariant mass of Z boson candidate
def cut_mll(mll):
    """Return True (event is thrown away) if mll lies outside 71.12 <= mll <= 111.12.

    The window is 91.12 +/- 20; mll is expected in GeV, as returned by calc_mll.
    """
    window_low = 71.12
    window_high = 111.12
    below_window = mll < window_low
    above_window = mll > window_high
    return below_window or above_window

## Uncommenting a new cut
If you add a cut: Cell -> Run All Below

In [None]:
def read_file(path,sample):
    """Read one input file, apply the event selection and return the surviving events.

    The "mini" tree of the file is processed in chunks. For simulated samples
    (every sample whose name does not contain 'Data') a 'totalWeight' column is
    computed with calc_weight and get_xsec_weight. Events failing the active
    cuts are dropped and the dilepton invariant mass is stored in column 'mll'.

    Parameters
    ----------
    path : str
        Location of the ROOT file.
    sample : str
        Sample name, used for the progress printout and the weight lookup.

    Returns
    -------
    pandas.DataFrame
        All events that pass the selection (empty if no chunk was read).
    """
    start = time.time()
    print("\tProcessing: "+sample)
    # Filtered chunks are collected here and concatenated once at the end;
    # growing a DataFrame with DataFrame.append inside the loop copies all
    # previous rows on every step, and DataFrame.append no longer exists in pandas 2.
    frames = []
    mc = uproot.open(path)["mini"]
    numevents = uproot.numentries(path, "mini")
    for data in mc.iterate(["lep_n","lep_pt","lep_eta","lep_phi","lep_charge","lep_type", # add more variables here if you make cuts on them
                         "mcWeight","scaleFactor_PILEUP","scaleFactor_ELE","scaleFactor_MUON", 
                         "scaleFactor_TRIGGER"], flatten=False, entrysteps=2500000, outputtype=pd.DataFrame, entrystop=numevents*fraction):

        nIn = len(data.index)

        if 'Data' not in sample:
            data['totalWeight'] = np.vectorize(calc_weight)(data.mcWeight,data.scaleFactor_PILEUP,data.scaleFactor_ELE,data.scaleFactor_MUON,data.scaleFactor_TRIGGER)
            data['totalWeight'] = np.vectorize(get_xsec_weight)(data.totalWeight,sample)

        data.drop(["mcWeight","scaleFactor_PILEUP","scaleFactor_ELE","scaleFactor_MUON","scaleFactor_TRIGGER"], axis=1, inplace=True)

        # NOTE: otypes is given explicitly in the calls below because np.vectorize
        # cannot infer the output type from an empty input and raises an error;
        # a chunk becomes empty as soon as an earlier cut removes all of its events.

        # cut on number of leptons
        fail = data[ np.vectorize(cut_n_lep, otypes=[bool])(data.lep_n) ].index
        data.drop(fail, inplace=True)

        # dataframe contents can be printed at any stage like this
        #print(data)

        # dataframe column can be printed at any stage like this
        #print(data['lep_charge'])

        # dataframe columns can be printed at any stage like this
        #print(data[['lep_charge','lep_type']])

        # cut on lepton charge
        #fail = data[ np.vectorize(cut_lep_charge, otypes=[bool])(data.lep_charge) ].index
        #data.drop(fail, inplace=True)

        # cut on lepton type
        fail = data[ np.vectorize(cut_lep_type, otypes=[bool])(data.lep_type) ].index
        data.drop(fail, inplace=True)

        # cut on lepton pt
        #fail = data[ np.vectorize(cut_lep_pt, otypes=[bool])(data.lep_pt) ].index
        #data.drop(fail, inplace=True)

        # calculation of 2-lepton invariant mass
        data['mll'] = np.vectorize(calc_mll, otypes=[float])(data.lep_pt,data.lep_eta,data.lep_phi)

        # cut on mll
        #fail = data[ np.vectorize(cut_mll, otypes=[bool])(data.mll) ].index
        #data.drop(fail, inplace=True)

        nOut = len(data.index)
        frames.append(data)
        elapsed = time.time() - start
        print("\t\tTime taken: "+str(elapsed)+", nIn: "+str(nIn)+", nOut: "+str(nOut))

    if not frames:
        # no chunk was read: same result as before, an empty DataFrame
        return pd.DataFrame()
    return pd.concat(frames)

In [None]:
def plot_data(data):
    """Plot the dilepton invariant-mass distribution of data and stacked simulation.

    The upper panel shows the data points, the simulated sample groups stacked
    in the order given by ``stack_order`` and a hatched band for the statistical
    uncertainty of the simulation. The lower panel shows the data/MC ratio.

    Parameters
    ----------
    data : dict
        Maps 'data' and every entry of ``stack_order`` to a DataFrame with an
        'mll' column; the simulated ones also need a 'totalWeight' column.
    """

    bin_width = 5
    num_bins = 24
    xrange_min = 0
    bins = [xrange_min + x*bin_width for x in range(num_bins+1) ]
    data_x = [xrange_min+bin_width/2 + x*bin_width for x in range(num_bins) ]
    
    data_mll,_ = np.histogram(data['data'].mll.values, bins=bins)
    data_mll_errors = np.sqrt(data_mll)
    
    mc_mll = []
    mc_weights = []
    mc_colors = []
    mc_labels = []
    mc_mll_tot = np.zeros(len(data_x))
    mc_mll_sumw2 = np.zeros(len(data_x)) # per-bin sum of squared weights

    for s in stack_order:
        sample_mll = data[s].mll.values
        sample_weights = data[s].totalWeight.values
        mc_labels.append(s)
        mc_mll.append(sample_mll)
        mc_colors.append(samples[s]['color'])
        mc_weights.append(sample_weights)
        mc_mll_heights,_ = np.histogram(sample_mll,bins=bins,weights=sample_weights)
        mc_mll_tot = np.add(mc_mll_tot, mc_mll_heights)
        mc_mll_w2,_ = np.histogram(sample_mll,bins=bins,weights=sample_weights**2)
        mc_mll_sumw2 = np.add(mc_mll_sumw2, mc_mll_w2)

    top = np.amax(data_mll)*1.2
    
    # Statistical uncertainty of a weighted histogram: sqrt of the sum of squared
    # weights per bin. (The square root of the summed weights is only correct for
    # unit weights and is undefined for bins with a negative weight sum.)
    mc_mll_err = np.sqrt(mc_mll_sumw2)
    
    plt.axes([0.1,0.3,0.85,0.65]) #(left, bottom, width, height)
    main_axes = plt.gca()
    main_axes.errorbar( x=data_x, y=data_mll, yerr=data_mll_errors, fmt='ko', label='Data')
    main_axes.hist(mc_mll,bins=bins,weights=mc_weights,stacked=True,color=mc_colors, label=mc_labels)
    main_axes.bar(data_x,2*mc_mll_err,bottom=mc_mll_tot-mc_mll_err,alpha=0.5,color='none',hatch="////",width=bin_width,label='Stat. Unc.')
    
    # Create new legend handles but use the colors from the existing ones 
    handles, labels = main_axes.get_legend_handles_labels()
    
    # specify order within legend: data first, then the stack, then the uncertainty band
    legend_order = ['Data'] + list(stack_order) + ['Stat. Unc.']
    handles = [handles[labels.index(entry)] for entry in legend_order]
    labels = legend_order

    main_axes.set_ylim(top=top)
    main_axes.set_xlim(left=xrange_min,right=bins[-1])
    main_axes.xaxis.set_minor_locator(MultipleLocator(5))
    main_axes.tick_params(which='both',direction='in',top=True,labeltop=False,labelbottom=False,right=True,labelright=False)
        
    main_axes.set_ylabel(r'Events / '+str(bin_width)+r' GeV',fontname='sans-serif',horizontalalignment='right',y=1.0,fontsize=11)
    
    lumi_used = str(lumi*fraction/1000)
    plt.text(0.05,0.97,r'$\mathbf{{ATLAS}}$ Open Data',ha="left",va="top",family='sans-serif',transform=main_axes.transAxes,fontsize=13)
    plt.text(0.05,0.90,'for education only',ha="left",va="top",family='sans-serif',transform=main_axes.transAxes,style='italic',fontsize=8)
    # both halves are raw strings so that "\," and "\m" are not read as (invalid) escape sequences
    plt.text(0.05,0.86,r'$\sqrt{s}=8\,\mathrm{TeV},\;\int L\,dt=$'+lumi_used+r'$\,\mathrm{fb}^{-1}$',ha="left",va="top",family='sans-serif',transform=main_axes.transAxes)
    plt.text(0.05,0.79,r'$Z \rightarrow e^+e^-$',ha="left",va="top",family='sans-serif',transform=main_axes.transAxes)

    main_axes.legend(handles=handles, labels=labels, frameon=False, loc=(0.05,0.1))
    
    
    # Data/MC ratio 
    # Bins without a positive MC prediction have no meaningful ratio: they are
    # set to NaN (and therefore not drawn) instead of dividing by zero.
    mc_mll_denominator = np.where(mc_mll_tot > 0, mc_mll_tot, np.nan)
    mc_mll_rel_err = mc_mll_err/mc_mll_denominator

    plt.axes([0.1,0.1,0.85,0.2])
    ratio_axes = plt.gca()
    ratio_axes.errorbar( x=data_x, y=data_mll/mc_mll_denominator, yerr=data_mll_errors/mc_mll_denominator, fmt='ko', label='Data')
    ratio_axes.bar(data_x,2*mc_mll_rel_err,bottom=1-mc_mll_rel_err,alpha=0.5,color='none',
            hatch="////",width=bin_width)
    ratio_axes.plot(bins,np.ones(len(bins)),color='k')
    ratio_axes.set_ylim(bottom=0,top=2.5)
    ratio_axes.set_xlim(left=xrange_min,right=bins[-1])
    ratio_axes.set_yticks([0,1,2])
    ratio_axes.yaxis.set_minor_locator(MultipleLocator(0.2))
    ratio_axes.xaxis.set_minor_locator(MultipleLocator(5))
    ratio_axes.set_ylabel(r'Data/MC',fontname='sans-serif',fontsize=11)
    ratio_axes.set_xlabel(r'$\mathrm{m_{ee}}$ [GeV]',fontname='sans-serif',horizontalalignment='right',x=1.0,fontsize=11)
    ratio_axes.yaxis.set_label_coords(-0.09,0.5)
    ratio_axes.tick_params(which='both',direction='in',top=True,labeltop=False,right=True,labelright=False)
    
    #plt.savefig("plot.pdf")

    return

In [None]:
if __name__ == "__main__":
    # Run the full chain (read all samples, then plot) and report how long it took.
    start = time.time()
    data = get_data_from_files()
    plot_data(data)
    elapsed = time.time() - start
    print("Time taken: {}".format(elapsed))

data:
	Processing: DataEgamma
