# Import packages and define functions

In [None]:
from default_values import *
from ntuples_to_dfs import *
from template_fit import *

import matplotlib.pyplot as plt
import numpy as np
import time
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
# Number of histogram bins shared by every fit variable.
nBins = 40

# Each varinfo entry is (bin edges, purity range, axis label) for one fit variable.
varinfo[nn1] = (np.linspace(0, 1, num=nBins + 1), nn1purity, 'Deep Neural Network Output')
varinfo[lambda2] = (np.linspace(0, 2, num=nBins + 1), lambda2purity, r'$\sigma^2_{long}$')
varinfo[emaxe] = (np.linspace(0, 1, num=nBins + 1), emaxepurity, 'Emax/Ecluster')

# Number of random splits per correction, and the pT ranges scanned further down.
nSamples = 1000
ptranges = [(12, 14), (14, 16), (20, 30)]

def getTrueFrac(fulldatadf, fullgjmcdf, fitvar, ptrange):
    """Template-fit the isolated data in one pT range and return the fit fraction.

    The isolated data are fit with a signal template built from the isolated
    rows of ``fullgjmcdf`` and a background template built from the
    anti-isolated data. The ``fitf`` attribute of the resulting TemplateFit
    is returned.
    """
    edges, _, _ = varinfo[fitvar]

    # Keep only clusters with ptrange[0] < cluster_pt < ptrange[1].
    ptcut = 'cluster_pt>{0} and cluster_pt<{1}'.format(*ptrange)
    ptdatadf = applyCut(fulldatadf, ptcut)
    ptmcdf = applyCut(fullgjmcdf, ptcut)

    # Data to be fit: isolated data.
    datahist, dataerr = getHistAndErr(applyCut(ptdatadf, isocuttext), fitvar, edges)
    # Signal template: isolated MC.
    signalhist, signalerr = getHistAndErr(applyCut(ptmcdf, isocuttext), fitvar, edges)
    # Background template: anti-isolated data.
    bkghist, bkgerr = getHistAndErr(applyCut(ptdatadf, antiisocuttext), fitvar, edges)

    fit = TemplateFit(datahist, dataerr, signalhist, signalerr, bkghist, bkgerr, edges, verbosity=0)
    return fit.fitf


def getNormHistAndErr(df, fitvar):
    """Histogram ``fitvar`` from ``df`` and scale the result to unit sum.

    The bin edges come from ``varinfo[fitvar]``. Both the histogram and its
    errors are divided by the summed histogram contents and returned as a
    ``(hist, err)`` pair.
    """
    edges = varinfo[fitvar][0]
    counts, errors = getHistAndErr(df, fitvar, edges)

    # The same scale factor is applied to the errors so relative errors are kept.
    total = np.sum(counts)
    normedCounts = np.divide(counts, total)
    normedErrors = np.divide(errors, total)
    return normedCounts, normedErrors


def makePseudodataset(signalhist, signalerr, bkghist, bkgerr, signalFrac):
    """Mix a signal and a background histogram into one pseudodata histogram.

    The contents are ``signalFrac * signal + (1 - signalFrac) * bkg``; the
    errors are the two input errors scaled by the same weights and added in
    quadrature. Returns ``(hist, err)``.
    """
    bkgFrac = 1 - signalFrac

    hist = np.add(np.multiply(signalhist, signalFrac), np.multiply(bkghist, bkgFrac))

    # Propagate the per-bin errors: var = (f * sig_err)^2 + ((1 - f) * bkg_err)^2.
    signalVar = np.multiply(np.square(signalerr), signalFrac ** 2)
    bkgVar = np.multiply(np.square(bkgerr), bkgFrac ** 2)
    err = np.sqrt(np.add(signalVar, bkgVar))

    return hist, err


def plotTfComp(fitvar, signalFrac, ptrange):
    """Plot one pseudodata fit with isolated vs anti-isolated background templates.

    Reads the module-level ``fullgjmcdf`` (signal MC) and ``fulljjmcdf``
    (dijet MC). The isolated signal and isolated background are each split
    randomly in half: one half is normalised and mixed at ``signalFrac`` into
    a pseudodataset, the other half provides the fit templates. The
    pseudodataset is then fit twice, once with the isolated-background
    template and once with the anti-isolated-background template, and both
    fits are drawn side by side.

    Parameters:
        fitvar: key into ``varinfo`` naming the variable to fit.
        signalFrac: signal fraction used to build the pseudodataset.
        ptrange: (low, high) pair passed to ``ptcuttext`` and shown on the plot.

    Prints the sample sizes, the true purity and both fit results; returns None.
    """
    # Build the pT cut once; both MC samples use the same selection.
    ptcut = ptcuttext(ptrange)
    signaldf = applyCut(fullgjmcdf, ptcut)
    bkgdf = applyCut(fulljjmcdf, ptcut)

    signalisodf = applyCut(signaldf, isocuttext)
    bkgisodf = applyCut(bkgdf, isocuttext)
    bkgantiisodf = applyCut(bkgdf, antiisocuttext)

    # Random 50/50 split: first half -> pseudodata, second half -> templates.
    samplesignaldf, templatesignaldf = train_test_split(signalisodf, test_size=0.5)
    samplebkgdf, templatebkgdf = train_test_split(bkgisodf, test_size=0.5)

    samplesignalhist, samplesignalerr = getNormHistAndErr(samplesignaldf, fitvar)
    samplebkghist, samplebkgerr = getNormHistAndErr(samplebkgdf, fitvar)
    pseudodatahist, pseudodataerr = makePseudodataset(samplesignalhist, samplesignalerr, samplebkghist, samplebkgerr, signalFrac)

    # Single-argument print(...) behaves the same as the print statement under Python 2.
    print('signal: {0}; iso bkg: {1}; anti-iso bkg: {2}'.format(signalisodf.shape[0], bkgisodf.shape[0], bkgantiisodf.shape[0]))

    binEdges, purityRange, xlabel = varinfo[fitvar]

    signalisohist, signalisoerr = getHistAndErr(templatesignaldf, fitvar, binEdges)
    bkgisohist, bkgisoerr = getHistAndErr(templatebkgdf, fitvar, binEdges)
    bkgantiisohist, bkgantiisoerr = getHistAndErr(bkgantiisodf, fitvar, binEdges)

    isotf = TemplateFit(pseudodatahist, pseudodataerr, signalisohist, signalisoerr, bkgisohist, bkgisoerr, binEdges, verbosity=0)
    antiisotf = TemplateFit(pseudodatahist, pseudodataerr, signalisohist, signalisoerr, bkgantiisohist, bkgantiisoerr, binEdges, verbosity=0)
    truePurity = getPurity(samplesignalhist, samplebkghist, binEdges, signalFrac, *purityRange)

    # Evaluate each fit's purity once and reuse it for the printout and the plot text.
    isoPurity = isotf.getPurity(*purityRange)[0]
    antiisoPurity = antiisotf.getPurity(*purityRange)[0]

    print('{0}: true purity = {1}'.format(fitvar, truePurity))
    print('Iso: fit fraction = {0}, purity = {1}'.format(isotf.fitf, isoPurity))
    print('Anti-iso: fit fraction = {0}, purity = {1}'.format(antiisotf.fitf, antiisoPurity))

    ptText = '{0} < pT < {1}'.format(*ptrange)

    plt.figure(figsize=(25, 5))
    plt.subplot(121)
    isotf.plotFit(xlabel, dataLabel='Pseudodata', signalLabel='Signal (MC)', bkgLabel='Bkg (isolated dijet MC)', texts=['Purity = {0:2.2f}'.format(isoPurity), ptText])
    plt.ylabel('Arb. units')
    plt.subplot(122)
    antiisotf.plotFit(xlabel, dataLabel='Pseudodata', signalLabel='Signal (MC)', bkgLabel='Bkg (anti-isolated dijet MC)', texts=['Purity = {0:2.2f}'.format(antiisoPurity), ptText])
    plt.ylabel('Arb. units')
    plt.show()


def calculatePurityCorrection(fullgjmcdf, fulljjmcdf, fitvar, signalFrac, ptrange, nSamples):
    """Estimate the purity shift caused by using an anti-isolated background template.

    For each of ``nSamples`` iterations the isolated signal MC and isolated
    dijet MC are split randomly in half. One half is normalised and mixed at
    ``signalFrac`` into a pseudodataset; the other half provides the signal
    and isolated-background templates. The pseudodataset is fit once with the
    isolated-background template and once with the anti-isolated-background
    template, and the purities of both fits plus the true purity are recorded.

    Parameters:
        fullgjmcdf: signal MC dataframe.
        fulljjmcdf: dijet (background) MC dataframe.
        fitvar: key into ``varinfo`` naming the variable to fit.
        signalFrac: signal fraction used to build each pseudodataset.
        ptrange: (low, high) pair passed to ``ptcuttext``.
        nSamples: number of random splits to run.

    Prints the sample sizes, the mean and standard deviation of
    (iso purity - anti-iso purity), and the elapsed time.

    Returns:
        (truePurities, isoPurities, antiisoPurities): three lists with one
        entry per iteration.
    """
    start = time.time()

    # Build the pT cut once; both MC samples use the same selection.
    ptcut = ptcuttext(ptrange)
    signaldf = applyCut(fullgjmcdf, ptcut)
    bkgdf = applyCut(fulljjmcdf, ptcut)

    signalisodf = applyCut(signaldf, isocuttext)
    bkgisodf = applyCut(bkgdf, isocuttext)
    bkgantiisodf = applyCut(bkgdf, antiisocuttext)

    # Single-argument print(...) behaves the same as the print statement under Python 2.
    print('signal: {0}; iso bkg: {1}; anti-iso bkg: {2}'.format(signalisodf.shape[0], bkgisodf.shape[0], bkgantiisodf.shape[0]))

    binEdges, purityRange, _ = varinfo[fitvar]

    # The anti-isolated template does not depend on the random split, so it is
    # histogrammed once instead of once per iteration. Sharing template arrays
    # between TemplateFit instances is already done for the signal template below.
    bkgantiisohist, bkgantiisoerr = getHistAndErr(bkgantiisodf, fitvar, binEdges)

    truePurities = []
    isoPurities = []
    antiisoPurities = []

    for _ in range(nSamples):
        # Random 50/50 split: first half -> pseudodata, second half -> templates.
        samplesignaldf, templatesignaldf = train_test_split(signalisodf, test_size=0.5)
        samplebkgdf, templatebkgdf = train_test_split(bkgisodf, test_size=0.5)

        samplesignalhist, samplesignalerr = getNormHistAndErr(samplesignaldf, fitvar)
        samplebkghist, samplebkgerr = getNormHistAndErr(samplebkgdf, fitvar)
        pseudodatahist, pseudodataerr = makePseudodataset(samplesignalhist, samplesignalerr, samplebkghist, samplebkgerr, signalFrac)

        signalisohist, signalisoerr = getHistAndErr(templatesignaldf, fitvar, binEdges)
        bkgisohist, bkgisoerr = getHistAndErr(templatebkgdf, fitvar, binEdges)

        isotf = TemplateFit(pseudodatahist, pseudodataerr, signalisohist, signalisoerr, bkgisohist, bkgisoerr, binEdges, verbosity=0)
        antiisotf = TemplateFit(pseudodatahist, pseudodataerr, signalisohist, signalisoerr, bkgantiisohist, bkgantiisoerr, binEdges, verbosity=0)
        truePurity = getPurity(samplesignalhist, samplebkghist, binEdges, signalFrac, *purityRange)

        truePurities.append(truePurity)
        isoPurities.append(isotf.getPurity(*purityRange)[0])
        antiisoPurities.append(antiisotf.getPurity(*purityRange)[0])

    # The correction is the average difference between the two fit purities.
    purityDiffs = np.subtract(isoPurities, antiisoPurities)
    print('Correction = {0} +/- {1}'.format(np.mean(purityDiffs), np.std(purityDiffs)))
    end = time.time()
    print('{0} samples in {1} seconds'.format(nSamples, end - start))

    return truePurities, isoPurities, antiisoPurities

def calculateCorrectionsInBins(ptranges, nSamples, var, showPlot=False):
    """Run the purity-correction study for ``var`` in each of the given pT ranges.

    For every range the signal fraction is first taken from a fit to data
    (``getTrueFrac``), optionally an example fit comparison is drawn, and
    then ``calculatePurityCorrection`` is run with ``nSamples`` splits.
    Uses the module-level ``fulldatadf``, ``fullgjmcdf`` and ``fulljjmcdf``.
    Results are only printed; nothing is returned.
    """
    banner = 'system: pp; variable: {0}; ptrange: {1} < pT < {2}'
    for ptrange in ptranges:
        print(banner.format(var, *ptrange))
        frac = getTrueFrac(fulldatadf, fullgjmcdf, var, ptrange)
        if showPlot:
            plotTfComp(var, frac, ptrange)
        calculatePurityCorrection(fullgjmcdf, fulljjmcdf, var, frac, ptrange, nSamples)

# Process ntuples

In [None]:
def _loadWithFullPtCut(files):
    """Read the given ntuple files from ntupledir and apply the fullpt cuts."""
    return applyCuts(getData(files, ntupledir), fullpt)


# Data, signal MC and dijet MC, all reduced with the same cuts.
fulldatadf = _loadWithFullPtCut(ppdataFiles)
fullgjmcdf = _loadWithFullPtCut(ppgjmcFiles)
fulljjmcdf = _loadWithFullPtCut(ppjjmcFiles)

# Plot Dijet MC shower shape distributions for iso vs anti-iso

In [None]:
# Split the dijet MC into its isolated and anti-isolated parts.
bkgisodf = applyCut(fulljjmcdf, isocuttext)
bkgantiisodf = applyCut(fulljjmcdf, antiisocuttext)

fig = plt.figure(figsize=(15, 5))
for (i, fitvar) in enumerate([emaxe, lambda2, nn1]):
    binEdges, _, xlabel = varinfo[fitvar]
    plt.subplot(1, 3, i + 1)
    # Overlay the weighted, normalised shapes of both selections in the same panel.
    # NOTE(review): `normed` was deprecated in favour of `density` in newer
    # matplotlib releases -- confirm against the installed version before upgrading.
    for sampledf, samplelabel in ((bkgisodf, 'Dijet MC, isolated'), (bkgantiisodf, 'Dijet MC, anti-iso')):
        plt.hist(sampledf[fitvar], bins=binEdges, weights=sampledf['weights'], normed=True, histtype='step', label=samplelabel)
    # lambda2 gets an upright, larger label instead of the one stored in varinfo.
    if fitvar != lambda2:
        plt.xlabel(xlabel)
    else:
        plt.xlabel(r'$\mathrm{\sigma^2_{long}}$', fontsize=14)
    plt.ylabel('Normalized counts')
    plt.legend(loc=0, fontsize=8, frameon=False, framealpha=0.9)

plt.suptitle('18g7a Dijet MC, 12 < $p_T$ < 30 GeV/c')
plt.show()

# Calculate purity corrections in full pT range

In [ ]:
# Purity correction for the neural-network output over the full 12-30 pT range.
calculateCorrectionsInBins([(12, 30)], nSamples, nn1, showPlot=False)

system: pp; variable: cluster_NN1; ptrange: 12 < pT < 30
signal: 240720; iso bkg: 30045; anti-iso bkg: 29514
Correction = -0.0975778475404 +/- 0.0114870276302
1000 samples in 4402.1679492 seconds


In [ ]:
# Purity correction for lambda2 over the full 12-30 pT range.
calculateCorrectionsInBins([(12, 30)], nSamples, lambda2, showPlot=False)

system: pp; variable: cluster_Lambda; ptrange: 12 < pT < 30
signal: 240720; iso bkg: 30045; anti-iso bkg: 29514
Correction = -0.0744957849383 +/- 0.0141493547708
1000 samples in 4461.46616197 seconds


In [ ]:
# Purity correction for Emax/Ecluster over the full 12-30 pT range.
calculateCorrectionsInBins([(12, 30)], nSamples, emaxe, showPlot=False)

system: pp; variable: cluster_emax_over_e; ptrange: 12 < pT < 30
signal: 240720; iso bkg: 30045; anti-iso bkg: 29514
Correction = -0.0602083876729 +/- 0.0126861836761
1000 samples in 5924.55793905 seconds


## Check for pT dependence of purity corrections

In [ ]:
# Repeat the neural-network correction separately in each pT range of ptranges.
calculateCorrectionsInBins(ptranges, nSamples, nn1, showPlot=False)

system: pp; variable: cluster_NN1; ptrange: 12 < pT < 14
signal: 45688; iso bkg: 13406; anti-iso bkg: 11262
Correction = -0.0924268960953 +/- 0.0161132961512
1000 samples in 1041.59933901 seconds
system: pp; variable: cluster_NN1; ptrange: 14 < pT < 16
signal: 37493; iso bkg: 6781; anti-iso bkg: 6726
Correction = -0.0891616642475 +/- 0.0296562556177
1000 samples in 731.023452997 seconds
system: pp; variable: cluster_NN1; ptrange: 20 < pT < 30
signal: 101485; iso bkg: 3886; anti-iso bkg: 4901
Correction = -0.108597576618 +/- 0.0729878768325
1000 samples in 1861.52579498 seconds


In [None]:
# Repeat the lambda2 correction separately in each pT range of ptranges.
calculateCorrectionsInBins(ptranges, nSamples, lambda2, showPlot=False)

system: pp; variable: cluster_Lambda; ptrange: 12 < pT < 14
signal: 45688; iso bkg: 13406; anti-iso bkg: 11262
Correction = -0.0425683408976 +/- 0.0220940820873
1000 samples in 969.767201185 seconds
system: pp; variable: cluster_Lambda; ptrange: 14 < pT < 16
signal: 37493; iso bkg: 6781; anti-iso bkg: 6726
Correction = -0.0538754761219 +/- 0.0511893630028
1000 samples in 975.478887081 seconds
system: pp; variable: cluster_Lambda; ptrange: 20 < pT < 30
signal: 101485; iso bkg: 3886; anti-iso bkg: 4901
Correction = -0.0790451541543 +/- 0.0304083526134
1000 samples in 2529.05548692 seconds


In [ ]:
# Repeat the Emax/Ecluster correction separately in each pT range of ptranges.
calculateCorrectionsInBins(ptranges, nSamples, emaxe, showPlot=False)

system: pp; variable: cluster_emax_over_e; ptrange: 12 < pT < 14
signal: 45688; iso bkg: 13406; anti-iso bkg: 11262
Correction = -0.0257553588599 +/- 0.0194715820253
1000 samples in 968.588465929 seconds
system: pp; variable: cluster_emax_over_e; ptrange: 14 < pT < 16
signal: 37493; iso bkg: 6781; anti-iso bkg: 6726
Correction = -0.0311748199165 +/- 0.0253843497485
1000 samples in 707.783909082 seconds
system: pp; variable: cluster_emax_over_e; ptrange: 20 < pT < 30
signal: 101485; iso bkg: 3886; anti-iso bkg: 4901
Correction = -0.104386046529 +/- 0.0350959859788
1000 samples in 1694.56827617 seconds


# Plot examples of fits with iso vs anti-iso background

In [None]:
def makeTfCompFigure(fitvar, ptrange, subplot):
    """Draw an iso vs anti-iso background fit comparison into two adjacent subplots.

    The signal fraction is taken from a fit to data (``getTrueFrac``). A
    pseudodataset is then built from a random half of the isolated signal and
    isolated dijet MC and fit twice: with the isolated-background template
    (drawn at ``subplot``) and with the anti-isolated-background template
    (drawn at ``subplot + 1``). Uses the module-level ``fulldatadf``,
    ``fullgjmcdf`` and ``fulljjmcdf`` and draws into the current figure; the
    caller creates the figure and calls ``plt.show()``.

    Parameters:
        fitvar: key into ``varinfo`` naming the variable to fit.
        ptrange: (low, high) pair passed to ``ptcuttext`` and shown on the plot.
        subplot: integer subplot code (e.g. 321) of the left-hand panel; it
            must be an integer because the right-hand panel is ``subplot + 1``.
    """
    signalFrac = getTrueFrac(fulldatadf, fullgjmcdf, fitvar, ptrange)

    # Build the pT cut once; both MC samples use the same selection.
    ptcut = ptcuttext(ptrange)
    signaldf = applyCut(fullgjmcdf, ptcut)
    bkgdf = applyCut(fulljjmcdf, ptcut)

    signalisodf = applyCut(signaldf, isocuttext)
    bkgisodf = applyCut(bkgdf, isocuttext)
    bkgantiisodf = applyCut(bkgdf, antiisocuttext)

    # Random 50/50 split: first half -> pseudodata, second half -> templates.
    samplesignaldf, templatesignaldf = train_test_split(signalisodf, test_size=0.5)
    samplebkgdf, templatebkgdf = train_test_split(bkgisodf, test_size=0.5)

    samplesignalhist, samplesignalerr = getNormHistAndErr(samplesignaldf, fitvar)
    samplebkghist, samplebkgerr = getNormHistAndErr(samplebkgdf, fitvar)
    pseudodatahist, pseudodataerr = makePseudodataset(samplesignalhist, samplesignalerr, samplebkghist, samplebkgerr, signalFrac)

    binEdges, purityRange, xlabel = varinfo[fitvar]

    signalisohist, signalisoerr = getHistAndErr(templatesignaldf, fitvar, binEdges)
    bkgisohist, bkgisoerr = getHistAndErr(templatebkgdf, fitvar, binEdges)
    bkgantiisohist, bkgantiisoerr = getHistAndErr(bkgantiisodf, fitvar, binEdges)

    isotf = TemplateFit(pseudodatahist, pseudodataerr, signalisohist, signalisoerr, bkgisohist, bkgisoerr, binEdges, verbosity=0)
    antiisotf = TemplateFit(pseudodatahist, pseudodataerr, signalisohist, signalisoerr, bkgantiisohist, bkgantiisoerr, binEdges, verbosity=0)

    ptText = '{0} < pT < {1} GeV/c'.format(*ptrange)

    # Left panel: isolated-background fit; right panel: anti-isolated-background fit.
    panels = (
        (subplot, isotf, 'Bkg (isolated dijet MC)'),
        (subplot + 1, antiisotf, 'Bkg (anti-isolated dijet MC)'),
    )
    for position, tf, bkgLabel in panels:
        plt.subplot(position)
        purity = tf.getPurity(*purityRange)[0]
        # A fresh options dict is built per call so the two panels never share state.
        tf.plotFit(xlabel, dataLabel='Pseudodata', signalLabel='Signal (MC)', bkgLabel=bkgLabel,
                   texts=['Purity = {0:2.2f}'.format(purity), ptText],
                   legendoptions={'fontsize': 8, 'framealpha': 0.9, 'frameon': False})
        plt.ylabel('Arb. units')

fig = plt.figure(figsize=(15, 15))
# One row per example in a 3x2 grid; each call fills the given panel and the next one.
for exampleVar, examplePt, firstPanel in ((nn1, (20, 30), 321), (lambda2, (12, 14), 323), (emaxe, (14, 16), 325)):
    makeTfCompFigure(exampleVar, examplePt, firstPanel)
plt.show()

In [None]:
# Draw the iso vs anti-iso fit comparison for every variable in every pT range.
bannerFormat = 'system: pp; variable: {0}; ptrange: {1} < pT < {2}'
for var in (nn1, lambda2, emaxe):
    for ptrange in ptranges:
        print(bannerFormat.format(var, *ptrange))
        # Signal fraction from the fit to data, used to build the pseudodataset.
        frac = getTrueFrac(fulldatadf, fullgjmcdf, var, ptrange)
        plotTfComp(var, frac, ptrange)

plt.show()