# Import packages

In [None]:
from default_values import *
from ntuples_to_dfs import *
from template_fit import *

import matplotlib.pyplot as plt
import numpy as np
import datetime
import time
from sklearn.model_selection import train_test_split

%matplotlib inline

# Get data

In [None]:
# Load the three pp samples and apply the `fullpt` cuts to each of them.
# NOTE(review): getData/applyCuts/applyCut, the *Files lists, ntupledir, fullpt and the
# iso/anti-iso cut strings are not defined in this notebook; presumably they come from
# the star imports above (default_values / ntuples_to_dfs) -- confirm.
# Naming: "gjmc"/"jjmc" are presumably the gamma-jet and jet-jet Monte Carlo samples.
ppfulldatadf = applyCuts(getData(ppdataFiles, ntupledir), fullpt)
ppfullgjmcdf = applyCuts(getData(ppgjmcFiles, ntupledir), fullpt)
ppfulljjmcdf = applyCuts(getData(ppjjmcFiles, ntupledir), fullpt)

# Sub-selections of the frames above: data passing `isocuttext`, data passing
# `antiisocuttext`, and gamma-jet MC passing `isocuttext`.
ppisodatadf = applyCut(ppfulldatadf, isocuttext)
ppantiisodatadf = applyCut(ppfulldatadf, antiisocuttext)
ppisogjmcdf = applyCut(ppfullgjmcdf, isocuttext)

In [None]:
# Same loading sequence as the pp cell, for the "ppb" file lists (presumably p-Pb).
# NOTE(review): helpers and constants used here are not defined in this notebook;
# presumably they come from the star imports -- confirm.
ppbfulldatadf = applyCuts(getData(ppbdataFiles, ntupledir), fullpt)
ppbfullgjmcdf = applyCuts(getData(ppbgjmcFiles, ntupledir), fullpt)
ppbfulljjmcdf = applyCuts(getData(ppbjjmcFiles, ntupledir), fullpt)

# Sub-selections: data passing `isocuttext`, data passing `antiisocuttext`,
# and gamma-jet MC passing `isocuttext`.
ppbisodatadf = applyCut(ppbfulldatadf, isocuttext)
ppbantiisodatadf = applyCut(ppbfulldatadf, antiisocuttext)
ppbisogjmcdf = applyCut(ppbfullgjmcdf, isocuttext)

# Define functions

## Purity vs pT

In [None]:
def purityVsPt(isodatadf, antiisodatadf, isogjmcdf, var, ptranges=None):
    """Run a template fit of `var` in each pT bin and print chi2/dof and the purity.

    Parameters:
        isodatadf: data frame used for the fitted (isolated data) distribution.
        antiisodatadf: data frame used for the anti-isolated template.
        isogjmcdf: MC data frame used for the signal template.
        var: key into `varinfo`, also the column that is histogrammed.
        ptranges: iterable of (low, high) pT bins; a built-in list is used when
            this is None or empty.

    Returns nothing; results are printed (the purity by `tf.getPurity(..., verbosity=1)`).

    Relies on the module-level names `system` and `varinfo`, which are not defined in
    this cell (presumably set elsewhere in the notebook or in the star imports).
    """
    print('Purities calculated using {0} in {1}'.format(var, system))
    if not ptranges:
        ptranges = [(12.0, 13.0), (13.0, 14.0), (14.0, 16.0), (16.0, 18.0), (18.0, 20.0), (20.0, 25.0), (25.0, 30.0), (30.0, 40.0)]

    # Binning and purity window depend only on the variable, so look them up once
    # instead of once per pT bin.
    binEdges, purityRange, _ = varinfo[var]

    for ptrange in ptranges:
        ptcut = ptcuttext(ptrange)

        isodist, isoerr = getHistAndErr(applyCut(isodatadf, ptcut), var, binEdges)
        antiisodist, antiisoerr = getHistAndErr(applyCut(antiisodatadf, ptcut), var, binEdges)
        mcdist, mcerr = getHistAndErr(applyCut(isogjmcdf, ptcut), var, binEdges)

        # A fit is only attempted when all three distributions have some content;
        # say so explicitly rather than dropping the bin without a trace.
        if not (max(isodist) > 0 and max(antiisodist) > 0 and max(mcdist) > 0):
            print('{0} < pT < {1}; skipped (empty distribution)'.format(*ptrange))
            continue

        tf = TemplateFit(isodist, isoerr, mcdist, mcerr, antiisodist, antiisoerr, binEdges)
        print('{1} < pT < {2}; chi2/dof = {0}'.format(tf.chi2/tf.dof, *ptrange))
        tf.getPurity(*purityRange, verbosity=1)

## Signal template uncertainty

In [None]:
def checkBkgOnly(isodatadf, antiisodatadf, isogjmcdf, var, ptrange):
    """Compare a full template fit with a background-only fit in one pT bin.

    Histograms `var` for the isolated data, anti-isolated data and MC frames after
    the pT cut, then runs `TemplateFit` followed by `BackgroundFit` and lets each
    report its purity (`getPurity(..., verbosity=1)`). Nothing is returned.

    NOTE(review): binning and purity window are read from `varbinEdges` /
    `varpurityRange` here, while sibling functions read them from `varinfo`;
    none of these tables is defined in this cell -- confirm they agree.
    """
    edges = varbinEdges[var]
    purityWindow = varpurityRange[var]
    sideband = varbkgRange[var]
    ptcut = ptcuttext(ptrange)

    def histogramOf(df):
        # Histogram and error of `var` for one frame, restricted to this pT bin
        return getHistAndErr(applyCut(df, ptcut), var, edges)

    datahist, dataerr = histogramOf(isodatadf)
    bkghist, bkgerr = histogramOf(antiisodatadf)
    sighist, sigerr = histogramOf(isogjmcdf)

    # Signal + background template fit
    templatefit = TemplateFit(datahist, dataerr, sighist, sigerr, bkghist, bkgerr, edges)
    print('Template fit in {0} with {1} for {2} < pT < {3}'.format(system, var, *ptrange))
    templatefit.getPurity(*purityWindow, verbosity=1)

    # Fit using the anti-isolated template only, over `sideband`
    bkgfit = BackgroundFit(datahist, dataerr, bkghist, bkgerr, edges, sideband)
    print('Background-only fit in {0} with {1} for {2} < pT < {3}'.format(system, var, *ptrange))
    bkgfit.getPurity(*purityWindow, verbosity=1)


def fitSmearResults(purities, errors):
    """Fit a single constant to a set of purities with asymmetric errors.

    Parameters:
        purities: sequence of purity values.
        errors: sequence of 2-tuples, one per purity. The first entry is used when
            the trial value lies above the purity, the second otherwise.

    Returns:
        (purity, purityerr): fitted value and its error as reported by Minuit.
    """
    aboveErrs = [pair[0] for pair in errors]
    belowErrs = [pair[1] for pair in errors]

    # The argument must stay named `p`: the Minuit call below addresses it by
    # that name (`p=`, `error_p=`).
    def Chi2(p):
        # Pick, point by point, the error on the side where the trial value sits
        sigmas = np.where(np.less(purities, p), aboveErrs, belowErrs)
        pulls = np.divide(np.subtract(purities, p), sigmas)
        return np.sum(np.power(pulls, 2.0))

    minimizer = iminuit.Minuit(Chi2, p=np.mean(purities), error_p=1, errordef=1, print_level=0)
    minimizer.migrad()

    return minimizer.values['p'], minimizer.errors['p']


def _scaledPurityErrs(purity, purhigh, purlow, chi2dof):
    """Return (upper, lower) purity errors, multiplied by chi2dof when it exceeds 1."""
    scale = chi2dof if chi2dof > 1 else 1
    return ((purhigh - purity) * scale, (purity - purlow) * scale)


def _errorbarYerr(errs):
    """Turn [(upper, lower), ...] into the [lower errors, upper errors] layout of plt.errorbar."""
    return [[lower for _, lower in errs], [upper for upper, _ in errs]]


def smearShowerShape(isodatadf, antiisodatadf, isogjmcdf, var, ptrange, widths):
    """Study how the fitted purity changes when the MC signal template is smeared.

    For every entry of `widths`, the MC values of `var` are multiplied by random
    numbers drawn from a normal distribution with mean 1 and that width, a template
    fit is run with the smeared template, and the purity and chi2/dof (full range
    and purity window) are recorded. The purities are then combined with
    `fitSmearResults`, once per chi2 flavour, and four PDFs are written to `plots/`.

    Parameters:
        isodatadf, antiisodatadf, isogjmcdf: frames for the fitted data, the
            background template and the signal MC.
        var: key into `varinfo` and column to histogram.
        ptrange: (low, high) pT bin.
        widths: sequence of smearing widths (must not be empty).

    Returns nothing. Uses the global NumPy random state, and the module-level
    names `system` and `varinfo` (not defined in this cell).

    NOTE(review): `tf.getPurity` is unpacked as (purity, _, _, purhigh, purlow);
    the upper/lower meaning is taken from these names -- confirm against TemplateFit.
    """
    print('Smearing {0} in {1}, {2} < pT < {3}'.format(var, system, *ptrange))
    binEdges, purityRange, _ = varinfo[var]
    ptcut = ptcuttext(ptrange)
    fig = plt.figure()

    truemcdf = applyCut(isogjmcdf, ptcut)
    mcweights = truemcdf['weights']
    isodist, isoerr = getHistAndErr(applyCut(isodatadf, ptcut), var, binEdges)
    antiisodist, antiisoerr = getHistAndErr(applyCut(antiisodatadf, ptcut), var, binEdges)

    purities = []
    chi2dofs = []
    peakchi2s = []
    fullerrs = []
    peakerrs = []

    for width in widths:
        smearedmcdist = np.multiply(truemcdf[var], np.random.normal(1, width, truemcdf.shape[0]))
        mchist = np.histogram(smearedmcdist, binEdges, weights=mcweights)[0]
        # Per-bin error = sqrt(sum of squared weights). Histogramming w^2 uses exactly
        # the same bin assignment as mchist; the previous hand-written loop included
        # both edges of every bin and so double-counted entries on interior edges.
        mcerr = np.sqrt(np.histogram(smearedmcdist, binEdges, weights=np.square(mcweights))[0])

        tf = TemplateFit(isodist, isoerr, mchist, mcerr, antiisodist, antiisoerr, binEdges)
        purity, _, _, purhigh, purlow = tf.getPurity(*purityRange)
        fullchi2 = tf.chi2 / tf.dof
        peakchi2 = tf.getChi2DofInRange(*purityRange)

        purities.append(purity)
        chi2dofs.append(fullchi2)
        peakchi2s.append(peakchi2)
        fullerrs.append(_scaledPurityErrs(purity, purhigh, purlow, fullchi2))
        peakerrs.append(_scaledPurityErrs(purity, purhigh, purlow, peakchi2))

        print('width = {0}, purity = {1}, fullchi2 = {2}, peakchi2 = {3}'.format(width, purity, fullchi2, peakchi2))

        # NOTE(review): `normed` was removed from newer matplotlib releases in favour
        # of `density`; kept as-is because the installed version is not known here.
        plt.hist(smearedmcdist, binEdges, weights=mcweights, histtype='step', normed=True)

    fig.savefig('plots/smeared-templates-{0}-{1}-{2}-{3}.pdf'.format(system, var, *ptrange))

    fullp, fullperr = fitSmearResults(purities, fullerrs)
    peakp, peakperr = fitSmearResults(purities, peakerrs)

    print('Fit with full chi2: {0} +/- {1}; fit with peak chi2: {2} +/- {3}'.format(fullp, fullperr, peakp, peakperr))

    # plt.errorbar wants yerr as [lower errors, upper errors]; the stored tuples are
    # (upper, lower), so a plain zip(*errs) would draw the bars flipped.
    peakyerr = _errorbarYerr(peakerrs)
    fullyerr = _errorbarYerr(fullerrs)
    xlimits = [min(widths)-0.01, max(widths)+0.01]

    # Purity vs width, with the fitted bands
    fig = plt.figure()
    plt.errorbar(widths, purities, yerr=peakyerr, fmt='ro', label='peak chi2')
    plt.errorbar(widths, purities, yerr=fullyerr, fmt='bo', label='full chi2')
    plt.axhspan(fullp - fullperr, fullp + fullperr, facecolor='b', alpha=0.2)
    plt.axhspan(peakp - peakperr, peakp + peakperr, facecolor='r', alpha=0.2)
    plt.xlabel('smearing width')
    plt.ylabel('purity')
    plt.xlim(xlimits)
    plt.legend(numpoints=1, loc=3)
    fig.savefig('plots/smeared-purities-{0}-{1}-{2}-{3}.pdf'.format(system, var, *ptrange))

    # Same plot without the bands
    fig = plt.figure()
    plt.errorbar(widths, purities, yerr=peakyerr, fmt='ro', label='peak chi2')
    plt.errorbar(widths, purities, yerr=fullyerr, fmt='bo', label='full chi2')
    plt.xlabel('smearing width')
    plt.ylabel('purity')
    plt.xlim(xlimits)
    plt.legend(numpoints=1, loc=3)
    fig.savefig('plots/smeared-purities-{0}-{1}-{2}-{3}-noband.pdf'.format(system, var, *ptrange))

    # chi2/dof vs width
    fig = plt.figure()
    plt.plot(widths, peakchi2s, 'ro', label='peak chi2')
    plt.plot(widths, chi2dofs, 'bo', label='full chi2')
    plt.xlabel('smearing width')
    plt.ylabel(r'$\chi^2$/dof')
    plt.xlim(xlimits)
    plt.legend(numpoints=1, loc=2)
    fig.savefig('plots/smeared-chi2s-{0}-{1}-{2}-{3}.pdf'.format(system, var, *ptrange))

    plt.show()

## Correction factor

In [None]:
def getTrueFrac(fulldatadf, fullgjmcdf, fitvar, ptrange):
    """Return the fitted fraction (`fitf`) of a template fit to data in one pT bin.

    The fitted distribution is data passing `isocuttext`, the signal template is
    MC passing `isocuttext`, and the background template is data passing
    `antiisocuttext`, all restricted to `ptrange` and histogrammed in `fitvar`.
    """
    edges, _, _ = varinfo[fitvar]
    ptcut = ptcuttext(ptrange)

    # Restrict both inputs to the pT bin before applying the isolation selections
    ptdata = applyCut(fulldatadf, ptcut)
    ptmc = applyCut(fullgjmcdf, ptcut)

    selections = (
        applyCut(ptdata, isocuttext),      # fitted data
        applyCut(ptmc, isocuttext),        # signal template
        applyCut(ptdata, antiisocuttext),  # background template
    )
    (datahist, dataerr), (sighist, sigerr), (bkghist, bkgerr) = [
        getHistAndErr(df, fitvar, edges) for df in selections
    ]

    fit = TemplateFit(datahist, dataerr, sighist, sigerr, bkghist, bkgerr, edges, verbosity=0)
    return fit.fitf


def makePseudodataset(signalhist, signalerr, bkghist, bkgerr, signalFrac):
    """Mix a signal and a background histogram with weights signalFrac and 1 - signalFrac.

    Returns:
        (hist, err): the weighted sum of the two histograms, and the bin-by-bin
        errors combined in quadrature with the same weights.
    """
    bkgFrac = 1 - signalFrac

    mixed = np.add(np.multiply(signalhist, signalFrac), np.multiply(bkghist, bkgFrac))

    # Variances add, each scaled by the square of its mixing weight
    sigvar = np.multiply(np.square(signalerr), signalFrac ** 2)
    bkgvar = np.multiply(np.square(bkgerr), bkgFrac ** 2)

    return mixed, np.sqrt(np.add(sigvar, bkgvar))


def calculatePurityCorrection(fullgjmcdf, fulljjmcdf, fitvar, signalFrac, nIsoClusters, nAntiisoClusters, ptrange, nSamples, printFrequency=0):
    """Estimate the purity bias of the template fit with MC pseudo-experiments.

    Each of the `nSamples` iterations splits the isolated signal MC into a
    pseudodata part and a template part, mixes the pseudodata signal with
    isolated background MC at `signalFrac`, fits the mixture with the signal
    template plus an anti-isolated background template, and compares the fitted
    purity with the purity computed directly from the mixture.

    Parameters:
        fullgjmcdf: signal MC frame.
        fulljjmcdf: background MC frame.
        fitvar: key into `varinfo` and column to histogram.
        signalFrac: signal fraction used to build the pseudodata.
        nIsoClusters: number of isolated clusters the pseudodata should mimic.
        nAntiisoClusters: number of anti-isolated clusters for the background template.
        ptrange: (low, high) pT bin.
        nSamples: number of pseudo-experiments.
        printFrequency: print a progress line every this many samples (0 = never).

    Returns:
        (truePurities, antiisoPurities, fitFracs): three lists with one entry per sample.

    Summary statistics and the elapsed time are printed. Relies on the
    module-level names `system` and `varinfo` (not defined in this cell).
    """
    start = time.time()
    ptcut = ptcuttext(ptrange)
    signaldf = applyCut(fullgjmcdf, ptcut)
    bkgdf = applyCut(fulljjmcdf, ptcut)

    signalisodf = applyCut(signaldf, isocuttext)
    bkgisodf = applyCut(bkgdf, isocuttext)
    bkgantiisodf = applyCut(bkgdf, antiisocuttext)

    print('signal: {0}; iso bkg: {1}; anti-iso bkg: {2}'.format(signalisodf.shape[0], bkgisodf.shape[0], bkgantiisodf.shape[0]))

    binEdges, purityRange, _ = varinfo[fitvar]
    nBins = 40
    if system == 'pp':
        # NOTE(review): `lambda2` is a bare name that is not defined in this cell;
        # presumably a constant from the star imports -- confirm it is not meant
        # to be a string literal.
        if fitvar == lambda2:
            binEdges = np.linspace(0, 2, nBins + 1)
        else:
            binEdges = np.linspace(0, 1, nBins + 1)

    truePurities = []
    fitFracs = []
    antiisoPurities = []

    nSignalClusters = int(round(signalFrac * nIsoClusters))
    nBkgClusters = int(round((1 - signalFrac) * nIsoClusters))

    for i in range(nSamples):
        # train_test_split returns (train, test) and `test_size` fixes the size of
        # the *second* piece. The pseudodata samples are the ones that must have
        # the requested number of clusters, so they are taken from the second slot
        # (they were previously taken from the first, i.e. the N - n remainder).
        templatesignaldf, samplesignaldf = train_test_split(signalisodf, test_size=nSignalClusters)
        if system == 'pp':
            samplebkgdf = bkgisodf
            matchedantiisodf = bkgantiisodf
        else:
            _, samplebkgdf = train_test_split(bkgisodf, test_size=nBkgClusters)
            _, matchedantiisodf = train_test_split(bkgantiisodf, test_size=nAntiisoClusters)

        samplesignalhist, samplesignalerr = getNormHistAndErr(samplesignaldf, fitvar, binEdges)
        samplebkghist, samplebkgerr = getNormHistAndErr(samplebkgdf, fitvar, binEdges)
        pseudodatahist, pseudodataerr = makePseudodataset(samplesignalhist, samplesignalerr, samplebkghist, samplebkgerr, signalFrac)

        signalisohist, signalisoerr = getHistAndErr(templatesignaldf, fitvar, binEdges)
        bkgantiisohist, bkgantiisoerr = getHistAndErr(matchedantiisodf, fitvar, binEdges)

        antiisotf = TemplateFit(pseudodatahist, pseudodataerr, signalisohist, signalisoerr, bkgantiisohist, bkgantiisoerr, binEdges, verbosity=0)
        truePurity = getPurity(samplesignalhist, samplebkghist, binEdges, signalFrac, *purityRange)

        truePurities.append(truePurity)
        antiisoPurities.append(antiisotf.getPurity(*purityRange)[0])
        fitFracs.append(antiisotf.fitf)

        if printFrequency > 0 and i % printFrequency == 0:
            print('Sample {0}/{1} completed at {2}'.format(i + 1, nSamples, datetime.datetime.now()))

    purityDiffs = np.subtract(truePurities, antiisoPurities)
    fracDiffs = np.subtract(fitFracs, signalFrac)
    print('Correction = {0} +/- {1}'.format(np.mean(purityDiffs), np.std(purityDiffs)))
    print('True purity = {0} +/- {1}'.format(np.mean(truePurities), np.std(truePurities)))
    print('Fit fraction difference = {0} +/- {1}'.format(np.mean(fracDiffs), np.std(fracDiffs)))
    end = time.time()
    print('{0} samples in {1} seconds'.format(nSamples, end - start))

    return truePurities, antiisoPurities, fitFracs

def calculateCorrectionsInBins(ptranges, nSamples, var, printFrequency=0):
    """Run `calculatePurityCorrection` for every pT bin in `ptranges`.

    For each bin the signal fraction comes from `getTrueFrac` and the cluster
    counts from the data frame after the isolation / anti-isolation cuts.

    Reads the module-level `fulldatadf`, `fullgjmcdf`, `fulljjmcdf` and `system`,
    none of which is defined in this cell (presumably assigned before calling).
    """
    for ptbin in ptranges:
        print('system: {1}; variable: {0}; ptrange: {2} < pT < {3}'.format(var, system, *ptbin))

        signalFrac = getTrueFrac(fulldatadf, fullgjmcdf, var, ptbin)

        # Count data clusters in this bin on each side of the isolation selection
        bindata = applyCut(fulldatadf, ptcuttext(ptbin))
        nIso, nAntiiso = [applyCut(bindata, cut).shape[0] for cut in (isocuttext, antiisocuttext)]

        print('Isolated clusters: {0}; fraction: {1}; anti-isolated clusters: {2}'.format(nIso, signalFrac, nAntiiso))
        calculatePurityCorrection(fullgjmcdf, fulljjmcdf, var, signalFrac, nIso, nAntiiso, ptbin, nSamples, printFrequency)

def fitDataWithMC(isodatadf, antiisodatadf, var, ptrange, system):
    """Run a `BackgroundFit` of isolated data against the anti-isolated template and print the purity.

    Note that the `system` parameter shadows the module-level `system` and is
    only used for the printed label.

    NOTE(review): despite the name, no MC input is used in this function.
    """
    edges, purityWindow, _ = varinfo[var]
    sideband = varbkgRange[var]
    ptcut = ptcuttext(ptrange)

    datahist, dataerr = getHistAndErr(applyCut(isodatadf, ptcut), var, edges)
    bkghist, bkgerr = getHistAndErr(applyCut(antiisodatadf, ptcut), var, edges)

    fit = BackgroundFit(datahist, dataerr, bkghist, bkgerr, edges, sideband)
    # Second and third return values are printed after '-' and '+' respectively
    purity, plow, phigh = fit.getPurity(*purityWindow)

    print('system: {0}, variable: {1}, ptrange: {5} < pT < {6}, purity: {2} +{3} -{4}'.format(system, var, purity, phigh, plow, *ptrange))

## Sideband selection

In [None]:
def checkAntiisoSlices(isovar, fitvar, isohist, mchist, mcerr, antiisoslices, datadf, binEdges, purityRange):
    """Repeat the template fit once per anti-isolation slice.

    For each (low, high) in `antiisoslices` the background template is built from
    rows of `datadf` with low < isovar < high; the data and background errors
    are passed to `TemplateFit` as None.

    Returns:
        (chi2dofs, purities): two lists, one entry per slice, in slice order.
    """
    def fitSlice(bounds):
        # Background template from this isolation window only
        slicedf = applyCut(datadf, '{0}>{1} and {0}<{2}'.format(isovar, *bounds))
        slicehist = np.histogram(slicedf[fitvar], bins=binEdges, weights=slicedf['weights'])[0]

        fit = TemplateFit(isohist, None, mchist, mcerr, slicehist, None, binEdges, verbosity=0)
        return fit.chi2 / fit.dof, fit.getPurity(*purityRange)[0]

    results = [fitSlice(bounds) for bounds in antiisoslices]

    chi2dofs = [chi2dof for chi2dof, _ in results]
    purities = [purity for _, purity in results]
    return chi2dofs, purities


def getBand(antiisoslices, values, antiisoregion):
    """Return (min, max) of the values whose slice lies entirely inside `antiisoregion`.

    A slice (low, high) counts when low >= antiisoregion[0] and
    high <= antiisoregion[1]. If no slice qualifies the result is (inf, -inf).
    """
    selected = [
        value
        for (bounds, value) in zip(antiisoslices, values)
        if bounds[0] >= antiisoregion[0] and bounds[1] <= antiisoregion[1]
    ]

    # Seeding with +/-inf gives the empty-selection result and keeps the same
    # left-to-right comparison order as a running min/max.
    return min([np.inf] + selected), max([-np.inf] + selected)


def getRanges(fulldatadf, fullmcdf, ptrange, fitvar):
    """Print how purity and chi2/dof vary with the choice of anti-isolation slice.

    Template fits are run for 2-unit-wide anti-isolation slices starting every
    0.5 from -3 up to (not including) 25. Among the slices fully contained in
    `antiisocut`, the minimum and maximum purity and chi2/dof are printed; the
    purity line also shows 100 * (max - min) / sqrt(12).

    Parameters:
        fulldatadf: data frame (all isolation values).
        fullmcdf: signal MC frame.
        ptrange: (low, high) pT bin.
        fitvar: key into `varinfo` and column to histogram.

    Returns nothing. Reads the module-level `isovar`, `antiisocut`, `system` and
    `varinfo`, which are not defined in this cell.
    """
    antiisoslices = [(x, x + 2.0) for x in np.arange(-3, 25, 0.5)]

    ptcut = ptcuttext(ptrange)
    datadf = applyCut(fulldatadf, ptcut)
    mcdf = applyCut(fullmcdf, ptcut)

    isodf = applyCut(datadf, isocuttext)
    mcisodf = applyCut(mcdf, isocuttext)

    binEdges, purityRange, _ = varinfo[fitvar]
    isohist = np.histogram(isodf[fitvar], bins=binEdges, weights=isodf['weights'])[0]
    mchist, mcerr = getHistAndErr(mcisodf, fitvar, binEdges)

    chi2dofs, purities = checkAntiisoSlices(isovar, fitvar, isohist, mchist, mcerr, antiisoslices, datadf, binEdges, purityRange)
    chi2band = getBand(antiisoslices, chi2dofs, antiisocut)
    purband = getBand(antiisoslices, purities, antiisocut)

    # The backslash before '%' is part of the printed text (written explicitly
    # rather than relying on an unrecognised escape sequence).
    print('{0} purity in {1} for {2} < pT < {3}: {4:1.2f} to {5:1.2f}; {6:2.2f}\\%'.format(fitvar, system, ptrange[0], ptrange[1], purband[0], purband[1], 100*(purband[1]-purband[0])/np.sqrt(12)))
    print('{0} chi2 in {1} for {2} < pT < {3}: {4:1.2f} to {5:1.2f}'.format(fitvar, system, ptrange[0], ptrange[1], chi2band[0], chi2band[1]))