In [1]:
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import pickle

import sys
sys.path.insert(0, '/home/btannenw/Desktop/ML/dihiggsMLProject/')
from rectangularCuts.rectangularAnalyzer import sequentialOneDimAnalyzer
from utils.commonFunctions import *

In [2]:
# Fraction of the combined hh+qcd sample held out for testing; it is passed to
# makeTestTrainSamplesWithUserVariables and getLumiScaleFactor in later cells.
testingFraction = 0.3

In [None]:
# *** 0. Load the hh (signal) and qcd (background) samples and choose the inputs
hh_csv, qcd_csv = importDatasets()
variableNames = [
    'hh_mass',
    'h1_mass',
    'h2_mass',
    'deltaR(h1, h2)',
    'deltaR(h1 jets)',
    'deltaR(h2 jets)',
]

# *** 1. Build the train/test samples restricted to the chosen variables
(
    data_train,
    data_test,
    labels_train,
    labels_test,
) = makeTestTrainSamplesWithUserVariables(
    hh_csv,
    qcd_csv,
    variableNames,
    testingFraction,
)

# *** 2. Separate the testing sample into its hh and qcd parts
(
    data_signal_test,
    labels_signal_test,
    data_bkg_test,
    labels_bkg_test,
) = returnTestSamplesSplitIntoSignalAndBackground(data_test, labels_test)

# *** 3. Run the sequential one-dimensional (rectangular) cut optimisation
# NOTE(review): the analyzer receives the full hh_csv/qcd_csv frames, not the
# training split built above — confirm this is intended.
rectangularAnalysis = sequentialOneDimAnalyzer(
    hh_csv,
    qcd_csv,
    variableNames,
)
rectangularAnalysis.analyze()

In [None]:
# Show the training sample returned by makeTestTrainSamplesWithUserVariables;
# as the bare last expression of the cell it is rendered as the cell output.
data_train

In [None]:
# *** 4. Use previously calculated const. efficiency cuts on testing data from BDT
# Cut dictionary stored on the analyzer (presumably filled by analyze() — confirm);
# it is kept in a notebook variable but not used again in this cell.
cuts = rectangularAnalysis.dictOfCutsByEfficiency
# Hand the hh/qcd testing samples to the analyzer before evaluating yields.
rectangularAnalysis.setReducedData(data_signal_test, data_bkg_test)
# Evaluate yields at three working points; the argument is presumably the
# constant per-cut efficiency — confirm in sequentialOneDimAnalyzer.
rectangularAnalysis.calculateYieldsAfterCuts(0.90) 
rectangularAnalysis.calculateYieldsAfterCuts(0.85) 
rectangularAnalysis.calculateYieldsAfterCuts(0.80) 

In [None]:
# *** 5. Calculate rectangular significance
# Raw event counts entered by hand.
# NOTE(review): presumably copied from the calculateYieldsAfterCuts output for
# the working point below — confirm, and document the factor of 3 on the signal.
nSig_raw = 12465*3
nBkg_raw = 5296
# Working point quoted in the summary line (one of the values evaluated above)
rectangularCutEfficiency = 0.85

# Scale factors for the testing fraction; the flag selects hh (True) or qcd (False)
hh_lumiScale = getLumiScaleFactor(testingFraction, True)
qcd_lumiScale = getLumiScaleFactor(testingFraction, False)
nSig_lumi = nSig_raw*hh_lumiScale
nBkg_lumi = nBkg_raw*qcd_lumiScale

print(hh_lumiScale, qcd_lumiScale)

# The quoted number is the rectangular-cut working point, not a BDT score
# threshold, so the label says so (the previous text was copied from the BDT cell).
print('nSig = {0} , nBkg = {1} with significance = {2} for rectangular cuts at efficiency = {3}'.format(nSig_lumi, nBkg_lumi, nSig_lumi/np.sqrt(nBkg_lumi), rectangularCutEfficiency) )

In [None]:
def plotBDTOutputAndTree(_model, _modelName, _signalData, _signalLabels, _bkgData, _bkgLabels, _savePlots=False):
    """Plot BDT output scores for signal and background plus the feature importance.

    Parameters
    ----------
    _model : trained xgboost model; its ``predict`` output is indexed per event
        as ``x[0]`` (background score) and ``x[1]`` (signal score).
    _modelName : str used in the histogram axis label; its first
        space-separated word, lower-cased, prefixes saved file names.
    _signalData, _signalLabels : signal sample and labels wrapped in a DMatrix.
    _bkgData, _bkgLabels : background sample and labels wrapped in a DMatrix.
    _savePlots : when True, forwarded to compareManyHistograms and the
        feature-importance figure is written to ``<scope>_featureImportance.png``.

    Returns
    -------
    None. Figures are produced as a side effect and
    ``plt.rcParams['figure.figsize']`` is reset to ``[6.4, 4.8]`` at the end.
    """

    # *** 1. Score the signal sample and split the two per-event outputs
    signal_DMatrix = xgb.DMatrix(_signalData, label=_signalLabels)
    preds_signal = _model.predict(signal_DMatrix)
    sig_pred_isBkg     = [x[0] for x in preds_signal]
    sig_pred_isSignal  = [x[1] for x in preds_signal]

    # *** 2. Score the background sample the same way
    bkg_DMatrix = xgb.DMatrix(_bkgData, label=_bkgLabels)
    preds_bkg = _model.predict(bkg_DMatrix)
    bkg_pred_isBkg    = [x[0] for x in preds_bkg]
    bkg_pred_isSignal = [x[1] for x in preds_bkg]

    # *** 3. Make dict for plotting with borrowed functions
    _nBins = 40
    predictionResults = {'hh_pred_isSignal':sig_pred_isSignal, 'hh_pred_isBkg':sig_pred_isBkg, 'qcd_pred_isSignal':bkg_pred_isSignal, 'qcd_pred_isBkg':bkg_pred_isBkg,}
    compareManyHistograms( predictionResults, ['hh_pred_isSignal', 'qcd_pred_isSignal'], 2, 'Signal Prediction', 'BDT Score ({0})'.format(_modelName), 0, 1, _nBins, _normed=True, _savePlot=_savePlots )
    compareManyHistograms( predictionResults, ['hh_pred_isBkg', 'qcd_pred_isBkg'], 2, 'Bkg Prediction', 'BDT Score ({0})'.format(_modelName), 0, 1, _nBins, _normed=True, _savePlot=_savePlots )

    # *** 4. Plot feature importance exactly once (it used to be drawn twice when
    #        saving) and save that same figure through the returned Axes instead
    #        of relying on whichever figure happens to be current.
    _importanceAxes = xgb.plot_importance(_model)
    if _savePlots:
        _scope    = _modelName.split(' ')[0].lower()
        _variable = 'featureImportance'
        _filename  = _scope + '_' + _variable
        _importanceAxes.figure.savefig( _filename+'.png', bbox_inches='tight' )

    # *** 5. Make plot of 0th tree (currently disabled)
    #xgb.plot_tree(_model,num_trees=0)
    #plt.gcf().set_size_inches(100, 67)
    #_fig = plt.gcf()
    #plt.show()

    #if(_savePlots):
    #    _variable = 'firstTrainedDecisionTree'
    #    _filename  = _scope + '_' + _variable
    #    _fig.savefig( _filename+'.png' )

    # *** 6. restore figure defaults
    plt.rcParams['figure.figsize'] = [6.4, 4.8]

    return


In [3]:
def loadModelAndCalculateSignifiance(_modelPath, _modelName, _testingFraction, _signalDF, _bkgDF):
    """Load a pickled BDT and run the best-cut scan on its testing sample.

    Parameters
    ----------
    _modelPath : path of the pickle file holding the trained model.
    _modelName : label for the model; currently unused because the plotting
        step that consumed it is disabled.
    _testingFraction : forwarded to makeTestTrainSamplesWithUserVariables.
    _signalDF, _bkgDF : signal and background samples; copies are passed on,
        so the caller's objects are not handed to the splitting helper.

    Returns
    -------
    None. returnBestCutValue is called for its side effects and its result
    is not returned.
    """

    # *** A. Load model
    # The with-block closes the file even if unpickling raises.
    # NOTE: pickle can execute arbitrary code on load — only open trusted files.
    with open(_modelPath, 'rb') as _modelFile:
        _loadedModel = pickle.load(_modelFile)

    # *** B. Get datasets for testing and training, using the variables the
    #        model was trained on; only the testing part is needed here.
    _, _data_test, _, _labels_test = makeTestTrainSamplesWithUserVariables(_signalDF.copy(), _bkgDF.copy(), _loadedModel.feature_names, _testingFraction)

    # *** C. Split hh and qcd from testing data
    _data_signal_test, _labels_signal_test, _data_bkg_test, _labels_bkg_test = returnTestSamplesSplitIntoSignalAndBackground(_data_test, _labels_test)

    # *** D. Plot output BDT predictions (disabled)
    #plotBDTOutputAndTree(_loadedModel, _modelName, _data_signal_test, _labels_signal_test, _data_bkg_test, _labels_bkg_test, _savePlots=True)

    # *** E. Make predictions
    _sig_DMatrix = xgb.DMatrix(_data_signal_test, label=_labels_signal_test)
    _preds_sig   = _loadedModel.predict(_sig_DMatrix)
    _bkg_DMatrix = xgb.DMatrix(_data_bkg_test, label=_labels_bkg_test)
    _preds_bkg   = _loadedModel.predict(_bkg_DMatrix)

    # Index 1 of each per-event prediction is the signal score
    _sig_pred_isSignal  = [x[1] for x in _preds_sig]
    _bkg_pred_isSignal  = [x[1] for x in _preds_bkg]

    # *** F. Calculate significance
    returnBestCutValue('BDT', _sig_pred_isSignal.copy(), _bkg_pred_isSignal.copy(), _minBackground=200)

    return

###########################################
# *** 1. Load the datasets and evaluate a stored BDT model

# *** A. Read the hh and qcd samples
hh_raw, qcd_raw = importDatasets()

# *** B. Evaluate the pickled model on the testing sample
# Alternative model, kept for reference:
#loadModelAndCalculateSignifiance('models/model.top10.pkl', 'load10', 0.3, hh_raw.copy(), qcd_raw.copy() )
loadModelAndCalculateSignifiance(
    _modelPath='models/grid-model.allVars.pkl',
    _modelName='gridAllVars',
    _testingFraction=testingFraction,
    _signalDF=hh_raw.copy(),
    _bkgDF=qcd_raw.copy(),
)


N_sig = 78876 , N_bkg = 116481
195357 rows of total data with  195357 labels [Train+Test]
136749 rows of training data with  136749 labels [Train]
58608 rows of testing data with  58608 labels [Test]


  if getattr(data, 'base', None) is not None and \


23567 35041
nSig = 480.63095999999996 , nBkg = 360562.65599999996 with significance = 0.8004263377133864 for BDT score > 0.6418133667021086


In [None]:
# *** 2. Get best cut value for BDT assuming some minimal amount of signal
# NOTE(review): the keyword passed is _minBackground although the heading says
# "signal" — confirm which yield returnBestCutValue actually constrains.
# NOTE(review): hh_pred_isSignal / qcd_pred_isSignal are not assigned in any
# visible cell, so this presumably relies on leftover kernel state — confirm
# they are defined before running from a fresh kernel.
returnBestCutValue('BDT', hh_pred_isSignal.copy(), qcd_pred_isSignal.copy(), _minBackground=200)

In [None]:
# *** 3. Significance for a user-chosen BDT score threshold
# NOTE(review): hh_pred_isSignal, qcd_pred_isSignal, lumiScale_hh_to_qcd and
# totalLumi_HLLHC are not assigned in any visible cell; they may come from the
# star import or from leftover kernel state — confirm before running fresh.
cut = 0.485

# Events above the threshold, scaled by the luminosity factors
_nSignal = (
    sum( value > cut for value in hh_pred_isSignal)
    * lumiScale_hh_to_qcd
    * totalLumi_HLLHC
)
_nBackground = (
    sum( value > cut for value in qcd_pred_isSignal)
    * totalLumi_HLLHC
)

print(
    'nSig = {0} , nBkg = {1} with significance = {2} for BDT score > {3}'.format(
        _nSignal,
        _nBackground,
        _nSignal/np.sqrt(_nBackground),
        cut,
    )
)

In [None]:
# Persist the rectangular analyzer. The with-block closes the file, which
# guarantees the pickle is flushed to disk before any later cell reads it.
with open('rectangularModel.normSignal_qcd2M.pkl', 'wb') as _rectangularModelFile:
    pickle.dump(rectangularAnalysis, _rectangularModelFile)


In [None]:
# *** 4. Use previously calculated const. efficiency cuts on testing data from BDT
# Reload the analyzer pickled earlier in this notebook; the assignment was
# previously left incomplete, which made the whole cell a SyntaxError.
# NOTE(review): confirm this is the intended file.
# NOTE: pickle can execute arbitrary code on load — only open trusted files.
with open('rectangularModel.normSignal_qcd2M.pkl', 'rb') as _rectangularModelFile:
    rectangularAnalysis = pickle.load(_rectangularModelFile)

# Cut dictionary stored on the analyzer; kept in a notebook variable for inspection
cuts = rectangularAnalysis.dictOfCutsByEfficiency
# Hand the hh/qcd testing samples to the analyzer before evaluating yields
rectangularAnalysis.setReducedData(data_signal_test, data_bkg_test)
rectangularAnalysis.calculateYieldsAfterCuts(0.90)
rectangularAnalysis.calculateYieldsAfterCuts(0.85)
rectangularAnalysis.calculateYieldsAfterCuts(0.80)