### Set up dask + condor at UCSD

In [4]:
import numpy as np
import pandas as pd

from dask.distributed import Client

from condor_utils import make_htcondor_cluster

In [11]:
# Build the cluster through the project helper from condor_utils
# (local=False; dashboard requested on port 13347).
cluster = make_htcondor_cluster(local=False, dashboard_address=13347)
# Bare name as the last expression so the notebook renders the cluster widget.
cluster

VBox(children=(HTML(value='<h2>UCSDHTCondorCluster</h2>'), HBox(children=(HTML(value='\n<div>\n  <style scoped…

In [6]:
# Ask the cluster to scale to 101 workers.
cluster.scale(101)

In [7]:
# Connect a dask client to the cluster; Scan() below submits its jobs through
# this module-level `client`.
client = Client(cluster)
# Bare name as the last expression so the notebook renders the client summary.
client

0,1
Client  Scheduler: tcp://169.228.130.37:24762  Dashboard: http://169.228.130.37:13346/status,Cluster  Workers: 0  Cores: 0  Memory: 0 B


### Set up the function that runs one scan of one cut

In [8]:
#def g(lowCut, highCut, cutIndex, processTag, year, date, postfix, specialCut, doLow, doOneBin):
def g(listParams):
    """Run one ``doScan/doAllScans.py`` job through a shell script and return its output text.

    The function writes ``temp.sh`` into the current working directory,
    executes it, and returns the contents of ``parseout_<cutIndex>.txt``.
    All parameters arrive in a single positional list so the call can be
    handed to ``client.submit`` as one argument.

    Parameters
    ----------
    listParams : sequence
        Ten positional entries, in this order:

        0. low        -> ``--low``
        1. high       -> ``--high``
        2. cutIndex   -> ``--cutIndex`` (also selects ``parseout_<cutIndex>.txt``)
        3. processTag -> ``--processTag``
        4. year       -> ``--year``
        5. date       -> ``--date``
        6. postfix    -> ``--postfix``
        7. specialCut -> ``--specialCut`` (inserted verbatim; the caller is
           responsible for any shell quoting)
        8. doLow      -> appends ``--doLow`` when truthy
        9. doOneBin   -> appends ``--doOneBin`` when truthy

    Returns
    -------
    str
        Output of ``cat parseout_<cutIndex>.txt``. If the file is missing this
        is the error text printed by ``cat``, exactly as before.
    """
    # Local imports: the function body is self-contained wherever it is executed.
    import os
    import stat
    import subprocess

    script = """
    source /cvmfs/cms.cern.ch/cmsset_default.sh
    cd /cvmfs/cms.cern.ch/slc7_amd64_gcc700/cms/cmssw/CMSSW_10_5_0/
    eval `scramv1 runtime -sh`
    cd -

    cp /hadoop/cms/store/user/hmei/combineStandalone.tar.gz .
    tar -xzf combineStandalone.tar.gz

    # https://root-forum.cern.ch/t/error-in-cling-insertintoautoloadingstate/29347
    # Error in cling::AutoloadingVisitor::InsertIntoAutoloadingState
    export ROOT_INCLUDE_PATH=/srv/temp/

    cd HiggsAnalysis/CombinedLimit/
    source ./env_standalone.sh
    cd -

    cp /hadoop/cms/store/user/hmei/doScan.tar .
    tar -xzf doScan.tar

    python doScan/doAllScans.py --low """ + str(listParams[0])

    # Options that take a value, in command-line order. str() makes numeric
    # entries (e.g. an integer year) acceptable as well as strings.
    valued_options = [
        ("--high", listParams[1]),
        ("--cutIndex", listParams[2]),
        ("--processTag", listParams[3]),
        ("--year", listParams[4]),
        ("--date", listParams[5]),
        ("--postfix", listParams[6]),
        ("--specialCut", listParams[7]),
    ]
    for flag, value in valued_options:
        script += " " + flag + " " + str(value)

    # Boolean switches are only emitted when requested.
    if listParams[8]:
        script += " --doLow "
    if listParams[9]:
        script += " --doOneBin "

    print(script)

    script_path = "temp.sh"
    with open(script_path, "w") as fh:
        fh.write(script)
    # Equivalent of `chmod u+x temp.sh` without spawning a shell.
    os.chmod(script_path, os.stat(script_path).st_mode | stat.S_IXUSR)

    returncode = subprocess.call("./" + script_path, shell=True)
    if returncode != 0:
        # Best effort: report the failure but still return whatever output exists,
        # so callers that ignore this job's result keep working.
        print("WARNING: " + script_path + " exited with status " + str(returncode))

    # NOTE(review): a parseout file left over from an earlier job in the same
    # working directory would be returned here if this run failed - confirm
    # whether doAllScans.py always rewrites it.
    return subprocess.getoutput("cat parseout_" + str(listParams[2]) + ".txt")

### Function to parse the results

In [9]:
def ParseResult(results, n=100, minBkg=10):
    """Pick the scan point with the highest significance that has enough background.

    Each entry of ``results`` is a whitespace-separated text line; fields
    1, 2, 3 and 5 (0-based) are read as significance, background count in
    the first bin, background count in the second bin, and MVA score.

    Parameters
    ----------
    results : sequence of str
        One output line per scan point.
    n : int, optional
        Maximum number of leading entries of ``results`` to consider
        (default 100, the previous hard-coded value). Extra entries are
        ignored; fewer entries are accepted.
    minBkg : float, optional
        Both background counts must be strictly greater than this value
        (default 10).

    Returns
    -------
    numpy.ndarray
        1-D float array ``[sig, nbkg0, nbkg1, mva]`` of the best scan point.

    Raises
    ------
    IndexError
        If no scan point passes the background requirement (or a line has
        fewer than six fields).
    ValueError
        If a selected field cannot be converted to float.
    """
    wanted_fields = (1, 2, 3, 5)  # sig, nbkg0, nbkg1, mva

    rows = []
    for line in results[:n]:
        fields = line.split()  # split once per line instead of once per field
        rows.append([float(fields[k]) for k in wanted_fields])

    # reshape keeps a (0, 4) shape when there are no rows at all
    matrix = np.array(rows, dtype=float).reshape(-1, 4)

    # sort according to significance, highest first
    order = np.argsort(matrix[:, 0])[::-1]
    matrix = matrix[order]

    # keep only points where both background counts are large enough
    passing = matrix[(matrix[:, 1] > minBkg) & (matrix[:, 2] > minBkg)]
    if len(passing) == 0:
        raise IndexError(
            "no scan point has both background counts > " + str(minBkg)
            + " (" + str(len(matrix)) + " points examined)"
        )

    return passing[0]

### Prepare the function that submits the scans and gathers the results

In [12]:
def Scan(lowCut, highCut, step, doLow=False):
    """Submit one set of cut scans between ``lowCut`` and ``highCut`` and return the best cut.

    ``nScan`` jobs (cut indices 1..nScan) are submitted through the
    module-level dask ``client`` running ``g``, plus one extra job with
    ``doOneBin`` set that uses cut index ``nScan + 1``. The scan results are
    reduced with ``ParseResult``.

    Parameters
    ----------
    lowCut, highCut :
        Forwarded to the scan script as ``--low`` / ``--high``.
    step :
        Appended (via ``str``) to the postfix so each call gets its own tag.
    doLow : bool, optional
        Forwarded to every job; adds ``--doLow`` when true.

    Returns
    -------
    numpy.ndarray
        ``[significance with two bins, nbkg in bin1, nbkg in bin2, mva score,
        significance with one bin]``.
        NOTE(review): the last entry is appended as a ``str``, so NumPy turns
        the whole array into strings (visible in the printed outputs). Kept
        unchanged for compatibility - consider ``float(result_1bin[1])``.
    """
    processTag = "TTHLeptonicTag"
    year = "2020"
    date = "20200328"
    postfix = "pt_lep_binning_200_in_step"
    specialCut = "\"(global_features[28]*mass > 200)\""

    nScan = 100

    # Entries shared by every job, in the positional order expected by g().
    tail = [processTag, year, date, postfix + str(step), specialCut, doLow]

    # Cut indices 1..nScan. The previous range(1, nScan+2) also submitted
    # index nScan+1, whose result ParseResult never read and whose cut index
    # (and therefore parseout_<index>.txt file name) collided with the
    # one-bin job below.
    inputs = [[lowCut, highCut, i] + tail + [False] for i in range(1, nScan + 1)]

    parameter_oneBin = [lowCut, highCut, nScan + 1] + tail + [True]

    # prepare n cuts within low-high cut
    futures = [client.submit(g, parameters) for parameters in inputs]

    # try to get the significance between low/high cut without finer splitting
    future_onebin = client.submit(g, parameter_oneBin)

    results = client.gather(futures)
    result_1bin = future_onebin.result().split()

    bestCut = ParseResult(results)

    # significance with two bins, nbkg in bin1, nbkg in bin2, mva score, significance with one bin
    final = np.append(bestCut, result_1bin[1])
    return final

In [None]:
# Bin boundaries collected by hand; 0.94937336 and 0.8541606 match the mva
# values printed for scanresult1 and scanresult2 in the cells that follow.
# NOTE(review): this cell has no execution count and `boundaries` is not used
# in the visible cells - confirm whether it is still needed.
boundaries = [0.8541606, 0.94937336, 1]

In [13]:
# First pass: scan the full range [0, 1]; step label 1 is appended to the postfix.
scanresult1 = Scan(0,1,1)
print (scanresult1)

['0.630718' '10.6903019931' '770.721169762' '0.94937336' '0.145423']


In [14]:
# Second pass: scan [0, 0.94937336], i.e. below the mva value found in the first pass.
# If nbkg0 < 20, then don't change lowCut, only change highCut.
# If nsig from two bins is smaller than 1.2*nsig from one bin, don't split further.
scanresult2 = Scan(0,0.94937336,2)
print (scanresult2)

['0.212181' '21.5344444693' '749.233257875' '0.8541606' '0.0618357']


In [15]:
# Third pass: scan between the two mva values found so far (0.8541606 and 0.94937336).
scanresult3 = Scan(0.8541606, 0.94937336, 3)
print (scanresult3)

['0.214152' '10.6217865686' '10.8539718541' '0.91408724' '0.209817']


In [16]:
# Fourth pass: scan [0, 0.8541606] with doLow=True (adds --doLow to every job).
scanresult4 = Scan(0,0.8541606,4,True)
print (scanresult4)

['0.553977' '32.241908885' '21.4036622087' '0.7491814' '0.54968']
