In [3]:
import itertools
from itertools import product
import numpy as np

In [4]:
def getCodeFromBinary(s):
    """Expand a binary pattern with wildcards into every integer it can encode.

    Any alphabetic character in ``s`` (as decided by ``str.isalpha``) is a
    wildcard standing for both '0' and '1'. The remaining characters are read
    as base-2 digits.

    Returns:
        list of int, ordered so that the leftmost wildcard varies slowest and
        '0' is tried before '1' at every wildcard.

    Raises:
        ValueError: from ``int(..., 2)`` when a fully expanded string is not a
            valid binary literal (this includes the empty string).
    """
    codes = []
    pending = [s]
    while pending:
        pattern = pending.pop()
        wildcardAt = next((pos for pos, char in enumerate(pattern) if char.isalpha()), None)
        if wildcardAt is None:
            codes.append(int(pattern, 2))
            continue
        head, tail = pattern[:wildcardAt], pattern[wildcardAt + 1:]
        # Push the '1' branch first so the '0' branch is popped (expanded) first.
        pending.append(head + '1' + tail)
        pending.append(head + '0' + tail)
    return codes

def getVarNames(listVars):
    """Return the variable tag for a list of indices, e.g. ``[0, 1, 4]`` -> ``"v0_v1_v4"``.

    Each element is converted with ``str`` and prefixed with "v"; the pieces are
    joined with underscores. An empty input gives an empty string.
    """
    tags = []
    for varIndex in listVars:
        tags.append("v" + str(varIndex))
    return "_".join(tags)

In [5]:
def generateModelsDict(startIndex=0, numModels=10, nameOptionsFile = "/home/submit/pdmonte/CMSSW_10_6_27/src/Hrare2023/analysis/TMVA_regression/optionModels.out"):
    """Draw ``numModels`` random option sets and log each one to ``nameOptionsFile``.

    Model ids are ``startIndex``, ``startIndex + 1``, ... For every id one line
    ``<id> $ <value> $ <value> ...`` is written, with the values taken from the
    dict returned by ``getRandomOptions`` in the order the dict yields them.
    The file is opened in append mode, so existing content is kept and calling
    this again with the same ``startIndex`` adds further lines for the same ids.

    Args:
        startIndex: id given to the first generated model.
        numModels: how many models to generate.
        nameOptionsFile: path of the log file to append to.

    Returns:
        dict mapping each model id to its option string from ``getOptionsString``.
    """
    separator = ' $ '
    modelsById = {}
    with open(nameOptionsFile, "a") as logFile:
        for offset in range(numModels):
            modelId = startIndex + offset
            drawnOptions = getRandomOptions()
            modelsById[modelId] = getOptionsString(drawnOptions)
            logLine = str(modelId) + separator + separator.join(drawnOptions.values()) + "\n"
            logFile.write(logLine)

    return modelsById
    

def getOptionsString(modelDict):
    """Serialise an options dict into the quoted, colon-separated option string.

    The result starts with ``"!V`` and ends with ``"`` (the double quotes are
    part of the returned text). ``VarTransform`` is written only when it is not
    the empty string; every other option is always written, in a fixed order,
    as ``:<name>=<value>``.

    Args:
        modelDict: mapping with string values for all the option names below.

    Returns:
        The option string, including its surrounding double quotes.

    Raises:
        KeyError: if one of the option names is missing from ``modelDict``.
    """
    alwaysWritten = (
        "NTrees",
        "BoostType",
        "Shrinkage",
        "MaxDepth",
        "SeparationType",
        "nCuts",
        "UseRandomisedTrees",
        "UseNvars",
        "UseBaggedBoost",
        "BaggedSampleFraction",
        "PruneMethod",
        "PruneStrength",
        "PruningValFraction",
    )

    text = "\"!V"
    transform = modelDict["VarTransform"]
    if transform != "":
        text += ":VarTransform=" + transform
    for optionName in alwaysWritten:
        text += ":" + optionName + "=" + modelDict[optionName]
    text += "\""
    return text


def getRandomOptions():
    """Draw one random set of training options.

    Returns:
        dict keyed by the option names that ``getOptionsString`` reads; every
        value is a string. ``BoostType`` and ``SeparationType`` are fixed, all
        other entries are sampled afresh on each call. Integer ranges are
        inclusive at both ends; floats are rounded by ``getRandomFloat``.
    """
    flagValues = ["F", "T"]
    # The entries are evaluated top to bottom, so the sequence of random draws
    # follows the order in which the keys are listed here.
    return {
        "VarTransform": getVarTransformChoice(maxSymbols=4),
        "NTrees": str(getRandomInt(1500, 5000)),
        "BoostType": "Grad",
        "Shrinkage": str(getRandomFloat(0.10, 0.5)),
        "MaxDepth": str(getRandomInt(5, 10)),
        "SeparationType": "RegressionVariance",
        "nCuts": str(getRandomInt(15, 100)),
        "UseRandomisedTrees": np.random.choice(flagValues),
        "UseNvars": str(getRandomInt(2, 100)),
        "UseBaggedBoost": np.random.choice(flagValues),
        "BaggedSampleFraction": str(getRandomFloat(0.50, 5.00)),
        "PruneMethod": getPruneMethodChoice(),
        "PruneStrength": str(getRandomInt(0, 100)),
        "PruningValFraction": str(getRandomFloat(0.0, 2.0)),
    }


def getVarTransformChoice(minSymbols=0, maxSymbols=2):
    """Pick a random comma-separated chain of variable-transform symbols.

    A chain length is drawn uniformly from ``minSymbols``..``maxSymbols``
    (both inclusive). All chains of that length over P, G, D, N are enumerated
    and those that repeat a symbol in adjacent positions, or contain "P" more
    than once, are discarded. One of the surviving chains is then drawn with
    ``np.random.choice``.

    Returns:
        The chosen chain, e.g. ``"P,N,D"``; the empty string for length 0.
    """
    chainLength = np.random.randint(minSymbols, maxSymbols+1)
    symbols = ["P", "G", "D", "N"]

    def isAllowed(chain):
        # Reject chains with two equal neighbours or with more than one "P".
        hasAdjacentRepeat = any(left == right for left, right in zip(chain, chain[1:]))
        return not hasAdjacentRepeat and chain.count("P") <= 1

    candidates = [
        ",".join(chain)
        for chain in product(symbols, repeat=chainLength)
        if isAllowed(chain)
    ]
    return np.random.choice(candidates)


def getBestVarTransform(n=30):
    """Return the first ``n`` entries of a fixed, ordered list of transform chains.

    The list has 28 entries, so the default ``n=30`` returns all of them.
    Presumably the order is best-first (the list is named after that), but the
    ranking criterion is not visible here.
    """
    orderedChains = [
        "G", "N,G,N,G", "", "N",
        "P,N,D", "P,N,D,N", "P,G", "P,N",
        "P,D", "P", "N,G", "G,N",
        "G,N,G,N", "N,G,N", "G,N,G", "P,D,N",
        "P,D,N,D", "N,P,D", "N,P,N,D", "N,P,G",
        "P,D,G", "D,N,D", "P,G,N", "P,N,G",
        "D,N,G", "P,G,N,G", "D", "P,N,P,D",
    ]
    return orderedChains[:n]


def getPruneMethodChoice():
    """Draw a prune-method name at random.

    The draw is uniform over the four list entries; "NoPruning" appears twice,
    so it is returned half of the time.
    """
    methods = ["NoPruning", "NoPruning", "ExpectedError", "CostComplexity"]
    return np.random.choice(methods)


def getRandomInt(minNum = 0, maxNum=100):
    """Return a random integer between ``minNum`` and ``maxNum``, both inclusive."""
    # np.random.randint excludes its upper bound, hence the shift by one.
    exclusiveUpper = maxNum + 1
    return np.random.randint(minNum, exclusiveUpper)


def getRandomFloat(minNum = 0.0, maxNum=1.0):
    """Return a uniform random float drawn between ``minNum`` and ``maxNum``, rounded to 5 decimals."""
    sample = np.random.uniform(minNum, maxNum)
    return np.round(sample, 5)

In [6]:
channel="phi"

modelNames=["BDTG_df15_dl3684_v0_v1", "BDTG_df15_dl3684_v1_v4", "BDTG_df15_dl3684_v0_v1_v4", "BDTG_df15_dl3684_v0_v1_v9", "BDTG_df15_dl3684_v1_v4_v9", "BDTG_df15_dl3684_v0_v1_v4_v9"]

# Hand-written reference configuration. It is not used below: the commands are
# generated from the random scan assigned to optionsDict.
referenceOptions = {0: "\"!V:NTrees=1000:BoostType=Grad:Shrinkage=0.2:MaxDepth=5:SeparationType=SDivSqrtSPlusB:nCuts=90:UseRandomisedTrees=T:UseNvars=67:UseBaggedBoost:BaggedSampleFraction=2.4:PruneMethod=NoPruning\""}

# NOTE: generateModelsDict appends to its options log and no random seed is set
# in the visible cells, so re-running this cell logs a second, different set of
# options under the same ids (2103 onwards).
optionsDict = generateModelsDict(startIndex=2103, numModels=1200)

# One ROOT call per value 0, 1, 2 of the third macro argument (its meaning is
# defined in TMVA_GF_regression.C, which is not shown here).
scanRootCall = "root -l -q -b 'TMVA_GF_regression.C(\"{model}\", \"{channel}\", {stage}, {{{listVars}}}, {dfC}, {dlC}, {optStr})'"

# The context manager closes (and flushes) the command file even if building
# one of the commands raises.
with open("commands_temp.txt", "w") as f:
    #comm = "RECO:python computeErrors.py -m RECO -c {channel}".format(channel=channel)
    #print(comm)
    #f.write(comm + "\n")
    for idx, optStr in optionsDict.items():
        for line in modelNames:
            # Model names look like BDTG_df<dfC>_dl<dlC>_v<i>_v<j>...
            jobName = line.strip()
            nameParts = jobName.split("_")
            dfC = nameParts[1].replace("df", "")
            dlC = nameParts[2].replace("dl", "")
            listVars = [int(part.replace("v", "")) for part in nameParts[3:]]
            varNamesShort = "".join([str(varIndex) for varIndex in listVars])
            model = "BDTG_df{dfC}_dl{dlC}_{varNames}_opt{idx}".format(dfC=dfC, dlC=dlC, varNames=getVarNames(listVars), idx=idx)

            # Steps are chained with && so a failing step stops the rest of the job.
            steps = [
                scanRootCall.format(model=model, channel=channel, stage=stage, listVars=str(listVars)[1:-1], dfC=dfC, dlC=dlC, optStr=optStr)
                for stage in (0, 1, 2)
            ]
            steps.append("python computeErrors.py -m {model} -c {channel}".format(model=model, channel=channel))

            # "<job label>:::<shell command>", one job per line.
            comm = "o{idx}_v{varNamesShort}:::".format(idx=idx, varNamesShort=varNamesShort) + " && ".join(steps)
            print(comm)
            f.write(comm + "\n")

o2103_v01:::root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_opt2103", "phi", 0, {0, 1}, 15, 3684, "!V:VarTransform=P:NTrees=3258:BoostType=Grad:Shrinkage=0.21661:MaxDepth=9:SeparationType=RegressionVariance:nCuts=35:UseRandomisedTrees=F:UseNvars=42:UseBaggedBoost=F:BaggedSampleFraction=1.74956:PruneMethod=NoPruning:PruneStrength=68:PruningValFraction=1.6627")' && root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_opt2103", "phi", 1, {0, 1}, 15, 3684, "!V:VarTransform=P:NTrees=3258:BoostType=Grad:Shrinkage=0.21661:MaxDepth=9:SeparationType=RegressionVariance:nCuts=35:UseRandomisedTrees=F:UseNvars=42:UseBaggedBoost=F:BaggedSampleFraction=1.74956:PruneMethod=NoPruning:PruneStrength=68:PruningValFraction=1.6627")' && root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_opt2103", "phi", 2, {0, 1}, 15, 3684, "!V:VarTransform=P:NTrees=3258:BoostType=Grad:Shrinkage=0.21661:MaxDepth=9:SeparationType=RegressionVariance:nCuts=35:UseRandomisedTrees=F:UseNvars=42:UseBagg

In [21]:
# To check the remaining models to compute
def _readModelNames(path):
    """Return the lines of ``path`` with surrounding whitespace stripped, in file order."""
    with open(path, "r") as file:
        return [line.strip() for line in file]


file_path = "completedModels.txt"
completed = _readModelNames(file_path)
# Guard the preview so an empty file reports 0 entries instead of raising IndexError.
print(len(completed), completed[0] if completed else "<empty>")

file_path = "totalModels.txt"
total = _readModelNames(file_path)
print(len(total), total[0] if total else "<empty>")

# Models listed in totalModels.txt that do not appear in completedModels.txt.
todo = sorted(set(total).difference(completed))
print(len(todo))

# The context manager closes the file even if a write fails.
with open("remainingModels.txt", "w") as f:
    for line in todo:
        f.write(line + "\n")

561 BDTG_df15_dl3684
518 BDTG_df15_dl3684_v0_v1_v4
32


In [None]:
# To solve the errors
channel="phi"
file_path = "errorFiles.txt"

# One ROOT call per value 0, 1, 2 of the third macro argument (its meaning is
# defined in TMVA_GF_regression.C, which is not shown here).
retryRootCall = "root -l -q -b 'TMVA_GF_regression.C(\"{model}\", \"{channel}\", {stage}, {{{listVars}}}, {dfC}, {dlC})'"

# NOTE(review): the job label is separated from the command by ":" here, while
# the option-scan commands use ":::" - confirm which one the job runner expects.
# The input is opened first so that a missing errorFiles.txt does not truncate
# commands_temp.txt; both files are closed on exit, also when an error occurs.
with open(file_path, "r") as file, open("commands_temp.txt", "w") as f:
    for line in file:
        if not line.strip():
            # Blank lines carry no job name; skipping them avoids an IndexError below.
            continue
        # Keep the last path component and drop its final 4 characters
        # (presumably a 3-letter extension plus the dot).
        jobName = line.strip().split("/")[-1][:-4]
        # Job names look like df<dfC>_dl<dlC>_v<i>_v<j>...
        nameParts = jobName.split("_")
        dfC = nameParts[0].replace("df", "")
        dlC = nameParts[1].replace("dl", "")
        listVars = [int(part.replace("v", "")) for part in nameParts[2:]]

        jobLabel = "df{dfC}_dl{dlC}_{varNames}".format(dfC=dfC, dlC=dlC, varNames=getVarNames(listVars))
        model = "BDTG_" + jobLabel

        # Steps are chained with && so a failing step stops the rest of the job.
        steps = [
            retryRootCall.format(model=model, channel=channel, stage=stage, listVars=str(listVars)[1:-1], dfC=dfC, dlC=dlC)
            for stage in (0, 1, 2)
        ]
        steps.append("python computeErrors.py -m {model} -c {channel}".format(model=model, channel=channel))

        comm = jobLabel + ":" + " && ".join(steps)
        print(comm)
        f.write(comm + "\n")

In [22]:
# To create the commands for the remaining models
channel="phi"
file_path = "remainingModels.txt"

# One ROOT call per value 0, 1, 2 of the third macro argument (its meaning is
# defined in TMVA_GF_regression.C, which is not shown here).
remainingRootCall = "root -l -q -b 'TMVA_GF_regression.C(\"{model}\", \"{channel}\", {stage}, {{{listVars}}}, {dfC}, {dlC})'"

# The input is opened first so that a missing remainingModels.txt does not
# truncate commands_temp.txt; both files are closed on exit, also when an
# error occurs.
with open(file_path, "r") as file, open("commands_temp.txt", "w") as f:
    for line in file:
        jobName = line.strip()
        if not jobName:
            # Blank lines carry no model name; skipping them avoids an IndexError below.
            continue
        # Model names look like BDTG_df<dfC>_dl<dlC>_v<i>_v<j>...
        nameParts = jobName.split("_")
        dfC = nameParts[1].replace("df", "")
        dlC = nameParts[2].replace("dl", "")
        listVars = [int(part.replace("v", "")) for part in nameParts[3:]]

        jobLabel = "df{dfC}_dl{dlC}_{varNames}".format(dfC=dfC, dlC=dlC, varNames=getVarNames(listVars))
        model = "BDTG_" + jobLabel

        # Steps are chained with && so a failing step stops the rest of the job.
        steps = [
            remainingRootCall.format(model=model, channel=channel, stage=stage, listVars=str(listVars)[1:-1], dfC=dfC, dlC=dlC)
            for stage in (0, 1, 2)
        ]
        steps.append("python computeErrors.py -m {model} -c {channel}".format(model=model, channel=channel))

        # "<job label>:<shell command>", one job per line.
        comm = jobLabel + ":" + " && ".join(steps)
        print(comm)
        f.write(comm + "\n")

df15_dl3684_v0_v1_v10_v11:root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_v10_v11", "phi", 0, {0, 1, 10, 11}, 15, 3684)' && root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_v10_v11", "phi", 1, {0, 1, 10, 11}, 15, 3684)' && root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_v10_v11", "phi", 2, {0, 1, 10, 11}, 15, 3684)' && python computeErrors.py -m BDTG_df15_dl3684_v0_v1_v10_v11 -c phi
df15_dl3684_v0_v1_v4_v10:root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_v4_v10", "phi", 0, {0, 1, 4, 10}, 15, 3684)' && root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_v4_v10", "phi", 1, {0, 1, 4, 10}, 15, 3684)' && root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_v4_v10", "phi", 2, {0, 1, 4, 10}, 15, 3684)' && python computeErrors.py -m BDTG_df15_dl3684_v0_v1_v4_v10 -c phi
df15_dl3684_v0_v1_v4_v10_v11:root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_v4_v10_v11", "phi", 0, {0, 1, 4, 10, 11}, 15, 3684)' && root -l -q -b 'TMVA_G