In [2]:
import itertools
from itertools import product
import numpy as np
import pandas as pd

In [3]:
def getCodeFromBinary(s):
    """Expand a binary pattern with wildcard letters into all matching integers.

    Every alphabetic character in `s` is a wildcard bit that is replaced by
    both '0' and '1'. The remaining characters are parsed as base-2 digits.

    Returns a list of ints. Wildcards are expanded left to right with '0'
    before '1', so the list is ordered with the leftmost wildcard varying
    slowest. A pattern without wildcards yields a single-element list.
    Raises ValueError (from `int`) if the expanded text is not valid binary.
    """
    expanded = [s]
    for pos, char in enumerate(s):
        if not char.isalpha():
            continue
        # Fork every partial expansion on this wildcard position.
        expanded = [text[:pos] + bit + text[pos+1:] for text in expanded for bit in "01"]
    return [int(text, 2) for text in expanded]

def getVarNames(listVars):
    """Return the underscore-joined variable tag, e.g. [0, 1] -> "v0_v1"."""
    return "_".join(map("v{}".format, listVars))

In [4]:
def generateModelsDict(startIndex=0, numModels=10, nameOptionsFile = "/home/submit/pdmonte/CMSSW_10_6_27/src/Hrare2023/analysis/TMVA_regression/optionModels.out", centralValues=None):
    '''Sample `numModels` option sets, log them, and return their option strings.

    Models are numbered startIndex, startIndex+1, ... For each one:
        - options are drawn with getRandomOptions(centralValues=centralValues)
        - one line "<id> $ <value> $ <value> ..." is APPENDED to
          `nameOptionsFile` (values in the order of the options dict)
        - the string built by getOptionsString is stored under the model id

    Returns a dict {model id: option string}. Because the file is opened in
    append mode, calling this twice with the same ids leaves duplicate ids in
    the file.'''

    optionsById = {}
    with open(nameOptionsFile, "a") as optionsLog:
        for offset in range(numModels):
            modelId = startIndex + offset
            sampled = getRandomOptions(centralValues=centralValues)
            optionsById[modelId] = getOptionsString(sampled)
            optionsLog.write("{} $ {}\n".format(modelId, ' $ '.join(sampled.values())))

    return optionsById
    

def getOptionsString(modelDict):
    """Serialise an options dict into one double-quoted, colon-separated string.

    The result starts with '"!V', optionally followed by ':VarTransform=...'
    (omitted when that value is the empty string), then ':<key>=<value>' for
    the remaining keys in a fixed order, and ends with a closing '"'.

    All values must already be strings; they are concatenated, not converted.
    Raises KeyError if any of the expected keys is missing.
    """
    alwaysWritten = (
        "NTrees",
        "BoostType",
        "Shrinkage",
        "MaxDepth",
        "SeparationType",
        "nCuts",
        "UseRandomisedTrees",
        "UseNvars",
        "UseBaggedBoost",
        "BaggedSampleFraction",
        "PruneMethod",
        "PruneStrength",
        "PruningValFraction",
    )
    text = "\"!V"
    transform = modelDict["VarTransform"]
    if transform != "":
        text += ":VarTransform=" + transform
    for optionName in alwaysWritten:
        text += ":" + optionName + "=" + modelDict[optionName]
    return text + "\""


def getRandomOptions(centralValues=None):
    """Draw one complete set of model options as a dict of strings.

    centralValues: None, or a sequence of exactly 12 `meanSigma` pairs, one
        per tunable option in this order: VarTransform, NTrees, Shrinkage,
        MaxDepth, nCuts, UseRandomisedTrees, UseNvars, UseBaggedBoost,
        BaggedSampleFraction, PruneMethod, PruneStrength, PruningValFraction.
        Each pair is forwarded to the matching sampler.

    When `centralValues` is None OR does not have length 12, every option is
    drawn without a central value (note: a wrong length is not reported).
    "BoostType" and "SeparationType" are fixed in both modes. The key order
    of the returned dict matters to callers that join its values.
    """
    if centralValues is not None and len(centralValues) == 12:
        cv = centralValues
        # Sampling centred on the supplied values; ranges differ slightly from the free mode.
        return {
            "VarTransform": getVarTransformChoice(maxSymbols=4, meanSigma=cv[0]),
            "NTrees": str(getRandomInt(1000, 5000, meanSigma=cv[1], spacing=100)),
            "BoostType": "Grad",
            "Shrinkage": str(getRandomFloat(0.04, 0.5, meanSigma=cv[2])),
            "MaxDepth": str(getRandomInt(4, 10, meanSigma=cv[3])),
            "SeparationType": "RegressionVariance",
            "nCuts": str(getRandomInt(10, 100, meanSigma=cv[4])),
            "UseRandomisedTrees": getRandomBool(meanSigma=cv[5]),
            "UseNvars": str(getRandomInt(5, 100, meanSigma=cv[6])),
            "UseBaggedBoost": getRandomBool(meanSigma=cv[7]),
            "BaggedSampleFraction": str(getRandomFloat(0.70, 5.00, meanSigma=cv[8])),
            "PruneMethod": getPruneMethodChoice(meanSigma=cv[9]),
            "PruneStrength": str(getRandomInt(0, 100, meanSigma=cv[10])),
            "PruningValFraction": str(getRandomFloat(0.0, 2.0, meanSigma=cv[11])),
        }

    # Free mode: every option drawn from its full range.
    return {
        "VarTransform": getVarTransformChoice(maxSymbols=4),
        "NTrees": str(getRandomInt(1500, 5000)),
        "BoostType": "Grad",
        "Shrinkage": str(getRandomFloat(0.10, 0.5)),
        "MaxDepth": str(getRandomInt(5, 10)),
        "SeparationType": "RegressionVariance",
        "nCuts": str(getRandomInt(15, 100)),
        "UseRandomisedTrees": getRandomBool(),
        "UseNvars": str(getRandomInt(2, 100)),
        "UseBaggedBoost": getRandomBool(),
        "BaggedSampleFraction": str(getRandomFloat(0.50, 5.00)),
        "PruneMethod": getPruneMethodChoice(),
        "PruneStrength": str(getRandomInt(0, 100)),
        "PruningValFraction": str(getRandomFloat(0.0, 2.0)),
    }


def getVarTransformChoice(minSymbols=0, maxSymbols=2, meanSigma=None):
    """Pick a comma-separated chain of transform symbols from P, G, D, N.

    The chain length is drawn uniformly from [minSymbols, maxSymbols]. Chains
    with two identical neighbouring symbols, or with more than one "P", are
    excluded. A length of 0 yields the empty string.

    meanSigma: optional pair (value, probability). With that probability the
        given value is returned as-is; otherwise a chain is drawn uniformly
        from the allowed ones (which may coincide with the given value).
    """
    # todo with sigma
    chainLength = np.random.randint(minSymbols, maxSymbols+1)
    symbols = ["P", "G", "D", "N"]

    def isAllowed(chain):
        # Reject immediate repetitions and a second "P".
        neighboursDiffer = all(left != right for left, right in zip(chain, chain[1:]))
        return neighboursDiffer and chain.count("P") <= 1

    allowed = [",".join(chain) for chain in product(symbols, repeat=chainLength) if isAllowed(chain)]

    if meanSigma is not None and np.random.uniform(0, 1) < meanSigma[1]:
        return meanSigma[0]
    return np.random.choice(allowed)


def getBestVarTransform(n=30):
    """Return the first `n` entries of a fixed list of VarTransform chains.

    The list is hard-coded (28 entries); presumably it is ordered best-first,
    but nothing in this function establishes how it was obtained.
    """
    ranked = (
        "G", "N,G,N,G", "", "N", "P,N,D", "P,N,D,N", "P,G",
        "P,N", "P,D", "P", "N,G", "G,N", "G,N,G,N", "N,G,N",
        "G,N,G", "P,D,N", "P,D,N,D", "N,P,D", "N,P,N,D", "N,P,G", "P,D,G",
        "D,N,D", "P,G,N", "P,N,G", "D,N,G", "P,G,N,G", "D", "P,N,P,D",
    )
    return list(ranked[:n])


def getPruneMethodChoice(meanSigma=None):
    """Draw a prune method name via getRandomItem.

    "NoPruning" is listed twice on purpose or by accident; either way it is
    twice as likely as each other method when drawing uniformly.
    `meanSigma` is forwarded unchanged to getRandomItem.
    """
    pruneMethods = ["NoPruning", "NoPruning", "ExpectedError", "CostComplexity"]
    return getRandomItem(pruneMethods, meanSigma=meanSigma)


def getRandomInt(minNum = 0, maxNum=100, meanSigma=None, spacing=1, maxTries=10000):
    """Draw a random integer, rounded to the nearest multiple of `spacing`.

    minNum, maxNum: inclusive bounds applied to the raw draw.
    meanSigma: None for a uniform draw over [minNum, maxNum]; otherwise a
        pair (mean, sigma) for a Gaussian draw that is repeated until it
        falls inside the bounds (rejection sampling).
    spacing: grid step the accepted draw is rounded to. Rounding happens
        after the range check, so the result can leave [minNum, maxNum] when
        the bounds are not multiples of `spacing`.
    maxTries: cap on the number of Gaussian draws. Without a cap the loop
        never ends when no draw can be accepted (e.g. sigma 0 with a mean
        outside the bounds).

    Returns a Python int.
    Raises ValueError if a Gaussian draw is requested with minNum > maxNum,
    and RuntimeError if no draw is accepted within `maxTries` attempts.
    """
    if meanSigma is None:
        num = np.random.randint(minNum, maxNum+1)
    else:
        if minNum > maxNum:
            raise ValueError("getRandomInt: empty interval [{}, {}]".format(minNum, maxNum))
        for _ in range(maxTries):
            num = np.random.normal(meanSigma[0], meanSigma[1])
            # Same acceptance test as a `while num < minNum or num > maxNum` loop.
            if not (num < minNum or num > maxNum):
                break
        else:
            raise RuntimeError(
                "getRandomInt: no draw from N({}, {}) fell inside [{}, {}] after {} tries".format(
                    meanSigma[0], meanSigma[1], minNum, maxNum, maxTries))
    return int(np.round(num/spacing, 0)*spacing)


def getRandomFloat(minNum = 0.0, maxNum=1.0, meanSigma=None, maxTries=10000):
    """Draw a random float rounded to 3 decimals.

    minNum, maxNum: bounds applied to the raw draw.
    meanSigma: None for a uniform draw between the bounds; otherwise a pair
        (mean, sigma) for a Gaussian draw that is repeated until it falls
        inside [minNum, maxNum] (rejection sampling).
    maxTries: cap on the number of Gaussian draws. Without a cap the loop
        never ends when no draw can be accepted (e.g. sigma 0 with a mean
        outside the bounds).

    Returns the value produced by np.round(..., 3).
    Raises ValueError if a Gaussian draw is requested with minNum > maxNum,
    and RuntimeError if no draw is accepted within `maxTries` attempts.
    """
    if meanSigma is None:
        return np.round(np.random.uniform(minNum, maxNum), 3)

    if minNum > maxNum:
        raise ValueError("getRandomFloat: empty interval [{}, {}]".format(minNum, maxNum))
    for _ in range(maxTries):
        num = np.random.normal(meanSigma[0], meanSigma[1])
        # Same acceptance test as a `while num < minNum or num > maxNum` loop.
        if not (num < minNum or num > maxNum):
            return np.round(num, 3)
    raise RuntimeError(
        "getRandomFloat: no draw from N({}, {}) fell inside [{}, {}] after {} tries".format(
            meanSigma[0], meanSigma[1], minNum, maxNum, maxTries))
    

def getRandomBool(meanSigma=None):
    """Draw the flag "F" or "T" via getRandomItem; `meanSigma` is forwarded unchanged."""
    flags = ["F", "T"]
    return getRandomItem(flags, meanSigma=meanSigma)

        
def getRandomItem(listOfChoices, meanSigma=None):
    """Draw one item from `listOfChoices`, optionally biased towards a value.

    meanSigma: None for a uniform draw over the list (duplicates count as
        extra weight). Otherwise a pair (value, probability): `value` is
        returned with that probability, and in the remaining cases an item
        is drawn uniformly from the entries that differ from `value`.

    If no entry differs from `value` there is nothing else to draw, so
    `value` is returned (previously this case raised ValueError from
    np.random.choice on an empty list). The uniform draw is still consumed
    first, so the random stream is the same as before.
    """
    if meanSigma is None:
        return np.random.choice(listOfChoices)

    central, keepProbability = meanSigma[0], meanSigma[1]
    alternatives = [x for x in listOfChoices if x != central]
    if np.random.uniform(0, 1) < keepProbability or not alternatives:
        return central
    return np.random.choice(alternatives)

def getInitValsModelOptions(modelsId, variances= [0.5, 600, 0.15, 2, 10, 0.7, 10, 0.7, 0.2, 0.6, 10, 0.3], fileName="/home/submit/pdmonte/CMSSW_10_6_27/src/Hrare2023/analysis/TMVA_regression/optionModels.out"):
    """Read logged model options and pair each value with a spread.

    modelsId: ids of the models to look up in the '$'-separated log file.
    variances: one spread (or probability, for categorical options) per
        selected column, in the order of `selected_columns` below. The
        default list is only read, never modified.
    fileName: log file with one model per line, fields separated by '$'.

    Returns one entry per matching row, in FILE order (not `modelsId` order;
    an id logged several times yields several entries). Each entry is a list
    of 12 [value, variance] pairs.
    Raises ValueError if fewer variances than selected columns are given;
    `zip` would otherwise shorten the entries silently.
    """
    column_names = ['Id', 'VarTsf', 'NTrees', 'BoostType', 'Shrinkage', 'MaxDepth', 'SeparationType', 'nCuts', 'RndmTrees',
                    'UseNvars', 'UseBaggedBoost', 'BaggedSampleFraction', 'PruneMethod', 'PruneStrength', 'PruningValFraction']
    column_data_types = {'Id': int, 'VarTsf': str, 'NTrees': int, 'BoostType': str, 'Shrinkage': float, 'MaxDepth': int, 'SeparationType': str, 'nCuts': int, 'RndmTrees': str,
                    'UseNvars': int, 'UseBaggedBoost': str, 'BaggedSampleFraction': float, 'PruneMethod': str, 'PruneStrength': int, 'PruningValFraction': float}
    selected_columns = ['VarTsf', 'NTrees', 'Shrinkage', 'MaxDepth', 'nCuts', 'RndmTrees', 'UseNvars', 'UseBaggedBoost',
                        'BaggedSampleFraction', 'PruneMethod', 'PruneStrength', 'PruningValFraction']

    if len(variances) < len(selected_columns):
        raise ValueError("getInitValsModelOptions: expected {} variances, got {}".format(
            len(selected_columns), len(variances)))

    df = pd.read_csv(fileName, sep='$', names=column_names, dtype=column_data_types)

    # Strip the padding around the '$' separators. The text columns are taken
    # from the declared types instead of testing for dtype "object", so this
    # does not depend on how pandas stores strings. A blank field (e.g. no
    # VarTransform) becomes "" even if it was parsed as a missing value.
    for name, kind in column_data_types.items():
        if kind is str:
            df[name] = df[name].fillna("").str.strip()

    df = df[df["Id"].isin(modelsId)]
    rows = df[selected_columns].values.tolist()
    return [[[op, vr] for op, vr in zip(model, variances)] for model in rows]

In [5]:
# Generate new candidate models around each previously selected ("good") model
# and append one job line per candidate to commands_temp.txt.
# Each job line is "<job label>:::<shell command>": three TMVA_GF_regression.C
# invocations (third argument 0, 1, 2) followed by computeErrors.py.
# NOTE: every run appends to both commands_temp.txt and the options log, and
# reuses ids starting at 20000, so running this cell twice duplicates ids.
channel="d0starrho"
OPTIONS_FILE = "/home/submit/pdmonte/CMSSW_10_6_27/src/Hrare2023/analysis/TMVA_regression/optionModels.out"
numMod = 500  # number of candidates generated around each good model
dfC, dlC = 7, 3684  # overwritten below with the values parsed from each model name

modelNames=["BDTG_df15_dl3684_v0_v1"] #13/3620
#modelNames=["BDTG_df13_dl3620_v0_v1"] #13/3620
#modelNames=["BDTG_df7_dl3684_v0_v1"] #5/3620
#modelNames=["BDTG_df7_dl3684_v0_v1"]

# Ids of the models used as centres; earlier selections are kept for reference.
#goodModels = [10329]
#goodModels = [11264] #omega
#goodModels = [12167] #d0star
goodModels = [17130, 17883, 18070, 18073, 18284, 18652, 18679, 18920, 18984, 18992, 19274, 19774] #d0starrho

#optionsDict = {0: "\"!V:NTrees=1000:BoostType=Grad:Shrinkage=0.2:MaxDepth=5:SeparationType=SDivSqrtSPlusB:nCuts=90:UseRandomisedTrees=T:UseNvars=67:UseBaggedBoost:BaggedSampleFraction=2.4:PruneMethod=NoPruning\""}

#initVals = [["G", 0.3], [2000, 200], [0.2, 0.001], [7, 1], [30, 1], ["F", 0.37], [30, 1], ["F", 0.15], [1.0, 1], ["NoPruning", 0.677], [50, 1], [1, 1]]

#initVals = getInitValsModelOptions(goodModels, variances = [0.5, 700, 0.07, 1.5, 10, 0.6, 8, 0.6, 0.25, 0.6, 8, 0.25], fileName=OPTIONS_FILE)
initVals = getInitValsModelOptions(goodModels, variances = [0.5, 500, 0.07, 1.5, 10, 0.6, 8, 0.6, 0.25, 0.6, 8, 0.25], fileName=OPTIONS_FILE)

# One template per command step; `stage` is the third macro argument.
rootTemplate = "root -l -q -b 'TMVA_GF_regression.C(\"{name}\", \"{channel}\", {stage}, {{{listVars}}}, {dfC}, {dlC}, {optStr})'"
errorsTemplate = "python computeErrors.py -m {name} -c {channel}"

# `with` guarantees the command file is flushed and closed even if a model
# name fails to parse or the options log cannot be written.
with open("commands_temp.txt", "a") as f:
    #comm = "RECO:::python computeErrors.py -m RECO -c {channel}".format(channel=channel)
    #print(comm)
    #f.write(comm + "\n")
    for k, init in enumerate(initVals):
        optionsDict = generateModelsDict(startIndex=20000+k*numMod, numModels=numMod, nameOptionsFile=OPTIONS_FILE, centralValues=init)
        #optionsDict = generateModelsDict(startIndex=2103, numModels=1200)

        for key, optStr in optionsDict.items():
            for line in modelNames:
                # Model names look like "BDTG_df<dfC>_dl<dlC>_v<i>_v<j>...".
                jobName = line.strip()
                nameParts = jobName.split("_")
                dfC = nameParts[1].replace("df", "")
                dlC = nameParts[2].replace("dl", "")
                listVars = [int(x.replace("v", "")) for x in nameParts[3:]]
                varNamesShort = "".join([str(var) for var in listVars])
                name = "BDTG_df{dfC}_dl{dlC}_{varNames}_opt{idx}".format(dfC=dfC, dlC=dlC, varNames=getVarNames(listVars), idx=key)

                steps = [rootTemplate.format(name=name, channel=channel, stage=stage, listVars=str(listVars)[1:-1], dfC=dfC, dlC=dlC, optStr=optStr)
                         for stage in (0, 1, 2)]
                steps.append(errorsTemplate.format(name=name, channel=channel))

                comm = "o{idx}_v{varNamesShort}:::".format(idx=key, varNamesShort=varNamesShort) + " && ".join(steps)
                print(comm)
                f.write(comm + "\n")

o20000_v01:::root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_opt20000", "d0starrho", 0, {0, 1}, 15, 3684, "!V:VarTransform=N,P,N,D:NTrees=2100:BoostType=Grad:Shrinkage=0.048:MaxDepth=5:SeparationType=RegressionVariance:nCuts=33:UseRandomisedTrees=F:UseNvars=47:UseBaggedBoost=F:BaggedSampleFraction=1.061:PruneMethod=NoPruning:PruneStrength=4:PruningValFraction=1.567")' && root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_opt20000", "d0starrho", 1, {0, 1}, 15, 3684, "!V:VarTransform=N,P,N,D:NTrees=2100:BoostType=Grad:Shrinkage=0.048:MaxDepth=5:SeparationType=RegressionVariance:nCuts=33:UseRandomisedTrees=F:UseNvars=47:UseBaggedBoost=F:BaggedSampleFraction=1.061:PruneMethod=NoPruning:PruneStrength=4:PruningValFraction=1.567")' && root -l -q -b 'TMVA_GF_regression.C("BDTG_df15_dl3684_v0_v1_opt20000", "d0starrho", 2, {0, 1}, 15, 3684, "!V:VarTransform=N,P,N,D:NTrees=2100:BoostType=Grad:Shrinkage=0.048:MaxDepth=5:SeparationType=RegressionVariance:nCuts=33:UseRandomisedT

In [None]:
# To check the remaining models to compute:
# remainingModels.txt = names in totalModels.txt that are not in completedModels.txt.
def _readStrippedLines(path):
    """Return every line of the text file at `path`, stripped, in file order."""
    with open(path, "r") as file:
        return [line.strip() for line in file]

file_path = "completedModels.txt"
completed = _readStrippedLines(file_path)
# Preview the first entry only when there is one: the file is empty before any model has finished.
print(len(completed), completed[0] if completed else None)

file_path = "totalModels.txt"
total = _readStrippedLines(file_path)
print(len(total), total[0] if total else None)

todo = sorted(set(total).difference(completed))
print(len(todo))

# `with` closes (and flushes) the output file even if a write fails.
with open("remainingModels.txt", "w") as f:
    for line in todo:
        f.write(line + "\n")

In [None]:
# To solve the errors: rebuild the job line for every file listed in errorFiles.txt.
# Each listed path is reduced to its last component minus a 4-character suffix,
# which is expected to look like "df<dfC>_dl<dlC>_v<i>_v<j>...".
# NOTE(review): the label is separated from the command by a single ':' here,
# whereas the d0starrho cell uses ':::' — confirm which one the job runner expects.
channel="phi"

rootTemplate = "root -l -q -b 'TMVA_GF_regression.C(\"BDTG_{tag}\", \"{channel}\", {stage}, {{{listVars}}}, {dfC}, {dlC})'"
errorsTemplate = "python computeErrors.py -m BDTG_{tag} -c {channel}"

file_path = "errorFiles.txt"
# Both files are managed by `with`, so commands_temp.txt is closed even if a line fails to parse.
with open(file_path, "r") as file, open("commands_temp.txt", "w") as f:
    for line in file:
        if not line.strip():
            continue  # a blank line carries no job name and would fail the parsing below
        jobName = line.strip().split("/")[-1][:-4]
        nameParts = jobName.split("_")
        dfC = nameParts[0].replace("df", "")
        dlC = nameParts[1].replace("dl", "")
        listVars = [int(x.replace("v", "")) for x in nameParts[2:]]
        tag = "df{dfC}_dl{dlC}_{varNames}".format(dfC=dfC, dlC=dlC, varNames=getVarNames(listVars))

        steps = [rootTemplate.format(tag=tag, channel=channel, stage=stage, listVars=str(listVars)[1:-1], dfC=dfC, dlC=dlC)
                 for stage in (0, 1, 2)]
        steps.append(errorsTemplate.format(tag=tag, channel=channel))

        comm = tag + ":" + " && ".join(steps)
        print(comm)
        f.write(comm + "\n")

In [None]:
# To create the commands for the remaining models: one job line per name in remainingModels.txt.
# Names are expected to look like "BDTG_df<dfC>_dl<dlC>_v<i>_v<j>...".
# NOTE(review): the label is separated from the command by a single ':' here,
# whereas the d0starrho cell uses ':::' — confirm which one the job runner expects.
channel="phi"

rootTemplate = "root -l -q -b 'TMVA_GF_regression.C(\"BDTG_{tag}\", \"{channel}\", {stage}, {{{listVars}}}, {dfC}, {dlC})'"
errorsTemplate = "python computeErrors.py -m BDTG_{tag} -c {channel}"

file_path = "remainingModels.txt"
# Both files are managed by `with`, so commands_temp.txt is closed even if a line fails to parse.
with open(file_path, "r") as file, open("commands_temp.txt", "w") as f:
    for line in file:
        jobName = line.strip()
        if not jobName:
            continue  # a blank line carries no job name and would fail the parsing below
        nameParts = jobName.split("_")
        dfC = nameParts[1].replace("df", "")
        dlC = nameParts[2].replace("dl", "")
        listVars = [int(x.replace("v", "")) for x in nameParts[3:]]
        tag = "df{dfC}_dl{dlC}_{varNames}".format(dfC=dfC, dlC=dlC, varNames=getVarNames(listVars))

        steps = [rootTemplate.format(tag=tag, channel=channel, stage=stage, listVars=str(listVars)[1:-1], dfC=dfC, dlC=dlC)
                 for stage in (0, 1, 2)]
        steps.append(errorsTemplate.format(tag=tag, channel=channel))

        comm = tag + ":" + " && ".join(steps)
        print(comm)
        f.write(comm + "\n")