From c727e1739a7b3a3649d3ff543734fc7ec5f4d0ef Mon Sep 17 00:00:00 2001 From: Greg Daues Date: Mon, 23 Mar 2026 08:28:33 -0700 Subject: [PATCH 1/3] DM-54449: updates for mempercore, peakmemory --- python/lsst/ctrl/execute/allocationConfig.py | 5 +++ python/lsst/ctrl/execute/allocator.py | 40 ++++++++++++++++++-- python/lsst/ctrl/execute/allocatorParser.py | 11 +++++- python/lsst/ctrl/execute/condorConfig.py | 2 + python/lsst/ctrl/execute/slurmPlugin.py | 13 +++++++ 5 files changed, 66 insertions(+), 5 deletions(-) diff --git a/python/lsst/ctrl/execute/allocationConfig.py b/python/lsst/ctrl/execute/allocationConfig.py index b61da3c..b41facc 100644 --- a/python/lsst/ctrl/execute/allocationConfig.py +++ b/python/lsst/ctrl/execute/allocationConfig.py @@ -43,6 +43,11 @@ class AllocatedPlatformConfig(pexConfig.Config): dtype=str, default=None, ) + collector = pexConfig.Field( + doc="host where HTCondor collector service is running", + dtype=str, + default=None, + ) loginHostName = pexConfig.Field(doc="the host to login and copy files to", dtype=str, default=None) utilityPath = pexConfig.Field( doc="the directory containing the scheduler commands", dtype=str, default=None diff --git a/python/lsst/ctrl/execute/allocator.py b/python/lsst/ctrl/execute/allocator.py index 4cfe3a4..9d41fa2 100644 --- a/python/lsst/ctrl/execute/allocator.py +++ b/python/lsst/ctrl/execute/allocator.py @@ -109,15 +109,26 @@ def __init__( self.defaults["USER_SCRATCH"] = user_scratch self.commandLineDefaults = {} self.commandLineDefaults["NODE_COUNT"] = self.opts.nodeCount - self.commandLineDefaults["COLLECTOR"] = self.opts.collector + if self.configuration.platform.collector: + self.commandLineDefaults["COLLECTOR"] = self.configuration.platform.collector + if self.opts.collector: + self.commandLineDefaults["COLLECTOR"] = self.opts.collector self.commandLineDefaults["CPORT"] = self.opts.collectorport + self.commandLineDefaults["PEAKCPUS"] = self.configuration.platform.peakcpus + self.commandLineDefaults["PEAKMEMORY"] = self.configuration.platform.peakmemory if self.opts.exclusive: self.commandLineDefaults["CPUS"] = self.configuration.platform.peakcpus else: - self.commandLineDefaults["CPUS"] = self.opts.cpus + if self.opts.cpus < self.configuration.platform.peakcpus: + self.commandLineDefaults["CPUS"] = self.opts.cpus + else: + self.commandLineDefaults["CPUS"] = self.configuration.platform.peakcpus self.commandLineDefaults["WALL_CLOCK"] = self.opts.maximumWallClock self.commandLineDefaults["ACCOUNT"] = self.opts.account - self.commandLineDefaults["MEMPERCORE"] = 4096 + if self.opts.mempercore: + self.commandLineDefaults["MEMPERCORE"] = self.opts.mempercore + else: + self.commandLineDefaults["MEMPERCORE"] = 4096 self.commandLineDefaults["ALLOWEDAUTO"] = 500 self.commandLineDefaults["AUTOCPUS"] = 16 self.commandLineDefaults["MINAUTOCPUS"] = 15 @@ -223,7 +234,7 @@ def createSubmitFile(self, inputFile): if not os.path.exists(self.configDir): os.makedirs(self.configDir) outfile = self.createFile(inputFile, self.submitFileName) - _LOG.debug("Wrote new Slurm submit file to %s", outfile) + _LOG.debug("Wrote new submit file to %s", outfile) return outfile def createCondorConfigFile(self, input): @@ -350,6 +361,21 @@ def getCPUs(self): """ return self.getParameter("CPUS") + def getPeakcpus(self): + """Accessor for PEAKCPUS + @return the value of PEAKCPUS + """ + return self.getParameter("PEAKCPUS") + + def getPeakmemory(self): + """Accessor for PEAKMEMORY + @return the value of PEAKMEMORY + """ + peakmemory = self.getParameter("PEAKMEMORY") + if self.opts.queue == "torino": + peakmemory = int(3*peakmemory/2) + return peakmemory + def getAutoCPUs(self): """Size of standard glideins for allocateNodes auto @return the value of autoCPUs @@ -366,6 +392,12 @@ def getMinAutoCPUs(self): """ return self.getParameter("MINAUTOCPUS") + def getCollector(self): + """Accessor for COLLECTOR + @return the value of COLLECTOR + """ + return self.getParameter("COLLECTOR") + def getWallClock(self): """Accessor for WALL_CLOCK @return the value of WALL_CLOCK diff --git a/python/lsst/ctrl/execute/allocatorParser.py b/python/lsst/ctrl/execute/allocatorParser.py index c22e6c8..d7a4df4 100644 --- a/python/lsst/ctrl/execute/allocatorParser.py +++ b/python/lsst/ctrl/execute/allocatorParser.py @@ -123,6 +123,15 @@ def parseArgs(self, basename) -> argparse.Namespace: type=int, required=False, ) + parser.add_argument( + "--mempercore", + action="store", + default=4096, + dest="mempercore", + help="Memory per core to be scheduled by default", + type=int, + required=False, + ) parser.add_argument( "-s", "--qos", @@ -147,7 +156,7 @@ def parseArgs(self, basename) -> argparse.Namespace: "--queue", action="store", dest="queue", - default="roma,milano", + default="milano", help="queue / partition name", ) parser.add_argument( diff --git a/python/lsst/ctrl/execute/condorConfig.py b/python/lsst/ctrl/execute/condorConfig.py index e8c1c28..b794bd2 100644 --- a/python/lsst/ctrl/execute/condorConfig.py +++ b/python/lsst/ctrl/execute/condorConfig.py @@ -44,6 +44,8 @@ class PlatformConfig(pexConfig.Config): nodeSetRequired = pexConfig.Field(doc="is the nodeset required", dtype=bool, default=False) scheduler = pexConfig.Field(doc="scheduler type", dtype=str, default=None) peakcpus = pexConfig.Field(doc="peakcpus", dtype=int, default=None) + peakmemory = pexConfig.Field(doc="peakmemory", dtype=int, default=None) + collector = pexConfig.Field(doc="collector", dtype=str, default=None) manager = pexConfig.Field(doc="workflow manager", dtype=str, default=None) setup_using = pexConfig.Field(doc="environment setup type", dtype=str, default=None) manager_software_home = pexConfig.Field( diff --git a/python/lsst/ctrl/execute/slurmPlugin.py b/python/lsst/ctrl/execute/slurmPlugin.py index f4f1c7a..cadfb86 100644 --- a/python/lsst/ctrl/execute/slurmPlugin.py +++ b/python/lsst/ctrl/execute/slurmPlugin.py @@ -150,6 +150,10 @@ def submit(self): cpus = self.getCPUs() memoryPerCore = self.getMemoryPerCore() totalMemory = cpus * memoryPerCore + peakMemory = self.getPeakmemory() + if totalMemory > peakMemory: + totalMemory = peakMemory + _LOG.debug("Direct: Setting job memory to peak memory on platform.") # run the sbatch command template = Template(self.getLocalScratchDirectory()) @@ -324,6 +328,11 @@ def glideinsFromJobPressure(self): autoCPUs = cpus memoryPerCore = self.getMemoryPerCore() memoryLimit = autoCPUs * memoryPerCore + peakMemory = self.getPeakmemory() + if memoryLimit > peakMemory: + memoryLimit = peakMemory + _LOG.debug("Auto: Setting job memory to peak memory on platform.") + auser = self.getUserName() anodeset = self.getNodeset() @@ -400,6 +409,10 @@ def glideinsFromJobPressure(self): _LOG.debug("\n%d.%d", ajob["ClusterId"], ajob["ProcId"]) _LOG.debug("%s", ajob) thisMemory = ajob["RequestMemoryEval"] + peakMemory = self.getPeakmemory() + if thisMemory > peakMemory: + thisMemory = peakMemory + _LOG.debug("Auto large: Setting job memory to peak memory on platform.") useCores = ajob["RequestCpus"] clusterid = ajob["ClusterId"] procid = ajob["ProcId"] From fc9b4e98dae935f5c3a5669383309d0a78c4cc98 Mon Sep 17 00:00:00 2001 From: Greg Daues Date: Mon, 23 Mar 2026 08:37:30 -0700 Subject: [PATCH 2/3] DM-54449: python formatting, include tests --- python/lsst/ctrl/execute/allocator.py | 2 +- tests/test_allocatorParser.py | 6 ++++++ tests/test_condorConfig.py | 2 ++ tests/test_slurmPlugin.py | 4 ++++ tests/testfiles/config_condor_slurm.py | 2 ++ tests/testfiles/config_execconfig.py | 1 + 6 files changed, 16 insertions(+), 1 deletion(-) diff --git a/python/lsst/ctrl/execute/allocator.py b/python/lsst/ctrl/execute/allocator.py index 9d41fa2..135b321 100644 --- a/python/lsst/ctrl/execute/allocator.py +++ b/python/lsst/ctrl/execute/allocator.py @@ -373,7 +373,7 @@ def getPeakmemory(self): """ peakmemory = self.getParameter("PEAKMEMORY") if self.opts.queue == "torino": - peakmemory = int(3*peakmemory/2) + peakmemory = int(3 * peakmemory / 2) return peakmemory def getAutoCPUs(self): diff --git a/tests/test_allocatorParser.py b/tests/test_allocatorParser.py index e34f092..81bc4e8 100644 --- a/tests/test_allocatorParser.py +++ b/tests/test_allocatorParser.py @@ -46,6 +46,10 @@ def test1(self): "sdfmilan003", "--nodelist", "sdfmilan004", + "--mempercore", + "6144", + "--collector", + "sdfiana039", "-q", "normal", "-O", @@ -63,6 +67,8 @@ def test1(self): self.assertEqual(args.maximumWallClock, "00:30:00") self.assertEqual(args.exclude, "sdfmilan003") self.assertEqual(args.nodelist, "sdfmilan004") + self.assertEqual(args.mempercore, 6144) + self.assertEqual(args.collector, "sdfiana039") self.assertEqual(args.queue, "normal") self.assertEqual(args.outputLog, "outlog") self.assertEqual(args.errorLog, "errlog") diff --git a/tests/test_condorConfig.py b/tests/test_condorConfig.py index 4b76fa0..cff4492 100644 --- a/tests/test_condorConfig.py +++ b/tests/test_condorConfig.py @@ -95,6 +95,8 @@ def test5(self): self.assertEqual(self.config.platform.scheduler, "slurm") self.assertEqual(self.config.platform.setup_using, "getenv") self.assertEqual(self.config.platform.manager, "dagman") + self.assertEqual(self.config.platform.peakcpus, 120) + self.assertEqual(self.config.platform.peakmemory, 491520) def test6(self): path = os.path.join("tests", "testfiles", "config_pegasus.py") diff --git a/tests/test_slurmPlugin.py b/tests/test_slurmPlugin.py index 9e5e984..0db65e6 100644 --- a/tests/test_slurmPlugin.py +++ b/tests/test_slurmPlugin.py @@ -88,12 +88,16 @@ def test1(self): scheduler: Allocator = schedulerClass(platform, args, configuration, condor_info_file) self.assertTrue(scheduler) + peakcpus = scheduler.getPeakcpus() + peakmemory = scheduler.getPeakmemory() autocpus = scheduler.getAutoCPUs() minautocpus = scheduler.getMinAutoCPUs() cpus = scheduler.getCPUs() nodes = scheduler.getNodes() nodeset = scheduler.getNodeset() wallclock = scheduler.getWallClock() + self.assertEqual(peakcpus, 120) + self.assertEqual(peakmemory, 737280) self.assertEqual(autocpus, 16) self.assertEqual(minautocpus, 15) self.assertEqual(cpus, 12) diff --git a/tests/testfiles/config_condor_slurm.py b/tests/testfiles/config_condor_slurm.py index 035abfb..60d4e44 100644 --- a/tests/testfiles/config_condor_slurm.py +++ b/tests/testfiles/config_condor_slurm.py @@ -1,4 +1,6 @@ # flake8: noqa +config.platform.peakcpus = 120 +config.platform.peakmemory = 491520 config.platform.defaultRoot = "/usr" config.platform.localScratch = "./tests/condor_scratch_slurm" config.platform.dataDirectory = "/tmp/data_slurm" diff --git a/tests/testfiles/config_execconfig.py b/tests/testfiles/config_execconfig.py index 246fad4..f2ac041 100644 --- a/tests/testfiles/config_execconfig.py +++ b/tests/testfiles/config_execconfig.py @@ -4,3 +4,4 @@ config.platform.fileSystemDomain = "slac.stanford.edu" config.platform.scheduler = "slurm" config.platform.peakcpus = 120 +config.platform.peakmemory = 737280 From 0815924d397afbc16b2493e5af371b189c63c5ab Mon Sep 17 00:00:00 2001 From: Greg Daues Date: Wed, 25 Mar 2026 13:00:12 -0700 Subject: [PATCH 3/3] DM-54449: clean up handling of default values --- python/lsst/ctrl/execute/allocator.py | 15 +++++++++------ python/lsst/ctrl/execute/allocatorParser.py | 2 +- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/python/lsst/ctrl/execute/allocator.py b/python/lsst/ctrl/execute/allocator.py index 135b321..fe963c1 100644 --- a/python/lsst/ctrl/execute/allocator.py +++ b/python/lsst/ctrl/execute/allocator.py @@ -114,8 +114,14 @@ def __init__( if self.opts.collector: self.commandLineDefaults["COLLECTOR"] = self.opts.collector self.commandLineDefaults["CPORT"] = self.opts.collectorport - self.commandLineDefaults["PEAKCPUS"] = self.configuration.platform.peakcpus - self.commandLineDefaults["PEAKMEMORY"] = self.configuration.platform.peakmemory + if self.configuration.platform.peakcpus: + self.commandLineDefaults["PEAKCPUS"] = self.configuration.platform.peakcpus + else: + self.commandLineDefaults["PEAKCPUS"] = 256 + if self.configuration.platform.peakmemory: + self.commandLineDefaults["PEAKMEMORY"] = self.configuration.platform.peakmemory + else: + self.commandLineDefaults["PEAKMEMORY"] = 1000000 if self.opts.exclusive: self.commandLineDefaults["CPUS"] = self.configuration.platform.peakcpus else: @@ -125,10 +131,7 @@ def __init__( self.commandLineDefaults["CPUS"] = self.configuration.platform.peakcpus self.commandLineDefaults["WALL_CLOCK"] = self.opts.maximumWallClock self.commandLineDefaults["ACCOUNT"] = self.opts.account - if self.opts.mempercore: - self.commandLineDefaults["MEMPERCORE"] = self.opts.mempercore - else: - self.commandLineDefaults["MEMPERCORE"] = 4096 + self.commandLineDefaults["MEMPERCORE"] = self.opts.mempercore self.commandLineDefaults["ALLOWEDAUTO"] = 500 self.commandLineDefaults["AUTOCPUS"] = 16 self.commandLineDefaults["MINAUTOCPUS"] = 15 diff --git a/python/lsst/ctrl/execute/allocatorParser.py b/python/lsst/ctrl/execute/allocatorParser.py index d7a4df4..ee7e3ff 100644 --- a/python/lsst/ctrl/execute/allocatorParser.py +++ b/python/lsst/ctrl/execute/allocatorParser.py @@ -128,7 +128,7 @@ def parseArgs(self, basename) -> argparse.Namespace: action="store", default=4096, dest="mempercore", - help="Memory per core to be scheduled by default", + help="Memory per core in MB to be scheduled by default", type=int, required=False, )