Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix SLURM RAM monitoring #473

Merged
merged 11 commits into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .conda/meta.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{% set version = "2.3.1" %}
{% set version = "2.3.2" %}

package:
name: codecarbon
Expand Down
2 changes: 1 addition & 1 deletion codecarbon/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "2.3.1"
__version__ = "2.3.2"
20 changes: 15 additions & 5 deletions codecarbon/core/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,11 @@

from codecarbon.external.logger import logger

SLURM_JOB_ID = os.environ.get(
"SLURM_JOB_ID", # default
os.environ.get("SLURM_JOBID"), # deprecated but may still be used
)


@contextmanager
def suppress(*exceptions):
Expand Down Expand Up @@ -75,16 +80,20 @@ def detect_cpu_model() -> str:


def count_cpus() -> int:
if os.environ.get("SLURM_JOB_ID") is None:
if SLURM_JOB_ID is None:
return psutil.cpu_count()

try:
logger.debug(
"SLURM environment detected for job {SLURM_JOB_ID}, running"
+ " `scontrol show job $SLURM_JOB_ID` to count SLURM-available cpus."
)
scontrol = subprocess.check_output(
["scontrol show job $SLURM_JOBID"], shell=True
[f"scontrol show job {SLURM_JOB_ID}"], shell=True
).decode()
except subprocess.CalledProcessError:
logger.warning(
"Error running `scontrol show job $SLURM_JOBID` "
"Error running `scontrol show job $SLURM_JOB_ID` "
+ "to count SLURM-available cpus. Using the machine's cpu count."
)
return psutil.cpu_count()
Expand All @@ -93,17 +102,18 @@ def count_cpus() -> int:

if len(num_cpus_matches) == 0:
logger.warning(
"Could not find NumCPUs= after running `scontrol show job $SLURM_JOBID` "
"Could not find NumCPUs= after running `scontrol show job $SLURM_JOB_ID` "
+ "to count SLURM-available cpus. Using the machine's cpu count."
)
return psutil.cpu_count()

if len(num_cpus_matches) > 1:
logger.warning(
"Unexpected output after running `scontrol show job $SLURM_JOBID` "
"Unexpected output after running `scontrol show job $SLURM_JOB_ID` "
+ "to count SLURM-available cpus. Using the machine's cpu count."
)
return psutil.cpu_count()

num_cpus = num_cpus_matches[0].replace("NumCPUs=", "")
logger.debug(f"Detected {num_cpus} cpus available on SLURM.")
return int(num_cpus)
55 changes: 43 additions & 12 deletions codecarbon/external/hardware.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
"""
Encapsulates external dependencies to retrieve hardware metadata
"""

import os
import re
import subprocess
from abc import ABC, abstractmethod
Expand All @@ -14,7 +12,7 @@
from codecarbon.core.cpu import IntelPowerGadget, IntelRAPL
from codecarbon.core.gpu import AllGPUDevices
from codecarbon.core.units import Energy, Power, Time
from codecarbon.core.util import detect_cpu_model
from codecarbon.core.util import SLURM_JOB_ID, detect_cpu_model
from codecarbon.external.logger import logger

# default W value for a CPU if no model is found in the ref csv
Expand Down Expand Up @@ -238,6 +236,7 @@ class RAM(BaseHardware):
# 3 watts of power for every 8GB of DDR3 or DDR4 memory
# https://www.crucial.com/support/articles-faq-memory/how-much-power-does-memory-use
power_per_GB = 3 / 8 # W/GB
memory_size = None

def __init__(
self,
Expand Down Expand Up @@ -273,13 +272,29 @@ def _get_children_memories(self):

def _read_slurm_scontrol(self):
try:
return subprocess.check_output(
["scontrol show job $SLURM_JOBID"], shell=True
).decode()
logger.debug(
"SLURM environment detected, running `scontrol show job $SLURM_JOB_ID`..."
)
return (
subprocess.check_output(
[f"scontrol show job {SLURM_JOB_ID}"], shell=True
)
.decode()
.strip()
)
except subprocess.CalledProcessError:
return

def _parse_scontrol_memory_GB(self, mem):
"""
Parse the memory string (B) returned by scontrol to a float (GB)

Args:
mem (str): Memory string (B) as `[amount][unit]` (e.g. `128G`)

Returns:
float: Memory (GB)
"""
nb = int(mem[:-1])
unit = mem[-1]
if unit == "T":
Expand All @@ -292,16 +307,16 @@ def _parse_scontrol_memory_GB(self, mem):
return nb / (1000**2)

def _parse_scontrol(self, scontrol_str):
mem_matches = re.findall(r"mem=\d+[A-Z]", scontrol_str)
mem_matches = re.findall(r"AllocTRES=.*?,mem=(\d+[A-Z])", scontrol_str)
if len(mem_matches) == 0:
logger.warning(
"Could not find mem= after running `scontrol show job $SLURM_JOBID` "
"Could not find mem= after running `scontrol show job $SLURM_JOB_ID` "
+ "to count SLURM-available RAM. Using the machine's total RAM."
)
return psutil.virtual_memory().total / B_TO_GB
if len(mem_matches) > 1:
logger.warning(
"Unexpected output after running `scontrol show job $SLURM_JOBID` "
"Unexpected output after running `scontrol show job $SLURM_JOB_ID` "
+ "to count SLURM-available RAM. Using the machine's total RAM."
)
return psutil.virtual_memory().total / B_TO_GB
Expand All @@ -310,17 +325,27 @@ def _parse_scontrol(self, scontrol_str):

@property
def slurm_memory_GB(self):
"""
Property to compute the SLURM-available RAM in GigaBytes.

Returns:
float: Memory allocated to the job (GB)
"""
# Cache the result so scontrol is not called again at each measurement
if self.memory_size:
return self.memory_size
scontrol_str = self._read_slurm_scontrol()
if scontrol_str is None:
logger.warning(
"Error running `scontrol show job $SLURM_JOBID` "
"Error running `scontrol show job $SLURM_JOB_ID` "
+ "to retrieve SLURM-available RAM."
+ "Using the machine's total RAM."
)
return psutil.virtual_memory().total / B_TO_GB
mem = self._parse_scontrol(scontrol_str)
if isinstance(mem, str):
return self._parse_scontrol_memory_GB(mem)
mem = self._parse_scontrol_memory_GB(mem)
self.memory_size = mem
return mem

@property
Expand All @@ -338,9 +363,15 @@ def process_memory_GB(self):

@property
def machine_memory_GB(self):
"""
Property to compute the total available memory in gigabytes (the SLURM
allocation when running inside a SLURM job, otherwise the machine's RAM).

Returns:
float: Total RAM (GB)
"""
return (
self.slurm_memory_GB
if os.environ.get("SLURM_JOB_ID")
if SLURM_JOB_ID
else psutil.virtual_memory().total / B_TO_GB
)

Expand Down
4 changes: 2 additions & 2 deletions docs/_static/documentation_options.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
const DOCUMENTATION_OPTIONS = {
VERSION: '2.3.1',
VERSION: '2.3.2',
LANGUAGE: 'en',
COLLAPSE_INDEX: false,
BUILDER: 'html',
Expand All @@ -10,4 +10,4 @@ const DOCUMENTATION_OPTIONS = {
NAVIGATION_WITH_KEYS: false,
SHOW_SEARCH_SUMMARY: true,
ENABLE_SEARCH_SHORTCUTS: true,
};
};
2 changes: 1 addition & 1 deletion docs/_static/jquery-3.5.1.js
Original file line number Diff line number Diff line change
Expand Up @@ -3761,7 +3761,7 @@ jQuery.extend( {

returned = handler.apply( that, args );

// Support: Promises/A+ section 2.3.1
// Support: Promises/A+ section 2.3.2
// https://promisesaplus.com/#point-48
if ( returned === deferred.promise() ) {
throw new TypeError( "Thenable self-resolution" );
Expand Down
2 changes: 1 addition & 1 deletion docs/_static/jquery-3.6.0.js
Original file line number Diff line number Diff line change
Expand Up @@ -3765,7 +3765,7 @@ jQuery.extend( {

returned = handler.apply( that, args );

// Support: Promises/A+ section 2.3.1
// Support: Promises/A+ section 2.3.2
// https://promisesaplus.com/#point-48
if ( returned === deferred.promise() ) {
throw new TypeError( "Thenable self-resolution" );
Expand Down
20 changes: 10 additions & 10 deletions docs/api.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
<meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />

<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>CodeCarbon API &mdash; CodeCarbon 2.3.1 documentation</title>
<title>CodeCarbon API &mdash; CodeCarbon 2.3.2 documentation</title>
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<!--[if lt IE 9]>
<script src="_static/js/html5shiv.min.js"></script>
<![endif]-->

<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=67b02a41"></script>
Expand All @@ -20,17 +20,17 @@
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Parameters" href="parameters.html" />
<link rel="prev" title="Quickstart" href="usage.html" />
<link rel="prev" title="Quickstart" href="usage.html" />
</head>

<body class="wy-body-for-nav">
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >



<a href="index.html" class="icon icon-home">
CodeCarbon
</a>
Expand Down Expand Up @@ -92,7 +92,7 @@
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">

<section id="codecarbon-api">
<span id="api"></span><h1>CodeCarbon API<a class="headerlink" href="#codecarbon-api" title="Link to this heading"></a></h1>
<section id="id1">
Expand Down Expand Up @@ -164,7 +164,7 @@ <h2>CodeCarbon API<a class="headerlink" href="#id1" title="Link to this heading"
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.


</footer>
</div>
Expand All @@ -175,7 +175,7 @@ <h2>CodeCarbon API<a class="headerlink" href="#id1" title="Link to this heading"
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>

</body>
</html>
</html>
20 changes: 10 additions & 10 deletions docs/comet.html
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,13 @@
<meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />

<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Comet Integration &mdash; CodeCarbon 2.3.1 documentation</title>
<title>Comet Integration &mdash; CodeCarbon 2.3.2 documentation</title>
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<!--[if lt IE 9]>
<script src="_static/js/html5shiv.min.js"></script>
<![endif]-->

<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="_static/documentation_options.js?v=67b02a41"></script>
Expand All @@ -20,17 +20,17 @@
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="Output" href="output.html" />
<link rel="prev" title="Examples" href="examples.html" />
<link rel="prev" title="Examples" href="examples.html" />
</head>

<body class="wy-body-for-nav">
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >



<a href="index.html" class="icon icon-home">
CodeCarbon
</a>
Expand Down Expand Up @@ -89,7 +89,7 @@
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">

<section id="comet-integration">
<span id="comet"></span><h1>Comet Integration<a class="headerlink" href="#comet-integration" title="Link to this heading"></a></h1>
<p>CodeCarbon can be automatically integrated with <a class="reference external" href="https://www.comet.ml/site/">Comet</a> for experiment tracking and visualization. Comet provides data scientists with powerful tools to track, compare, explain, and reproduce their experiments. Now, with CodeCarbon you can easily track the carbon footprint of your jobs along with your training metrics, hyperparameters, dataset samples, artifacts, and more.</p>
Expand Down Expand Up @@ -128,7 +128,7 @@
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.


</footer>
</div>
Expand All @@ -139,7 +139,7 @@
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</script>

</body>
</html>
</html>
2 changes: 1 addition & 1 deletion docs/edit/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@
author = "BCG GAMMA, Comet.ml, Haverford College, MILA, Data For Good"

# The full version, including alpha/beta/rc tags
release = "2.3.1"
release = "2.3.2"


# -- General configuration ---------------------------------------------------
Expand Down
Loading
Loading