Skip to content
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,10 @@
*.code-workspace

# --- Agent ---
build/
.cache/
compile_commands.json
sources/agent/agent.conf
.clang-format

# --- Server ---
sources/server/generated/
sources/server/generated/
18 changes: 18 additions & 0 deletions .helix/languages.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# Helix editor language configuration for this repository.
# NOTE(review): the empty file-types list presumably keeps Helix from
# classifying any file as C, so that "h" headers are picked up by the C++
# entry below — confirm this is the intent.
[[language]]
name = "c"
scope = "source.c"
file-types = []

# C++ sources and headers; the language server for them is clangd (configured below).
[[language]]
name = "cpp"
scope = "source.cpp"
file-types = ["cc", "hh", "c++", "cpp", "hpp", "h", "ipp", "tpp", "cxx", "hxx", "ixx", "txx", "ino", "C", "H", "cu", "cuh"]
language-servers = ["clangd"]

# clangd invocation. The arguments, in order:
#   --header-insertion=never      do not auto-insert #include lines
#   --query-driver=...            compiler drivers clangd may query for system include paths
#   --compile-commands-dir=build  directory searched for compile_commands.json
[language-server.clangd]
command = "clangd"
args = [
"--header-insertion=never",
"--query-driver=/usr/bin/g++,/usr/bin/c++",
"--compile-commands-dir=build"
]
94 changes: 94 additions & 0 deletions sources/agent/src/collectors/rapl_collector.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
#include "rapl_collector.h"

#include <fcntl.h>
#include <unistd.h>

#include <algorithm>
#include <charconv>
#include <chrono>
#include <cmath>
#include <cstdint>
#include <filesystem>
#include <string>
#include <system_error>
#include <utility>
#include <vector>

namespace volta {
namespace agent {
namespace collectors {

/// Opens the per-CPU MSR devices, decodes the RAPL unit register and stores
/// the current package energy reading as the baseline used by Collect().
RaplCollector::RaplCollector() {
  OpenMSR();

  // Every unit field is an exponent n; the unit itself is (1/2)^n.
  const uint64_t unit_bits = ReadMSR(0, MSR_RAPL::POWER_UNIT);
  const auto power_exponent = static_cast<double>(unit_bits & 0xf);
  const auto energy_exponent = static_cast<double>((unit_bits >> 8) & 0x1f);
  const auto time_exponent = static_cast<double>((unit_bits >> 16) & 0xf);

  power_units_ = std::pow(0.5, power_exponent);
  energy_units_ = std::pow(0.5, energy_exponent);
  time_units_ = std::pow(0.5, time_exponent);

  // Baseline for the first delta reported by Collect().
  const uint64_t energy_bits = ReadMSR(0, MSR_RAPL::PKG::ENERGY_STATUS);
  last_value = energy_units_ * energy_bits;
}

std::vector<Metric> RaplCollector::Collect() {
uint64_t readout;

try {
readout = ReadMSR(0, MSR_RAPL::PKG::ENERGY_STATUS);
} catch (const MSR_Read_Exception &e) {
return {};
}

double value = energy_units_ * readout;

Metric m;
m.name = "cpu_energy_usage_total";
m.value = value - last_value;
m.timestamp = std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
last_value = value;
return {m};
}

/// Reads one 64-bit model-specific register.
///
/// @param core   Position of the MSR device in MSR_files_ (the order in which
///               OpenMSR() opened them), not necessarily the kernel CPU id.
/// @param offset Register address, used as the file offset of the read.
/// @return The raw 64-bit register content.
/// @throws MSR_Read_Exception if no device exists at position `core`, or if
///         the read fails or returns fewer than 8 bytes.
uint64_t RaplCollector::ReadMSR(uint8_t core, uint32_t offset) {
  if (core >= MSR_files_.size()) {
    throw MSR_Read_Exception();
  }

  uint64_t data = 0;
  // pread() takes the offset as an argument instead of using the descriptor's
  // shared file position, so concurrent readers cannot disturb each other.
  const ssize_t bytes_read = pread(MSR_files_[core], &data, sizeof data,
                                   static_cast<off_t>(offset));
  if (bytes_read != static_cast<ssize_t>(sizeof data)) {
    // A failed or short read must not be reported as a register value of 0:
    // callers would silently derive bogus units and energy deltas from it.
    throw MSR_Read_Exception();
  }

  return data;
}

/// Opens /dev/cpu/<N>/msr read-only for every numerically named directory
/// below /dev/cpu, in ascending order of N, and stores the descriptors in
/// MSR_files_.
///
/// Devices that cannot be opened are skipped, so a descriptor's position in
/// MSR_files_ matches its CPU number only when every directory from 0 upwards
/// exists and opens successfully.
///
/// @throws MSR_Open_Exception if /dev/cpu is missing, cannot be enumerated,
///         or not a single MSR device could be opened.
void RaplCollector::OpenMSR() {
  const std::filesystem::path cpu_base = "/dev/cpu";
  MSR_files_.clear();
  std::error_code ec;

  if (!std::filesystem::exists(cpu_base, ec)) {
    throw MSR_Open_Exception();
  }

  // Explicit ASCII check: ::isdigit on a plain char is undefined for negative
  // values, which non-ASCII file names can produce.
  const auto is_ascii_digit = [](char c) { return c >= '0' && c <= '9'; };

  std::vector<std::pair<int, std::filesystem::path>> cpu_entries;

  // Non-throwing iteration: enumeration errors are reported through `ec` and
  // converted into MSR_Open_Exception below instead of escaping as
  // std::filesystem::filesystem_error.
  std::filesystem::directory_iterator it(cpu_base, ec);
  const std::filesystem::directory_iterator end;
  for (; !ec && it != end; it.increment(ec)) {
    const std::filesystem::directory_entry &entry = *it;

    std::error_code entry_ec;
    if (!entry.is_directory(entry_ec)) continue;

    const std::string dirname = entry.path().filename().string();
    if (!std::ranges::all_of(dirname, is_ascii_digit)) continue;

    // from_chars reports out-of-range and empty input through its result
    // instead of throwing, so odd directory names are simply skipped.
    int cpu_id = 0;
    const char *const first = dirname.data();
    const char *const last = first + dirname.size();
    const auto [parsed_end, parse_error] =
        std::from_chars(first, last, cpu_id);
    if (parse_error != std::errc{} || parsed_end != last) continue;

    cpu_entries.emplace_back(cpu_id, entry.path());
  }
  if (ec) {
    throw MSR_Open_Exception();
  }

  // Directory iteration order is unspecified; sort by CPU number.
  std::ranges::sort(cpu_entries);

  for (const auto &cpu_entry : cpu_entries) {
    const int fd = open((cpu_entry.second / "msr").c_str(), O_RDONLY);
    if (fd >= 0) {
      MSR_files_.push_back(fd);
    }
  }

  // Without any readable device the collector cannot work; fail here with the
  // matching exception rather than later with a misleading read error.
  if (MSR_files_.empty()) {
    throw MSR_Open_Exception();
  }
}

/// Closes one MSR file descriptor; the result of close() is not inspected.
void RaplCollector::CloseMSR(int fd) {
  static_cast<void>(close(fd));
}

/// Closes every descriptor currently held in MSR_files_.
RaplCollector::~RaplCollector() {
  std::for_each(MSR_files_.begin(), MSR_files_.end(),
                [this](int descriptor) { CloseMSR(descriptor); });
}
} // namespace collectors
} // namespace agent
} // namespace volta
74 changes: 74 additions & 0 deletions sources/agent/src/collectors/rapl_collector.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#ifndef VOLTA_AGENT_SRC_COLLECTORS_RAPL_COLLECTOR_H_
#define VOLTA_AGENT_SRC_COLLECTORS_RAPL_COLLECTOR_H_

#include "collectors/collector.h"

namespace volta {
namespace agent {
namespace collectors {

/// Collector that reports CPU package energy consumption read from the RAPL
/// model-specific registers exposed through /dev/cpu/<N>/msr.
///
/// The object owns the opened file descriptors and closes them in its
/// destructor, which is why copying is disabled.
class RaplCollector : public Collector {
 public:
  /// Opens the MSR devices and reads the unit register and the initial energy
  /// value. Throws MSR_Open_Exception / MSR_Read_Exception (both derived from
  /// std::exception) when the devices cannot be opened or read.
  RaplCollector();
  RaplCollector(const RaplCollector&) = delete;
  RaplCollector& operator=(const RaplCollector&) = delete;

  /// Returns the energy consumed since the previous call as a single
  /// "cpu_energy_usage_total" metric, or an empty vector if the read fails.
  std::vector<Metric> Collect() override;

  // NOTE(review): collectors are held through base-class pointers in main();
  // this destructor only runs on deletion through the base if Collector
  // declares a virtual destructor — confirm in collectors/collector.h.
  ~RaplCollector();

 private:
  /// Reads the 64-bit register at `offset` from the `core`-th opened device.
  uint64_t ReadMSR(uint8_t core, uint32_t offset);
  /// Opens every available /dev/cpu/<N>/msr device into MSR_files_.
  void OpenMSR();
  /// Closes a single descriptor.
  void CloseMSR(int fd);

  bool initialized_ = false;
  // Scaling factors decoded from the POWER_UNIT register. Initialised so a
  // partially constructed object never holds indeterminate values.
  double power_units_ = 0.0;
  double energy_units_ = 0.0;
  double time_units_ = 0.0;
  // Descriptors of the opened MSR devices, ordered by CPU number.
  std::vector<int> MSR_files_;
  // Scaled energy value of the previous sample; Collect() reports deltas.
  double last_value = 0.0;

  // Public inheritance is required: with the default (private) base of a
  // `class`, a `catch (const std::exception&)` handler cannot match these.
  class MSR_Read_Exception : public std::exception {
   public:
    const char* what() const noexcept override {
      return "RAPL: failed to read MSR";
    }
  };
  class MSR_Open_Exception : public std::exception {
   public:
    const char* what() const noexcept override {
      return "RAPL: failed to open MSR device";
    }
  };

  /// Register addresses and bit-field layout of the RAPL MSRs.
  struct MSR_RAPL {
    static constexpr uint32_t POWER_UNIT = 0x606;
    /// Bit offsets and (unshifted) masks of the fields in POWER_UNIT.
    struct Units {
      static constexpr uint32_t POWER_UNIT_OFFSET = 0;
      static constexpr uint32_t POWER_UNIT_MASK = 0x0F;
      static constexpr uint32_t ENERGY_UNIT_OFFSET = 0x08;
      static constexpr uint32_t ENERGY_UNIT_MASK = 0x1F00;
      static constexpr uint32_t TIME_UNIT_OFFSET = 0x10;
      // Four bits starting at bit 16 (was 0xF000, which selects bits 12-15).
      static constexpr uint32_t TIME_UNIT_MASK = 0xF0000;
    };

    /// Package domain.
    struct PKG {
      static constexpr uint32_t POWER_LIMIT = 0x610;
      static constexpr uint32_t ENERGY_STATUS = 0x611;
      static constexpr uint32_t PERF_STATUS = 0x613;
      static constexpr uint32_t POWER_INFO = 0x614;
    };

    /// Power plane 0 domain.
    struct PP0 {
      static constexpr uint32_t POWER_LIMIT = 0x638;
      static constexpr uint32_t ENERGY_STATUS = 0x639;
      static constexpr uint32_t POLICY = 0x63A;
      static constexpr uint32_t PERF_STATUS = 0x63B;
    };

    /// Power plane 1 domain.
    struct PP1 {
      static constexpr uint32_t POWER_LIMIT = 0x640;
      static constexpr uint32_t ENERGY_STATUS = 0x641;
      static constexpr uint32_t POLICY = 0x642;
    };

    /// DRAM domain.
    struct DRAM {
      static constexpr uint32_t POWER_LIMIT = 0x618;
      static constexpr uint32_t ENERGY_STATUS = 0x619;
      static constexpr uint32_t PERF_STATUS = 0x61B;
      static constexpr uint32_t POWER_INFO = 0x61C;
    };
  };
};

} // namespace collectors
} // namespace agent
} // namespace volta
#endif
5 changes: 5 additions & 0 deletions sources/agent/src/config/config_loader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,11 @@ Config ConfigLoader::LoadDefaultConfig() {
proc_stat_config.metrics["cpu_usage_percent"] = true;
config.collectors[CollectorNames::kProcStat] = proc_stat_config;

CollectorConfig rapl_collector;
rapl_collector.enabled = true;
rapl_collector.metrics = {{"cpu_energy_usage_total", true}};
config.collectors[CollectorNames::kRapl] = rapl_collector;

return config;
}

Expand Down
10 changes: 5 additions & 5 deletions sources/agent/src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
#include "collectors/nvml_collector.h"
#include "collectors/proc_stat_collector.h"
#include "collectors/ram_collector.h"
#include "collectors/rapl_collector.h"
#include "config/config.h"
#include "config/config_loader.h"
#include "platform/platform_detector.h"
Expand All @@ -28,10 +29,9 @@ int main() {

active_collectors.push_back(
std::make_unique<collectors::ProcStatCollector>());

active_collectors.push_back(std::make_unique<collectors::RamCollector>());

for (const auto& gpu : hw.gpus) {
active_collectors.push_back(std::make_unique<collectors::RaplCollector>());
for (const auto &gpu : hw.gpus) {
if (gpu.vendor == platform::GpuVendor::NVIDIA) {
auto nvml = std::make_unique<collectors::NvmlCollector>();
if (nvml->Init()) {
Expand All @@ -43,8 +43,8 @@ int main() {
Scheduler scheduler(config, std::move(active_collectors));
scheduler.Run();

} catch (const std::exception& e) {
// std::cerr << "CRITICAL ERROR: " << e.what() << std::endl;
} catch (const std::exception &e) {
std::cerr << "CRITICAL ERROR: " << e.what() << std::endl;
return 1;
}
return 0;
Expand Down