Skip to content

Commit

Permalink
Merge branch 'INSTX-3348_cuda_device_inspection' into 'master'
Browse files Browse the repository at this point in the history
INSTX-3348 Improved error reporting when the device string is invalid for CUDA devices

Closes INSTX-3348

See merge request machine-learning/dorado!791
  • Loading branch information
MarkBicknellONT committed Jan 5, 2024
2 parents faf3c13 + be1846c commit 901f700
Show file tree
Hide file tree
Showing 2 changed files with 81 additions and 3 deletions.
68 changes: 66 additions & 2 deletions dorado/utils/cuda_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -121,17 +121,25 @@ std::vector<std::string> parse_cuda_device_string(std::string device_string) {
std::regex e("[0-9]+");
std::smatch m;

auto num_devices = torch::cuda::device_count();
if (device_string.substr(0, 5) != "cuda:") {
return devices; // empty vector;
} else if (device_string == "cuda:all" || device_string == "cuda:auto") {
auto num_devices = torch::cuda::device_count();
for (size_t i = 0; i < num_devices; i++) {
devices.push_back("cuda:" + std::to_string(i));
}
} else {
while (std::regex_search(device_string, m, e)) {
for (auto x : m) {
devices.push_back("cuda:" + x.str());
std::string device_id = x.str();
int device_idx = std::stoi(device_id);
if (device_idx >= int(num_devices) || device_idx < 0) {
throw std::runtime_error("Invalid CUDA device index \"" + device_id +
"\" from device string " + device_string +
", there are " + std::to_string(num_devices) +
" visible CUDA devices.");
}
devices.push_back("cuda:" + device_id);
}
device_string = m.suffix().str();
}
Expand All @@ -140,6 +148,62 @@ std::vector<std::string> parse_cuda_device_string(std::string device_string) {
return devices;
}

std::vector<CUDADeviceInfo> get_cuda_device_info(std::string device_string) {
std::vector<CUDADeviceInfo> results;
std::regex e("[0-9]+");
std::smatch m;
auto num_devices = torch::cuda::device_count();

// Get the set of ids that are in use according to the device_string
std::set<int> device_ids;
if (device_string.substr(0, 5) != "cuda:") {
// Nothing to add to device_ids
} else if (device_string == "cuda:all" || device_string == "cuda:auto") {
if (num_devices == 0) {
throw std::runtime_error("device string set to " + device_string +
" but no CUDA devices available.");
}
for (int i = 0; i < int(num_devices); i++) {
device_ids.insert(i);
}
} else {
while (std::regex_search(device_string, m, e)) {
for (auto x : m) {
std::string device_id = x.str();
int device_idx = std::stoi(device_id);
if (device_idx >= int(num_devices) || device_idx < 0) {
throw std::runtime_error("Invalid CUDA device index \"" + device_id +
"\" from device string " + device_string +
", there are " + std::to_string(num_devices) +
" visible CUDA devices.");
}
device_ids.insert(device_idx);
}
device_string = m.suffix().str();
}
}

// Now inspect all the devices on the host to create the CUDADeviceInfo
for (int device_id = 0; device_id < int(num_devices); device_id++) {
CUDADeviceInfo device_info;
device_info.device_id = device_id;

cudaSetDevice(device_id);
cudaMemGetInfo(&device_info.free_mem, &device_info.total_mem);
cudaDeviceGetAttribute(&device_info.compute_cap_major, cudaDevAttrComputeCapabilityMajor,
device_id);
cudaDeviceGetAttribute(&device_info.compute_cap_minor, cudaDevAttrComputeCapabilityMinor,
device_id);
cudaGetDeviceProperties(&device_info.device_properties, device_id);

device_info.in_use = device_ids.find(device_id) != device_ids.end();

results.push_back(device_info);
}

return results;
}

std::unique_lock<std::mutex> acquire_gpu_lock(int gpu_index, bool use_lock) {
static std::vector<std::mutex> gpu_mutexes(torch::cuda::device_count());

Expand Down
16 changes: 15 additions & 1 deletion dorado/utils/cuda_utils.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once

#include <cuda_runtime.h>
#include <torch/torch.h>

#include <mutex>
Expand All @@ -16,9 +17,22 @@ namespace dorado::utils {
std::unique_lock<std::mutex> acquire_gpu_lock(int gpu_index, bool use_lock);

// Given a string representing cuda devices (e.g "cuda:0,1,3") returns a vector of strings, one for
// each device (e.g ["cuda:0", "cuda:2", ..., "cuda:7"]
// each device (e.g. ["cuda:0", "cuda:1", "cuda:3"]). This function will validate that the device IDs
// exist and will raise an exception if there is any issue with the string.
std::vector<std::string> parse_cuda_device_string(std::string device_string);

// Description of a single CUDA device, as filled in by get_cuda_device_info().
// Every member has a default initialiser so that a default-constructed instance never holds
// indeterminate values, even if a CUDA query that should populate it fails.
struct CUDADeviceInfo {
    size_t free_mem = 0;                // Free device memory, as reported by cudaMemGetInfo.
    size_t total_mem = 0;               // Total device memory, as reported by cudaMemGetInfo.
    int device_id = 0;                  // CUDA device index.
    int compute_cap_major = 0;          // cudaDevAttrComputeCapabilityMajor attribute.
    int compute_cap_minor = 0;          // cudaDevAttrComputeCapabilityMinor attribute.
    cudaDeviceProp device_properties{}; // Result of cudaGetDeviceProperties.
    bool in_use = false;                // True if the device was selected by the device string.
};

// Given a string representing cuda devices (e.g "cuda:0,1,3") returns a vector of CUDADeviceInfo for all
// visible devices on the host machine, with information on whether they are in use or not.
std::vector<CUDADeviceInfo> get_cuda_device_info(std::string device_string);

// Reports the amount of available memory (in bytes) for a given device.
size_t available_memory(torch::Device device);

Expand Down

0 comments on commit 901f700

Please sign in to comment.