Skip to content

Commit

Permalink
Merge branch 'DOR-570-gpu-info-in-output-files' into 'release-v0.6.0'
Browse files: browse the repository at this point in the history
DOR-570 (v0.6.0 MR) Add DS:gpu information to output FASTQ and SAM/BAM files

See merge request machine-learning/dorado!920
  • Loading branch information
MarkBicknellONT committed Mar 27, 2024
2 parents 35413fa + f04abbb commit 913f062
Show file tree
Hide file tree
Showing 13 changed files with 83 additions and 13 deletions.
2 changes: 1 addition & 1 deletion documentation/SAM.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

```
@HD VN:1.6 SO:unknown
@PG ID:basecaller PN:dorado VN:0.2.4+3fc2b0f CL:dorado basecaller dna_r10.4.1_e8.2_400bps_hac@v4.1.0 pod5/
@PG ID:basecaller PN:dorado VN:0.2.4+3fc2b0f CL:dorado basecaller dna_r10.4.1_e8.2_400bps_hac@v4.1.0 pod5/ DS:gpu:Quadro GV100
```

#### Read Group Header
Expand Down
2 changes: 1 addition & 1 deletion dorado/cli/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ int aligner(int argc, char* argv[]) {
utils::HtsFile hts_file(file_info.output, file_info.output_mode, writer_threads);

PipelineDescriptor pipeline_desc;
auto hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file);
auto hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file, "");
auto aligner = pipeline_desc.add_node<AlignerNode>({hts_writer}, index_file_access, index,
bed_file, options, aligner_threads);

Expand Down
11 changes: 9 additions & 2 deletions dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
#include "utils/bam_utils.h"
#include "utils/barcode_kits.h"
#include "utils/basecaller_utils.h"
#if DORADO_CUDA_BUILD
#include "utils/cuda_utils.h"
#endif
#include "utils/fs_utils.h"
#include "utils/log_utils.h"
#include "utils/parameters.h"
Expand Down Expand Up @@ -176,7 +179,7 @@ void setup(std::vector<std::string> args,
}

SamHdrPtr hdr(sam_hdr_init());
cli::add_pg_hdr(hdr.get(), args);
cli::add_pg_hdr(hdr.get(), args, device);

if (barcode_enabled) {
std::unordered_map<std::string, std::string> custom_barcodes{};
Expand All @@ -199,7 +202,11 @@ void setup(std::vector<std::string> args,
utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads);

PipelineDescriptor pipeline_desc;
auto hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file);
std::string gpu_names{};
#if DORADO_CUDA_BUILD
gpu_names = utils::get_cuda_gpu_names(device);
#endif
auto hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file, gpu_names);
auto aligner = PipelineDescriptor::InvalidNodeHandle;
auto current_sink_node = hts_writer;
if (enable_aligner) {
Expand Down
16 changes: 15 additions & 1 deletion dorado/cli/cli_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,10 @@
#endif // _WIN32
#include "data_loader/ModelFinder.h"

#if DORADO_CUDA_BUILD
#include "utils/cuda_utils.h"
#endif

#include <htslib/sam.h>

#include <algorithm>
Expand Down Expand Up @@ -58,14 +62,24 @@ inline std::pair<int, int> worker_vs_writer_thread_allocation(int available_thre
return std::make_pair(aligner_threads, writer_threads);
}

inline void add_pg_hdr(sam_hdr_t* hdr, const std::vector<std::string>& args) {
// Adds the header preamble to `hdr`: a fixed @HD line followed by a @PG line that
// records the dorado version and the command line rebuilt from `args` (each argument
// preceded by a single space). In CUDA builds a "DS:gpu:<names>" field is appended to
// the @PG line when utils::get_cuda_gpu_names(device) returns a non-empty string; in
// other builds `device` is unused.
inline void add_pg_hdr(sam_hdr_t* hdr, const std::vector<std::string>& args, std::string device) {
    sam_hdr_add_lines(hdr, "@HD\tVN:1.6\tSO:unknown", 0);

    std::stringstream pg_line;
    pg_line << "@PG\tID:basecaller\tPN:dorado\tVN:" << DORADO_VERSION << "\tCL:dorado";
    for (const std::string& token : args) {
        pg_line << " " << token;
    }

#if DORADO_CUDA_BUILD
    if (const auto gpu_desc = utils::get_cuda_gpu_names(device); !gpu_desc.empty()) {
        pg_line << "\tDS:gpu:" << gpu_desc;
    }
#else
    (void)device;
#endif

    // The header line is newline-terminated before being handed over.
    pg_line << '\n';
    const std::string header_text = pg_line.str();
    sam_hdr_add_lines(hdr, header_text.c_str(), 0);
}
Expand Down
13 changes: 10 additions & 3 deletions dorado/cli/duplex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
#include "utils/SampleSheet.h"
#include "utils/bam_utils.h"
#include "utils/basecaller_utils.h"
#if DORADO_CUDA_BUILD
#include "utils/cuda_utils.h"
#endif
#include "utils/duplex_utils.h"
#include "utils/fs_utils.h"
#include "utils/log_utils.h"
Expand Down Expand Up @@ -357,7 +360,7 @@ int duplex(int argc, char* argv[]) {
spdlog::debug("> Reads to process: {}", num_reads);

SamHdrPtr hdr(sam_hdr_init());
cli::add_pg_hdr(hdr.get(), args);
cli::add_pg_hdr(hdr.get(), args, device);

constexpr int WRITER_THREADS = 4;
utils::HtsFile hts_file("-", output_mode, WRITER_THREADS);
Expand All @@ -366,15 +369,19 @@ int duplex(int argc, char* argv[]) {
auto hts_writer = PipelineDescriptor::InvalidNodeHandle;
auto aligner = PipelineDescriptor::InvalidNodeHandle;
auto converted_reads_sink = PipelineDescriptor::InvalidNodeHandle;
std::string gpu_names{};
#if DORADO_CUDA_BUILD
gpu_names = utils::get_cuda_gpu_names(device);
#endif
if (ref.empty()) {
hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file);
hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file, gpu_names);
converted_reads_sink = hts_writer;
} else {
auto options = cli::process_minimap2_arguments(parser, alignment::dflt_options);
auto index_file_access = std::make_shared<alignment::IndexFileAccess>();
aligner = pipeline_desc.add_node<AlignerNode>({}, index_file_access, ref, "", options,
std::thread::hardware_concurrency());
hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file);
hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file, gpu_names);
pipeline_desc.add_node_sink(aligner, hts_writer);
converted_reads_sink = aligner;
}
Expand Down
2 changes: 1 addition & 1 deletion dorado/cli/trim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ int trim(int argc, char* argv[]) {
hts_file.set_and_write_header(header.get());

PipelineDescriptor pipeline_desc;
auto hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file);
auto hts_writer = pipeline_desc.add_node<HtsWriter>({}, hts_file, "");

pipeline_desc.add_node<AdapterDetectorNode>({hts_writer}, trim_threads, true,
!parser.get<bool>("--no-trim-primers"),
Expand Down
14 changes: 13 additions & 1 deletion dorado/read_pipeline/HtsWriter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@ namespace dorado {

using OutputMode = dorado::utils::HtsFile::OutputMode;

HtsWriter::HtsWriter(utils::HtsFile& file) : MessageSink(10000, 1), m_file(file) {
// Constructs the writer around `file`. A non-empty `gpu_names` is stored with a "gpu:"
// prefix (input_thread_fn appends it as the DS tag when the output mode is FASTQ); an
// empty string is stored unchanged.
HtsWriter::HtsWriter(utils::HtsFile& file, std::string gpu_names)
        : MessageSink(10000, 1), m_file(file), m_gpu_names(std::move(gpu_names)) {
    const bool have_gpu_names = !m_gpu_names.empty();
    if (have_gpu_names) {
        m_gpu_names.insert(0, "gpu:");
    }
    // Called last, so m_gpu_names already holds its final value at this point.
    start_input_processing(&HtsWriter::input_thread_fn, this);
}

Expand All @@ -43,6 +47,14 @@ void HtsWriter::input_thread_fn() {
}

auto aln = std::move(std::get<BamPtr>(message));

if (m_file.get_output_mode() == utils::HtsFile::OutputMode::FASTQ) {
if (!m_gpu_names.empty()) {
bam_aux_append(aln.get(), "DS", 'Z', int(m_gpu_names.length() + 1),
(uint8_t*)m_gpu_names.c_str());
}
}

auto res = write(aln.get());
if (res < 0) {
throw std::runtime_error("Failed to write SAM record, error code " +
Expand Down
4 changes: 3 additions & 1 deletion dorado/read_pipeline/HtsWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ namespace dorado {

class HtsWriter : public MessageSink {
public:
HtsWriter(utils::HtsFile& file);
HtsWriter(utils::HtsFile& file, std::string gpu_names);
~HtsWriter();
std::string get_name() const override { return "HtsWriter"; }
stats::NamedStats sample_stats() const override;
Expand All @@ -40,6 +40,8 @@ class HtsWriter : public MessageSink {

utils::HtsFile& m_file;

std::string m_gpu_names{};

void input_thread_fn();
std::atomic<int> m_duplex_reads_written{0};
std::atomic<int> m_split_reads_written{0};
Expand Down
21 changes: 21 additions & 0 deletions dorado/utils/cuda_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,27 @@ std::vector<CUDADeviceInfo> get_cuda_device_info(std::string device_string) {
return results;
}

// Returns the distinct names of the devices that get_cuda_device_info() flags as in use
// for `device_string`, joined with '|'. Returns an empty string when no device is in use.
std::string get_cuda_gpu_names(std::string device_string) {
    // std::set collapses duplicate names and iterates in sorted order.
    std::set<std::string> unique_names;
    for (const auto &info : utils::get_cuda_device_info(device_string)) {
        if (!info.in_use) {
            continue;
        }
        unique_names.insert(info.device_properties.name);
    }

    std::string joined;
    for (const auto &name : unique_names) {
        if (!joined.empty()) {
            joined += "|";
        }
        joined += name;
    }
    return joined;
}

std::unique_lock<std::mutex> acquire_gpu_lock(int gpu_index, bool use_lock) {
static std::vector<std::mutex> gpu_mutexes(torch::cuda::device_count());

Expand Down
4 changes: 4 additions & 0 deletions dorado/utils/cuda_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,10 @@ struct CUDADeviceInfo {
// visible devices on the host machine, with information on whether they are in use or not.
std::vector<CUDADeviceInfo> get_cuda_device_info(std::string device_string);

// Given a string representing cuda devices (e.g "cuda:0,1,3") returns a string containing
// the set of types of gpu that will be used: the distinct names of the devices reported as
// in use, joined with '|'. The result is empty when no device is in use.
std::string get_cuda_gpu_names(std::string device_string);

// Reports the amount of available memory (in bytes) for a given device.
size_t available_memory(torch::Device device);

Expand Down
1 change: 1 addition & 0 deletions dorado/utils/hts_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ HtsFile::HtsFile(const std::string& filename, OutputMode mode, size_t threads) :
m_file.reset(hts_open(filename.c_str(), "wf"));
hts_set_opt(m_file.get(), FASTQ_OPT_AUX, "RG");
hts_set_opt(m_file.get(), FASTQ_OPT_AUX, "st");
hts_set_opt(m_file.get(), FASTQ_OPT_AUX, "DS");
break;
case OutputMode::BAM: {
auto file = filename;
Expand Down
2 changes: 2 additions & 0 deletions dorado/utils/hts_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@ class HtsFile {
int writer_threads,
bool sort_if_mapped);

// Returns the value of m_mode (presumably the mode passed to the constructor — confirm).
OutputMode get_output_mode() const { return m_mode; }

private:
HtsFilePtr m_file;
SamHdrPtr m_header;
Expand Down
4 changes: 2 additions & 2 deletions tests/BamWriterTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class HtsWriterTestsFixture {
hts_file.set_and_write_header(reader.header);

PipelineDescriptor pipeline_desc;
auto writer = pipeline_desc.add_node<HtsWriter>({}, hts_file);
auto writer = pipeline_desc.add_node<HtsWriter>({}, hts_file, "");
auto pipeline = Pipeline::create(std::move(pipeline_desc), nullptr);

reader.read(*pipeline, 1000);
Expand Down Expand Up @@ -89,7 +89,7 @@ TEST_CASE("HtsWriterTest: Read and write FASTQ with tag", TEST_GROUP) {
{
// Write with tags into temporary folder.
utils::HtsFile hts_file(out_fastq.string(), HtsFile::OutputMode::FASTQ, 2);
HtsWriter writer(hts_file);
HtsWriter writer(hts_file, "");
reader.read();
CHECK_THAT(bam_aux2Z(bam_aux_get(reader.record.get(), "RG")),
Equals("6a94c5e38fbe36232d63fd05555e41368b204cda_dna_r10.4.1_e8.2_400bps_hac@v4."
Expand Down

0 comments on commit 913f062

Please sign in to comment.