Skip to content

Commit

Permalink
Merge branch 'DOR-592_demux_header_merge_improvements' into 'master'
Browse files Browse the repository at this point in the history
DOR-592 Demux header merge improvements

Closes DOR-592

See merge request machine-learning/dorado!950
  • Loading branch information
kdolan1973 committed Apr 25, 2024
2 parents 9e5c55b + 8a8e94a commit a3dce7e
Show file tree
Hide file tree
Showing 18 changed files with 652 additions and 268 deletions.
21 changes: 10 additions & 11 deletions dorado/cli/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -72,15 +72,14 @@ bool create_output_folder(const std::filesystem::path& output_folder) {
return true;
}

void add_pg_hdr(sam_hdr_t* hdr) {
sam_hdr_add_pg(hdr, "aligner", "PN", "dorado", "VN", DORADO_VERSION, "DS", MM_VERSION, nullptr);
}

} // namespace

namespace dorado {

void add_pg_hdr(sam_hdr_t* hdr) {
sam_hdr_add_line(hdr, "PG", "ID", "aligner", "PN", "dorado", "VN", DORADO_VERSION, "DS",
MM_VERSION, NULL);
}

int aligner(int argc, char* argv[]) {
cli::ArgParser parser("dorado aligner");
parser.visible.add_description(
Expand Down Expand Up @@ -217,10 +216,10 @@ int aligner(int argc, char* argv[]) {
}

spdlog::debug("> input fmt: {} aligned: {}", reader->format, reader->is_aligned);
auto header = sam_hdr_dup(reader->header);
dorado::utils::strip_alignment_data_from_header(header);

add_pg_hdr(header);
auto header = SamHdrPtr(sam_hdr_dup(reader->header));
utils::add_hd_header_line(header.get());
add_pg_hdr(header.get());
dorado::utils::strip_alignment_data_from_header(header.get());

const bool sort_bam = (file_info.output_mode == utils::HtsFile::OutputMode::BAM &&
file_info.output != "-");
Expand All @@ -244,9 +243,9 @@ int aligner(int argc, char* argv[]) {
// At present, header output file header writing relies on direct node method calls
// rather than the pipeline framework.
const auto& aligner_ref = dynamic_cast<AlignerNode&>(pipeline->get_node_ref(aligner));
utils::add_sq_hdr(header, aligner_ref.get_sequence_records_for_header());
utils::add_sq_hdr(header.get(), aligner_ref.get_sequence_records_for_header());
auto& hts_writer_ref = dynamic_cast<HtsWriter&>(pipeline->get_node_ref(hts_writer));
hts_file.set_header(header);
hts_file.set_header(header.get());

// All progress reporting is in the post-processing part.
ProgressTracker tracker(0, false, 1.f);
Expand Down
6 changes: 4 additions & 2 deletions dorado/cli/cli_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "dorado_version.h"
#include "models/kits.h"
#include "utils/bam_utils.h"
#include "utils/dev_utils.h"

#include <optional>
Expand Down Expand Up @@ -66,10 +67,11 @@ inline std::pair<int, int> worker_vs_writer_thread_allocation(int available_thre
inline void add_pg_hdr(sam_hdr_t* hdr,
const std::vector<std::string>& args,
const std::string& device) {
sam_hdr_add_lines(hdr, "@HD\tVN:1.6\tSO:unknown", 0);
utils::add_hd_header_line(hdr);
auto safe_id = sam_hdr_pg_id(hdr, "basecaller");

std::stringstream pg;
pg << "@PG\tID:basecaller\tPN:dorado\tVN:" << DORADO_VERSION << "\tCL:dorado";
pg << "@PG\tID:" << safe_id << "\tPN:dorado\tVN:" << DORADO_VERSION << "\tCL:dorado";
for (const auto& arg : args) {
pg << " " << arg;
}
Expand Down
48 changes: 36 additions & 12 deletions dorado/cli/demux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
#include "read_pipeline/ProgressTracker.h"
#include "read_pipeline/read_output_progress_stats.h"
#include "summary/summary.h"
#include "utils/MergeHeaders.h"
#include "utils/SampleSheet.h"
#include "utils/bam_utils.h"
#include "utils/barcode_kits.h"
Expand All @@ -28,16 +29,29 @@ using namespace std::chrono_literals;
#include <unistd.h>
#endif

namespace dorado {

namespace {

void add_pg_hdr(sam_hdr_t* hdr) {
sam_hdr_add_line(hdr, "PG", "ID", "demux", "PN", "dorado", "VN", DORADO_VERSION, NULL);
sam_hdr_add_pg(hdr, "demux", "PN", "dorado", "VN", DORADO_VERSION, nullptr);
}

// This function allows us to map the reference id from input BAM records to what
// they should be in the output file, based on the new ordering of references in
// the merged header.
void adjust_tid(const std::vector<uint32_t>& mapping, dorado::BamPtr& record) {
auto tid = record.get()->core.tid;
if (tid >= 0) {
if (tid >= int32_t(mapping.size())) {
throw std::range_error("BAM tid field out of range with respect to SQ lines.");
}
record.get()->core.tid = int32_t(mapping.at(tid));
}
}

} // anonymous namespace

namespace dorado {

int demuxer(int argc, char* argv[]) {
cli::ArgParser parser("dorado demux");
parser.visible.add_description(
Expand Down Expand Up @@ -141,7 +155,8 @@ int demuxer(int argc, char* argv[]) {
return EXIT_FAILURE;
}

auto no_trim(parser.visible.get<bool>("--no-trim"));
auto no_trim(parser.visible.get<bool>("--no-trim") ||
parser.visible.get<bool>("--no-classify"));
auto sort_bam(parser.visible.get<bool>("--sort-bam"));
if (sort_bam && !no_trim) {
spdlog::error("If --sort-bam is specified then --no-trim must also be specified.");
Expand All @@ -161,6 +176,7 @@ int demuxer(int argc, char* argv[]) {
auto emit_summary = parser.visible.get<bool>("emit-summary");
auto threads(parser.visible.get<int>("threads"));
auto max_reads(parser.visible.get<int>("max-reads"));
auto strip_alignment = !no_trim;

alignment::AlignmentProcessingItems processing_items{reads, recursive_input, output_dir, true};
if (!processing_items.initialise()) {
Expand Down Expand Up @@ -200,24 +216,23 @@ int demuxer(int argc, char* argv[]) {
}

HtsReader reader(all_files[0].input, read_list);
auto header = SamHdrPtr(sam_hdr_dup(reader.header));
utils::MergeHeaders hdr_merger(strip_alignment);
hdr_merger.add_header(reader.header, all_files[0].input);

// Fold in the headers from all the other files in the input list.
for (size_t input_idx = 1; input_idx < all_files.size(); input_idx++) {
HtsReader header_reader(all_files[input_idx].input, read_list);
std::string error_msg;
if (!utils::sam_hdr_merge(header.get(), header_reader.header, error_msg)) {
auto error_msg = hdr_merger.add_header(header_reader.header, all_files[input_idx].input);
if (!error_msg.empty()) {
spdlog::error("Unable to combine headers from all input files: " + error_msg);
return EXIT_FAILURE;
}
}

hdr_merger.finalize_merge();
auto sq_mapping = hdr_merger.get_sq_mapping();
auto header = SamHdrPtr(sam_hdr_dup(hdr_merger.get_merged_header()));
add_pg_hdr(header.get());
if (!no_trim) {
// Remove SQ lines from header since alignment information
// is invalidated after trimming.
utils::strip_alignment_data_from_header(header.get());
}

auto barcode_sample_sheet = parser.visible.get<std::string>("--sample-sheet");
std::unique_ptr<const utils::SampleSheet> sample_sheet;
Expand Down Expand Up @@ -286,6 +301,10 @@ int demuxer(int argc, char* argv[]) {
// End stats counting setup.

spdlog::info("> starting barcode demuxing");
if (!strip_alignment) {
reader.set_record_mutator(
[&sq_mapping](BamPtr& record) { adjust_tid(sq_mapping[0], record); });
}
auto num_reads_in_file = reader.read(*pipeline, max_reads);
spdlog::trace("pushed to pipeline: {}", num_reads_in_file);

Expand All @@ -294,6 +313,11 @@ int demuxer(int argc, char* argv[]) {
// Barcode all the other files passed in
for (size_t input_idx = 1; input_idx < all_files.size(); input_idx++) {
HtsReader input_reader(all_files[input_idx].input, read_list);
if (!strip_alignment) {
input_reader.set_record_mutator([&sq_mapping, input_idx](BamPtr& record) {
adjust_tid(sq_mapping[input_idx], record);
});
}
num_reads_in_file = input_reader.read(*pipeline, max_reads);
spdlog::trace("pushed to pipeline: {}", num_reads_in_file);
progress_stats.update_reads_per_file_estimate(num_reads_in_file);
Expand Down
3 changes: 2 additions & 1 deletion dorado/cli/trim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ using OutputMode = dorado::utils::HtsFile::OutputMode;
namespace {

void add_pg_hdr(sam_hdr_t* hdr) {
sam_hdr_add_line(hdr, "PG", "ID", "trim", "PN", "dorado", "VN", DORADO_VERSION, NULL);
sam_hdr_add_pg(hdr, "trim", "PN", "dorado", "VN", DORADO_VERSION, NULL);
}

} // anonymous namespace
Expand Down Expand Up @@ -116,6 +116,7 @@ int trim(int argc, char* argv[]) {

HtsReader reader(reads[0], read_list);
auto header = SamHdrPtr(sam_hdr_dup(reader.header));
utils::add_hd_header_line(header.get());
add_pg_hdr(header.get());
// Always remove alignment information from input header
// because at minimum the adapters are trimmed, which
Expand Down
7 changes: 7 additions & 0 deletions dorado/read_pipeline/HtsReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ HtsReader::~HtsReader() {
hts_close(m_file);
}

void HtsReader::set_record_mutator(std::function<void(BamPtr&)> mutator) {
m_record_mutator = std::move(mutator);
}

bool HtsReader::read() { return sam_read1(m_file, header, record.get()) >= 0; }

bool HtsReader::has_tag(const char* tagname) {
Expand All @@ -56,6 +60,9 @@ std::size_t HtsReader::read(Pipeline& pipeline, std::size_t max_reads) {
continue;
}
}
if (m_record_mutator) {
m_record_mutator(record);
}
pipeline.push_message(BamPtr(bam_dup1(record.get())));
++num_reads;
if (max_reads > 0 && num_reads >= max_reads) {
Expand Down
2 changes: 2 additions & 0 deletions dorado/read_pipeline/HtsReader.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ class HtsReader {
template <typename T>
T get_tag(const char* tagname);
bool has_tag(const char* tagname);
void set_record_mutator(std::function<void(BamPtr&)> mutator);

char* format{nullptr};
bool is_aligned{false};
Expand All @@ -38,6 +39,7 @@ class HtsReader {
private:
htsFile* m_file{nullptr};

std::function<void(BamPtr&)> m_record_mutator{};
std::optional<std::unordered_set<std::string>> m_read_list;
};

Expand Down
2 changes: 2 additions & 0 deletions dorado/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ add_library(dorado_utils
math_utils.h
memory_utils.cpp
memory_utils.h
MergeHeaders.cpp
MergeHeaders.h
module_utils.h
parameters.cpp
parameters.h
Expand Down
Loading

0 comments on commit a3dce7e

Please sign in to comment.