Skip to content

Commit

Permalink
Merge branch 'DOR-633_demux_sort_bam_option' into 'release-v0.6.0'
Browse files Browse the repository at this point in the history
DOR-633 Demux sort bam option

See merge request machine-learning/dorado!917
  • Loading branch information
kdolan1973 committed Mar 26, 2024
2 parents 9dc052d + b9e669a commit f6b6554
Show file tree
Hide file tree
Showing 12 changed files with 47 additions and 19 deletions.
6 changes: 6 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,12 @@ SQK-RPB004_barcode03.bam
unclassified.bam
```

#### Demuxing mapped reads

If the input data files contain mapping data, this information can be preserved in the output files. To do this, you must use the `--no-trim` option. Trimming the barcodes will invalidate any mapping information that may be contained in the input files, and therefore the application will exclude any mapping information if `--no-trim` is not specified.

It is also possible to get the demux application to sort and index any output bam files that contain mapped reads. To enable this use the `--sort-bam` option. If you use this option then you must also use the `--no-trim` option, as trimming will prevent any mapping information from being included in the output files. Index files (.bai extension) will only be created for bam files that contain mapped reads and were sorted. Note that for large datasets sorting the output files may take a few minutes.

#### Using a sample sheet

Dorado is able to use a sample sheet to restrict the barcode classifications to only those present, and to apply aliases to the detected classifications. This is enabled by passing the path to a sample sheet to the `--sample-sheet` argument when using the `basecaller` or `demux` commands. See [here](documentation/SampleSheets.md) for more information.
Expand Down
2 changes: 1 addition & 1 deletion dorado/cli/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ int aligner(int argc, char* argv[]) {
[&](size_t progress) {
tracker.update_post_processing_progress(static_cast<float>(progress));
},
writer_threads);
writer_threads, true);

tracker.summarize();

Expand Down
2 changes: 1 addition & 1 deletion dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -326,7 +326,7 @@ void setup(std::vector<std::string> args,
[&](size_t progress) {
tracker.update_post_processing_progress(static_cast<float>(progress));
},
thread_allocations.writer_threads);
thread_allocations.writer_threads, true);

// Give the user a nice summary.
tracker.summarize();
Expand Down
26 changes: 21 additions & 5 deletions dorado/cli/demux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,15 @@ int demuxer(int argc, char* argv[]) {
.default_value(false)
.implicit_value(true);
parser.visible.add_argument("--no-trim")
.help("Skip barcode trimming. If option is not chosen, trimming is enabled.")
.help("Skip barcode trimming. If this option is not chosen, trimming is enabled. "
"Note that you should use this option if your input data is mapped and you "
"want to preserve the mapping in the output files, as trimming will result "
"in any mapping information from the input file(s) being discarded.")
.default_value(false)
.implicit_value(true);
parser.visible.add_argument("--sort-bam")
.help("Sort any BAM output files that contain mapped reads. Using this option "
"requires that the --no-trim option is also set.")
.default_value(false)
.implicit_value(true);
parser.visible.add_argument("--barcode-arrangement")
Expand All @@ -133,6 +141,13 @@ int demuxer(int argc, char* argv[]) {
std::exit(1);
}

auto no_trim(parser.visible.get<bool>("--no-trim"));
auto sort_bam(parser.visible.get<bool>("--sort-bam"));
if (sort_bam && !no_trim) {
spdlog::error("If --sort-bam is specified then --no-trim must also be specified.");
std::exit(1);
}

auto progress_stats_frequency(parser.hidden.get<int>("progress_stats_frequency"));
if (progress_stats_frequency > 0) {
utils::EnsureInfoLoggingEnabled(static_cast<dorado::utils::VerboseLogLevel>(verbosity));
Expand All @@ -146,7 +161,6 @@ int demuxer(int argc, char* argv[]) {
auto emit_summary = parser.visible.get<bool>("emit-summary");
auto threads(parser.visible.get<int>("threads"));
auto max_reads(parser.visible.get<int>("max-reads"));
auto no_trim(parser.visible.get<bool>("--no-trim"));

alignment::AlignmentProcessingItems processing_items{reads, recursive_input, output_dir, true};
if (!processing_items.initialise()) {
Expand Down Expand Up @@ -287,9 +301,11 @@ int demuxer(int argc, char* argv[]) {

// Finalise the files that were created.
tracker.set_description("Sorting output files");
demux_writer_ref.finalise_hts_files([&](size_t progress) {
tracker.update_post_processing_progress(static_cast<float>(progress));
});
demux_writer_ref.finalise_hts_files(
[&](size_t progress) {
tracker.update_post_processing_progress(static_cast<float>(progress));
},
sort_bam);

tracker.summarize();
progress_stats.notify_stats_collector_completed(final_stats);
Expand Down
2 changes: 1 addition & 1 deletion dorado/cli/duplex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -559,7 +559,7 @@ int duplex(int argc, char* argv[]) {
[&](size_t progress) {
tracker.update_post_processing_progress(static_cast<float>(progress));
},
WRITER_THREADS);
WRITER_THREADS, true);

tracker.summarize();
if (!dump_stats_file.empty()) {
Expand Down
2 changes: 1 addition & 1 deletion dorado/cli/trim.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ int trim(int argc, char* argv[]) {
[&](size_t progress) {
tracker.update_post_processing_progress(static_cast<float>(progress));
},
trim_writer_threads);
trim_writer_threads, false);
tracker.summarize();

spdlog::info("> finished adapter/primer trimming");
Expand Down
5 changes: 3 additions & 2 deletions dorado/read_pipeline/BarcodeDemuxerNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ void BarcodeDemuxerNode::set_header(const sam_hdr_t* const header) {
}

void BarcodeDemuxerNode::finalise_hts_files(
const utils::HtsFile::ProgressCallback& progress_callback) {
const utils::HtsFile::ProgressCallback& progress_callback,
bool sort_if_mapped) {
const size_t num_files = m_files.size();
size_t current_file_idx = 0;
for (auto& [bc, hts_file] : m_files) {
Expand All @@ -98,7 +99,7 @@ void BarcodeDemuxerNode::finalise_hts_files(
const size_t total_progress = (current_file_idx * 100 + progress) / num_files;
progress_callback(total_progress);
},
m_htslib_threads);
m_htslib_threads, sort_if_mapped);
++current_file_idx;
}

Expand Down
3 changes: 2 additions & 1 deletion dorado/read_pipeline/BarcodeDemuxerNode.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@ class BarcodeDemuxerNode : public MessageSink {

// Finalisation must occur before destruction of this node.
// Note that this isn't safe to call until after this node has been terminated.
void finalise_hts_files(const utils::HtsFile::ProgressCallback& progress_callback);
void finalise_hts_files(const utils::HtsFile::ProgressCallback& progress_callback,
bool sort_bam);

private:
std::filesystem::path m_output_dir;
Expand Down
8 changes: 5 additions & 3 deletions dorado/utils/hts_file.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ HtsFile::~HtsFile() {
// in order to generate a map of sort coordinates to virtual file offsets. we can then jump around in the
// file to write out the records in the sorted order. finally we can delete the unsorted file.
// in case an error occurs, the unsorted file is left on disk, so users can recover their data.
void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_threads) {
void HtsFile::finalise(const ProgressCallback& progress_callback,
int writer_threads,
bool sort_if_mapped) {
assert(progress_callback);

// Rough divisions of how far through we are at the start of each section.
Expand Down Expand Up @@ -120,7 +122,7 @@ void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_thr

SamHdrPtr in_header(sam_hdr_read(in_file.get()));
file_is_mapped = (sam_hdr_nref(in_header.get()) > 0);
if (file_is_mapped) {
if (file_is_mapped && sort_if_mapped) {
// We only need to sort and index the file if contains mapped reads.
HtsFilePtr out_file(hts_open(filepath.string().c_str(), "wb"));
if (bgzf_mt(out_file->fp.bgzf, writer_threads, 128) < 0) {
Expand Down Expand Up @@ -177,7 +179,7 @@ void HtsFile::finalise(const ProgressCallback& progress_callback, int writer_thr
}
}

if (file_is_mapped) {
if (file_is_mapped && sort_if_mapped) {
progress_callback(percent_start_indexing);
if (sam_index_build(filepath.string().c_str(), 0) < 0) {
spdlog::error("Failed to build index for file {}", filepath.string());
Expand Down
4 changes: 3 additions & 1 deletion dorado/utils/hts_file.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ class HtsFile {
int write(const bam1_t* record);

bool finalise_is_noop() const { return m_finalise_is_noop; }
void finalise(const ProgressCallback& progress_callback, int writer_threads);
void finalise(const ProgressCallback& progress_callback,
int writer_threads,
bool sort_if_mapped);

private:
HtsFilePtr m_file;
Expand Down
4 changes: 2 additions & 2 deletions tests/BamWriterTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ class HtsWriterTestsFixture {
auto& writer_ref = dynamic_cast<HtsWriter&>(pipeline->get_node_ref(writer));
stats = writer_ref.sample_stats();

hts_file.finalise([](size_t) { /* noop */ }, num_threads);
hts_file.finalise([](size_t) { /* noop */ }, num_threads, true);
}

stats::NamedStats stats;
Expand Down Expand Up @@ -97,7 +97,7 @@ TEST_CASE("HtsWriterTest: Read and write FASTQ with tag", TEST_GROUP) {
CHECK_THAT(bam_aux2Z(bam_aux_get(reader.record.get(), "st")),
Equals("2023-06-22T07:17:48.308+00:00"));
writer.write(reader.record.get());
hts_file.finalise([](size_t) { /* noop */ }, 2);
hts_file.finalise([](size_t) { /* noop */ }, 2, false);
}

// Read temporary file to make sure tags were correctly set.
Expand Down
2 changes: 1 addition & 1 deletion tests/BarcodeDemuxerNodeTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ TEST_CASE("BarcodeDemuxerNode: check correct output files are created", TEST_GRO

pipeline->terminate(DefaultFlushOptions());

demux_writer_ref.finalise_hts_files([](size_t) { /* noop */ });
demux_writer_ref.finalise_hts_files([](size_t) { /* noop */ }, true);

const std::unordered_set<std::string> expected_files = {
"bc01.bam", "bc01.bam.bai", "bc02.bam", "bc02.bam.bai", "bc03.bam", "bc03.bam.bai",
Expand Down

0 comments on commit f6b6554

Please sign in to comment.