Skip to content

Commit

Permalink
Merge branch 'DOR-585_demux_and_align_summary' into 'master'
Browse files Browse the repository at this point in the history
DOR-585 Demux and aligner --emit-summary option

Closes DOR-585

See merge request machine-learning/dorado!866
  • Loading branch information
kdolan1973 committed Mar 1, 2024
2 parents 67e646b + 5326bb6 commit d994a4d
Show file tree
Hide file tree
Showing 7 changed files with 374 additions and 153 deletions.
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -268,6 +268,8 @@ set(LIB_SOURCE_FILES
dorado/poly_tail/poly_tail_config.h
dorado/poly_tail/rna_poly_tail_calculator.cpp
dorado/poly_tail/rna_poly_tail_calculator.h
dorado/summary/summary.cpp
dorado/summary/summary.h
)

add_library(dorado_lib ${LIB_SOURCE_FILES})
Expand Down
26 changes: 26 additions & 0 deletions dorado/cli/aligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "read_pipeline/HtsReader.h"
#include "read_pipeline/HtsWriter.h"
#include "read_pipeline/ProgressTracker.h"
#include "summary/summary.h"
#include "utils/PostCondition.h"
#include "utils/bam_utils.h"
#include "utils/log_utils.h"
Expand All @@ -17,6 +18,7 @@
#include <algorithm>
#include <chrono>
#include <filesystem>
#include <fstream>
#include <memory>
#include <string>
#include <thread>
Expand Down Expand Up @@ -101,6 +103,15 @@ int aligner(int argc, char* argv[]) {
"is to stdout. "
"Required if the 'reads' positional argument is a folder.")
.default_value(std::string{});
parser.visible.add_argument("--emit-summary")
.help("If specified, a summary file containing the details of the primary alignments "
"for each "
"read will be emitted to the root of the output folder. This option requires "
"that the "
"'--output-dir' option is also set.")
.default_value(false)
.implicit_value(true)
.nargs(0);
parser.visible.add_argument("-t", "--threads")
.help("number of threads for alignment and BAM writing.")
.default_value(0)
Expand Down Expand Up @@ -138,6 +149,12 @@ int aligner(int argc, char* argv[]) {
auto recursive_input = parser.visible.get<bool>("recursive");
auto output_folder = parser.visible.get<std::string>("output-dir");

auto emit_summary = parser.visible.get<bool>("emit-summary");
if (emit_summary && output_folder.empty()) {
spdlog::error("Cannot specify '--emit-summary' if '--output-dir' is not also specified.");
return EXIT_FAILURE;
}

auto threads(parser.visible.get<int>("threads"));

auto max_reads(parser.visible.get<int>("max-reads"));
Expand Down Expand Up @@ -232,6 +249,15 @@ int aligner(int argc, char* argv[]) {
hts_writer_ref.get_primary(), hts_writer_ref.get_unmapped());
}

if (emit_summary) {
spdlog::info("> generating summary file");
SummaryData summary(SummaryData::ALIGNMENT_FIELDS);
auto summary_file = std::filesystem::path(output_folder) / "alignment_summary.txt";
std::ofstream summary_out(summary_file.string());
summary.process_tree(output_folder, summary_out);
spdlog::info("> summary file complete.");
}

return 0;
}

Expand Down
17 changes: 17 additions & 0 deletions dorado/cli/demux.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "read_pipeline/BarcodeDemuxerNode.h"
#include "read_pipeline/HtsReader.h"
#include "read_pipeline/ProgressTracker.h"
#include "summary/summary.h"
#include "utils/SampleSheet.h"
#include "utils/barcode_kits.h"
#include "utils/basecaller_utils.h"
Expand Down Expand Up @@ -79,6 +80,13 @@ int demuxer(int argc, char* argv[]) {
.help("Output in fastq format. Default is BAM.")
.default_value(false)
.implicit_value(true);
parser.add_argument("--emit-summary")
.help("If specified, a summary file containing the details of the primary alignments "
"for each "
"read will be emitted to the root of the output folder.")
.default_value(false)
.implicit_value(true)
.nargs(0);
parser.add_argument("--barcode-both-ends")
.help("Require both ends of a read to be barcoded for a double ended barcode.")
.default_value(false)
Expand Down Expand Up @@ -115,6 +123,7 @@ int demuxer(int argc, char* argv[]) {

auto reads(parser.get<std::vector<std::string>>("reads"));
auto output_dir(parser.get<std::string>("output-dir"));
auto emit_summary = parser.get<bool>("emit-summary");
auto threads(parser.get<int>("threads"));
auto max_reads(parser.get<int>("max-reads"));

Expand Down Expand Up @@ -216,6 +225,14 @@ int demuxer(int argc, char* argv[]) {
tracker.summarize();

spdlog::info("> finished barcode demuxing");
if (emit_summary) {
spdlog::info("> generating summary file");
SummaryData summary(SummaryData::BARCODING_FIELDS);
auto summary_file = std::filesystem::path(output_dir) / "barcoding_summary.txt";
std::ofstream summary_out(summary_file.string());
summary.process_tree(output_dir, summary_out);
spdlog::info("> summary file complete.");
}

return 0;
}
Expand Down
157 changes: 5 additions & 152 deletions dorado/cli/summary.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#include "summary/summary.h"

#include "Version.h"
#include "read_pipeline/HtsReader.h"
#include "utils/bam_utils.h"
Expand Down Expand Up @@ -42,161 +44,12 @@ int summary(int argc, char *argv[]) {
utils::SetVerboseLogging(static_cast<dorado::utils::VerboseLogLevel>(verbosity));
}

std::vector<std::string> header = {
"filename",
"read_id",
"run_id",
"channel",
"mux",
"start_time",
"duration",
"template_start",
"template_duration",
"sequence_length_template",
"mean_qscore_template",
"barcode",
};

std::vector<std::string> aligned_header = {
"alignment_genome", "alignment_genome_start", "alignment_genome_end",
"alignment_strand_start", "alignment_strand_end", "alignment_direction",
"alignment_length", "alignment_num_aligned", "alignment_num_correct",
"alignment_num_insertions", "alignment_num_deletions", "alignment_num_substitutions",
"alignment_mapq", "alignment_strand_coverage", "alignment_identity",
"alignment_accuracy"};

auto reads(parser.get<std::string>("reads"));
auto separator(parser.get<std::string>("separator"));

HtsReader reader(reads, std::nullopt);

auto read_group_exp_start_time = utils::get_read_group_info(reader.header, "DT");

spdlog::debug("> input fmt: {} aligned: {}", reader.format, reader.is_aligned);
#ifndef _WIN32
std::signal(SIGPIPE, [](int) { interrupt = 1; });
#endif
std::signal(SIGINT, [](int) { interrupt = 1; });

for (size_t col = 0; col < header.size() - 1; col++) {
std::cout << header[col] << separator;
}
std::cout << header[header.size() - 1];

if (reader.is_aligned) {
for (size_t col = 0; col < aligned_header.size() - 1; col++) {
std::cout << separator << aligned_header[col];
}
std::cout << separator << aligned_header[aligned_header.size() - 1];
}

std::cout << '\n';

while (reader.read() && !interrupt) {
if (reader.record->core.flag & (BAM_FSECONDARY | BAM_FSUPPLEMENTARY)) {
continue;
}

auto rg_value = reader.get_tag<std::string>("RG");
if (rg_value.length() == 0) {
spdlog::error("> Cannot generate sequencing summary for files with no RG tags");
return 1;
}
auto rg_split = rg_value.find("_");
auto run_id = rg_value.substr(0, rg_split);
auto model = rg_value.substr(rg_split + 1, rg_value.length());

auto filename = reader.get_tag<std::string>("f5");
if (filename.empty()) {
filename = reader.get_tag<std::string>("fn");
}
auto read_id = bam_get_qname(reader.record);
auto channel = reader.get_tag<int>("ch");
auto mux = reader.get_tag<int>("mx");

auto start_time_dt = reader.get_tag<std::string>("st");
auto duration = reader.get_tag<float>("du");

auto seqlen = reader.record->core.l_qseq;
auto mean_qscore = reader.get_tag<int>("qs");

auto num_samples = reader.get_tag<int>("ns");
auto trim_samples = reader.get_tag<int>("ts");

auto barcode = reader.get_tag<std::string>("BC");
if (barcode.empty()) {
barcode = "unclassified";
}

float sample_rate = num_samples / duration;
float template_duration = (num_samples - trim_samples) / sample_rate;
auto exp_start_dt = read_group_exp_start_time.at(rg_value);
auto start_time = utils::time_difference_seconds(start_time_dt, exp_start_dt);
auto template_start_time = start_time + (duration - template_duration);

std::cout << filename << separator << read_id << separator << run_id << separator << channel
<< separator << mux << separator << start_time << separator << duration
<< separator << template_start_time << separator << template_duration << separator
<< seqlen << separator << mean_qscore << separator << barcode;

if (reader.is_aligned) {
std::string alignment_genome = "*";
int32_t alignment_genome_start = -1;
int32_t alignment_genome_end = -1;
int32_t alignment_strand_start = -1;
int32_t alignment_strand_end = -1;
std::string alignment_direction = "*";
int32_t alignment_length = 0;
int32_t alignment_mapq = 0;
int alignment_num_aligned = 0;
int alignment_num_correct = 0;
int alignment_num_insertions = 0;
int alignment_num_deletions = 0;
int alignment_num_substitutions = 0;
float strand_coverage = 0.0;
float alignment_identity = 0.0;
float alignment_accurary = 0.0;

if (!(reader.record->core.flag & BAM_FUNMAP)) {
alignment_mapq = static_cast<int>(reader.record->core.qual);
alignment_genome = reader.header->target_name[reader.record->core.tid];

alignment_genome_start = int32_t(reader.record->core.pos);
alignment_genome_end = int32_t(bam_endpos(reader.record.get()));
alignment_direction = bam_is_rev(reader.record) ? "-" : "+";

auto alignment_counts = utils::get_alignment_op_counts(reader.record.get());
alignment_num_aligned = int(alignment_counts.matches);
alignment_num_correct =
int(alignment_counts.matches - alignment_counts.substitutions);
alignment_num_insertions = int(alignment_counts.insertions);
alignment_num_deletions = int(alignment_counts.deletions);
alignment_num_substitutions = int(alignment_counts.substitutions);
alignment_length = int(alignment_counts.matches + alignment_counts.insertions +
alignment_counts.deletions);
alignment_strand_start = int(alignment_counts.softclip_start);
alignment_strand_end = int(seqlen - alignment_counts.softclip_end);

strand_coverage = (alignment_strand_end - alignment_strand_start) /
static_cast<float>(seqlen);
alignment_identity =
alignment_num_correct / static_cast<float>(alignment_counts.matches);
alignment_accurary = alignment_num_correct / static_cast<float>(alignment_length);
}

std::cout << separator << alignment_genome << separator << alignment_genome_start
<< separator << alignment_genome_end << separator << alignment_strand_start
<< separator << alignment_strand_end << separator << alignment_direction
<< separator << alignment_length << separator << alignment_num_aligned
<< separator << alignment_num_correct << separator << alignment_num_insertions
<< separator << alignment_num_deletions << separator
<< alignment_num_substitutions << separator << alignment_mapq << separator
<< strand_coverage << separator << alignment_identity << separator
<< alignment_accurary;
}

std::cout << '\n';
}
SummaryData summary;
summary.set_separator(separator[0]);
summary.process_file(reads, std::cout);

return 0;
}
Expand Down
Loading

0 comments on commit d994a4d

Please sign in to comment.