Skip to content

Commit

Permalink
Add barcode arrangements RG header generation when using a custom kit
Browse files Browse the repository at this point in the history
  • Loading branch information
hpendry-ont committed Mar 20, 2024
1 parent 0a9e40a commit 806ad9b
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 52 deletions.
16 changes: 15 additions & 1 deletion dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include "utils/fs_utils.h"
#include "utils/log_utils.h"
#include "utils/parameters.h"
#include "utils/parse_custom_kit.h"
#include "utils/stats.h"
#include "utils/string_utils.h"
#include "utils/sys_stats.h"
Expand Down Expand Up @@ -149,7 +150,18 @@ void setup(std::vector<std::string> args,

SamHdrPtr hdr(sam_hdr_init());
cli::add_pg_hdr(hdr.get(), args);
utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get());
if (custom_kit) {
auto custom_kit_info = barcode_kits::parse_custom_arrangement(*custom_kit);
if (!custom_kit_info) {
spdlog::error("Unable to load custom barcode arrangement file: {}", *custom_kit);
std::exit(EXIT_FAILURE);
}
auto [kit_name, kit_info] = *custom_kit_info;
utils::add_rg_hdr_for_custom_barcode_kit(hdr.get(), read_groups, kit_name, kit_info,
sample_sheet.get());
} else {
utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get());
}

utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads);

Expand Down Expand Up @@ -606,6 +618,7 @@ int basecaller(int argc, char* argv[]) {
spdlog::info("> Creating basecall pipeline");

try {
/* clang format off */
setup(args, model_path, data, mods_model_paths, parser.visible.get<std::string>("-x"),
parser.visible.get<std::string>("--reference"), parser.visible.get<int>("-c"),
parser.visible.get<int>("-o"), parser.visible.get<int>("-b"),
Expand All @@ -625,6 +638,7 @@ int basecaller(int argc, char* argv[]) {
std::move(custom_kit), std::move(custom_barcode_seqs), std::move(custom_primer_file),
resume_parser, parser.visible.get<bool>("--estimate-poly-a"),
polya_config.empty() ? nullptr : &polya_config, model_selection);
/* clang format on */
} catch (const std::exception& e) {
spdlog::error("{}", e.what());
utils::clean_temporary_models(temp_download_paths);
Expand Down
93 changes: 42 additions & 51 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,30 @@ void emit_read_group(sam_hdr_t* hdr,
sam_hdr_add_lines(hdr, line.c_str(), 0);
}

void emit_read_groups_for_barcode_kit(const std::unordered_map<std::string, ReadGroup>& read_groups,
std::string read_group_to_string(const dorado::ReadGroup& read_group) {
auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; };
std::ostringstream rg;
{
rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t";
rg << "PM:" << value_or_unknown(read_group.device_id) << "\t";
rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t";
rg << "PL:"
<< "ONT"
<< "\t";
rg << "DS:"
<< "basecall_model=" << value_or_unknown(read_group.basecalling_model)
<< (read_group.modbase_models.empty() ? ""
: (" modbase_models=" + read_group.modbase_models))
<< " runid=" << value_or_unknown(read_group.run_id) << "\t";
rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t";
rg << "SM:" << value_or_unknown(read_group.sample_id);
}
return rg.str();
}

void emit_read_groups_for_barcode_kit(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet) {
const auto& barcode_sequences = barcode_kits::get_barcodes();
Expand Down Expand Up @@ -72,27 +95,6 @@ void emit_read_groups_for_barcode_kit(const std::unordered_map<std::string, Read
}
}

std::string read_group_to_string(const dorado::ReadGroup& read_group) {
auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; };
std::ostringstream rg;
{
rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t";
rg << "PM:" << value_or_unknown(read_group.device_id) << "\t";
rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t";
rg << "PL:"
<< "ONT"
<< "\t";
rg << "DS:"
<< "basecall_model=" << value_or_unknown(read_group.basecalling_model)
<< (read_group.modbase_models.empty() ? ""
: (" modbase_models=" + read_group.modbase_models))
<< " runid=" << value_or_unknown(read_group.run_id) << "\t";
rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t";
rg << "SM:" << value_or_unknown(read_group.sample_id);
}
return rg.str();
}

} // namespace

kstring_t allocate_kstring() {
Expand All @@ -105,52 +107,41 @@ void add_rg_hdr(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::vector<std::string>& barcode_kits,
const utils::SampleSheet* const sample_sheet) {
const auto& barcode_kit_infos = barcode_kits::get_kit_infos();
const auto& barcode_sequences = barcode_kits::get_barcodes();

// Emit read group headers without a barcode arrangement.
for (const auto& read_group : read_groups) {
const std::string read_group_tags = read_group_to_string(read_group.second);
emit_read_group(hdr, read_group_tags, read_group.first, {});
}

// Emit read group headers for each barcode arrangement.
const auto& barcode_kit_infos = barcode_kits::get_kit_infos();
for (const auto& kit_name : barcode_kits) {
auto kit_iter = barcode_kit_infos.find(kit_name);
if (kit_iter == barcode_kit_infos.end()) {
throw std::runtime_error(kit_name +
" is not a valid barcode kit name. Please run the help "
"command to find out available barcode kits.");
}
emit_read_groups_for_barcode_kit(read_groups, kit_iter->second, sample_sheet);
//const auto& kit_info = kit_iter->second;
//for (const auto& barcode_name : kit_info.barcodes) {
// const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name);
// const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name);
// for (const auto& read_group : read_groups) {
// std::string alias;
// auto id = read_group.first + '_';
// if (sample_sheet) {
// if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) {
// continue;
// }

// alias = sample_sheet->get_alias(
// read_group.second.flowcell_id, read_group.second.position_id,
// read_group.second.experiment_id, normalized_barcode_name);
// }
// if (!alias.empty()) {
// id += alias;
// } else {
// id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name);
// }
// const std::string read_group_tags = read_group_to_string(read_group.second);
// emit_read_group(hdr, read_group_tags, id, additional_tags);
// }
//}
emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_iter->second,
sample_sheet);
}
}

void add_rg_hdr_for_custom_barcode_kit(
sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet) {
// Emit read group headers without a barcode arrangement.
for (const auto& read_group : read_groups) {
const std::string read_group_tags = read_group_to_string(read_group.second);
emit_read_group(hdr, read_group_tags, read_group.first, {});
}

emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_info, sample_sheet);
}

void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs) {
for (auto pair : seqs) {
char* name;
Expand Down
8 changes: 8 additions & 0 deletions dorado/utils/bam_utils.h
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#pragma once
#include "barcode_kits.h"
#include "types.h"

#include <map>
Expand Down Expand Up @@ -29,6 +30,13 @@ void add_rg_hdr(sam_hdr_t* hdr,
const std::vector<std::string>& barcode_kits,
const utils::SampleSheet* const sample_sheet);

void add_rg_hdr_for_custom_barcode_kit(
sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet);

void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs);

void strip_sq_hdr(sam_hdr_t* hdr);
Expand Down

0 comments on commit 806ad9b

Please sign in to comment.