Skip to content

Commit

Permalink
Refactored adding read group headers separating custom from default k…
Browse files Browse the repository at this point in the history
…its to avoid a clunky api
  • Loading branch information
hpendry-ont committed Mar 20, 2024
1 parent 806ad9b commit 3f2bc41
Show file tree
Hide file tree
Showing 7 changed files with 79 additions and 90 deletions.
40 changes: 31 additions & 9 deletions dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,31 @@

namespace dorado {

namespace {

const barcode_kits::KitInfo& get_barcode_kit_info(const std::string& kit_name) {
const auto kit_info = barcode_kits::get_kit_info(kit_name);
if (!kit_info) {
spdlog::error(
"{} is not a valid barcode kit name. Please run the help "
"command to find out available barcode kits.");
std::exit(EXIT_FAILURE);
}
return *kit_info;
}

std::pair<std::string, barcode_kits::KitInfo> get_custom_barcode_kit_info(
const std::string& custom_kit_file) {
auto custom_kit_info = barcode_kits::parse_custom_arrangement(custom_kit_file);
if (!custom_kit_info) {
spdlog::error("Unable to load custom barcode arrangement file: {}", custom_kit_file);
std::exit(EXIT_FAILURE);
}
return *custom_kit_info;
}

} // namespace

using dorado::utils::default_parameters;
using OutputMode = dorado::utils::HtsFile::OutputMode;
using namespace std::chrono_literals;
Expand Down Expand Up @@ -151,16 +176,13 @@ void setup(std::vector<std::string> args,
SamHdrPtr hdr(sam_hdr_init());
cli::add_pg_hdr(hdr.get(), args);
if (custom_kit) {
auto custom_kit_info = barcode_kits::parse_custom_arrangement(*custom_kit);
if (!custom_kit_info) {
spdlog::error("Unable to load custom barcode arrangement file: {}", *custom_kit);
std::exit(EXIT_FAILURE);
}
auto [kit_name, kit_info] = *custom_kit_info;
utils::add_rg_hdr_for_custom_barcode_kit(hdr.get(), read_groups, kit_name, kit_info,
sample_sheet.get());
auto [kit_name, kit_info] = get_custom_barcode_kit_info(*custom_kit);
utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, kit_name, kit_info,
sample_sheet.get());
} else {
utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, sample_sheet.get());
const auto kit_info = get_barcode_kit_info(barcode_kits[0]);
utils::add_rg_headers_with_barcode_kit(hdr.get(), read_groups, barcode_kits[0], kit_info,
sample_sheet.get());
}

utils::HtsFile hts_file("-", output_mode, thread_allocations.writer_threads);
Expand Down
3 changes: 1 addition & 2 deletions dorado/cli/duplex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -462,8 +462,7 @@ int duplex(int argc, char* argv[]) {
recursive_file_loading);
read_groups.merge(DataLoader::load_read_groups(reads, duplex_rg_name, "",
recursive_file_loading));
std::vector<std::string> barcode_kits;
utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, nullptr);
utils::add_rg_headers(hdr.get(), read_groups);

int batch_size(parser.visible.get<int>("-b"));
int chunk_size(parser.visible.get<int>("-c"));
Expand Down
49 changes: 13 additions & 36 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,11 +63,11 @@ std::string read_group_to_string(const dorado::ReadGroup& read_group) {
return rg.str();
}

void emit_read_groups_for_barcode_kit(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet) {
void add_barcode_kit_rg_hdrs(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet) {
const auto& barcode_sequences = barcode_kits::get_barcodes();
for (const auto& barcode_name : kit_info.barcodes) {
const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name);
Expand Down Expand Up @@ -103,43 +103,20 @@ kstring_t allocate_kstring() {
return str;
}

void add_rg_hdr(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::vector<std::string>& barcode_kits,
const utils::SampleSheet* const sample_sheet) {
// Emit read group headers without a barcode arrangement.
void add_rg_headers(sam_hdr_t* hdr, const std::unordered_map<std::string, ReadGroup>& read_groups) {
for (const auto& read_group : read_groups) {
const std::string read_group_tags = read_group_to_string(read_group.second);
emit_read_group(hdr, read_group_tags, read_group.first, {});
}

// Emit read group headers for each barcode arrangement.
const auto& barcode_kit_infos = barcode_kits::get_kit_infos();
for (const auto& kit_name : barcode_kits) {
auto kit_iter = barcode_kit_infos.find(kit_name);
if (kit_iter == barcode_kit_infos.end()) {
throw std::runtime_error(kit_name +
" is not a valid barcode kit name. Please run the help "
"command to find out available barcode kits.");
}
emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_iter->second,
sample_sheet);
}
}

void add_rg_hdr_for_custom_barcode_kit(
sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet) {
// Emit read group headers without a barcode arrangement.
for (const auto& read_group : read_groups) {
const std::string read_group_tags = read_group_to_string(read_group.second);
emit_read_group(hdr, read_group_tags, read_group.first, {});
}

emit_read_groups_for_barcode_kit(hdr, read_groups, kit_name, kit_info, sample_sheet);
void add_rg_headers_with_barcode_kit(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet) {
add_rg_headers(hdr, read_groups);
add_barcode_kit_rg_hdrs(hdr, read_groups, kit_name, kit_info, sample_sheet);
}

void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs) {
Expand Down
18 changes: 7 additions & 11 deletions dorado/utils/bam_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,17 +25,13 @@ struct AlignmentOps {
size_t substitutions;
};

void add_rg_hdr(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::vector<std::string>& barcode_kits,
const utils::SampleSheet* const sample_sheet);

void add_rg_hdr_for_custom_barcode_kit(
sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet);
void add_rg_headers(sam_hdr_t* hdr, const std::unordered_map<std::string, ReadGroup>& read_groups);

void add_rg_headers_with_barcode_kit(sam_hdr_t* hdr,
const std::unordered_map<std::string, ReadGroup>& read_groups,
const std::string& kit_name,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet);

void add_sq_hdr(sam_hdr_t* hdr, const sq_t& seqs);

Expand Down
9 changes: 9 additions & 0 deletions dorado/utils/barcode_kits.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -788,6 +788,15 @@ const std::unordered_map<std::string, std::string> barcodes = {

const std::unordered_map<std::string, KitInfo>& get_kit_infos() { return kit_info_map; }

const KitInfo* get_kit_info(const std::string& kit_name) {
const auto& barcode_kit_infos = get_kit_infos();
auto kit_iter = get_kit_infos().find(kit_name);
if (kit_iter == barcode_kit_infos.end()) {
return nullptr;
}
return &kit_iter->second;
}

const std::unordered_map<std::string, std::string>& get_barcodes() { return barcodes; }

const std::unordered_set<std::string>& get_barcode_identifiers() {
Expand Down
2 changes: 2 additions & 0 deletions dorado/utils/barcode_kits.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ struct KitInfo {
};

const std::unordered_map<std::string, KitInfo>& get_kit_infos();
const KitInfo* get_kit_info(const std::string& kit_name);
const std::unordered_map<std::string, std::string>& get_barcodes();
const std::unordered_set<std::string>& get_barcode_identifiers();

std::string barcode_kits_list_str();

std::string normalize_barcode_name(const std::string& barcode_name);
Expand Down
48 changes: 16 additions & 32 deletions tests/BamUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ TEST_CASE("BamUtilsTest: fetch keys from PG header", TEST_GROUP) {
"5mCG --emit-sam");
}

TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) {
TEST_CASE("BamUtilsTest: Add read group headers scenarios", TEST_GROUP) {
auto has_read_group_header = [](sam_hdr_t *ptr, const char *id) {
return sam_hdr_line_index(ptr, "RG", id) >= 0;
};
Expand All @@ -58,7 +58,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) {
SECTION("No read groups generate no headers") {
dorado::SamHdrPtr sam_header(sam_hdr_init());
CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0);
dorado::utils::add_rg_hdr(sam_header.get(), {}, {}, nullptr);
dorado::utils::add_rg_headers(sam_header.get(), {});
CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == 0);
}

Expand All @@ -73,7 +73,7 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) {

SECTION("Read groups") {
dorado::SamHdrPtr sam_header(sam_hdr_init());
dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {}, nullptr);
dorado::utils::add_rg_headers(sam_header.get(), read_groups);

// Check the IDs of the groups are all there.
CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == int(read_groups.size()));
Expand All @@ -84,23 +84,15 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) {
}
}

// Pick some of the barcode kits (randomly chosen indices).
const auto &kit_infos = dorado::barcode_kits::get_kit_infos();
const std::vector<std::string> barcode_kits{
std::next(kit_infos.begin(), 1)->first,
std::next(kit_infos.begin(), 7)->first,
};

SECTION("Read groups with barcodes") {
SECTION("Read groups with barcode kit") {
const std::string KIT_NAME{"SQK-RAB204"};
auto kit_info = dorado::barcode_kits::get_kit_info(KIT_NAME);
dorado::SamHdrPtr sam_header(sam_hdr_init());
dorado::utils::add_rg_hdr(sam_header.get(), read_groups, barcode_kits, nullptr);
dorado::utils::add_rg_headers_with_barcode_kit(sam_header.get(), read_groups, KIT_NAME,
*kit_info, nullptr);

// Check the IDs of the groups are all there.
size_t total_barcodes = 0;
for (const auto &kit_name : barcode_kits) {
total_barcodes += kit_infos.at(kit_name).barcodes.size();
}
const size_t total_groups = read_groups.size() * (total_barcodes + 1);
const size_t total_groups = read_groups.size() * (kit_info->barcodes.size() + 1);
CHECK(sam_hdr_count_lines(sam_header.get(), "RG") == int(total_groups));

// Check that the IDs match the expected format.
Expand All @@ -110,24 +102,16 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) {
CHECK(get_barcode_tag(sam_header.get(), id.c_str()) == std::nullopt);

// The headers with barcodes should contain those barcodes.
for (const auto &kit_name : barcode_kits) {
const auto &kit_info = kit_infos.at(kit_name);
for (const auto &barcode_name : kit_info.barcodes) {
const auto full_id = id + "_" +
dorado::barcode_kits::generate_standard_barcode_name(
kit_name, barcode_name);
const auto &barcode_seq = barcode_seqs.at(barcode_name);
CHECK(has_read_group_header(sam_header.get(), full_id.c_str()));
CHECK(get_barcode_tag(sam_header.get(), full_id.c_str()) == barcode_seq);
}
for (const auto &barcode_name : kit_info->barcodes) {
const auto full_id = id + "_" +
dorado::barcode_kits::generate_standard_barcode_name(
KIT_NAME, barcode_name);
const auto &barcode_seq = barcode_seqs.at(barcode_name);
CHECK(has_read_group_header(sam_header.get(), full_id.c_str()));
CHECK(get_barcode_tag(sam_header.get(), full_id.c_str()) == barcode_seq);
}
}
}

SECTION("Read groups with unknown barcode kit") {
dorado::SamHdrPtr sam_header(sam_hdr_init());
CHECK_THROWS(dorado::utils::add_rg_hdr(sam_header.get(), read_groups, {"blah"}, nullptr));
}
}

TEST_CASE("BamUtilsTest: Test bam extraction helpers", TEST_GROUP) {
Expand Down

0 comments on commit 3f2bc41

Please sign in to comment.