Skip to content

Commit

Permalink
Refactoring to move parsing custom barcodes kits functionality into u…
Browse files Browse the repository at this point in the history
…tils so that bam_utils can have a dependency on it for creating the read groups
  • Loading branch information
hpendry-ont committed Mar 19, 2024
1 parent dab78dc commit f37cd6a
Show file tree
Hide file tree
Showing 13 changed files with 189 additions and 123 deletions.
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -230,10 +230,10 @@ add_library(dorado_lib
dorado/demux/BarcodeClassifier.h
dorado/demux/BarcodeClassifierSelector.cpp
dorado/demux/BarcodeClassifierSelector.h
dorado/demux/parse_custom_sequences.cpp
dorado/demux/parse_custom_sequences.h
dorado/demux/Trimmer.cpp
dorado/demux/Trimmer.h
dorado/demux/parse_custom_kit.cpp
dorado/demux/parse_custom_kit.h
dorado/poly_tail/dna_poly_tail_calculator.cpp
dorado/poly_tail/dna_poly_tail_calculator.h
dorado/poly_tail/plasmid_poly_tail_calculator.cpp
Expand Down
3 changes: 2 additions & 1 deletion dorado/demux/AdapterDetector.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
#include "AdapterDetector.h"

#include "parse_custom_kit.h"
#include "parse_custom_sequences.h"
#include "utils/alignment_utils.h"
#include "utils/parse_custom_kit.h"
#include "utils/sequence_utils.h"
#include "utils/types.h"

Expand Down
9 changes: 5 additions & 4 deletions dorado/demux/BarcodeClassifier.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#include "BarcodeClassifier.h"

#include "parse_custom_kit.h"
#include "parse_custom_sequences.h"
#include "utils/alignment_utils.h"
#include "utils/barcode_kits.h"
#include "utils/parse_custom_kit.h"
#include "utils/sequence_utils.h"
#include "utils/types.h"

Expand Down Expand Up @@ -118,7 +119,7 @@ std::unordered_map<std::string, dorado::barcode_kits::KitInfo> process_custom_ki
const std::optional<std::string>& custom_kit) {
std::unordered_map<std::string, dorado::barcode_kits::KitInfo> kit_map;
if (custom_kit) {
auto custom_arrangement = demux::parse_custom_arrangement(*custom_kit);
auto custom_arrangement = dorado::barcode_kits::parse_custom_arrangement(*custom_kit);
if (custom_arrangement) {
const auto& [kit_name, kit_info] = *custom_arrangement;
kit_map[kit_name] = kit_info;
Expand Down Expand Up @@ -158,8 +159,8 @@ BarcodeClassifier::BarcodeClassifier(const std::vector<std::string>& kit_names,
: m_custom_kit(process_custom_kit(custom_kit)),
m_custom_seqs(custom_barcodes ? parse_custom_sequences(*custom_barcodes)
: std::unordered_map<std::string, std::string>{}),
m_scoring_params(custom_kit ? parse_scoring_params(*custom_kit)
: BarcodeKitScoringParams{}),
m_scoring_params(custom_kit ? barcode_kits::parse_scoring_params(*custom_kit)
: barcode_kits::BarcodeKitScoringParams{}),
m_barcode_candidates(generate_candidates(kit_names)) {}

BarcodeClassifier::~BarcodeClassifier() = default;
Expand Down
4 changes: 2 additions & 2 deletions dorado/demux/BarcodeClassifier.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#pragma once
#include "parse_custom_kit.h"
#include "utils/barcode_kits.h"
#include "utils/parse_custom_kit.h"
#include "utils/stats.h"
#include "utils/types.h"

Expand Down Expand Up @@ -30,7 +30,7 @@ class BarcodeClassifier {
private:
const std::unordered_map<std::string, dorado::barcode_kits::KitInfo> m_custom_kit;
const std::unordered_map<std::string, std::string> m_custom_seqs;
const BarcodeKitScoringParams m_scoring_params;
const barcode_kits::BarcodeKitScoringParams m_scoring_params;
const std::vector<BarcodeCandidateKit> m_barcode_candidates;

std::vector<BarcodeCandidateKit> generate_candidates(const std::vector<std::string>& kit_names);
Expand Down
39 changes: 39 additions & 0 deletions dorado/demux/parse_custom_sequences.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#include "parse_custom_sequences.h"

#include "utils/bam_utils.h"
#include "utils/barcode_kits.h"

#include <htslib/sam.h>
#include <toml.hpp>
#include <toml/value.hpp>

#include <algorithm>
#include <string>
#include <vector>

namespace dorado::demux {

std::unordered_map<std::string, std::string> parse_custom_sequences(
const std::string& sequences_file) {
auto file = hts_open(sequences_file.c_str(), "r");
BamPtr record;
record.reset(bam_init1());

std::unordered_map<std::string, std::string> sequences;

int sam_ret_val = 0;
while ((sam_ret_val = sam_read1(file, nullptr, record.get())) != -1) {
if (sam_ret_val < -1) {
throw std::runtime_error("Failed to parse custom sequence file " + sequences_file);
}
std::string qname = bam_get_qname(record.get());
std::string seq = utils::extract_sequence(record.get());
sequences[qname] = seq;
}

hts_close(file);

return sequences;
}

} // namespace dorado::demux
14 changes: 14 additions & 0 deletions dorado/demux/parse_custom_sequences.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
#pragma once

#include "utils/barcode_kits.h"

#include <optional>
#include <string>
#include <unordered_map>

namespace dorado::demux {

std::unordered_map<std::string, std::string> parse_custom_sequences(
const std::string& sequences_file);

} // namespace dorado::demux
4 changes: 3 additions & 1 deletion dorado/utils/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ add_library(dorado_utils
module_utils.h
parameters.cpp
parameters.h
parse_custom_kit.cpp
parse_custom_kit.h
PostCondition.h
SampleSheet.cpp
SampleSheet.h
Expand Down Expand Up @@ -59,7 +61,7 @@ add_library(dorado_utils
types.h
uuid_utils.cpp
uuid_utils.h
)
)

if (DORADO_GPU_BUILD)
if(APPLE)
Expand Down
148 changes: 89 additions & 59 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
const char seq_nt16_str[] = "=ACMGRSVTWYHKDBN";
#endif // _WIN32

namespace dorado::utils {

namespace {

// Convert the 4bit encoded sequence in a bam1_t structure
Expand All @@ -32,9 +34,66 @@ std::string convert_nt16_to_str(uint8_t* bseq, size_t slen) {
return seq;
}

} // namespace
void emit_read_group(sam_hdr_t* hdr,
const std::string& read_group_line,
const std::string& id,
const std::string& additional_tags) {
auto line = "@RG\tID:" + id + '\t' + read_group_line + additional_tags + '\n';
sam_hdr_add_lines(hdr, line.c_str(), 0);
}

namespace dorado::utils {
void emit_read_groups_for_barcode_kit(const std::unordered_map<std::string, ReadGroup>& read_groups,
const barcode_kits::KitInfo& kit_info,
const utils::SampleSheet* const sample_sheet) {
const auto& barcode_sequences = barcode_kits::get_barcodes();
for (const auto& barcode_name : kit_info.barcodes) {
const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name);
const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name);
for (const auto& read_group : read_groups) {
std::string alias;
auto id = read_group.first + '_';
if (sample_sheet) {
if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) {
continue;
}

alias = sample_sheet->get_alias(
read_group.second.flowcell_id, read_group.second.position_id,
read_group.second.experiment_id, normalized_barcode_name);
}
if (!alias.empty()) {
id += alias;
} else {
id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name);
}
const std::string read_group_tags = read_group_to_string(read_group.second);
emit_read_group(hdr, read_group_tags, id, additional_tags);
}
}
}

std::string read_group_to_string(const dorado::ReadGroup& read_group) {
auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; };
std::ostringstream rg;
{
rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t";
rg << "PM:" << value_or_unknown(read_group.device_id) << "\t";
rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t";
rg << "PL:"
<< "ONT"
<< "\t";
rg << "DS:"
<< "basecall_model=" << value_or_unknown(read_group.basecalling_model)
<< (read_group.modbase_models.empty() ? ""
: (" modbase_models=" + read_group.modbase_models))
<< " runid=" << value_or_unknown(read_group.run_id) << "\t";
rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t";
rg << "SM:" << value_or_unknown(read_group.sample_id);
}
return rg.str();
}

} // namespace

kstring_t allocate_kstring() {
kstring_t str = {0, 0, NULL};
Expand All @@ -49,40 +108,10 @@ void add_rg_hdr(sam_hdr_t* hdr,
const auto& barcode_kit_infos = barcode_kits::get_kit_infos();
const auto& barcode_sequences = barcode_kits::get_barcodes();

// Convert a ReadGroup to a string
auto to_string = [](const ReadGroup& read_group) {
// Lambda function to return "unknown" if string is empty
auto value_or_unknown = [](std::string_view s) { return s.empty() ? "unknown" : s; };
std::ostringstream rg;
{
rg << "PU:" << value_or_unknown(read_group.flowcell_id) << "\t";
rg << "PM:" << value_or_unknown(read_group.device_id) << "\t";
rg << "DT:" << value_or_unknown(read_group.exp_start_time) << "\t";
rg << "PL:"
<< "ONT"
<< "\t";
rg << "DS:"
<< "basecall_model=" << value_or_unknown(read_group.basecalling_model)
<< (read_group.modbase_models.empty()
? ""
: (" modbase_models=" + read_group.modbase_models))
<< " runid=" << value_or_unknown(read_group.run_id) << "\t";
rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t";
rg << "SM:" << value_or_unknown(read_group.sample_id);
}
return std::move(rg).str();
};

auto emit_read_group = [hdr](const std::string& read_group_line, const std::string& id,
const std::string& additional_tags) {
auto line = "@RG\tID:" + id + '\t' + read_group_line + additional_tags + '\n';
sam_hdr_add_lines(hdr, line.c_str(), 0);
};

// Emit read group headers without a barcode arrangement.
for (const auto& read_group : read_groups) {
const std::string read_group_tags = to_string(read_group.second);
emit_read_group(read_group_tags, read_group.first, {});
const std::string read_group_tags = read_group_to_string(read_group.second);
emit_read_group(hdr, read_group_tags, read_group.first, {});
}

// Emit read group headers for each barcode arrangement.
Expand All @@ -93,31 +122,32 @@ void add_rg_hdr(sam_hdr_t* hdr,
" is not a valid barcode kit name. Please run the help "
"command to find out available barcode kits.");
}
const auto& kit_info = kit_iter->second;
for (const auto& barcode_name : kit_info.barcodes) {
const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name);
const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name);
for (const auto& read_group : read_groups) {
std::string alias;
auto id = read_group.first + '_';
if (sample_sheet) {
if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) {
continue;
}

alias = sample_sheet->get_alias(
read_group.second.flowcell_id, read_group.second.position_id,
read_group.second.experiment_id, normalized_barcode_name);
}
if (!alias.empty()) {
id += alias;
} else {
id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name);
}
const std::string read_group_tags = to_string(read_group.second);
emit_read_group(read_group_tags, id, additional_tags);
}
}
emit_read_groups_for_barcode_kit(read_groups, kit_iter->second, sample_sheet);
//const auto& kit_info = kit_iter->second;
//for (const auto& barcode_name : kit_info.barcodes) {
// const auto additional_tags = "\tBC:" + barcode_sequences.at(barcode_name);
// const auto normalized_barcode_name = barcode_kits::normalize_barcode_name(barcode_name);
// for (const auto& read_group : read_groups) {
// std::string alias;
// auto id = read_group.first + '_';
// if (sample_sheet) {
// if (!sample_sheet->barcode_is_permitted(normalized_barcode_name)) {
// continue;
// }

// alias = sample_sheet->get_alias(
// read_group.second.flowcell_id, read_group.second.position_id,
// read_group.second.experiment_id, normalized_barcode_name);
// }
// if (!alias.empty()) {
// id += alias;
// } else {
// id += barcode_kits::generate_standard_barcode_name(kit_name, barcode_name);
// }
// const std::string read_group_tags = read_group_to_string(read_group.second);
// emit_read_group(hdr, read_group_tags, id, additional_tags);
// }
//}
}
}

Expand Down
1 change: 0 additions & 1 deletion dorado/utils/barcode_kits.h
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,4 @@ std::string barcode_kits_list_str();
std::string normalize_barcode_name(const std::string& barcode_name);
std::string generate_standard_barcode_name(const std::string& kit_name,
const std::string& barcode_name);

} // namespace dorado::barcode_kits
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "parse_custom_kit.h"

#include "utils/bam_utils.h"
//#include "utils/bam_utils.h"
//#include "utils/types.h"
#include "utils/barcode_kits.h"

#include <htslib/sam.h>
Expand All @@ -11,7 +12,7 @@
#include <string>
#include <vector>

namespace dorado::demux {
namespace dorado::barcode_kits {

bool check_normalized_id_pattern(const std::string& pattern) {
auto modulo_pos = pattern.find_first_of("%");
Expand Down Expand Up @@ -138,27 +139,4 @@ BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file
return params;
}

std::unordered_map<std::string, std::string> parse_custom_sequences(
const std::string& sequences_file) {
auto file = hts_open(sequences_file.c_str(), "r");
BamPtr record;
record.reset(bam_init1());

std::unordered_map<std::string, std::string> sequences;

int sam_ret_val = 0;
while ((sam_ret_val = sam_read1(file, nullptr, record.get())) != -1) {
if (sam_ret_val < -1) {
throw std::runtime_error("Failed to parse custom sequence file " + sequences_file);
}
std::string qname = bam_get_qname(record.get());
std::string seq = utils::extract_sequence(record.get());
sequences[qname] = seq;
}

hts_close(file);

return sequences;
}

} // namespace dorado::demux
} // namespace dorado::barcode_kits
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#include <string>
#include <unordered_map>

namespace dorado::demux {
namespace dorado::barcode_kits {

struct BarcodeKitScoringParams {
float min_soft_barcode_threshold = 0.7f;
Expand All @@ -19,11 +19,8 @@ struct BarcodeKitScoringParams {
std::optional<std::pair<std::string, barcode_kits::KitInfo>> parse_custom_arrangement(
const std::string& arrangement_file);

std::unordered_map<std::string, std::string> parse_custom_sequences(
const std::string& sequences_file);

BarcodeKitScoringParams parse_scoring_params(const std::string& arrangement_file);

bool check_normalized_id_pattern(const std::string& pattern);

} // namespace dorado::demux
} // namespace dorado::barcode_kits
Loading

0 comments on commit f37cd6a

Please sign in to comment.