Skip to content

Commit

Permalink
Merge branch 'DOR-397_modbase_name_in_bams' into 'master'
Browse files Browse the repository at this point in the history
DOR-397 Added modbase model name to BAM files in RG header section.

Closes DOR-397

See merge request machine-learning/dorado!722
  • Loading branch information
iiSeymour committed Nov 24, 2023
2 parents 4102ffc + 936f6e2 commit f0ac935
Show file tree
Hide file tree
Showing 11 changed files with 57 additions and 19 deletions.
20 changes: 10 additions & 10 deletions documentation/SAM.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@

#### Read Group Header

| | | |
| -- | -- | ----------------------------------------------------- |
| RG | ID | `<runid>_<basecalling_model>_<barcode_arrangement>` |
| | PU | `<flow_cell_id>` |
| | PM | `<device_id>` |
| | DT | `<exp_start_time>` |
| | PL | `ONT` |
| | DS | `basecall_model=<basecall_model_name> runid=<run_id>` |
| | LB | `<sample_id>` |
| | SM | `<sample_id>` |
| | | |
| -- | -- | ------------------------------------------------------------------------------------------ |
| RG | ID | `<runid>_<basecalling_model>_<barcode_arrangement>` |
| | PU | `<flow_cell_id>` |
| | PM | `<device_id>` |
| | DT | `<exp_start_time>` |
| | PL | `ONT` |
| | DS | `basecall_model=<basecall_model_name> modbase_models=<modbase_model_names> runid=<run_id>` |
| | LB | `<sample_id>` |
| | SM | `<sample_id>` |

#### Read Tags

Expand Down
4 changes: 3 additions & 1 deletion dorado/cli/basecaller.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,7 @@ void setup(std::vector<std::string> args,
bool estimate_poly_a) {
auto model_config = load_crf_model_config(model_path);
std::string model_name = models::extract_model_from_model_path(model_path.string());
std::string modbase_model_names = models::extract_model_from_model_paths(remora_models);

if (!DataLoader::is_read_data_present(data_path, recursive_file_loading)) {
std::string err = "No POD5 or FAST5 data found in path: " + data_path;
Expand Down Expand Up @@ -112,7 +113,8 @@ void setup(std::vector<std::string> args,
auto [runners, num_devices] =
create_basecall_runners(model_config, device, num_runners, 0, batch_size, chunk_size);

auto read_groups = DataLoader::load_read_groups(data_path, model_name, recursive_file_loading);
auto read_groups = DataLoader::load_read_groups(data_path, model_name, modbase_model_names,
recursive_file_loading);

bool duplex = false;

Expand Down
8 changes: 5 additions & 3 deletions dorado/cli/duplex.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -311,9 +311,11 @@ int duplex(int argc, char* argv[]) {

// Write read group info to header.
auto duplex_rg_name = std::string(model + "_" + stereo_model_name);
auto read_groups = DataLoader::load_read_groups(reads, model, recursive_file_loading);
read_groups.merge(
DataLoader::load_read_groups(reads, duplex_rg_name, recursive_file_loading));
// TODO: supply modbase model names once duplex modbase is complete
auto read_groups =
DataLoader::load_read_groups(reads, model, "", recursive_file_loading);
read_groups.merge(DataLoader::load_read_groups(reads, duplex_rg_name, "",
recursive_file_loading));
std::vector<std::string> barcode_kits;
utils::add_rg_hdr(hdr.get(), read_groups, barcode_kits, nullptr);

Expand Down
2 changes: 2 additions & 0 deletions dorado/data_loader/DataLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -456,6 +456,7 @@ void DataLoader::load_read_channels(std::string data_path, bool recursive_file_l
std::unordered_map<std::string, ReadGroup> DataLoader::load_read_groups(
std::string data_path,
std::string model_path,
std::string modbase_model_names,
bool recursive_file_loading) {
std::unordered_map<std::string, ReadGroup> read_groups;

Expand Down Expand Up @@ -497,6 +498,7 @@ std::unordered_map<std::string, ReadGroup> DataLoader::load_read_groups(
read_groups[id] = ReadGroup{
run_id,
model_path,
modbase_model_names,
flowcell_id,
device_id,
utils::get_string_timestamp_from_unix_time(exp_start_time_ms),
Expand Down
1 change: 1 addition & 0 deletions dorado/data_loader/DataLoader.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ class DataLoader {
static std::unordered_map<std::string, ReadGroup> load_read_groups(
std::string data_path,
std::string model_path,
std::string modbase_model_names,
bool recursive_file_loading = false);

static int get_num_reads(
Expand Down
14 changes: 14 additions & 0 deletions dorado/models/models.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -529,4 +529,18 @@ std::string extract_model_from_model_path(const std::string& model_path) {
return std::filesystem::canonical(path).filename().string();
}

std::string extract_model_from_model_paths(const std::string& model_paths) {
std::string model_names;

std::istringstream stream{model_paths};
std::string model_path;
while (std::getline(stream, model_path, ',')) {
if (!model_names.empty()) {
model_names += ",";
}
model_names += models::extract_model_from_model_path(model_path);
}
return model_names;
}

} // namespace dorado::models
3 changes: 3 additions & 0 deletions dorado/models/models.h
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,7 @@ uint32_t get_mean_qscore_start_pos_by_model_name(const std::string& model_name);
// Extract the model name from the model path.
std::string extract_model_from_model_path(const std::string& model_path);

// Extract the model names as a comma seperated list from a comma seperated list of model paths.
std::string extract_model_from_model_paths(const std::string& model_paths);

} // namespace dorado::models
3 changes: 3 additions & 0 deletions dorado/utils/bam_utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,9 @@ void add_rg_hdr(sam_hdr_t* hdr,
<< "\t";
rg << "DS:"
<< "basecall_model=" << value_or_unknown(read_group.basecalling_model)
<< (read_group.modbase_models.empty()
? ""
: (" modbase_models=" + read_group.modbase_models))
<< " runid=" << value_or_unknown(read_group.run_id) << "\t";
rg << "LB:" << value_or_unknown(read_group.sample_id) << "\t";
rg << "SM:" << value_or_unknown(read_group.sample_id);
Expand Down
1 change: 1 addition & 0 deletions dorado/utils/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,7 @@ struct BarcodeScoreResult {
struct ReadGroup {
std::string run_id;
std::string basecalling_model;
std::string modbase_models;
std::string flowcell_id;
std::string device_id;
std::string exp_start_time;
Expand Down
8 changes: 4 additions & 4 deletions tests/BamUtilsTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -70,11 +70,11 @@ TEST_CASE("BamUtilsTest: add_rg_hdr read group headers", TEST_GROUP) {

const std::unordered_map<std::string, dorado::ReadGroup> read_groups{
{"id_0",
{"run_0", "basecalling_mod_0", "flowcell_0", "device_0", "exp_start_0", "sample_0", "",
""}},
{"run_0", "basecalling_model_0", "modbase_model_0", "flowcell_0", "device_0",
"exp_start_0", "sample_0", "", ""}},
{"id_1",
{"run_1", "basecalling_mod_1", "flowcell_1", "device_1", "exp_start_1", "sample_1", "",
""}},
{"run_1", "basecalling_model_1", "modbase_model_1", "flowcell_1", "device_1",
"exp_start_1", "sample_1", "", ""}},
};

SECTION("Read groups") {
Expand Down
12 changes: 11 additions & 1 deletion tests/test_simple_basecaller_execution.sh
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,17 @@ if ! uname -r | grep -q tegra; then
$dorado_bin basecaller ${model} $data_dir/pod5 -x cpu --modified-bases 5mCG_5hmCG > $output_dir/calls.bam
fi
samtools quickcheck -u $output_dir/calls.bam
samtools view $output_dir/calls.bam > $output_dir/calls.sam
samtools view -h $output_dir/calls.bam > $output_dir/calls.sam

# Check that the read group has the required model info in it's header
if ! grep -q "basecall_model=${model_name}" $output_dir/calls.sam; then
echo "Output SAM file does not contain basecall model name in header!"
exit 1
fi
if ! grep -q "modbase_models=${model_name}_5mCG_5hmCG" $output_dir/calls.sam; then
echo "Output SAM file does not contain modbase model name in header!"
exit 1
fi

set +e
if $dorado_bin basecaller ${model} $data_dir/pod5 -b ${batch} --emit-fastq --reference $output_dir/ref.fq > $output_dir/error_condition.fq; then
Expand Down

0 comments on commit f0ac935

Please sign in to comment.