Skip to content

Commit

Permalink
Merge branch 'rh/disable_mux_change_trim_r9' into 'master'
Browse files Browse the repository at this point in the history
DOR-607 Disable mux change trimming on r9.4.1 data

See merge request machine-learning/dorado!899
  • Loading branch information
tijyojwad committed Mar 20, 2024
2 parents 9b49ae5 + 3b2c0c3 commit ff330f6
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 2 deletions.
13 changes: 13 additions & 0 deletions dorado/data_loader/DataLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "models/kits.h"
#include "models/models.h"
#include "read_pipeline/ReadPipeline.h"
#include "read_pipeline/messages.h"
#include "utils/PostCondition.h"
#include "utils/time_utils.h"
#include "utils/types.h"
Expand Down Expand Up @@ -229,6 +230,7 @@ SimplexReadPtr process_pod5_read(
// used has a rapid adapter and which one.
const auto condition_info = models::ConditionInfo(get_chemistry_key(run_info_data));
new_read->read_common.rapid_chemistry = condition_info.rapid_chemistry();
new_read->read_common.chemistry = condition_info.chemistry();

pod5_end_reason_t end_reason_value{POD5_END_REASON_UNKNOWN};
char end_reason_string_value[200];
Expand Down Expand Up @@ -853,6 +855,7 @@ void DataLoader::load_pod5_reads_from_file_by_read_ids(const std::string& path,

for (auto& v : futures) {
auto read = v.get();
check_read(read);
m_pipeline.push_message(std::move(read));
m_loaded_read_count++;
}
Expand Down Expand Up @@ -911,6 +914,7 @@ void DataLoader::load_pod5_reads_from_file(const std::string& path) {

for (auto& v : futures) {
auto read = v.get();
check_read(read);
m_pipeline.push_message(std::move(read));
m_loaded_read_count++;
}
Expand Down Expand Up @@ -1023,6 +1027,15 @@ void DataLoader::load_fast5_reads_from_file(const std::string& path) {
}
}

// Inspects a freshly loaded read and logs a warning when its sequencing
// chemistry is models::Chemistry::UNKNOWN. The warning is gated by the
// m_log_unknown_chemistry flag so that it is not repeated for every read.
void DataLoader::check_read(const SimplexReadPtr& read) {
    // Nothing to report when the chemistry was recognised; the flag is left untouched.
    const bool chemistry_is_unknown =
            read->read_common.chemistry == models::Chemistry::UNKNOWN;
    if (!chemistry_is_unknown) {
        return;
    }

    // exchange(false) atomically clears the flag and hands back its previous value,
    // so only the caller that observed `true` goes on to emit the warning.
    const bool should_warn = m_log_unknown_chemistry.exchange(false);
    if (!should_warn) {
        return;
    }

    spdlog::warn(
            "Could not determine sequencing Chemistry from read data - "
            "some features might be disabled");
}

DataLoader::DataLoader(Pipeline& pipeline,
const std::string& device,
size_t num_worker_threads,
Expand Down
7 changes: 7 additions & 0 deletions dorado/data_loader/DataLoader.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#pragma once
#include "models/models.h"
#include "read_pipeline/messages.h"
#include "utils/stats.h"
#include "utils/types.h"

Expand Down Expand Up @@ -75,6 +76,7 @@ class DataLoader {
void load_pod5_reads_from_file_by_read_ids(const std::string& path,
const std::vector<ReadID>& read_ids);
void load_read_channels(std::filesystem::path data_path, bool recursive_file_loading);

Pipeline& m_pipeline; // Where should the loaded reads go?
std::atomic<size_t> m_loaded_read_count{0};
std::string m_device;
Expand All @@ -87,6 +89,11 @@ class DataLoader {
std::unordered_map<int, std::vector<ReadSortInfo>> m_reads_by_channel;
std::unordered_map<std::string, size_t> m_read_id_to_index;
int m_max_channel{0};

// Issue warnings if read is potentially problematic
inline void check_read(const SimplexReadPtr& read);
// A flag to warn only once if the data chemistry is unknown
std::atomic<bool> m_log_unknown_chemistry{true};
};

} // namespace dorado
9 changes: 7 additions & 2 deletions dorado/read_pipeline/BasecallerNode.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

#include "basecall/CRFModelConfig.h"
#include "basecall/ModelRunnerBase.h"
#include "models/kits.h"
#include "read_utils.h"
#include "stitch.h"
#include "utils/stats.h"
Expand Down Expand Up @@ -204,8 +205,12 @@ void BasecallerNode::working_reads_manager() {
// Chunks have ownership of the working read, so destroy them to avoid a leak.
working_read->called_chunks.clear();

// Trim reads which are affected by mux change and unblocking
utils::mux_change_trim_read(read_common_data);
// Do not trim R9.4.1 data to avoid changes to legacy products
// Check here to avoid adding models lib as a dependency of utils
if (read_common_data.chemistry != models::Chemistry::DNA_R9_4_1_E8) {
// Trim reads which are affected by mux change and unblocking
utils::mux_change_trim_read(read_common_data);
}

// Cleanup the working read.
{
Expand Down
2 changes: 2 additions & 0 deletions dorado/read_pipeline/messages.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,8 @@ class ReadCommon {
// `True` if the basecall model is an RNA model
bool is_rna_model{false};

// The chemistry type if any
models::Chemistry chemistry{models::Chemistry::UNKNOWN};
// The rapid chemistry adapter type if any - sourced from the read info data
models::RapidChemistry rapid_chemistry{models::RapidChemistry::UNKNOWN};

Expand Down

0 comments on commit ff330f6

Please sign in to comment.