From ef83051324ddd12336ee770fa47464d8057c0d7c Mon Sep 17 00:00:00 2001
From: Joyjit Daw
Date: Tue, 12 Dec 2023 14:17:31 -0500
Subject: [PATCH] ensure all fast5s are removed from mixed dataset

---
 dorado/data_loader/DataLoader.cpp | 69 +++++++++++++++++++++----------
 1 file changed, 48 insertions(+), 21 deletions(-)

diff --git a/dorado/data_loader/DataLoader.cpp b/dorado/data_loader/DataLoader.cpp
index baa9b81b..a82bebbf 100644
--- a/dorado/data_loader/DataLoader.cpp
+++ b/dorado/data_loader/DataLoader.cpp
@@ -102,6 +102,50 @@ std::string get_string_attribute(const HighFive::Group& group, const std::string
     }
     return attribute_string;
 }
+
+std::vector<std::filesystem::directory_entry> filter_fast5_for_mixed_datasets(
+        const std::vector<std::filesystem::directory_entry>& files) {
+    std::vector<std::filesystem::directory_entry> pod5_entries;
+    std::vector<std::filesystem::directory_entry> fast5_entries;
+
+    bool issued_fast5_warn = false;
+
+    for (const auto& entry : files) {
+        auto entry_path = std::filesystem::path(entry);
+        std::string ext = entry_path.extension().string();
+        std::transform(ext.begin(), ext.end(), ext.begin(),
+                       [](unsigned char c) { return std::tolower(c); });
+        if (ext == ".fast5") {
+            if (!issued_fast5_warn) {
+                spdlog::warn(
+                        "FAST5 support is unoptimized and will result in poor performance. "
+                        "Please convert your dataset to POD5: "
+                        "https://pod5-file-format.readthedocs.io/en/latest/docs/"
+                        "tools.html#pod5-convert-fast5");
+                issued_fast5_warn = true;
+            }
+
+            fast5_entries.push_back(entry);
+        } else if (ext == ".pod5") {
+            pod5_entries.push_back(entry);
+        }
+    }
+
+    if (pod5_entries.empty()) {
+        return fast5_entries;
+    } else if (!pod5_entries.empty() && !fast5_entries.empty()) {
+        for (const auto& f5 : fast5_entries) {
+            spdlog::warn(
+                    "Data folder contains both POD5 and FAST5 files. Please basecall "
+                    "FAST5 separately. Skipping FAST5 "
+                    "file from {}.",
+                    f5.path().string());
+        }
+    }
+
+    return pod5_entries;
+}
+
 }  // namespace
 
 namespace dorado {
@@ -295,8 +339,6 @@ void DataLoader::load_reads(const std::string& path,
             break;
         case ReadOrder::UNRESTRICTED:
             for (const auto& entry : iterator) {
-                static bool issued_fast5_warn = false;
-                static bool pod5_seen = false;
                 if (m_loaded_read_count == m_max_reads) {
                     break;
                 }
@@ -304,26 +346,9 @@ void DataLoader::load_reads(const std::string& path,
                 std::transform(ext.begin(), ext.end(), ext.begin(),
                                [](unsigned char c) { return std::tolower(c); });
                 if (ext == ".fast5") {
-                    if (!issued_fast5_warn) {
-                        spdlog::warn(
-                                "FAST5 support is unoptimized and will result in poor performance. "
-                                "Please convert your dataset to POD5: "
-                                "https://pod5-file-format.readthedocs.io/en/latest/docs/"
-                                "tools.html#pod5-convert-fast5");
-                        issued_fast5_warn = true;
-                    }
-                    if (pod5_seen) {
-                        spdlog::warn(
-                                "Data folder contains both POD5 and FAST5 files. Please basecall "
-                                "FAST5 separately. Skipping FAST5 "
-                                "file from {}.",
-                                entry.path().string());
-                    } else {
-                        load_fast5_reads_from_file(entry.path().string());
-                    }
+                    load_fast5_reads_from_file(entry.path().string());
                 } else if (ext == ".pod5") {
                     load_pod5_reads_from_file(entry.path().string());
-                    pod5_seen = true;
                 }
             }
             break;
@@ -333,7 +358,9 @@ void DataLoader::load_reads(const std::string& path,
         }
     };
 
-    iterate_directory(fetch_directory_entries(path, recursive_file_loading));
+    auto filtered_entries =
+            filter_fast5_for_mixed_datasets(fetch_directory_entries(path, recursive_file_loading));
+    iterate_directory(filtered_entries);
 }
 
 int DataLoader::get_num_reads(std::string data_path,