Skip to content

Commit

Permalink
ensure all fast5s are removed from mixed dataset
Browse files Browse the repository at this point in the history
  • Loading branch information
tijyojwad committed Dec 12, 2023
1 parent 9cc1544 commit ef83051
Showing 1 changed file with 48 additions and 21 deletions.
69 changes: 48 additions & 21 deletions dorado/data_loader/DataLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,50 @@ std::string get_string_attribute(const HighFive::Group& group, const std::string
}
return attribute_string;
}

// Splits `files` into POD5 and FAST5 entries, matched on the lower-cased file
// extension (entries with any other extension are dropped), and selects the
// set that should be loaded:
//   - no POD5 entries present: the FAST5 entries are returned (possibly empty);
//   - at least one POD5 entry: only the POD5 entries are returned, and one
//     warning is logged per FAST5 file that is being skipped.
// A single performance warning is logged the first time a FAST5 file is seen,
// whether or not the FAST5 files end up being skipped.
std::vector<std::filesystem::directory_entry> filter_fast5_for_mixed_datasets(
        const std::vector<std::filesystem::directory_entry>& files) {
    std::vector<std::filesystem::directory_entry> pod5_files;
    std::vector<std::filesystem::directory_entry> fast5_files;

    for (const auto& candidate : files) {
        // Compare extensions case-insensitively (e.g. ".FAST5" == ".fast5").
        std::string suffix = candidate.path().extension().string();
        std::transform(suffix.begin(), suffix.end(), suffix.begin(),
                       [](unsigned char ch) { return std::tolower(ch); });

        if (suffix == ".pod5") {
            pod5_files.push_back(candidate);
            continue;
        }
        if (suffix != ".fast5") {
            continue;
        }

        // An empty FAST5 list means this is the first FAST5 file encountered,
        // so the one-off performance warning has not been emitted yet.
        if (fast5_files.empty()) {
            spdlog::warn(
                    "FAST5 support is unoptimized and will result in poor performance. "
                    "Please convert your dataset to POD5: "
                    "https://pod5-file-format.readthedocs.io/en/latest/docs/"
                    "tools.html#pod5-convert-fast5");
        }
        fast5_files.push_back(candidate);
    }

    // Pure FAST5 (or empty) dataset: nothing needs to be filtered out.
    if (pod5_files.empty()) {
        return fast5_files;
    }

    // Mixed dataset: POD5 wins, and every dropped FAST5 file is reported.
    for (const auto& skipped : fast5_files) {
        spdlog::warn(
                "Data folder contains both POD5 and FAST5 files. Please basecall "
                "FAST5 separately. Skipping FAST5 "
                "file from {}.",
                skipped.path().string());
    }
    return pod5_files;
}

} // namespace

namespace dorado {
Expand Down Expand Up @@ -295,35 +339,16 @@ void DataLoader::load_reads(const std::string& path,
break;
case ReadOrder::UNRESTRICTED:
for (const auto& entry : iterator) {
static bool issued_fast5_warn = false;
static bool pod5_seen = false;
if (m_loaded_read_count == m_max_reads) {
break;
}
std::string ext = std::filesystem::path(entry).extension().string();
std::transform(ext.begin(), ext.end(), ext.begin(),
[](unsigned char c) { return std::tolower(c); });
if (ext == ".fast5") {
if (!issued_fast5_warn) {
spdlog::warn(
"FAST5 support is unoptimized and will result in poor performance. "
"Please convert your dataset to POD5: "
"https://pod5-file-format.readthedocs.io/en/latest/docs/"
"tools.html#pod5-convert-fast5");
issued_fast5_warn = true;
}
if (pod5_seen) {
spdlog::warn(
"Data folder contains both POD5 and FAST5 files. Please basecall "
"FAST5 separately. Skipping FAST5 "
"file from {}.",
entry.path().string());
} else {
load_fast5_reads_from_file(entry.path().string());
}
load_fast5_reads_from_file(entry.path().string());
} else if (ext == ".pod5") {
load_pod5_reads_from_file(entry.path().string());
pod5_seen = true;
}
}
break;
Expand All @@ -333,7 +358,9 @@ void DataLoader::load_reads(const std::string& path,
}
};

iterate_directory(fetch_directory_entries(path, recursive_file_loading));
auto filtered_entries =
filter_fast5_for_mixed_datasets(fetch_directory_entries(path, recursive_file_loading));
iterate_directory(filtered_entries);
}

int DataLoader::get_num_reads(std::string data_path,
Expand Down

0 comments on commit ef83051

Please sign in to comment.