Skip to content

Commit

Permalink
Merge branch 'jdaw/report-fast5-warning' into 'master'
Browse files: browse the repository at this point in the history
Add warning for FAST5 usage

See merge request machine-learning/dorado!760
  • Loading branch information
tijyojwad committed Dec 13, 2023
2 parents 6c984a0 + ef83051 commit e8b07e2
Showing 1 changed file with 47 additions and 1 deletion.
48 changes: 47 additions & 1 deletion dorado/data_loader/DataLoader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,50 @@ std::string get_string_attribute(const HighFive::Group& group, const std::string
}
return attribute_string;
}

// Chooses which of the given directory entries should be loaded when a data
// folder may contain both POD5 and FAST5 files.
//
// Entries are classified by file extension, compared case-insensitively;
// entries with any other extension are dropped. A one-time warning is logged
// the first time a FAST5 file is seen. If no POD5 file is present the FAST5
// entries are returned; otherwise only the POD5 entries are returned and a
// warning is logged for every FAST5 file that is being skipped.
//
// Relative input order is preserved within the returned list.
std::vector<std::filesystem::directory_entry> filter_fast5_for_mixed_datasets(
        const std::vector<std::filesystem::directory_entry>& files) {
    std::vector<std::filesystem::directory_entry> fast5_files;
    std::vector<std::filesystem::directory_entry> pod5_files;

    for (const auto& candidate : files) {
        // Lower-case the extension so that e.g. ".FAST5" and ".Pod5" match.
        std::string suffix = candidate.path().extension().string();
        for (char& ch : suffix) {
            ch = static_cast<char>(std::tolower(static_cast<unsigned char>(ch)));
        }

        if (suffix == ".pod5") {
            pod5_files.push_back(candidate);
            continue;
        }
        if (suffix != ".fast5") {
            // Neither POD5 nor FAST5: not a read file, ignore it.
            continue;
        }

        // An empty FAST5 list means this is the first FAST5 file encountered,
        // so the performance warning is emitted exactly once.
        if (fast5_files.empty()) {
            spdlog::warn(
                    "FAST5 support is unoptimized and will result in poor performance. "
                    "Please convert your dataset to POD5: "
                    "https://pod5-file-format.readthedocs.io/en/latest/docs/"
                    "tools.html#pod5-convert-fast5");
        }
        fast5_files.push_back(candidate);
    }

    // Pure FAST5 (or empty) dataset: nothing to filter out.
    if (pod5_files.empty()) {
        return fast5_files;
    }

    // Mixed dataset: POD5 wins, and each skipped FAST5 file is reported.
    for (const auto& skipped : fast5_files) {
        spdlog::warn(
                "Data folder contains both POD5 and FAST5 files. Please basecall "
                "FAST5 separately. Skipping FAST5 "
                "file from {}.",
                skipped.path().string());
    }

    return pod5_files;
}

} // namespace

namespace dorado {
Expand Down Expand Up @@ -314,7 +358,9 @@ void DataLoader::load_reads(const std::string& path,
}
};

iterate_directory(fetch_directory_entries(path, recursive_file_loading));
auto filtered_entries =
filter_fast5_for_mixed_datasets(fetch_directory_entries(path, recursive_file_loading));
iterate_directory(filtered_entries);
}

int DataLoader::get_num_reads(std::string data_path,
Expand Down

0 comments on commit e8b07e2

Please sign in to comment.