Skip to content

Commit

Permalink
Merge pull request #565 from pangenome/odgi_paths_seq_classification
Browse files Browse the repository at this point in the history
`odgi paths`: private/core/accessory sequence classification
  • Loading branch information
AndreaGuarracino committed Mar 28, 2024
2 parents 824a168 + 4faadf4 commit aa08eae
Show file tree
Hide file tree
Showing 3 changed files with 271 additions and 7 deletions.
254 changes: 247 additions & 7 deletions src/subcommand/paths_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,23 @@ namespace odgi {

using namespace odgi::subcommand;


bool isNumberAndLessThanOrEqualToX(const std::string& s, double x) {
char* end = nullptr;
double val = strtod(s.c_str(), &end);
return end != s.c_str() && *end == '\0' && val <= x;
}

bool isValid(const std::string& levels, double max_val) {
const std::vector<string> levels_vector = split(levels, ',');
for (auto l : levels_vector) {
if (!isNumberAndLessThanOrEqualToX(l, max_val)) {
return false;
}
}
return true;
}

int main_paths(int argc, char** argv) {

// trick argumentparser to do the right thing with the subcommand
Expand Down Expand Up @@ -40,16 +57,27 @@ int main_paths(int argc, char** argv) {
" *path.name*, *path.length*, *path.step.count*, *node.1*,"
" *node.2*, *node.n*. Each path entry is printed in its own line.", {'H', "haplotypes"});
args::Flag scale_by_node_length(path_investigation_opts, "haplo", "Scale the haplotype matrix cells by node length.", {'N', "scale-by-node-len"});
args::ValueFlag<std::string> path_delim(path_investigation_opts, "CHAR", "The part of each path name before this delimiter CHAR is a group"
" identifier. For use with -H, --haplotypes**: it prints an additional, first column **group.name** to stdout.",

args::Group non_ref_opts(parser, "[ Non-ref. Sequence Options ]");
args::ValueFlag<std::string> non_reference_nodes(non_ref_opts, "FILE", "Print to stdout IDs of nodes that are not in the paths listed (by line) in *FILE*.", {"non-reference-nodes"});
args::ValueFlag<std::string> non_reference_ranges(non_ref_opts, "FILE", "Print to stdout (in BED format) path ranges that are not in the paths listed (by line) in *FILE*.", {"non-reference-ranges"});

args::Group seq_type_opts(parser, "[ Sequence Type Options ]");
args::ValueFlag<std::string> coverage_levels(seq_type_opts, "c1,c2,...,cN", "List of coverage thresholds (number of paths that pass through the node). Print to stdout a TAB-separated table with node ID, node length, and class.", {"coverage-levels"});
args::ValueFlag<std::string> fraction_levels(seq_type_opts, "f1,f2,...,fN", "List of fraction thresholds (fraction of paths that pass through the node). Print to stdout a TAB-separated table with node ID , node length, and class.')]", {"fraction-levels"});
args::Flag path_range_class(seq_type_opts, "N", "Instead of node information, print to stdout a TAB-separated table with path range and class.", {"path-range-class"});

args::Group common_opts(parser, "[ Common Options ]");
args::ValueFlag<uint64_t> min_size(common_opts, "N", "Minimum size (in bps) of nodes (with --non-reference-nodes or --coverage-levels/fraction-levels) or ranges (with --non-reference-ranges or --coverage-levels/fraction-levels together with --path-range-class).", {"min-size"});
args::Flag show_step_ranges(common_opts, "N", "Show steps (that is, node IDs and strands) (with --non-reference-ranges or --coverage-levels/fraction-levels together with --path-range-class).", {"show-step-ranges"});
args::ValueFlag<std::string> path_delim(common_opts, "CHAR", "The part of each path name before this delimiter CHAR is a group"
" identifier. For use with -H/--haplotypes, --non-reference-ranges or --coverage-levels/fraction-levels. "
"With -H/--haplotypes it prints an additional, first column **group.name** to stdout.",
{'D', "delim"});
args::ValueFlag<std::uint16_t> path_delim_pos(path_investigation_opts, "N", "Consider the N-th occurrence of the delimiter specified with **-D, --delim**"
args::ValueFlag<std::uint16_t> path_delim_pos(common_opts, "N", "Consider the N-th occurrence of the delimiter specified with **-D, --delim**"
" to obtain the group identifier. Specify 1 for the 1st occurrence (default).",
{'p', "delim-pos"});
args::ValueFlag<std::string> non_reference_nodes(path_investigation_opts, "FILE", "Print to stdout IDs of nodes that are not in the paths listed (by line) in *FILE*.", {"non-reference-nodes"});
args::ValueFlag<std::string> non_reference_ranges(path_investigation_opts, "FILE", "Print to stdout (in BED format) path ranges that are not in the paths listed (by line) in *FILE*.", {"non-reference-ranges"});
args::ValueFlag<uint64_t> min_size(path_investigation_opts, "N", "Minimum size (in bps) of nodes (with --non-reference-nodes) or ranges (with --non-reference-ranges).", {"min-size"});
args::Flag show_step_ranges(path_investigation_opts, "N", "Show steps (that is, node IDs and strands) of --non-reference-ranges.", {"show-step-ranges"});

args::Group path_modification_opts(parser, "[ Path Modification Options ]");
args::ValueFlag<std::string> keep_paths_file(path_modification_opts, "FILE", "Keep paths listed (by line) in *FILE*.", {'K', "keep-paths"});
args::ValueFlag<std::string> drop_paths_file(path_modification_opts, "FILE", "Drop paths listed (by line) in *FILE*.", {'X', "drop-paths"});
Expand Down Expand Up @@ -101,6 +129,26 @@ int main_paths(int argc, char** argv) {
return 1;
}

if (coverage_levels && fraction_levels) {
std::cerr << "[odgi::paths] error: specify --coverage-levels or --fraction-levels, not both." << std::endl;
return 1;
}

if ((non_reference_nodes || non_reference_ranges) && (coverage_levels || fraction_levels)) {
std::cerr << "[odgi::paths] error: specify only one of --non-reference-nodes, --non-reference-range, --coverage-levels or --fraction-levels." << std::endl;
return 1;
}

if (coverage_levels && !isValid(args::get(coverage_levels), std::numeric_limits<double>::max())) {
std::cerr << "[odgi::paths] error: invalid coverage levels. Only comma-separated numbers are accepted." << std::endl;
return 1;
}

if (fraction_levels && !isValid(args::get(fraction_levels), 1.0)) {
std::cerr << "[odgi::paths] error: invalid fraction levels. Only comma-separated numbers <= 1.0 are accepted." << std::endl;
return 1;
}

const uint64_t num_threads = args::get(threads) ? args::get(threads) : 1;
omp_set_num_threads(num_threads);

Expand Down Expand Up @@ -531,6 +579,198 @@ int main_paths(int argc, char** argv) {
}
}

if (coverage_levels || fraction_levels) {
char delim = '\0';
if (!args::get(path_delim).empty()) {
delim = args::get(path_delim).at(0);
}

const uint64_t min_size_in_bp = min_size ? args::get(min_size) : 0;

// Read levels and sort them
std::vector<double> sorted_levels;
{
const std::string levels = coverage_levels ? args::get(coverage_levels) : args::get(fraction_levels);
const std::vector<string> levels_vector = split(levels, ',');
for (auto l : levels_vector) {
sorted_levels.push_back(std::stod(l));
}
std::sort(sorted_levels.begin(), sorted_levels.end());
// Duplicate the first value by inserting it at the beginning (this to manage values < min_value)
sorted_levels.insert(sorted_levels.begin(), sorted_levels.front());
}

std::vector<ska::flat_hash_set<handle_t>> set_of_handles_for_level(sorted_levels.size());

ska::flat_hash_map<path_handle_t, std::string> phandle_2_name;
ska::flat_hash_set<std::string> sample_names;
graph.for_each_path_handle([&](const path_handle_t &path) {
const std::string path_name = graph.get_path_name(path);

std::string sample_name;
if (delim) {
const std::pair<int32_t, int32_t> cnt_pos = group_identified_pos(path_name, delim, delim_pos);
if (cnt_pos.first < 0) {
std::cerr << "[odgi::paths] error: path name '" << path_name << "' has not occurrences of '" << delim << "'." << std::endl;
exit(-1);
} else if (cnt_pos.first != delim_pos) {
std::cerr << "[odgi::paths] warning: path name '" << path_name << "' has too few occurrences of '" << delim << "'. "
<< "The " << cnt_pos.first + 1 << "-th occurrence is used." << std::endl;
}
sample_name = path_name.substr(0, cnt_pos.second);
} else {
sample_name = path_name;
}

phandle_2_name[path] = sample_name;
sample_names.insert(sample_name);
});

// Classify nodes in parallel
graph.for_each_handle([&](const handle_t &h) {
const uint64_t hl = graph.get_length(h);
ska::flat_hash_set<std::string> samples;
graph.for_each_step_on_handle(h, [&](const step_handle_t &occ) {
const path_handle_t p_h = graph.get_path_handle_of_step(occ);
samples.insert(phandle_2_name[p_h]);
});
// Number of samples or fraction of samples
const double value = coverage_levels ? samples.size() : (double) samples.size() / (double) sample_names.size();

for (int64_t i = sorted_levels.size() - 1; i >= 0; --i) {
// if i == 0, then value < min_level
if (i == 0 || value >= sorted_levels[i]) {
#pragma omp critical (set_of_handles_for_level)
set_of_handles_for_level[i].insert(h); // Save the forward handle
// #pragma omp critical (cout)
// std::cout << graph.get_id(h) << "\t" << hl << "\t" << value << "\t" << ">= " << sorted_levels[i] << std::endl;
break;
}
}
}, true);

const std::string symbol = coverage_levels ? "c" : "f";

if (args::get(path_range_class)) {
const bool _show_step_ranges = args::get(show_step_ranges);

// Traverse all paths to emit classified path ranges
if (_show_step_ranges) {
std::cout << "#path.name\tstart\tend\tclass\tsteps" << std::endl;
} else {
std::cout << "#path.name\tstart\tend\tclass" << std::endl;
}
std::vector<path_handle_t> all_paths;
graph.for_each_path_handle([&all_paths](const path_handle_t& path) {
all_paths.push_back(path);
});

#pragma omp parallel for schedule(dynamic, 1)
for (auto& path : all_paths) {
uint64_t start = 0, end = 0;
int64_t last_class = -1;
std::vector<step_handle_t> step_range;
graph.for_each_step_in_path(path, [&](const step_handle_t& step) {
handle_t handle = graph.get_handle_of_step(step);
if (graph.get_is_reverse(handle)) {
handle = graph.flip(handle); // We saved the forward handle
}

// Check the class of the node
int64_t current_class = -1;
for (uint64_t i = 0; i < set_of_handles_for_level.size(); ++i) {
if (set_of_handles_for_level[i].count(handle)) {
current_class = i;
break;
}
}

if (last_class != -1 && (last_class != current_class)) {
// Emit the previous range, if any
if (end > start && (end - start) >= min_size_in_bp) {
std::string seq_class;
if (last_class == 0){
seq_class = symbol + "<" + utils::to_string_custom(sorted_levels[last_class]);
} else if (last_class == set_of_handles_for_level.size() - 1){
seq_class = symbol + ">=" + utils::to_string_custom(sorted_levels[last_class]);
} else {
seq_class = utils::to_string_custom(sorted_levels[last_class]) + "<=" + symbol + "<" + utils::to_string_custom(sorted_levels[last_class + 1]);
}
if (_show_step_ranges) {
std::string step_range_str = "";
for (auto& step : step_range) {
const handle_t handle = graph.get_handle_of_step(step);
step_range_str += std::to_string(graph.get_id(handle)) + (graph.get_is_reverse(handle) ? "-" : "+") + ",";
}

#pragma omp critical (cout)
std::cout << graph.get_path_name(path) << "\t" << start << "\t" << end << "\t" << seq_class << "\t" << step_range_str.substr(0, step_range_str.size() - 1) << std::endl; // trim the trailing comma from step_range
} else {
#pragma omp critical (cout)
std::cout << graph.get_path_name(path) << "\t" << start << "\t" << end << "\t" << seq_class << std::endl;
}
}
start = end;
end += graph.get_length(handle);
if (_show_step_ranges) {
step_range.clear();
}
} else {
end += graph.get_length(handle);
}
if (_show_step_ranges) {
step_range.push_back(step);
}
last_class = current_class;
});

// Emit last range, if any
if (end > start && (end - start) >= min_size_in_bp) {
std::string seq_class;
if (last_class == 0){
seq_class = symbol + "<" + utils::to_string_custom(sorted_levels[last_class]);
} else if (last_class == set_of_handles_for_level.size() - 1){
seq_class = symbol + ">=" + utils::to_string_custom(sorted_levels[last_class]);
} else {
seq_class = utils::to_string_custom(sorted_levels[last_class]) + "<=" + symbol + "<" + utils::to_string_custom(sorted_levels[last_class + 1]);
}
if (_show_step_ranges) {
std::string step_range_str = "";
for (auto& step : step_range) {
const handle_t handle = graph.get_handle_of_step(step);
step_range_str += std::to_string(graph.get_id(handle)) + (graph.get_is_reverse(handle) ? "-" : "+") + ",";
}

#pragma omp critical (cout)
std::cout << graph.get_path_name(path) << "\t" << start << "\t" << end << "\t" << seq_class << "\t" << step_range_str.substr(0, step_range_str.size() - 1) << std::endl; // trim the trailing comma from step_range
} else {
#pragma omp critical (cout)
std::cout << graph.get_path_name(path) << "\t" << start << "\t" << end << "\t" << seq_class << std::endl;
}
}
}
} else {
std::cout << "#node.id\tnode.len\tclass" << std::endl;
for (uint64_t i = 0; i < set_of_handles_for_level.size(); ++i) {
for (auto h : set_of_handles_for_level[i]) {
const uint64_t hl = graph.get_length(h);

if (hl >= min_size_in_bp) {
std::string seq_class;
if (i == 0){
seq_class = symbol + "<" + utils::to_string_custom(sorted_levels[i]);
} else if (i == set_of_handles_for_level.size() - 1){
seq_class = symbol + ">=" + utils::to_string_custom(sorted_levels[i]);
} else {
seq_class = utils::to_string_custom(sorted_levels[i]) + "<=" + symbol + "<" + utils::to_string_custom(sorted_levels[i + 1]);
}
std::cout << graph.get_id(h) << "\t" << hl << "\t" << seq_class << std::endl;
}
}
}
}
}

return 0;
}

Expand Down
23 changes: 23 additions & 0 deletions src/utils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,29 @@ namespace utils {
return !s.empty() && std::all_of(s.begin(), s.end(), ::isdigit);
}

std::string to_string_custom(double value) {
std::ostringstream out;

// Start with fixed-point notation.
out << std::fixed << std::setprecision(5) << value;

std::string str = out.str();

// Check if there is a decimal point in the string.
size_t decimal_pos = str.find('.');
if (decimal_pos != std::string::npos) {
// Remove trailing zeros.
str.erase(str.find_last_not_of('0') + 1, std::string::npos);

// If the decimal point is now the last character, remove it too.
if (str.back() == '.') {
str.erase(str.length() - 1);
}
}

return str;
}

void graph_deep_copy(const odgi::graph_t& source,
odgi::graph_t* target) {
// copy the now-compacted graph to our output_graph
Expand Down
1 change: 1 addition & 0 deletions src/utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ using namespace handlegraph;

namespace utils {
bool is_number(const std::string &s);
std::string to_string_custom(double value);
void graph_deep_copy(const odgi::graph_t &source,
odgi::graph_t* target);
bool ends_with(const std::string &fullString, const std::string &ending);
Expand Down

0 comments on commit aa08eae

Please sign in to comment.