From 7206b6843b28b001e0ab9a5dac5a8a08402d8a77 Mon Sep 17 00:00:00 2001 From: Sergey Koren Date: Wed, 14 Aug 2024 19:42:32 -0400 Subject: [PATCH] update to allow 0-node start/end bubble pop --- src/scripts/pop_bubbles_coverage_based.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/src/scripts/pop_bubbles_coverage_based.py b/src/scripts/pop_bubbles_coverage_based.py index 9233bb8a..0eb80bcd 100755 --- a/src/scripts/pop_bubbles_coverage_based.py +++ b/src/scripts/pop_bubbles_coverage_based.py @@ -115,8 +115,8 @@ def pop_bubble(start, end, removed_nodes, removed_edges, edges, coverage, nodele stack.append((edge, top, min(pathwidth, coverage.get(edge[1:], 0)))) assert end in predecessor #sys.stderr.write("Processing bubble from %s to %s and conservative mode is %s\n"%(start, end, conservative)) - # TODO: when our coverage is more trustworthy (e.g. low coverage isn't due to no unique nodes in a path, we can run this in conservative popping mode to remove noise, until then do nothing in these bubbles - if (conservative == True and len(bubble_nodes) > max_bubble_pop_size / 2) or len(bubble_nodes) > max_bubble_pop_size: + # TODO: when our coverage is more trustworthy (e.g. low coverage isn't due to no unique nodes in a path, we can run this in conservative popping mode to remove noise, until then do nothing in these bubbles, also do nothign for 3-node bubbles in high coverage + if (conservative == True and (len(bubble_nodes) == 3 or len(bubble_nodes) > max_bubble_pop_size / 2)) or len(bubble_nodes) > max_bubble_pop_size: #sys.stderr.write("Error bubble beteween %s and %s is not poppable because it its size of %s is larger than max %s\n"%(start, end, len(bubble_nodes), max_bubble_pop_size)) return @@ -142,6 +142,8 @@ def pop_bubble(start, end, removed_nodes, removed_edges, edges, coverage, nodele comp_coverage = find_component_coverage(start, belongs_to_component, component_coverage_sum, component_length_sum) if conservative == True: max_poppable_coverage = 0.25 * comp_coverage + elif start[1:] not in coverage or end[1:] not in coverage: + max_poppable_coverage = 0.25 * comp_coverage elif len(bubble_nodes) == 3: max_poppable_coverage = 0.5*comp_coverage elif start[1:] in coverage and end[1:] in coverage and nodelens[start[1:]] > max_poppable_node_size and nodelens[end[1:]] > max_poppable_node_size and coverage[start[1:]] <= 1.5*comp_coverage and coverage[start[1:]] >= 0.5 * comp_coverage and coverage[end[1:]] <= 1.5*comp_coverage and coverage[end[1:]] >= 0.5*comp_coverage: @@ -176,6 +178,7 @@ def pop_bubble(start, end, removed_nodes, removed_edges, edges, coverage, nodele def try_pop_tip(start, edges, coverage, removed_nodes, removed_edges, max_removable, nodelens): if start not in edges: return + #sys.stderr.write("Processing pop tip from node %s with edges %s max removable is %s\n"%(start, edges[start], max_removable)) if len(edges[start]) < 2 or len(edges[start]) > 4: return max_coverage = 0 max_len = 0 @@ -278,14 +281,17 @@ def try_pop_tip(start, edges, coverage, removed_nodes, removed_edges, max_remova if node in coverage: chain_length_sum[node] = nodelens[node] chain_coverage_sum[node] = coverage[node] * nodelens[node] + else: + chain_length_sum[node] = nodelens[node] + chain_coverage_sum[node] = 0 possible_merges = [] for edge in edges: bubble = find_bubble(edge, edges, nodelens, max_poppable_node_size * 5) if not bubble: continue - if bubble[0][1:] not in coverage or bubble[1][1:] not in coverage: continue - possible_merges.append((bubble[0][1:], bubble[1][1:], max(coverage[bubble[0][1:]] / coverage[bubble[1][1:]], coverage[bubble[1][1:]] / coverage[bubble[0][1:]]))) + cov = max(coverage[bubble[0][1:]] / coverage[bubble[1][1:]], coverage[bubble[1][1:]] / coverage[bubble[0][1:]]) if bubble[0][1:] in coverage and bubble[1][1:] in coverage else 0 + possible_merges.append((bubble[0][1:], bubble[1][1:], cov)) possible_merges.sort(key=lambda x: x[2]) while True: @@ -353,9 +359,11 @@ def try_pop_tip(start, edges, coverage, removed_nodes, removed_edges, max_remova chain_coverage = chain_coverage_sum[key] / chain_length_sum[key] if not bubble: + #sys.stderr.write("Pop tip called with chain cov %s and comp %s\n"%(chain_coverage, comp_coverage)) try_pop_tip(">" + node, edges, coverage, removed_nodes, removed_edges, comp_coverage * 0.75 if chain_coverage <= comp_coverage * 1.5 else comp_coverage * 0.25, nodelens) bubble = find_bubble("<" + node, edges, nodelens, max_poppable_node_size) if not bubble: + #sys.stderr.write("Pop tip reverse called with chain cov %s and comp%s\n"%(chain_coverage, comp_coverage)) try_pop_tip("<" + node, edges, coverage, removed_nodes, removed_edges, comp_coverage * 0.75 if chain_coverage <= comp_coverage * 1.5 else comp_coverage * 0.25, nodelens) for node in removed_nodes: