## Problem 33: Reconstruct a String from its Paired Composition


#### String Reconstruction from Read-Pairs Problem
Reconstruct a string from its paired composition.

> Given: Integers k and d followed by a collection of paired k-mers PairedReads.

> Return: A string Text with (k, d)-mer composition equal to PairedReads. (If multiple answers exist, you may return any one.)


<br>

In [65]:
# ---- INPUT -----
# Sample dataset, kept as one multi-line string.
# Layout expected by the parsing cell: the first non-blank line is "k d",
# and every following non-blank line is one read pair written "LEFT|RIGHT".
# NOTE(review): the name `input` shadows Python's builtin input() for the
# rest of the session; the parsing cell reads it by this name, so renaming
# it would have to be done in both places at once.

input = """
4 2
GAGA|TTGA
TCGT|GATG
CGTG|ATGT
TGGT|TGAG
GTGA|TGTT
GTGG|GTGA
TGAG|GTTG
GGTC|GAGA
GTCG|AGAT
"""

In [66]:
# Parse the raw text: strip every line once, discard the blank ones,
# read the integers k and d from the header line, and keep each remaining
# line as one "LEFT|RIGHT" read pair.
lines = [row for row in map(str.strip, input.splitlines()) if row]
k, d = (int(token) for token in lines[0].split())

pairs = lines[1:]
print(k, d)
pairs

4 2


['GAGA|TTGA',
 'TCGT|GATG',
 'CGTG|ATGT',
 'TGGT|TGAG',
 'GTGA|TGTT',
 'GTGG|GTGA',
 'TGAG|GTTG',
 'GGTC|GAGA',
 'GTCG|AGAT']

In [67]:
# Build the paired de Bruijn graph as an adjacency list (dict of lists).
#   node format: "LLL|RRR" (each half has length k-1)
#   edge:        (prefix(L)|prefix(R)) -> (suffix(L)|suffix(R))
# Repeated read pairs produce repeated entries in the successor list,
# i.e. parallel edges are kept.

k1 = k - 1
adj = {}

for pr in pairs:
    L, R = pr.split("|")
    left_pref,  right_pref  = L[:k1], R[:k1]
    left_suff,  right_suff  = L[1:],  R[1:]
    u = f"{left_pref}|{right_pref}"
    v = f"{left_suff}|{right_suff}"

    # Append in place: rebuilding the list with `adj.get(u, []) + [v]`
    # copies every existing successor for each new edge.
    adj.setdefault(u, []).append(v)

    # Register the target as a key too, so nodes without outgoing edges
    # still appear in adj (u is inserted before v, as before).
    adj.setdefault(v, [])


# adj

In [68]:
# (3) START/END of the Eulerian path, read off each node's degree balance.
# diff[node] = outdegree - indegree: the first node with +1 is taken as the
# start, the first node with -1 as the end.
diff = {}
for node, successors in adj.items():
    diff[node] = diff.get(node, 0) + len(successors)   # outgoing edges
    for target in successors:
        diff[target] = diff.get(target, 0) - 1         # one incoming edge

start = end = None
for node, balance in diff.items():
    if start is None and balance == 1:
        start = node
    if end is None and balance == -1:
        end = node

# No +1 node: treat the graph as already being a cycle and use the first
# node that has any outgoing edge as both start and end.
if start is None:
    start = end = next(
        (node for node, successors in adj.items() if successors), None
    )

print(start, end)
# diff

GTG|GTG AGA|TGA


In [70]:
# Close the path into a cycle: add one artificial edge end -> start.
# It is appended last to end's successor list.
adj.setdefault(end, []).append(start)

In [69]:
def eulerian_cycle(adj, start):
    """Walk every edge reachable from ``start`` exactly once.

    Iterative, stack-based traversal: keep following an unused outgoing
    edge of the node on top of the stack; when that node has none left,
    move it from the stack to the result. Reversing the result yields the
    nodes in visiting order.

    Args:
        adj: Mapping ``node -> list of successor nodes``. Parallel edges
            are repeated entries. The mapping is not modified; a copy of
            every successor list is consumed instead.
        start: Node the walk begins at.

    Returns:
        List of nodes in visiting order, beginning with ``start``. When
        every node has equal in- and out-degree the walk also ends at
        ``start`` (a closed cycle).
    """
    g = {u: list(vs) for u, vs in adj.items()}
    stack, cycle = [start], []
    while stack:
        v = stack[-1]
        # .get(): a node that only ever appears as a successor may not be
        # a key of adj; treat it as having no outgoing edges.
        remaining = g.get(v)
        if remaining:
            # pop() takes the most recently added successor first.
            stack.append(remaining.pop())
        else:
            cycle.append(stack.pop())
    cycle.reverse()
    return cycle

def reconstruct_string_from_genome_path(nodes):
    """Spell the string described by a path of overlapping nodes.

    The result is the first node in full, followed by the last character
    of every later node.

    Args:
        nodes: Sequence of non-empty strings in path order.

    Returns:
        The spelled string, or ``""`` when ``nodes`` is empty.
    """
    if not nodes:
        return ""
    # One join instead of repeated `+=` on a growing string.
    return nodes[0] + "".join(node[-1] for node in nodes[1:])

In [71]:
# cycle, then "open" it at the artificial end -> start edge
cycle = eulerian_cycle(adj, start)

# The artificial edge is not guaranteed to be the final step of the cycle:
# when start also has a genuine incoming edge, the walk may close with that
# edge instead, and simply dropping the last node would keep the artificial
# edge inside the path. Locate the step end -> start and rotate the cycle so
# the path begins right after it. Scanning from the back gives the plain
# "drop the last node" result whenever the artificial edge does come last;
# the same result is the fallback if no such step is found.
cut = next(
    (i for i in range(len(cycle) - 2, -1, -1)
     if cycle[i] == end and cycle[i + 1] == start),
    len(cycle) - 2,
)
path_nodes = cycle[cut + 1:-1] + cycle[:cut + 1]

path_str = " -> ".join(cycle)
# print(path_str)
# cycle[:-1]

In [72]:
# Rebuild the Left and Right strings from the node path; they are merged
# later using an offset of k + d.

# Each "LLL|RRR" node contributes one (k-1)-mer to each of two parallel paths
halves = [node.split("|") for node in path_nodes]
left_nodes = [pair[0] for pair in halves]
right_nodes = [pair[1] for pair in halves]

Left, Right = (
    reconstruct_string_from_genome_path(left_nodes),
    reconstruct_string_from_genome_path(right_nodes),
)

offset = k + d

In [73]:
# where's the offset?
# A (k,d)-pair is "k bases (Left), then d hidden bases, then k bases (Right)."
# So the first Right character that should align with Left is k + d positions later.
# The visual shift of Right under Left is therefore exactly `offset` columns;
# padding by len(Left) - offset only matches when len(Left) == 2 * offset.

print(Right)
print(Left)
print("^" * offset + Right)
print(Left + Right[-offset:])

GTGAGATGTTGA
GTGGTCGTGAGA
^^^^^^GTGAGATGTTGA
GTGGTCGTGAGATGTTGA


In [79]:
# ---- OUTPUT -----

# Final answer: all of Left followed by the last `offset` characters of
# Right (the part of Right that extends past the end of Left).
# NOTE(review): nothing here checks that Left and Right agree on the region
# where they overlap.
reconstructed = "".join((Left, Right[-offset:]))
print("reconstructed =", reconstructed, sep="\n")


reconstructed =
GTGGTCGTGAGATGTTGA
