# Graph Implementation
- This is the data structure we will use to build a de Bruijn graph.
- In the use case of DNA genome construction, the edges of the graph are the k-mers (DNA fragments) and the nodes are the (k-1)-length prefixes and suffixes of those k-mers.
- We can assemble the genome from the fragments by finding an Eulerian path in the graph and using it to construct the minimal superstring.
- NOTE: sometimes we encounter a graph in which no Eulerian path exists, which means we cannot assemble the genome from this set of fragments. In this case the biologist would use a different set of fragments or a different fragment length.
    - Genome construction in practice is NOT so straightforward and mechanical; it is much more of an iterative process.

In [3]:
import itertools

class Graph:
    """ A directed multigraph with labeled vertices and labeled edges.

    Vertices are numbered with consecutive integers in the order they are
    added. ``index`` maps a vertex label to its number and ``vertex`` maps
    the number back to the label. Edges are stored in insertion order as
    (source number, destination number) tuples in ``edge``; the label of
    edge ``i`` is ``edgelabel[i]``. """

    def __init__(self, vlist=None):
        """ Initialize a Graph with an optional vertex list """
        # A None default avoids sharing one list object between calls;
        # list() lets any iterable (including a generator) be walked twice.
        vlist = [] if vlist is None else list(vlist)
        self.index = {v:i for i,v in enumerate(vlist)}
        self.vertex = {i:v for i,v in enumerate(vlist)}
        self.edge = []
        self.edgelabel = []

    def addVertex(self, label):
        """ Add a labeled vertex to the graph.
        Adding a label that is already present is a no-op, so an existing
        vertex never loses its number. """
        if label in self.index:
            return
        # len(self.vertex) is always the next unused number, even when the
        # constructor was handed a list containing repeated labels.
        index = len(self.vertex)
        self.index[label] = index
        self.vertex[index] = label

    def addEdge(self, vsrc, vdst, label='', repeats=True):
        """ Add a directed edge to the graph, with an optional label.
        Repeated edges are distinct, unless repeats is set to False.
        Raises KeyError if either vertex label is unknown. """
        e = (self.index[vsrc], self.index[vdst])
        if (repeats) or (e not in self.edge):
            self.edge.append(e)
            self.edgelabel.append(label)

    def hamiltonianPath(self):
        """ A Brute-force method for finding a Hamiltonian Path.
        Basically, all possible N! paths are enumerated and checked
        for edges. Since edges can be reused there are no distinctions
        made for *which* version of a repeated edge.
        Returns the vertex labels along the first path found, or []. """
        # Build the lookup set once instead of scanning the edge list for
        # every consecutive pair of every permutation.
        edges = set(self.edge)
        for path in itertools.permutations(sorted(self.index.values())):
            if all((path[i], path[i+1]) in edges for i in range(len(path)-1)):
                return [self.vertex[i] for i in path]
        return []

    def SearchTree(self, path, verticesLeft):
        """ A recursive Branch-and-Bound Hamiltonian Path search.
        Paths are extended one node at a time using only available
        edges from the graph. On success the labels of the path are
        stored in self.PathV2result and True is returned. """
        if not verticesLeft:
            self.PathV2result = [self.vertex[i] for i in path]
            return True
        for v in verticesLeft:
            # Any vertex may open the path; afterwards an edge is required.
            if (not path) or ((path[-1], v) in self.edge):
                if self.SearchTree(path+[v], [r for r in verticesLeft if r != v]):
                    return True
        return False

    def hamiltonianPathV2(self):
        """ A wrapper function for invoking the Branch-and-Bound
        Hamiltonian Path search. Returns the vertex labels along the
        path, or [] when the search fails. """
        self.PathV2result = []
        self.SearchTree([], sorted(self.index.values()))
        return self.PathV2result

    def degrees(self):
        """ Returns two dictionaries with the inDegree and outDegree
        of each node from the graph. Nodes without incoming (outgoing)
        edges are absent from the respective dictionary. """
        inDegree = {}
        outDegree = {}
        for src, dst in self.edge:
            outDegree[src] = outDegree.get(src, 0) + 1
            inDegree[dst] = inDegree.get(dst, 0) + 1
        return inDegree, outDegree

    def verifyAndGetStart(self):
        """ Check the degree conditions for an Eulerian path and return
        the vertex number the path must start from, or -1 when the
        conditions fail.

        Every vertex must be balanced (in == out), except for at most one
        vertex with one extra outgoing edge (the start) and one vertex with
        one extra incoming edge (the end). """
        inDegree, outDegree = self.degrees()
        starts = []
        for vert in self.vertex:
            surplus = outDegree.get(vert, 0) - inDegree.get(vert, 0)
            if surplus == 0:
                continue
            if surplus == 1:
                starts.append(vert)
            elif surplus != -1:
                return -1
        # The surpluses always sum to zero, so the number of end vertices
        # equals the number of start vertices; checking one side suffices.
        if len(starts) > 1:
            return -1
        if starts:
            return starts[0]
        # Fully balanced graph: the path is a circuit, so begin at the first
        # vertex that actually has an edge to follow.
        for vert in self.vertex:
            if outDegree.get(vert, 0) > 0:
                return vert
        return 0

    def eulerianPath(self):
        """ Find a path that uses every edge exactly once.
        Returns the list of vertex numbers along the path. When no such
        path exists a message is printed and False is returned. """
        graph = list(self.edge)
        currentVertex = self.verifyAndGetStart()
        if currentVertex < 0:
            print("There is no path!")
            return False
        path = [currentVertex]
        # "insertAt" is where vertices get inserted into our tour
        # it starts at the end (i.e. it is the same as appending),
        # but later "side-trips" will insert in the middle
        insertAt = 1
        while len(graph) > 0:
            for edge in graph:
                if (edge[0] == currentVertex):
                    currentVertex = edge[1]
                    graph.remove(edge)
                    path.insert(insertAt, currentVertex)
                    insertAt += 1
                    break
            else:
                # Stuck: resume from the first unused edge whose source is
                # already on the tour and splice the side-trip in there.
                for edge in graph:
                    try:
                        insertAt = path.index(edge[0]) + 1
                        currentVertex = edge[0]
                        break
                    except ValueError:
                        continue
                else:
                    # Unused edges remain but none touches the tour.
                    print("There is no path!")
                    return False
        return path

    def eulerEdges(self, path):
        """ Translate a path of vertex numbers into the list of edge
        labels traversed. For repeated edges the most recently added
        copy is consumed first. Raises KeyError or IndexError when the
        path uses an edge that is missing or already used up. """
        edgeId = {}
        for i, e in enumerate(self.edge):
            edgeId.setdefault(e, []).append(i)
        edgeList = []
        for i in range(len(path)-1):
            edgeList.append(self.edgelabel[edgeId[path[i],path[i+1]].pop()])
        return edgeList

    def getMinimalSuperstring(self):
        """ Spell the string obtained by walking the Eulerian path: the
        first edge label followed by the last character of every later
        edge label. Edge labels are expected to be non-empty strings.
        Returns '' for a graph without edges.
        Raises ValueError when the graph has no Eulerian path. """
        path = self.eulerianPath()
        if path is False:
            raise ValueError("graph has no Eulerian path; cannot build a superstring")
        edges = self.eulerEdges(path)
        if not edges:
            return ''
        return edges[0] + ''.join(label[-1] for label in edges[1:])

    def render(self, highlightPath=None):
        """ Outputs a version of the graph that can be rendered
        using graphviz tools (http://www.graphviz.org/).
        highlightPath is an optional sequence of vertex labels; the edges
        between consecutive labels are drawn with a thicker pen. """
        highlightPath = [] if highlightPath is None else highlightPath
        edgeId = {}
        for i, e in enumerate(self.edge):
            edgeId.setdefault(e, []).append(i)
        edgeSet = set()
        for i in range(len(highlightPath)-1):
            src = self.index[highlightPath[i]]
            dst = self.index[highlightPath[i+1]]
            edgeSet.add(edgeId[src,dst].pop())
        lines = ['digraph {\n',
                 '   graph [nodesep=2, size="10,10"];\n']
        for index, label in self.vertex.items():
            lines.append('    N%d [shape="box", style="rounded", label="%s"];\n' % (index, label))
        for i, (src, dst) in enumerate(self.edge):
            line = '    N%d -> N%d' % (src, dst)
            label = self.edgelabel[i]
            attrs = []
            if (len(label) > 0):
                attrs.append('label="%s"' % (label))
            if (i in edgeSet):
                attrs.append('penwidth=3.0')
            if attrs:
                line += ' [%s]' % ', '.join(attrs)
            lines.append(line + ';\n')
        lines.append('    overlap=false;\n')
        lines.append('}\n')
        return ''.join(lines)

# Finding Minimal Superstrings with an Euler Path
- In this example I will find the minimal superstring that contains all 4-digit binary numbers.
- The nodes will be the (n-1)-digit prefixes and suffixes, i.e. the 3-digit binary strings.
- The edges will be the 4-digit binary numbers.
- The minimal superstring will be the shortest possible string that contains every edge label, i.e. every 4-digit binary number, as a substring.

In [4]:
# Every 4-digit binary string; each one becomes the label of an edge.
binary = [''.join(bits) for bits in itertools.product('01', repeat=4)]
# Nodes are the distinct 3-digit prefixes and suffixes of those strings.
nodes = sorted({code[:-1] for code in binary} | {code[1:] for code in binary})

g = Graph(nodes)
for code in binary:
    # Each string links its prefix node to its suffix node.
    g.addEdge(code[:-1], code[1:], code)

path = g.eulerianPath()
edges = g.eulerEdges(path)

print(nodes)
print(path)
print(edges)

# Spell the superstring by hand: the first edge label, then the last
# character of every following edge label.
print(edges[0] + ''.join(label[-1] for label in edges[1:]))

# The same result through the convenience method.
print(g.getMinimalSuperstring())

['000', '001', '010', '011', '100', '101', '110', '111']
[0, 0, 1, 3, 7, 7, 6, 5, 3, 6, 4, 1, 2, 5, 2, 4, 0]
['0000', '0001', '0011', '0111', '1111', '1110', '1101', '1011', '0110', '1100', '1001', '0010', '0101', '1010', '0100', '1000']
0000111101100101000
0000111101100101000


# Constructing Genomes from DNA Fragments
- Example: construct the genome that contains every possible 9-mer except those with an "A".
- 9-mers in this case are DNA fragments of length 9. We are looking to create the genome that contains all possible DNA fragments that are 9 bases long and do not contain the base "A".

### Genome that contains all possible k-mers of length 9 except those with "A"

In [10]:
# Every 9-mer over the alphabet C, G, T (i.e. no "A"); each is one edge label.
threes = [''.join(bases) for bases in itertools.product('CGT', repeat=9)]

# Nodes are the distinct 8-base prefixes and suffixes of the 9-mers.
nodes = sorted({kmer[:-1] for kmer in threes} | {kmer[1:] for kmer in threes})
g = Graph(nodes)
for code in threes:
    # Each 9-mer links its prefix node to its suffix node.
    g.addEdge(code[:-1], code[1:], code)

genome = g.getMinimalSuperstring()
print(genome)

CCCCCCCCCGGGGGGGGGTTTTTTTTTGTTTTTTTGGTTTTTTGGGTTTTTGTGGTTTTTGGGGTTTTGTTGGTTTTGTGGGTTTTGGGGGTTTGTTTTGTTTGTTTGGTTTGTTGGGTTTGTGTGGTTTGTGGGGTTTGGTGGGTTTGGGGGGTTGTTTTTGTTGTTTTGGTTGTTTGGGTTGTTGTTGTGGTTGTTGGGGTTGTGTTGGTTGTGTGGGTTGTGGGGGTTGGTTTGGTTGGTTGGGTTGGTGGGGTTGGGGGGGTGTTTTTTGTGTTTTTGGTGTTTTGGGTGTTTGTTGTGTTTGTGGTGTTTGGGGTGTTGTTTGTGTTGTTGGTGTTGTGGGTGTTGGGGGTGTGTTTTGTGTGTTTGGTGTGTTGGGTGTGTGTTGTGTGTGTGGTGTGTGGGGTGTGGTTGGTGTGGTGGGTGTGGGGGGTGGTTTTGGTGGTTTGGGTGGTTGTGGTGGTTGGGGTGGTGTTGGTGGTGTGGGTGGTGGTGGGGGTGGGTTGGGTGGGTGGGGTGGGGGGGGCGGGGGGGTCGGGGGGGCCGGGGGGTGCGGGGGGTTCGGGGGGTCCGGGGGGCTCGGGGGGCCCGGGGGTGGCGGGGGTGTCGGGGGTGCCGGGGGTTGCGGGGGTTTCGGGGGTTCCGGGGGTCTCGGGGGTCCCGGGGGCGTCGGGGGCGCCGGGGGCTGCGGGGGCTTCGGGGGCTCCGGGGGCCCCGGGGTGGGCGGGGTGGTCGGGGTGGCCGGGGTGTGCGGGGTGTTCGGGGTGTCCGGGGTGCTCGGGGTGCCCGGGGTTGGCGGGGTTGTCGGGGTTGCCGGGGTTTGCGGGGTTTTCGGGGTTTCCGGGGTTCTCGGGGTTCCCGGGGTCGTCGGGGTCGCCGGGGTCTGCGGGGTCTTCGGGGTCTCCGGGGTCCCCGGGGCGGTCGGGGCGGCCGGGGCGTGCGGGGCGTTCGGGGCGTCCGGGGCGCCCGGGGCTGGCGGGGCTGTCGGGGCTGCCGG

In [8]:
# Report the number of characters in the assembled genome string.
print(len(genome))

19691


### Result

- This resulting string is the genome that contains all possible DNA fragments of length 9 that do not contain "A".
- We can use the same data structure to construct a genome from any number of DNA fragments of the same length.
- This sequence is 19691 base pairs long.

### Genome that contains all 4-mers
- Generate all possible 4-mers.
- Use the generated 4-mers to construct a de Bruijn graph and find the genome.

In [44]:
max_quantity = 256 #256 possible 4-mers (4^4)

# Every 4-mer over the DNA alphabet; each one becomes the label of an edge.
kmers = [''.join(bases) for bases in itertools.product('ACGT', repeat=4)]

# Nodes are the distinct 3-base prefixes and suffixes of the 4-mers.
nodes = sorted({kmer[:-1] for kmer in kmers} | {kmer[1:] for kmer in kmers})
g = Graph(nodes)
for code in kmers:
    # Each 4-mer links its prefix node to its suffix node.
    g.addEdge(code[:-1], code[1:], code)

genome = g.getMinimalSuperstring()
print(genome)


            

AAAACCCCGGGGTTTTGTTGGTGTGGGCGGTCGGCCGTGCGTTCGTCCGCGCTCTCGCCCTGGCTGTCTGCCTTGCTTTCTTCCTCCCACCGACCTACCAACGCACGGACGTACGAACTCACTGACTTACTAACACAGAGATATAGACATACAAAGCCAGCGAGCTAGCAAGGCAGGGAGGTAGGAAGTCAGTGAGTTAGTAAGAAATCCATCGATCTATCAATGCATGGATGTATGAATTCATTGATTTATTAATAAA


In [45]:
# Report the number of characters in the assembled genome string.
print(len(genome))

259
