In [5]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_problem11.ipynb
# Purpose: Given a genome Text, PathGraphk(Text) is the path consisting of |Text| - k + 1 edges, where the i-th edge of this path is labeled by the i-th k-mer in Text and the i-th node of the path is labeled by the i-th (k - 1)-mer in Text. The de Bruijn graph DeBruijnk(Text) is formed by gluing identically labeled nodes in PathGraphk(Text).
# main(infile='FILE_PATH',outfile='FILE_PATH', inCL=[])
#
# Author: Nicholas Chan
# History: 10/18/2021 Created
########################################################################

# Assignment 3: Problem 11

In [96]:
'''
Problem 11: 
Given a genome Text, PathGraphk(Text) is the path consisting of |Text| - k + 1 edges, 
where the i-th edge of this path is labeled by the i-th k-mer in Text and the i-th node 
of the path is labeled by the i-th (k - 1)-mer in Text. The de Bruijn graph DeBruijnk(Text) 
is formed by gluing identically labeled nodes in PathGraphk(Text).

Input:
4
AAGATTCTCTAC

Desired output:
AAG -> AGA
AGA -> GAT
ATT -> TTC
CTA -> TAC
CTC -> TCT
GAT -> ATT
TCT -> CTA,CTC
TTC -> TCT
'''

class Solution:
    '''
    Builds the de Bruijn graph of a sequence from its k-mers.

    Every k-mer (edge) of the input sequence is collected in order. The
    first k-1 characters of an edge (its prefix) become a node key, and
    the last k-1 characters (its suffix) are recorded as a successor of
    that node. Working from the edges of the whole sequence makes pairing
    prefixes with suffixes less ambiguous than starting from bare nodes.

    Attributes set by __init__:
        k          -- edge (k-mer) length
        seqString  -- the input sequence
        edgeList   -- list of k-mers in order of appearance
        graph      -- dict mapping each prefix to its list of suffixes
    '''
    def __init__(self, k, seqString):
        self.k = k
        self.seqString = seqString
        self.edgeList = self.edgeComp()
        # After this line the instance attribute `graph` (a dict) shadows
        # the bound method of the same name on this instance.
        self.graph = self.graph()

    def edgeComp(self) -> 'list':
        '''
        Return every window of k characters in seqString, left to right.
        The list is empty when seqString is shorter than k.
        '''
        windowCount = len(self.seqString) - self.k + 1  # number of valid start positions
        return [self.seqString[start:start + self.k] for start in range(windowCount)]

    def graph(self) -> 'dict':
        '''
        Return a dict whose keys are edge prefixes (all but the last
        character) and whose values are lists of the matching suffixes
        (all but the first character), in edgeList order. Repeated edges
        contribute repeated suffixes.
        '''
        adjacency = {}
        for kmer in self.edgeList:
            # setdefault creates the node's list on first sight, then reuses it
            adjacency.setdefault(kmer[:-1], []).append(kmer[1:])
        return adjacency

    def printGraph(self) -> 'str':
        '''
        Return the adjacency list as text: one "node -> s1,s2" line per
        node, nodes in sorted order, successors sorted within each line,
        and every line terminated by a newline.
        '''
        lines = []
        for node in sorted(self.graph):
            # sorted() works on a copy, so the stored lists keep their order
            joined = ','.join(sorted(self.graph[node]))
            lines.append(f"{node} -> {joined}\n")
        return ''.join(lines)
            
            

# Main function 
The main function is defined here. It reads k and the genome Text from the input file, builds the de Bruijn graph, and writes the adjacency list to the output file (also printing it).

In [97]:
def main(infile, outfile, inCL=None):
    '''
    Read a de Bruijn graph problem from infile, solve it, and save the answer.

    The first non-blank line of infile is parsed as the integer k and the
    next non-blank line is taken as the genome Text. Blank lines (for
    example a trailing newline at the end of the file) are skipped, and
    any content after those two lines is ignored. The adjacency list
    returned by Solution.printGraph() is written to outfile and printed.

    Args:
        infile: path of the input file.
        outfile: path of the file that receives the adjacency list.
        inCL: accepted so existing callers keep working; not used.

    Raises:
        ValueError: if infile does not contain both a k line and a Text
            line, or if the k line is not an integer.
    '''
    with open(infile, 'r') as myfile:
        strippedLines = [line.rstrip() for line in myfile]
    # Drop blank lines so a stray empty line cannot be parsed as k
    contentLines = [line for line in strippedLines if line]

    if len(contentLines) < 2:
        raise ValueError(
            f"{infile}: expected an integer k line followed by a sequence line"
        )

    k = int(contentLines[0])
    inputSeqString = contentLines[1]

    solution = Solution(k, inputSeqString)
    solutionString = solution.printGraph()

    with open(outfile, "w") as myfile:
        myfile.write(solutionString)
    print(solutionString)

In [99]:
if __name__ == "__main__":
    # Solve the Rosalind BA3D dataset. To run on the extra test input
    # instead, point infile at 'data/extra-input.txt'.
    main(
        infile='data/rosalind_ba3d.txt',
        outfile='output-problem11.txt',
        inCL=[],
    )

AAAAAATAGAA -> AAAAATAGAAC
AAAAATAGAAC -> AAAATAGAACA
AAAACAACGGG -> AAACAACGGGA
AAAACTCATGC -> AAACTCATGCA
AAAATAGAACA -> AAATAGAACAC
AAAATCCAAAT -> AAATCCAAATC
AAAATCTGATC -> AAATCTGATCA
AAAATGATCGG -> AAATGATCGGG
AAACAACGGGA -> AACAACGGGAA
AAACATCATCA -> AACATCATCAC
AAACCCTTGTT -> AACCCTTGTTA
AAACCTTTTGA -> AACCTTTTGAT
AAACGTACTAG -> AACGTACTAGT
AAACTCATGCA -> AACTCATGCAA
AAACTCGACCC -> AACTCGACCCC
AAACTTCATAT -> AACTTCATATC
AAACTTGAAAC -> AACTTGAAACT
AAACTTTATCA -> AACTTTATCAT
AAAGCACCTTG -> AAGCACCTTGC
AAAGTACCCAT -> AAGTACCCATG
AAATAGAACAC -> AATAGAACACG
AAATCCAAATC -> AATCCAAATCT
AAATCTCGTCT -> AATCTCGTCTG
AAATCTGATCA -> AATCTGATCAA
AAATGATCGGG -> AATGATCGGGT
AAATGCGTCCA -> AATGCGTCCAT
AAATTACTGTA -> AATTACTGTAA
AAATTGACCTC -> AATTGACCTCA
AAATTGTGAAT -> AATTGTGAATC
AAATTTAGCAG -> AATTTAGCAGT
AAATTTCGCCG -> AATTTCGCCGT
AACAACAACCC -> ACAACAACCCA
AACAACCCACC -> ACAACCCACCT
AACAACGGGAA -> ACAACGGGAAA
AACACGTATAA -> ACACGTATAAA
AACAGGGGACC -> ACAGGGGACCC
AACATAACTCG -> ACATAACTCGG
A