In [1]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_problem12.ipynb
# Purpose: Given a genome Text, PathGraph_k(Text) is the path consisting of |Text| - k + 1 edges, where the i-th edge of this path is labeled by the i-th k-mer in Text and the i-th node of the path is labeled by the i-th (k - 1)-mer in Text. The de Bruijn graph DeBruijn_k(Text) is formed by gluing identically labeled nodes in PathGraph_k(Text). This notebook builds the same graph directly from a collection of k-mers (one per input line) and writes it out as an adjacency list.
# main(infile='FILE_PATH',outfile='FILE_PATH', inCL=[])
#
# Author: Nicholas Chan
# History: 10/18/2021 Created
########################################################################

# Assignment 3: Problem 12
<br>
Given an arbitrary collection of k-mers Patterns (where some k-mers may appear multiple times), <br>
we define CompositionGraph(Patterns) as a graph with |Patterns| isolated edges. Every edge is <br>
labeled by a k-mer from Patterns, and the starting and ending nodes of an edge are labeled by the <br>
prefix and suffix of the k-mer labeling that edge. We then define the de Bruijn graph of Patterns, <br>
denoted DeBruijn(Patterns), by gluing identically labeled nodes in CompositionGraph(Patterns), <br>
which yields the following algorithm.<br>
<br>
Input:<br>
GAGG<br>
CAGG<br>
GGGG<br>
GGGA<br>
CAGG<br>
AGGG<br>
GGAG<br>
<br>
Desired output:<br>
AGG -> GGG<br>
CAG -> AGG,AGG<br>
GAG -> AGG<br>
GGA -> GAG<br>
GGG -> GGA,GGG<br>

In [2]:
class Solution:
    '''
    Builds the de Bruijn graph of a collection of k-mers (problem 12).

    Each input k-mer is treated as an edge: its prefix (every character
    but the last) labels the start node and its suffix (every character
    but the first) labels the end node. Using the prefix as a dict key
    glues identically labeled start nodes together. Working from whole
    edges makes the prefix-to-suffix assignment less ambiguous than it
    would be starting out with bare nodes of size k-1.
    '''
    def __init__(self, edgeList):
        '''
        Stores the input edges and computes the adjacency dict.

        edgeList: iterable of k-mer strings; repeated k-mers are kept and
                  produce repeated edges in the graph.
        '''
        self.edgeList = edgeList
        # NOTE: the attribute deliberately keeps the name `graph`, so on an
        # instance it shadows the graph() method after this assignment.
        self.graph = self.graph()

    def graph(self) -> dict:
        '''
        Create the graph dict from the edges in edgeList.

        Returns a dict mapping each edge's prefix to the list of suffixes
        of all edges sharing that prefix, in input order. Keys appear in
        order of first occurrence.

        The edges are consumed in a single pass so that one-shot iterables
        (e.g. generators) work, and empty strings (such as blank input
        lines) are skipped instead of creating an empty-labeled node.
        '''
        adjacency = {}

        for edge in self.edgeList:
            if not edge:
                # nothing to split into prefix/suffix
                continue
            adjacency.setdefault(edge[:-1], []).append(edge[1:])

        return adjacency

    def printGraph(self) -> str:
        '''
        Return the graph formatted as an adjacency list string.

        One line per node in the form "PREFIX -> SUF1,SUF2", with nodes and
        each node's suffixes in sorted order, and every line terminated by
        a newline. The stored graph is not modified. An empty graph yields
        an empty string.
        '''
        lines = []
        for node in sorted(self.graph):
            # sorted() returns a new list, leaving self.graph untouched
            targets = ','.join(sorted(self.graph[node]))
            lines.append(f"{node} -> {targets}\n")
        return ''.join(lines)
            
            

# Main function 
The main function is defined here. It reads the k-mers from the input file, builds the de Bruijn graph with the Solution class, and writes the resulting adjacency list to the output file.

In [3]:
def main(infile, outfile, inCL=None):
    '''
    Reads k-mers from an input file, generates their de Bruijn graph,
    and writes the graph to an output file.

    infile:  path of a text file with one k-mer per line.
    outfile: path of the file the adjacency list is written to; it is
             opened in "w" mode, so existing content is replaced.
    inCL:    not used by this function; kept so existing calls that
             pass it continue to work.

    Surrounding whitespace is stripped from every line, and blank lines
    are skipped so they are not treated as (empty) k-mers.
    '''
    inputSeqList = []
    with open(infile, 'r') as kmerFile:
        for line in kmerFile:
            kmer = line.strip()
            if kmer:
                inputSeqList.append(kmer)

    # Solution builds the graph on construction; printGraph() formats it.
    solution = Solution(inputSeqList)
    solutionString = solution.printGraph()

    with open(outfile, "w") as graphFile:
        graphFile.write(solutionString)

In [6]:
if __name__ == "__main__":
    # Run on the full input file. For a quick check, point infile at
    # 'data/simple-input.txt' and outfile at 'output-problem12.txt'.
    main(
        infile='data/rosalind_ba3e.txt',
        outfile='output/output-problem12.txt',
        inCL=[],
    )

In [5]:
# INSPECTION

# INSPECTION TEAM
# Hsiang-Yun Lu (Eloise)
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Write more markdown comments
# - Clean up code
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Wrote markdown comments
# - Cleaned code
# - Added more docstrings and inline comments