In [1]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_problem10.ipynb
# Purpose: Given an arbitrary collection of k-mers Patterns, we form a graph having a node for each k-mer in Patterns and connect k-mers Pattern and Pattern' by a directed edge if Suffix(Pattern) is equal to Prefix(Pattern'). The resulting graph is called the overlap graph on these k-mers, denoted Overlap(Patterns).
#   main(infile='FILE_PATH',outfile='FILE_PATH', inCL=[])
#
# Author: Nicholas Chan
# History: 10/18/2021 Created
########################################################################

# Assignment 3: Problem 10
<br>
Problem 10: <br>
Given an arbitrary collection of k-mers Patterns, we form a graph having a node <br>
for each k-mer in Patterns and connect k-mers Pattern and Pattern' by a directed edge if <br>
Suffix(Pattern) is equal to Prefix(Pattern'). The resulting graph is called the overlap graph <br>
on these k-mers, denoted Overlap(Patterns).<br>
<br>
Desired output:<br>
AGGCA -> GGCAT<br>
CATGC -> ATGCG<br>
GCATG -> CATGC<br>
GGCAT -> GCATG<br>

In [2]:
class Solution:
    '''
    Builds the overlap graph of a collection of k-mers: a k-mer A has a
    directed edge to a k-mer B whenever the suffix of A (A without its
    first character) equals the prefix of B (B without its last character).

    Attributes:
        seqList      -- the input k-mers exactly as supplied (never modified).
        suffix       -- dict: suffix -> last k-mer in seqList with that suffix.
        prefix       -- dict: prefix -> last k-mer in seqList with that prefix.
        prefixGroups -- dict: prefix -> list of every distinct k-mer with
                        that prefix, in first-seen order.
        edges        -- sorted list of distinct (source, target) tuples; this
                        is the complete overlap graph.
        graph        -- dict: source k-mer -> one target k-mer. A dict of
                        this shape can hold only one target per source, so
                        it is kept for backward compatibility; use edges
                        when every overlap is needed.
    '''
    def __init__(self, seqList):
        '''
        Stores the input k-mer list, indexes the k-mers by prefix and
        suffix (see addToDicts), and then derives the edge list and the
        graph dict from those indexes.
        '''
        self.seqList = seqList
        self.suffix = dict()
        self.prefix = dict()
        self.prefixGroups = dict()
        self.addToDicts()

        self.edges = self._findEdges()
        # The instance attribute below replaces the bound method of the same
        # name once construction finishes. This is kept as-is so that code
        # reading solution.graph as a dict keeps working.
        self.graph = self.graph()

    def addToDicts(self) -> None:
        '''
        Populate the suffix, prefix and prefixGroups indexes from seqList.
        Empty strings are skipped because they are not k-mers.
        '''
        for seq in self.seqList:
            if not seq:
                continue
            head, tail = seq[:-1], seq[1:]
            self.suffix[tail] = seq
            self.prefix[head] = seq
            # Several k-mers may share one prefix, so keep all of them;
            # a plain prefix -> k-mer dict would silently drop edges.
            group = self.prefixGroups.setdefault(head, [])
            if seq not in group:
                group.append(seq)

    def _findEdges(self) -> 'list':
        '''
        Return every overlap edge as a (source, target) tuple. The set
        removes edges repeated by duplicate input k-mers and the sort makes
        the order deterministic.
        '''
        return sorted({
            (seq, target)
            for seq in self.seqList
            if seq
            for target in self.prefixGroups.get(seq[1:], ())
        })

    def graph(self) -> 'dict':
        '''
        Return a dict mapping each k-mer that has at least one successor to
        a single successor: the last k-mer in seqList whose prefix equals
        the suffix of the key.
        '''
        return {
            seq: self.prefix[seq[1:]]
            for seq in self.seqList
            if seq and seq[1:] in self.prefix
        }

    def printGraph(self) -> 'str':
        '''
        Return the overlap graph as text, one "source -> target" line per
        edge, sorted by source and then by target. Each line ends with a
        newline; an empty graph gives an empty string.
        '''
        return "".join(f"{source} -> {target}\n" for source, target in self.edges)
            
            

# Main function 
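The main function is defined here. It reads the k-mers from the input file, builds the overlap graph with the Solution class, and writes the result to the output file. The `inCL` parameter is accepted but is not currently used.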

In [3]:
def main(infile, outfile, inCL=None):
    '''
    Read k-mers from infile (one per line), build their overlap graph
    with the Solution class, and write the graph to outfile.

    Parameters:
        infile  -- path of the text file holding one k-mer per line.
        outfile -- path of the file to write; it is overwritten.
        inCL    -- accepted for interface compatibility; not used here.

    Surrounding whitespace is stripped from every line, and lines that are
    empty after stripping are skipped so they do not become empty k-mers.
    '''
    with open(infile, 'r') as myfile:
        stripped = (line.strip() for line in myfile)
        inputSeqList = [kmer for kmer in stripped if kmer]

    solution = Solution(inputSeqList)
    solutionString = solution.printGraph()

    with open(outfile, "w") as myfile:
        myfile.write(solutionString)

In [4]:
if __name__ == "__main__":
    # Run the solution on the Rosalind BA3C dataset.
    main(
        infile='data/rosalind_ba3c.txt',
        outfile='output/output-problem10.txt',
        inCL=[],
    )

In [5]:
# INSPECTION

# INSPECTION TEAM
# Hsiang-Yun Lu (Eloise)
# Jodi Lee
# Nabil Mohammed

# RESPONSES
# - Work on implementing main function
# - Write more markdown comments
# - Clean up code
# - Add more docstrings and inline comments

# CORRECTIONS
# - Finished main function
# - Wrote markdown comments
# - Cleaned code
# - Added more docstrings and inline comments