In [5]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_problem10.ipynb
# Purpose: Build the overlap graph of a collection of k-mers. Given an arbitrary
#   collection of k-mers Patterns, we form a graph having a node for each k-mer in
#   Patterns and connect k-mers Pattern and Pattern' by a directed edge if
#   Suffix(Pattern) is equal to Prefix(Pattern'). The resulting graph is called the
#   overlap graph on these k-mers, denoted Overlap(Patterns).
# Usage:
#   main(infile='FILE_PATH',outfile='FILE_PATH', inCL=[])
#
# Author: Nicholas Chan
# History: 10/18/2021 Created
########################################################################

# Command Line Class
Provided by Dr. B for parsing command line arguments

In [6]:
########################################################################
# CommandLine
########################################################################


class CommandLine():
    """
    Thin wrapper around an argparse parser.

    On construction the parser is built, the integer options below are
    registered, and the parsed result is stored on the instance:

        self.parser -- the configured argparse.ArgumentParser
        self.args   -- the namespace returned by parse_args
                       (attributes: minMotif, maxMotif, cutoff)
    """

    def __init__(self, inOpts = None):
        """
        Build the parser and parse the options immediately.

        inOpts -- list of argument strings to parse; when None, argparse
                  reads the process command line instead.
        """
        import argparse

        parserSettings = dict(
            description='Program prolog - a brief description of what this thing does',
            epilog='Program epilog - some other stuff you feel compelled to say',
            add_help=True,  # default is True
            prefix_chars='-',
            usage='%(prog)s [options] -option1[default] <input >output',
        )
        self.parser = argparse.ArgumentParser(**parserSettings)

        # (short flag, long flag, default value, help text); every option is an int.
        integerOptions = (
            ('-m', '--minMotif', 3, 'minimum motif length to consider for Markov Model'),
            ('-M', '--maxMotif', 8, 'maximum motif length to consider for Markov Model'),
            ('-c', '--cutoff', -4, 'Significant Z-score cutoff value'),
        )
        for shortFlag, longFlag, fallback, helpText in integerOptions:
            self.parser.add_argument(shortFlag, longFlag, type=int, default=fallback,
                                     action='store', help=helpText)
        # Disabled option (kept for reference): -g / --gibbsampling, type=float, default=1,
        # help='implement Gibbs sampling'.

        # parse_args(None) falls back to sys.argv[1:], so no explicit branch is needed.
        self.args = self.parser.parse_args(inOpts)

# Assignment 3: Problem 10

In [50]:
'''
Problem 10: 
Given an arbitrary collection of k-mers Patterns, we form a graph having a node 
for each k-mer in Patterns and connect k-mers Pattern and Pattern' by a directed edge if 
Suffix(Pattern) is equal to Prefix(Pattern'). The resulting graph is called the overlap graph 
on these k-mers, denoted Overlap(Patterns).

Desired output:
AGGCA -> GGCAT
CATGC -> ATGCG
GCATG -> CATGC
GGCAT -> GCATG
'''
class Solution:
    '''
    Overlap graph of a collection of k-mers.

    An edge Pattern -> Pattern' exists whenever Pattern[1:] (its suffix)
    equals Pattern'[:-1] (the prefix of Pattern').

    Attributes:
        seqList -- the k-mers exactly as passed in.
        suffix  -- dict: suffix -> set of k-mers ending with that suffix.
        prefix  -- dict: prefix -> set of k-mers starting with that prefix.
        graph   -- dict: k-mer -> sorted list of ALL k-mers it points to.

    The index and graph values are collections rather than single strings:
    several k-mers can share a prefix or a suffix, and storing only one of
    them silently drops edges from the overlap graph.
    '''
    def __init__(self, seqList):
        '''
        seqList -- iterable of k-mer strings; empty strings are ignored.
        '''
        self.seqList = seqList

        self.suffix = dict()
        self.prefix = dict()
        self.addToDicts()

        # The instance attribute intentionally replaces the bound method of the
        # same name: after construction `self.graph` is the adjacency dict.
        self.graph = self.graph()

    def addToDicts(self) -> None:
        '''
        Index every k-mer by its suffix (all but the first character) and by
        its prefix (all but the last character).

        Sets are used as values so that k-mers sharing a prefix/suffix are all
        kept and repeated k-mers are stored once.
        '''
        for seq in self.seqList:
            if not seq:
                # Blank input lines arrive as '' and are not k-mers.
                continue
            self.suffix.setdefault(seq[1:], set()).add(seq)
            self.prefix.setdefault(seq[:-1], set()).add(seq)

    def graph(self) -> 'dict':
        '''
        Build the adjacency dict from the prefix index.

        Returns a dict whose keys are the k-mers that have at least one
        successor and whose values are sorted lists of the k-mers whose
        prefix equals the key's suffix.
        '''
        graphDict = dict()
        for seq in self.seqList:
            if not seq:
                continue
            # Key on the k-mer being visited so k-mers sharing a suffix each
            # get their own entry.
            successors = self.prefix.get(seq[1:])
            if successors:
                graphDict[seq] = sorted(successors)
        return graphDict

    def printGraph(self) -> 'str':
        '''
        Return the graph as text, one "Pattern -> Pattern'" line per edge,
        ordered by source k-mer and then by target k-mer. Every line ends
        with a newline; an empty graph yields an empty string.
        '''
        return "".join(
            f"{key} -> {successor}\n"
            for key in sorted(self.graph)
            for successor in self.graph[key]
        )
            
            

In [51]:
import os

# Show which input files are available in the 'data' directory
# (path is relative to the notebook's current working directory).
os.listdir('data')

['.ipynb_checkpoints', 'rosalind_ba3c.txt', 'simple-input.txt']

In [53]:
# Read the sample k-mers, one per line. Blank or whitespace-only lines are
# skipped so they are not passed on as empty k-mers.
with open('data/simple-input.txt','r') as myfile:
    inputSeqList = [kmer for kmer in (line.strip() for line in myfile) if kmer]

sol = Solution(inputSeqList)
print(sol.printGraph())

AGGCA -> GGCAT
CATGC -> ATGCG
GCATG -> CATGC
GGCAT -> GCATG



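# Main function
The main function is defined here. It reads the k-mers from the input file, builds the overlap graph, and writes the resulting adjacency list to the output file. The `inCL` argument is accepted but not currently used, so no command-line parsing takes place.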

In [56]:
def main(infile, outfile, inCL=None):
    """
    Read k-mers from a file, build their overlap graph and write it out.

    infile  -- path of a text file with one k-mer per line.
    outfile -- path of the file to (over)write with the text returned by
               Solution.printGraph().
    inCL    -- accepted for interface compatibility; not used by this function.
    """
    with open(infile,'r') as myfile:
        # Strip line endings and drop blank lines so that an empty string is
        # never handed to Solution as if it were a k-mer.
        inputSeqList = [kmer for kmer in (line.strip() for line in myfile) if kmer]

    solution = Solution(inputSeqList)
    solutionString = solution.printGraph()

    with open(outfile, "w") as myfile:
        myfile.write(solutionString)

In [57]:
if __name__ == "__main__":
    # Run the solver on the Rosalind dataset and store the adjacency list on disk.
    main(
        infile='data/rosalind_ba3c.txt',
        outfile='output-problem10.txt',
        inCL=[],
    )