In [5]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_problem10.ipynb
# Purpose: Given an arbitrary collection of k-mers Patterns, we form a graph having a node for each k-mer in Patterns and connect k-mers Pattern and Pattern' by a directed edge if Suffix(Pattern) is equal to Prefix(Pattern'). The resulting graph is called the overlap graph on these k-mers, denoted Overlap(Patterns).
#   main(infile='FILE_PATH',outfile='FILE_PATH', inCL=[])
#
# Author: Nicholas Chan
# History: 10/18/2021 Created
########################################################################

# Command Line Class
Provided by Dr. B for parsing command line arguments

In [6]:
########################################################################
# CommandLine
########################################################################


class CommandLine():
    """
    Thin wrapper around argparse for this program's options.

    The constructor builds the parser, registers every option, and parses
    straight away; the parsed namespace is stored on ``self.args`` and the
    parser itself on ``self.parser``. Usage/help output and termination on
    malformed input are handled by argparse.
    """

    def __init__(self, inOpts = None):
        """
        Build the argument parser and parse the supplied options.

        inOpts: optional list of argument strings. When None, parse_args()
                is called without arguments so argparse reads the process
                command line instead.
        """
        import argparse

        parserSettings = dict(
            description='Program prolog - a brief description of what this thing does',
            epilog='Program epilog - some other stuff you feel compelled to say',
            add_help=True,  # default is True
            prefix_chars='-',
            usage='%(prog)s [options] -option1[default] <input >output',
        )
        self.parser = argparse.ArgumentParser(**parserSettings)

        # One row per integer option: (short flag, long flag, default, help text).
        integerOptions = (
            ('-m', '--minMotif', 3, 'minimum motif length to consider for Markov Model'),
            ('-M', '--maxMotif', 8, 'maximum motif length to consider for Markov Model'),
            ('-c', '--cutoff', -4, 'Significant Z-score cutoff value'),
        )
        for shortFlag, longFlag, fallback, description in integerOptions:
            self.parser.add_argument(shortFlag, longFlag, type=int, default=fallback,
                                     action='store', help=description)

        # Command line option to use Gibbs sampling to find the optimal consensus motif (disabled).
        # self.parser.add_argument('-g', '--gibbsampling', type=float, default=1, action='store', help='implement Gibbs sampling')

        self.args = self.parser.parse_args() if inOpts is None else self.parser.parse_args(inOpts)

# FastAreader Class
Provided by Dr. B for reading Fasta Files.

In [7]:

import sys

class FastAreader():
    """
    Read FASTA records from a named file, or from stdin when no name is given.
    """
    def __init__(self, fname=''):
        """ Constructor: saves attribute fname. """
        self.fname = fname

    def doOpen(self):
        """ Return sys.stdin when fname is empty, otherwise open fname for reading. """
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readFasta(self):
        """
        Yield one (header, sequence) tuple per FASTA record.

        Lines before the first '>' header are skipped. The header is the
        text after '>' with trailing whitespace removed; the sequence is
        the concatenation of the record's remaining lines with all
        whitespace removed and letters upper-cased.

        If the input contains no header line at all (for example an empty
        file) the generator finishes without yielding anything.
        """
        with self.doOpen() as fileH:

            # skip to first fasta header; readline() returns '' at EOF, so
            # stop there instead of spinning forever on headerless input
            line = fileH.readline()
            while line and not line.startswith('>'):
                line = fileH.readline()
            if not line:
                return
            header = line[1:].rstrip()
            pieces = []  # sequence fragments of the current record

            # Separate headers and sequences
            for line in fileH:
                if line.startswith('>'):
                    yield header, ''.join(pieces)
                    header = line[1:].rstrip()
                    pieces = []
                else:
                    pieces.append(''.join(line.rstrip().split()).upper())

        # final record is emitted after the file handle has been closed
        yield header, ''.join(pieces)

# Assignment 3: Problem 10

In [48]:
'''
Problem 10:
Given an arbitrary collection of k-mers Patterns, we form a graph having a node 
for each k-mer in Patterns and connect k-mers Pattern and Pattern' by a directed edge if 
Suffix(Pattern) is equal to Prefix(Pattern'). The resulting graph is called the overlap graph 
on these k-mers, denoted Overlap(Patterns).

Note: the Solution class below only generates the sorted list of all k-mers of the
input text; it does not construct the overlap edges described above.
'''
class Solution:
    """
    Sorted k-mer composition of a text.

    On construction the k-mer list is computed once and stored in
    ``compList``.

    NOTE(review): the notebook text describes the overlap graph problem,
    but this class only lists the k-mers; no edges are built here.
    """
    def __init__(self, text: str, k: int):
        """
        text: string to be cut into k-mers.
        k:    k-mer length; must be a positive integer.

        Raises ValueError when k is smaller than 1, because slicing with a
        zero or negative width would silently produce empty or meaningless
        "k-mers".
        """
        if k < 1:
            raise ValueError(f"k must be a positive integer, got {k}")
        self.text = text
        self.k = k
        self.compList = self.kmerComp()

    def kmerComp(self) -> list:
        '''
        Return every length-k substring of the text, one per start
        position (duplicates kept), in lexicographic order. The list is
        empty when the text is shorter than k.
        '''
        windowCount = len(self.text) - self.k + 1  # number of start positions; <= 0 gives no k-mers
        return sorted(self.text[start:start + self.k] for start in range(windowCount))

# Main function 
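The main function is defined here. It reads k and the sequence from the input file, builds the sorted k-mer list, and writes it to the output file (echoing it to the cell output). The `inCL` argument is accepted but is not currently used for argument parsing.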

In [72]:
def main(infile, outfile, inCL=None):
    """
    Read k and a sequence from infile, then write the sorted k-mers to outfile.

    infile:  path to a text file whose first non-blank line is the integer
             k and whose remaining non-blank lines hold the sequence text
             (several lines are concatenated).
    outfile: path of the file to write; one k-mer per line. Each k-mer is
             also printed, as is the parsed value of k.
    inCL:    accepted for interface compatibility; not used by this function.

    Raises ValueError when the file does not contain both a k line and a
    sequence line, or when the first non-blank line is not an integer.
    """
    with open(infile, 'r') as inputHandle:
        # Ignore blank lines so a trailing empty line cannot be parsed as k.
        fields = [line.rstrip() for line in inputHandle if line.strip()]

    if len(fields) < 2:
        raise ValueError(f"{infile}: expected k on the first line followed by the sequence text")

    k = int(fields[0])
    print(f"k:{k}")
    inputSeq = ''.join(fields[1:])

    solution = Solution(inputSeq, k)
    solutionList = solution.compList

    with open(outfile, "w") as outputHandle:
        for kmer in solutionList:
            outputHandle.write(kmer+'\n')
            print(kmer)

In [73]:
if __name__ == "__main__":
    # Entry point when the notebook/script is executed directly:
    # run main() on the dataset path and output path given below.
    main(
        infile='data/rosalind_ba3c.txt',
        outfile='output-problem10.txt',
        inCL=[],
    )

k:50
AAAACCACTGTCAGCGCTTCAAGCAGTACTGGATAACCCACCAGGTTTTC
AAAACCGGGCTCTCAAGATGCGGGCTTCTCTGGAAGATTTAGTTTTACCT
AAAAGGAGCTTTGATGAAAGCGGTAAGATAGACGTGACTTACGCCCGAGT
AAAATAAATCGAGAACTCCTTGGTAGGTTTAAGCTCATGTTGTACTGAAT
AAACAGCCTAGAACTGGATCCCTGTAGGGTGTCCGTTGATCGATCGGAAA
AAACCACTGTCAGCGCTTCAAGCAGTACTGGATAACCCACCAGGTTTTCG
AAACCGGGCTCTCAAGATGCGGGCTTCTCTGGAAGATTTAGTTTTACCTG
AAAGACTCACGGGCTTCGAACGCGTAATAGTACGCGGTCTGAACCGTATC
AAAGATAAGCCTAATGAAGGGTCCATCTTAGGCATAAACAGCCTAGAACT
AAAGCGGTAAGATAGACGTGACTTACGCCCGAGTTGGCAAAACCGGGCTC
AAAGCTGTAACGTATCCATGGATTCCGGGCCGTATCTCAACTTCGGTCGA
AAAGGAGCTTTGATGAAAGCGGTAAGATAGACGTGACTTACGCCCGAGTT
AAAGTTGCAAGCTAGGATTCGCGGATTGCACTAAGCCGAAAAGGAGCTTT
AAATAAATCGAGAACTCCTTGGTAGGTTTAAGCTCATGTTGTACTGAATA
AAATACGAGCGCCATGGCAGTAATCTGTTCATGGAGTGATCACACGCTGA
AAATCGAGAACTCCTTGGTAGGTTTAAGCTCATGTTGTACTGAATATCAA
AACAAAACCACTGTCAGCGCTTCAAGCAGTACTGGATAACCCACCAGGTT
AACACTATCCAAGCCCTCTCGTTGAGCCCTATTGAGGGATCATATTGAGT
AACAGCCTAGAACTGGATCCCTGTAGGGTGTCCGTTGATCGATCGGAAAG
AACCACTGTCAGCGCTTCAAGCAGTA