In [5]:
#!/usr/bin/env python3

########################################################################
# File: Chan_Nicholas_problem9.ipynb
# Purpose: Find the string spelled by a genome path.
#   main(infile='FILE_PATH',outfile='FILE_PATH', inCL=[])
#
# Author: Nicholas Chan
# History: 10/18/2021 Created
########################################################################

# Command Line Class
Provided by Dr. B for parsing command-line arguments.

In [6]:
########################################################################
# CommandLine
########################################################################


class CommandLine():
    """
    Thin wrapper around argparse for this program's command line.

    Building an instance creates the parser, registers the integer options
    -m/--minMotif, -M/--maxMotif and -c/--cutoff, and immediately parses the
    arguments. The parser is kept on `self.parser` and the parsed namespace
    on `self.args`.
    """

    def __init__(self, inOpts = None):
        """
        Build the argument parser and parse the options.

        inOpts: list of argument strings to parse; when None, argparse reads
                the process command line instead.
        """
        import argparse

        self.parser = argparse.ArgumentParser(
            description='Program prolog - a brief description of what this thing does',
            epilog='Program epilog - some other stuff you feel compelled to say',
            add_help=True,  # default is True
            prefix_chars='-',
            usage='%(prog)s [options] -option1[default] <input >output'
            )

        # (short flag, long flag, default, help text); every option stores an int.
        optionTable = (
            ('-m', '--minMotif', 3, 'minimum motif length to consider for Markov Model'),
            ('-M', '--maxMotif', 8, 'maximum motif length to consider for Markov Model'),
            ('-c', '--cutoff', -4, 'Significant Z-score cutoff value'),
        )
        for shortFlag, longFlag, defaultValue, helpText in optionTable:
            self.parser.add_argument(shortFlag, longFlag, type=int, default=defaultValue,
                                     action='store', help=helpText)
        # A '-g/--gibbsampling' float option (Gibbs sampling for the consensus
        # motif) was sketched here but is currently disabled.

        # parse_args(None) falls back to the process command line, so a single
        # call covers both the "no list supplied" and "explicit list" cases.
        self.args = self.parser.parse_args(inOpts)

# FastAreader Class
Provided by Dr. B for reading FASTA files.

In [7]:

import sys

class FastAreader():
    """
    Read FASTA-formatted text from a file (or from stdin) record by record.
    """
    def __init__(self, fname=''):
        """ Constructor: saves attribute fname ('' means read from stdin). """
        self.fname = fname

    def doOpen(self):
        """ Return sys.stdin when no file name was given, else the opened file. """
        if self.fname == '':
            return sys.stdin
        else:
            return open(self.fname)

    def readFasta(self):
        """
        Yield (header, sequence) tuples, one per FASTA record.

        Lines before the first '>' header are ignored. The header is returned
        without the leading '>' and without trailing whitespace; the sequence
        is the concatenation of the record's lines with all whitespace removed
        and converted to upper case.

        If the input contains no '>' header at all (including an empty
        input), nothing is yielded.
        """
        header = ''
        sequence = ''

        # NOTE: when reading from stdin, leaving this with-block closes sys.stdin.
        with self.doOpen() as fileH:

            # Skip to the first fasta header. readline() returns '' at end of
            # file, so stop there instead of looping forever on input that
            # has no header line.
            line = fileH.readline()
            while line and not line.startswith('>'):
                line = fileH.readline()
            if not line:
                return
            header = line[1:].rstrip()

            # Separate headers and sequences
            for line in fileH:
                if line.startswith('>'):
                    yield header, sequence
                    header = line[1:].rstrip()
                    sequence = ''
                else:
                    sequence += ''.join(line.rstrip().split()).upper()

        # Emit the final record once the input is exhausted.
        yield header, sequence

# Assignment 3: Problem 9

In [7]:
'''
Problem 9: Find the string spelled by a genome path.
'''
class Solution:
    """
    Spell the string described by a list of overlapping k-mers.

    For a genome path (each k-mer's suffix equals the next k-mer's prefix) the
    result is the first k-mer followed by the last symbol of every later
    k-mer. The reconstructed string is stored in `self.seq`.
    """
    def __init__(self, seqList:'list'):
        """
        Store the k-mer list and reconstruct the string right away.

        seqList: list of k-mer strings; it is not modified.
        """
        self.seqList = seqList
        self.seq = self.reconstructor()

    def reconstructor(self)->'str':
        """
        Build the string spelled by `self.seqList`.

        Starting from the first k-mer, repeatedly take the earliest remaining
        k-mer whose prefix (all but its last symbol) matches the end of the
        string built so far, and append its last symbol. For a genome path
        given in order this is always the next k-mer. If no k-mer extends the
        right end, the earliest one whose suffix (all but its first symbol)
        matches the start of the string is prepended instead.

        Returns '' for an empty list. Raises ValueError when some k-mer cannot
        be attached to either end (instead of looping forever).
        """
        # Work on a copy so the caller's list is left intact. Empty strings
        # carry no symbols and would break the kmer[-1] / kmer[0] lookups.
        remaining = [kmer for kmer in self.seqList if kmer]
        if not remaining:
            return ''

        text = remaining.pop(0)
        while remaining:
            # Restart the scan after every extension so earlier k-mers always
            # win; this keeps an ordered path in its given order even when
            # k-mers repeat.
            for idx, kmer in enumerate(remaining):
                if text.endswith(kmer[:-1]):  # exact overlap at the right end
                    text += kmer[-1]
                    break
            else:
                for idx, kmer in enumerate(remaining):
                    if text.startswith(kmer[1:]):  # exact overlap at the left end
                        text = kmer[0] + text
                        break
                else:
                    raise ValueError(
                        f'{len(remaining)} k-mer(s) do not overlap either end of the '
                        f'reconstructed string, e.g. {remaining[0]!r}')
            del remaining[idx]
        return text

# Main function 
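The main function is defined here. It reads the k-mers from the input file (one per line), reconstructs the string with `Solution`, writes the result to the output file, and prints it. The `inCL` argument is accepted but not currently used, so no command-line parsing takes place.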

In [22]:
def main(infile, outfile, inCL=None):
    """
    Read k-mers from `infile`, spell the genome-path string and save it.

    infile:  path of a text file holding one k-mer per line.
    outfile: path of the file that receives the reconstructed string,
             followed by a newline.
    inCL:    accepted for interface compatibility; not used.

    The reconstructed string is also printed.
    """
    with open(infile, 'r') as myfile:
        # Strip surrounding whitespace and drop blank lines (e.g. a trailing
        # empty line) so no empty k-mer reaches Solution.
        inputSeqList = [line.strip() for line in myfile if line.strip()]

    solution = Solution(inputSeqList)
    solutionString = solution.seq

    with open(outfile, "w") as myfile:
        myfile.write(solutionString + '\n')
    print(solutionString)

In [24]:
# Entry point: run main() on the Rosalind BA3B dataset file and write the
# reconstructed string to output-problem9.txt. Both paths are relative to the
# current working directory; an empty option list is passed as inCL.
if __name__ == "__main__":
    main(infile='data/rosalind_ba3b.txt',outfile='output-problem9.txt', inCL=[])

ACCAGACGGGAACCGCGGTTCCAAACCGGTTGCCCGCTCAAGTTTGGATCTATATACCTAGCCTCTGTTTATTAAAAACTAACCACCAGATAGTTTCTCTAAGGCGGTACAGGAGGCCGTTGCTATATTAATGTAGGATGGTATCATAACAGGACATCTTTATAGGAAAGGTCGGTTCTTATGGGAGATCACCGATTTCGCCGCGCCGTGCCAAGGCTCATGCTTGCGTAAACTGACTTCAAGCGTAGGAGACACTGGTGTCACCCCTGTCAATATCGTTACGAAAGCTGCAAAGCGAACAAGGGTCCTGTCCCCAACGCACCGCGAATATATACCCATCGTATGACCGAGGAGGCGTGCCAAGTAGAAGTATGCGAGTTCGGGCTTAAGATGCGCTACGATAATATAAAGATCATGCCTAGTTGAGCCTATTACCTCAAGGGCAAGCCTGCGGCGCAACTATCGTCATCTAAGCGCGTTGCTTTTTATCTCTGATACCTTAACATCCACGGCAGCGCAGCTATTTCGTGAACTATAACAAATACTCAATTCGTTCGTACAGCACGTCAGGAAGTCGGGTTACATGATAACGCCCCCAAGCCTACTAATCCCCCGACAAGTTTGACTTCCCCACGATCGTTTCGTGTTTTGTACACTTGGAGTCGCGGTAACCTGAGTGGACCACAAGTTAGTGGACGACAGGCCATCGGTAGAGGACCTGGCTGCCGGATACTATTAATAGGGATCTTTACGATATCGAAGGCCAAACACAGTATGGCTGGGGAGTTTTCGAGACCGGGGCCGCTAGATCGAAAATAGTTACACGACAATCGGACGCTCAGATCAGCACATGGAAGGGCGAAGTCGTCAAGCATTTGGAATACCATAGAAAAGTTGAGGTCACTCAGGGTAATACGGGCCCTGAACTAGAACCTATAGCTACAATATGGGAGCTTGCAGCAAAATTCCAGGCGGCATCGGCATTGTAATCTTTGCGAGC