In [80]:
class Encoding:
    '''
    Finds every substring of a nucleotide seq that encodes a given amino
    acid seq (peptide), on either strand.

    The input seq and its reverse complement (revSeq) are both scanned with a
    sliding window that is (amino acid seq length) * 3 nucleotides long. The
    window advances one nucleotide at a time, so all 3 reading frames of each
    strand (6 in total) are covered. Each window is translated codon by codon
    and, if the translation equals aaSeq, the window is saved. Matches found
    on the reverse strand are reverse complemented again before being saved,
    so every saved seq is a substring of the original input seq.

    Attributes set by __init__:
        aaSeq       - the peptide to search for
        seq         - the nucleotide seq given as input
        revSeq      - reverse complement of seq
        frLen       - window length in nucleotides (3 * len(aaSeq))
        matchedSeqs - list of matching substrings, forward strand matches
                      first, then reverse strand matches

    ex input
    ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA
    MA
    '''
    # Nucleotide to Amino Acid dictionary came from Geeks for Geeks
    aa = {
        'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
        'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
        'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
        'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
        'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
        'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
        'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
        'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
        'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
        'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
        'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
        'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
        'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
        'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
        'TAC':'Y', 'TAT':'Y', 'TAA':'STOP', 'TAG':'STOP',
        'TGC':'C', 'TGT':'C', 'TGA':'STOP', 'TGG':'W',
    }
    # Dictionary for finding nucleotide complements
    nucComp = {'A': 'T', 'C': 'G', 'T': 'A', 'G': 'C'}

    def __init__(self, nucSeq, aaSeq):
        '''
        Stores the inputs, derives the reverse complement and window length,
        and immediately runs the search (result is kept in matchedSeqs).

        Raises KeyError if nucSeq contains a character other than A, C, G, T.
        '''
        self.aaSeq = aaSeq
        self.seq = nucSeq
        self.revSeq = self.revComp(self.seq)
        self.frLen = 3 * len(self.aaSeq)
        self.matchedSeqs = self.findAAencodings()

    def revComp(self, seq):
        '''
        Finds the reverse complement of a nuc seq

        Raises KeyError if seq contains a character other than A, C, G, T.
        '''
        return ''.join(self.nucComp[nuc] for nuc in reversed(seq))

    def findSubSeqs(self, seq):
        '''
        Given a seq, findSubSeqs returns a list of every window of length
        frLen (num of nucs it takes to code for aaSeq), one window per start
        index. Returns an empty list when seq is shorter than frLen.
        '''
        return [seq[k : self.frLen + k] for k in range(len(seq) - self.frLen + 1)]

    def _translate(self, subSeq):
        '''
        Translates subSeq codon by codon using the aa dictionary. Stop codons
        translate to the text 'STOP', so a window containing one can never
        equal a peptide of the same codon count.
        '''
        return ''.join(self.aa[subSeq[i : i + 3]] for i in range(0, len(subSeq), 3))

    def findAAencodings(self):
        '''
        Translates every frLen long window of the forward and reverse strands
        and returns the windows whose translation equals aaSeq. Forward strand
        matches come first; reverse strand matches follow and are reverse
        complemented so they read as substrings of the input seq.

        An empty aaSeq yields no matches.
        '''
        # A zero length window would "match" at every index; report nothing instead.
        if not self.aaSeq:
            return []
        matchedSeqs = [
            subSeq for subSeq in self.findSubSeqs(self.seq)
            if self._translate(subSeq) == self.aaSeq
        ]
        matchedSeqs.extend(
            self.revComp(subSeq) for subSeq in self.findSubSeqs(self.revSeq)
            if self._translate(subSeq) == self.aaSeq
        )
        return matchedSeqs

In [83]:
def main(infile, outfile='', inCL=None):
    '''
    Reads a nucleotide seq (first line) and a peptide (second line) from
    infile, builds an Encoding from them, and prints each matched seq on its
    own line.

    outfile and inCL are accepted but not used.
    '''
    with open(infile, 'r') as handle:
        nucLine = handle.readline().rstrip()
        pepLine = handle.readline().rstrip()
    matches = Encoding(nucLine, pepLine).matchedSeqs
    for match in matches:
        print(match)
            

# Script entry point: run main on the Rosalind BA4B data file.
if __name__ == "__main__":
    main("data/rosalind_ba4b.txt")
# Alternative input file, currently disabled:
#     main("data/ex-data.txt")

GATTTCTGGAAATTTTTCATCTGG
GACTTTTGGAAGTTTTTTATCTGG
GATTTCTGGAAGTTCTTCATATGG
GACTTTTGGAAATTTTTCATCTGG
GATTTTTGGAAATTCTTTATATGG
GACTTCTGGAAATTCTTTATATGG
CCAAATAAAGAATTTCCAGAAGTC
CCAGATGAAAAATTTCCAAAAGTC
CCATATAAAGAATTTCCAAAAGTC
CCAGATGAAAAACTTCCAGAAATC
CCAAATAAAAAATTTCCAAAAATC
CCAGATGAAAAACTTCCAGAAGTC
CCAAATGAAAAATTTCCAAAAGTC
CCATATAAAAAATTTCCAGAAGTC
CCATATGAAAAACTTCCAAAAATC
CCATATAAAGAATTTCCAGAAATC
CCAGATAAAGAACTTCCAAAAGTC
CCAAATAAAGAACTTCCAAAAGTC
CCATATAAAAAATTTCCAAAAGTC
CCATATGAAAAACTTCCAAAAGTC
