# Header and Library Imports

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Sun Nov  7 08:52:26 2021

@author: richa
"""

'\nCreated on Sun Nov  7 08:52:26 2021\n\n@author: richa\n'

# Substrings Class
is designed to accept a DNA sequence and desired polypeptide sequence. Finds all instances of subsequences in the DNA sequence that either encodes for the desired polypeptide sequence or its reverse complement encodes for the desired polypeptide sequence.

There are 2 attributes:
- seq - DNA sequence to search through (given)
- pep - Desired polypeptide to find in DNA sequence (given)

There is 1 method:
- findSubstrings - finds all instances of subsequences in the DNA sequence that either encodes for the desired polypeptide sequence or its reverse complement does

In [2]:
class Substrings :
    '''
    Find all instances of DNA subsequences that encode for the desired 
    polypeptide or its reverse complement encodes for the desired polypeptide
    
    Attributes:
        seq - DNA sequence to search through
        pep - desired polypeptide sequence
        
    Methods:
        findSubstrings - find all instances of DNA subsequences in given 
                        sequence that either encodes the desired polypeptide 
                        sequence or its reverse complement does
    '''
    # Codons to amino acid table
    AA = {'AAA': 'K', 'AAC': 'N', 'AAG': 'K', 'AAT': 'N', 'ACA': 'T',
          'ACC': 'T', 'ACG': 'T', 'ACT': 'T', 'AGA': 'R', 'AGC': 'S',
          'AGG': 'R', 'AGT': 'S', 'ATA': 'I', 'ATC': 'I', 'ATG': 'M',
          'ATT': 'I', 'CAA': 'Q', 'CAC': 'H', 'CAG': 'Q', 'CAT': 'H',
          'CCA': 'P', 'CCC': 'P', 'CCG': 'P', 'CCT': 'P', 'CGA': 'R',
          'CGC': 'R', 'CGG': 'R', 'CGT': 'R', 'CTA': 'L', 'CTC': 'L',
          'CTG': 'L', 'CTT': 'L', 'GAA': 'E', 'GAC': 'D', 'GAG': 'E',
          'GAT': 'D', 'GCA': 'A', 'GCC': 'A', 'GCG': 'A', 'GCT': 'A',
          'GGA': 'G', 'GGC': 'G', 'GGG': 'G', 'GGT': 'G', 'GTA': 'V',
          'GTC': 'V', 'GTG': 'V', 'GTT': 'V', 'TAA': 'STOP', 'TAC': 'Y',
          'TAG': 'STOP', 'TAT': 'Y', 'TCA': 'S', 'TCC': 'S', 'TCG': 'S',
          'TCT': 'S', 'TGA': 'STOP', 'TGC': 'C', 'TGG': 'W', 'TGT': 'C',
          'TTA': 'L', 'TTC': 'F', 'TTG': 'L', 'TTT': 'F'}
    # Translation table for reverse complements
    translate = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
    
    def __init__(self, seq, pep):
        ''' 
        Build a Substrings object from a given DNA sequence and desired
        polypeptide sequence.
        '''
        self.seq = seq  # DNA sequence
        self.pep = pep  # Desired polypeptide
        
    def findSubstrings(self):
        '''
        Find all subsequences in the DNA sequence that encodes for the desired
        polypeptide or its reverse complement does.
        '''
        # List of codon makeups for all possible substrings on forward strand
        fStrand = []
        # List of codon makeups for all possible substrings on reverse strand
        rStrand = []
        # List of DNA subsequence that encode desired polypeptide of the reverse complement
        inSeq = []
        # Length of DNA subsequence for desired polypeptide
        sLen = 3 * len(self.pep)
        
        # Loop through possible start positions for DNA subsequence
        for s in range(len(self.seq) - sLen + 1):
            # DNA subsequence
            cod = self.seq[s: s + sLen]
            # Reverse complement of DNA subsequence
            rCod = ''.join([Substrings.translate[b] for b in cod])[::-1]
            # Append list of codon makeup in forward strand DNA subsequence
            fStrand.append([cod[c: c + 3] for c in range(0, sLen, 3)])
            # Append ist of codon makeup in reverse strand DNA subsequence
            rStrand.append([rCod[c: c + 3] for c in range(0, sLen, 3)])
            
        for c in range(len(fStrand)):
            # Polypeptide translation of forward strand DNA subsequence
            p = ''.join([Substrings.AA[s] for s in fStrand[c]])
            # Polypeptide translation of reverse strand DNA subsequence
            r = ''.join([Substrings.AA[s] for s in rStrand[c]])
            if p == self.pep:
                inSeq.append(''.join(fStrand[c]))
            if r == self.pep:
                inSeq.append(''.join(rStrand[c]))
                
        return inSeq

# Main
- Get DNA sequence and desired polypeptide sequence
- Instantiation of the Substrings object for the DNA sequence and desired polypeptide sequence
- Prints all instances of subsequences in the DNA sequence that either encodes for the desired polypeptide sequence or its reverse complement encodes for the desired polypeptide sequence

In [3]:
def main(inFile = None):
    '''
    Get the DNA sequence and desired polypeptide string from the input file and
    find all instances of DNA subsequences that either encode the desired
    polypeptide or its reverse complement encodes the desired polypeptide.
    '''
    with open(inFile) as inFile:
        # Get DNA sequence
        seq = inFile.readline().rstrip()
        # Get desired polypeptide
        pep = inFile.readline().rstrip()
                
    mySubstrings = Substrings(seq, pep)
    myFind = mySubstrings.findSubstrings()
    for s in myFind:
        print(s)

if __name__ == "__main__":
    main(inFile = 'rosalind_ba4b.txt')

CTGTGTGACATTCAGTGTCGCCCCCCCCGG
TTATGTGATATACAATGTCGGCCACCTCGG
CTTTGTGATATTCAGTGTCGTCCACCGCGA
CTTTGCGACATTCAATGCCGACCCCCGCGC
CTCTGCGACATCCAGTGTCGTCCACCTCGC
CTGTGCGACATTCAGTGCCGCCCTCCACGT
CTTTGCGACATTCAATGCCGACCACCGCGA
CTCTGCGATATTCAATGCCGTCCCCCACGG
TTGTGTGATATCCAATGTCGGCCTCCGCGC
CTCTGTGATATACAGTGCCGCCCGCCTCGG
TTGTGCGATATCCAGTGTAGGCCACCGCGA
CTATGTGACATACAGTGCCGGCCTCCACGG
CTCTGCGACATCCAATGTAGGCCACCTCGA
CTATGTGACATTCAGTGTCGGCCCCCTAGA
CTCTGTGACATACAGTGTAGGCCACCTAGG
CTTTGCGACATACAGTGCAGACCGCCACGT
CTTTGTGACATACAGTGTAGACCCCCACGA
CTCTGCGATATCCAATGCCGCCCTCCAAGA
CTCTGTGACATACAATGTCGTCCCCCGCGC


# Inspections
Team: 

Comments:

Corrections: