
## Part 1 - Sequence Class


In [185]:
from functools import total_ordering
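# total_ordering fills in whichever of le, gt, ge a class is missing, deriving
# them from the overridden eq and lt methods.
# Note: str already defines le, gt and ge, so for a str subclass the decorator
# finds nothing left to add and those operators keep str's own behaviour.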

@total_ordering
class Sequence(str):
    """A nucleotide sequence that can be used like a string.

    The text is held both as the underlying ``str`` value and as the ``seq``
    attribute; every method defined here reads ``seq``.

    ``str`` already supplies ``__le__``, ``__gt__``, ``__ge__`` and ``__ne__``,
    so ``total_ordering`` has nothing to fill in for this class. They are
    therefore written out explicitly below so that all comparisons look at
    ``seq`` in the same way.
    """

    # characters accepted by valid_base (DNA and RNA bases)
    _VALID_BASES = "ACTGU"

    # one-letter amino acid -> DNA codons that encode it ('*' holds the stop codons)
    _AMINO_ACID_CODONS = {
        'M': ['ATG'], 'F': ['TTT', 'TTC'], 'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'C': ['TGT', 'TGC'], 'Y': ['TAC', 'TAT'], 'W': ['TGG'], 'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'H': ['CAT', 'CAC'], 'Q': ['CAA', 'CAG'], 'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
        'I': ['ATT', 'ATC', 'ATA'], 'T': ['ACT', 'ACC', 'ACA', 'ACG'], 'N': ['AAT', 'AAC'],
        'K': ['AAA', 'AAG'], 'S': ['AGT', 'AGC', 'TCT', 'TCC', 'TCA', 'TCG'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'], 'A': ['GCT', 'GCC', 'GCA', 'GCG'], 'D': ['GAT', 'GAC'],
        'E': ['GAA', 'GAG'], 'G': ['GGT', 'GGC', 'GGA', 'GGG'], '*': ['TAA', 'TAG', 'TGA'],
    }

    # reverse lookup (codon -> amino acid), built once instead of scanning the
    # whole table for every codon
    _CODON_TO_AMINO_ACID = {
        codon: amino_acid
        for amino_acid, codons in _AMINO_ACID_CODONS.items()
        for codon in codons
    }

    #object method, self is required to be first argument
    def __init__(self, seq):
        """Store ``seq`` as the ``seq`` attribute.

        ``str.__new__`` has already received the same argument, so the
        underlying string value is built from ``seq`` as well.
        """
        self.seq = seq

    @staticmethod
    def _text_of(other):
        """Return the text to compare/concatenate with, or None if unsupported."""
        if isinstance(other, Sequence):
            return other.seq
        if isinstance(other, str):
            return other
        return None

    def _amino_acids(self):
        """Yield the amino acid letter for each recognised codon, in order.

        The sequence is read in steps of three from index 0; chunks that are
        not in the codon table (including a short trailing chunk) are skipped.
        """
        bases = self.seq
        for start in range(0, len(bases), 3):
            amino_acid = self._CODON_TO_AMINO_ACID.get(bases[start:start + 3])
            if amino_acid is not None:
                yield amino_acid

    #overriding
    # informal conversion to a string
    def __str__(self):
        return self.seq

    # formal conversion to a string
    def __repr__(self):
        return f'Sequence ({self.seq})'

    #overriding __eq__ to define criteria: two sequences are equal when their text is equal
    def __eq__(self, other):
        text = self._text_of(other)
        if text is None:
            # let Python try the other operand instead of raising AttributeError
            return NotImplemented
        return self.seq == text

    def __ne__(self, other):
        result = self.__eq__(other)
        return result if result is NotImplemented else not result

    # overriding __eq__ disables the inherited hash; restore one that agrees with __eq__
    def __hash__(self):
        return hash(self.seq)

    # ordering compares the sequence text (lexicographic), not the length
    def __lt__(self, other):
        text = self._text_of(other)
        if text is None:
            return NotImplemented
        return self.seq < text

    def __le__(self, other):
        text = self._text_of(other)
        if text is None:
            return NotImplemented
        return self.seq <= text

    def __gt__(self, other):
        text = self._text_of(other)
        if text is None:
            return NotImplemented
        return self.seq > text

    def __ge__(self, other):
        text = self._text_of(other)
        if text is None:
            return NotImplemented
        return self.seq >= text

    def __len__(self):
        return len(self.seq)

    def __add__(self, other):
        """Concatenate with another Sequence or a plain string; returns a Sequence."""
        text = self._text_of(other)
        if text is None:
            return NotImplemented
        return Sequence(self.seq + text)

    def valid_base(self):
        """Return True when the sequence is non-empty and every base is one of ACTGU.

        An empty sequence returns False.
        """
        bases = self.seq
        # every base has to be inspected, not just the first one
        return len(bases) > 0 and all(base in self._VALID_BASES for base in bases)

    def gcCount(self):
        """Print how many G bases and how many C bases the sequence contains."""
        print('G base count is ' + str(self.seq.count("G")) + ' and C base count is ' + str(self.seq.count("C")))

    def dna_to_protein_count(self):
        """Return a dict mapping each amino acid letter to its number of occurrences.

        Keys appear in the order in which the amino acids are first encountered.
        """
        counts = {}
        for amino_acid in self._amino_acids():
            counts[amino_acid] = counts.get(amino_acid, 0) + 1
        return counts

    def dna_to_protein(self):
        """Return the translated amino acid letters as one string."""
        return ''.join(self._amino_acids())
       


The cell above defines a class called Sequence; the cell below tests its methods. Sample base sequences are assigned to variables, and those variables are then used to create Sequence objects.

In [186]:
# Test cell for the Sequence class: each check below exercises one of its methods
A, B = "AGGGTC", "AGGGTTCCTTCA"
C, D = "GGCCTTTTAAATTTC", "CCTTTAAAAGGTTCTCTG"

header_A, header_B = ">seq1A", ">seq1B"
header_C, header_D = ">seq1C", ">seq1D"

# wrap the raw strings in Sequence objects
seqA, seqB = Sequence(A), Sequence(B)
seqC, seqD = Sequence(C), Sequence(D)

label_A, label_B = Sequence(header_A), Sequence(header_B)
label_C, label_D = Sequence(header_C), Sequence(header_D)

# informal (str) conversion, then formal (repr) conversion
print(seqA)
print(repr(seqD))
# seqA and seqB hold different text, so the equality check should be False
print(seqA == seqB)
# < compares the sequence text, not the length: "GGCC..." does not sort
# before "AGGG...", so this should be False
print(seqC < seqB)

# number of bases in each Sequence object
print(len(seqA), len(seqD), sep="\n")

# concatenation of two Sequence objects
print(seqA + seqB)
print(label_A + seqA)
print(str(label_B), str(seqB))

# prints the G and C base counts of seqA
seqA.gcCount()

# confirms that seqA is an instance of the Sequence class
isinstance(seqA, Sequence)


AGGGTC
Sequence (CCTTTAAAAGGTTCTCTG)
False
False
6
18
AGGGTCAGGGTTCCTTCA
>seq1AAGGGTC
>seq1B AGGGTTCCTTCA
G base count is 3 and C base count is 1


True

In [187]:
# run the base-validity check on seqA; the returned value is shown as the cell output
seqA.valid_base()

True

In [191]:
# tally the amino acids encoded by seqD's codons (returns a dict of letter -> count)
seqD.dna_to_protein_count()

{'P': 1, 'L': 2, 'K': 1, 'G': 1, 'S': 1}

In [190]:
# translate seqD codon by codon into a string of one-letter amino acid codes
seqD.dna_to_protein()

'PLKGSL'

## Part 2 - SequenceRecord Class


In [34]:
class SequenceRecord:
    """Pairs a header with the sequence it labels."""

    def __init__(self, header, seq):
        # both constructor arguments are kept as instance attributes
        self.seq = seq
        self.header = header

    def __repr__(self):
        # formal conversion to a string: shows the header and the sequence
        return f'SequenceRecord: ({self.header},{self.seq})'

    def __str__(self):
        # informal conversion to a string: just the sequence
        return self.seq


The cell below tests the SequenceRecord class defined above.

In [35]:
# build a record from the Sequence object seqA (created in an earlier cell) and a header
seqRecord = SequenceRecord(header=">seq1", seq=seqA)
# formal conversion to a string
print(repr(seqRecord))

# header is exactly a str, whereas seq is a Sequence (a str subclass),
# so the first check prints True and the second prints False
print(type(seqRecord.header) is str)
print(type(seqRecord.seq) is str)




SequenceRecord: (>seq1,AGGGTTCCCA)
True
False


## Part 3 - Parsing using classes

In [36]:
def fastaParser(path="sample.fa"):
    """Generate ``(header, sequence)`` tuples from a FASTA file.

    Parameters
    ----------
    path : str, optional
        File to read. Defaults to ``"sample.fa"``, the file the parser
        originally had hard-coded.

    Yields
    ------
    tuple of (str, str)
        The header with its leading ">" removed, and the record's sequence
        lines joined into one string. An empty file yields nothing. Sequence
        text that appears before the first header is yielded as a record
        with an empty header.
    """
    with open(path) as fh:
        # None means "no header seen yet"; an empty string is a legitimate
        # (blank) header, so it cannot double as the sentinel
        header = None
        chunks = []
        #looks through the file line by line
        for line in fh:
            line = line.strip()
            if not line:
                # blank lines carry no data
                continue

            if line.startswith(">"):
                # a new header closes the record collected so far, if any
                if header is not None or chunks:
                    yield (header or "", "".join(chunks))
                header = line.lstrip(">")
                chunks = []
            else:
                #line is a sequence line
                chunks.append(line)

        # the last record has no following ">" to trigger the yield above,
        # so it is emitted here
        if header is not None or chunks:
            yield (header or "", "".join(chunks))
               

# wrap each parsed (header, sequence) pair in a SequenceRecord and print its formal representation
for header, seq in fastaParser():
    print(f"{SequenceRecord(header, seq)!r}")
    


SequenceRecord: (seq1A,ATGTGCATTCAAGCTGTAAGAAGCCACGATTGGCTATGTGTGCTTTGGTCTCTTTGAAGATGGAT)
SequenceRecord: (seq2A,ATGACCGTGCCACCTATCAAGCTTGCAAAAGGCATTATCACCGTCTCAGATTGTGGGGTTAAATACGAGTACATGGTTAAAGACATTGGA)
SequenceRecord: (seq3A,ATGGGAGCGCTTCAAACACTTGGTCCCATGCTTTTGTCCAACCCTACCCTGCCTTCAAATCGTTTTACTAATGGGGAAGCAAAGCTTGGTTTGGTATACAGTAAACAGCATATCT)


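In the cell above, a FASTA parser is written as a generator. It reads the sample file (`sample.fa`) line by line and yields one (header, sequence) pair per record, with the sequence name separated from the base sequence. Each pair is then wrapped in a SequenceRecord and printed.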