Name: Lydia Holley

Email: lholley4@uncc.edu

Your classes should, at minimum:
- have a __repr__ and a __str__ that provide a meaningful representation as a string
- check that the bases or amino acids in the string are valid
- work as the argument for a SequenceRecord

DNA
- a translate method that will convert the DNA sequence and return a ProteinSequence object
- one other method of your choice (what you did previously is fine)

Protein
- a method of your choice. In this case, if the method you would implement is too complex to reasonably implement or would use resources you don't have access to, it is okay to leave it as what is called a stub method (has only one line, "pass") and explain in comments what this method would do and its purpose
 
Here is a dictionary you can copy into your code to help facilitate DNA translation:
aa_dict = {'M':['ATG'], 'F':['TTT', 'TTC'], 'L':['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'], 'C':['TGT', 'TGC'], 'Y':['TAC', 'TAT'], 'W':['TGG'], 'P':['CCT', 'CCC', 'CCA', 'CCG'], 'H':['CAT', 'CAC'], 'Q':['CAA', 'CAG'], 'R':['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'], 'I':['ATT', 'ATC', 'ATA'], 'T':['ACT', 'ACC', 'ACA', 'ACG'], 'N':['AAT', 'AAC'], 'K':['AAA', 'AAG'], 'S':['AGT', 'AGC', 'TCT', 'TCC', 'TCA', 'TCG'], 'V':['GTT', 'GTC', 'GTA', 'GTG'], 'A':['GCT', 'GCC', 'GCA', 'GCG'], 'D':['GAT', 'GAC'], 'E':['GAA', 'GAG'], 'G':['GGT', 'GGC', 'GGA', 'GGG'], '*':['TAA','TAG','TGA']}

In [225]:
#Sequence class
from functools import total_ordering

@total_ordering
class Sequence:
    """Generic biological sequence stored as a validated string.

    The base class accepts the one-letter amino-acid codes plus ``*`` (stop).
    Equality and addition are only defined between objects of exactly the
    same class; ordering compares sequence lengths.

    Attributes:
        seq: the validated sequence string.
    """

    # Characters accepted by the base class (amino-acid letters and '*').
    _VALID_CHARS = "MFLCYWPHQRITNKSVADEG*"

    def __init__(self, seq):
        """Validate and store ``seq``.

        Raises:
            TypeError: if ``seq`` is not a string.
            ValueError: if ``seq`` contains a character outside the accepted
                alphabet; the message gives the 1-based position of the
                first offending character.
        """
        if not isinstance(seq, str):
            raise TypeError(
                f'sequence must be a str, not {type(seq).__name__}'
            )
        for position, char in enumerate(seq, start=1):
            if char not in self._VALID_CHARS:
                raise ValueError(f'Error at position {position}')
        self.seq = seq

    def __str__(self):
        """Return the raw sequence string."""
        return self.seq

    def __repr__(self):
        """Return a labelled representation of the sequence."""
        return f'Sequence: {self.seq}'

    def __eq__(self, other):
        """Two sequences are equal only if they share class and content."""
        if not isinstance(other, Sequence):
            # Let Python try the reflected comparison; ``==`` ends up False.
            return NotImplemented
        # Exact type match: a DNAseq never equals a plain Sequence or a
        # ProteinSeq, even when the letters are identical.
        return type(self) is type(other) and self.seq == other.seq

    def __hash__(self):
        """Hash consistently with ``__eq__`` (class and content)."""
        return hash((type(self), self.seq))

    def __add__(self, other):
        """Concatenate two sequences of exactly the same class.

        Returns:
            A new object of the operands' class holding the concatenation.
            ``False`` when the operands cannot be added: a message is
            printed when both are sequences of different classes, and
            nothing is printed when ``other`` is not a sequence at all.
        """
        if not isinstance(other, Sequence):
            return False
        if type(self) is not type(other):
            # The check is symmetric, so subclass + base is rejected just
            # like base + subclass.
            print("cannot add sequences of different classes")
            return False
        return type(self)(self.seq + other.seq)

    def __lt__(self, other):
        """Order sequences by length (shorter sorts first)."""
        if not isinstance(other, Sequence):
            return NotImplemented
        return len(self.seq) < len(other.seq)

    def __len__(self):
        """Return the number of characters in the sequence."""
        return len(self.seq)
    

In [226]:
#DNA sequence class

@total_ordering
class DNAseq(Sequence):
    """DNA sequence restricted to the bases A, T, C and G."""

    # Bases accepted by this class.
    _BASES = "ATCG"

    # Amino acid (one-letter code, '*' for the stop codons) -> its codons.
    _AA_TO_CODONS = {
        'M': ['ATG'],
        'F': ['TTT', 'TTC'],
        'L': ['TTA', 'TTG', 'CTT', 'CTC', 'CTA', 'CTG'],
        'C': ['TGT', 'TGC'],
        'Y': ['TAC', 'TAT'],
        'W': ['TGG'],
        'P': ['CCT', 'CCC', 'CCA', 'CCG'],
        'H': ['CAT', 'CAC'],
        'Q': ['CAA', 'CAG'],
        'R': ['CGT', 'CGC', 'CGA', 'CGG', 'AGA', 'AGG'],
        'I': ['ATT', 'ATC', 'ATA'],
        'T': ['ACT', 'ACC', 'ACA', 'ACG'],
        'N': ['AAT', 'AAC'],
        'K': ['AAA', 'AAG'],
        'S': ['AGT', 'AGC', 'TCT', 'TCC', 'TCA', 'TCG'],
        'V': ['GTT', 'GTC', 'GTA', 'GTG'],
        'A': ['GCT', 'GCC', 'GCA', 'GCG'],
        'D': ['GAT', 'GAC'],
        'E': ['GAA', 'GAG'],
        'G': ['GGT', 'GGC', 'GGA', 'GGG'],
        '*': ['TAA', 'TAG', 'TGA'],
    }

    # Inverted table, built once, so each codon is a single dict lookup.
    _CODON_TO_AA = {
        codon: aa
        for aa, codons in _AA_TO_CODONS.items()
        for codon in codons
    }

    def __init__(self, seq):
        """Validate that ``seq`` contains only A, T, C and G, then store it.

        Raises:
            ValueError: if a character is not a valid base; the message
                gives the 1-based position of the first offending character.
        """
        for position, base in enumerate(seq, start=1):
            if base not in self._BASES:
                raise ValueError(f'Invalid base at position {position}')
        super().__init__(seq)

    def __repr__(self):
        """Return a labelled representation of the DNA sequence."""
        return f'DNA Sequence: {self.seq}'

    def translate_to_protein(self):
        """Translate the DNA sequence into a ``ProteinSeq``.

        Codons are read in frame starting at the first base. Stop codons
        are emitted as ``*`` and translation continues past them. One or
        two trailing bases that do not complete a codon are ignored.
        """
        codon_to_aa = self._CODON_TO_AA
        residues = []
        for start in range(0, len(self.seq), 3):
            aa = codon_to_aa.get(self.seq[start:start + 3])
            if aa is not None:  # None only for an incomplete trailing codon
                residues.append(aa)
        return ProteinSeq("".join(residues))

    def gc_content(self):
        """Return the proportion of bases that are G or C.

        Returns 0.0 for an empty sequence instead of dividing by zero.
        """
        if not self.seq:
            return 0.0
        gc_count = self.seq.count("G") + self.seq.count("C")
        return gc_count / len(self.seq)

    # Original method name, kept so existing callers continue to work.
    __GC_content__ = gc_content

In [227]:
# Sanity check for DNAseq.translate_to_protein: one output line per sequence.
seq2 = DNAseq("TTCATGCTT")  # codons: TTC ATG CTT
seqL = DNAseq("TTATACGACATCGCC")  # codons: TTA TAC GAC ATC GCC
print(seq2.translate_to_protein(), seqL.translate_to_protein(), sep="\n")  # FML, then LYDIA


FML
LYDIA


In [228]:
#Protein Sequence Class

@total_ordering
class ProteinSeq(Sequence):
    """Protein sequence of one-letter amino-acid codes (``*`` marks a stop)."""

    # Characters accepted by this class.
    _AMINO_ACIDS = "MFLCYWPHQRITNKSVADEG*"

    def __init__(self, seq):
        """Validate that ``seq`` contains only accepted codes, then store it.

        Raises:
            ValueError: if a character is not an accepted code; the message
                gives the 1-based position of the first offending character.
        """
        for position, residue in enumerate(seq, start=1):
            if residue not in self._AMINO_ACIDS:
                raise ValueError(f'Invalid Amino Acid at position {position}')
        super().__init__(seq)

    def __repr__(self):
        """Return a labelled representation of the protein sequence."""
        return f'Protein Sequence: {self.seq}'

    def highest_aa_content(self):
        """Return the most frequent amino acid(s) in the sequence.

        Returns:
            A list of the codes tied for the highest count, in order of
            first appearance. An empty sequence yields an empty list
            instead of raising.
        """
        counts = {}
        for residue in self.seq:
            counts[residue] = counts.get(residue, 0) + 1
        if not counts:
            return []
        top = max(counts.values())
        # dicts keep insertion order, so ties come out in first-seen order.
        return [residue for residue, count in counts.items() if count == top]

    # Original method name, kept so existing callers continue to work.
    __highest_AA_content__ = highest_aa_content

In [229]:
# Demo cell: build one plain string and several sequence objects, then
# exercise printing, equality, type identity, addition and the two
# class-specific methods. Trailing comments give the expected output.
seq = "ATCGGCATGCATGCCTCGTAGCGTA"
seq1 = Sequence("ATCGGCATGCATGCCTCGTAGCGTA")
seq2 = DNAseq("GTTCATGATGTACATGAATGATAC")
seq3 = DNAseq("ATCGGCATGCATGCCTCGTAGCGTA")
seq4 = ProteinSeq("NTEAGCYWINKSTHQRVD")
seq6 = DNAseq("ATCGGCATGCATGCCTCGTAGCGTA")
seq7 = ProteinSeq("NWTEIWNKSTWAGCYWITEIWNNKSTHWQRYWINTEINNKKWVD")

# __str__ returns the raw sequence, so each object prints as its letters.
print(seq)  # ATCGGCATGCATGCCTCGTAGCGTA
print(seq1) # ATCGGCATGCATGCCTCGTAGCGTA
print(seq2) # GTTCATGATGTACATGAATGATAC
print(seq3) # ATCGGCATGCATGCCTCGTAGCGTA
print(seq4) # NTEAGCYWINKSTHQRVD

print("\n") # separator; __eq__ checks follow (equal only for same class AND same letters)
print(seq == seq1)  # False: a plain str is never equal to a Sequence
print(seq1 == seq2) # False: different classes and different letters
print(seq1 == seq3) # False: same letters, but Sequence vs DNAseq
print(seq2 == seq3) # False: both DNAseq, different letters
print(seq1 == seq4) # False: Sequence vs ProteinSeq
print(seq2 == seq4) # False: DNAseq vs ProteinSeq
print(seq3 == seq6) # True: both DNAseq with identical letters

print("\n") # separator; confirm each variable has the intended type
print(type(seq))  # <class 'str'>
print(type(seq1)) # <class '__main__.Sequence'>
print(type(seq2)) # <class '__main__.DNAseq'>
print(type(seq3)) # <class '__main__.DNAseq'>
print(type(seq4)) # <class '__main__.ProteinSeq'>

print("\n") # separator; __add__ checks follow (calling __add__ directly and using + are equivalent here)
print(seq2.__add__(seq3)) # GTTCATGATGTACATGAATGATACATCGGCATGCATGCCTCGTAGCGTA
print(seq2)  # GTTCATGATGTACATGAATGATAC (the left operand is not modified)
print(seq2.__add__(seq4)) # prints "cannot add sequences of different classes", then False
print(seq2 + seq4) # prints "cannot add sequences of different classes", then False
print(seq4 + seq7) # NTEAGCYWINKSTHQRVDNWTEIWNKSTWAGCYWITEIWNNKSTHWQRYWINTEINNKKWVD

print("\n") # separator; __GC_content__ returns the fraction of bases that are G or C
print(seq2.__GC_content__()) # 0.3333333333333333
print(seq3.__GC_content__()) # 0.56
print(seq6.__GC_content__()) # 0.56

print("\n") # separator; __highest_AA_content__ returns a list of the most frequent amino acid(s)
print(seq4.__highest_AA_content__()) # ['N', 'T'] (tie)
print(seq7.__highest_AA_content__()) # ['W']

ATCGGCATGCATGCCTCGTAGCGTA
ATCGGCATGCATGCCTCGTAGCGTA
GTTCATGATGTACATGAATGATAC
ATCGGCATGCATGCCTCGTAGCGTA
NTEAGCYWINKSTHQRVD


False
False
False
False
False
False
True


<class 'str'>
<class '__main__.Sequence'>
<class '__main__.DNAseq'>
<class '__main__.DNAseq'>
<class '__main__.ProteinSeq'>


GTTCATGATGTACATGAATGATACATCGGCATGCATGCCTCGTAGCGTA
GTTCATGATGTACATGAATGATAC
cannot add sequences of different classes
False
cannot add sequences of different classes
False
NTEAGCYWINKSTHQRVDNWTEIWNKSTWAGCYWITEIWNNKSTHWQRYWINTEINNKKWVD


0.3333333333333333
0.56
0.56


['N', 'T']
['W']


In [230]:
# Validation check for the base class: "?" is not an accepted character, so
# the constructor raises ValueError naming the 1-based position of the first
# invalid character.
seq70 = Sequence("???") # expected: ValueError: Error at position 1

ValueError: Error at position 1

In [231]:
# Validation check for ProteinSeq: "A" is accepted, the first "?" is not, so
# the constructor raises ValueError naming the 1-based position of that "?".
seq71 = ProteinSeq("A??") # expected: ValueError: Invalid Amino Acid at position 2

ValueError: Invalid Amino Acid at position 2

In [232]:
# Validation check for DNAseq: both "A"s are valid bases, the trailing "?" is
# not, so the constructor raises ValueError naming its 1-based position.
seq72 = DNAseq("AA?") # expected: ValueError: Invalid base at position 3

ValueError: Invalid base at position 3

In [233]:
# SequenceRecord class just to check that the classes work as input
from functools import total_ordering

@total_ordering
class SequenceRecord:
    """A titled record wrapping a sequence.

    Equality compares the wrapped sequences; ordering compares their
    lengths.

    Attributes:
        title: label for the record.
        sequence: the wrapped ``Sequence`` (or subclass) object, or ``None``
            when no usable sequence was supplied.
    """

    def __init__(self, title, *args):
        """Store ``title`` and the sequence found in ``args``.

        A ``Sequence`` instance is stored as-is and a ``str`` is wrapped in
        a ``Sequence``. Arguments of any other type are ignored. When
        several usable arguments are given, the last one wins.
        """
        self.title = title
        # Always define the attribute so later access cannot fail with
        # AttributeError when no usable sequence was passed.
        self.sequence = None
        for seq in args:
            if isinstance(seq, Sequence):
                self.sequence = seq
            elif isinstance(seq, str):
                self.sequence = Sequence(seq)

    def __str__(self):
        """Return the title and the sequence on separate lines."""
        return f'{self.title}\n{self.sequence}'

    def __repr__(self):
        """Return ``title: sequence`` on one line."""
        return f'{self.title}: {self.sequence}'

    def __eq__(self, other):
        """Records are equal when their sequences are equal."""
        if not isinstance(other, SequenceRecord):
            # Comparing with an unrelated object yields False, not an error.
            return NotImplemented
        return bool(self.sequence == other.sequence)

    def __lt__(self, other):
        """Order records by sequence length.

        Raises TypeError if either record holds no sequence.
        """
        if not isinstance(other, SequenceRecord):
            return NotImplemented
        return len(self.sequence) < len(other.sequence)

    def __gt__(self, other):
        """Reverse of ``__lt__``: True when this sequence is longer."""
        if not isinstance(other, SequenceRecord):
            return NotImplemented
        return len(self.sequence) > len(other.sequence)

In [234]:
# Demo cell: confirm that Sequence and DNAseq objects are accepted as the
# sequence argument of SequenceRecord. Trailing comments give the expected
# output.

seq = "ATCGGCATGCATGCCTCGTAGCGTA"
seq1 = Sequence("ATCGGCATGCATGCCTCGTAGCGTA")
seq2 = DNAseq("GTTCATGATGTACATGAATGATAC")
seq3 = DNAseq("ATCGGCATGCATGCCTCGTAGCGTA")
seq4 = ProteinSeq("NTEAGCYWINKSTHQRVD")
seq6 = DNAseq("ATCGGCATGCATGCCTCGTAGCGTA")
seq7 = ProteinSeq("NWTEIWNKSTWAGCYWITEIWNNKSTHWQRYWINTEINNKKWVD")

# Record wrapping a plain Sequence; repr() and __repr__() give the same text.
seqRec_1 = SequenceRecord("Sequence 1", seq1)
print(repr(seqRec_1)) # Sequence 1: ATCGGCATGCATGCCTCGTAGCGTA
print(seqRec_1.__repr__()) # Sequence 1: ATCGGCATGCATGCCTCGTAGCGTA
print(type(seqRec_1)) # <class '__main__.SequenceRecord'>

# Record wrapping a DNAseq (a Sequence subclass).
seqRec_2 = SequenceRecord("Sequence 2", seq2)
print(repr(seqRec_2)) # Sequence 2: GTTCATGATGTACATGAATGATAC
print(seqRec_2.__repr__()) # Sequence 2: GTTCATGATGTACATGAATGATAC
print(type(seqRec_2)) # <class '__main__.SequenceRecord'>

Sequence 1: ATCGGCATGCATGCCTCGTAGCGTA
Sequence 1: ATCGGCATGCATGCCTCGTAGCGTA
<class '__main__.SequenceRecord'>
Sequence 2: GTTCATGATGTACATGAATGATAC
Sequence 2: GTTCATGATGTACATGAATGATAC
<class '__main__.SequenceRecord'>
