In [111]:
class Chromosome:
    code_dict = None
    def __init__(self,chr_id):
        self.chr_id = chr_id    # Chromosome id
        self.chrom = ""         # all mRNA sequences 
        self.genes = {}         # Dict: keys are gene names,values are sequences
        self.amino_acid_seqs={} # Dict: keys are gene names, values are amino acid translation of genes
        self.startCodon = "ATG" # Start codon

    def translate(self, gene, start, stop):
         # extract gene from mRNA sequences
        sequence = self.chrom[start:stop] 
        # Initialize amino_acid_seqs
        self.amino_acid_seqs[gene] = ""
         
        for frame in range(0,6):  # for 6 frames
             #Before reading last 3 frames, reverse the sequence
            if frame==3:         
                sequence = self.reverse(sequence) 
            # record mRNA sequence of the gene in the relevant dict
            self.genes[gene] =  sequence
            # Determine the start codon
            self.find_longest_translation_for_frame(frame,sequence)
    
    def find_longest_translation_for_frame(self,frame,sequence):
        # Helper method to find the longest translation 
        # according to the frame
        
            #initialize temp_amino_acid_seqs as empty string
            temp_amino_acid_seqs = ""
            #Start from 0,1,2 according to the frame
            start = self.find_next_start_codon(frame%3,sequence)        

            for i in range( start , len(sequence)-2, 3): 
                # If the codon read is not end codon 
                if  code_dict[sequence[i:i+3]]!= "*":   
                    # append the translation of the codon to amino acid sequence
                    temp_amino_acid_seqs += code_dict[sequence[i:i+3]] 
                # else (if the codon read is the end codon)
                else:       
                   #  if the new seq is longer save it.
                    if len(temp_amino_acid_seqs)>len( self.amino_acid_seqs[gene]):
                        self.amino_acid_seqs[gene] = temp_amino_acid_seqs
                        temp_amino_acid_seqs=""
                    # Find the next start codon
                    start = self.find_next_start_codon(i,sequence)
                    # continue to search for possible longer translations 
                    
    def find_next_start_codon(self,i,sequence):
        # Helper method to find the next start codon after i
        for j in range(i,(len(sequence)-2),3):
                if sequence[j:j+3] ==self.startCodon:
                    break
        return j
    
    def reverse(self,sequence): 
        # Helper method to reverse the mRNA sequences (for the last 3 frames)
        sequence = sequence[::-1]  
        sequence = sequence.replace("A","t")
        sequence = sequence.replace("T","A")
        sequence = sequence.replace("G","c")
        sequence = sequence.replace("C","G")
        sequence=sequence.upper()
        return sequence
  
    def info(self):        
        # Helper function to print info
        print(f"Chromosome id: {self.chr_id}")
        print("Sequence length:",len(self.chrom))
        for gene in self.genes:
           # print(gene)
            print(self.amino_acid_seqs[gene])

In [112]:
# assing code dictionary to Class static member code_dict for translation 
code_dict = {} 
with open("standard_code.txt","r") as f:
    for line in f.readlines():
        codon, amino_acid = line.split()
        if len(codon)==3: 
            code_dict[codon] = amino_acid
Chromosome.code_dict = code_dict

In [113]:
# Generate objects of type Chromosome class
chromosomes = {}
with open("sequences.fasta") as f:
    for line in f.read().splitlines():
        if line.startswith(">"):
            chr_id = line[1:]
            chromosomes[chr_id] = Chromosome(chr_id)  
        else:
            chromosomes[chr_id].chrom += line

In [114]:
# translate chromosomes
with open("intervals.gff","r") as f:
    for line in f.read().splitlines():
        chr_id, start, stop, gene = line.strip().split()
        if start.isnumeric() and stop.isnumeric():                   # skip header-like lines
            chromosomes[chr_id].translate(gene,int(start),int(stop)) # using gene as the key,add start-stop indices to the chromosome's corresponding dictionary 

In [115]:
#Print
for chr_id in chromosomes:           # For each chromosome in the chromosomes dictionary
    chromosome = chromosomes[chr_id]
    for gene in chromosome.genes:    # For each gene in the chromosome object
        print(">",gene)
        print(chromosome.amino_acid_seqs[gene]) # print translation

> gene0
MMWIYCGFFRTSRMRDWWKLTVDTDDPPELILMSMHRNVCKICVLQMNRHFSGCRLNFVWMNGEEM
> gene1
MVKRWQSWRFYVGPGIRAPMLPDPMTRQQLVVFVPWALYHTSHNWKCLNGTTHSHRSQLRAFYGEIKGEGDMFF
> gene2
MRTVVMTKGCVQYDWQGCACTLQHRVKCMFMFNYLDESNDHGVSFTYMVQAYWIWKRVVCAPGSVL
> gene3
MPVFSAWTTDYWKGLLIRPAYGQSNAAVCLTQRWFGGGLITTSQKETIKFSNPTTWVMMNIDDYINQSD
> gene4
MLAMMCAKHKLIQHSMRDCWHNLTCQCCQFSKMYAKPCWQYRNKLFKVWIRG
> gene5
MDIYFSWNEMMSTKGELRYPWFKGFSYLKHGCMWVPLQFWGNRMVSHEKFQHHVLFTEVDMVQ
> gene6
MSVSNQGERSDSWNPHTWTLQKHWRQLSMYGKKDAVLECMAMPMMQSIFYMQKIDQMMWQSLMVTWFRGCGQCKDLS
> gene7
MFVCNLVRLFYGPFSNWSDVTLGGLIASDPPRGTQTCGWSIKDNPRLMGHAQFRWDPQSQPYDHFASIDLIKPMPSRFYMHAQPAIEPLLNT
> gene8
MKQTCIHNVLHKGPRKCASLLTQFYRYTIRHANRPYYVVVLTRHIQPFPMDACTHHAFVHTVILIHYDEYVVAINRGTHHSL
> gene9
MRPRCKRLVRWKMRLFAALAPVIGYQYQKACSEDICFTIMLYFKDRAESTRYACDNNFNTMWWQNAYPRCIMFTNHYTAGMHMNDLIRMAWFFLHLHLD
