In [6]:
import pandas as pd
import Bio

In [8]:
df=pd.read_csv('GCF_000441575.1_ASM44157v1_genomic.fna')

merged_text= ' '.join(df['>NC_021894.1 Candidatus Carsonella ruddii DC'])
genome_file_path='Candidatus_Carsonella_ruddii_DC.txt'

with open(genome_file_path, 'w') as file:
  file.write(merged_text)

with open(genome_file_path, 'r') as file:
    genome=file.read()

In [16]:
from Bio.Seq import Seq
from Bio import SeqIO
#function to transcribe dna to mRNA
def genome_to_mRNA(genome):
    dna=Seq(genome)
    protein=dna.transcribe()
    return protein

#function to translate dna to amino acids
def genome_to_aa(genome):
    genome=genome.replace(" ","")
    dna=Seq(genome)
    amino_acids=dna.translate()
    return amino_acids


In [28]:
mRNA=genome_to_mRNA(genome)
mRNA_file_path='mRNA.txt'
with open(mRNA_file_path, 'w') as file:
    file.write(str(mRNA))


In [29]:
aa=genome_to_aa(genome)
aa_file_path='amino_acids.txt'
with open(aa_file_path, 'w') as file:
    file.write(str(aa))

In [7]:
from Bio import SeqIO
#function that returns open reading frames
def open_reading_frame(file_name,file_type):
    record = SeqIO.read(file_name, file_type)
    min_pro_len = 100
    results = []
    for strand, nuc in [(+1, record.seq), (-1, record.seq.reverse_complement())]:
         for frame in range(3):
             length = 3 * ((len(record) - frame) // 3)  
             for pro in nuc[frame : frame + length].split("*"):
                 if len(pro) >= min_pro_len:
                     result=(
                         "%s...%s - length %i, strand %i, frame %i  "
                         % (pro[:30], pro[-3:], len(pro), strand, frame)
                     )
                     results.append(result)
    return results

#function that returns transcribed open reading frames into mRNA
def orf_transcribed(file_name,file_type):
    record = SeqIO.read(file_name, file_type)
    min_pro_len = 100
    results = []
    for strand, nuc in [(+1, record.seq), (-1, record.seq.reverse_complement())]:
         for frame in range(3):
             length = 3 * ((len(record) - frame) // 3)  
             for pro in nuc[frame : frame + length].transcribe().split("*"):
                 if len(pro) >= min_pro_len:
                     result=(
                         "%s...%s - length %i, strand %i, frame %i  "
                         % (pro[:30], pro[-3:], len(pro), strand, frame)
                     )
                     results.append(result)
    return results

#function that returns translated open reading frames into amino avids
def orf_translated(file_name,file_type):
    record = SeqIO.read(file_name, file_type)
    min_pro_len = 100
    results = []
    for strand, nuc in [(+1, record.seq), (-1, record.seq.reverse_complement())]:
         for frame in range(3):
             length = 3 * ((len(record) - frame) // 3)  
             for pro in nuc[frame : frame + length].translate().split("*"):
                 if len(pro) >= min_pro_len:
                     result= (
                         "%s...%s - length %i, strand %i, frame %i  "
                         % (pro[:30], pro[-3:], len(pro), strand, frame)
                     )
                     results.append(result)
    return results
                     

#tokenize before defining open reading frames
#understand entropy

In [10]:
file_name= 'GCF_000441575.1_ASM44157v1_genomic.fna'
file_type="fasta"
dna=open_reading_frame(file_name,file_type)
mrna=orf_transcribed(file_name,file_type)
aa=orf_translated(file_name,file_type)
print(dna)
print("\n")
print(mrna)
print("\n")
print(aa)

['ATGAAAAATATTATTGTTGCAAAAGTTACT...TAA - length 174012, strand 1, frame 0  ', 'TGAAAAATATTATTGTTGCAAAAGTTACTC...AAA - length 174012, strand 1, frame 1  ', 'GAAAAATATTATTGTTGCAAAAGTTACTCC...AAA - length 174012, strand 1, frame 2  ', 'TTTTATTTAAAAAAAAAAATTCCACTTGCC...TTC - length 174012, strand -1, frame 0  ', 'TTTATTTAAAAAAAAAAATTCCACTTGCCG...TCA - length 174012, strand -1, frame 1  ', 'TTATTTAAAAAAAAAAATTCCACTTGCCGA...CAT - length 174012, strand -1, frame 2  ']


['AUGAAAAAUAUUAUUGUUGCAAAAGUUACU...UAA - length 174012, strand 1, frame 0  ', 'UGAAAAAUAUUAUUGUUGCAAAAGUUACUC...AAA - length 174012, strand 1, frame 1  ', 'GAAAAAUAUUAUUGUUGCAAAAGUUACUCC...AAA - length 174012, strand 1, frame 2  ', 'UUUUAUUUAAAAAAAAAAAUUCCACUUGCC...UUC - length 174012, strand -1, frame 0  ', 'UUUAUUUAAAAAAAAAAAUUCCACUUGCCG...UCA - length 174012, strand -1, frame 1  ', 'UUAUUUAAAAAAAAAAAUUCCACUUGCCGA...CAU - length 174012, strand -1, frame 2  ']


['MKNIIVAKVTPDDLTSICIIRLSGKKLRKF...MGK - length 443, strand 1, f