In [2]:
from Bio import Entrez, SeqIO
import os

# NCBI Entrez asks every client to identify itself with a contact email.
# Prefer the ENTREZ_EMAIL environment variable so a real address does not have
# to be hard-coded in the notebook; fall back to the placeholder otherwise.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "your_email@example.com")

def fetch_mrna_sequences(gene_id):
    """
    Fetches the mRNA records matching a gene from the NCBI nucleotide database.

    Parameters:
        gene_id (int or str): Value interpolated into the `[Gene ID]` search term.

    Returns:
        list or None: One FASTA text record (as returned by efetch with
        rettype="fasta") per ID found by the search, in search order.
        None if the search returns no IDs or if any request fails; in both
        cases a message is printed instead of raising.
    """
    try:
        # Search for the gene in NCBI
        handle = Entrez.esearch(db="nucleotide", term=f"{gene_id}[Gene ID] AND mRNA[Filter]")
        try:
            record = Entrez.read(handle)
        finally:
            # Release the connection even when parsing the response fails.
            handle.close()

        # Get the list of mRNA IDs
        mrna_ids = record["IdList"]
        if not mrna_ids:
            print("No mRNA sequences found for this gene.")
            return None

        # Fetch the mRNA sequences, one request per ID
        fasta_data_list = []
        for mrna_id in mrna_ids:
            handle = Entrez.efetch(db="nucleotide", id=mrna_id, rettype="fasta", retmode="text")
            try:
                fasta_data_list.append(handle.read())
            finally:
                # Release the connection even when reading the record fails.
                handle.close()

        return fasta_data_list
    except Exception as e:
        # Deliberate best-effort: report the problem and signal failure with None.
        print(f"An error occurred: {e}")
        return None


def extract_coding_sequence(mrna_sequence):
    """
    Extracts the coding sequence (CDS) from the given mRNA sequence.

    The input may be a bare nucleotide string or FASTA text: lines starting
    with '>' are dropped and all whitespace (including line breaks) is removed
    before scanning, so line wrapping cannot shift the reading frame. The
    sequence is upper-cased so lower-case input is matched as well.

    The CDS is taken to start at the first 'ATG' and to end at the first
    in-frame stop codon ('TAA', 'TAG', 'TGA') after it. This is a heuristic:
    the first 'ATG' is not guaranteed to be the annotated start codon.

    Parameters:
        mrna_sequence (str): The mRNA sequence, optionally in FASTA format.

    Returns:
        str: The coding sequence (start codon through stop codon, upper-case)
        or an empty string if no valid CDS is found.
    """
    start_codon = "ATG"
    stop_codons = {"TAA", "TAG", "TGA"}

    # Keep only sequence characters: no FASTA header lines, no whitespace.
    sequence = "".join(
        "".join(line.split())
        for line in (mrna_sequence or "").splitlines()
        if not line.lstrip().startswith(">")
    ).upper()

    # Find the start codon
    start_index = sequence.find(start_codon)
    if start_index == -1:
        print("Start codon not found.")
        return ""

    # Search for the stop codon in the correct reading frame; stop before a
    # trailing partial codon, which can never be a stop codon.
    for i in range(start_index + 3, len(sequence) - 2, 3):
        if sequence[i:i + 3] in stop_codons:
            return sequence[start_index:i + 3]

    print("Stop codon not found.")
    return ""


def save_mrna_fasta(content, file_name, output_dir='./mrna_seqs'):
    """
    Writes sequence content to a FASTA file inside the output directory.

    If the content does not already begin with a FASTA header line ('>'), a
    header derived from the file name is prepended, so that readers which skip
    the header line do not discard sequence data.

    Parameters:
        content (str): FASTA text, or a bare sequence string.
        file_name (str): Name of the file to create inside output_dir.
        output_dir (str): Directory to write into; created if missing.
            Defaults to './mrna_seqs'.

    Returns:
        None. The saved path is printed.
    """
    # Ensure the directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Create the full file path
    file_path = os.path.join(output_dir, file_name)

    # Content that already carries a header is written through untouched.
    if not content.lstrip().startswith(">"):
        record_id = os.path.splitext(os.path.basename(file_name))[0]
        body = content.strip()
        content = f">{record_id}\n{body}\n" if body else f">{record_id}\n"

    # Write the content to the file
    with open(file_path, 'w') as file:
        file.write(content)
    print(f"FASTA file saved at: {file_path}")





HOMO_SAPIENS_CCND2_GENE_ID = 894
MUS_MUSCULUS_CCND2_GENE_ID = 12444
# Homo sapiens CCND2 gene has 3 different mrna seqs; only the first record
# returned by the search is used for each species.
for gene_id, file_name in (
    (HOMO_SAPIENS_CCND2_GENE_ID, "homo_CCND2.fasta"),
    (MUS_MUSCULUS_CCND2_GENE_ID, "mmusculus_CCND2.fasta"),
):
    fasta_records = fetch_mrna_sequences(gene_id)
    if not fasta_records:
        # fetch_mrna_sequences returns None on failure or when nothing matched.
        print(f"Skipping {file_name}: no mRNA record available for gene {gene_id}.")
        continue

    # Each record is FASTA text. Separate the header from the sequence and
    # join the wrapped sequence lines, so the header and line breaks are not
    # scanned as nucleotides and cannot shift the reading frame.
    record_lines = fasta_records[0].strip().splitlines()
    header_lines = [line for line in record_lines if line.startswith(">")]
    mrna_sequence = "".join(
        line.strip() for line in record_lines if not line.startswith(">")
    )

    coding_sequence = extract_coding_sequence(mrna_sequence)
    if not coding_sequence:
        # extract_coding_sequence returns "" when no start/stop codon is found.
        print(f"Skipping {file_name}: no coding sequence could be extracted.")
        continue

    # Save as real FASTA (header line + sequence) so downstream readers that
    # skip the header do not drop sequence data.
    if header_lines:
        header = f"{header_lines[0]} (extracted CDS)"
    else:
        header = f">{os.path.splitext(file_name)[0]}"
    save_mrna_fasta(f"{header}\n{coding_sequence}\n", file_name)


FASTA file saved at: ./mrna_seqs/homo_CCND2.fasta
FASTA file saved at: ./mrna_seqs/mmusculus_CCND2.fasta


In [None]:
from Bio.Seq import Seq

def nucleotide_to_aminoacid(nucleotide_sequence):
    """
    Translate a nucleotide string into its amino acid string.

    The input is wrapped in a Biopython Seq and translated with that
    library's default translation settings.

    Parameters:
        nucleotide_sequence (str): Nucleotides to translate.

    Returns:
        str: The translation result converted to a plain string.
    """
    # Build the Seq, translate it, and hand back plain text in one step.
    return str(Seq(nucleotide_sequence).translate())



MRRMVATWMLEVCEEQKCEEEVFPLAMNYLDRFLAGVPTPKSHLQLLGAVCMFLASKLKETSPLTAEKLCIYTDNSIKPQELLEWELVVLGKLKWNLAAVTPH




In [None]:
import glob
# TODO: the translated proteins have looked wrong. Confirm that every saved
# sequence is a genuine, in-frame CDS before trusting this output.
# Get all nucleotide .fasta files in the mrna_seqs directory. Files ending in
# _amino.fasta are outputs of this cell; they are excluded so that re-running
# the cell does not try to translate protein sequences.
amino_suffix = '_amino.fasta'
fasta_files = sorted(
    path for path in glob.glob('./mrna_seqs/*.fasta')
    if not path.endswith(amino_suffix)
)

for fasta_file in fasta_files:
    # Read the contents of the .fasta file
    with open(fasta_file, 'r') as file:
        fasta_lines = file.read().splitlines()

    # Extract the nucleotide sequence. Header lines are recognised by their
    # leading '>' instead of dropping the first line unconditionally, which
    # would discard sequence data from a file that has no header.
    nucleotide_sequence = ''.join(
        line.strip() for line in fasta_lines if not line.startswith('>')
    )
    if not nucleotide_sequence:
        print(f"Skipping {fasta_file}: no sequence data found.")
        continue

    # Convert the nucleotide sequence to an amino acid sequence
    amino_acid_sequence = nucleotide_to_aminoacid(nucleotide_sequence)

    # Build the output name from the extension only; str.replace would also
    # rewrite any other '.fasta' occurrence in the path.
    amino_file = os.path.splitext(fasta_file)[0] + amino_suffix

    # Write the amino acid sequence to the new file
    with open(amino_file, 'w') as output_file:
        output_file.write(f">amino_{os.path.basename(fasta_file)}\n{amino_acid_sequence}")

    print(f"Amino acid sequence saved to: {amino_file}")

Amino acid sequence saved to: ./mrna_seqs/mmusculus_CCND2_amino.fasta
Amino acid sequence saved to: ./mrna_seqs/homo_CCND2_amino.fasta
