In [None]:
#Fetches the orthologs for a list of human genes and saves the output for each gene as a FASTA file of cds or protein sequences, with the seq id being just the species name

import requests
import sys
import time
import os

# Root URL of the Ensembl REST service; the per-gene endpoint path is appended to it.
server = "https://rest.ensembl.org"

# Path to a text file with one human gene ID per line.
gene_file = ""

# Directory to save the individual FASTA files for cds - recommend changing
# the path when used for protein sequences.
output_dir = ""
os.makedirs(output_dir, exist_ok=True)

# Read the gene IDs: strip every line and drop the ones that end up empty.
with open(gene_file) as f:
    human_genes = list(filter(None, map(str.strip, f)))
# For every human gene ID: query the Ensembl REST homology endpoint for its
# orthologues, then write the human sequence plus one record per orthologue
# to <output_dir>/<gene>.fasta. Every FASTA header is just the species name.
for gene in human_genes:
    output_fasta = os.path.join(output_dir, f"{gene}.fasta")

    # url for fetching from rest api - to fetch protein sequences change to sequence=protein
    ext = f"/homology/id/human/{gene}?sequence=cdna;aligned=0;type=orthologues;content-type=application/json"

    # The timeout keeps one stalled connection from blocking the whole run.
    r = requests.get(
        server + ext,
        headers={"Content-Type": "application/json"},
        timeout=120,
    )
    if not r.ok:
        print(f"Error fetching {gene}")
        # Raises requests.HTTPError for the failed response, which ends the run.
        r.raise_for_status()

    decoded = r.json()

    # Check the payload before opening the output file, so a gene without any
    # homology data neither crashes on data[0] nor leaves an empty FASTA behind.
    data = decoded.get("data") or []
    if not data:
        print(f"No homology data returned for {gene} - skipping")
        time.sleep(0.1)
        continue
    entry = data[0]
    homologies = entry.get("homologies", [])

    # for fetching the human sequence
    human_seq = entry.get("sequence")
    if not human_seq:
        # if present inside as source - varies depending on the gene id sometimes
        for homology in homologies:
            source = homology.get("source", {})
            if source.get("species") == "homo_sapiens":
                human_seq = source.get("seq")
                if human_seq:
                    break

    with open(output_fasta, "w") as out_f:
        if human_seq:
            # Header is only the species name, like every other record written
            # here, wherever the human sequence was found.
            out_f.write(f">homo_sapiens\n{human_seq}\n")

        for homology in homologies:
            target = homology.get("target", {})
            gene_id = target.get("id")
            species = target.get("species")
            seq = target.get("seq")

            # The human sequence was already handled above.
            if species in ("human", "homo_sapiens"):
                continue

            # Only targets with an id, a species and a sequence are written.
            if gene_id and species and seq:
                out_f.write(f">{species}\n{seq}\n")

    print(f"Saved human + orthologs for {gene} to {output_fasta}") #helps to keep track of successful runs
    # Short pause between consecutive requests to the server.
    time.sleep(0.1)

In [None]:
#code for filtering based on species ID - in the event of multiple sequences from same species - fetches the first one
#use for both cds and protein sequences

#pip install biopython #if not installed already

from Bio import SeqIO
import os

# Species whose sequences are kept; entries are compared against FASTA record ids.
species = {
    "homo_sapiens",
    "mus_musculus",
    "rattus_norvegicus",
    "macaca_mulatta",
    "xenopus_tropicalis",
    "sus_scrofa",
}

# Directory with all the orthologs fetched for a gene using the cell above.
input_dir = ""
# Output directory for storing the species level files.
output_dir = ""
# Format name handed to SeqIO for both parsing and writing.
input_format = "fasta"

os.makedirs(output_dir, exist_ok=True)

# Filter every .fasta file in input_dir down to the first record of each
# wanted species and save the result under the same file name in output_dir.
for fname in os.listdir(input_dir):
    if fname.endswith(".fasta"):
        input_file = os.path.join(input_dir, fname)
        output_file = os.path.join(output_dir, fname)

        # Species already emitted for the current file.
        seen_species = set()

        def filter_records():
            """Yield the first record of each wanted species found in input_file."""
            for record in SeqIO.parse(input_file, input_format):
                sp = record.id
                # Skip unwanted species and repeat hits for a species already kept.
                if sp not in species or sp in seen_species:
                    continue
                seen_species.add(sp)
                yield record

        # count is the number of records written; compare it with the number of
        # wanted species to see whether all of them are present in the file.
        count = SeqIO.write(filter_records(), output_file, input_format)
        print(f"Saved {count} records from {input_file} to {output_file}")