# Jupyter notebook that loops over the organisms in Ensembl (download - process - delete) and counts the CDS and ncRNA sequences of each

Relevant sites: <br>
https://useast.ensembl.org/info/data/ftp/index.html  <br>
https://useast.ensembl.org/info/website/tutorials/sequence.html  <br>
https://useast.ensembl.org/info/genome/genebuild/biotypes.html  <br>
https://www.ensembl.info/2018/08/17/ensembl-insights-how-are-utrs-annotated/  <br>
https://useast.ensembl.org/info/genome/genebuild/ncrna.html  <br>
https://useast.ensembl.org/Help/View?id=155  <br>

In [1]:
import requests
import gzip
import os
from Bio import SeqIO
import pandas as pd
from bs4 import BeautifulSoup

# Base URL for Ensembl FASTA files
base_url = "https://ftp.ensembl.org/pub/release-112/fasta/"

# Path to save the Excel file
output_path = "/Users/celia/Desktop/biotokens/ensembl/ensembl_cds_ncrna_counts.xlsx"

# Fetch the list of organisms. The timeout (in seconds) keeps a stalled
# connection from blocking the run indefinitely.
response = requests.get(base_url, timeout=60)
if response.status_code != 200:
    # Raise instead of calling exit(): exit() is an interactive-shell helper
    # that reports success when used in a script and is not a dependable way
    # to stop a notebook cell, so the code below could still run on a bad
    # response.
    raise RuntimeError(
        f"Failed to access {base_url} (status code: {response.status_code})"
    )

soup = BeautifulSoup(response.text, 'html.parser')
organism_links = soup.find_all('a', href=True)
# Only directory links whose name starts with 'amphi' are kept, for testing
# purposes. To process all organisms, drop the
# link['href'].startswith('amphi') condition.
# NOTE(review): without that prefix filter the listing may also yield
# non-organism directory links (e.g. a parent-directory entry) - verify.
organisms = [
    link['href'].strip('/')
    for link in organism_links
    if link['href'].startswith('amphi') and link['href'].endswith('/')
]

# DataFrame to store the results (one row per organism)
results = pd.DataFrame(columns=["Organism", "CDS_Count", "ncRNA_Count"])

# Seconds to wait on the server before giving up on a request, so a stalled
# connection cannot hang the whole run.
request_timeout = 60

# One dict per successfully processed organism. The DataFrame is built once
# after the loop instead of being re-concatenated on every iteration.
rows = []

for organism in organisms:
    # Sequence counts per data type; a type that cannot be fetched stays at 0.
    counts = {"cds": 0, "ncrna": 0}

    try:
        for data_type in ("cds", "ncrna"):
            url = f"{base_url}{organism}/{data_type}/"
            listing = requests.get(url, timeout=request_timeout)

            if listing.status_code != 200:
                print(f"Could not access {url}, status code: {listing.status_code}")
                continue

            # Only the first .fa.gz link in the directory listing is processed.
            listing_soup = BeautifulSoup(listing.text, 'html.parser')
            file_name = next(
                (
                    link['href']
                    for link in listing_soup.find_all('a', href=True)
                    if link['href'].endswith('.fa.gz')
                ),
                None,
            )
            if file_name is None:
                print(f"No .fa.gz file listed at {url}")
                continue

            file_url = url + file_name
            print(f"Downloading {file_url}...")
            download = requests.get(file_url, timeout=request_timeout)
            if download.status_code != 200:
                # Report the failure instead of silently leaving the count at 0.
                print(f"Could not download {file_url}, status code: {download.status_code}")
                continue

            # Keep only the final path component so a link containing
            # directories cannot write outside the working directory.
            local_path = os.path.basename(file_name)
            try:
                with open(local_path, 'wb') as f:
                    f.write(download.content)

                # Count the number of sequences
                with gzip.open(local_path, 'rt') as f:
                    count = sum(1 for _ in SeqIO.parse(f, "fasta"))
            finally:
                # Delete the file even when writing or parsing fails, so a bad
                # download never leaves an archive behind.
                if os.path.exists(local_path):
                    os.remove(local_path)

            counts[data_type] += count
            print(f"Processed {organism} {data_type}, found {count} entries.")

    except Exception as e:
        # Best-effort run: report the problem and move on to the next organism
        # without recording a row for this one.
        print(f"An error occurred while processing {organism}: {e}")
    else:
        # Store the result for this organism
        rows.append(
            {
                "Organism": organism,
                "CDS_Count": counts["cds"],
                "ncRNA_Count": counts["ncrna"],
            }
        )

# Save the results to a single spreadsheet
results = pd.DataFrame(rows, columns=["Organism", "CDS_Count", "ncRNA_Count"])
results.to_excel(output_path, index=False)

Downloading https://ftp.ensembl.org/pub/release-112/fasta/amphilophus_citrinellus/cds/Amphilophus_citrinellus.Midas_v5.cds.all.fa.gz...
Processed amphilophus_citrinellus cds, found 31765 entries.
Downloading https://ftp.ensembl.org/pub/release-112/fasta/amphilophus_citrinellus/ncrna/Amphilophus_citrinellus.Midas_v5.ncrna.fa.gz...
Processed amphilophus_citrinellus ncrna, found 782 entries.
Downloading https://ftp.ensembl.org/pub/release-112/fasta/amphiprion_ocellaris/cds/Amphiprion_ocellaris.ASM2253959v1.cds.all.fa.gz...
Processed amphiprion_ocellaris cds, found 70038 entries.
Downloading https://ftp.ensembl.org/pub/release-112/fasta/amphiprion_ocellaris/ncrna/Amphiprion_ocellaris.ASM2253959v1.ncrna.fa.gz...
Processed amphiprion_ocellaris ncrna, found 5852 entries.
Downloading https://ftp.ensembl.org/pub/release-112/fasta/amphiprion_percula/cds/Amphiprion_percula.Nemo_v1.cds.all.fa.gz...
Processed amphiprion_percula cds, found 34985 entries.
Downloading https://ftp.ensembl.org/pub/relea