In [9]:
import pandas as pd
import os
genomes_tab=pd.read_csv(os.path.expanduser('~/Reference_Genome_tracking.csv'),sep=',',dtype='str')
genomes_tab['Species']=genomes_tab['Species'].str.replace(' ','_',regex=False)
genomes_tab.index=genomes_tab['Species']
genomes_tab=genomes_tab.loc[~genomes_tab.index.isna(),:]

In [12]:
from ftplib import FTP

# Define the species of interest
species_list = ["homo_sapiens", "mus_musculus", "danio_rerio"]

ensembl_base = "ftp.ensembl.org"
ensembl_path = "/pub/current_gtf/"

ftp = FTP(ensembl_base)
ftp.login()

for species in species_list:
    species_dir = ensembl_path + species + "/"
    
    # Get a list of files in the species directory
    files = ftp.nlst(species_dir)
    
    # Identify the GTF file (assuming there's only one .gtf.gz per species directory)
    gtf_file = next((f for f in files if f.endswith(".gtf.gz")), None)
    
    if gtf_file:
        local_file_name = gtf_file.split('/')[-1]
        with open(local_file_name, 'wb') as local_file:
            ftp.retrbinary('RETR ' + gtf_file, local_file.write)

ftp.quit()


'221 Goodbye.'

In [22]:
#Test function
from Bio import Entrez
from difflib import SequenceMatcher
import re
Entrez.email = "matthew.schmitz@alleninstitute.org"

def get_ncbi_assembly_stats(species_name):
    handle = Entrez.esearch(db="assembly", term=species_name)
    record = Entrez.read(handle)
    print(record)
    assembly_id = record['IdList'][-1]
    summary_handle = Entrez.esummary(db="assembly", id=assembly_id)
    summary = Entrez.read(summary_handle)
    
    organism_from_ncbi = summary['DocumentSummarySet']['DocumentSummary'][0]['Organism']
    match_ratio = SequenceMatcher(None, re.sub('\(.+\)','',organism_from_ncbi), species_name).ratio()

    # Check if the match ratio is above a certain threshold
    # You can adjust this threshold as needed
    if match_ratio < 0.9:
        print(f"Warning: Organism from NCBI ('{organism_from_ncbi}') does not closely match the input species name ('{species_name}')!")
        return None

    return summary['DocumentSummarySet']['DocumentSummary'][0]

species_list = ['Homo sapiens', 'Mus musculus', 'Pan paniscus']
for species in species_list:
    print(species)
    print(f"{species}: {get_ncbi_assembly_stats(species)}")


Homo sapiens
{'Count': '1277', 'RetMax': '20', 'RetStart': '0', 'IdList': ['18601811', '16629201', '16629191', '16629181', '16629171', '16629161', '16629151', '16629131', '13464321', '12917181', '12656101', '12656051', '12656031', '12655981', '12655941', '12655831', '12655811', '12655741', '12655711', '12655631'], 'TranslationSet': [{'From': 'Homo sapiens', 'To': '"Homo sapiens"[Organism]'}], 'TranslationStack': [{'Term': '"Homo sapiens"[Organism]', 'Field': 'Organism', 'Count': '1277', 'Explode': 'Y'}, 'GROUP'], 'QueryTranslation': '"Homo sapiens"[Organism]'}
Homo sapiens: DictElement({'RsUid': '', 'GbUid': '33120188', 'AssemblyAccession': 'GCA_913844075.1', 'LastMajorReleaseAccession': 'GCA_913844075.1', 'LatestAccession': '', 'ChainId': '913844075', 'AssemblyName': 's2r3_clone5_genome', 'UCSCName': '', 'EnsemblName': '', 'Taxid': '9606', 'Organism': 'Homo sapiens (human)', 'SpeciesTaxid': '9606', 'SpeciesName': 'Homo sapiens', 'AssemblyType': 'haploid', 'AssemblyStatus': 'Scaffold',

In [None]:
#Download all ncbi assemblies

from Bio import Entrez
import os
import requests
from ftplib import FTP
import re

Entrez.email = "matthew.schmitz@alleninstitute.org"


def sanitize_filename(filename):
    # Replace spaces with underscores
    filename = filename.replace(" ", "_")

    # Replace special characters with dashes
    # Define a regex pattern for unwanted characters
    pattern = r'[^\w\-_.]'
    filename = re.sub(pattern, '-', filename)

    return filename

def download_ftp_file(ftp_url, destination_filename):
    # Parse the FTP URL
    ftp_host, ftp_path = ftp_url.replace("ftp://", "").split("/", 1)

    # Connect to the FTP server
    with FTP(ftp_host) as ftp:
        ftp.login()
        
        # Ensure the directory for the file exists
        os.makedirs(os.path.dirname(destination_filename), exist_ok=True)

        # Download the file
        try:
            with open(destination_filename, 'wb') as local_file:
                ftp.retrbinary(f'RETR {ftp_path}', local_file.write)
        except Exception as e:
            print(f"Error downloading {ftp_url}: {e}")


def get_best_assembly_and_download(species_name,overall_path=''):
    handle = Entrez.esummary(db="taxonomy", id=species_name)
    record = Entrez.read(handle)
    scientific_name = record[0]['ScientificName']

    handle = Entrez.esearch(db="assembly", term=f"{species_name}[Taxonomy ID]  AND (latest[filter] AND all[filter] NOT anomalous[filter]) AND (\"chromosome\"[Assembly Level] OR \"complete genome\"[Assembly Level] OR )", retmax=10)
    record = Entrez.read(handle)
    best_assembly = None
    max_contig_n50 = 0
    for assembly_id in record['IdList']:
        summary_handle = Entrez.esummary(db="assembly", id=assembly_id)
        summary = Entrez.read(summary_handle)
        doc_summary = summary['DocumentSummarySet']['DocumentSummary'][0]
        contig_n50 = int(doc_summary['ContigN50'])
        if contig_n50 > max_contig_n50:
            best_assembly = doc_summary
            max_contig_n50 = contig_n50

    if not best_assembly:
        print(f"No suitable assembly found for {species_name}")
        return

    print(f"Best assembly for {species_name}: {best_assembly['AssemblyAccession']}")

    dirname=str(species_name)+"-"+sanitize_filename(scientific_name)
    
    # Downloading the FASTA file
    for prefix in ['GCA', 'GCF']:
        print(best_assembly['FtpPath_Assembly_rpt'])
        stat_ftp_url = best_assembly['FtpPath_Assembly_rpt']
        stat_name=sanitize_filename(f"{species_name}_assembly_report.txt")
        stat_destination = os.path.join(overall_path,dirname,stat_name)
        download_ftp_file(stat_ftp_url, stat_destination)
        
        fasta_ftp_url = re.sub('_assembly_report\.txt','_genomic.fna.gz',best_assembly['FtpPath_Assembly_rpt'])
        fasta_name=sanitize_filename(f"{species_name}_genomic.fna.gz")
        fasta_destination = os.path.join(overall_path,dirname,fasta_name)
        download_ftp_file(fasta_ftp_url, fasta_destination)

        gtf_ftp_url = re.sub('_assembly_report\.txt','_genomic.gtf.gz',best_assembly['FtpPath_Assembly_rpt'])
        gtf_name=sanitize_filename(f"{species_name}_genomic.gtf.gz")
        gtf_destination = os.path.join(overall_path,dirname,gtf_name)
        #gtf_destination = f"{species_name}/{species_name}_genomic.gtf.gz"
        download_ftp_file(gtf_ftp_url, gtf_destination)

def download_file(url, filename):
    response = requests.get(url, stream=True)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
    else:
        print(f"Failed to download {url}")


species_dict=dict(zip([9668,9669, 246437, 10116, 13616, 27679, 
                     9823, 9361, 9986, 60711, 9598, 
                     30608, 10181, 37293, 9545, 9544, 
                     10090, 30611, 9685, 9595, 9606, 
                     9483, 42100, 9515, 9614, 9407, 
                     9555, 9999, 9615, 7955,9739,9742,37347,9913,9913,9593],
                   ["Wild Ferret","Ferret", "Chinese.Treeshrew", "Rat", "Opossum", "Squirrel.monkey", 
                     "Pig", "Armadillo.Nine.banded", "Rabbit", "African.green.monkey", "Chimpanzee", 
                     "Mouse.lemur", "Naked.mole.rat", "Owl.monkey", "Macaque.pig.tailed", "Macaque.rhesus", 
                     "Mouse", "Galago", "Cat", "Gorilla", "Human", 
                     "Marmoset", "Vaquita", "Tufted.capuchin", "Coyote", "Egyptian.fruit.bat", 
                     "Olive.baboon", "Squirrel.arctic.ground","Dog","Zebrafish",'Tursiops','Phocoena phocoena','Tupaia belangeri',
                   "Bos taurus",'Gorilla gorilla']))

species_list = species_dict.keys()
#species_list=[str(x) for x in species_list]
for species in species_list:
    best_assembly=get_best_assembly_and_download(species,overall_path='/home/matthew.schmitz/Matthew/genome/ncbi')



No suitable assembly found for 9668
No suitable assembly found for 9669
Best assembly for 246437: GCA_033439345.1
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/033/439/345/GCA_033439345.1_KIZ_version_2/GCA_033439345.1_KIZ_version_2_assembly_report.txt
Error downloading ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/033/439/345/GCA_033439345.1_KIZ_version_2/GCA_033439345.1_KIZ_version_2_genomic.gtf.gz: 550 genomes/all/GCA/033/439/345/GCA_033439345.1_KIZ_version_2/GCA_033439345.1_KIZ_version_2_genomic.gtf.gz: No such file or directory
ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/033/439/345/GCA_033439345.1_KIZ_version_2/GCA_033439345.1_KIZ_version_2_assembly_report.txt
Error downloading ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCA/033/439/345/GCA_033439345.1_KIZ_version_2/GCA_033439345.1_KIZ_version_2_genomic.gtf.gz: 550 genomes/all/GCA/033/439/345/GCA_033439345.1_KIZ_version_2/GCA_033439345.1_KIZ_version_2_genomic.gtf.gz: No such file or directory
Best assembly for 10116: GCA_021556685.1
ftp://ftp

In [None]:
import requests
import os
import re
from ftplib import FTP

def sanitize_filename(filename):
    # Replace spaces with underscores and special characters with dashes
    filename = filename.replace(" ", "_")
    pattern = r'[^\w\-_.]'
    return re.sub(pattern, '-', filename)

def download_ftp_file(ftp_url, destination_filename):
    # Parse and download from FTP URL
    ftp_host, ftp_path = ftp_url.replace("ftp://", "").split("/", 1)
    with FTP(ftp_host) as ftp:
        ftp.login()
        os.makedirs(os.path.dirname(destination_filename), exist_ok=True)
        try:
            with open(destination_filename, 'wb') as local_file:
                ftp.retrbinary(f'RETR {ftp_path}', local_file.write)
        except Exception as e:
            print(f"Error downloading {ftp_url}: {e}")

def get_species_name(tax_id):
    """ Fetch species name using taxonomic ID from Ensembl """
    url = f"https://rest.ensembl.org/taxonomy/id/{tax_id}?content-type=application/json"
    response = requests.get(url)
    return response.json()['scientific_name']

def find_matching_files(ftp, path, patterns):
    """ Find files matching given patterns """
    ftp.cwd(path)
    files = ftp.nlst()
    matched_files = [file for file in files if any(pattern in file for pattern in patterns)]
    return matched_files

def download_ensembl_files(tax_id, path):
    species_name = get_species_name(tax_id)
    species_path = species_name.lower().replace(' ', '_')

    # Fetch genome information from Ensembl
    url = f"https://rest.ensembl.org/info/genomes/{species_path}?content-type=application/json"
    response = requests.get(url)
    data = response.json()
    assembly_name = data['assembly_name']

    # Prepare directory name
    dir_name = sanitize_filename(f"{tax_id}-{species_name}")
    dir_path = os.path.join(path, dir_name)
    os.makedirs(dir_path, exist_ok=True)

    # Prepare FTP paths
    ensembl_ftp_base = "ftp.ensembl.org"
    fasta_ftp_dir = f"/pub/release-110/fasta/{species_path}/dna/"
    gtf_ftp_dir = f"/pub/release-110/gtf/{species_path}/"

    with FTP(ensembl_ftp_base) as ftp:
        ftp.login()

        # Find matching FASTA and GTF files
        fa_patterns = [".dna_sm.toplevel.fa.gz"]
        gtf_patterns = [".gtf.gz"]
        unwanted_patterns = ["chr", "hapl", "abinitio"]

        fasta_files = find_matching_files(ftp, fasta_ftp_dir, fa_patterns)
        gtf_files = find_matching_files(ftp, gtf_ftp_dir, gtf_patterns)
        fasta_files = [f for f in fasta_files if not any(p in f for p in unwanted_patterns)]
        gtf_files = [f for f in gtf_files if not any(p in f for p in unwanted_patterns)]

        # Download files
        if fasta_files:
            download_ftp_file(f"ftp://{ensembl_ftp_base}{fasta_ftp_dir}{fasta_files[0]}", os.path.join(dir_path, fasta_files[0]))
        if gtf_files:
            download_ftp_file(f"ftp://{ensembl_ftp_base}{gtf_ftp_dir}{gtf_files[0]}", os.path.join(dir_path, gtf_files[0]))


species_list = species_dict.keys()
#species_list=[str(x) for x in species_list]
for species in species_list:
    download_ensembl_files(species, '/home/matthew.schmitz/Matthew/genome/ensembl')

