Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for downloading plant species with pyEnsembl #305

Merged
merged 1 commit into from Mar 28, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions pyensembl/ensembl_release.py
Expand Up @@ -77,12 +77,14 @@ def __init__(
species=self.species.latin_name,
sequence_type="cdna",
server=server,
is_plant = self.species.is_plant,
),
make_fasta_url(
ensembl_release=self.release,
species=self.species.latin_name,
sequence_type="ncrna",
server=server,
is_plant = self.species.is_plant,
),
]

Expand All @@ -92,6 +94,7 @@ def __init__(
species=self.species.latin_name,
sequence_type="pep",
server=self.server,
is_plant = self.species.is_plant,
)
]

Expand Down
31 changes: 24 additions & 7 deletions pyensembl/ensembl_url_templates.py
Expand Up @@ -24,13 +24,19 @@
from .ensembl_versions import check_release_number

# Base URLs of the Ensembl download servers.
# Neither ends with a slash: every *_SUBDIR_TEMPLATE below already starts
# with "/", and URLs are built as `server + subdir + filename`, so a trailing
# slash here would produce a double slash ("//pub/...") in the final URL.
ENSEMBL_FTP_SERVER = "https://ftp.ensembl.org"
ENSEMBL_PLANTS_FTP_SERVER = "https://ftp.ensemblgenomes.ebi.ac.uk"

# Example directories
# FASTA files: /pub/release-78/fasta/homo_sapiens/
# GTF annotation files: /pub/release-78/gtf/homo_sapiens/
# Plant FASTA files: /pub/release-58/plants/fasta/arabidopsis_thaliana/cdna/
# Plant GTF annotation files: /pub/release-58/plants/gtf/arabidopsis_thaliana/
FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/fasta/%(species)s/%(type)s/"
PLANTS_FASTA_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/fasta/%(species)s/%(type)s/"
GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/gtf/%(species)s/"
PLANTS_GTF_SUBDIR_TEMPLATE = "/pub/release-%(release)d/plants/gtf/%(species)s/"

# List of plant species
# Let's keep a tuple of all the plant species that we added, to build the custom URL
lPlants = ("arabidopsis_thaliana","arabidopsis")

def normalize_release_properties(ensembl_release, species):
"""
Expand Down Expand Up @@ -63,12 +69,18 @@ def make_gtf_filename(ensembl_release, species):
}


def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER, gtf_subdir=GTF_SUBDIR_TEMPLATE):
"""
Returns a URL and a filename, which can be joined together.
"""
if species.is_plant:
server = ENSEMBL_PLANTS_FTP_SERVER
gtf_subdir = PLANTS_GTF_SUBDIR_TEMPLATE
#else:
#print(f"[+] {species.latin_name} it is not a plant", flush=True)

ensembl_release, species, _ = normalize_release_properties(ensembl_release, species)
subdir = GTF_SUBDIR_TEMPLATE % {"release": ensembl_release, "species": species}
subdir = gtf_subdir % {"release": ensembl_release, "species": species}
filename = make_gtf_filename(ensembl_release=ensembl_release, species=species)
return server + subdir + filename

Expand All @@ -93,11 +105,11 @@ def make_gtf_url(ensembl_release, species, server=ENSEMBL_FTP_SERVER):
NEW_FASTA_FILENAME_TEMPLATE_NCRNA = "%(Species)s.%(reference)s.ncrna.fa.gz"


def make_fasta_filename(ensembl_release, species, sequence_type):
def make_fasta_filename(ensembl_release, species, sequence_type, is_plant):
ensembl_release, species, reference_name = normalize_release_properties(
ensembl_release, species
)
if ensembl_release <= 75:
if ensembl_release <= 75 and not is_plant:
if sequence_type == "ncrna":
return OLD_FASTA_FILENAME_TEMPLATE_NCRNA % {
"Species": species.capitalize(),
Expand Down Expand Up @@ -125,7 +137,7 @@ def make_fasta_filename(ensembl_release, species, sequence_type):
}


def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_SERVER):
def make_fasta_url(ensembl_release, species, sequence_type, is_plant, server=ENSEMBL_FTP_SERVER, fasta_subdir=FASTA_SUBDIR_TEMPLATE):
"""Construct URL to FASTA file with cDNA transcript or protein sequences

Parameter examples:
Expand All @@ -136,12 +148,17 @@ def make_fasta_url(ensembl_release, species, sequence_type, server=ENSEMBL_FTP_S
ensembl_release, species, reference_name = normalize_release_properties(
ensembl_release, species
)
subdir = FASTA_SUBDIR_TEMPLATE % {

if is_plant:
server = ENSEMBL_PLANTS_FTP_SERVER
fasta_subdir = PLANTS_FASTA_SUBDIR_TEMPLATE

subdir = fasta_subdir % {
"release": ensembl_release,
"species": species,
"type": sequence_type,
}
filename = make_fasta_filename(
ensembl_release=ensembl_release, species=species, sequence_type=sequence_type
ensembl_release=ensembl_release, species=species, sequence_type=sequence_type, is_plant = is_plant
)
return server + subdir + filename
2 changes: 1 addition & 1 deletion pyensembl/ensembl_versions.py
Expand Up @@ -12,7 +12,7 @@

MIN_ENSEMBL_RELEASE = 47
MAX_ENSEMBL_RELEASE = 111

MAX_PLANTS_ENSEMBL_RELEASE = 58

def check_release_number(release):
"""
Expand Down
26 changes: 23 additions & 3 deletions pyensembl/species.py
Expand Up @@ -12,7 +12,7 @@

from serializable import Serializable

from .ensembl_versions import MAX_ENSEMBL_RELEASE
from .ensembl_versions import MAX_ENSEMBL_RELEASE, MAX_PLANTS_ENSEMBL_RELEASE

# TODO: replace Serializable with data class

Expand All @@ -30,7 +30,7 @@ class Species(Serializable):
_reference_names_to_species = {}

@classmethod
def register(cls, latin_name, synonyms, reference_assemblies):
def register(cls, latin_name, synonyms, reference_assemblies, is_plant=False):
"""
Create a Species object from the given arguments and enter into
all the dicts used to look the species up by its fields.
Expand All @@ -39,6 +39,7 @@ def register(cls, latin_name, synonyms, reference_assemblies):
latin_name=latin_name,
synonyms=synonyms,
reference_assemblies=reference_assemblies,
is_plant=is_plant,
)
cls._latin_names_to_species[species.latin_name] = species
for synonym in synonyms:
Expand Down Expand Up @@ -80,7 +81,7 @@ def all_species_release_pairs(cls):
for release in range(release_range[0], release_range[1] + 1):
yield species_name, release

def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
def __init__(self, latin_name, synonyms=[], reference_assemblies={}, is_plant=False):
"""
Parameters
----------
Expand All @@ -96,6 +97,7 @@ def __init__(self, latin_name, synonyms=[], reference_assemblies={}):
self.synonyms = synonyms
self.reference_assemblies = reference_assemblies
self._release_to_genome = {}
self.is_plant = is_plant
for genome_name, (start, end) in self.reference_assemblies.items():
for i in range(start, end + 1):
if i in self._release_to_genome:
Expand Down Expand Up @@ -350,3 +352,21 @@ def check_species_object(species_name_or_object):
"R64-1-1": (76, MAX_ENSEMBL_RELEASE),
},
)

# Register Arabidopsis thaliana (synonym "arabidopsis") as a plant species.
# The TAIR10 assembly is mapped to releases 40 through
# MAX_PLANTS_ENSEMBL_RELEASE (inclusive); is_plant=True marks the species so
# that URL construction can select the plant server and plant subdirectories.
arabidopsis_thaliana = Species.register(
latin_name="arabidopsis_thaliana",
synonyms=["arabidopsis"],
reference_assemblies={
"TAIR10": (40, MAX_PLANTS_ENSEMBL_RELEASE),
},
is_plant=True
)

# Register rice (Oryza sativa, synonym "rice") as a plant species.
# The IRGSP-1.0 assembly is mapped to releases 40 through
# MAX_PLANTS_ENSEMBL_RELEASE (inclusive); is_plant=True marks the species so
# that URL construction can select the plant server and plant subdirectories.
rice = Species.register(
latin_name="oryza_sativa",
synonyms=["rice"],
reference_assemblies={
"IRGSP-1.0": (40, MAX_PLANTS_ENSEMBL_RELEASE),
},
is_plant=True
)
2 changes: 1 addition & 1 deletion pyensembl/version.py
@@ -1,4 +1,4 @@
__version__ = "2.3.11"
__version__ = "2.3.12"

def print_version():
print(f"v{__version__}")
Expand Down