# Notes:
#### GTF:  
The files are consistently named following this pattern: species.assembly.version.gtf.gz  
http://ftp.ensembl.org/pub/release-105/gtf/taeniopygia_guttata/#:~:text=Taeniopygia_guttata.bTaeGut1_v1.p.105.gtf.gz  
    
#### FASTAs:  
cDNA:  
The files are consistently named following this pattern: species.assembly.sequence_type.status.fa.gz   
http://ftp.ensembl.org/pub/release-105/fasta/taeniopygia_guttata/cdna/Taeniopygia_guttata.bTaeGut1_v1.p.cdna.all.fa.gz  
DNA:  
The files are consistently named following this pattern: species.assembly.sequence_type.id_type.id.fa.gz  
http://ftp.ensembl.org/pub/release-105/fasta/taeniopygia_guttata/dna/Taeniopygia_guttata.bTaeGut1_v1.p.dna.toplevel.fa.gz

1. Find the latest release (and also provide an option to specify a release explicitly)
2. Fetch the GTF and FASTA files (let the user choose between DNA and cDNA)

In [252]:
from bs4 import BeautifulSoup
import requests
import re
import numpy as np

In [377]:
# Species to look up; this value is inserted verbatim into the Ensembl FTP
# directory paths below, so it uses the lowercase, underscore-separated form.
species = "homo_sapiens"

___

# Find latest Ensembl release:

In [378]:
# Download the top-level listing of the Ensembl public FTP mirror; the
# "release-<N>" entries are extracted from it in the following cells.
url = "http://ftp.ensembl.org/pub/"

# A timeout (seconds) keeps the cell from hanging forever on a stalled connection.
html = requests.get(url, timeout=60)
# Fail loudly on an HTTP error instead of silently parsing an error page.
html.raise_for_status()

soup = BeautifulSoup(html.text, "html.parser")

In [379]:
# Find all text nodes in the page body that mention a release directory.
# `find_all(string=...)` is the current Beautiful Soup spelling of the
# deprecated `findAll(text=...)`.
releases = soup.body.find_all(string=re.compile('release-'))

In [380]:
# Get release numbers.
# A regex pulls out the digits after "release-", so any entry that mentions
# "release-" without a numeric suffix is skipped instead of crashing the
# integer conversion below.
rels = []
for rel in releases:
    release_match = re.search(r"release-(\d+)", str(rel))
    if release_match:
        rels.append(release_match.group(1))

if not rels:
    raise ValueError("No 'release-<number>' entries found in the Ensembl FTP listing")

# Find highest release number (= latest release)
ENS_rel = np.array(rels).astype(int).max()
ENS_rel

105

Alternative way to fetch the latest available release from the REST API server (might not be the same as the release on the pub FTP server?):  
Code from https://rest.ensembl.org/documentation/info/data

In [381]:
# import requests, sys
 
# server = "https://rest.ensembl.org"
# ext = "/info/data/?"
 
# r = requests.get(server+ext, headers={ "Content-Type" : "application/json"})
 
# if not r.ok:
#   r.raise_for_status()
#   sys.exit()
 
# decoded = r.json()
# print(repr(decoded))

___

# Get the GTF and its release date for this Ensembl release:

In [441]:
# Download the GTF directory listing for the chosen release and species.
url = f"http://ftp.ensembl.org/pub/release-{ENS_rel}/gtf/{species}/"
# A timeout (seconds) keeps the cell from hanging forever on a stalled connection.
html = requests.get(url, timeout=60)
# Fail loudly (e.g. on a misspelled species -> 404) instead of parsing an error page.
html.raise_for_status()
soup = BeautifulSoup(html.text, "html.parser")

In [442]:
# soup

In [443]:
# soup.body.findAll("h1")

In [444]:
# soup.body.findAll("a")

In [445]:
# Walk everything nested inside the <pre> listing once, then split it into
# the <a> link tags and all remaining nodes (document order is preserved).
pre = soup.find('pre')
listing_nodes = list(pre.descendants)
a_elements = [node for node in listing_nodes if node.name == "a"]
nones = [node for node in listing_nodes if node.name != "a"]

In [446]:
# Pick the link whose text contains "<release>.gtf.gz".
gtf_suffix = f"{ENS_rel}.gtf.gz"
gtf_matches = [link for link in a_elements if gtf_suffix in link.text]
if not gtf_matches:
    # Without this check a failed search would leave `gtf_str` undefined, or
    # silently keep a stale value from an earlier run of this cell.
    raise LookupError(f"No link containing '{gtf_suffix}' found in the GTF listing")
# Keep the last match, which is what the plain loop over all links produced.
gtf_str = gtf_matches[-1]

In [447]:
# Display the file name stored in the matched link's href attribute.
gtf_str["href"]

'Homo_sapiens.GRCh38.105.gtf.gz'

In [448]:
# Assemble the full download URL from the release number, species and file name.
gtf_filename = gtf_str['href']
gtf_url = f"http://ftp.ensembl.org/pub/release-{ENS_rel}/gtf/{species}/{gtf_filename}"
gtf_url

'http://ftp.ensembl.org/pub/release-105/gtf/homo_sapiens/Homo_sapiens.GRCh38.105.gtf.gz'

Get release date:

In [449]:
# In `nones`, the link's own text node is immediately followed by the listing
# text for that file, so take the node right after the matching one.
gtf_suffix = f"{ENS_rel}.gtf.gz"
gtf_date_candidates = [
    nones[idx + 1]
    # Skip the final node: it has no successor to read the listing text from.
    for idx, node in enumerate(nones[:-1])
    if gtf_suffix in node.text
]
if not gtf_date_candidates:
    # Avoids leaving `gtf_date` undefined or stale when nothing matches.
    raise LookupError(f"No listing text found after an entry containing '{gtf_suffix}'")
# Keep the last match, which is what the plain loop over all nodes produced.
gtf_date = gtf_date_candidates[-1]

In [450]:
# Display the raw, unparsed listing text (date, time and a trailing number,
# presumably the file size) that follows the GTF link.
gtf_date

'                     18-Jun-2021 13:28            50829040\r\n'

___

# Get cDNA FASTA

http://ftp.ensembl.org/pub/release-105/fasta/taeniopygia_guttata/cdna/Taeniopygia_guttata.bTaeGut1_v1.p.cdna.all.fa.gz

In [451]:
# Download the cDNA FASTA directory listing for the chosen release and species.
url = f"http://ftp.ensembl.org/pub/release-{ENS_rel}/fasta/{species}/cdna"
# A timeout (seconds) keeps the cell from hanging forever on a stalled connection.
html = requests.get(url, timeout=60)
# Fail loudly (e.g. on a misspelled species -> 404) instead of parsing an error page.
html.raise_for_status()
soup = BeautifulSoup(html.text, "html.parser")

In [452]:
# soup

In [453]:
# Walk everything nested inside the <pre> listing once, then split it into
# the <a> link tags and all remaining nodes (document order is preserved).
pre = soup.find('pre')
listing_nodes = list(pre.descendants)
a_elements = [node for node in listing_nodes if node.name == "a"]
nones = [node for node in listing_nodes if node.name != "a"]

In [454]:
# Pick the link whose text contains "cdna.all.fa".
cdna_matches = [link for link in a_elements if "cdna.all.fa" in link.text]
if not cdna_matches:
    # Without this check a failed search would leave `cdna_str` undefined, or
    # silently keep a stale value from an earlier run of this cell.
    raise LookupError("No link containing 'cdna.all.fa' found in the cDNA listing")
# Keep the last match, which is what the plain loop over all links produced.
cdna_str = cdna_matches[-1]

In [455]:
# Display the file name stored in the matched link's href attribute.
cdna_str["href"]

'Homo_sapiens.GRCh38.cdna.all.fa.gz'

In [456]:
# Assemble the full download URL from the release number, species and file name.
cdna_filename = cdna_str['href']
cdna_url = f"http://ftp.ensembl.org/pub/release-{ENS_rel}/fasta/{species}/cdna/{cdna_filename}"
cdna_url

'http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/cdna/Homo_sapiens.GRCh38.cdna.all.fa.gz'

Get release date:

In [457]:
# In `nones`, the link's own text node is immediately followed by the listing
# text for that file, so take the node right after the matching one.
cdna_date_candidates = [
    nones[idx + 1]
    # Skip the final node: it has no successor to read the listing text from.
    for idx, node in enumerate(nones[:-1])
    if "cdna.all.fa" in node.text
]
if not cdna_date_candidates:
    # Avoids leaving `cdna_date` undefined or stale when nothing matches.
    raise LookupError("No listing text found after an entry containing 'cdna.all.fa'")
# Keep the last match, which is what the plain loop over all nodes produced.
cdna_date = cdna_date_candidates[-1]

In [458]:
# Display the raw, unparsed listing text (date, time and a trailing number,
# presumably the file size) that follows the cDNA link.
cdna_date

'                 18-Jun-2021 12:56            76085001\r\n'

___

# Get DNA FASTA link

In [459]:
# Download the DNA FASTA directory listing for the chosen release and species.
url = f"http://ftp.ensembl.org/pub/release-{ENS_rel}/fasta/{species}/dna"
# A timeout (seconds) keeps the cell from hanging forever on a stalled connection.
html = requests.get(url, timeout=60)
# Fail loudly (e.g. on a misspelled species -> 404) instead of parsing an error page.
html.raise_for_status()
soup = BeautifulSoup(html.text, "html.parser")

In [460]:
# Walk everything nested inside the <pre> listing once, then split it into
# the <a> link tags and all remaining nodes (document order is preserved).
pre = soup.find('pre')
listing_nodes = list(pre.descendants)
a_elements = [node for node in listing_nodes if node.name == "a"]
nones = [node for node in listing_nodes if node.name != "a"]

In [463]:
# Pick the link whose text contains "dna.toplevel".
dna_matches = [link for link in a_elements if "dna.toplevel" in link.text]
if not dna_matches:
    # Without this check a failed search would leave `dna_str` undefined, or
    # silently keep a stale value from an earlier run of this cell.
    raise LookupError("No link containing 'dna.toplevel' found in the DNA listing")
# Keep the last match, which is what the plain loop over all links produced.
dna_str = dna_matches[-1]

In [465]:
# Display the file name stored in the matched link's href attribute.
dna_str["href"]

'Homo_sapiens.GRCh38.dna.toplevel.fa.gz'

In [466]:
# Assemble the full download URL from the release number, species and file name.
dna_filename = dna_str['href']
dna_url = f"http://ftp.ensembl.org/pub/release-{ENS_rel}/fasta/{species}/dna/{dna_filename}"
dna_url

'http://ftp.ensembl.org/pub/release-105/fasta/homo_sapiens/dna/Homo_sapiens.GRCh38.dna.toplevel.fa.gz'

Find release date and time:

In [467]:
# In `nones`, the link's own text node is immediately followed by the listing
# text for that file, so take the node right after the matching one.
dna_date_candidates = [
    nones[idx + 1]
    # Skip the final node: it has no successor to read the listing text from.
    for idx, node in enumerate(nones[:-1])
    if "dna.toplevel" in node.text
]
if not dna_date_candidates:
    # Avoids leaving `dna_date` undefined or stale when nothing matches.
    raise LookupError("No listing text found after an entry containing 'dna.toplevel'")
# Keep the last match, which is what the plain loop over all nodes produced.
dna_date = dna_date_candidates[-1]

In [468]:
# Display the raw, unparsed listing text (date, time and a trailing number,
# presumably the file size) that follows the DNA link.
dna_date

'             18-Jun-2021 13:13          1107654500\r\n'

___