In [1]:
import requests
import xml.etree.ElementTree as ET
import csv
from collections import Counter

In [2]:
# Fetch XML from the web
url = "https://s3.amazonaws.com/bbop-sqlite"

# Namespace needed to access elements correctly
namespace = {'ns': 'http://s3.amazonaws.com/doc/2006-03-01/'}

tsv_output = "bbop-sem-sql-catalog-filtered.tsv"

In [10]:
obo_ontologies_tsv = "obo_ontologies.tsv"

In [11]:
def read_tsv_to_dicts(filepath):
    with open(filepath, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter='\t')  # Use '\t' as the delimiter for TSV
        data = [row for row in reader]  # Convert each row into a dictionary
    return data

In [14]:
obo_ontologies = read_tsv_to_dicts(obo_ontologies_tsv)

In [15]:
obo_ontologies = {i["id"]: i for i in obo_ontologies}

In [3]:
response = requests.get(url)
response.raise_for_status()  # Raise an error if the request failed

In [4]:
# Parse the XML
root = ET.fromstring(response.content)


In [17]:
# Extract Contents elements into a list of dictionaries
contents = []
extension_counter = Counter()
base_names = set()
db_gz_basenames = set()


for content in root.findall('ns:Contents', namespace):
    key = content.find('ns:Key', namespace).text
    size = int(content.find('ns:Size', namespace).text)

    # Skip entries with directory paths
    if '/' in key:
        print(f"Skipping directory entry: {key}")
        continue

    # Skip empty files
    if size == 0:
        print(f"Skipping empty entry: {key}")
        continue

    # Custom basename and extension logic
    if '-' in key:
        split_index = key.find('-')
        basename = key[:split_index]
        extension = key[split_index + 1:]
    elif '.' in key:
        split_index = key.find('.')
        basename = key[:split_index]
        extension = key[split_index + 1:]
    else:
        basename = key
        extension = ''

    base_names.add(basename)
    if extension == 'db.gz':
        db_gz_basenames.add(basename)

    # Count the extension
    extension_counter[extension] += 1

    entry = {
        'Key': key,
        'basename': basename,
        'extension': extension,
        'LastModified': content.find('ns:LastModified', namespace).text,
        # 'ETag': content.find('ns:ETag', namespace).text.replace('"', ''),
        'Size': size,
        # 'StorageClass': content.find('ns:StorageClass', namespace).text,
        'obo_foundry_title': obo_ontologies.get(basename, {}).get('title', None)
    }
    contents.append(entry)


Skipping empty entry: biopragmatics-reactome.db.gz.tmp
Skipping empty entry: ceph.db
Skipping empty entry: cmo.db
Skipping empty entry: dictybase.db.gz.tmp
Skipping empty entry: drugcentral.db.gz.tmp
Skipping empty entry: drugmechdb.db.gz.tmp
Skipping empty entry: ecosim.db.gz.tmp
Skipping empty entry: epio-relation-graph.tsv.owl.tmp
Skipping empty entry: epio.db
Skipping empty entry: go-lego.db.gz.tmp
Skipping empty entry: gsso
Skipping empty entry: gsso.db
Skipping empty entry: gssox.db
Skipping empty entry: kegg.genome.db.gz.tmp
Skipping empty entry: lov.db.gz.tmp
Skipping empty entry: micro.db
Skipping empty entry: molgenie.db.gz.tmp
Skipping empty entry: omop.db.gz.tmp
Skipping empty entry: pathbank.db.gz.tmp
Skipping directory entry: releases/2022-05-31/README.md
Skipping directory entry: releases/2022-05-31/aeo.db
Skipping directory entry: releases/2022-05-31/agro-relation-graph.tsv.gz
Skipping directory entry: releases/2022-05-31/agro.db
Skipping directory entry: releases/2022-

In [18]:
non_db_gz_basenames = list(base_names - db_gz_basenames)
non_db_gz_basenames.sort()

In [19]:
non_db_gz_basenames

['README',
 'aeo',
 'allotrope',
 'bad',
 'biopragmatics',
 'cme',
 'envo.db',
 'foo',
 'goa_uniprot_all',
 'inst',
 'kegg',
 'matcher',
 'obo',
 'obo_prefixes',
 'oboe',
 'omiabis',
 'reactome']

In [20]:
extension_counter

Counter({'db.gz': 237,
         'owl': 150,
         'relation-graph.tsv.gz': 147,
         'db': 112,
         'relation-graph.tsv': 3,
         'db.tmp': 3,
         'md': 1,
         'ontology.db': 1,
         'ontology.db.old': 1,
         'reactome.db.gz': 1,
         'db.old': 1,
         'owl.old': 1,
         'journal': 1,
         'relation-graph.tsv.ttl.tmp': 1,
         'amigo.db.gz': 1,
         'lego.db.gz': 1,
         'nucleus-relation-graph.tsv.gz': 1,
         'nucleus.db': 1,
         'nucleus.db.old': 1,
         'nucleus.owl': 1,
         'plus.db': 1,
         'genegroup.db.gz': 1,
         'db.gz.old': 1,
         'genome.db.gz': 1,
         'test.db.old': 1,
         'ingest.db.gz': 1,
         'ontologies.db': 1,
         'core.db.gz': 1,
         'standards.db.gz': 1,
         'min.owl': 1,
         'db.db': 1,
         'Homo-sapiens.db.gz': 1,
         'hs.db.gz': 1,
         'mm.db.gz': 1})

In [21]:
# Write the data to a TSV file
with open(tsv_output, 'w', newline='') as tsvfile:
    writer = csv.DictWriter(tsvfile, fieldnames=contents[0].keys(), delimiter='\t')
    writer.writeheader()
    writer.writerows(contents)

print(f"Fetched and saved {len(contents)} entries to {tsv_output}")

Fetched and saved 680 entries to bbop-sem-sql-catalog-filtered.tsv
