In [1]:
import csv
import pprint
import xml.etree.ElementTree as ET
from collections import Counter

import requests
import yaml

In [3]:
# TSV inputs
obo_ontologies_tsv = "obo_ontologies.tsv"
bioportal_ontologies_tsv = "bioportal_ontology_class_counts.tsv"

In [2]:
# Fetch XML from the web
s3_url = "https://s3.amazonaws.com/bbop-sqlite"

registry_url = "https://raw.githubusercontent.com/INCATools/semantic-sql/refs/heads/main/src/semsql/builder/registry/ontologies.yaml"

ols_ontologies_endpoint_url = "https://www.ebi.ac.uk/ols4/api/ontologies?lang=en"

# Namespace needed to access elements correctly
namespace = {'ns': 'http://s3.amazonaws.com/doc/2006-03-01/'}

tsv_output = "bbop-sem-sql-catalog-filtered.tsv"

In [4]:
def read_tsv_to_dicts(filepath):
    with open(filepath, mode='r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter='\t')  # Use '\t' as the delimiter for TSV
        data = [row for row in reader]  # Convert each row into a dictionary
    return data

In [5]:
obo_ontologies = read_tsv_to_dicts(obo_ontologies_tsv)

FileNotFoundError: [Errno 2] No such file or directory: 'obo_ontologies.tsv'

In [6]:
bioportal_ontologies = read_tsv_to_dicts(bioportal_ontologies_tsv)

In [7]:
obo_ontologies = {i["id"]: i for i in obo_ontologies}

In [8]:
bioportal_ontologies = {i["acronym"].lower(): i for i in bioportal_ontologies}

In [9]:
bioportal_ontologies

{'aba-amb': {'acronym': 'ABA-AMB',
  'name': 'Allen Brain Atlas (ABA) Adult Mouse Brain Ontology',
  'class_count': '913'},
 'abd': {'acronym': 'ABD',
  'name': 'Anthology of Biosurveillance Diseases',
  'class_count': '1445'},
 'aceso': {'acronym': 'ACESO',
  'name': 'Adverse Childhood Experiences Ontology',
  'class_count': '296'},
 'acgt-mo': {'acronym': 'ACGT-MO',
  'name': 'Cancer Research and Management ACGT Master Ontology',
  'class_count': '1770'},
 'acvd_ontology': {'acronym': 'ACVD_ONTOLOGY',
  'name': 'Atherosclerotic Cerebrovascular Disease Ontology',
  'class_count': '1719'},
 'ad-drop': {'acronym': 'AD-DROP',
  'name': 'Alzheimer Disease Relevance Ontology by Process',
  'class_count': '25'},
 'adalab': {'acronym': 'ADALAB', 'name': 'AdaLab ontology', 'class_count': ''},
 'adalab-meta': {'acronym': 'ADALAB-META',
  'name': 'AdaLab-meta ontology',
  'class_count': ''},
 'adar': {'acronym': 'ADAR',
  'name': 'Autism DSM-ADI-R ontology',
  'class_count': '791'},
 'adcad': {

In [10]:
# Fetch and parse the semsql registry YAML
response = requests.get(registry_url)
response.raise_for_status()  # Raise an error for bad status codes
registry_dict = yaml.safe_load(response.text)

In [11]:
pprint.pprint(registry_dict)

{'description': 'Registry overlap for Semantic-SQL designed to supplement OBO '
                'vocabularies',
 'id': 'semantic-sql-registry',
 'license': 'CC0',
 'ontologies': {'aio': {'url': 'https://raw.githubusercontent.com/berkeleybop/artificial-intelligence-ontology/main/aio.owl'},
                'bao': {'has_imports': True,
                        'url': 'http://www.bioassayontology.org/bao/bao_complete.owl'},
                'bcio': {'prefixmap': {'BCIO': 'http://humanbehaviourchange.org/ontology/BCIO_',
                                       'BCIOR': 'http://humanbehaviourchange.org/ontology/BCIOR_'},
                         'url': 'http://humanbehaviourchange.org/ontology/bcio.owl'},
                'bero': {'url': 'https://github.com/berkeleybop/bero/releases/download/2022-05-26/bero.owl'},
                'bfo2020': {'url': 'http://purl.obolibrary.org/obo/bfo/2020/bfo.owl'},
                'bfo2020_core': {'url': 'http://purl.obolibrary.org/obo/bfo/2020/bfo-core.owl'},


In [21]:
x = list(registry_dict['ontologies'].keys())
x.sort()
x

['aio',
 'bao',
 'bcio',
 'bero',
 'bfo2020',
 'bfo2020_core',
 'bfo2020_notime',
 'bfo2020_time',
 'biolink',
 'biopax',
 'biopragmatics-reactome',
 'biovoices',
 'cco',
 'cellosaurus',
 'chebiplus',
 'chemessence',
 'chemont',
 'chiro',
 'chr',
 'co_324',
 'comet',
 'complexportal',
 'comploinc',
 'cosmo',
 'cpont',
 'credit',
 'cso',
 'dbpendiaont',
 'dhba',
 'dmba',
 'drugbank',
 'drugcentral',
 'drugmechdb',
 'dtype',
 'eccode',
 'ecosim',
 'ecso',
 'edam',
 'efo',
 'enanomapper',
 'enigma_context',
 'envthes',
 'fhkb',
 'fibo',
 'fma',
 'foodon',
 'gard',
 'go',
 'go-amigo',
 'go-lego',
 'goldterms',
 'gtdb',
 'hba',
 'hcao',
 'hgnc',
 'hgnc.genegroup',
 'hpinternational',
 'icd10cm',
 'icd10who',
 'interpro',
 'iof',
 'ito',
 'kegg.genome',
 'kgcl',
 'kin',
 'lov',
 'maxo',
 'mba',
 'mixs',
 'mlo',
 'modl',
 'molgenie',
 'mondo-ingest',
 'msio',
 'nando',
 'ncit',
 'neo',
 'nmdc_schema',
 'obiws',
 'oboe-core',
 'oboe-standards',
 'occo',
 'oeo',
 'ogco',
 'omim',
 'omop',
 'ont

In [12]:
# Fetch the BBOP SQLite S3 XML catalog
response = requests.get(s3_url)
response.raise_for_status()  # Raise an error if the request failed

In [13]:
# Parse the XML
root = ET.fromstring(response.content)


In [14]:
bioportal_ontologies

{'aba-amb': {'acronym': 'ABA-AMB',
  'name': 'Allen Brain Atlas (ABA) Adult Mouse Brain Ontology',
  'class_count': '913'},
 'abd': {'acronym': 'ABD',
  'name': 'Anthology of Biosurveillance Diseases',
  'class_count': '1445'},
 'aceso': {'acronym': 'ACESO',
  'name': 'Adverse Childhood Experiences Ontology',
  'class_count': '296'},
 'acgt-mo': {'acronym': 'ACGT-MO',
  'name': 'Cancer Research and Management ACGT Master Ontology',
  'class_count': '1770'},
 'acvd_ontology': {'acronym': 'ACVD_ONTOLOGY',
  'name': 'Atherosclerotic Cerebrovascular Disease Ontology',
  'class_count': '1719'},
 'ad-drop': {'acronym': 'AD-DROP',
  'name': 'Alzheimer Disease Relevance Ontology by Process',
  'class_count': '25'},
 'adalab': {'acronym': 'ADALAB', 'name': 'AdaLab ontology', 'class_count': ''},
 'adalab-meta': {'acronym': 'ADALAB-META',
  'name': 'AdaLab-meta ontology',
  'class_count': ''},
 'adar': {'acronym': 'ADAR',
  'name': 'Autism DSM-ADI-R ontology',
  'class_count': '791'},
 'adcad': {

In [15]:
# Extract Contents elements into a list of dictionaries
contents = []
extension_counter = Counter()
base_names = set()
db_gz_basenames = set()


for content in root.findall('ns:Contents', namespace):
    key = content.find('ns:Key', namespace).text
    size = int(content.find('ns:Size', namespace).text)

    # Skip entries with directory paths
    if '/' in key:
        print(f"Skipping directory entry: {key}")
        continue

    # Skip empty files
    if size == 0:
        print(f"Skipping empty entry: {key}")
        continue

    # Custom basename and extension logic
    if '-' in key:
        split_index = key.find('-')
        basename = key[:split_index]
        extension = key[split_index + 1:]
    elif '.' in key:
        split_index = key.find('.')
        basename = key[:split_index]
        extension = key[split_index + 1:]
    else:
        basename = key
        extension = ''

    base_names.add(basename)
    if extension == 'db.gz':
        db_gz_basenames.add(basename)

    # Count the extension
    extension_counter[extension] += 1

    entry = {
        'Key': key,
        'basename': basename,
        'extension': extension,
        'LastModified': content.find('ns:LastModified', namespace).text,
        # 'ETag': content.find('ns:ETag', namespace).text.replace('"', ''),
        'Size': size,
        # 'StorageClass': content.find('ns:StorageClass', namespace).text,
        'obo_foundry_title': obo_ontologies.get(basename, {}).get('title', None),
        'bioportal_name': bioportal_ontologies.get(basename, {}).get('name', None),
        'registry_url': registry_dict['ontologies'].get(basename, {}).get('url', None),
    }
    contents.append(entry)


Skipping empty entry: biopragmatics-reactome.db.gz.tmp
Skipping empty entry: ceph.db
Skipping empty entry: cmo.db
Skipping empty entry: dictybase.db.gz.tmp
Skipping empty entry: drugcentral.db.gz.tmp
Skipping empty entry: drugmechdb.db.gz.tmp
Skipping empty entry: ecosim.db.gz.tmp
Skipping empty entry: epio-relation-graph.tsv.owl.tmp
Skipping empty entry: epio.db
Skipping empty entry: go-lego.db.gz.tmp
Skipping empty entry: gsso
Skipping empty entry: gsso.db
Skipping empty entry: gssox.db
Skipping empty entry: kegg.genome.db.gz.tmp
Skipping empty entry: lov.db.gz.tmp
Skipping empty entry: micro.db
Skipping empty entry: molgenie.db.gz.tmp
Skipping empty entry: omop.db.gz.tmp
Skipping empty entry: pathbank.db.gz.tmp
Skipping directory entry: releases/2022-05-31/README.md
Skipping directory entry: releases/2022-05-31/aeo.db
Skipping directory entry: releases/2022-05-31/agro-relation-graph.tsv.gz
Skipping directory entry: releases/2022-05-31/agro.db
Skipping directory entry: releases/2022-

In [16]:
non_db_gz_basenames = list(base_names - db_gz_basenames)
non_db_gz_basenames.sort()

In [17]:
non_db_gz_basenames

['README',
 'aeo',
 'allotrope',
 'bad',
 'biopragmatics',
 'cme',
 'envo.db',
 'foo',
 'goa_uniprot_all',
 'inst',
 'kegg',
 'matcher',
 'obo',
 'obo_prefixes',
 'oboe',
 'omiabis',
 'reactome']

In [18]:
extension_counter

Counter({'db.gz': 237,
         'owl': 150,
         'relation-graph.tsv.gz': 147,
         'db': 112,
         'relation-graph.tsv': 3,
         'db.tmp': 3,
         'md': 1,
         'ontology.db': 1,
         'ontology.db.old': 1,
         'reactome.db.gz': 1,
         'db.old': 1,
         'owl.old': 1,
         'journal': 1,
         'relation-graph.tsv.ttl.tmp': 1,
         'amigo.db.gz': 1,
         'lego.db.gz': 1,
         'nucleus-relation-graph.tsv.gz': 1,
         'nucleus.db': 1,
         'nucleus.db.old': 1,
         'nucleus.owl': 1,
         'plus.db': 1,
         'genegroup.db.gz': 1,
         'db.gz.old': 1,
         'genome.db.gz': 1,
         'test.db.old': 1,
         'ingest.db.gz': 1,
         'ontologies.db': 1,
         'core.db.gz': 1,
         'standards.db.gz': 1,
         'min.owl': 1,
         'db.db': 1,
         'Homo-sapiens.db.gz': 1,
         'hs.db.gz': 1,
         'mm.db.gz': 1})

In [19]:
# Write the data to a TSV file
with open(tsv_output, 'w', newline='') as tsvfile:
    writer = csv.DictWriter(tsvfile, fieldnames=contents[0].keys(), delimiter='\t')
    writer.writeheader()
    writer.writerows(contents)

print(f"Fetched and saved {len(contents)} entries to {tsv_output}")

Fetched and saved 680 entries to bbop-sem-sql-catalog-filtered.tsv
