In [8]:
import obonet
import pandas as pd
import argparse
import logging
import re
import numpy as np

# Configure the root logger so that INFO-level messages are emitted.
logging.basicConfig(level=logging.INFO)


def calculate_label(row):
    """
    Build a clean label from the ``def`` entry of a row.

    Every bracketed span (together with the whitespace around it) is removed
    from the definition, then leading and trailing double quotes are stripped.

    :param row: Mapping-like object exposing ``.get`` (for example a pandas
        Series or a dict) that may hold a ``"def"`` entry.
    :return: The cleaned definition string, or ``None`` when the row has no
        usable definition (key absent, ``None``, or a non-string placeholder
        such as NaN).
    """
    definition = row.get("def", None)

    # A missing cell in a pandas row is NaN (a float), not None; handing that
    # to re.sub would raise TypeError, so treat every non-string as "absent".
    if not isinstance(definition, str):
        return None

    # Remove any text between brackets, e.g., PubMed citations
    definition = re.sub(r'\s*\[.*?\]\s*', '', definition)
    # Remove leading and trailing quotation marks
    return definition.strip('"')

def process_synonyms(row)->dict:
    """Group the synonyms of a GO annotation by scope.

    :param row: Iterable of raw synonym strings of the form
        ``"text" SCOPE [xrefs]`` or ``"text" SCOPE synonym_type [xrefs]``.
        ``None``, NaN or an empty collection means there are no synonyms.
    :type row: list[str] | float | None
    :return: Dictionary with the keys ``synonym_exact``, ``synonym_narrow``,
        ``synonym_related`` and ``synonym_broad``; each maps to the list of
        synonym texts found for that scope, in input order. Strings that do
        not match the expected form are skipped.
    :rtype: dict
    """
    scopes = {"EXACT": [], "NARROW": [], "RELATED": [], "BROAD": []}

    # A missing cell is NaN, which is a float. An identity test against np.nan
    # misses NaN objects that are not that singleton, and NaN is truthy, so
    # check the type instead of relying on `is` / truthiness.
    is_missing = isinstance(row, float) or not row

    # Quoted text, then the scope, then an optional synonym-type token,
    # then the opening bracket of the xref list.
    pattern = r'\"(.+?)\"\s+(EXACT|NARROW|RELATED|BROAD)(?:\s+[^\s\[\]]+)?\s+\['

    for synonym in ([] if is_missing else row):
        match = re.search(pattern, synonym)
        if match:
            text, scope = match.groups()
            scopes[scope].append(text)

    return {
        "synonym_exact": scopes["EXACT"],
        "synonym_narrow": scopes["NARROW"],
        "synonym_related": scopes["RELATED"],
        "synonym_broad": scopes["BROAD"]
    }


def download_and_process_obo(url: str):
    """
    Load an OBO file and split its node IDs into used and obsolete ones.

    :param url: Location of the ``.obo`` file; passed unchanged to
        ``obonet.read_obo``.
    :return: Tuple ``(used_labels, obsolete_labels)`` of sets of node IDs.
        ``obsolete_labels`` holds the nodes whose ``is_obsolete`` attribute
        equals the string ``"true"``; ``used_labels`` holds every other node.
    """
    logging.info("Downloading and processing OBO file...")

    # Load the .obo file directly from the URL into a networkx graph using obonet.
    # ignore_obsolete=False keeps obsolete terms in the graph so they can be counted.
    graph = obonet.read_obo(url, ignore_obsolete=False)

    # Read the flag straight from the node attributes; .get() tolerates nodes
    # (or whole files) that carry no `is_obsolete` attribute at all.
    obsolete_labels = {
        node_id
        for node_id, attributes in graph.nodes(data=True)
        if attributes.get("is_obsolete") == "true"
    }
    used_labels = set(graph.nodes) - obsolete_labels
    return used_labels, obsolete_labels


# Release snapshots to compare, processed in the order listed.
dates = [
    '2019-07-01',
    '2020-07-16',
    '2021-07-02',
    '2022-07-01',
    '2023-07-27',
    '2024-03-28',
]
urls = [f'http://release.geneontology.org/{date}/ontology/go.obo' for date in dates]

# Running unions of the IDs seen in the releases processed so far.
prev_used_labels = set()
prev_obsolete_labels = set()
prev_all_labels = set()

for url in urls:
    used_labels, obsolete_labels = download_and_process_obo(url)

    # Report how many IDs of this release were absent from the running unions.
    print(url)
    print('new labels = ', len(used_labels.difference(prev_used_labels)))
    print('new obsolete = ', len(obsolete_labels.difference(prev_obsolete_labels)))
    print('all labels = ', len((obsolete_labels | used_labels) - prev_all_labels))

    # Fold this release into the running unions for the next iteration.
    prev_used_labels |= used_labels
    prev_obsolete_labels |= obsolete_labels
    prev_all_labels = prev_used_labels | prev_obsolete_labels


INFO:root:Downloading and processing OBO file...
INFO:root:Will decode content from http://release.geneontology.org/2019-07-01/ontology/go.obo using utf-8 charset.
INFO:root:Calculating labels...
INFO:root:Downloading and processing OBO file...


http://release.geneontology.org/2019-07-01/ontology/go.obo
new labels =  44945
new obsolete =  2456
all labels =  47401


INFO:root:Will decode content from http://release.geneontology.org/2020-07-16/ontology/go.obo using utf-8 charset.
INFO:root:Calculating labels...
INFO:root:Downloading and processing OBO file...
INFO:root:Will decode content from http://release.geneontology.org/2021-07-02/ontology/go.obo using utf-8 charset.


http://release.geneontology.org/2020-07-16/ontology/go.obo
new labels =  342
new obsolete =  503
all labels =  344


INFO:root:Calculating labels...
INFO:root:Downloading and processing OBO file...
INFO:root:Will decode content from http://release.geneontology.org/2022-07-01/ontology/go.obo using utf-8 charset.


http://release.geneontology.org/2021-07-02/ontology/go.obo
new labels =  340
new obsolete =  354
all labels =  341


INFO:root:Calculating labels...
INFO:root:Downloading and processing OBO file...
INFO:root:Will decode content from http://release.geneontology.org/2023-07-27/ontology/go.obo using utf-8 charset.


http://release.geneontology.org/2022-07-01/ontology/go.obo
new labels =  267
new obsolete =  461
all labels =  265


INFO:root:Calculating labels...
INFO:root:Downloading and processing OBO file...


http://release.geneontology.org/2023-07-27/ontology/go.obo
new labels =  322
new obsolete =  939
all labels =  323


INFO:root:Will decode content from http://release.geneontology.org/2024-03-28/ontology/go.obo using utf-8 charset.
INFO:root:Calculating labels...


http://release.geneontology.org/2024-03-28/ontology/go.obo
new labels =  218
new obsolete =  798
all labels =  222


In [9]:
# Number of distinct IDs (used or obsolete) accumulated over all processed releases.
len(prev_all_labels)

48896