In [1]:
import re
from typing import Any, Dict, List, Optional

import duckdb
import pandas as pd
from oaklib import get_adapter
from oaklib.implementations import AggregatorImplementation
from sqlalchemy import create_engine, text
from sqlalchemy.engine import Engine

What's a generic, high-performance way to move data between DuckDB and in-memory Python data structures?

Wrap the connection in SQLAlchemy and use pandas?

I'm adding connections and transactions, which seems like a lot of overhead.

In [2]:
duckdb_filename = "ncbi_biosamples.duckdb"

In [3]:
def open_duckdb_file(filename):
    """
    Open a connection to a file-backed DuckDB database.

    Args:
      filename: Path of the DuckDB database file; forwarded to
        ``duckdb.connect`` as its ``database`` argument.

    Returns:
      duckdb.DuckDBPyConnection: The connection object produced by
      ``duckdb.connect``.
    """
    return duckdb.connect(database=filename)

In [4]:
def create_engine_connection(db_path: str) -> Engine:
    """
    Creates and returns an SQLAlchemy Engine for the DuckDB database.

    Args:
        db_path (str): Path to the DuckDB file; interpolated into a
            ``duckdb:///<db_path>`` connection URL.

    Returns:
        sqlalchemy.engine.Engine: A connection engine for the DuckDB database.

    Raises:
        RuntimeError: If ``create_engine`` fails for any reason. The original
            exception is attached as ``__cause__``.
    """
    try:
        return create_engine(f"duckdb:///{db_path}")
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying failure as the cause.
        raise RuntimeError(f"Failed to create SQLAlchemy engine: {e}") from e

In [5]:
def list_tables(engine: Engine) -> None:
    """
    Prints a list of all tables in the current schema.

    Runs ``SHOW TABLES`` on a short-lived connection and prints the first
    column of every returned row.

    Args:
        engine (sqlalchemy.engine.Engine): SQLAlchemy engine for the database.

    Raises:
        RuntimeError: If connecting, executing or printing fails. The original
            exception is attached as ``__cause__``.
    """
    try:
        with engine.connect() as connection:
            result = connection.execute(text("SHOW TABLES"))
            print("Tables in the current schema:")
            for row in result:
                print(f" - {row[0]}")
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying failure as the cause.
        raise RuntimeError(f"Error listing tables: {e}") from e

In [6]:

def extract_curies_from_text(
    text: str,
    row_id: Optional[int] = None,  # Generic parameter for row context
    prefix_min_len: int = 2,
    prefix_max_len: int = 10,
    local_id_min_len: int = 2,
    local_id_max_len: int = 10,
    prefix_chars_allowed: str = r"[a-zA-Z]",
    local_id_chars_allowed: str = r"[0-9]",
    delimiter_chars_allowed: str = r"[_:]",
) -> List[dict]:
    """
    Extract ontology class CURIEs from text and return them as dictionaries.

    A CURIE is recognised as ``<prefix><delimiter><local_id>`` bounded by word
    boundaries on both sides. The ``*_chars_allowed`` arguments are regex
    fragments that are interpolated into a verbose-mode (``re.VERBOSE``)
    pattern.

    Args:
        text (str): The input text. ``None`` or an empty string yields no matches.
        row_id (int, optional): A generic ID or reference for the row context;
            copied unchanged into every returned dictionary.
        prefix_min_len (int): Minimum length of the prefix part of the CURIE.
        prefix_max_len (int): Maximum length of the prefix part of the CURIE.
        local_id_min_len (int): Minimum length of the local ID part of the CURIE.
        local_id_max_len (int): Maximum length of the local ID part of the CURIE.
        prefix_chars_allowed (str): Allowed characters for the prefix.
        local_id_chars_allowed (str): Allowed characters for the local ID.
        delimiter_chars_allowed (str): Allowed delimiters between prefix and local ID.

    Returns:
        List[dict]: One dictionary per match, in order of appearance, with the
        keys ``row_id``, ``curie_prefix``, ``curie_delimiter`` and
        ``curie_local_id``. Matched text is returned as found (no case folding).
    """
    # Nothing to scan: avoids a TypeError from the regex engine on None.
    if not text:
        return []

    pattern = rf"""
        \b                                      # Word boundary
        (?P<prefix>{prefix_chars_allowed}{{{prefix_min_len},{prefix_max_len}}})  # Prefix
        (?P<delimiter>{delimiter_chars_allowed})                               # Delimiter
        (?P<local_id>{local_id_chars_allowed}{{{local_id_min_len},{local_id_max_len}}})  # Local ID
        \b                                      # Word boundary
    """
    return [
        {
            "row_id": row_id,  # General ID reference
            "curie_prefix": match.group("prefix"),
            "curie_delimiter": match.group("delimiter"),
            "curie_local_id": match.group("local_id"),
        }
        for match in re.finditer(pattern, text, re.VERBOSE)
    ]


In [7]:
def class_detection_by_label(
    text: str,
    ontology_adapters: Dict[str, Any],
    row_id: Optional[int] = None,
    min_annotated_length: int = 3
) -> List[Dict[str, Any]]:
    """
    Detect ontology class labels in a string using multiple ontology adapters and collect the annotations.
    Only annotations with a match string length >= min_annotated_length are included.
    Additionally, a flag 'is_longest_match' is added to every kept annotation: it is True
    for each annotation whose match string has the maximum length among the kept
    annotations for this string (so equally long matches are all flagged), False otherwise.

    Args:
        text (str): The input string to be analyzed.
        ontology_adapters (dict): Dictionary of ontology adapters. Only the values are
            used; each must provide ``annotate_text(text)`` yielding objects with the
            attributes ``match_string``, ``subject_start``, ``subject_end``,
            ``predicate_id``, ``object_id`` and ``object_label``.
        row_id (int, optional): A generic ID or reference for the row context; stored
            under the ``id`` key of every returned dictionary.
        min_annotated_length (int): Minimum length for the annotated match string to be included.

    Returns:
        List[dict]: A list of dictionaries with the annotations, each representing a new row.
        The list is empty when no adapter produces a sufficiently long match.
    """
    annotations_for_this_string = []

    # Annotate the string using each ontology adapter
    for adapter in ontology_adapters.values():
        # `or ()` tolerates adapters that return None or an empty container.
        for annotation in adapter.annotate_text(text) or ():
            subject_string = annotation.match_string

            # Skip matches that are too short to be meaningful
            if len(subject_string) < min_annotated_length:
                continue

            annotations_for_this_string.append(
                {
                    "id": row_id,
                    "subject_string": subject_string,
                    "subject_start": annotation.subject_start,
                    "subject_end": annotation.subject_end,
                    "predicate_id": annotation.predicate_id,
                    "concluded_curie": annotation.object_id,
                    "object_string": annotation.object_label,
                }
            )

    if annotations_for_this_string:
        # Compare by length, not by string equality, so that different match
        # strings tied for the longest length are all flagged consistently.
        longest_length = max(
            len(annotation["subject_string"]) for annotation in annotations_for_this_string
        )
        for annotation in annotations_for_this_string:
            annotation["is_longest_match"] = len(annotation["subject_string"]) == longest_length

    return annotations_for_this_string


In [8]:
def create_ontology_adapters(ontology_short_names: list) -> dict:
    """
    Build one OAK adapter per ontology short name.

    Every short name is turned into a ``sqlite:obo:<name>`` selector and passed
    to ``get_adapter``. If that fails for a name, a warning is printed and the
    name is left out of the result; the remaining names are still processed.

    Args:
        ontology_short_names (list): Ontology short names (e.g., ['envo', 'po']).

    Returns:
        dict: Mapping from ontology short name to the adapter created for it.
    """
    adapters_by_name = {}
    for name in ontology_short_names:
        selector = f"sqlite:obo:{name}"
        try:
            adapter = get_adapter(selector)
            adapters_by_name[name] = adapter
        except Exception as err:
            # Best effort: report the failure and continue with the next ontology.
            print(f"Warning: Failed to create adapter for {name}. Error: {err}")
    return adapters_by_name

In [9]:
duckdb_engine = create_engine_connection(duckdb_filename)

In [10]:
list_tables(duckdb_engine)

Tables in the current schema:
 - attributes
 - contexts_to_normalized_strings
 - curies_of_strings_of_splits
 - harmonized_attributes_wide
 - ids
 - links
 - normalized_context_strings
 - organism
 - overview


In [11]:
# Drop the derived tables so the rebuild in the following cells can start from scratch.
# engine.begin() opens a transaction that is committed when the block exits
# cleanly and rolled back if any statement raises.
derived_tables = (
    "normalized_contexts",
    "normalized_context_strings",
    "contexts_to_normalized_strings",
)
with duckdb_engine.begin() as connection:
    for table_name in derived_tables:
        # Drop both the schema-qualified and the bare name, as the original cell did.
        connection.execute(text(f"DROP TABLE IF EXISTS main.{table_name};"))
        connection.execute(text(f"DROP TABLE IF EXISTS {table_name};"))

In [12]:
list_tables(duckdb_engine)

Tables in the current schema:
 - attributes
 - curies_of_strings_of_splits
 - harmonized_attributes_wide
 - ids
 - links
 - organism
 - overview


In [13]:
# Rebuild the normalized-context lookup and mapping tables inside one explicit transaction.
with duckdb_engine.connect() as connection:
    
    transaction = connection.begin()
    
    # Step 1: stage the env_broad_scale / env_local_scale / env_medium rows of
    # main.attributes together with a normalized copy of `content`: lower-cased,
    # trimmed, and with every run of whitespace replaced by a single space.
    # NOTE(review): the table is created as TEMPORARY without a schema but is
    # queried below as main.normalized_contexts -- confirm how the DuckDB
    # version in use resolves that name.
    connection.execute(text("""
    CREATE TEMPORARY TABLE normalized_contexts AS
    SELECT
        id,
        harmonized_name,
        content,
        regexp_replace(trim(lower(content)), '\\s+', ' ', 'g') AS normalized
    FROM
        main.attributes
    WHERE
        harmonized_name IN ('env_broad_scale', 'env_local_scale', 'env_medium');
    """))

    # Step 2: lookup table holding each distinct normalized string under an integer key.
    connection.execute(text("""
    CREATE TABLE main.normalized_context_strings (
    normalized_context_string_id INTEGER PRIMARY KEY,
    normalized_context_string TEXT UNIQUE
    );
    """))

    # Step 3: fill the lookup table, numbering the DISTINCT normalized strings
    # with ROW_NUMBER().
    # NOTE(review): OVER () has no ORDER BY, so the id given to a string is
    # presumably not guaranteed to be stable between runs -- confirm whether
    # downstream consumers rely on reproducible ids.
    connection.execute(text("""
    INSERT INTO main.normalized_context_strings (normalized_context_string_id, normalized_context_string)
    SELECT
        ROW_NUMBER() OVER () AS string_id,
        normalized
    FROM (
        SELECT DISTINCT normalized
        FROM main.normalized_contexts
    ) sub;
    """))
    
    # Step 4: mapping table from an attribute row (id, harmonized_name) to the
    # id of its normalized string.
    connection.execute(text("""
    CREATE TABLE main.contexts_to_normalized_strings (
    id INTEGER ,
    harmonized_name TEXT ,
    normalized_context_string_id INTEGER
    );
    """))

    # Step 5: populate the mapping by joining the staged rows to the lookup
    # table on the normalized string.
    connection.execute(text("""
    INSERT INTO main.contexts_to_normalized_strings (id, harmonized_name, normalized_context_string_id)
    SELECT
        nc.id,
        nc.harmonized_name,
        ncs.normalized_context_string_id
    FROM
        main.normalized_contexts nc
    JOIN main.normalized_context_strings ncs
    ON
        nc.normalized = ncs.normalized_context_string;
    """))

    # Step 6: the staging table is no longer needed; drop it under both the
    # schema-qualified and the bare name.
    connection.execute(text("""
    drop table if exists main.normalized_contexts;
    """))
    connection.execute(text("""
    drop table if exists normalized_contexts;
    """))

    # Make all of the above permanent in one commit.
    transaction.commit()


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [14]:
list_tables(duckdb_engine)

Tables in the current schema:
 - attributes
 - contexts_to_normalized_strings
 - curies_of_strings_of_splits
 - harmonized_attributes_wide
 - ids
 - links
 - normalized_context_strings
 - organism
 - overview


 # locally retrieve and process normalized_context_strings

In [15]:
normalized_context_strings = pd.read_sql("select * from main.normalized_context_strings", duckdb_engine)

In [16]:
normalized_context_strings

Unnamed: 0,normalized_context_string_id,normalized_context_string
0,1,not determined
1,2,arboreal habitat
2,3,antarctic cold desert
3,4,water
4,5,managed aquifer recharge system
...,...,...
165331,165332,peatland[envo_00000044]
165332,165333,tropica
165333,165334,6.6 estuary/coastal waters
165334,165335,hinderbanken


In [17]:
normalized_context_strings.to_csv('normalized_context_strings.tsv', index=False, sep='\t')

In [18]:
# Ensure the column has string values and handle non-string entries
normalized_context_strings['normalized_context_string'] = normalized_context_strings['normalized_context_string'].fillna("").astype(str)


# extract curies

In [19]:
# Run CURIE extraction on every normalized string; each row yields a list of match dicts
curie_matches_per_row = normalized_context_strings.apply(
    lambda record: extract_curies_from_text(
        text=record["normalized_context_string"],
        # Carry the string's id along as row context when the column is present
        row_id=record.get("normalized_context_string_id", None),
    ),
    axis=1,
)
# One element per match: unpack the lists, then drop the NaN placeholders left by empty lists
curies_list = curie_matches_per_row.explode().dropna()

In [20]:
# Convert the list of dictionaries into a DataFrame
direct_curies = pd.DataFrame(curies_list.tolist())

In [21]:
direct_curies['reassembled_curie'] = direct_curies['curie_prefix'].fillna('').str.upper() + \
                                 ":" + \
                                 direct_curies['curie_local_id'].fillna('')

In [22]:
# Drop the specified columns
direct_curies = direct_curies.drop(columns=['curie_delimiter', 'curie_local_id'])

In [23]:
direct_curies = direct_curies.rename(columns={'row_id': 'unique_split_normalized_context_string_id'})

In [24]:
direct_curies

Unnamed: 0,unique_split_normalized_context_string_id,curie_prefix,reassembled_curie
0,6,envo,ENVO:00002003
1,7,envo,ENVO:00005750
2,18,envo,ENVO:00003040
3,33,envo,ENVO:00002005
4,37,envo,ENVO:00002149
...,...,...,...
44054,165324,envo,ENVO:00000233
44055,165331,envo,ENVO:01000237
44056,165332,envo,ENVO:00000044
44057,165336,envo,ENVO:01001269


In [25]:
direct_curies['curie_prefix'].value_counts()

curie_prefix
envo        32584
bto          2020
uberon       1928
po           1390
gut           662
            ...  
genepio         1
sprouted        1
tejon           1
emapa           1
ww              1
Name: count, Length: 195, dtype: int64

In [26]:
# Persist the extracted CURIEs. engine.begin() commits the write when the block
# exits cleanly (and rolls back on error) instead of relying on implicit commit behaviour.
with duckdb_engine.begin() as connection:
    direct_curies.to_sql('direct_curies', connection, if_exists='replace', index=False)

# entity recognition of classes

In [27]:
ontologies = ['envo', 'po']

In [28]:
ontology_adapters = create_ontology_adapters(ontologies)

In [29]:
agg = AggregatorImplementation(implementations=ontology_adapters)

In [30]:
# Annotate every normalized string with the loaded ontology adapters.
# Runtime note: about 10 minutes with envo and po, with no manipulation of the
# input except for (lowercased) de-duplication.
annotations_per_row = normalized_context_strings.apply(
    lambda record: class_detection_by_label(
        text=record["normalized_context_string"],
        ontology_adapters=ontology_adapters,
        row_id=record["normalized_context_string_id"],
    ),
    axis=1,
)
# One element per annotation: unpack the lists, then drop the NaN placeholders left by empty lists
annotations_list = annotations_per_row.explode().dropna()

ERROR:root:Skipping statements(subject=ENVO:00000112,predicate=oio:hasDbXref,object=<http://www.eionet.europa.eu/gemet/concept/8704>,value=None,datatype=None,language=None,); ValueError: <http://www.eionet.europa.eu/gemet/concept/8704> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:00001996,predicate=oio:hasDbXref,object=<https://en.wikipedia.org/wiki/Acid_mine_drainage>,value=None,datatype=None,language=None,); ValueError: <https://en.wikipedia.org/wiki/Acid_mine_drainage> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000225,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests>,value=None,datatype=None,language=None,); ValueError: <https://www.worldwildlife.org/biomes/tropical-and-subtropical-dry-broadleaf-forests> is not a valid URI or CURIE
ERROR:root:Skipping statements(subject=ENVO:01000227,predicate=oio:hasDbXref,object=<https://www.worldwildlife.org/biomes/tropical

In [31]:
# Convert the list of dictionaries into a DataFrame
curies_of_strings = pd.DataFrame(annotations_list.tolist())

In [32]:
# Display the resulting DataFrame
curies_of_strings

Unnamed: 0,id,subject_string,subject_start,subject_end,predicate_id,concluded_curie,object_string,is_longest_match
0,2,habitat,10,16,rdfs:label,ENVO:01000739,habitat,True
1,3,col,11,13,oio:hasRelatedSynonym,ENVO:00000084,col,False
2,3,desert,16,21,oio:hasBroadSynonym,ENVO:00000172,desert,False
3,3,desert,16,21,oio:hasBroadSynonym,ENVO:00000173,desert,False
4,3,desert,16,21,oio:hasBroadSynonym,ENVO:00000183,desert,False
...,...,...,...,...,...,...,...,...
357112,165336,space,19,23,oio:hasBroadSynonym,UBERON:0000464,space,False
357113,165336,spa,19,21,oio:hasBroadSynonym,ENVO:01000938,spa,False
357114,165336,area of developed space,1,23,rdfs:label,ENVO:01001269,area of developed space,True
357115,165336,temperate biome,42,56,rdfs:label,ENVO:01001831,temperate biome,False


In [33]:
# Persist the label-based annotations. engine.begin() commits the write when the
# block exits cleanly (and rolls back on error) instead of relying on implicit commit behaviour.
with duckdb_engine.begin() as connection:
    curies_of_strings.to_sql('curies_of_strings_of_splits', connection, if_exists='replace', index=False)



FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [34]:
list_tables(duckdb_engine)

Tables in the current schema:
 - attributes
 - contexts_to_normalized_strings
 - curies_of_strings_of_splits
 - direct_curies
 - harmonized_attributes_wide
 - ids
 - links
 - normalized_context_strings
 - organism
 - overview


In [35]:
duckdb_engine.dispose()

In [36]:
print(curies_of_strings['predicate_id'].value_counts())

predicate_id
rdfs:label               139185
oio:hasRelatedSynonym     74379
oio:hasBroadSynonym       54144
rdf:ID                    42460
oio:hasExactSynonym       37408
oio:hasNarrowSynonym       9538
oio:hasDbXref                 3
Name: count, dtype: int64


predicate_id
rdfs:label               139185
oio:hasRelatedSynonym     74379
oio:hasBroadSynonym       54144
rdf:ID                    42460
oio:hasExactSynonym       37408
oio:hasNarrowSynonym       9538
oio:hasDbXref                 3
Name: count, dtype: int64

In [37]:
curies_of_strings['concluded_curie_prefix'] = curies_of_strings['concluded_curie'].str.split(':').str[0]

In [38]:
print(curies_of_strings['concluded_curie_prefix'].value_counts())

concluded_curie_prefix
ENVO         252461
UBERON        42771
CHEBI         17783
PATO          15269
NCBITaxon     11858
PO            10813
FOODON         2195
RO             1389
BFO            1038
OBI             789
PCO             740
GO                9
oio               2
Name: count, dtype: int64


concluded_curie_prefix
ENVO         252461
UBERON        42771
CHEBI         17783
PATO          15269
NCBITaxon     11858
PO            10813
FOODON         2195
RO             1389
BFO            1038
OBI             789
PCO             740
GO                9
oio               2
Name: count, dtype: int64