Pipeline to transform the set of nmdc-schema-compliant MongoDB collections into an RDF dataset amenable to SPARQL queries.

Ensure that changes to the code are importable in this notebook without needing to restart the kernel and thereby lose state.

In [1]:
# Mode 2 of the autoreload extension re-imports all modules before each cell executes.
%load_ext autoreload
%autoreload 2

Connect to local dockerized dev environment.

In [2]:
from dotenv import load_dotenv

# Load variables from the localhost env file into this process, then echo
# MONGO_HOST from the shell to confirm which host the notebook will target.
load_dotenv(".env.localhost")
!env | grep MONGO_HOST

MONGO_HOST=localhost:27018


Initialize a db connection.

In [3]:
from nmdc_runtime.api.db.mongo import get_mongo_db

# Database handle used by every later cell.
# NOTE(review): connection settings presumably come from the environment (e.g. MONGO_HOST) — confirm in get_mongo_db.
mdb = get_mongo_db()

Get all populated nmdc-schema collections with entity `id`s.

In [5]:
from nmdc_runtime.util import schema_collection_names_with_id_field

# Keep only collections that the schema lists as having an `id` field, that
# actually exist in this database, and that hold at least one document
# (judged by the cheap estimated count rather than an exact count).
candidate_names = set(schema_collection_names_with_id_field()).intersection(
    mdb.list_collection_names()
)
populated_collections = sorted(
    name
    for name in candidate_names
    if mdb[name].estimated_document_count() > 0
)
populated_collections

['biosample_set',
 'data_object_set',
 'extraction_set',
 'field_research_site_set',
 'library_preparation_set',
 'mags_activity_set',
 'metabolomics_analysis_activity_set',
 'metagenome_annotation_activity_set',
 'metagenome_assembly_set',
 'metagenome_sequencing_activity_set',
 'metaproteomics_analysis_activity_set',
 'metatranscriptome_activity_set',
 'nom_analysis_activity_set',
 'omics_processing_set',
 'pooling_set',
 'processed_sample_set',
 'read_based_taxonomy_analysis_activity_set',
 'read_qc_analysis_activity_set',
 'study_set']

Get a JSON-LD context for the NMDC Schema, to serialize documents to RDF.

In [8]:
import json
from pprint import pprint

from linkml.generators.jsonldcontextgen import ContextGenerator
from nmdc_schema.nmdc_data import get_nmdc_schema_definition

# Generate the JSON-LD context from the NMDC schema definition and keep only
# the "@context" mapping of the serialized document.
schema_definition = get_nmdc_schema_definition()
context = json.loads(ContextGenerator(schema_definition).serialize())["@context"]

# Remove explicit "@id" mappings from expanded term definitions so that terms
# use the nmdc URI rather than an external one (e.g. a MIXS URI).
for term_definition in context.values():
    if isinstance(term_definition, dict):
        term_definition.pop("@id", None)

In [31]:
from rdflib import Graph

# Empty RDF graph; later cells parse every collection's documents into it.
g = Graph()

In [32]:
def split_chunk(seq, n: int):
    """
    Lazily yield consecutive slices of `seq`, each holding at most `n` items.

    The final slice is whatever remains; it is not padded up to length `n`.
    `seq` must support `len()` and slicing.

    >>> list(split_chunk(list(range(10)), 3))
    [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9]]
    """
    chunk_starts = range(0, len(seq), n)
    yield from (seq[start : start + n] for start in chunk_starts)

In [33]:
from toolz import assoc, dissoc
from tqdm.notebook import tqdm

# Number of documents serialized to JSON-LD and parsed into the graph per batch.
chunk_size = 2_000

# One progress tick per chunk. Ceiling division (written as -(-a // b)) gives the
# chunk count per collection without over-counting when a collection's size is an
# exact multiple of chunk_size. The counts are estimates, so the total is approximate.
total = sum(
    -(-mdb[name].estimated_document_count() // chunk_size)
    for name in populated_collections
)

# The context manager closes the progress bar even if parsing fails part-way.
with tqdm(total=total) as pbar:
    for name in populated_collections:
        print(name)
        # Drop MongoDB's internal "_id" so it does not end up in the JSON-LD payload.
        docs = [dissoc(doc, "_id") for doc in mdb[name].find()]
        for chunk in split_chunk(docs, chunk_size):
            doc_jsonld = {"@context": context, "@graph": chunk}
            g.parse(data=json.dumps(doc_jsonld), format='json-ld')
            pbar.update(1)

  0%|          | 0/112 [00:00<?, ?it/s]

biosample_set
data_object_set
extraction_set
field_research_site_set
library_preparation_set
mags_activity_set
metabolomics_analysis_activity_set
metagenome_annotation_activity_set
metagenome_assembly_set
metagenome_sequencing_activity_set
metaproteomics_analysis_activity_set
metatranscriptome_activity_set
nom_analysis_activity_set
omics_processing_set
pooling_set
processed_sample_set
read_based_taxonomy_analysis_activity_set
read_qc_analysis_activity_set
study_set


In [38]:
# Report how many triples the graph holds, formatted with thousands separators.
print(f"{len(g):,}")

6,546,004


In [39]:
from rdflib import Namespace, RDF, Literal, URIRef

NMDC = Namespace("https://w3id.org/nmdc/")

def _without_trailing_newline(term):
    """
    Return a cleaned copy of `term` if it is a URIRef whose text ends in a newline.

    All trailing whitespace is stripped (covering "\n" as well as "\r\n"), and
    nothing else is removed from the IRI. Any other term, including literals,
    is returned unchanged (the very same object).
    """
    if isinstance(term, URIRef) and str(term).endswith("\n"):
        return URIRef(str(term).rstrip())
    return term


# Collect the affected triples first, so the graph is not modified while it is
# being iterated over.
triples_to_fix = [
    (s, p, o)
    for s, p, o in g
    if _without_trailing_newline(s) is not s or _without_trailing_newline(o) is not o
]

# Replace each affected triple exactly once, cleaning subject and object together
# so that a triple with both ends dirty is not re-added with a dirty subject.
for s, p, o in triples_to_fix:
    g.remove((s, p, o))
    g.add((_without_trailing_newline(s), p, _without_trailing_newline(o)))

In [40]:
import gzip

from pathlib import Path

# Make sure the output directory exists so a fresh checkout can run this cell.
Path('data').mkdir(parents=True, exist_ok=True)

# Stream the N-Triples serialization straight into the gzip file instead of
# building the whole dump as a str and then again as bytes in memory.
# The encoding is passed explicitly because N-Triples is always written as UTF-8.
with gzip.open('data/nmdc-db.nt.gz', 'wb') as f:
    g.serialize(destination=f, format='nt', encoding='utf-8')

In [51]:
# Copy the gzipped N-Triples dump into /fuseki-base/ inside the `fuseki` container.
!docker cp data/nmdc-db.nt.gz fuseki:/fuseki-base/

[sPreparing to copy...[?25l[u[2KCopying to container - 0B[24G[0K12.1MB[24G[0K23.9MB[24G[0K38.2MB[24G[0K51.8MB[24G[0K63.4MB[24G[0K77.1MB[24G[0K91.4MB[24G[0K106MB[24G[0K118MB[24G[0K132MB[24G[0K146MB[24G[0K161MB[24G[0K178MB[24G[0K193MB[?25h[u[2KSuccessfully copied 203MB to fuseki:/fuseki-base/


In [56]:
# Take the fuseki service down before bulk loading.
# NOTE(review): presumably so tdbloader has exclusive access to the database files — confirm.
!docker compose down fuseki

[1A[1B[0G[?25l[+] Running 0/0
 ⠋ Container fuseki  Stopping                                              [34m0.1s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠙ Container fuseki  Stopping                                              [34m0.2s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠹ Container fuseki  Stopping                                              [34m0.3s [0m
[?25h[1A[1A[0G[?25l[+] Running 0/1
 ⠸ Container fuseki  Stopping                                              [34m0.4s [0m
[?25h[1A[1A[0G[?25l[+] Running 2/1
 [32m✔[0m Container fuseki              [32mRemoved[0m                                   [34m0.4s [0m
 [33m[1m![0m Network nmdc-runtime_default  [33m[1mResourc...[0m                                [34m0.0s [0m
[?25h

In [54]:
# Run Jena's tdbloader in a one-off fuseki container to load the dump into the
# database located at /fuseki-base/nmdc-db.tdb.
!docker compose run fuseki ./apache-jena-4.9.0/bin/tdbloader --loc /fuseki-base/nmdc-db.tdb /fuseki-base/nmdc-db.nt.gz

19:14:31 INFO  loader          :: -- Start triples data phase
19:14:31 INFO  loader          :: ** Load empty triples table
19:14:31 INFO  loader          :: -- Start quads data phase
19:14:31 INFO  loader          :: ** Load empty quads table
19:14:31 INFO  loader          :: Load: /fuseki-base/nmdc-db.nt.gz -- 2024/03/05 19:14:31 UTC
19:14:32 WARN  riot            :: [line: 26821, col: 92] Bad IRI: Not a valid UUID string: uuid:KONA-CB-B-504c5931-f7cf-47aa-a2c0-ce90bcb3a0e3
19:14:33 INFO  loader          :: Add: 100,000 triples (Batch: 82,440 / Avg: 82,440)
19:14:33 WARN  riot            :: [line: 119380, col: 92] Bad IRI: Not a valid UUID string: uuid:WY03-CB-B-37aa5072-bb2c-40da-a03f-cbd4acb0f135
19:14:33 WARN  riot            :: [line: 194240, col: 92] Bad IRI: Not a valid UUID string: uuid:WY10-CB-T-b6ac5210-79d5-4142-8756-4babab75fca0
19:14:33 INFO  loader          :: Add: 200,000 triples (Batch: 127,551 / Avg: 100,150)
19:14:34 WARN  riot            :: [line: 248302, col: 92] B

In [57]:
# Start the fuseki service again in detached mode (-d).
!docker compose up fuseki -d

[1A[1B[0G[?25l[+] Running 1/0
 [32m✔[0m Container fuseki  [32mCreated[0m                                               [34m0.0s [0m
[?25h[1A[1A[0G[?25l[34m[+] Running 1/1[0m
 [32m✔[0m Container fuseki  [32mStarted[0m                                               [34m0.0s [0m
[?25h