In [None]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to local code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from dotenv import load_dotenv

# Load variables from the local ".env" file into the process environment
# (ELASTIC_PASSWORD is read from os.environ further down).
load_dotenv(".env")

In [None]:
import os

from elasticsearch import Elasticsearch

# Password for the 'elastic' user generated by Elasticsearch.
# Fail fast with a clear message when it is missing, rather than handing
# None to the client and getting an opaque authentication error later.
ELASTIC_PASSWORD = os.environ.get("ELASTIC_PASSWORD")
if not ELASTIC_PASSWORD:
    raise RuntimeError(
        "ELASTIC_PASSWORD is not set; define it in the environment or in .env"
    )

# Create the client instance
client = Elasticsearch(
    "https://localhost:9200",
    # CA certificate copied out of the Elasticsearch container, e.g.:
    # docker cp nmdc-elasticsearch_es01_1:/usr/share/elasticsearch/config/certs/ca/ca.crt .
    ca_certs="ca.crt",
    basic_auth=("elastic", ELASTIC_PASSWORD),
)

# Display the cluster info as the cell output to confirm connectivity.
client.info().body

In [None]:
# Print the plain-text cluster health summary returned by the cat API.
print(client.cat.health())

In [None]:
import requests

# Endpoint and paging parameters for the NMDC biosample listing.
BIOSAMPLES_URL = "https://api.microbiomedata.org/biosamples"
PER_PAGE = 200
N_PAGES = 4

# Collect the "results" of the first N_PAGES pages into one list.
docs = []
for page in range(1, N_PAGES + 1):
    rv = requests.get(
        BIOSAMPLES_URL,
        params={"page": page, "per_page": PER_PAGE},
        timeout=60,  # do not hang the notebook forever on a stalled connection
    )
    # Surface HTTP errors here instead of failing later on a missing
    # "results" key in an error payload.
    rv.raise_for_status()
    docs.extend(rv.json()["results"])
len(docs)

-  mechanisms for discovery on top of metadata store
-  metadata completions and suggestions
-  suggest or predict a user’s query
- hundreds of possible metadata keys, and thousands of possible values per key

What metadata keys are most ripe for demo of completions/suggestions?

1. The elements of a biosample's GOLD five-level ecosystem classification path: `ecosystem`, `ecosystem_category`, `ecosystem_type`, `ecosystem_subtype`, and `specific_ecosystem`.

2. The elements of a biosample's "MIxS triad": `env_broad_scale`, `env_local_scale`, and `env_medium`.

Get all GOLD ecosystem values:
- Download the official Excel sheet (https://gold.jgi.doe.gov/download?mode=ecosystempaths), linked from https://gold.jgi.doe.gov/downloads
- Load it with `pandas.read_excel`, using `openpyxl`

In [None]:
import pandas as pd

# Load the GOLD five-level ecosystem classification sheet from the working
# directory into a DataFrame (one row per classification path).
df_ecosystem_paths = pd.read_excel("GOLDs5levelEcosystemClassificationPaths.xlsx")

In [None]:
from typing import Dict

# Map each " > "-joined classification path to its identifier.
# itertuples() yields (index, col1, col2, ...): position 1 is taken as the
# id and positions 2 onward are stringified and joined as the path levels.
ecosystem_paths: Dict[str, int] = {
    " > ".join([str(level) for level in record[2:]]): record[1]
    for record in df_ecosystem_paths.itertuples()
}

# Each row must produce a distinct path string; a shorter dict would mean
# two rows collapsed onto the same key.
assert len(df_ecosystem_paths) == len(ecosystem_paths)

In [None]:
# Display the full path -> id mapping as the cell output for inspection.
ecosystem_paths

In [None]:
def ecosystem_path(doc):
    """Return the doc's five ecosystem fields joined with " > ".

    The fields are read in the order ``ecosystem``, ``ecosystem_category``,
    ``ecosystem_type``, ``ecosystem_subtype``, ``specific_ecosystem``.
    Indexing ``doc`` with a missing field raises (``KeyError`` for a dict).
    """
    level_keys = (
        "ecosystem",
        "ecosystem_category",
        "ecosystem_type",
        "ecosystem_subtype",
        "specific_ecosystem",
    )
    levels = [doc[key] for key in level_keys]
    return " > ".join(levels)

In [None]:
# Every fetched biosample must resolve to a path present in the GOLD sheet.
# Collect the offenders so a failure says which paths are unknown.
unknown_paths = sorted(
    {path for path in map(ecosystem_path, docs) if path not in ecosystem_paths}
)
assert not unknown_paths, (
    f"{len(unknown_paths)} ecosystem path(s) not found in GOLD sheet, "
    f"e.g. {unknown_paths[:5]}"
)

In [None]:
# Attach a combined "<id> : <path>" string to every biosample document.
for doc in docs:
    _path = ecosystem_path(doc)
    doc["ecosystem_path"] = "{} : {}".format(ecosystem_paths[_path], _path)

Get all Environment Ontology (ENVO) and Plant Ontology (PO) terms:
- From https://obofoundry.org/
- http://purl.obolibrary.org/obo/envo.owl
- http://purl.obolibrary.org/obo/po.owl

In [None]:
# Show the on-disk size of the local envo.owl file.
!du -h envo.owl

In [None]:
from rdflib import Graph

# Load the local envo.owl file into an in-memory RDF graph;
# format="xml" selects rdflib's RDF/XML parser.
g_envo = Graph()
g_envo.parse("envo.owl", format="xml")

In [None]:
# Show the on-disk size of the local po.owl file.
!du -h po.owl

In [None]:
from rdflib import Graph

# Load the local po.owl file into its own in-memory RDF graph;
# format="xml" selects rdflib's RDF/XML parser.
g_po = Graph()
g_po.parse("po.owl", format="xml")

In [None]:
def mixs_triad(doc):
    """Return the doc's three MIxS environment values joined with " > ".

    Reads ``env_broad_scale``, ``env_local_scale`` and ``env_medium`` from
    ``doc`` (in that order) and joins the ``has_raw_value`` entry of each.
    """
    slot_names = ("env_broad_scale", "env_local_scale", "env_medium")
    # Look up all three slots first, then pull out their raw values.
    slot_values = [doc[name] for name in slot_names]
    raw_values = [value["has_raw_value"] for value in slot_values]
    return " > ".join(raw_values)

In [None]:
# Gather the distinct triad terms across all biosample documents.
curies = {
    term
    for doc in docs
    for term in mixs_triad(doc).split(" > ")
}

In [None]:
def curie_purl(curie):
    prefix, n = curie.split(':', maxsplit=1)
    return f"http://purl.obolibrary.org/obo/{prefix}_{n}"

In [None]:
curie_purl = {c: curie_purl(c) for c in curies}

In [None]:
from rdflib import URIRef
from rdflib.namespace import RDFS

curie_label = {}

for c, purl in curie_purl.items():
    if c.startswith("ENVO:"):
        curie_label[c] = str(g_envo.value(
            subject=URIRef(purl),
            predicate=RDFS.label
        ))
    elif c.startswith("PO:"):
        curie_label[c] = str(g_po.value(
            subject=URIRef(purl),
            predicate=RDFS.label
        ))
    else:
        raise ValueError("Unknown CURIE prefix")

In [None]:
# Display the CURIE -> label mapping as the cell output for inspection.
curie_label

In [None]:
def mixs_triad_labeled(doc):
    """Return ``"<terms> : <labels>"`` for the doc's MIxS triad.

    Both halves are " > "-joined sequences in triad order: the left side is
    the output of ``mixs_triad(doc)``, the right side the ``curie_label``
    entry for each of its terms.
    """
    curie_seq = mixs_triad(doc)
    labels = [curie_label[curie] for curie in curie_seq.split(" > ")]
    label_seq = " > ".join(labels)
    return f"{curie_seq} : {label_seq}"

In [None]:
for doc in docs:
    doc["mixs_triad"] = mixs_triad_labeled(doc)

# Index

In [None]:
import json

index_name = 'biosamples'

# Load the index definition BEFORE touching the cluster, so a missing or
# malformed JSON file cannot leave the old index deleted with no replacement.
with open(f"{index_name}.json", encoding="utf-8") as f:
    index_body = json.load(f)

# Recreate the index from scratch so the definition is always applied.
if client.indices.exists(index=index_name):
    client.indices.delete(index=index_name)

# The top-level keys of the JSON file are passed as keyword arguments.
client.indices.create(index=index_name, **index_body)

In [None]:
from elasticsearch.helpers import bulk

# One bulk action per biosample: the document's own fields plus the target
# index and the document "id" reused as the Elasticsearch _id.
actions = [
    dict(_id=biosample["id"], _index=index_name, **biosample)
    for biosample in docs
]
bulk(client, actions)

In [None]:
# Print the document count for the index (v=True adds the column header row).
print(client.cat.count(index=index_name, v=True))

In [None]:
# Print each biosample's combined ecosystem path string, one per line.
for labeled_path in (d['ecosystem_path'] for d in docs):
    print(labeled_path)

In [None]:
# Print each biosample's labeled MIxS triad string, one per line.
for labeled_triad in (d['mixs_triad'] for d in docs):
    print(labeled_triad)