In [16]:
import os
import re
from pprint import pprint
from urllib.parse import quote_plus

from dotenv import load_dotenv
from pymongo import MongoClient
from tqdm.notebook import tqdm

In [17]:
# Load environment variables from the relative path local/.env so that
# credentials and API keys stay out of the notebook source.
load_dotenv("local/.env")

True

In [18]:
# Read the MongoDB credentials from the process environment.
# Each value is None when its variable is not set.
MONGO_USERNAME = os.environ.get("MONGO_NCBI_LOADBALANCER_WRITING_USERNAME")
MONGO_PASSWORD = os.environ.get("MONGO_NCBI_LOADBALANCER_WRITING_PW")

In [19]:
# Echo the username as a quick check that the environment was loaded
MONGO_USERNAME

'mam-ncbi'

In [20]:
# Confirm the password was loaded without echoing the secret into the saved notebook.
# NOTE(review): an earlier saved output exposed this password in clear text; rotate it.
MONGO_PASSWORD is not None

True

In [21]:
# === MongoDB Connection Settings (variable-based) ===
MONGO_HOST = "localhost"   # e.g., "localhost" or "mongo.ncbi.example.org"
MONGO_PORT = 27777               # non-default port (MongoDB's default is 27017); presumably a local tunnel/port-forward — TODO confirm

In [22]:
# Optional authentication options.
# These values are interpolated into the query string of the connection URI.
AUTH_SOURCE = "admin"
AUTH_MECHANISM = "SCRAM-SHA-256"
# Appended verbatim after "&", so it must already be in key=value form.
EXTRA_PARAMS = "directConnection=true"

In [23]:
# Database and collection that this notebook reads from and writes back to
DB_NAME = "ncbi_metadata"
COLLECTION_NAME = "biosamples_env_triad_value_counts_gt_1"

In [24]:
# Build the connection URI.
# The username and password are percent-escaped so that reserved characters
# such as "@", ":", "/" or "%" in either value cannot corrupt the URI.
mongo_uri = (
    f"mongodb://{quote_plus(MONGO_USERNAME)}:{quote_plus(MONGO_PASSWORD)}"
    f"@{MONGO_HOST}:{MONGO_PORT}/"
    f"?authSource={AUTH_SOURCE}&authMechanism={AUTH_MECHANISM}&{EXTRA_PARAMS}"
)

In [25]:
# Create the client from the URI, then take handles to the target database
# and collection named by DB_NAME and COLLECTION_NAME.
client = MongoClient(mongo_uri)
db = client[DB_NAME]
collection = db[COLLECTION_NAME]

In [26]:
# Pattern for a CURIE-like identifier with optional free-text label in front.
# Named groups:
#   raw    - the entire matched span
#   label  - lazily matched text preceding the identifier (may be empty)
#   prefix - an upper-case letter followed by one or more upper-case letters/digits
#   local  - two or more alphanumeric characters after a ":" or "_" separator
# An optional "[" or "(" may precede the prefix and an optional "]" or ")" may
# follow the local part. Compiled with re.VERBOSE, so whitespace and "#"
# comments inside the pattern are ignored by the regex engine.
strict_curie_pattern = re.compile(
    r"""
    (?P<raw>
        (?P<label>.*?)?                           # optional label
        [\[\(]?\s*                                 # optional open bracket
        \b                                         # word boundary before prefix
        (?P<prefix>[A-Z][A-Z0-9]+)                 # prefix (upper-case)
        [:_]                                       # colon or underscore ONLY (no hyphen/space)
        (?P<local>[A-Za-z0-9]{2,})                 # local ID
        \s*[\]\)]?                                 # optional close bracket
    )
    """,
    re.VERBOSE
)


In [27]:
def extract_components(text):
    """Split an annotation string into its CURIE-like components.

    The text is split on '|' and each segment is scanned with
    ``strict_curie_pattern``. Every match yields one component; a segment
    with no match yields a single label-only component.

    Args:
        text: Value to parse. Anything that is not a ``str`` is ignored.

    Returns:
        A list of dicts with the keys ``raw``, ``label``, ``prefix`` and
        ``local``. ``label`` is ``None`` when a match carries no label text;
        ``prefix`` and ``local`` are ``None`` for segments without a match.
        The list is empty for non-string input.
    """
    if not isinstance(text, str):
        return []

    components = []

    for segment in text.split('|'):
        matches = list(strict_curie_pattern.finditer(segment))

        if not matches:
            # No identifier found: keep the whole segment as a bare label.
            stripped = segment.strip()
            components.append({
                'raw': stripped,
                'label': stripped,
                'prefix': None,
                'local': None
            })
            continue

        for match in matches:
            label = match.group('label') or ''
            # A second or later match in the same segment starts with the
            # separator left behind by the previous one (e.g. "; water").
            # Remove leading separators AND the whitespace that follows them,
            # so the label is "water" rather than " water".
            label = label.strip().lstrip(',; \t').strip()
            components.append({
                'raw': match.group('raw').strip(),
                'label': label if label else None,
                'prefix': match.group('prefix').upper().strip(),
                'local': match.group('local').strip()
            })

    return components

In [28]:
# Fetch one document from the collection (no filter) to inspect its fields
sample_doc = collection.find_one()

In [29]:
# Pretty-print the sampled document so its keys and value types are visible
pprint(sample_doc)

{'_id': ObjectId('67e1b8661418ebf728b73a5b'),
 'count': 1205691,
 'digits_only': False,
 'env_triad_value': 'not applicable',
 'envo_count': 0.0,
 'equation_like': False,
 'insdc_missing_match': True,
 'length': 14}


In [None]:
# Parse every env_triad_value and write the parsed components back onto its document.
# Only env_triad_value is projected (_id is returned by default) because the loop
# uses nothing else; this keeps the materialized list small.
docs = list(
    collection.find(
        {"env_triad_value": {"$exists": True}},
        {"env_triad_value": 1},
    )
)

for doc in tqdm(docs, desc="Parsing and updating"):
    value = doc["env_triad_value"]
    parsed = extract_components(value)

    # Store both the parsed list and its length on the same document.
    collection.update_one(
        {"_id": doc["_id"]},
        {
            "$set": {
                "components": parsed,
                "components_count": len(parsed)
            }
        }
    )


In [59]:
# Update specification that unsets both derived fields.
# This cell only defines the spec; nothing is modified until it is passed to
# collection.update_many.
pattern = {
    "$unset": {
        "components": "",
        "components_count": ""
    }
}


In [100]:
# Apply the $unset spec to all documents (the empty filter {} matches every one)
result = collection.update_many({}, pattern)

In [101]:
# Report result.modified_count from the update_many call above
print(f"Modified {result.modified_count} documents.")

Modified 66177 documents.


---

In [51]:
from oaklib import get_adapter

In [52]:
# Selector string handed to oaklib's get_adapter for the EnvO ontology
envo_adapter_string = "sqlite:obo:envo"

In [53]:
# Build the ontology adapter from the selector string
envo_adapter = get_adapter(envo_adapter_string)

In [57]:
# Materialize every entity identifier the adapter yields; the recorded output
# shows both bracketed IRIs and CURIEs in this list.
all_envo_curies_and_iris = list(envo_adapter.entities())

In [58]:
# Preview only the first few identifiers instead of dumping the whole list
# into the notebook output.
all_envo_curies_and_iris[:10]

['<http://geneontology.org/foo/applies-pattern>',
 '<http://schema.org/image>',
 '<https://www.wikidata.org/wiki/Q2306597>',
 '<https://www.wikidata.org/wiki/Q2>',
 '<https://www.wikidata.org/wiki/Q525>',
 '<https://www.wikidata.org/wiki/Q715269>',
 'BFO:0000001',
 'BFO:0000002',
 'BFO:0000003',
 'BFO:0000004']

----

In [88]:
import requests
import requests_cache

In [89]:
# Install a requests cache named 'my_cache' whose entries expire after
# 1 hour (3600 seconds), so repeated runs do not re-fetch the same URLs.
requests_cache.install_cache('my_cache', expire_after=3600) # seconds

In [90]:
# Read the BioPortal API key from the environment; None when the variable is unset
BIOPORTAL_API_KEY = os.environ.get("BIOPORTAL_API_KEY")

In [91]:
# Confirm the API key was loaded without printing the secret into the saved notebook.
# NOTE(review): an earlier saved output exposed this key in clear text; revoke and reissue it.
BIOPORTAL_API_KEY is not None

True


In [92]:
# URL of the BioPortal ontology listing, with the API key passed as a query parameter.
# NOTE(review): the key is embedded in the URL itself, so it will presumably appear
# wherever the URL is logged or cached — consider sending it in a header instead.
bioportal_ontologies_url = f"https://data.bioontology.org/ontologies?apikey={BIOPORTAL_API_KEY}"

In [93]:
# Fetch the ontology listing. The timeout keeps the cell from hanging forever on an
# unresponsive server, and raise_for_status surfaces HTTP errors (e.g. a rejected
# API key) here rather than as a confusing failure when the body is parsed.
bioportal_ontologies_resp = requests.get(bioportal_ontologies_url, timeout=30)
bioportal_ontologies_resp.raise_for_status()

In [95]:
# Decode the JSON response body; the recorded run shows this yields a list
bioportal_ontologies = bioportal_ontologies_resp.json()

In [96]:
# Check the top-level container type of the decoded response
type(bioportal_ontologies)

list

In [97]:
# Inspect the first ontology record to see which keys each entry carries
bioportal_ontologies[0]

{'administeredBy': ['https://data.bioontology.org/users/Hardy+Xie'],
 'acronym': 'PICO',
 'name': 'Pathogen-related Informed Consent Ontology',
 'summaryOnly': False,
 'flat': None,
 'ontologyType': 'https://data.bioontology.org/ontology_types/ONTOLOGY',
 '@id': 'https://data.bioontology.org/ontologies/PICO',
 '@type': 'http://data.bioontology.org/metadata/Ontology',
 'links': {'submissions': 'https://data.bioontology.org/ontologies/PICO/submissions',
  'properties': 'https://data.bioontology.org/ontologies/PICO/properties',
  'classes': 'https://data.bioontology.org/ontologies/PICO/classes',
  'single_class': 'https://data.bioontology.org/ontologies/PICO/classes/{class_id}',
  'roots': 'https://data.bioontology.org/ontologies/PICO/classes/roots',
  'instances': 'https://data.bioontology.org/ontologies/PICO/instances',
  'metrics': 'https://data.bioontology.org/ontologies/PICO/metrics',
  'reviews': 'https://data.bioontology.org/ontologies/PICO/reviews',
  'notes': 'https://data.bioont