In [1]:
import os

from dotenv import load_dotenv
# Load environment variables from ~/.nmdc_mongo.env before nmdc_mongo is imported.
# NOTE(review): presumably nmdc_mongo reads its connection settings from these
# variables — confirm before reordering the imports.
load_dotenv(os.path.expanduser("~/.nmdc_mongo.env"))

from nmdc_mongo import get_db

# Handles to the two databases used in this notebook.
db_share = get_db("dwinston_share")
db_scratch = get_db("dwinston_scratch")

In [2]:
# Map biosample id -> study id, taken from each omics_processing_set document's
# first `has_input` entry and first `part_of` entry. If a biosample id occurs
# in several documents, the last one iterated wins.
biosample_study = {}

# Only the two fields read below are requested from the server.
for odoc in db_share.omics_processing_set.find({}, projection=["has_input", "part_of"]):
    # `or [None]` covers a missing key as well as a present-but-empty or null
    # value, either of which would otherwise fail on the `[0]` lookup.
    biosample_id = (odoc.get("has_input") or [None])[0]
    study_id = (odoc.get("part_of") or [None])[0]
    # Documents lacking either id contribute nothing to the mapping.
    if biosample_id and study_id:
        biosample_study[biosample_id] = study_id

In [3]:
# Number of biosamples mapped to a study vs. total omics_processing_set documents.
len(biosample_study), db_share.omics_processing_set.count_documents({})

(689, 7144)

In [4]:
# How many of the mapped biosample ids exist in db_scratch.biosample_set.
# NOTE(review): this checks db_scratch, whereas the `_study_id` updates further
# down are written to db_share — confirm that the scratch database is intended.
db_scratch.biosample_set.count_documents({"id": {"$in": list(biosample_study)}})

684

Add custom field `_study_id` to `biosample_set` schema

In [5]:
import json
import re
from toolz import assoc_in, dissoc
from zipfile import ZipFile

from mongospawn.schema import collschemas_for

from nmdc_mongo import (
    add_to_db,
    correct_metaP_doc,
    dbschema,
    fetch_and_validate_json,
    fetch_conform_and_persist_from_manifest,
    fetch_json,
    get_db,
    reset_database,
    snake_case_set_name
)

from nmdc_mongo.admin import admin_client, reset_database_schema

###########################
# Adjustments for GSP below
###########################

# Names of every object type defined in the schema.
defined_object_names = set(dbschema["definitions"])

# Object name -> name of the existing top-level set property whose items
# reference that object.
set_for_object_name = {
    spec["items"]["$ref"].split("#/definitions/")[-1]: set_name
    for set_name, spec in dbschema["properties"].items()
}

# Snapshot of the set names present before any are added below.
existing_set_names = set(dbschema["properties"])

# Add an array property for every defined object type that has no set yet.
# Iterating in sorted order keeps the insertion order into
# dbschema["properties"] the same from run to run (raw set order is not).
for object_without_set in sorted(defined_object_names - set(set_for_object_name)):
    proposed_set_name = snake_case_set_name(object_without_set)
    if proposed_set_name not in existing_set_names:
        dbschema["properties"][proposed_set_name] = {
            "description": (f"This property links a database object to the set of"
                            f" {object_without_set} objects within it."),
            "items": {"$ref": f"#/definitions/{object_without_set}"},
            "type": "array",
        }

# Declare ControlledTermValue.term as a plain string instead of a `$ref`.
# assoc_in returns a new dict, so `dbschema` is rebound to the adjusted copy.
dbschema = assoc_in(dbschema, ["definitions", "ControlledTermValue", "properties", "term", "type"], "string")
# pop() rather than del: the end state is the same, without a KeyError when
# `$ref` is already absent.
dbschema["definitions"]["ControlledTermValue"]["properties"]["term"].pop("$ref", None)

# 'k' not capitalized upstream perhaps. should conform!
#dbschema = assoc_in(dbschema, ["definitions", "MetagenomeAssembly", "properties", "scaf_l_gt50k", "type"], "number")

In [6]:
# Declare the custom `_study_id` property on Biosample as a string, then
# rebuild `collschemas` from the adjusted schema.
study_id_type_path = ["definitions", "Biosample", "properties", "_study_id", "type"]
dbschema = assoc_in(dbschema, study_id_type_path, "string")

collschemas = collschemas_for(dbschema)

In [7]:
# Apply the rebuilt schemas to biosample_set in the dwinston_share database.
# NOTE(review): the exact effect of reset_database_schema is not visible here;
# presumably it replaces the collection's validator — confirm.
reset_database_schema(admin_client["dwinston_share"], ["biosample_set"], collschemas)

updating biosample_set


In [8]:
from pymongo import UpdateOne

# One `$set` update per mapped biosample, matched on its `id`.
requests = [
    UpdateOne({"id": sample_id}, {"$set": {"_study_id": mapped_study}})
    for sample_id, mapped_study in biosample_study.items()
]

In [9]:
# Send all `_study_id` updates to db_share.biosample_set in one bulk write.
rv = db_share.biosample_set.bulk_write(requests)

In [10]:
# Number of documents the bulk write reports as modified.
rv.modified_count

689

In [11]:
# Create an index on `_study_id`, the field the count queries below filter on.
db_share.biosample_set.create_index("_study_id")

'_study_id_1'

In [12]:
# Study ids (as stored in `_study_id`) of the three studies counted below.
brodie = "gold:Gs0135149"
stegen = "gold:Gs0114663"
wrighton = "gold:Gs0114675"

In [13]:
# Biosamples whose `_study_id` is the brodie study id.
db_share.biosample_set.count_documents({"_study_id": brodie})

53

In [14]:
# Biosamples whose `_study_id` is the stegen study id.
db_share.biosample_set.count_documents({"_study_id": stegen})

85

In [15]:
# Biosamples whose `_study_id` is the wrighton study id.
db_share.biosample_set.count_documents({"_study_id": wrighton})

25

In [16]:
# Biosamples whose `_study_id` is any of the three study ids.
db_share.biosample_set.count_documents({"_study_id": {"$in": [brodie, stegen, wrighton]}})

163

In [17]:
# Same count with the three study ids written out as literals
# (the same values as brodie, stegen and wrighton).
db_share.biosample_set.count_documents(
    {"_study_id": {"$in": ["gold:Gs0135149", "gold:Gs0114663", "gold:Gs0114675"]}}
)

163