In [1]:
import csv
import os

from dotenv import load_dotenv
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model.meta import ClassDefinition
from nmdc_schema.get_nmdc_view import ViewGetter
from pymongo import MongoClient


## This won't work unless you have access to an NMDC MonogDB

If you don't have one running locally on your computer, this will most likely require 
- obtaining a username and password for one of the NMDC hosted MongoDBs
- establishing a ssh tunnel to that hosted MongoDB

In any case, you will need a `.env` file with `SOURCE_MONGO_USER` and `SOURCE_MONGO_PASS` values

This could all be rewritten to use an NMDC API

In [2]:
dot_env_file = "../local/.env"
# dot_env_file = ".env"

mongo_authMechanism='SCRAM-SHA-256'
mongo_authSource="admin"
mongo_db_name = "nmdc"
mongo_directConnection=True
mongo_port=27777


In [3]:
load_dotenv(dotenv_path=dot_env_file, verbose=True, override=True)

True

In [4]:
mongo_pw = os.getenv('SOURCE_MONGO_PASS')
mongo_user = os.getenv('SOURCE_MONGO_USER')
print(mongo_user)

mam


In [5]:
client = MongoClient("localhost",
                     authMechanism=mongo_authMechanism, 
                     authSource=mongo_authSource,
                     directConnection=mongo_directConnection,
                     password=mongo_pw,
                     port=mongo_port,
                     username=mongo_user,
                     )

In [6]:
output_file = '../local/populated_slot_counts.tsv'

In [7]:
vg = ViewGetter()
nmdc_view = vg.get_view()

In [8]:
Database = nmdc_view.get_class('Database')

In [9]:
def get_collection_names(mongo_db):
    collections = mongo_db.list_collection_names()
    return list(collections)

In [10]:
def get_synonymous_collection_db_slots(mongo_db, schema_view: SchemaView, class_name='Database'):
    class_slots = schema_view.class_induced_slots(class_name)
    class_slot_names = [s.name for s in class_slots]
    class_slot_names.sort()

    collections = get_collection_names(mongo_db)

    synonymous_collection_names = set_arithmetic(set(class_slot_names), set(collections))

    return synonymous_collection_names

In [11]:
def set_arithmetic(set1, set2):
    set1_only = set1 - set2
    set2_only = set2 - set1
    intersection = set1.intersection(set2)
    temp = {
        'set 1 only': set1_only,
        'set 2 only': set2_only,
        'intersection': intersection
    }
    return temp

In [12]:
def check_collection_for_populated_slot(mongo_db, collection_name, selected_slot):
    collection = mongo_db[collection_name]

    query = {selected_slot: {'$exists': True}}

    result = collection.count_documents(query)

    return result


In [13]:
def write_dicts_to_tsv(data, filename):
    fieldnames = data[0].keys()  # Assume all dictionaries have the same keys

    with open(filename, 'w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter='\t')
        writer.writeheader()
        writer.writerows(data)


In [14]:
db = client[mongo_db_name]

In [15]:
collections_to_check = get_synonymous_collection_db_slots(db, nmdc_view, 'Database')


## Slow/Inefficient!

In [16]:
populated_slot_counts = []
for slot_name in collections_to_check['intersection']:
    slots_range = nmdc_view.get_slot(slot_name).range
    if slots_range:
        range_element = nmdc_view.get_element(slots_range)
        range_type = type(range_element)
        if range_type == ClassDefinition:
            range_slots = nmdc_view.class_induced_slots(slots_range)
            for range_slot in range_slots:
                doc_count = check_collection_for_populated_slot(db, slot_name, range_slot.name)
                print(f"{slot_name} {range_slot.name} doc_count: {doc_count}")
                population_dict = {
                    'collection': slot_name,
                    'slot': range_slot.name,
                    'doc_count': doc_count
                }
                populated_slot_counts.append(population_dict)


data_object_set compression_type doc_count: 0
data_object_set data_category doc_count: 0
data_object_set data_object_type doc_count: 106636
data_object_set file_size_bytes doc_count: 105447
data_object_set insdc_experiment_identifiers doc_count: 0
data_object_set md5_checksum doc_count: 104240
data_object_set url doc_count: 106923
data_object_set was_generated_by doc_count: 4404
data_object_set id doc_count: 109604
data_object_set name doc_count: 109604
data_object_set description doc_count: 109604
data_object_set alternative_identifiers doc_count: 73832
data_object_set type doc_count: 109378
processed_sample_set biomaterial_purity doc_count: 0
processed_sample_set dna_absorb1 doc_count: 0
processed_sample_set dna_concentration doc_count: 0
processed_sample_set external_database_identifiers doc_count: 0
processed_sample_set id doc_count: 6268
processed_sample_set name doc_count: 6268
processed_sample_set description doc_count: 0
processed_sample_set alternative_identifiers doc_count: 0

In [17]:
write_dicts_to_tsv(populated_slot_counts, output_file)