In [1]:
import os
import csv

# import pandas as pd
from dotenv import load_dotenv
from linkml_runtime import SchemaView
from linkml_runtime.linkml_model.meta import ClassDefinition
from pymongo import MongoClient

In [2]:
# --- Inputs ---
# LinkML schema file loaded into the SchemaView.
schema_file = "../src/schema/nmdc.yaml"
# dotenv file passed to load_dotenv().
dot_env_file = "../local/.env"
# Name of the MongoDB database selected on the client.
mongo_db_name = "nmdc"

# --- Output ---
# TSV file that receives the per-collection, per-slot document counts.
output_file = "../local/populated_slot_counts.tsv"

In [3]:
# Build a SchemaView over the schema at schema_file; later cells use it to look
# up classes, slots and slot ranges.
nmdc_view = SchemaView(schema_file)

In [4]:
# Load the dotenv file at dot_env_file; the value shown as this cell's output is
# the call's return value.
load_dotenv(dot_env_file)

True

In [5]:
# Read the MongoDB credentials from the environment.
mongo_pw = os.getenv('SOURCE_MONGO_PASS')
mongo_user = os.getenv('SOURCE_MONGO_USER')

# Report only whether each credential is present. The values themselves are never
# echoed, because cell output is saved with the notebook and may be shared.
print(f"SOURCE_MONGO_USER set: {bool(mongo_user)}")
print(f"SOURCE_MONGO_PASS set: {bool(mongo_pw)}")




In [6]:
# Look up the schema element named 'Database' via the SchemaView.
# NOTE(review): this variable is not referenced again in the visible cells —
# confirm it is still needed.
Database = nmdc_view.get_class('Database')

In [7]:
def get_collection_names(mongo_db):
    """Return the collection names reported by ``mongo_db`` as a new list."""
    return [name for name in mongo_db.list_collection_names()]

In [8]:
def get_synonymous_collection_db_slots(mongo_db, schema_view: SchemaView, class_name='Database'):
    """Compare a schema class's slot names with the database's collection names.

    Args:
        mongo_db: Database handle, passed to ``get_collection_names``.
        schema_view: SchemaView used to look up the induced slots of ``class_name``.
        class_name: Name of the schema class whose slot names are compared.

    Returns:
        The dict produced by ``set_arithmetic`` with the slot names as set 1 and
        the collection names as set 2 (keys ``'set 1 only'``, ``'set 2 only'``
        and ``'intersection'``).
    """
    # Build the set directly: order is irrelevant to the set comparison below,
    # so sorting the names first would be wasted work.
    class_slot_names = {slot.name for slot in schema_view.class_induced_slots(class_name)}
    collection_names = set(get_collection_names(mongo_db))

    return set_arithmetic(class_slot_names, collection_names)

In [9]:
def set_arithmetic(set1, set2):
    """Split two sets into their exclusive and shared members.

    Returns a dict with three keys:
        'set 1 only'   -- members of ``set1`` that are not in ``set2``
        'set 2 only'   -- members of ``set2`` that are not in ``set1``
        'intersection' -- members present in both sets
    """
    return {
        'set 1 only': set1 - set2,
        'set 2 only': set2 - set1,
        'intersection': set1 & set2,
    }

In [10]:
def check_collection_for_populated_slot(mongo_db, collection_name, selected_slot):
    """Count the documents in a collection that contain ``selected_slot``.

    The collection is looked up as ``mongo_db[collection_name]`` and queried with
    an ``{'$exists': True}`` filter on ``selected_slot``; the resulting count is
    returned.
    """
    target_collection = mongo_db[collection_name]
    return target_collection.count_documents({selected_slot: {'$exists': True}})


In [11]:
def write_dicts_to_tsv(data, filename):
    """Write dictionaries to a tab-separated file with a header row.

    Args:
        data: Iterable of dicts, one per output row.
        filename: Path of the file to create or overwrite.

    The columns are the union of the keys of all rows, in first-seen order; a row
    that lacks a key gets an empty cell for it. If ``data`` is empty the file is
    still created, with no content, rather than raising ``IndexError``.
    """
    rows = list(data)

    # dict.fromkeys de-duplicates while preserving first-seen order, so rows that
    # all share the same keys yield exactly the header of the first row.
    fieldnames = list(dict.fromkeys(key for row in rows for key in row))

    with open(filename, 'w', newline='', encoding='utf-8') as file:
        if rows:
            writer = csv.DictWriter(file, fieldnames=fieldnames, delimiter='\t')
            writer.writeheader()
            writer.writerows(rows)


In [12]:
# Create the MongoDB client for the server on localhost:27777, authenticating
# against the "admin" database with the credentials read from the environment.
client = MongoClient(
    host="localhost",
    port=27777,
    username=mongo_user,
    password=mongo_pw,
    authSource="admin",
    authMechanism="SCRAM-SHA-256",  # todo should be an option
    directConnection=True,
)

In [13]:
# Select the database named by mongo_db_name on the client.
db = client[mongo_db_name]

In [14]:
# Compare the slots of the schema's 'Database' class with the collections in db.
# The 'intersection' entry of the result is what the counting loop iterates over.
collections_to_check = get_synonymous_collection_db_slots(db, nmdc_view, 'Database')
# Debugging aid, left disabled (pprint is not among the visible imports):
# pprint.pprint(collections_to_check)

ServerSelectionTimeoutError: localhost:27777: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms), Timeout: 30s, Topology Description: <TopologyDescription id: 66636b7acf1fd706b257e7c7, topology_type: Single, servers: [<ServerDescription ('localhost', 27777) server_type: Unknown, rtt: None, error=AutoReconnect('localhost:27777: [Errno 111] Connection refused (configured timeouts: socketTimeoutMS: 20000.0ms, connectTimeoutMS: 20000.0ms)')>]>

In [23]:
# For every collection whose name is also a Database slot, count the documents in
# which each slot of the collection's range class is present.
populated_slot_counts = []

# The intersection is a set, whose iteration order can differ between runs;
# sorting keeps the printed log and the rows of the output file in a stable order.
for slot_name in sorted(collections_to_check['intersection']):
    slots_range = nmdc_view.get_slot(slot_name).range
    if not slots_range:
        continue  # the slot declares no range, so there is nothing to inspect

    range_element = nmdc_view.get_element(slots_range)
    if not isinstance(range_element, ClassDefinition):
        continue  # only class ranges have slots of their own to count

    for range_slot in nmdc_view.class_induced_slots(slots_range):
        doc_count = check_collection_for_populated_slot(db, slot_name, range_slot.name)
        print(f"{slot_name} {range_slot.name} doc_count: {doc_count}")
        populated_slot_counts.append({
            'collection': slot_name,
            'slot': range_slot.name,
            'doc_count': doc_count,
        })


collecting_biosamples_from_site_set has_input doc_count: 0
collecting_biosamples_from_site_set has_output doc_count: 0
collecting_biosamples_from_site_set protocol_link doc_count: 0
collecting_biosamples_from_site_set start_date doc_count: 0
collecting_biosamples_from_site_set end_date doc_count: 0
collecting_biosamples_from_site_set id doc_count: 0
collecting_biosamples_from_site_set name doc_count: 0
collecting_biosamples_from_site_set description doc_count: 0
collecting_biosamples_from_site_set alternative_identifiers doc_count: 0
study_set emsl_project_dois doc_count: 0
study_set neon_study_identifiers doc_count: 0
study_set id doc_count: 14
study_set alternative_identifiers doc_count: 0
study_set gnps_task_identifiers doc_count: 0
study_set abstract doc_count: 0
study_set alternative_descriptions doc_count: 0
study_set alternative_names doc_count: 0
study_set alternative_titles doc_count: 0
study_set doi doc_count: 12
study_set ecosystem doc_count: 8
study_set ecosystem_category d

In [16]:
# populated_slot_count_frame = pd.DataFrame(populated_slot_counts)
# populated_slot_count_frame.to_csv('../local/populated_slot_counts.csv')

In [17]:
# Write the collected per-collection, per-slot counts to output_file as TSV.
write_dicts_to_tsv(populated_slot_counts, output_file)