In [31]:
from pymongo import MongoClient
import pprint

import csv

from oaklib import get_adapter

from oaklib.datamodels.vocabulary import IS_A

In [27]:
# MongoDB server address handed to MongoClient when the client is created.
connection_string = "mongodb://localhost:27017/"

# OAK adapter selector handed to get_adapter() to load EnvO.
envo_adapter_string = 'sqlite:obo:envo'

# GOLD study identifier (not referenced by the cells visible in this notebook).
gold_study_id = "Gs0154244"

# No bioproject identifier is included in the NCBI EMP500 biosample records,
# but they might all have emp500_principal_investigator attributes, so that
# attribute name is used as the evidence that a biosample belongs to EMP500.
# Indexing MongoDB on Attributes.Attribute.attribute_name took ~30 minutes and
# the resulting index size is ~3 GB; the query on that path may need the index.
# The DuckDB copy of the data could be used as an alternative.
ncbi_project_id = "17119329"
ncbi_project_accession = "PRJEB42019"
ncbi_emp500_evidence_attribute = "emp500_principal_investigator"
# NMDC study identifier (not referenced by the cells visible in this notebook).
nmdc_study_id = "nmdc:sty-11-547rwq94"

# Output path for the NCBI biosample environmental-triad table.
ncbi_output_tsv = "ncbi_emp500_environmental_triads.tsv"

# Output path for the GOLD sequencing-project link table.
gold_seq_proj_output_tsv = "gold_emp500_seq_proj_links.tsv"

# Output path for the GOLD biosample ecosystem/EnvO table.
gold_biosamples_output_tsv = "gold_emp500_environments_ecosystems.tsv"

In [14]:
# Connect to the MongoDB server named by connection_string (localhost in this
# notebook's configuration). The client is closed explicitly in a later cell.
client = MongoClient(connection_string)

In [4]:
# Database holding the GOLD metadata; its "projects" and "biosamples"
# collections are queried in later cells.
gold_db = client["gold_metadata"]

In [5]:
# Database and collection holding the NCBI biosample records
# (both are named "biosamples").
ncbi_db = client["biosamples"]
ncbi_biosamples_collection = ncbi_db["biosamples"]

In [None]:
# Select NCBI biosamples that carry an attribute whose attribute_name equals the
# EMP500 evidence attribute; this stands in for a bioproject filter, which the
# records do not provide.
emp500_biosamples_query = {"Attributes.Attribute.attribute_name": ncbi_emp500_evidence_attribute}

In [None]:
# Fetch matching records and materialize the cursor as a list of dictionaries
# so the results can be counted and iterated in later cells.
ncbi_emp500_biosamples = list(ncbi_biosamples_collection.find(emp500_biosamples_query))


In [None]:
# Report how many NCBI biosample records matched the EMP500 evidence query.
print(len(ncbi_emp500_biosamples))

In [None]:
# Harmonized attribute names that make up the environmental triad columns
target_harmonized_names = {'env_broad_scale', 'env_local_scale', 'env_medium'}

# One output row per NCBI biosample record
result_list = []

for record in ncbi_emp500_biosamples:
    # `or {}` / `or []` also cover keys that are present but null, which a
    # plain .get(key, default) would pass through as None.
    attributes = (record.get('Attributes') or {}).get('Attribute') or []
    # NOTE(review): a record with a single attribute may store it as one dict
    # rather than a list; wrap it so the loop below sees attribute dicts
    # instead of iterating over the dict's keys. Confirm against the data.
    if isinstance(attributes, dict):
        attributes = [attributes]

    # Collect the content of each triad attribute that is present; absent ones
    # are simply left out and become empty cells when the TSV is written.
    attribute_values = {}
    for attribute in attributes:
        harmonized_name = attribute.get('harmonized_name')
        if harmonized_name in target_harmonized_names:
            attribute_values[harmonized_name] = attribute.get('content')

    result_list.append({
        # The biosample records carry no bioproject identifier, so the
        # configured EMP500 accession is stamped onto every row.
        'ncbi_bioproject_accession': ncbi_project_accession,
        'ncbi_biosample_accession': record.get('accession'),
        **attribute_values,
    })


In [None]:
# Column order of the NCBI triad TSV. Keys missing from a row are written as
# empty cells by DictWriter.
header = ['ncbi_bioproject_accession', 'ncbi_biosample_accession', 'env_broad_scale', 'env_local_scale', 'env_medium']

# Emit the header followed by every collected row, tab-delimited.
with open(ncbi_output_tsv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=header, delimiter='\t')
    writer.writeheader()
    writer.writerows(result_list)

print(f"Data has been written to {ncbi_output_tsv}")

In [9]:
# Select the GOLD sequencing projects linked to the EMP500 BioProject. The
# accession comes from the configuration cell rather than a second literal,
# so the two cannot drift apart.
gold_emp500_seq_projs_query = {"ncbiBioProjectAccession": ncbi_project_accession}

gold_seq_projs_collection = gold_db["projects"]

# Fetch matching records and materialize the cursor
gold_emp500_seq_projs = list(gold_seq_projs_collection.find(gold_emp500_seq_projs_query))

# Fields copied straight from each project document; missing keys become ''
gold_seq_proj_scalar_fields = [
    'projectGoldId', 'sequencingStrategy', 'projectStatus', 'studyGoldId', 'biosampleGoldId',
    'ncbiBioProjectAccession', 'ncbiBioSampleAccession',
]

# Column order of the output TSV: the scalar fields plus the joined SRA ids
header = gold_seq_proj_scalar_fields + ['sraExperimentIds']

# One flat row per GOLD sequencing project
result_table = []
for record in gold_emp500_seq_projs:
    row = {field: record.get(field, '') for field in gold_seq_proj_scalar_fields}

    # `or []` covers both a missing key and an explicit null; str() keeps the
    # join from failing if an id is not stored as a string.
    sra_experiment_ids = record.get('sraExperimentIds') or []
    row['sraExperimentIds'] = '|'.join(str(sra_id) for sra_id in sra_experiment_ids)

    result_table.append(row)

with open(gold_seq_proj_output_tsv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=header, delimiter='\t')
    writer.writeheader()
    writer.writerows(result_table)

print(f"Data has been written to {gold_seq_proj_output_tsv}")


Data has been written to gold_emp500_seq_proj_links.tsv


In [10]:
# Distinct GOLD biosample ids referenced by the EMP500 sequencing projects.
# Rows always carry the 'biosampleGoldId' key (defaulting to ''), so filter on
# the value rather than key presence to keep empty ids out of the $in query.
# Sorting makes the list order reproducible across runs.
unique_biosample_gold_ids = sorted(
    {record['biosampleGoldId'] for record in result_table if record.get('biosampleGoldId')}
)
print(len(unique_biosample_gold_ids))

1024


In [12]:
# Handle on the GOLD biosamples collection
gold_biosamples_collection = gold_db["biosamples"]

# Match every document whose biosampleGoldId is one of the ids gathered from
# the EMP500 sequencing projects
query = {"biosampleGoldId": {"$in": unique_biosample_gold_ids}}

# Run the query and hold the results in memory as a list of dictionaries
gold_emp500_biosamples = list(gold_biosamples_collection.find(query))

# Report how many documents came back
print(f"Found {len(gold_emp500_biosamples)} matching documents.")

# Optional spot check of the first 5 documents:
# for doc in gold_emp500_biosamples[:5]:
#     pprint.pprint(doc)


Found 1024 matching documents.


In [16]:
# Flat GOLD biosample fields copied through unchanged; missing keys become ''
ecosystem_fields = [
    'biosampleGoldId',
    'ecosystemPathId',
    'ecosystem',
    'ecosystemCategory',
    'ecosystemType',
    'ecosystemSubtype',
    'specificEcosystem',
]

# Nested sub-documents, each read for its 'id' and 'label' keys
envo_fields = ['envoBroadScale', 'envoLocalScale', 'envoMedium']

# Column order of the output TSV: flat fields, then <field>.id / <field>.label
# for each nested EnvO field
header = ecosystem_fields + [
    f'{field}.{part}' for field in envo_fields for part in ('id', 'label')
]

# One flattened row per GOLD biosample document
processed_data = []
for record in gold_emp500_biosamples:
    row = {field: record.get(field, '') for field in ecosystem_fields}

    for field in envo_fields:
        # `or {}` / `or ''` handle keys that exist but hold null, which
        # .get(key, default) alone would return as None and then fail on.
        envo_term = record.get(field) or {}
        # Ids are rewritten with ':' in place of '_' (e.g. ENVO_x -> ENVO:x)
        # so they can be compared with the CURIEs returned by the adapter.
        row[f'{field}.id'] = (envo_term.get('id') or '').replace('_', ':')
        row[f'{field}.label'] = envo_term.get('label') or ''

    processed_data.append(row)

# Write the processed data to a TSV file
with open(gold_biosamples_output_tsv, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=header, delimiter='\t')
    writer.writeheader()
    writer.writerows(processed_data)

print(f"Processed data has been written to {gold_biosamples_output_tsv}")


Processed data has been written to gold_emp500_environments_ecosystems.tsv


In [None]:
# Close the MongoDB connection; the remaining cells work only from the TSV
# files written above and the EnvO adapter.
client.close()

In [17]:
import pandas as pd

# Load each TSV written above into an explicitly named DataFrame. Paths come
# from the configuration constants instead of repeated literals, and the
# variables are assigned directly rather than injected through globals(), so
# their names do not depend on the file names. dtype=str keeps every column as
# text (no numeric coercion of identifiers).
gold_emp500_environments_ecosystems = pd.read_csv(gold_biosamples_output_tsv, sep='\t', dtype=str)
gold_emp500_seq_proj_links = pd.read_csv(gold_seq_proj_output_tsv, sep='\t', dtype=str)
ncbi_emp500_environmental_triads = pd.read_csv(ncbi_output_tsv, sep='\t', dtype=str)

# Verify the loaded DataFrames by reporting each one's (rows, columns)
loaded_frames = {
    'gold_emp500_environments_ecosystems': gold_emp500_environments_ecosystems,
    'gold_emp500_seq_proj_links': gold_emp500_seq_proj_links,
    'ncbi_emp500_environmental_triads': ncbi_emp500_environmental_triads,
}
print("Loaded DataFrames:")
for frame_name, frame in loaded_frames.items():
    print(f"{frame_name}: {frame.shape}")


Loaded DataFrames:
gold_emp500_environments_ecosystems: (1024, 13)
gold_emp500_seq_proj_links: (1836, 8)
ncbi_emp500_environmental_triads: (1024, 5)


In [20]:
# Outer-join the GOLD sequencing-project links to the NCBI triads on the NCBI
# biosample accession, keeping rows from either side that have no partner.
# No merge-indicator column is requested.
pre_merged_df = gold_emp500_seq_proj_links.merge(
    ncbi_emp500_environmental_triads,
    how="outer",
    left_on="ncbiBioSampleAccession",
    right_on="ncbi_biosample_accession",
)

# Outer-join that result to the GOLD biosample ecosystem/EnvO table on the
# GOLD biosample id, again without an indicator column.
merged_df = pre_merged_df.merge(
    gold_emp500_environments_ecosystems,
    how="outer",
    left_on="biosampleGoldId",
    right_on="biosampleGoldId",
)

In [24]:
# Per-sequencing-project columns: once these are dropped, rows that differed
# only by project collapse into duplicates.
columns_to_drop = ['projectGoldId', 'sequencingStrategy', 'projectStatus', 'sraExperimentIds']

# Step 1: drop those columns (errors='ignore' tolerates any that are absent).
# Step 2: remove the rows that are now exact duplicates.
deduped_df = merged_df.drop(columns=columns_to_drop, errors='ignore').drop_duplicates()

# Verify the resulting DataFrame. This reports deduped_df, the frame built
# here; the name previously printed was never defined in this notebook.
print(f"Final cleaned DataFrame shape (after dropping columns and removing duplicates): {deduped_df.shape}")


Final cleaned DataFrame shape (after dropping columns and removing duplicates): (1024, 21)


In [29]:
# Build the OAK adapter for EnvO from the selector set in the configuration
# cell; it is used below to enumerate subclasses.
envo_adapter = get_adapter(envo_adapter_string)

In [32]:
# Root classes whose is_a descendants define the acceptable values
biome_id = "ENVO:00000428"  # Biome
environmental_material_id = "ENVO:00010483"  # Environmental material

# Descendants of 'biome' over IS_A only, materialized as a list.
# NOTE(review): reflexive=False presumably leaves the root term itself out of
# the result - confirm against the oaklib documentation.
biome_subclasses = list(
    envo_adapter.descendants(biome_id, reflexive=False, predicates=[IS_A])
)

# Descendants of 'environmental material' over IS_A only, as a list.
environmental_material_subclasses = list(
    envo_adapter.descendants(environmental_material_id, reflexive=False, predicates=[IS_A])
)


Subclasses of ENVO:00000428 (biome):
['ENVO:01001505', 'ENVO:01000024', 'ENVO:01000252', 'ENVO:01000180', 'ENVO:01000123', 'ENVO:01000174', 'ENVO:01000025', 'ENVO:01000038', 'ENVO:01000857', 'ENVO:01001230', 'ENVO:01000247', 'ENVO:01000030', 'ENVO:01000051', 'ENVO:01000036', 'ENVO:00000889', 'ENVO:01000186', 'ENVO:01000028', 'ENVO:01000201', 'ENVO:00000446', 'ENVO:01000220', 'ENVO:01000179', 'ENVO:01000228', 'ENVO:01000208', 'ENVO:01000246', 'ENVO:01000197', 'ENVO:01000037', 'ENVO:00000890', 'ENVO:01000222', 'ENVO:01000194', 'ENVO:01000192', 'ENVO:01000212', 'ENVO:01001835', 'ENVO:01000053', 'ENVO:01000224', 'ENVO:01000229', 'ENVO:01000185', 'ENVO:01000215', 'ENVO:01000023', 'ENVO:01000020', 'ENVO:01000181', 'ENVO:00000893', 'ENVO:01000253', 'ENVO:01000219', 'ENVO:01000339', 'ENVO:01000034', 'ENVO:01000218', 'ENVO:01000032', 'ENVO:01000249', 'ENVO:00002030', 'ENVO:01000177', 'ENVO:01000210', 'ENVO:01000052', 'ENVO:01000039', 'ENVO:03605008', 'ENVO:01000198', 'ENVO:01000022', 'ENVO:0100

In [33]:
# Membership masks: is each row's EnvO id among the allowed subclasses?
ebs_is_biome = deduped_df['envoBroadScale.id'].isin(biome_subclasses)
em_is_material = deduped_df['envoMedium.id'].isin(environmental_material_subclasses)

# 'bad_ebs' is True where envoBroadScale.id is not a biome subclass;
# 'bad_em' is True where envoMedium.id is not an environmental material subclass.
deduped_df['bad_ebs'] = ~ebs_is_biome
deduped_df['bad_em'] = ~em_is_material


In [34]:
# Write the merged, de-duplicated table (including the bad_ebs / bad_em flags)
# as a tab-separated file without the DataFrame index.
deduped_df.to_csv("merged_emp500_data.tsv", sep="\t", index=False)