In [1]:
from collections import Counter
from collections import defaultdict
import csv
import json
import pprint
import re
import sys

import pandas as pd
from oaklib import get_adapter
from oaklib.datamodels.vocabulary import IS_A

sys.path.append('../..')  # Assuming notebooks/ is at the root level
import core # appears as if there's a problem in PyCharm Jupyter, but it still works


At what point would I want to use an aggregation query instead of an `$in` query?

In [2]:
# Names passed as the second argument to the `core.fetch_mongodb_records_*` helpers below
# (presumably MongoDB database names -- confirm against `core`).
NCBI_METADATA = "ncbi_metadata"
GOLD_METADATA = "gold_metadata_studies_with_samples"

In [3]:
# For Ontology Access Kit (OAK)
# Selector string handed to oaklib's `get_adapter` below to open EnvO.
envo_adapter_string = "sqlite:obo:envo"

In [4]:
# EnvO root CURIEs; the is_a descendants of each are collected into `*_curies` lists below.
# ENVO:00000428 is labelled 'biome' in the annotation output further down; the other names are
# presumably the EnvO labels of their CURIEs (ABP ~ "astronomical body part") -- confirm in EnvO.
BIOME = "ENVO:00000428"
ABP = "ENVO:01000813"
ENV_MAT = "ENVO:00010483"

# Roots used for the finer-grained env_medium membership flags
ANIMAL_MANURE = "ENVO:00003031"
BODILY_FLUID_MATERIAL = "ENVO:02000019"
FECAL_MATERIAL ="ENVO:00002003"
# FECES = "UBERON:0001988"
LIQ_WATER = "ENVO:00002006"
PETROLEUM = "ENVO:00002984"
SEDIMENT = "ENVO:00002007"
SLUDGE = "ENVO:00002044"
SOIL = "ENVO:00001998"

In [5]:
# Configurable match length cutoff
# Compared against each annotation's `match_string_len` when `needs_review_list` is built:
# annotations whose matched string is shorter than this are left out.
MATCH_LENGTH_CUTOFF = 3  # Exclude matches with length < 3 from needs_review_list

In [6]:
# Define the target harmonized names
# These are the three NCBI `harmonized_name` values extracted per biosample further down
# (presumably the MIxS environmental triad slots).
target_harmonized_names = {"env_broad_scale", "env_local_scale", "env_medium"}

In [7]:
# Inputs: relative paths to TSV files (presumably resolved against the notebook's working
# directory; they are not read in the cells visible here).
per_slot_value_curations_tsv = "inputs/emp500_undefined_triad_slot_value_counts_curated.tsv"
macrae_crerar_biosample_clusters_with_coordinates_tsv = "MacRae-Crerar/macrae_crerar_attributes_curated.tsv"
myrold_attributes_with_land_cover_tsv= "myrold/myrold_attributes_with_land_cover.tsv"


In [8]:
# Outputs: relative paths of the files this notebook writes.
# `detailed_tsv` receives the per-term legitimacy table and `summary_json_file` the summary counts;
# the remaining paths are presumably written by later cells.
detailed_tsv = "outputs/emp500_term_legitimacy.tsv"
emp500_mixs_style_triads_plus_gold_ids_tsv = "outputs/emp500_matches_and_curations_plus_gold_ids.tsv"
missing_terms_tsv = "outputs/emp500_annotations_for_review.tsv"
needs_review_tsv = "outputs/emp500_env_context_vocabulary_for_review.tsv"
problematic_summary_tsv = "outputs/emp500_undefined_triad_slot_value_counts.tsv"

summary_json_file = "outputs/emp500_summary.json"

In [9]:
# Open the EnvO ontology through OAK; every label, descendant, obsolete and annotation lookup
# below goes through this adapter.
envo_adapter = get_adapter(envo_adapter_string)

In [10]:
# Materialize every entity CURIE the EnvO adapter reports
envo_entities_list = [entity_curie for entity_curie in envo_adapter.entities()]

# CURIE -> label lookup, filled one entity at a time (the label may be None if the adapter has none)
envo_curies_to_label = {}
for entity_curie in envo_entities_list:
    envo_curies_to_label[entity_curie] = envo_adapter.label(entity_curie)


In [11]:
def _is_a_descendants(root_curie):
    """Return, as a list, what the EnvO adapter reports as is_a descendants of ``root_curie``."""
    return list(envo_adapter.descendants(root_curie, predicates=[IS_A]))


# Top-level classes used to validate the three environmental context slots
abp_curies = _is_a_descendants(ABP)
biome_curies = _is_a_descendants(BIOME)
env_mat_curies = _is_a_descendants(ENV_MAT)

# Finer-grained material classes used for the env_medium membership flags
animal_manure_curies = _is_a_descendants(ANIMAL_MANURE)
bodily_fluid_material_curies = _is_a_descendants(BODILY_FLUID_MATERIAL)
fecal_material_curies = _is_a_descendants(FECAL_MATERIAL)
# feces_curies = _is_a_descendants(FECES)
liq_water_curies = _is_a_descendants(LIQ_WATER)
petroleum_curies = _is_a_descendants(PETROLEUM)
sediment_curies = _is_a_descendants(SEDIMENT)
sludge_curies = _is_a_descendants(SLUDGE)
soil_curies = _is_a_descendants(SOIL)

In [12]:
# ABP descendants that are neither biome descendants nor environmental-material descendants
non_biome_non_env_mat_abp_curies = set(abp_curies).difference(biome_curies, env_mat_curies)

In [13]:
# Snapshot the obsolete CURIEs reported by the adapter, then pair each with the label it reports
obsoletes_curies_envo = set(envo_adapter.obsoletes())

obsoletes_curies_labelled = [
    {"curie": obsolete_curie, "label": envo_adapter.label(obsolete_curie)}
    for obsolete_curie in obsoletes_curies_envo
]

In [14]:
# Client object handed to every `core.fetch_mongodb_records_*` call below;
# connection details live in the project-local `core` module.
client = core.get_mongo_client()

In [15]:
# Search for BioProjects
# Argument meanings are defined in `core`; presumably (client, database, collection, search text).
# The returned object is iterated once in the next cell.
bioprojects_cursor = core.fetch_mongodb_records_by_text(
    client,
    NCBI_METADATA,
    "bioprojects",
    "emp500"
)


In [16]:
# Illustration of removing _id from the dictionaries, if desired
bioprojects_list = []
for document in bioprojects_cursor:
    bioprojects_list.append(
        dict((key, value) for key, value in document.items() if key != '_id')
    )

In [17]:
# Show the BioProject documents matched by the "emp500" text search (with _id already stripped)
print("BioProjects:")
pprint.pprint(bioprojects_list)

BioProjects:
[{'ProjectDescr': {'Description': 'The Earth Microbiome Project Multi-omics '
                                  'component (EMP500) involves amplicon and '
                                  'shotgun metagenomic sequencing and '
                                  'metabolomic profiling of over five hundred '
                                  'microbial communities from diverse '
                                  'environments on our planet. We developed '
                                  'new protocols for shotgun metagenomic '
                                  'sequencing and assembly, with the goal of '
                                  'applying this workflow to a range of '
                                  'environmental samples, combined with '
                                  'metabolomic profiling. 16S, 18S, and ITS '
                                  'amplicon sequencing was done in addition. '
                                  'We acquired a set of >500 fresh '
 

In [18]:
# Search for GOLD studies
# Same text search as for BioProjects, but against GOLD_METADATA / "studies".
gold_studies_cursor = core.fetch_mongodb_records_by_text(
    client,
    GOLD_METADATA,
    "studies",
    "emp500"
)

# Materialized as-is: unlike the BioProject list, these documents keep their '_id' field
gold_studies_list = list(gold_studies_cursor)


In [19]:
# Show the GOLD study documents matched by the "emp500" text search
print("GOLD studies:")
pprint.pprint(gold_studies_list)

GOLD studies:
[{'_id': ObjectId('67ae0a020614a9099996848c'),
  'addDate': '2021-07-24',
  'biosamples': ['Gb0239921',
                 'Gb0239928',
                 'Gb0239989',
                 'Gb0240012',
                 'Gb0240069',
                 'Gb0240099',
                 'Gb0240127',
                 'Gb0239958',
                 'Gb0239985',
                 'Gb0239997',
                 'Gb0240033',
                 'Gb0240039',
                 'Gb0240051',
                 'Gb0240056',
                 'Gb0240068',
                 'Gb0240083',
                 'Gb0240106',
                 'Gb0239927',
                 'Gb0297473',
                 'Gb0297480',
                 'Gb0297366',
                 'Gb0297316',
                 'Gb0297394',
                 'Gb0297436',
                 'Gb0297349',
                 'Gb0297333',
                 'Gb0297352',
                 'Gb0297353',
                 'Gb0297430',
                 'Gb0297433',
            

In [20]:
# Identifiers used by the exact-path queries below
# (presumably chosen by inspecting the two search results above -- confirm).
emp500_bioproject_accession = "PRJEB42019"
emp500_gold_study_id = "Gs0154244"

In [21]:
# Fetch the link records whose "bioproject_accession" equals the EMP500 BioProject accession
emp500_bioproj_biosample_links_cursor = core.fetch_mongodb_records_by_path(
    client, NCBI_METADATA, "sra_biosamples_bioprojects", "bioproject_accession", emp500_bioproject_accession
)

# Copy each link record without its '_id' field
emp500_bioproj_biosample_links = []
for document in emp500_bioproj_biosample_links_cursor:
    emp500_bioproj_biosample_links.append(
        {key: value for key, value in document.items() if key != '_id'}
    )

In [22]:
# BioSample accessions linked to the EMP500 BioProject (one per link record, duplicates kept)
emp500_biosample_accessions = [doc["biosample_accession"] for doc in emp500_bioproj_biosample_links]

In [23]:
# Fetch the NCBI biosample documents whose "accession" is one of the linked accessions
emp500_ncbi_biosamples_cursor = core.fetch_mongodb_records_by_path_in(
    client, NCBI_METADATA, "biosamples", "accession", emp500_biosample_accessions
)

# Copy each biosample document without its '_id' field
emp500_ncbi_biosamples = []
for document in emp500_ncbi_biosamples_cursor:
    emp500_ncbi_biosamples.append(
        dict((key, value) for key, value in document.items() if key != '_id')
    )

In [24]:
# Rebinds the name: after this cell `emp500_ncbi_biosamples` is iterated with `.items()` as a
# mapping keyed by "accession" (`lod_to_dod` presumably means list-of-dicts to dict-of-dicts).
# NOTE(review): re-running this cell alone would feed the mapping back into `lod_to_dod`.
emp500_ncbi_biosamples = core.lod_to_dod(emp500_ncbi_biosamples, "accession")

In [25]:
# Sanity check: number of NCBI biosamples after keying by accession
print(len(emp500_ncbi_biosamples))

1024


In [26]:
# Re-fetch the BioProject -> BioSample link records (same query as the earlier link cell).
# The result must be bound to the same cursor name that the comprehension iterates: iterating the
# cursor created by the earlier cell would yield nothing once it has been consumed (assuming the
# helper returns a one-shot cursor, as its use suggests), silently resetting the links to [].
emp500_bioproj_biosample_links_cursor = core.fetch_mongodb_records_by_path(
    client,
    NCBI_METADATA,
    "sra_biosamples_bioprojects",
    "bioproject_accession",
    emp500_bioproject_accession
)

# Copy each link record without its '_id' field
emp500_bioproj_biosample_links = [
    {key: value for key, value in document.items() if key != '_id'}
    for document in emp500_bioproj_biosample_links_cursor
]

In [27]:
# Search for GOLD studies
# Exact lookup: records in GOLD_METADATA / "studies" whose "studyGoldId" equals the EMP500 study id
emp500_gold_study_cursor = core.fetch_mongodb_records_by_path(
    client,
    GOLD_METADATA,
    "studies",
    "studyGoldId",
    emp500_gold_study_id
)

# Materialized as-is ('_id' is kept); the first element is read in the next cell
emp500_gold_study_list = list(emp500_gold_study_cursor)

In [28]:
# GOLD biosample ids listed on the first matching study ([] when the study has no "biosamples" key).
# NOTE(review): raises IndexError if the study lookup above returned nothing.
emp500_gold_biosamples_list = emp500_gold_study_list[0].get("biosamples", [])

In [29]:
# Fetch the GOLD biosample documents whose "biosampleGoldId" is listed on the EMP500 study
emp500_gold_biosamples_cursor = core.fetch_mongodb_records_by_path_in(
    client, GOLD_METADATA, "biosamples", "biosampleGoldId", emp500_gold_biosamples_list
)

# Copy each biosample document without its '_id' field
emp500_gold_biosamples = []
for document in emp500_gold_biosamples_cursor:
    emp500_gold_biosamples.append(
        {key: value for key, value in document.items() if key != '_id'}
    )

In [30]:
# Rebinds the name: after this cell `emp500_gold_biosamples` is used with `.get(biosample_gold_id)`
# as a mapping keyed by "biosampleGoldId".
# NOTE(review): re-running this cell alone would feed the mapping back into `lod_to_dod`.
emp500_gold_biosamples = core.lod_to_dod(emp500_gold_biosamples, "biosampleGoldId")

In [31]:
# Sanity check: number of GOLD biosamples after keying by biosampleGoldId
print(len(emp500_gold_biosamples))

1024


Biosamples have been fetched from NCBI and GOLD at this point

In [32]:
# Fetch the GOLD "projects" documents whose "biosampleGoldId" is listed on the EMP500 study
emp500_gold_seq_projs_cursor = core.fetch_mongodb_records_by_path_in(
    client, GOLD_METADATA, "projects", "biosampleGoldId", emp500_gold_biosamples_list
)

# Copy each project document without its '_id' field
emp500_gold_seq_projs = []
for document in emp500_gold_seq_projs_cursor:
    emp500_gold_seq_projs.append(
        dict((key, value) for key, value in document.items() if key != '_id')
    )

In [33]:
# Sanity check: number of GOLD project documents fetched for the study's biosamples
print(len(emp500_gold_seq_projs))

1836


In [34]:
# Collapse the GOLD project records to one summary per NCBI BioSample accession, pooling the SRA
# experiment ids of records that agree on BioProject accession and GOLD biosample id.
emp500_seq_proj_summaries_by_accession = defaultdict(dict)

for record in emp500_gold_seq_projs:
    bioproj_accession = record['ncbiBioProjectAccession']
    biosample_accession = record['ncbiBioSampleAccession']
    biosample_gold_id = record['biosampleGoldId']
    sra_ids = record['sraExperimentIds']

    if biosample_accession not in emp500_seq_proj_summaries_by_accession:
        # First record seen for this accession: start its summary
        emp500_seq_proj_summaries_by_accession[biosample_accession] = dict(
            biosampleGoldId=biosample_gold_id,
            ncbiBioProjectAccession=bioproj_accession,
            sraExperimentIds=set(sra_ids),
        )
        continue

    existing_record = emp500_seq_proj_summaries_by_accession[biosample_accession]
    identity_agrees = (
        (existing_record['ncbiBioProjectAccession'], existing_record['biosampleGoldId'])
        == (bioproj_accession, biosample_gold_id)
    )
    if not identity_agrees:
        # Conflicting records are only reported; the existing summary is left untouched
        print(f"Conflicting data for accession {biosample_accession}: {record} vs {existing_record}")
        continue

    # Same BioProject and GOLD biosample: pool the SRA experiment ids
    existing_record['sraExperimentIds'].update(sra_ids)

In [35]:
# Sanity check: number of distinct NCBI BioSample accessions among the GOLD project records
print(len(emp500_seq_proj_summaries_by_accession))

1024


In [36]:
# Join each per-accession project summary with its GOLD biosample document.
# The result is keyed by NCBI BioSample accession.
emp500_gold_integrated_biosamples = {}

for accession, seq_proj_summary in emp500_seq_proj_summaries_by_accession.items():
    biosample_gold_id = seq_proj_summary.get('biosampleGoldId')
    if not biosample_gold_id:
        continue

    gold_biosample_data = emp500_gold_biosamples.get(biosample_gold_id)
    if not gold_biosample_data:
        continue

    # On key collisions the GOLD biosample values overwrite the summary values
    integrated_data = dict(seq_proj_summary)
    integrated_data.update(gold_biosample_data)
    emp500_gold_integrated_biosamples[accession] = integrated_data


TODO: add label lookup and confirmation, plus biome, ABP, env_medium, soil, and water checks, for the GOLD samples in `emp500_gold_integrated_biosamples`.

In [37]:
# For each GOLD slot: (flag suffix, CURIE collection) pairs to test the slot's CURIE against.
# Dict and tuple order fixes the order in which the flags are added to each biosample.
# (The feces check stays disabled, matching the commented-out FECES root.)
gold_slot_membership_checks = {
    'envoBroadScale': (
        ('biome', biome_curies),
    ),
    'envoLocalScale': (
        ('abp_subset', non_biome_non_env_mat_abp_curies),
    ),
    'envoMedium': (
        ('env_mat', env_mat_curies),
        ('soil', soil_curies),
        ('liq_water', liq_water_curies),
        ('sediment', sediment_curies),
        ('fecal_material', fecal_material_curies),
        ('animal_manure', animal_manure_curies),
        ('bodily_fluid_material', bodily_fluid_material_curies),
        ('sludge', sludge_curies),
        ('petroleum', petroleum_curies),
    ),
}

# NOTE: the keys of emp500_gold_integrated_biosamples are NCBI accessions, not GOLD ids
for ncbi_accession, gold_biosample in emp500_gold_integrated_biosamples.items():
    # GOLD ids use "_" where a CURIE uses ":"; convert all three before annotating anything
    slot_curies = {
        gold_slot: gold_biosample[gold_slot]['id'].replace("_", ":")
        for gold_slot in gold_slot_membership_checks
    }

    for gold_slot, membership_checks in gold_slot_membership_checks.items():
        slot_curie = slot_curies[gold_slot]

        # Label according to EnvO, and whether GOLD's own label agrees exactly
        looked_up_label = envo_curies_to_label.get(slot_curie)
        gold_biosample[f'{gold_slot}_label_lu'] = looked_up_label
        gold_biosample[f'{gold_slot}_label_exact_match'] = looked_up_label == gold_biosample[gold_slot]['label']

        # Class-membership flags for this slot
        for flag_suffix, member_curies in membership_checks:
            gold_biosample[f'{gold_slot}_{flag_suffix}'] = slot_curie in member_curies



In [38]:
def get_harmonized_value(ncbi_biosample, harmonized_name):
    """
    Extracts the 'content' from ncbi_biosample['Attributes']['Attribute']
    where 'harmonized_name' matches the given value.

    'Attribute' may be a list of attribute dicts or a single attribute dict
    (as happens when a record carries only one attribute); a missing or
    empty 'Attributes' container is treated as "no attributes".

    Args:
        ncbi_biosample: The dictionary containing biosample data.
        harmonized_name: The 'harmonized_name' to search for.

    Returns:
        The 'content' value of the first matching attribute, otherwise None.
    """
    if 'Attributes' not in ncbi_biosample:
        return None

    attributes = ncbi_biosample['Attributes']
    # Guard against None / non-mapping containers instead of raising TypeError
    if not isinstance(attributes, dict) or 'Attribute' not in attributes:
        return None

    entries = attributes['Attribute']
    if isinstance(entries, dict):
        # A lone attribute is not wrapped in a list; iterating it directly would walk its keys
        entries = [entries]

    for attribute in entries or []:
        if isinstance(attribute, dict) and attribute.get('harmonized_name') == harmonized_name:
            return attribute.get('content')
    return None

In [39]:
# todo: is it worth keeping track of any package knowledge from NCBI or GOLD?

# Column plan for the per-biosample summary; the order below is the insertion order of the keys.
ncbi_triad_slots = ('env_broad_scale', 'env_local_scale', 'env_medium')
gold_ecosystem_keys = (
    'ecosystemPathId', 'ecosystem', 'ecosystemCategory',
    'ecosystemType', 'ecosystemSubtype', 'specificEcosystem',
)
gold_slot_flag_suffixes = {
    'envoBroadScale': ('label_lu', 'label_exact_match', 'biome'),
    'envoLocalScale': ('label_lu', 'label_exact_match', 'abp_subset'),
    'envoMedium': (
        'label_lu', 'label_exact_match', 'env_mat', 'soil', 'liq_water', 'sediment',
        # 'feces',
        'fecal_material', 'animal_manure', 'bodily_fluid_material', 'sludge', 'petroleum',
    ),
}
# An env_medium is "accounted for" when at least one of these flags is set (env_mat is not one of them)
accounted_for_suffixes = (
    'soil', 'liq_water', 'sediment', 'fecal_material',
    'animal_manure', 'bodily_fluid_material', 'sludge', 'petroleum',
)

emp500_integrated_biosample_summaries = {}

ncbi_env_content_counter = Counter()  # Store counts of all strings used as env triad values

for accession, ncbi_biosample in emp500_ncbi_biosamples.items():
    gold_biosample = emp500_gold_integrated_biosamples.get(accession)  # GOLD data keyed by NCBI accession
    if not gold_biosample:
        print(f"No matching GOLD data found for {accession}")
        continue

    integrated_summary_dict = {'accession': accession}

    # Values submitted to NCBI; a slot with no matching attribute is recorded (and counted) as None
    for triad_slot in ncbi_triad_slots:
        submitted_value = get_harmonized_value(ncbi_biosample, triad_slot)
        integrated_summary_dict[triad_slot] = submitted_value
        ncbi_env_content_counter[submitted_value] += 1  # Count occurrences

    # Identifiers and ecosystem path copied from GOLD
    integrated_summary_dict['biosampleGoldId'] = gold_biosample['biosampleGoldId']
    integrated_summary_dict['ncbiBioProjectAccession'] = gold_biosample['ncbiBioProjectAccession']
    integrated_summary_dict['sraExperimentIds'] = "|".join(sorted(gold_biosample['sraExperimentIds']))
    for ecosystem_key in gold_ecosystem_keys:
        integrated_summary_dict[ecosystem_key] = gold_biosample[ecosystem_key]

    # GOLD's EnvO assignments plus the flags computed for them earlier
    for gold_slot, flag_suffixes in gold_slot_flag_suffixes.items():
        integrated_summary_dict[f'{gold_slot}_id'] = gold_biosample[gold_slot]['id'].replace("_", ":")
        integrated_summary_dict[f'{gold_slot}_label'] = gold_biosample[gold_slot]['label']
        for flag_suffix in flag_suffixes:
            flag_key = f'{gold_slot}_{flag_suffix}'
            integrated_summary_dict[flag_key] = gold_biosample[flag_key]

    integrated_summary_dict['envoMedium_accounted_for'] = any(
        gold_biosample[f'envoMedium_{flag_suffix}'] for flag_suffix in accounted_for_suffixes
    )

    integrated_summary_dict['addDate'] = gold_biosample['addDate']
    integrated_summary_dict['modDate'] = gold_biosample['modDate']

    emp500_integrated_biosample_summaries[accession] = integrated_summary_dict

In [40]:
# Number of distinct values submitted across the three env triad slots
print(len(ncbi_env_content_counter))

112


In [41]:
# Display the submitted env triad values with their usage counts
ncbi_env_content_counter

Counter({'urban biome': 285,
         'research facility': 191,
         'feces': 184,
         'sterile water': 178,
         'organic material': 166,
         'host-associated habitat': 133,
         'soil': 132,
         'marine benthic feature': 125,
         'temperate mixed forest biome': 112,
         'marine benthic biome': 90,
         'animal-associated habitat': 84,
         'marine coral reef biome': 51,
         'sediment': 45,
         'subpolar coniferous forest biome': 45,
         'marine biome': 42,
         'marine salt marsh biome': 40,
         'desert biome': 40,
         'marine sediment': 38,
         'marine reef biome': 30,
         'tropical broadleaf forest biome': 28,
         'insecta-associated habitat': 28,
         'lake sediment': 27,
         'montane desert': 26,
         'alpine soil': 26,
         'kelp forest': 25,
         'saline marsh': 24,
         'temperate coniferous forest biome': 22,
         'dense settlement biome': 22,
         'coral 

In [42]:
# use the OAK annotator to match the submitted env triad values to ENVO terms
# Don't panic about red error messages

def _annotation_to_row(content_value, count, annotation):
    """Flatten one OAK text annotation into a plain dict, adding the looked-up object label."""
    # Fetch the label for the object_id
    if annotation.object_id:
        object_label = envo_adapter.label(annotation.object_id)
    else:
        object_label = "Unknown"

    return {
        "content_value": content_value,
        "count": count,  # Frequency of this content value in the dataset
        "predicate_id": annotation.predicate_id,
        "object_id": annotation.object_id,
        "object_label": object_label,  # Looked up label
        "subject_start": annotation.subject_start,
        "subject_end": annotation.subject_end,
        "match_string": annotation.match_string,
        "matches_whole_text": annotation.matches_whole_text,
    }


# List to store all annotations
ncbi_env_triad_terms_annotations_list = []

for content_value, count in ncbi_env_content_counter.items():
    try:
        for annotation in envo_adapter.annotate_text(content_value):
            ncbi_env_triad_terms_annotations_list.append(
                _annotation_to_row(content_value, count, annotation)
            )
    except Exception as e:
        # Best effort: report the failing value and move on; rows already appended for it are kept
        print(f"Error processing '{content_value}': {e}")


ERROR:root:Skipping statements(subject=ENVO:01001644,predicate=oio:hasDbXref,object=None,value=Carbonate which is formed as the result of some biological process.,datatype=None,language=None,); ValueError: Carbonate which is formed as the result of some biological process. is not a valid URI or CURIE


In [43]:
# Record the length of each matched string (0 when there is no match string)
for annotation in ncbi_env_triad_terms_annotations_list:
    annotation["match_string_len"] = len(annotation["match_string"] or "")

In [44]:
# Pretty-print some sample results
for annotation in ncbi_env_triad_terms_annotations_list[:3]:  # Show first 3 annotations
    pprint.pprint(annotation)

{'content_value': 'urban biome',
 'count': 285,
 'match_string': 'biome',
 'match_string_len': 5,
 'matches_whole_text': False,
 'object_id': 'ENVO:00000428',
 'object_label': 'biome',
 'predicate_id': 'rdfs:label',
 'subject_end': 11,
 'subject_start': 7}
{'content_value': 'urban biome',
 'count': 285,
 'match_string': 'urban biome',
 'match_string_len': 11,
 'matches_whole_text': True,
 'object_id': 'ENVO:01000249',
 'object_label': 'urban biome',
 'predicate_id': 'rdfs:label',
 'subject_end': 11,
 'subject_start': 1}
{'content_value': 'anaerobic bioreactor',
 'count': 9,
 'match_string': 'bioreactor',
 'match_string_len': 10,
 'matches_whole_text': False,
 'object_id': 'ENVO:00002123',
 'object_label': 'bioreactor',
 'predicate_id': 'rdfs:label',
 'subject_end': 20,
 'subject_start': 11}


In [45]:
# Categorize the annotations.
# A "perfect match" covers the whole submitted text and was made against an rdfs:label.
perfect_match_list = [
    annotation
    for annotation in ncbi_env_triad_terms_annotations_list
    if annotation["matches_whole_text"] and annotation["predicate_id"] == "rdfs:label"
]
perfect_match_content_values = {annotation["content_value"] for annotation in perfect_match_list}

# Every content_value that received at least one annotation
all_annotated_content_values = {
    annotation["content_value"] for annotation in ncbi_env_triad_terms_annotations_list
}
non_perfect_match_content_values = set()  # declared for later use; not populated in this cell

# Annotations for values without a perfect match go to review, provided the matched string
# is at least MATCH_LENGTH_CUTOFF characters long.
needs_review_list = []
for annotation in ncbi_env_triad_terms_annotations_list:
    matched_text = annotation["match_string"]
    annotation["match_string_len"] = len(matched_text) if matched_text else 0

    if annotation["content_value"] in perfect_match_content_values:
        continue
    if annotation["match_string_len"] < MATCH_LENGTH_CUTOFF:
        continue
    needs_review_list.append(annotation)

needs_review_content_values = {annotation["content_value"] for annotation in needs_review_list}

# Submitted values that ended up in neither category
unmatched_content_values = (
    set(ncbi_env_content_counter.keys()) - perfect_match_content_values - needs_review_content_values
)


In [46]:
# Helper function: Normalize text (lowercase + remove extra spaces)
def normalize_text(text):
    """Collapse whitespace runs to single spaces, trim the ends, and lowercase.

    Args:
        text: The string to normalize. ``None`` is accepted (e.g. a label lookup
            that found nothing) and treated as empty text.

    Returns:
        The normalized string; ``""`` when ``text`` is ``None``.
    """
    if text is None:
        # A missing label must not abort the whole obsolete-label scan with a TypeError
        return ""
    return re.sub(r'\s+', ' ', text).strip().lower()  # Replace multiple spaces & trim


# Index the obsolete entries by normalized label; setdefault keeps the first entry per label
obsolete_entry_by_normalized_label = {}
for entry in obsoletes_curies_labelled:
    obsolete_entry_by_normalized_label.setdefault(normalize_text(entry["label"]), entry)

# Create a lookup set of normalized obsolete labels
obsolete_label_set = set(obsolete_entry_by_normalized_label)

# Prepare result storage
exact_obsolete_matches = []

# Check if "obsolete " + content_value exists in the obsolete labels
for content_value in needs_review_content_values | unmatched_content_values:  # Union of both sets
    obsolete_label_candidate = normalize_text(f"obsolete {content_value}")
    matching_entry = obsolete_entry_by_normalized_label.get(obsolete_label_candidate)
    if matching_entry is None:
        continue

    exact_obsolete_matches.append({
        "content_value": content_value,  # ORIGINAL content_value
        "obsolete_label": matching_entry["label"],  # ORIGINAL obsolete label
        "obsolete_curie": matching_entry["curie"]
    })


In [47]:
# Submitted values whose text equals an obsolete label once the "obsolete " prefix is added
pprint.pprint(exact_obsolete_matches)

[{'content_value': 'animal-associated habitat',
  'obsolete_curie': 'ENVO:00006776',
  'obsolete_label': 'obsolete animal-associated habitat'},
 {'content_value': 'ocean water',
  'obsolete_curie': 'ENVO:00002151',
  'obsolete_label': 'obsolete ocean water'},
 {'content_value': 'insecta-associated habitat',
  'obsolete_curie': 'ENVO:00009004',
  'obsolete_label': 'obsolete insecta-associated habitat'},
 {'content_value': 'marine benthic feature',
  'obsolete_curie': 'ENVO:01000105',
  'obsolete_label': 'obsolete marine benthic feature'},
 {'content_value': 'montane grasslands and shrubland biome',
  'obsolete_curie': 'ENVO:00000882',
  'obsolete_label': 'obsolete Montane grasslands and shrubland biome'}]


In [48]:
# Obsolete values that were used (indirectly) by the submitted env triad strings
obsolete_curies = set()
for obsolete_match in exact_obsolete_matches:
    obsolete_curies.add(obsolete_match["obsolete_curie"])
# obsolete_curies = {entry["curie"] for entry in obsoletes_curies_labelled} # all obsolete values in EnvO

# Fetch the metadata map of each obsolete term, keeping only non-empty results
obsoletes_metadata_list = [
    entity_metadata
    for entity_metadata in map(envo_adapter.entity_metadata_map, obsolete_curies)
    if entity_metadata
]


In [49]:
# Inspect the metadata maps: each maps a predicate CURIE to a list of values
pprint.pprint(obsoletes_metadata_list[:3])  # Show first 3 obsolete entities

[{'IAO:0100001': ['ENVO:00002149'],
  'id': ['ENVO:00002151'],
  'oio:hasDbXref': ['https://en.wikipedia.org/wiki/Ocean_water'],
  'oio:hasOBONamespace': ['ENVO'],
  'oio:id': ['ENVO:00002151'],
  'owl:deprecated': [True],
  'rdfs:isDefinedBy': ['http://purl.obolibrary.org/obo/envo.owl'],
  'rdfs:label': ['obsolete ocean water'],
  'schema:url': ['http://purl.obolibrary.org/obo/ENVO_00002151'],
  'sh:prefix': ['ENVO']},
 {'id': ['ENVO:00000882'],
  'oio:consider': ['ENVO:01000194', 'ENVO:01000216'],
  'oio:hasOBONamespace': ['ENVO'],
  'oio:id': ['ENVO:00000882'],
  'owl:deprecated': [True],
  'rdfs:isDefinedBy': ['http://purl.obolibrary.org/obo/envo.owl'],
  'rdfs:label': ['obsolete Montane grasslands and shrubland biome'],
  'schema:url': ['http://purl.obolibrary.org/obo/ENVO_00000882'],
  'sh:prefix': ['ENVO']},
 {'IAO:0000115': ['A habitat that is in or on a living insect. Here "insect" '
                  'denotes an individual of a species that is a sub-taxon of '
               

In [50]:
# Count how many of the obsolete terms carry each metadata predicate (non-dict entries are ignored)
obsoletes_predicate_counter = Counter(
    predicate
    for metadata in obsoletes_metadata_list
    if isinstance(metadata, dict)
    for predicate in metadata.keys()
)


In [51]:
# Display how often each metadata predicate occurs among the obsolete terms in use
obsoletes_predicate_counter

Counter({'id': 5,
         'owl:deprecated': 5,
         'rdfs:label': 5,
         'sh:prefix': 5,
         'schema:url': 5,
         'rdfs:isDefinedBy': 5,
         'oio:hasOBONamespace': 4,
         'oio:id': 4,
         'IAO:0000115': 3,
         'oio:consider': 2,
         'IAO:0100001': 1,
         'oio:hasDbXref': 1,
         'oio:hasExactSynonym': 1,
         'oio:inSubset': 1,
         'oio:created_by': 1,
         'oio:creation_date': 1,
         'rdfs:comment': 1})

In [52]:
def _count_with_predicate(metadata_records, predicate):
    """Count the metadata maps that contain ``predicate`` as a key."""
    return sum(1 for metadata_record in metadata_records if predicate in metadata_record)


# Compute high-level summary statistics
summary = dict(
    total_unique_terms=len(ncbi_env_content_counter),  # Unique terms submitted
    perfect_matches=len(perfect_match_content_values),  # Unique perfect matches
    matches_need_review=len(needs_review_content_values),  # Unique terms needing review
    no_valid_oak_match=len(unmatched_content_values),  # Terms that didn't get a valid OAK annotation
    exact_obsolete_matches=len(exact_obsolete_matches),  # Unique terms that exactly matched an obsolete label
    # Obsolete terms with a clear replacement
    obsolete_terms_with_replacement=_count_with_predicate(obsoletes_metadata_list, "IAO:0100001"),
    # Obsolete terms with "consider" alternatives
    obsolete_terms_with_consider=_count_with_predicate(obsoletes_metadata_list, "oio:consider"),
)

# Consistency check: the three categories should partition the submitted terms
categorized_total = sum(
    summary[category] for category in ("perfect_matches", "matches_need_review", "no_valid_oak_match")
)
if categorized_total != summary["total_unique_terms"]:
    print(
        f"WARNING: perfect_matches ({summary['perfect_matches']}) + matches_need_review ({summary['matches_need_review']}) "
        f"+ no_valid_oak_match ({summary['no_valid_oak_match']}) ≠ total_unique_terms ({summary['total_unique_terms']})"
    )

# Save to JSON
with open(summary_json_file, "w", encoding="utf-8") as json_file:
    json_file.write(json.dumps(summary, indent=4))

print(f"Summary saved to {summary_json_file}")


Summary saved to outputs/emp500_summary.json


In [53]:
# Build one "legitimacy" row per submitted env triad value: whether it perfectly matches an EnvO
# label, whether it corresponds to an obsolete term, and what EnvO suggests instead.

# First-wins indexes, so each value is resolved with a dictionary lookup rather than a list scan
first_perfect_match_by_value = {}
for entry in perfect_match_list:
    first_perfect_match_by_value.setdefault(entry["content_value"], entry)

obsolete_match_by_value = {}
for entry in exact_obsolete_matches:
    obsolete_match_by_value.setdefault(entry["content_value"], entry)

# Initialize an empty list to store term legitimacy data
term_legitimacy_data = []

for content_value, emp500_usages in ncbi_env_content_counter.items():
    # Determine perfect match status
    is_perfect_match = content_value in perfect_match_content_values

    # Get matched_label and matched_curie from the first perfect match, if any
    perfect_match_entry = first_perfect_match_by_value.get(content_value)
    matched_label = perfect_match_entry["object_label"] if perfect_match_entry else ""
    matched_curie = perfect_match_entry["object_id"] if perfect_match_entry else ""

    # Determine obsolete status
    obsolete_entry = obsolete_match_by_value.get(content_value)
    is_obsolete = bool(obsolete_entry)
    obsolete_label = obsolete_entry["obsolete_label"] if obsolete_entry else ""
    # The exact CURIE was already recorded with the match. Re-deriving it by substring-matching the
    # label against every obsolete label could select a different term whose label merely contains
    # this one, and would fail on obsolete entries that have no label.
    obsolete_curie = obsolete_entry["obsolete_curie"] if obsolete_entry else ""

    # Determine replacement term (IAO:0100001)
    replacement_curie = ""
    replacement_label = ""
    consider_replacements = ""

    if obsolete_curie:
        # Metadata maps hold lists of values, so "id" is tested for membership
        metadata = next((m for m in obsoletes_metadata_list if "id" in m and obsolete_curie in m["id"]), None)

        # Process replacement_curie
        if metadata and "IAO:0100001" in metadata:
            replacement_curie = metadata["IAO:0100001"][0]  # Extract replacement CURIE
            replacement_label = envo_adapter.label(replacement_curie) if replacement_curie else ""

        # Process oio:consider replacements
        if metadata and "oio:consider" in metadata:
            consider_list = metadata["oio:consider"]
            consider_replacements = " | ".join(
                f"{curie}/{envo_adapter.label(curie)}" for curie in consider_list
            )

    # Append data to the list
    term_legitimacy_data.append({
        "content_value": content_value,
        "emp500_usages": emp500_usages,
        "perfect_match": is_perfect_match,
        "matched_label": matched_label,
        "matched_curie": matched_curie,
        "is_obsolete": is_obsolete,
        "obsolete_label": obsolete_label,
        "obsolete_curie": obsolete_curie,
        "replacement_curie": replacement_curie,
        "replacement_label": replacement_label,
        "consider_replacements": consider_replacements
    })



# Write collected data to TSV using csv.DictWriter
with open(detailed_tsv, mode="w", newline="", encoding="utf-8") as file:
    fieldnames = [
        "content_value", "emp500_usages", "perfect_match", "matched_label", "matched_curie",
        "is_obsolete", "obsolete_label", "obsolete_curie", "replacement_curie", "replacement_label",
        "consider_replacements"
    ]
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=fieldnames)

    # Write header
    writer.writeheader()

    # Write data rows
    writer.writerows(term_legitimacy_data)


In [54]:
# Content values already reported as obsolete in term_legitimacy_data; they are kept out of
# the needs-review output.
obsolete_content_values = {
    entry["content_value"] for entry in term_legitimacy_data if entry["is_obsolete"]
}

# Columns of the needs-review TSV, in output order
fieldnames = [
    "content_value", "emp500_usages", "predicate_id", "object_id", "object_label",
    "subject_start", "subject_end", "match_string", "match_string_len", "matches_whole_text"
]

# Create a unified list to hold all formatted entries for writing
needs_review_data = []

# Process `needs_review_list` while filtering out obsolete content values
for entry in needs_review_list:
    if entry["content_value"] in obsolete_content_values:
        continue
    needs_review_data.append({
        "content_value": entry["content_value"],
        # Read "count" without removing it: popping would mutate needs_review_list, so a
        # second run of this cell would report 0 usages for every row.
        "emp500_usages": entry.get("count", 0),  # Rename "count" to "emp500_usages"
        "predicate_id": entry.get("predicate_id", ""),
        "object_id": entry.get("object_id", ""),
        "object_label": entry.get("object_label", ""),
        "subject_start": entry.get("subject_start", ""),
        "subject_end": entry.get("subject_end", ""),
        "match_string": entry.get("match_string", ""),
        "match_string_len": entry.get("match_string_len", ""),
        "matches_whole_text": entry.get("matches_whole_text", "")
    })

# Process unmatched content values (excluding obsolete terms); these rows carry only the
# content value and its usage count, with every match column left blank.
filtered_unmatched_content_values = unmatched_content_values - obsolete_content_values

for content_value in filtered_unmatched_content_values:
    needs_review_data.append({
        "content_value": content_value,
        "emp500_usages": ncbi_env_content_counter.get(content_value, 0),
        "predicate_id": "",
        "object_id": "",
        "object_label": "",
        "subject_start": "",
        "subject_end": "",
        "match_string": "",
        "match_string_len": "",
        "matches_whole_text": ""
    })

# Write collected data to TSV using csv.DictWriter
with open(needs_review_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(needs_review_data)

print(f"Needs Review TSV saved to {needs_review_tsv} (Filtered & ensures obsolete terms do NOT reappear)")


Needs Review TSV saved to outputs/emp500_env_context_vocabulary_for_review.tsv (Filtered & ensures obsolete terms do NOT reappear)


In [55]:
# Load every row of the per-slot curation TSV as a dict keyed by column header
with open(per_slot_value_curations_tsv, 'r', encoding='utf-8') as curations_handle:
    per_slot_value_curations_list = list(csv.DictReader(curations_handle, delimiter='\t'))

In [56]:
# Index the curation rows by (slot name, original value); a later duplicate replaces an earlier one
per_slot_value_curations_dict = {
    (curation_row['problematic_key'], curation_row['problematic_value']): curation_row
    for curation_row in per_slot_value_curations_list
}


In [57]:
# Pretty-print the curation lookup keyed by (problematic_key, problematic_value)
pprint.pprint(per_slot_value_curations_dict)

{('env_broad_scale', 'Small lake biome'): {'count': '20',
                                           'done': 'true',
                                           'notes': '',
                                           'problematic_key': 'env_broad_scale',
                                           'problematic_value': 'Small lake '
                                                                'biome',
                                           'replacement_curie': 'ENVO:00000892',
                                           'replacement_label': 'small '
                                                                'freshwater '
                                                                'lake biome'},
 ('env_broad_scale', 'flooded savanna'): {'count': '1',
                                          'done': 'true',
                                          'notes': 'SAMEA7723866 “Seasonal '
                                                   'succession influences the '
             

In [58]:
# Read the tab-separated Myrold attributes table into a DataFrame
myrold_attributes_with_land_cover_frame = pd.read_csv(myrold_attributes_with_land_cover_tsv, sep="\t")

In [59]:
# Convert the frame into a list of per-row dicts keyed by column name
myrold_attributes_with_land_cover = myrold_attributes_with_land_cover_frame.to_dict(orient='records')

In [60]:
# Index the Myrold records by accession; a later duplicate accession replaces an earlier one
myrold_attributes_with_land_cover_dict = {
    record['accession']: record
    for record in myrold_attributes_with_land_cover
}

In [61]:
# Display the accession-keyed Myrold lookup (this renders the whole dict as cell output)
myrold_attributes_with_land_cover_dict

{'SAMEA7724195': {'accession': 'SAMEA7724195',
  'ena first public': '2020-12-17',
  'ena last update': '2020-12-16',
  'ena-checklist': 'ERC000011',
  'ena-first-public': nan,
  'ena-last-update': nan,
  'external id': 'SAMEA7724195',
  'insdc center alias': 'UCSDMI',
  'insdc center name': 'University of California San Diego Microbiome Initiative',
  'insdc first public': '2020-12-17T04:08:06Z',
  'insdc last update': '2020-12-16T01:23:59Z',
  'insdc status': 'public',
  'submitter id': 'qiita_sid_13114:13114.myrold.5.s001',
  'alpha_shotgun_woltka_min10k_richness': 719.0,
  'alpha_shotgun_woltka_rar10k_richness': 504.0,
  'alpha_shotgun_woltka_rar3450_richness': 255.0,
  'collection_timestamp': '12/15/15 0:00',
  'cur_land_use': 'forest',
  'cur_vegetation': 'Douglas-fir',
  'depth_sample': 0.15,
  'elevation': 286,
  'emp500_principal_investigator': 'Myrold',
  'emp500_study_id': 5,
  'emp500_title': 'Tree-associated soils',
  'empo_2': 'Non-saline',
  'empo_3': 'Soil (non-saline)'

In [62]:
# Load every row of the MacRae-Crerar curation TSV as a dict keyed by column header
with open(macrae_crerar_biosample_clusters_with_coordinates_tsv, 'r', encoding='utf-8') as curations_handle:
    macrae_crerar_biosample_curations_list = list(csv.DictReader(curations_handle, delimiter='\t'))

In [63]:
# Index the MacRae-Crerar rows by accession; a later duplicate accession replaces an earlier one
macrae_crerar_biosample_curations_dict = {
    curation_row['accession']: curation_row
    for curation_row in macrae_crerar_biosample_curations_list
}

In [64]:
# Pretty-print the accession-keyed MacRae-Crerar curation lookup
pprint.pprint(macrae_crerar_biosample_curations_dict)

{'SAMEA7724107': {'ENA first public': '2020-12-17',
                  'ENA last update': '2020-12-16',
                  'ENA-CHECKLIST': 'ERC000011',
                  'External Id': 'SAMEA7724107',
                  'INSDC center alias': 'UCSDMI',
                  'INSDC center name': 'University of California San Diego '
                                       'Microbiome Initiative',
                  'INSDC first public': '2020-12-17T04:08:06Z',
                  'INSDC last update': '2020-12-16T01:23:54Z',
                  'INSDC status': 'public',
                  'Submitter Id': 'qiita_sid_13114:13114.macrae.crerar.3.s001',
                  'accession': 'SAMEA7724107',
                  'air_temp': '27.8',
                  'alpha_shotgun_woltka_min10k_richness': '',
                  'alpha_shotgun_woltka_rar10k_richness': '',
                  'alpha_shotgun_woltka_rar3450_richness': '353',
                  'block': 'B09',
                  'cluster': '1',
               

In [65]:
# Index term_legitimacy_data by content_value so each sample value resolves with one dict lookup
term_lookup = dict(
    (legitimacy_row["content_value"], legitimacy_row)
    for legitimacy_row in term_legitimacy_data
)

In [66]:
# Annotate every biosample's environmental triad slots in place and log the values that
# resolve to neither an exact match nor a direct replacement.

# One record per (sample, slot) whose value could not be resolved
missing_term_logs = []

# Slot name -> (boolean flag stored on the sample, CURIEs accepted for that slot)
subset_validations = {
    "env_broad_scale": ("is_biome", biome_curies),
    "env_local_scale": ("is_selected_abp", non_biome_non_env_mat_abp_curies),
    "env_medium": ("is_env_mat", env_mat_curies)
}

# The curation TSV is read with csv.DictReader, so its "done" column arrives as text and a
# literal 'false' would be truthy. These spellings are treated as "not done".
not_done_tokens = {"", "false", "0", "no", "n"}

# Iterate over each biosample record
for sample in emp500_integrated_biosample_summaries.values():
    accession = sample["accession"]
    macrae_crerar_curation = macrae_crerar_biosample_curations_dict.get(accession)
    myrold_curation = myrold_attributes_with_land_cover_dict.get(accession)

    # Sorted so the order of logged rows does not depend on set iteration order
    for key in sorted(target_harmonized_names):
        # A missing or None slot value is treated as an empty string
        original_value = (sample.get(key) or "").strip()

        # Lookup the term legitimacy data
        term_data = term_lookup.get(original_value, {})

        # Extract matching or replacement details
        matched_label = term_data.get("matched_label", "")
        matched_curie = term_data.get("matched_curie", "")
        replacement_label = term_data.get("replacement_label", "")
        replacement_curie = term_data.get("replacement_curie", "")
        consider_replacements = term_data.get("consider_replacements", "")

        # Per-slot manual curation, applied only when marked done and fully specified
        curation = per_slot_value_curations_dict.get((key, original_value))
        if curation:
            is_done = str(curation.get("done") or "").strip().lower() not in not_done_tokens
            if is_done and curation["replacement_curie"] and curation["replacement_label"]:
                sample[f"{key}_curation_curie"] = curation["replacement_curie"]
                sample[f"{key}_curation_label"] = curation["replacement_label"]

        # Per-accession curations override the per-slot curation above.
        # NOTE(review): the same per-accession CURIE/label is written for every slot in
        # target_harmonized_names -- confirm that this is intended.
        if macrae_crerar_curation is not None:
            sample[f"{key}_curation_curie"] = macrae_crerar_curation['inferred_curie']
            sample[f"{key}_curation_label"] = macrae_crerar_curation['inferred_label']

        if myrold_curation is not None:
            sample[f"{key}_curation_curie"] = myrold_curation['envo_curie']
            sample[f"{key}_curation_label"] = myrold_curation['envo_label']

        # Tracks whether this slot was resolved through an obsolete term's replacement;
        # it selects which CURIE is validated against the slot's subset below.
        via_obsolescence = False

        if replacement_label and replacement_curie:
            via_obsolescence = True
            sample[f"{key}_via_obsolescence"] = True
            sample[f"{key}_replacement_label"] = replacement_label
            sample[f"{key}_replacement_curie"] = replacement_curie
        elif matched_label and matched_curie:
            sample[f"{key}_matched_label"] = matched_label
            sample[f"{key}_matched_curie"] = matched_curie
        else:
            # Log missing annotation details
            missing_term_logs.append({
                "accession": accession,
                "problematic_key": key,
                "problematic_value": original_value,
                "consider_replacements": consider_replacements
            })

        # Validate subset membership and set boolean flag
        curie = replacement_curie if via_obsolescence else matched_curie
        subset_key, expected_curies = subset_validations.get(key, (None, None))

        if subset_key:
            sample[subset_key] = curie in expected_curies if curie else False


In [67]:
# Save the unresolved-term log as a TSV; nothing is written when the log is empty
if missing_term_logs:
    # Column headers come from the first logged record
    fieldnames = list(missing_term_logs[0])

    with open(missing_terms_tsv, mode="w", newline="", encoding="utf-8") as missing_terms_handle:
        missing_terms_writer = csv.DictWriter(missing_terms_handle, fieldnames=fieldnames, delimiter="\t")
        missing_terms_writer.writeheader()
        for log_record in missing_term_logs:
            missing_terms_writer.writerow(log_record)

    print(f"Missing term logs saved to {missing_terms_tsv}")

Missing term logs saved to outputs/emp500_annotations_for_review.tsv


In [68]:
# Tally how often each (problematic_key, problematic_value) pair was logged
problematic_counts = Counter(
    (log_record["problematic_key"], log_record["problematic_value"])
    for log_record in missing_term_logs
)

# Flatten the tallies into one dict per pair, ready for csv.DictWriter
problematic_summary = []
for (slot_name, slot_value), occurrences in problematic_counts.items():
    problematic_summary.append(
        {"problematic_key": slot_name, "problematic_value": slot_value, "count": occurrences}
    )


In [69]:
# Display the tallies per (problematic_key, problematic_value) pair
problematic_counts

Counter({('env_local_scale', 'host-associated habitat'): 133,
         ('env_local_scale', 'marine benthic feature'): 125,
         ('env_local_scale', 'animal-associated habitat'): 84,
         ('env_local_scale', 'insecta-associated habitat'): 28,
         ('env_local_scale', 'montane desert'): 26,
         ('env_local_scale', 'forest'): 21,
         ('env_broad_scale', 'Small lake biome'): 20,
         ('env_local_scale', 'stratum'): 18,
         ('env_local_scale', 'temperate coniferous forest'): 16,
         ('env_local_scale', 'oil reservior'): 14,
         ('env_broad_scale', 'montane grasslands and shrubland biome'): 12,
         ('env_local_scale', 'cropland'): 10,
         ('env_medium', 'coastal water'): 9,
         ('env_local_scale', 'wetland'): 9,
         ('env_local_scale', 'intermittent pond'): 7,
         ('env_broad_scale', 'temperate shrubland'): 6,
         ('env_medium', 'pitcher plant fluid'): 6,
         ('env_local_scale', 'rangeland'): 5,
         ('env_local_

In [70]:
# Write the per-(slot, value) tallies to a TSV.
# The columns are listed explicitly, in the same order as the keys of each summary record,
# so an empty summary produces a header-only file instead of an IndexError.
fieldnames = ["problematic_key", "problematic_value", "count"]

with open(problematic_summary_tsv, mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, delimiter="\t", fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(problematic_summary)

In [71]:
# Flatten the keyed biosample summaries into a list of dicts (LOD) for csv.DictWriter
emp500_integrated_biosample_summaries_lod = [*emp500_integrated_biosample_summaries.values()]

In [72]:

# Column order for the merged TSV, assembled section by section.

# Leading columns: sample identifiers, dates and the ecosystem path fields.
_leading_columns = [
    "accession",
    "biosampleGoldId",
    "ncbiBioProjectAccession",
    "sraExperimentIds",
    "modDate",
    "addDate",
    "ecosystemPathId",
    "ecosystem",
    "ecosystemCategory",
    "ecosystemType",
    "ecosystemSubtype",
    "specificEcosystem",
]

# camelCase envo* columns: four shared suffixes per slot, followed by that slot's own flags.
_envo_shared_suffixes = ["id", "label", "label_lu", "label_exact_match"]
_envo_flag_suffixes = {
    "envoBroadScale": ["biome"],
    "envoLocalScale": ["abp_subset"],
    "envoMedium": [
        "env_mat",
        "soil",
        "liq_water",
        "sediment",
        # "feces",
        "fecal_material",
        "animal_manure",
        "bodily_fluid_material",
        "sludge",
        "petroleum",
        "accounted_for",
    ],
}
_envo_columns = [
    f"{slot}_{suffix}"
    for slot, flag_suffixes in _envo_flag_suffixes.items()
    for suffix in _envo_shared_suffixes + flag_suffixes
]

# snake_case triad slots: the raw value, match/replacement details, the slot's subset flag,
# then the curation columns.
_triad_slot_flags = [
    ("env_broad_scale", "is_biome"),
    ("env_local_scale", "is_selected_abp"),
    ("env_medium", "is_env_mat"),
]
_triad_columns = [
    column
    for slot, subset_flag in _triad_slot_flags
    for column in (
        slot,
        f"{slot}_matched_curie",
        f"{slot}_matched_label",
        f"{slot}_replacement_curie",
        f"{slot}_replacement_label",
        f"{slot}_via_obsolescence",
        subset_flag,
        f"{slot}_curation_curie",
        f"{slot}_curation_label",
    )
]

ordered_columns = _leading_columns + _envo_columns + _triad_columns


In [73]:
# Emit one TSV row per biosample summary, with columns in the order given by ordered_columns
with open(emp500_mixs_style_triads_plus_gold_ids_tsv, mode="w", newline="", encoding="utf-8") as triads_handle:
    triads_writer = csv.DictWriter(triads_handle, fieldnames=ordered_columns, delimiter="\t")
    triads_writer.writeheader()
    for biosample_row in emp500_integrated_biosample_summaries_lod:
        triads_writer.writerow(biosample_row)

print(f"Merged MIxS-style environmental triad data saved to {emp500_mixs_style_triads_plus_gold_ids_tsv}")

Merged MIxS-style environmental triad data saved to outputs/emp500_matches_and_curations_plus_gold_ids.tsv
