In [1]:
import os
import pandas as pd

In [2]:
# directory containing the raw mappings
input_directory = "../../lexmatch-output"

# list of attempted target ontologies
for_lexmatch_txt = '../../lexmatch-shell-scripts/for-lexmatch.txt'

# for identifying the entities
#   for which mappings should be reported
#   even if they don't involve MIxS or NMDC
env_triad_pvs_sheet_tsv = "env_triad_pvs_sheet.tsv"

In [3]:
output_tsv = "lexmatch-combined.tsv"

In [4]:
env_triad_pvs_frame = pd.read_csv(env_triad_pvs_sheet_tsv, sep='\t', skiprows=[1])

In [5]:
env_triad_pvs_curies = list(env_triad_pvs_frame['class_uri'])

In [6]:
# Initialize an empty list to collect DataFrames
dataframes = []

# Loop through each file in the directory
for filename in os.listdir(input_directory):
    if filename.endswith(".tsv"):
        print(filename)
        file_path = os.path.join(input_directory, filename)

        # Read the file while skipping rows that start with #
        with open(file_path, 'r') as f:
            lines = [line for line in f if not line.startswith('#')]

        # Convert the filtered lines to a DataFrame
        if lines and len(lines) > 1:
            print(len(lines))
            from io import StringIO

            df = pd.read_csv(StringIO(''.join(lines)), sep='\t')
            dataframes.append(df)


env_triad_pvs_vs_biolink.SSSOM.tsv
149
nmdc_nmdc_vs_chmo.SSSOM.tsv
1499
nmdc_nmdc_vs_biolink.SSSOM.tsv
1525
env_triad_pvs_vs_agro.SSSOM.tsv
710
nmdc_nmdc_vs_agro.SSSOM.tsv
1703
nmdc_mixs_vs_chmo.SSSOM.tsv
10
env_triad_pvs_vs_chmo.SSSOM.tsv
100
nmdc_mixs_vs_agro.SSSOM.tsv
91
nmdc_mixs_vs_biolink.SSSOM.tsv
10


In [7]:
# Concatenate all DataFrames into a single DataFrame
if dataframes:
    combined_df = pd.concat(dataframes, ignore_index=True)
    print("Aggregated DataFrame created successfully!")
else:
    print("No data found.")


Aggregated DataFrame created successfully!


In [8]:
combined_df.shape

(5788, 11)

In [9]:
# Remove duplicate rows and force a new copy
unique_df = combined_df.drop_duplicates().copy()


In [10]:
unique_df.shape

(2400, 11)

In [11]:
non_reflexive_by_term_df = unique_df[unique_df['subject_id'] != unique_df['object_id']].copy()

In [12]:
non_reflexive_by_term_df.shape

(1827, 11)

In [13]:
# Step 1: Split prefixes first
non_reflexive_by_term_df['subject_prefix'] = non_reflexive_by_term_df['subject_id'].str.split(':').str[0]
non_reflexive_by_term_df['object_prefix'] = non_reflexive_by_term_df['object_id'].str.split(':').str[0]

In [14]:
# List of values to filter out
privileged_prefixes = ['mixs', 'nmdc']

# Columns to check
prefix_columns = ['subject_prefix', 'object_prefix']

In [15]:
non_reflexive_by_prefix_df = non_reflexive_by_term_df[non_reflexive_by_term_df['subject_prefix'] != non_reflexive_by_term_df['object_prefix']].copy()

In [16]:
non_reflexive_by_prefix_df.shape

(348, 13)

In [17]:
# Filter rows where neither 'subject_prefix' nor 'object_prefix' are in privileged_prefixes
enriched_for_pvs_frame = non_reflexive_by_prefix_df[
    ~non_reflexive_by_prefix_df[prefix_columns].isin(privileged_prefixes).any(axis=1)
]


In [18]:
enriched_for_pvs_frame.shape

(70, 13)

In [19]:
enriched_for_pvs_frame

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_prefix,object_prefix
46,biolink:broad_match,broad match,skos:closeMatch,skos:broadMatch,,semapv:LexicalMatching,oaklib,0.5,rdf:ID,skos:exactMatch-INVERSE,biolink:broad_match,biolink,skos
56,biolink:close_match,close match,skos:closeMatch,skos:closeMatch,,semapv:LexicalMatching,oaklib,0.5,rdf:ID,skos:exactMatch-INVERSE,biolink:close_match,biolink,skos
91,biolink:exact_match,exact match,skos:closeMatch,skos:exactMatch,,semapv:LexicalMatching,oaklib,0.5,rdf:ID,skos:exactMatch-INVERSE,biolink:exact_match,biolink,skos
108,biolink:license,license,skos:closeMatch,dcterms:license,,semapv:LexicalMatching,oaklib,0.5,rdf:ID,skos:exactMatch-INVERSE,biolink:license,biolink,dcterms
111,biolink:name,name,skos:closeMatch,dcterms:title,,semapv:LexicalMatching,oaklib,0.5,rdf:ID,skos:narrowMatch-INVERSE,biolink:name,biolink,dcterms
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5673,CHMO:0001659,electrospray ionisation,skos:closeMatch,MS:1000073,electrospray ionization,semapv:LexicalMatching,oaklib,0.5,oio:hasExactSynonym,rdfs:label,electrospray ionization,CHMO,MS
5674,CHMO:0001659,electrospray ionisation,skos:closeMatch,MS:1000073,electrospray ionization,semapv:LexicalMatching,oaklib,0.5,oio:hasRelatedSynonym,oio:hasExactSynonym,esi,CHMO,MS
5675,CHMO:0001659,electrospray ionisation,skos:closeMatch,MS:1000073,electrospray ionization,semapv:LexicalMatching,oaklib,0.5,rdfs:label,oio:hasExactSynonym,electrospray ionisation,CHMO,MS
5679,CHMO:0001947,precipitation with compressed antisolvent,skos:closeMatch,OBI:0200051,principal components analysis dimensionality r...,semapv:LexicalMatching,oaklib,0.5,oio:hasRelatedSynonym,oio:hasExactSynonym,pca,CHMO,OBI


In [20]:
# Identify all subject_* and object_* columns dynamically
subject_columns = [col for col in non_reflexive_by_term_df.columns if col.startswith('subject_')]
object_columns = [col for col in non_reflexive_by_term_df.columns if col.startswith('object_')]

In [21]:
subject_columns

['subject_id', 'subject_label', 'subject_match_field', 'subject_prefix']

In [22]:
object_columns

['object_id', 'object_label', 'object_match_field', 'object_prefix']

In [23]:
mixs_nmdc_subj_frame = non_reflexive_by_term_df[non_reflexive_by_term_df["subject_prefix"].isin(['mixs', 'nmdc'])]

In [24]:
mixs_nmdc_obj_frame = non_reflexive_by_term_df[non_reflexive_by_term_df["object_prefix"].isin(['mixs', 'nmdc'])]

In [25]:
# Create a copy with swapped columns
reversed_obj_frame = mixs_nmdc_obj_frame.rename(columns={
    'subject_id': 'object_id_temp',
    'subject_label': 'object_label_temp',
    'subject_prefix': 'object_prefix_temp',
    'subject_match_field': 'object_match_field_temp',

    'object_id': 'subject_id',
    'object_label': 'subject_label',
    'object_prefix': 'subject_prefix',
    'object_match_field': 'subject_match_field'
})

# # Complete the swap by renaming the temporary columns
reversed_obj_frame = reversed_obj_frame.rename(columns={
    'object_id_temp': 'object_id',
    'object_label_temp': 'object_label',
    'object_prefix_temp': 'object_prefix',
    'object_match_field_temp': 'object_match_field'
})

In [26]:
# Create a copy with swapped columns
reversed_enriched_for_pvs_frame = enriched_for_pvs_frame.rename(columns={
    'subject_id': 'object_id_temp',
    'subject_label': 'object_label_temp',
    'subject_prefix': 'object_prefix_temp',
    'subject_match_field': 'object_match_field_temp',

    'object_id': 'subject_id',
    'object_label': 'subject_label',
    'object_prefix': 'subject_prefix',
    'object_match_field': 'subject_match_field'
})

# # Complete the swap by renaming the temporary columns
reversed_enriched_for_pvs_frame = reversed_enriched_for_pvs_frame.rename(columns={
    'object_id_temp': 'object_id',
    'object_label_temp': 'object_label',
    'object_prefix_temp': 'object_prefix',
    'object_match_field_temp': 'object_match_field'
})

In [27]:
enriched_for_pvs_frame = pd.concat([enriched_for_pvs_frame,reversed_enriched_for_pvs_frame], axis=0, ignore_index=True)

In [28]:
enriched_for_pvs_frame = enriched_for_pvs_frame[enriched_for_pvs_frame['subject_id'].isin(env_triad_pvs_curies)]

In [29]:
enriched_for_pvs_frame

Unnamed: 0,subject_id,subject_label,predicate_id,object_id,object_label,mapping_justification,mapping_tool,confidence,subject_match_field,object_match_field,match_string,subject_prefix,object_prefix
41,ENVO:00000100,valley,skos:closeMatch,PATO:0002078,hollow,semapv:LexicalMatching,oaklib,0.5,oio:hasRelatedSynonym,rdfs:label,hollow,ENVO,PATO
79,ENVO:00000170,dune,skos:closeMatch,AGRO:00000196,bund,semapv:LexicalMatching,oaklib,0.5,oio:hasBroadSynonym,oio:hasExactSynonym,ridge,ENVO,AGRO
80,ENVO:00000178,levee,skos:closeMatch,AGRO:00000196,bund,semapv:LexicalMatching,oaklib,0.5,oio:hasRelatedSynonym,oio:hasExactSynonym,dyke,ENVO,AGRO
81,ENVO:00000178,levee,skos:closeMatch,AGRO:00000196,bund,semapv:LexicalMatching,oaklib,0.5,oio:hasRelatedSynonym,oio:hasExactSynonym,levee,ENVO,AGRO
82,ENVO:00000178,levee,skos:closeMatch,AGRO:00000196,bund,semapv:LexicalMatching,oaklib,0.5,rdfs:label,oio:hasExactSynonym,levee,ENVO,AGRO
83,ENVO:00000283,ridge,skos:closeMatch,AGRO:00000196,bund,semapv:LexicalMatching,oaklib,0.5,oio:hasExactSynonym,oio:hasExactSynonym,ridge,ENVO,AGRO
84,ENVO:00000283,ridge,skos:closeMatch,AGRO:00000196,bund,semapv:LexicalMatching,oaklib,0.5,oio:hasRelatedSynonym,oio:hasExactSynonym,ridge,ENVO,AGRO
85,ENVO:00000283,ridge,skos:closeMatch,AGRO:00000196,bund,semapv:LexicalMatching,oaklib,0.5,rdfs:label,oio:hasExactSynonym,ridge,ENVO,AGRO
87,ENVO:03600087,greenhouse,skos:closeMatch,AGRO:00000363,greenhouse,semapv:LexicalMatching,oaklib,0.5,rdfs:label,rdfs:label,greenhouse,ENVO,AGRO
88,PO:0008037,seedling,skos:closeMatch,AGRO:00000548,seedling,semapv:LexicalMatching,oaklib,0.5,rdfs:label,rdfs:label,seedling,PO,AGRO


In [30]:
predictable_df = pd.concat([mixs_nmdc_subj_frame, reversed_obj_frame, enriched_for_pvs_frame], axis=0, ignore_index=True)

In [31]:
# Remove duplicate rows and force a new copy
unique_df = predictable_df.drop_duplicates().copy()

In [32]:
unique_df['subject_prefix'].value_counts()

subject_prefix
nmdc    1207
mixs      57
ENVO      11
PO         5
Name: count, dtype: int64

In [33]:
unique_df['object_prefix'].value_counts()

object_prefix
nmdc         991
ENVO          76
biolink       55
CHEBI         50
CHMO          31
PATO          21
OBI           21
AGRO          13
mixs           5
UO             3
IAO            3
BFO            2
skos           2
ror            2
dcterms        1
linkml         1
APOLLO_SV      1
RO             1
NCBITaxon      1
Name: count, dtype: int64

In [34]:
# Write the combined DataFrame to a TSV
unique_df.to_csv(output_tsv, sep='\t', index=False)

print(f"Combined TSV written to {output_tsv}")


Combined TSV written to lexmatch-combined.tsv


In [35]:
# Read the file and create a set of lowercase strings
with open(for_lexmatch_txt, 'r') as f:
    requested_ontologies = {line.strip().lower() for line in f if len(line.strip().lower()) > 0}

In [36]:
# Get DataFrame prefixes as a set
hit_prefixes = set(unique_df['object_prefix'].str.lower())

In [37]:
requests_only = list(requested_ontologies - hit_prefixes)
requests_only.sort()

In [38]:
requests_only

[]

In [39]:
hits_only = list(hit_prefixes - requested_ontologies )
hits_only.sort()

In [40]:
hits_only

['apollo_sv',
 'bfo',
 'chebi',
 'dcterms',
 'envo',
 'iao',
 'linkml',
 'mixs',
 'ncbitaxon',
 'nmdc',
 'obi',
 'pato',
 'ro',
 'ror',
 'skos',
 'uo']