In [1]:
import os
from io import StringIO

import pandas as pd

In [25]:
# --- Paths used by the rest of the notebook ---

# Input: folder that is scanned for *.tsv files
directory = "../lexmatch-output"

# Output: destination of the combined, de-duplicated TSV
output_tsv = "lexmatch-combined.tsv"

# Input: text file whose non-blank lines are compared against the object prefixes
for_lexmatch_txt = "../for-lexmatch.txt"

In [3]:
# Collect one DataFrame per *.tsv file found in `directory`.
dataframes = []

# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the combined frame (and of the TSV written from it) reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".tsv"):
        continue
    file_path = os.path.join(directory, filename)

    # Drop only the lines that *start* with '#'. pd.read_csv(comment='#') is
    # not equivalent: it also cuts a data row at any '#' inside it.
    with open(file_path, 'r') as f:
        lines = [line for line in f if not line.startswith('#')]

    # A file with nothing left after filtering contributes no frame.
    if lines:
        df = pd.read_csv(StringIO(''.join(lines)), sep='\t')
        dataframes.append(df)


In [4]:
# Concatenate all DataFrames into a single DataFrame.
# Fail here with an explicit message when nothing was loaded: otherwise
# `combined_df` is never bound and the following cell dies with a NameError
# that hides the real cause.
if not dataframes:
    raise ValueError("No data found.")

combined_df = pd.concat(dataframes, ignore_index=True)
print("Aggregated DataFrame created successfully!")


Aggregated DataFrame created successfully!


In [5]:
# Collapse exact duplicate rows, then take an explicit copy of the result
deduplicated_rows = combined_df.drop_duplicates()
unique_df = deduplicated_rows.copy()


In [6]:
# Keep only rows whose subject_id and object_id differ (drop self-matches)
ids_differ = unique_df['subject_id'].ne(unique_df['object_id'])
non_reflexive_df = unique_df[ids_differ]

In [7]:
# Step 1: Split prefixes first
# The prefix is the text before the first ':' of each id (the whole id when it
# contains no ':').
# non_reflexive_df is a boolean-mask selection of unique_df, so setting columns
# on it directly triggers pandas' SettingWithCopyWarning. assign() returns a
# new frame instead, and re-running this cell gives the same result.
non_reflexive_df = non_reflexive_df.assign(
    subject_prefix=non_reflexive_df['subject_id'].str.split(':').str[0],
    object_prefix=non_reflexive_df['object_id'].str.split(':').str[0],
)

In [8]:
# Gather the column names of each side by their 'subject_' / 'object_' name prefix
all_columns = list(non_reflexive_df.columns)
subject_columns = list(filter(lambda name: name.startswith('subject_'), all_columns))
object_columns = list(filter(lambda name: name.startswith('object_'), all_columns))

In [9]:
# Show the subject_* column names found in non_reflexive_df
subject_columns

['subject_id', 'subject_label', 'subject_match_field', 'subject_prefix']

In [10]:
# Show the object_* column names found in non_reflexive_df
object_columns

['object_id', 'object_label', 'object_match_field', 'object_prefix']

In [11]:
# Rows whose subject_prefix is 'mixs' or 'nmdc'
subject_prefix_matches = non_reflexive_df["subject_prefix"].isin(['mixs', 'nmdc'])
mixs_nmdc_subj_frame = non_reflexive_df[subject_prefix_matches]

In [13]:
# Rows whose object_prefix is 'mixs' or 'nmdc'
object_prefix_matches = non_reflexive_df["object_prefix"].isin(['mixs', 'nmdc'])
mixs_nmdc_obj_frame = non_reflexive_df[object_prefix_matches]

In [15]:
# Create a copy of mixs_nmdc_obj_frame with the subject_* and object_* columns
# swapped, so that the 'mixs'/'nmdc' side ends up under the subject_* names.
# rename() maps every column label through the dict in a single pass, so a
# bidirectional mapping swaps the names directly; no temporary names are needed.
# Column positions are unchanged; only the labels are exchanged.
subject_to_object = {
    'subject_id': 'object_id',
    'subject_label': 'object_label',
    'subject_prefix': 'object_prefix',
    'subject_match_field': 'object_match_field',
}
swap_map = {
    **subject_to_object,
    **{object_name: subject_name for subject_name, object_name in subject_to_object.items()},
}

reversed_df = mixs_nmdc_obj_frame.rename(columns=swap_map)

In [17]:
# Stack the subject-side rows on top of the swapped object-side rows,
# renumbering the index from 0
frames_to_stack = [mixs_nmdc_subj_frame, reversed_df]
predictable_df = pd.concat(frames_to_stack, ignore_index=True)

In [19]:
# Collapse exact duplicate rows of the stacked frame and take an explicit copy.
# NOTE: this rebinds unique_df, which earlier held the de-duplicated raw rows;
# re-running the earlier filtering cell after this one would read this frame.
deduplicated_predictable = predictable_df.drop_duplicates()
unique_df = deduplicated_predictable.copy()

In [20]:
# Number of rows per subject_prefix value
unique_df['subject_prefix'].value_counts()

subject_prefix
nmdc    2283
mixs     230
Name: count, dtype: int64

In [22]:
# Number of rows per object_prefix value
unique_df['object_prefix'].value_counts()

object_prefix
nmdc       991
ENVO       209
CHEBI      107
ito         99
PATO        84
          ... 
INO          1
HSO          1
PCO          1
FLOPO        1
CHEMINF      1
Name: count, Length: 87, dtype: int64

In [21]:
# Persist the de-duplicated frame as tab-separated text, leaving out the index column
unique_df.to_csv(output_tsv, index=False, sep='\t')

# Report where the file went
print(f"Combined TSV written to {output_tsv}")


Combined TSV written to lexmatch-combined.tsv


In [33]:
# Build the set of requested names: one per non-blank line, trimmed and lowercased
requested_ontologies = set()
with open(for_lexmatch_txt, 'r') as handle:
    for raw_line in handle:
        name = raw_line.strip().lower()
        if name:
            requested_ontologies.add(name)

In [34]:
# Distinct object-side prefixes, lowercased so they compare against requested_ontologies
lowered_object_prefixes = unique_df['object_prefix'].str.lower()
hit_prefixes = set(lowered_object_prefixes)

In [35]:
# Requested names that never occur as an object prefix, in alphabetical order
requests_only = sorted(requested_ontologies - hit_prefixes)

In [39]:
# Sorted names that are in requested_ontologies but not in hit_prefixes
requests_only

['bco',
 'biopax',
 'biovoices',
 'chemont',
 'chiro',
 'cio',
 'cob',
 'cro',
 'doid',
 'duo',
 'eco',
 'ecosim',
 'ecto',
 'enigma_context',
 'fao',
 'gecko',
 'geo',
 'gsso',
 'hom',
 'hsapdv',
 'iceo',
 'ico',
 'kisao',
 'mamo',
 'mco',
 'micro',
 'mlo',
 'mop',
 'nomen',
 'obcs',
 'obib',
 'occo',
 'ogsf',
 'ohd',
 'ohmi',
 'ohpi',
 'omo',
 'ornaseq',
 'ovae',
 'pdro',
 'peco',
 'phipo',
 'po',
 'ppo',
 'proco',
 'prov',
 'psdo',
 'pso',
 'rbo']

In [37]:
# Object prefixes that were not among the requested names, in alphabetical order
hits_only = sorted(hit_prefixes - requested_ontologies)

In [38]:
# Sorted prefixes that are in hit_prefixes but not in requested_ontologies
hits_only

['bfo',
 'chebi',
 'chemontid',
 'dce',
 'dcterms',
 'dtype',
 'ecosimconcept',
 'efo',
 'faldo',
 'goldvocab',
 'linkml',
 'maxo',
 'mixs',
 'mondo',
 'mp',
 'ncbitaxon',
 'ncit',
 'nmdc',
 'nmrcv',
 'obo',
 'ogg',
 'ogi',
 'oio',
 'ontorion',
 'pr',
 'qudtschema',
 'ro',
 'ror',
 'sio',
 'skos',
 'so',
 'stato',
 'symp',
 'uberon',
 'uo',
 'vaem']