In [7]:
import os
import csv
from collections import defaultdict
from oaklib import get_adapter

# Directory containing the files
# Every regular .txt/.tsv file directly inside it is parsed as a tab-separated
# table by the loop below.
# NOTE(review): absolute, machine-specific path — it has to be edited before
# this cell can run on another machine.
input_directory = "/Users/jtr4v/PythonProject/malco/outputdir_all_2024_07_04/gpt_o1_preview_disease_results"

# Ontology adapter to query MONDO
# Shared by get_mondo_label() for every label lookup.
# NOTE(review): presumably the "sqlite:obo:mondo" selector fetches/caches a
# pre-built MONDO database on first use — confirm against the oaklib docs.
adapter = get_adapter("sqlite:obo:mondo")

# Dictionary to store disease names and their associated identifiers
# Values are sets, so an identifier repeated across rows or files for the same
# disease name is recorded only once.
disease_mapping = defaultdict(set)

def get_mondo_label(mondo_id):
    """
    Look up the label of an ontology term through the module-level adapter.

    Never raises: the result is always a string suitable for writing out.

    Returns:
        - the adapter's label for ``mondo_id`` when it is non-empty;
        - "Label not found" when the adapter returns an empty/None label;
        - "Error retrieving label: <message>" when the lookup raises.
    """
    try:
        # `or` falls through to the placeholder for None and empty labels alike.
        return adapter.label(mondo_id) or "Label not found"
    except Exception as exc:
        # Best-effort by design: one failed lookup must not abort the summary.
        return f"Error retrieving label: {exc}"

# Columns every input table must provide; tables without them are skipped.
required_columns = ("disease_name", "disease_identifier")

# Iterate through each file in the directory. Sorting makes the processing
# order (and therefore the order of any warnings) reproducible; os.listdir()
# returns entries in arbitrary order.
for filename in sorted(os.listdir(input_directory)):
    file_path = os.path.join(input_directory, filename)

    # Only process files that are not directories and have a .txt or .tsv extension
    if not (os.path.isfile(file_path) and filename.endswith(('.txt', '.tsv'))):
        continue

    # newline='' is what the csv module expects of the file object it reads
    # from; the explicit encoding keeps parsing independent of the platform's
    # locale default.
    with open(file_path, 'r', newline='', encoding='utf-8') as file:
        reader = csv.DictReader(file, delimiter='\t')

        # A stray .txt/.tsv file with a different layout would otherwise abort
        # the whole run with a KeyError; report it and move on instead.
        header = reader.fieldnames or []
        missing = [column for column in required_columns if column not in header]
        if missing:
            print(f"Skipping {filename}: missing column(s) {', '.join(missing)}")
            continue

        # Read each row in the file
        for row in reader:
            disease_name = row['disease_name']
            disease_identifier = row['disease_identifier']

            # DictReader fills fields absent from a short row with None; a
            # None key would make the sort below fail with a TypeError.
            if disease_name is None:
                continue

            # Skip empty and 'N/A' identifiers
            if disease_identifier and disease_identifier != 'N/A':
                disease_mapping[disease_name].add(disease_identifier)

# Convert sets to sorted lists, with the disease names themselves in sorted order
disease_mapping = {name: sorted(disease_mapping[name]) for name in sorted(disease_mapping)}

# Resolve each distinct identifier exactly once: the same identifier is often
# attached to several disease names, and every lookup goes through the adapter.
# NOTE(review): identifiers are taken verbatim from the 'disease_identifier'
# column and are assumed to be MONDO CURIEs — confirm against the input files.
mondo_labels = {}
for mondo_ids in disease_mapping.values():
    for mondo_id in mondo_ids:
        if mondo_id not in mondo_labels:
            mondo_labels[mondo_id] = get_mondo_label(mondo_id)

# Prepare TSV output data. Rows are unique by construction (names are dict
# keys, identifiers come from sets), so no separate de-duplication is needed.
tsv_rows = []
for disease_name, mondo_ids in disease_mapping.items():
    for mondo_id in mondo_ids:
        tsv_rows.append((disease_name, mondo_id, mondo_labels[mondo_id]))

# Output TSV file (written relative to the current working directory)
output_tsv_path = "disease_summary_with_labels.tsv"
with open(output_tsv_path, 'w', newline='', encoding='utf-8') as tsv_file:
    writer = csv.writer(tsv_file, delimiter='\t')
    # Write the header row
    writer.writerow(["Disease Name", "MONDO ID", "MONDO Label"])
    # Write the data rows
    writer.writerows(tsv_rows)

print(f"Disease summary with labels saved in TSV format to {output_tsv_path}")

Disease summary with labels saved in TSV format to disease_summary_with_labels.tsv
