In [1]:
from oaklib import get_adapter

from malco.post_process.compute_mrr import mondo_adapter
# This takes in the raw results from Exomiser/Pheval and converts them into a TSV with 
# diff diagnosis and scores for each case:
# rank	score	disease_name	disease_identifier
# 1	0.8802	SCID, autosomal recessive, T-negative/B-positive type	OMIM:600802
# 2	0.868	Combined immunodeficiency with faciooculoskeletal anomalies	ORPHA:221139
# 2	0.868	Combined immunodeficiency with faciooculoskeletal anomalies	ORPHA:221139
# 4	0.8626	Severe combined immunodeficiency, X-linked	OMIM:300400

from malco.post_process.post_process_results_format import create_exomiser_standardised_results
from pathlib import Path

# Switch for regenerating the standardised TSVs from the raw Exomiser/PhEval
# output. It is off by default, so running this cell does nothing.
reparse_exomiser_results = False

if reparse_exomiser_results:
    # NOTE(review): both locations are absolute, machine-specific paths.
    raw_results_dir = Path('/Volumes/files_2019/EXOMISER_PHENOTYPE_ONLY_2024_09_16/exomiser-14.0.1-2206/raw_results/')
    standardised_output_dir = Path('/Users/jtr4v/PythonProject/malco/outputdir_all_2024_07_04/')

    # Only the disease-level analysis is requested; results are ordered by
    # descending combined score.
    create_exomiser_standardised_results(
        results_dir=raw_results_dir,
        output_dir=standardised_output_dir,
        score_name='combined_score',
        sort_order='descending',
        variant_analysis=False,
        gene_analysis=False,
        disease_analysis=True,
        include_acmg=False,
    )

In [2]:
from malco.post_process.mondo_score_utils import omim_mappings
from malco.post_process.compute_mrr import mondo_adapter

from warnings import warn

from oaklib.datamodels.vocabulary import IS_A
from oaklib.interfaces import MappingProviderInterface

from cachetools import cached, LRUCache
from cachetools.keys import hashkey

FULL_SCORE = 1
PARTIAL_SCORE = 0.5

# Now ground the results to MONDO and score as we did for GPT results. 

def _score_cache_key(prediction, ground_truth, mondo_adapter, orpha_curie_prefix=True):
    """
    Build the cache key for score_exomiser_result().

    The key function is called with exactly the arguments of the decorated
    function, so it has to accept all four of them. The adapter is left out of
    the key (cached scores are shared across adapter instances); the prefix
    flag is included because it can change the result.
    """
    return hashkey(prediction, ground_truth, orpha_curie_prefix)


@cached(cache=LRUCache(maxsize=4096), info=True, key=_score_cache_key)
def score_exomiser_result(prediction: str,
                          ground_truth: str,
                          mondo_adapter,
                          orpha_curie_prefix: bool = True
                          ) -> float:
    """
    Score a single Exomiser prediction against the ground-truth diagnosis.

    The prediction is an OMIM or Orphanet ID.

    1) If the prediction is identical to the ground truth, or omim_mappings()
       of the prediction contains the ground truth, return FULL_SCORE
    2) If the prediction maps to a Mondo term that itself, or through one of
       its is-a descendants, maps to the ground truth, return PARTIAL_SCORE
    3) Otherwise, return 0.0

    This is essentially the same as score_grounded_result(), except that the
    prediction is mapped to Mondo before scoring.

    Args:
        prediction (str): The predicted disease ID.
        ground_truth (str): The correct disease ID.
        mondo_adapter: The mondo adapter; must be a MappingProviderInterface.
        orpha_curie_prefix (bool): If True, a leading "ORPHA:" in the prediction
            is rewritten to "Orphanet:" before the Mondo mappings are looked up.

    Returns:
        float: FULL_SCORE, PARTIAL_SCORE or 0.0.

    Raises:
        ValueError: If mondo_adapter is not a MappingProviderInterface.
    """
    if not isinstance(mondo_adapter, MappingProviderInterface):
        raise ValueError("Adapter is not an MappingProviderInterface")

    if prediction == ground_truth:
        # prediction is the correct OMIM
        return FULL_SCORE

    if ground_truth in omim_mappings(prediction, mondo_adapter):
        # prediction is an ID that directly maps to a correct OMIM
        return FULL_SCORE

    # Get the mapping of the prediction to MONDO, then see if that Mondo term
    # or any of its descendants maps to the ground truth.

    # Correct the ORPHA curie prefix to Orphanet. Only the leading prefix is
    # rewritten, never a later occurrence of "ORPHA:" inside the identifier.
    if orpha_curie_prefix and prediction.startswith("ORPHA:"):
        prediction = "Orphanet:" + prediction[len("ORPHA:"):]
        warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")

    # Mondo subjects linked to the prediction by an exact match or a xref.
    these_mondos = [
        m.subject_id
        for m in mondo_adapter.sssom_mappings([prediction])
        if m.predicate_id in ("skos:exactMatch", "oio:hasDbXref")
        and m.subject_source == 'MONDO'
    ]

    if not these_mondos:
        # Nothing to walk: the prediction could not be grounded to Mondo.
        return 0.0

    # reflexive=True makes the mapped Mondo terms themselves part of the walk.
    for mondo_descendant in mondo_adapter.descendants(these_mondos, predicates=[IS_A], reflexive=True):
        if ground_truth in omim_mappings(mondo_descendant, mondo_adapter):
            # prediction is a MONDO that maps to a correct OMIM via a descendant
            return PARTIAL_SCORE
    return 0.0

# Instantiate the adapter. NOTE(review): this rebinds the imported factory
# name `mondo_adapter` to the adapter instance it returns.
mondo_adapter = mondo_adapter()

# Smoke tests for the scorer, run against the live adapter.
# Identical IDs score in full.
assert score_exomiser_result("OMIM:130050", "OMIM:130050", mondo_adapter) == FULL_SCORE
# The same Orphanet entry, written with either prefix, is expected to score partially.
assert score_exomiser_result("Orphanet:79255", "OMIM:230500", mondo_adapter) == PARTIAL_SCORE
assert score_exomiser_result("ORPHA:79255", "OMIM:230500", mondo_adapter) == PARTIAL_SCORE
# Ehlers Danlos vascular type versus GM1 gangliosidosis type 1: unrelated, no score.
assert score_exomiser_result("OMIM:130050", "OMIM:230500", mondo_adapter) == 0.0

  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")


In [3]:
# for each phenopacket file, find the corresponding exomiser results file
# make a dict of phenopacket_id -> exomiser_results_file

import os
import glob

# Define the directories
phenopacket_dir = "../supplemental_data/phenopackets/"
results_dir = "../outputdir_all_2024_07_04/pheval_disease_results/"

# Suffix that distinguishes a result file from the phenopacket it belongs to
result_suffix = "-pheval_disease_result.tsv"

# Get all result files
result_files = glob.glob(os.path.join(results_dir, f"*{result_suffix}"))

# Maps result file name -> phenopacket file name
results2phenopacket = {}

# Process each result file
for result_file in result_files:
    result_name = os.path.basename(result_file)

    # Strip the suffix from the end only; every globbed name ends with it, and
    # an occurrence in the middle of the name must be left alone.
    base_name = result_name[:-len(result_suffix)]

    # Escape the base name so glob metacharacters in a file name ('[', ']',
    # '*', '?') are matched literally instead of being read as a pattern.
    phenopacket_pattern = os.path.join(phenopacket_dir, f"{glob.escape(base_name)}.json")

    # Find matching phenopacket files
    matching_phenopackets = glob.glob(phenopacket_pattern)

    # Require exactly one matching phenopacket file
    if len(matching_phenopackets) == 1:
        results2phenopacket[result_name] = os.path.basename(matching_phenopackets[0])
    elif len(matching_phenopackets) == 0:
        raise ValueError(f"No matching phenopacket file found for {result_file}")
    else:
        raise ValueError(f"Multiple matching phenopacket files found for {result_file}")

print(f"Total mappings: {len(results2phenopacket)}")

Total mappings: 5212


In [4]:
import os
import glob
import json
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from oaklib import get_adapter
from warnings import warn

# number of lines to read from each result file after deduplication
num_lines_to_read = 16

# Define the directories
phenopacket_dir = "../supplemental_data/phenopackets/"
results_dir = "../outputdir_all_2024_07_04/pheval_disease_results/"
output_dir = "../outputdir_all_2024_07_04/pheval_disease_results_ADD_IS_CORRECT_COL/"

# Create output directory if it doesn't exist
Path(output_dir).mkdir(parents=True, exist_ok=True)

# Initialize the MONDO adapter once; the same instance is reused for every file
mondo = get_adapter("sqlite:obo:mondo")

# Get all result files
result_files = glob.glob(os.path.join(results_dir, "*-pheval_disease_result.tsv"))

# Result files that raised while being processed; reported after the loop
failed_files = []

for result_file in tqdm(result_files, desc="Processing files"):
    try:
        # Get the base name of the result file (without extension and suffix)
        base_name = os.path.basename(result_file).replace("-pheval_disease_result.tsv", "")

        # Construct the path for the matching phenopacket file
        phenopacket_file = os.path.join(phenopacket_dir, f"{base_name}.json")

        # Check if the phenopacket file exists
        if not os.path.exists(phenopacket_file):
            warn(f"!!!!!!!!!!!!!!!!!!No matching phenopacket file found for {result_file}!!!!!!!!!!!!!!!!!")
            continue

        # Read the result file (reading all rows first)
        df = pd.read_csv(result_file, sep='\t')

        # Drop every repeated 'disease_identifier' (not only consecutive
        # repeats), keeping the first, i.e. best-ranked, occurrence.
        df = df.loc[~df['disease_identifier'].duplicated(keep='first')]

        # Keep only the first 'num_lines_to_read' rows. The explicit copy makes
        # the frame independent of the one read from disk, so the column
        # assignments below do not write into a slice.
        df = df.head(num_lines_to_read).copy()

        # Read the phenopacket file
        with open(phenopacket_file, 'r', encoding='utf-8') as f:
            phenopacket_data = json.load(f)

        # Extract the correct disease ID (taken from the first interpretation)
        correct_id = phenopacket_data['interpretations'][0]['diagnosis']['disease']['id']

        # Add the correct_ID column
        df['correct_ID'] = correct_id

        # Score each prediction, advancing the per-file progress bar as each
        # row is actually scored rather than after all of them are done.
        with tqdm(total=len(df), desc=f"Processing rows in {base_name}") as row_progress_bar:
            def score_and_tick(disease_id):
                """Score one prediction and advance the row progress bar."""
                score = score_exomiser_result(disease_id, correct_id, mondo)
                row_progress_bar.update(1)
                return score

            df['grounded_score'] = df['disease_identifier'].apply(score_and_tick)

        # Add the is_correct column: any positive score counts as correct
        df['is_correct'] = df['grounded_score'] > 0  # 0.0 is False, 0.5 or 1.0 is True

        # Recalculate rank without changing the order
        df['rank'] = range(1, len(df) + 1)

        # Write the updated dataframe to the new directory
        output_file = os.path.join(output_dir, os.path.basename(result_file))
        df.to_csv(output_file, sep='\t', index=False)

    except Exception as e:
        # Best effort: report the failure, remember the file, and carry on.
        print(f"Error processing {result_file}: {str(e)}")
        failed_files.append(result_file)
        continue  # Skip to the next file

# Only claim success when no file failed
if failed_files:
    print(f"Processing completed with errors in {len(failed_files)} of {len(result_files)} files.")
else:
    print("Processing completed successfully.")

Processing files:   0%|          | 0/5212 [00:00<?, ?it/s]
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")

Processing rows in PMID_27220909_proband: 100%|██████████| 16/16 [00:14<00:00,  1.14it/s][A
Processing files:   0%|          | 1/5212 [00:14<20:24:30, 14.10s/it]
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")

Processing rows in PMID_36446582_Ockeloen2015_P11: 100%|██████████| 16/16 [00:20<00:00,  1.26s/it][A
Processing files:   0%|          | 2/5212 [00:34<25:36:16, 17.69s/it]
  warn(f"Replaced ORPHA prefix with Orphanet for {prediction}")
  warn(f"Replaced ORPHA prefix

Processing completed successfully.





In [7]:
import shutil
import os

def rename_dir(src, dst):
    """
    Rename directory `src` to `dst`, reporting what happened on stdout.

    Nothing is moved when `src` is missing or when `dst` already exists. The
    second check matters because shutil.move() places `src` *inside* an
    existing destination directory instead of renaming it, which is what
    happens if this cell is run a second time.

    Args:
        src (str): Path of the directory to rename.
        dst (str): New path for the directory.
    """
    if not os.path.exists(src):
        print(f"Directory '{src}' does not exist.")
        return
    if os.path.exists(dst):
        print(f"Destination '{dst}' already exists; not renaming '{src}'.")
        return
    shutil.move(src, dst)
    print(f"Renamed '{src}' to '{dst}'")


# Define original and new directory names with the prepended path
base_path = '../outputdir_all_2024_07_04'

original_dir_1 = os.path.join(base_path, 'pheval_disease_results')
new_dir_1 = os.path.join(base_path, 'pheval_disease_results_no_correct_col_no_removal_of_dups')

original_dir_2 = os.path.join(base_path, 'pheval_disease_results_ADD_IS_CORRECT_COL')
new_dir_2 = os.path.join(base_path, 'pheval_disease_results')

# Rename the directories. Order matters: the first rename frees the name that
# the second rename takes over.
rename_dir(original_dir_1, new_dir_1)
rename_dir(original_dir_2, new_dir_2)

Renamed '../outputdir_all_2024_07_04/pheval_disease_results' to '../outputdir_all_2024_07_04/pheval_disease_results_no_correct_col_no_removal_of_dups'
Renamed '../outputdir_all_2024_07_04/pheval_disease_results_ADD_IS_CORRECT_COL' to '../outputdir_all_2024_07_04/pheval_disease_results'


In [5]:
import os
import subprocess

import sys

# Directory the plotting script is run from. It is handed to the child process
# via `cwd` instead of os.chdir(), so the notebook's own working directory is
# left alone and the cell can be re-run.
script_dir = 'src/malco/post_process'

# Define the command to run. sys.executable is the interpreter running this
# kernel, so the script runs in the same environment as the notebook. The
# relative paths below are resolved against script_dir.
command = [
    sys.executable, 'make_hits_at_n_plot_exomiser.py',
    '-i', '../../../outputdir_all_2024_07_04/pheval_disease_results/',
    '-o', '../../../outputdir_all_2024_07_04/plots/topn_result_exomiser.tsv',
    '-p', '../../../outputdir_all_2024_07_04/plots/topn_result_exomiser_plot.png'
]

# Run the command
result = subprocess.run(command, cwd=script_dir, capture_output=True, text=True)

# Print the command output
print(result.stdout)
if result.stderr:
    print("Error:", result.stderr)

# stderr may hold mere warnings; the exit status is what signals failure
if result.returncode != 0:
    print(f"Command failed with exit status {result.returncode}")