In [None]:
from oaklib import get_adapter

from malco.post_process.compute_mrr import mondo_adapter
# This takes the raw results from Exomiser/PhEval and converts them into a TSV of
# ranked differential diagnoses and scores for each case, e.g.:
# rank	score	disease_name	disease_identifier
# 1	0.8802	SCID, autosomal recessive, T-negative/B-positive type	OMIM:600802
# 2	0.868	Combined immunodeficiency with faciooculoskeletal anomalies	ORPHA:221139
# 2	0.868	Combined immunodeficiency with faciooculoskeletal anomalies	ORPHA:221139
# 4	0.8626	Severe combined immunodeficiency, X-linked	OMIM:300400

from malco.post_process.post_process_results_format import create_exomiser_standardised_results
from pathlib import Path

# Input directory holding the raw Exomiser results, and the directory handed to
# the converter as its output location.
# NOTE(review): both are absolute, machine-specific paths. `.expanduser()` only
# rewrites a leading `~`, so it leaves these two paths unchanged.
exomiser_raw_results_dir = Path(
    '/Volumes/files_2019/EXOMISER_PHENOTYPE_ONLY_2024_08_24/exomiser-13.3.0-2109/raw_results'
).expanduser()
standardised_output_dir = Path(
    '/Users/jtr4v/PythonProject/malco/outputdir_all_2024_07_04/'
).expanduser()

# Only the disease-level analysis is enabled; variant/gene analysis and ACMG
# output are switched off. Results are ordered by `combined_score`, descending.
create_exomiser_standardised_results(
    results_dir=exomiser_raw_results_dir,
    output_dir=standardised_output_dir,
    score_name='combined_score',
    sort_order='descending',
    variant_analysis=False,
    gene_analysis=False,
    disease_analysis=True,
    include_acmg=False,
)

In [3]:
# need to also generate an exomiser "correct diagnosis" file

# for each phenopacket file, find the corresponding exomiser results file
# make a dict of phenopacket_id -> exomiser_results_file
# then use that dict to generate the correct diagnosis file
# also make new column in the correct diagnosis file for the correct diagnosis?
import os
import glob
import warnings
        
# Define the directories
phenopacket_dir = "../supplemental_data/phenopackets/"
results_dir = "../outputdir_all_2024_07_04/pheval_disease_results/"


def map_phenopackets_to_results(phenopacket_files, results_dir):
    """Map each phenopacket file name to its PhEval disease result file name.

    Args:
        phenopacket_files: Iterable of paths to phenopacket ``.json`` files.
        results_dir: Directory searched for
            ``<base_name>-pheval_disease_result.tsv`` files.

    Returns:
        dict mapping the phenopacket file name (basename) to the matching
        result file name (basename). Phenopackets without a result file are
        left out of the mapping.

    Warns:
        UserWarning: for every phenopacket that has no matching result file.

    Raises:
        ValueError: if more than one result file matches a phenopacket.
    """
    mapping = {}
    for phenopacket_file in phenopacket_files:
        # Base name of the phenopacket file (without extension)
        base_name = os.path.splitext(os.path.basename(phenopacket_file))[0]

        # The base name is a literal file name, not a pattern: escape it so that
        # glob metacharacters in it ("[", "]", "*", "?") are matched verbatim
        # instead of being interpreted as wildcards / character classes.
        result_pattern = os.path.join(
            results_dir, f"{glob.escape(base_name)}-pheval_disease_result.tsv"
        )
        matching_results = glob.glob(result_pattern)

        if len(matching_results) == 1:
            mapping[os.path.basename(phenopacket_file)] = os.path.basename(matching_results[0])
        elif len(matching_results) == 0:
            # Deliberately a warning rather than an error: a missing result
            # file should not abort the whole run.
            warnings.warn(f"No matching result file found for {phenopacket_file}")
        else:
            raise ValueError(f"Multiple matching result files found for {phenopacket_file}")
    return mapping


# Get all phenopacket files (sorted so the mapping order is reproducible across runs)
phenopacket_files = sorted(glob.glob(os.path.join(phenopacket_dir, "*.json")))

# Build the phenopacket file name -> result file name dictionary
phenopacket2results = map_phenopackets_to_results(phenopacket_files, results_dir)

print(f"Total mappings: {len(phenopacket2results)}")

Total mappings: 5212




In [4]:
# Display the phenopacket file name -> result file name mapping built in the previous cell.
phenopacket2results

{'PMID_34722527_individual_048-051_1_Thaddeus_P__Dryja_NullRPGRIP1Al.json': 'PMID_34722527_individual_048-051_1_Thaddeus_P__Dryja_NullRPGRIP1Al-pheval_disease_result.tsv',
 'PMID_23407777_23407777_P1.json': 'PMID_23407777_23407777_P1-pheval_disease_result.tsv',
 'PMID_31239556_individual22father.json': 'PMID_31239556_individual22father-pheval_disease_result.tsv',
 'PMID_36189931_Individual25.json': 'PMID_36189931_Individual25-pheval_disease_result.tsv',
 'PMID_29469822_Family4II-2.json': 'PMID_29469822_Family4II-2-pheval_disease_result.tsv',
 'PMID_31021519_SATB2-47fromZarateetal2018aBenganietal.json': 'PMID_31021519_SATB2-47fromZarateetal2018aBenganietal-pheval_disease_result.tsv',
 'PMID_37196654_Individual5.json': 'PMID_37196654_Individual5-pheval_disease_result.tsv',
 'PMID_36446582_Novara2017_P2.json': 'PMID_36446582_Novara2017_P2-pheval_disease_result.tsv',
 'PMID_29122497_29122497_P8.json': 'PMID_29122497_29122497_P8-pheval_disease_result.tsv',
 'PMID_35190816_STX_EG1010P.json':

In [6]:
import os
import glob
import warnings

# Define the directories
phenopacket_dir = "../supplemental_data/phenopackets/"
results_dir = "../outputdir_all_2024_07_04/pheval_disease_results/"

# Suffix that turns a phenopacket base name into its result file name
RESULT_SUFFIX = "-pheval_disease_result.tsv"


def map_results_to_phenopackets(result_files, phenopacket_dir):
    """Map each PhEval disease result file name to its phenopacket file name.

    Args:
        result_files: Iterable of paths to
            ``<base_name>-pheval_disease_result.tsv`` files.
        phenopacket_dir: Directory searched for ``<base_name>.json`` files.

    Returns:
        dict mapping the result file name (basename) to the matching
        phenopacket file name (basename).

    Raises:
        ValueError: if a result file has no matching phenopacket, or more
            than one.
    """
    mapping = {}
    for result_file in result_files:
        result_name = os.path.basename(result_file)

        # Strip only the trailing suffix; str.replace() would also remove any
        # earlier occurrence of the same text inside the name.
        if result_name.endswith(RESULT_SUFFIX):
            base_name = result_name[: -len(RESULT_SUFFIX)]
        else:
            base_name = result_name

        # The base name is a literal file name, not a pattern: escape it so that
        # glob metacharacters in it ("[", "]", "*", "?") are matched verbatim.
        phenopacket_pattern = os.path.join(phenopacket_dir, f"{glob.escape(base_name)}.json")
        matching_phenopackets = glob.glob(phenopacket_pattern)

        if len(matching_phenopackets) == 1:
            mapping[result_name] = os.path.basename(matching_phenopackets[0])
        elif len(matching_phenopackets) == 0:
            raise ValueError(f"No matching phenopacket file found for {result_file}")
        else:
            raise ValueError(f"Multiple matching phenopacket files found for {result_file}")
    return mapping


# Get all result files (sorted so the mapping order is reproducible across runs)
result_files = sorted(glob.glob(os.path.join(results_dir, "*-pheval_disease_result.tsv")))

# Build the result file name -> phenopacket file name dictionary
results2phenopacket = map_results_to_phenopackets(result_files, phenopacket_dir)

print(f"Total mappings: {len(results2phenopacket)}")

Total mappings: 5212

Example mappings:
1. PMID_27220909_proband-pheval_disease_result.tsv -> PMID_27220909_proband.json
2. PMID_36446582_Ockeloen2015_P11-pheval_disease_result.tsv -> PMID_36446582_Ockeloen2015_P11.json
3. PMID_21683322_16-pheval_disease_result.tsv -> PMID_21683322_16.json
4. PMID_37962958_19-pheval_disease_result.tsv -> PMID_37962958_19.json
5. PMID_15266616_137-pheval_disease_result.tsv -> PMID_15266616_137.json


In [None]:
# import os
# import glob
# import json
# import pandas as pd
# from pathlib import Path
# from tqdm import tqdm
# from malco.post_process.mondo_score_utils import score_grounded_result
# from oaklib import get_adapter
# 
# # Define the directories
# phenopacket_dir = "../supplemental_data/phenopackets/"
# results_dir = "../outputdir_all_2024_07_04/pheval_disease_results/"
# output_dir = "../outputdir_all_2024_07_04/pheval_disease_results_IS_CORRECT_USE_MONDO/"
# 
# # Create output directory if it doesn't exist
# Path(output_dir).mkdir(parents=True, exist_ok=True)
# 
# # Initialize the MONDO adapter once, before looping over the result files
# mondo = get_adapter("sqlite:obo:mondo")
#         
# # Get all result files
# result_files = glob.glob(os.path.join(results_dir, "*-pheval_disease_result.tsv"))
# 
# for result_file in tqdm(result_files, desc="Processing files"):
#     try:
#         # Get the base name of the result file (without extension and suffix)
#         base_name = os.path.basename(result_file).replace("-pheval_disease_result.tsv", "")
#         
#         # Construct the path for the matching phenopacket file
#         phenopacket_file = os.path.join(phenopacket_dir, f"{base_name}.json")
#         
#         # Check if the phenopacket file exists
#         if not os.path.exists(phenopacket_file):
#             print(f"No matching phenopacket file found for {result_file}")
#             continue
#         
#         # Read the result file
#         df = pd.read_csv(result_file, sep='\t')
#         
#         # Read the phenopacket file
#         with open(phenopacket_file, 'r') as f:
#             phenopacket_data = json.load(f)
#         
#         # Extract the correct disease ID
#         correct_id = phenopacket_data['interpretations'][0]['diagnosis']['disease']['id']
#         
#         # Add the correct_ID column
#         df['correct_ID'] = correct_id
#         
#         from tqdm.auto import tqdm
# 
#         # Calculate the score for each prediction
#         tqdm.pandas(desc="Calculating scores")
#         df['grounded_score'] = df['disease_identifier'].progress_apply(
#             lambda x: score_grounded_result(x, correct_id, mondo)
#         )
# 
#         # Add the is_correct column (score of 1.0 means exact match)
#         df['is_correct'] = df['grounded_score'] == 1.0
#         
#         # Write the updated dataframe to the new directory
#         output_file = os.path.join(output_dir, os.path.basename(result_file))
#         df.to_csv(output_file, sep='\t', index=False)
#     except Exception as e:
#         print(f"Error processing {result_file}: {str(e)}")
#         continue  # Skip to the next file
# 
# print("Processing completed successfully.")

In [None]:
# import os
# import glob
# import pandas as pd
# import matplotlib.pyplot as plt
# from collections import defaultdict
# 
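# # Define the directories
# # NOTE(review): the commented-out scoring cell above writes to
# # ".../pheval_disease_results_IS_CORRECT_USE_MONDO/", which matches neither name below — confirm paths.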
# dir1 = "../outputdir_all_2024_07_04/pheval_disease_results_WITH_IS_CORRECT/"
# dir2 = "../outputdir_all_2024_07_04/pheval_disease_results_WITH_IS_CORRECT_USING_MONDO/"
# 
# def get_correct_rank(file_path):
#     df = pd.read_csv(file_path, sep='\t')
#     correct_rows = df[df['is_correct'] == True]
#     return correct_rows['rank'].min() if not correct_rows.empty else float('inf')
# 
# results = defaultdict(lambda: {'dir1': float('inf'), 'dir2': float('inf')})
# 
# # Process files in both directories
# for directory in [dir1, dir2]:
#     for file_path in glob.glob(os.path.join(directory, "*.tsv")):
#         file_name = os.path.basename(file_path)
#         rank = get_correct_rank(file_path)
#         results[file_name]['dir1' if directory == dir1 else 'dir2'] = rank
# 
# # Analyze results
# dir1_better = 0
# dir2_better = 0
# equal = 0
# 
# for file_name, ranks in results.items():
#     if ranks['dir1'] < ranks['dir2']:
#         dir1_better += 1
#     elif ranks['dir2'] < ranks['dir1']:
#         dir2_better += 1
#     else:
#         equal += 1
# 
# print(f"Times dir1 (WITHOUT_MONDO) was better: {dir1_better}")
# print(f"Times dir2 (WITH_MONDO) was better: {dir2_better}")
# print(f"Times they were equal: {equal}")
# 
# # Create a bar plot
# labels = ['WITHOUT_MONDO better', 'WITH_MONDO better', 'Equal']
# values = [dir1_better, dir2_better, equal]
# 
# plt.figure(figsize=(10, 6))
# plt.bar(labels, values)
# plt.title('Comparison of Correct Diagnosis Ranks')
# plt.ylabel('Number of Cases')
# plt.savefig('rank_comparison.png')
# plt.close()
# 
# # Create a scatter plot
# plt.figure(figsize=(10, 6))
# x = [ranks['dir1'] for ranks in results.values()]
# y = [ranks['dir2'] for ranks in results.values()]
# plt.scatter(x, y, alpha=0.5)
# plt.plot([0, max(max(x), max(y))], [0, max(max(x), max(y))], 'r--')  # Diagonal line
# plt.xlabel('Rank in WITHOUT_MONDO')
# plt.ylabel('Rank in WITH_MONDO')
# plt.title('Comparison of Ranks between WITHOUT_MONDO and WITH_MONDO')
# plt.savefig('rank_scatter.png')
# plt.close()