In [None]:
import json, os,re
from typing import List
from tqdm import tqdm
from rapidfuzz import process

# Folder that read_exomiser_folder() scans for "*.tsv" result files.
response_dir = "../data/exomiser_14_response"

# Module-level accumulators; each starts out as its own empty dict.
# exomiser_results and gold_prompts_dict are filled by read_exomiser_folder().
hash_to_name, exomiser_results, gold_prompts_dict = {}, {}, {}
                                   
def normalize(s):
    """
    Reduce a string to a comparison key.

    Every character that is not an ASCII letter or digit is removed and the
    remainder is lower-cased.

    Args:
        s (str): Text to normalize.

    Returns:
        str: Lower-cased string containing only the characters a-z and 0-9.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9]', '', s)
    return alnum_only.lower()

def read_result_json(path: str) -> List[dict]:
    """
    Read a JSON Lines file into a list of parsed records.

    Args:
        path (str): Path to the file; each non-blank line must hold one JSON
            document.

    Returns:
        List[dict]: The parsed content of every non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    responses = []
    # Decode as UTF-8 explicitly instead of relying on the platform default.
    with open(path, "r", encoding="utf-8") as raw_result:
        for line in raw_result:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no record and would make json.loads fail.
            if not line.strip():
                continue
            responses.append(json.loads(line))
    return responses

def build_exomiser_result(results: List[dict]):
    """
    Render result rows as a newline-delimited list of disease names.

    Args:
        results (List[dict]): Rows that each provide a 'rank' and a
            'disease_name' entry.

    Returns:
        str | None: The 'disease_name' of every row, ordered by ascending
        'rank' and joined with newlines; None when `results` is empty or None.
    """
    if not results:
        return None

    # Order by rank on a copy so the caller's list is left untouched.
    by_rank = list(results)
    by_rank.sort(key=lambda row: row['rank'])
    return "\n".join(f"{row['disease_name']}" for row in by_rank)

def _prompt_id_from_result_file(file_name: str) -> str:
    """Derive the gold-prompt ID used as lookup key from a result file name."""
    name = file_name.split("-pheval_disease_result.tsv")[0]
    name = name.replace("-", "_") + "_en-prompt.txt"
    if "STX" in name and name.startswith("PMID"):
        # Drop everything before the first "STX" so the ID starts with it.
        name = "STX" + name.split("STX", 1)[1]
    return name


def _parse_result_rows(handle) -> List[dict]:
    """Parse an open result TSV (first line is the header) into row dicts."""
    handle.readline()  # skip header
    rows = []
    for line in handle:
        # Remove only the line terminator: a full strip() would also remove
        # trailing tabs and change the column count when the last field is empty.
        line = line.rstrip("\r\n")
        if not line.strip():
            continue  # blank line, nothing to unpack
        (rank, score, disease_name, disease_identifier,
         correct_ID, grounded_score, is_correct) = line.split("\t")
        rows.append({
            "rank": int(rank),
            "score": float(score),
            "disease_name": disease_name,
            "disease_identifier": disease_identifier,
            "correct_ID": correct_ID,
            "grounded_score": float(grounded_score),
            "is_correct": is_correct.strip() == "true"
        })
    return rows


def read_exomiser_folder():
    """
    Load the gold prompts and every result TSV in `response_dir`.

    Side effects:
        * `gold_prompts_dict` gets one entry per record of
          "../data/exomiser-gold.jsonl", keyed by the record's "id".
        * `exomiser_results` gets one entry per result file, keyed by the gold
          prompt ID derived from the file name. When that ID is not a gold
          key, the closest gold key according to `process.extractOne` is used
          instead; a key that is already present is never overwritten by such
          a fuzzy match.

    Raises:
        ValueError: If a non-blank data row does not have exactly seven
            tab-separated columns, or a numeric column cannot be parsed.
        KeyError: If the gold record selected by the fuzzy fallback has no
            "gold" field.
    """
    gold = read_result_json("../data/exomiser-gold.jsonl")
    for prompt in gold:
        gold_prompts_dict[prompt["id"]] = prompt

    # iterate the response_dir folder for tsv files
    for file in tqdm(os.listdir(response_dir)):
        if not file.endswith(".tsv"):
            continue

        name = _prompt_id_from_result_file(file)
        # Read the whole file first so the handle is closed before matching.
        with open(os.path.join(response_dir, file), "r", encoding="utf-8") as f:
            results = _parse_result_rows(f)

        gold_entry = gold_prompts_dict.get(name)
        if gold_entry is not None and "gold" in gold_entry:
            exomiser_results[name] = {
                "id": name,
                "prompt": "",
                "response": build_exomiser_result(results),
                "gold": gold_entry["gold"]
            }
            continue

        # No usable exact ID: fall back to the closest gold prompt ID.
        best = process.extractOne(name, gold_prompts_dict.keys())
        if best is None:
            # extractOne yields nothing when there are no candidates to compare.
            print(f"No gold prompt available to match {name}, skipping.")
            continue
        match, score, _ = best

        # Mutual substring containment of the normalized IDs is the same as
        # equality, so compare them directly.
        if normalize(match) != normalize(name):
            # NOTE(review): the original statement was truncated in the source;
            # this message is reconstructed from its surviving tail - confirm wording.
            print(f"No exact match after normalization. {match} vs {name}")

        if match in exomiser_results:
            print(f"Duplicate match found for {name} -> {match}, skipping.")
            print(f"Existing entry: {exomiser_results[match]}")
        else:
            exomiser_results[match] = {
                "id": match,
                "prompt": "",
                "response": build_exomiser_result(results),
                "gold": gold_prompts_dict[match]["gold"]
            }
# Fill the module-level gold_prompts_dict / exomiser_results dicts from disk,
# then report how many result entries were collected.
read_exomiser_folder()
print(len(exomiser_results))


 58%|█████▊    | 3013/5212 [00:00<00:00, 6534.60it/s]

 58%|█████▊    | 3013/5212 [00:00<00:00, 6534.60it/s]

No normalized match found for PMID_29330883_Subject2_en-prompt.txt


 58%|█████▊    | 3013/5212 [00:00<00:00, 6534.60it/s]

No normalized match found for PMID_29330883_Subject2_en-prompt.txt


100%|██████████| 5212/5212 [00:00<00:00, 6542.95it/s]
100%|██████████| 5212/5212 [00:00<00:00, 6542.95it/s]


In [2]:
# Gold prompt IDs that have no entry in exomiser_results.
missing_ids = set(gold_prompts_dict.keys()) - set(exomiser_results.keys())
# List every gold ID that is missing from the results.
for missing_id in missing_ids:
    print(f"Missing ID: {missing_id}")

# Summary counts: collected results first, then gold IDs without a result.
print(len(exomiser_results))
print(len(missing_ids))

Missing ID: STX_26633542_No_ID_2_en-prompt.txt
5211
1


In [45]:
# Sanity check: the "gold" stored on each result must equal the "gold" of the
# gold prompt registered under the same ID.
# NOTE(review): read_exomiser_folder copies result["gold"] from gold_prompts_dict
# under this very key, so this presumably cannot reveal a wrong fuzzy match -
# confirm whether a stronger check (e.g. against the source file name) is needed.
mismatched = 0
for name, result in exomiser_results.items():
    gold = gold_prompts_dict.get(name, {})
    if result["gold"] == gold.get("gold"):
        continue
    mismatched += 1
    print(f"Mismatch for {name}:")
    print(f"  Extracted gold: {result['gold']}")
    print(f"  Original gold: {gold.get('gold')}")
print(f"Total mismatches found: {mismatched}")

Total mismatches found: 0


In [3]:
# Export the collected results as JSON Lines: one JSON object per line, in the
# iteration order of exomiser_results.
with open("../data/responses/exomiser_14.jsonl", "w") as f:
    for line in tqdm(exomiser_results.values(), total=len(exomiser_results)):
        # `line` is one result record (a dict), not a line of text.
        f.write(json.dumps(line) + "\n")



100%|██████████| 5211/5211 [00:00<00:00, 145873.50it/s]
100%|██████████| 5211/5211 [00:00<00:00, 145873.50it/s]
