# 01 Restructure Data 

In [3]:
import json
import os
from collections import defaultdict


def restructure_medical_data(data):
    """
    Restructure medical case data to group images by study.

    Series entries in ``data["images"]`` that share a ``study_title`` are
    collected, in first-seen order, into groups named ``group1``, ``group2``,
    ... Each group holds the list of its series (``series_name`` and ``urls``)
    and a single caption.

    The caption of a group is the caption of the first series of that study
    whose caption is usable, i.e. a non-blank string other than the
    ``"Caption not available"`` placeholder. If no series of the study has a
    usable caption, the caption of the study's first series is kept as-is.

    Args:
        data: Original case dictionary. Must contain the keys ``url``,
            ``title``, ``modalities``, ``patient_age``, ``patient_gender``,
            ``presentation``, ``case_discussion`` and ``images``. Every entry
            of ``images`` must contain ``study_title``, ``series_name`` and
            ``urls``; the first entry of each study must contain ``caption``.

    Returns:
        A new dictionary with the same metadata and with ``images`` regrouped
        by study.

    Raises:
        KeyError: If one of the required keys is missing.
    """

    def _is_usable(caption):
        # A caption is only worth keeping if it carries real text.
        return isinstance(caption, str) and caption.strip() not in (
            "",
            "Caption not available",
        )

    # Create new structure with same metadata
    restructured = {
        "url": data["url"],
        "title": data["title"],
        "modalities": data["modalities"],
        "patient_age": data["patient_age"],
        "patient_gender": data["patient_gender"],
        "presentation": data["presentation"],
        "case_discussion": data["case_discussion"],
        "images": {},
    }

    # Group series by study; a plain dict keeps studies in first-seen order.
    studies = {}

    for series_data in data["images"].values():
        study_title = series_data["study_title"]

        if study_title not in studies:
            # The first series of a study provides the initial caption.
            studies[study_title] = {"series": [], "caption": series_data["caption"]}
        elif not _is_usable(studies[study_title]["caption"]):
            # The caption stored so far is blank or a placeholder: upgrade it
            # if this series offers a real one. ``get`` keeps later series
            # without a caption key acceptable.
            candidate = series_data.get("caption")
            if _is_usable(candidate):
                studies[study_title]["caption"] = candidate

        # Add series info to the study group
        studies[study_title]["series"].append(
            {"series_name": series_data["series_name"], "urls": series_data["urls"]}
        )

    # Create the final structure: one numbered group per study.
    for i, study in enumerate(studies.values(), 1):
        restructured["images"][f"group{i}"] = {
            "series": study["series"],
            "caption": study["caption"],
        }

    return restructured


# Directory holding the collected case files; restructured files are written
# to a "restructured" sub-directory of it.
# NOTE(review): this is a machine-specific absolute path; adjust it when
# running the notebook elsewhere.
path = "/home/justjosh/Turing-Test/moremi_reasoning/src/data/radiopedia"
output_dir = os.path.join(path, "restructured")
os.makedirs(output_dir, exist_ok=True)

# Modalities to process, mapped to their input file. Commented-out entries
# are currently disabled.
modalities = {
    # "ct": {"input_file": f"{path}/ct_cases_collected.json"},
    # "x-ray": {
    #     "input_file": f"{path}/x-ray_cases_collected.json",
    # },
    # "ultrasound": {
    #     "input_file": f"{path}/ultrasound_cases_collected.json",
    # },
    "mammography": {
        "input_file": f"{path}/mammography_cases_collected.json",
    },
    # "mri": {
    #     "input_file": f"{path}/mri_cases_collected.json",
    # },
}

# Process each modality in the dictionary
for modality_key, settings in modalities.items():
    input_file = settings["input_file"]

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            cases = json.load(f)
    except FileNotFoundError:
        print(
            f"Warning: Input file not found for modality '{modality_key}': {input_file}"
        )
        continue
    except json.JSONDecodeError:
        # A corrupt input file should skip this modality, not abort the run.
        print(
            f"Warning: Could not decode JSON for modality '{modality_key}': {input_file}"
        )
        continue

    all_restructured_data = []
    for case in cases:
        try:
            restructured_data = restructure_medical_data(case)
            all_restructured_data.append(restructured_data)
        except Exception as error:
            # Log error for a specific case and continue with the next one.
            # The case may not be a dict (e.g. malformed input), so do not
            # call .get() on it unconditionally inside the handler.
            case_url = case.get("url", "N/A") if isinstance(case, dict) else "N/A"
            print(f"Error restructuring case {case_url}: {error}")

    # Save the restructured data to a new file
    output_filename = f"{modality_key}_cases_restructured.json"
    output_file = os.path.join(output_dir, output_filename)
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(all_restructured_data, f, indent=2)

    print(
        f"Finished restructuring for '{modality_key}'. Output saved to '{output_file}'."
    )

Finished restructuring for 'mammography'. Output saved to '/home/justjosh/Turing-Test/moremi_reasoning/src/data/radiopedia/restructured/mammography_cases_restructured.json'.


# 02 Filter By Modality

In [4]:
import json
import os

# The path to the directory containing the restructured JSON files.
# NOTE(review): both paths below are machine-specific absolute paths; adjust
# them when running the notebook elsewhere.
path = "/home/justjosh/Turing-Test/moremi_reasoning/src/data/radiopedia/restructured"
# The path to the directory where the output files will be saved.
output_dir = "/home/justjosh/Turing-Test/moremi_reasoning/src/data/radiopedia/modality_specific_cases"
os.makedirs(
    output_dir, exist_ok=True
)  # Create the output directory if it doesn't exist.

# A dictionary defining the modalities and their corresponding input files.
# Commented-out entries are currently disabled.
modalities = {
    # "ct": {"input_file": f"{path}/ct_cases_restructured.json"},
    # "x-ray": {
    #     "input_file": f"{path}/x-ray_cases_restructured.json",
    # },
    # "ultrasound": {
    #     "input_file": f"{path}/ultrasound_cases_restructured.json",
    # },
    "mammography": {
        "input_file": f"{path}/mammography_cases_restructured.json",
    },
    # "mri": {
    #     "input_file": f"{path}/mri_cases_restructured.json",
    # },
}

# Process each modality defined in the dictionary.
for modality_key, settings in modalities.items():
    input_file = settings["input_file"]
    # Cases are matched against the upper-cased modality key.
    modality_to_check = modality_key.upper()

    print(f"--- Processing modality: {modality_to_check} ---")
    print(f"Input file: {input_file}")

    try:
        with open(input_file, "r", encoding="utf-8") as f:
            cases_data = json.load(f)
    except FileNotFoundError:
        print(f"Warning: Input file not found. Skipping: {input_file}")
        continue
    except json.JSONDecodeError:
        print(f"Warning: Could not decode JSON from file. Skipping: {input_file}")
        continue

    filtered_cases = []  # Records for cases with a valid caption.
    failed_cases = []  # URLs of cases that list the modality but were rejected.

    # Iterate over each case in the loaded data.
    for case in cases_data:
        try:
            case_modalities = case.get("modalities", [])

            # Cases that do not list the target modality are skipped.
            if modality_to_check not in case_modalities:
                continue

            # Find the index of the modality to locate the corresponding image group.
            # NOTE(review): this assumes group numbers line up with the
            # modality's position in the list - confirm against the step
            # that produced the restructured file.
            index = case_modalities.index(modality_to_check)
            image_group_key = f"group{index + 1}"

            images_dict = case.get("images", {})
            image_group = images_dict.get(image_group_key)

            if not image_group:
                # The expected image group was not found.
                failed_cases.append(case["url"])
                continue

            # `or ""` routes a null caption through the normal
            # "missing caption" path instead of raising on .strip().
            caption = (image_group.get("caption") or "").strip()
            # A case is considered failed if the caption is missing or explicitly unavailable.
            if not caption or caption == "Caption not available":
                failed_cases.append(case["url"])
                continue

            # The caption is valid: create a new record with the relevant data.
            filtered_case_data = {
                "case_url": case["url"],
                "modalities": [modality_to_check],
                "patient_age": case.get("patient_age"),
                "patient_gender": case.get("patient_gender"),
                "presentation": case.get("presentation"),
                "case_discussion": case.get("case_discussion"),
                "images": image_group,
            }
            filtered_cases.append(filtered_case_data)

        except Exception as e:
            # The case may not be a dict (e.g. malformed input); guard the
            # lookups so the handler itself cannot raise and abort the run.
            case_is_dict = isinstance(case, dict)
            case_url = case.get("url", "N/A") if case_is_dict else "N/A"
            print(f"Error processing case {case_url}: {e}")
            if case_is_dict and case.get("url"):
                failed_cases.append(case["url"])

    # Define output file names for the processed modality.
    only_cases_file = os.path.join(output_dir, f"{modality_key}_only_cases.json")
    failed_cases_file = os.path.join(output_dir, f"{modality_key}_failed_cases.json")

    # Save the filtered cases.
    with open(only_cases_file, "w", encoding="utf-8") as f:
        json.dump(filtered_cases, f, indent=2)

    # Save the list of failed case URLs.
    with open(failed_cases_file, "w", encoding="utf-8") as f:
        json.dump(failed_cases, f, indent=2)

    print(f"Finished processing for '{modality_to_check}'.")
    print(f"  - {len(filtered_cases)} cases saved to '{only_cases_file}'")
    print(
        f"  - {len(failed_cases)} cases failed and were logged in '{failed_cases_file}'\n"
    )

--- Processing modality: MAMMOGRAPHY ---
Input file: /home/justjosh/Turing-Test/moremi_reasoning/src/data/radiopedia/restructured/mammography_cases_restructured.json
Finished processing for 'MAMMOGRAPHY'.
  - 59 cases saved to '/home/justjosh/Turing-Test/moremi_reasoning/src/data/radiopedia/modality_specific_cases/mammography_only_cases.json'
  - 0 cases failed and were logged in '/home/justjosh/Turing-Test/moremi_reasoning/src/data/radiopedia/modality_specific_cases/mammography_failed_cases.json'

