In [2]:
import argparse
import csv
import logging
from pathlib import Path
from typing import Dict

In [3]:
def inject_note_text(notes_map: Dict[int, str], admission: Dict) -> Dict:
    """Inject note text, and the text covered by each annotation, into an admission.

    The admission is modified in place and also returned.

    Args:
        notes_map: Mapping from note_id to the full text of that note.
        admission: Dict that may hold a list of note dicts under "notes".
            Each note may carry a "note_id" and a list of "annotations"
            whose "begin"/"end" values are used as slice offsets into the
            note text (both default to 0 when absent).

    Returns:
        The same ``admission`` object. Every note whose id is found in
        ``notes_map`` gains a "text" entry, and each of its annotations gains
        a "covered_text" entry equal to ``text[begin:end]``. Notes without an
        id, or whose id is not in ``notes_map``, are left untouched and a
        message is printed.
    """
    # An admission with no "notes" entry (or an explicit None) has nothing to
    # inject into, so it is passed through unchanged instead of raising.
    for note in admission.get("notes") or []:
        note_id = note.get("note_id")
        if note_id is None:
            print("No note_id found in note")
            continue

        text = notes_map.get(note_id)
        if text is None:
            print(f"No text found for note_id: {note_id}")
            continue

        note["text"] = text
        # `or []` also covers an "annotations" key that is present but None.
        for annotation in note.get("annotations") or []:
            begin = annotation.get("begin", 0)
            end = annotation.get("end", 0)
            annotation["covered_text"] = text[begin:end]

    return admission

In [4]:
def _make_out_path(csv_file: Path, input_dir: Path, out_dir: Path) -> Path:
    """Map a CSV file found under ``input_dir`` to its counterpart under ``out_dir``.

    As many leading path components as ``input_dir`` has are dropped from
    ``csv_file``; whatever remains is appended to ``out_dir``.
    """
    depth = len(input_dir.parts)
    relative_parts = csv_file.parts[depth:]
    return out_dir.joinpath(*relative_parts)


In [5]:
def inject_and_persist(notes_map: Dict[int, str], data_dir: Path, out_dir: Path):
    """Inject note text into every CSV file under data_dir and write the result.

    Args:
        notes_map: Mapping from note_id to note text, passed on to
            ``inject_note_text`` for every row.
        data_dir: Directory searched recursively for ``*.csv`` files.
        out_dir: Directory that receives the rewritten files, mirroring the
            layout below ``data_dir``. When falsy (e.g. ``None``) each input
            file is overwritten in place.
    """
    if out_dir:
        print(f"Injecting text and persisting to {out_dir.absolute()}")
    else:
        print("Injecting text in place")

    # Snapshot the file list before anything is written: glob() yields
    # matches lazily, so files created while iterating (for instance when
    # out_dir is located below data_dir) could otherwise be picked up as
    # inputs. Sorting additionally makes the processing order deterministic.
    csv_files = sorted(data_dir.glob("**/*.csv"))

    for csv_file in csv_files:
        # newline="" is what the csv module requires so that line breaks
        # embedded in quoted fields are read back unmodified.
        with open(csv_file, "r", newline="", encoding="utf8") as ifp:
            reader = csv.DictReader(ifp)
            headers = reader.fieldnames
            # NOTE(review): csv.DictReader yields flat rows whose values are
            # strings, while inject_note_text iterates row["notes"] as a list
            # of note dicts — confirm the CSV rows really carry that shape.
            injected_rows = [inject_note_text(notes_map, row) for row in reader]

        # Create output path
        if out_dir:
            out_path = _make_out_path(csv_file, data_dir, out_dir)
            out_path.parent.mkdir(parents=True, exist_ok=True)
        else:
            out_path = csv_file

        # Write injected rows to output file
        with open(out_path, "w", newline="", encoding="utf8") as ofp:
            if headers is None:
                # Empty input file: there is no header row to write, and
                # DictWriter.writeheader() would fail on fieldnames=None.
                # The output is left as an empty file.
                continue
            writer = csv.DictWriter(ofp, fieldnames=headers)
            writer.writeheader()
            writer.writerows(injected_rows)


In [6]:
def build_notes_map(noteevents: Path) -> Dict[int, str]:
    """Build mapping from note_id to text from NOTEEVENTS.csv

    The first row is treated as a header and skipped. For every other
    non-blank row, column 0 is parsed as the integer note id and column 10 is
    stored as the note text (presumably the ROW_ID and TEXT columns of
    NOTEEVENTS.csv — verify against the actual file layout).

    Args:
        noteevents: Path of the CSV file to read (UTF-8).

    Returns:
        Dict mapping note id to note text. Empty when the file has no data
        rows. If an id occurs more than once, the last row wins.

    Raises:
        ValueError: If column 0 of a row is not an integer.
        IndexError: If a non-blank row has fewer than 11 columns.
    """
    print(f"Loading {noteevents}")
    id_text_map = {}
    # newline="" keeps line breaks inside quoted fields exactly as stored.
    # Without it "\r\n" inside a note would be read as "\n", which shortens
    # the text and shifts any character offsets into it.
    with open(noteevents, "r", newline="", encoding="utf8") as ifp:
        reader = csv.reader(ifp)
        # Skip header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        for row in reader:
            if not row:
                # csv.reader yields an empty list for a blank line.
                continue
            id_text_map[int(row[0])] = row[10]
    return id_text_map

In [7]:
def main(noteevents_path: Path, data_dir: Path, out_dir: Path = None):
    """Load the note texts and inject them into the CSV files under data_dir.

    The note_id -> text mapping is built from ``noteevents_path`` first and
    then handed, together with ``data_dir`` and ``out_dir``, to
    ``inject_and_persist``.
    """
    inject_and_persist(
        notes_map=build_notes_map(noteevents_path),
        data_dir=data_dir,
        out_dir=out_dir,
    )

In [None]:
if __name__ == "__main__":
    # Placeholder locations: point these at the real files before running.
    # Set out_dir to None to skip the separate output directory; the CSV
    # files under data_dir are then rewritten in place.
    out_dir = Path("path/to/output/directory")
    data_dir = Path("path/to/top/level/MDACE/data/directory")
    noteevents_path = Path("path/to/NOTEEVENTS.csv")

    main(noteevents_path=noteevents_path, data_dir=data_dir, out_dir=out_dir)