In [None]:
import json
from pathlib import Path
import spacy
import medspacy
from spacy.tokens import Span
from spacy import displacy
from spacy.util import filter_spans
from notebooks.cxr_target_rules import rules

In [14]:
# Build the medspaCy pipeline and keep handles on the two components this
# notebook works with: the target matcher and the context component.
nlp = medspacy.load()
target_matcher = nlp.get_pipe("medspacy_target_matcher")
context = nlp.get_pipe("medspacy_context")

# Register the rules imported from notebooks.cxr_target_rules with the matcher.
target_rules = rules
target_matcher.add(target_rules)  # type: ignore

In [15]:
# Read doccano_input.jsonl: one JSON document per line.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale, and blank lines (e.g. a trailing newline at the
# end of the file) are skipped instead of crashing json.loads.
with open("doccano_input.jsonl", encoding="utf-8") as f:
    examples = [json.loads(line) for line in f if line.strip()]

print(f"Loaded {len(examples)} documents.")

Loaded 1000 documents.


In [18]:
def visualize(example):
    """Render one annotated example with entities colored by context status.

    The example's annotations replace whatever entities the pipeline found,
    the ``medspacy_context`` component is re-run over them, and each entity
    label is suffixed with ``(NEGATED)``, ``(UNCERTAIN)`` or ``(PRESENT)``
    before the document is drawn with displaCy.

    Args:
        example: Mapping with a ``"text"`` string and an ``"annotations"``
            list; each annotation provides ``"start_offset"``,
            ``"end_offset"`` (character offsets into the text) and ``"label"``.

    Returns:
        The result of ``displacy.render(..., jupyter=True)``.
    """
    doc = nlp(example["text"])

    # Registered lazily so the function can be called repeatedly without
    # spaCy complaining that the extension already exists.
    if not Span.has_extension("context_label"):
        Span.set_extension("context_label", default="PRESENT")

    # Convert annotations to Span objects. char_span yields None when the
    # offsets do not line up with token boundaries; collect those so they are
    # reported rather than vanishing from the rendering unnoticed.
    ents = []
    skipped = []
    for ann in example["annotations"]:
        span = doc.char_span(ann["start_offset"], ann["end_offset"], label=ann["label"])
        if span:
            ents.append(span)
        else:
            skipped.append(ann)
    if skipped:
        print(f"Skipped {len(skipped)} annotation(s) that do not align with token boundaries: {skipped}")

    # doc.ents cannot hold overlapping spans, so resolve overlaps first.
    doc.ents = filter_spans(ents)

    # Re-run the context component so its attributes are computed for the
    # annotated entities rather than the ones the pipeline produced itself.
    context = nlp.get_pipe("medspacy_context")
    context(doc)

    # One color per context status, applied to whatever base labels occur.
    context_colors = {
        "PRESENT": "lightgreen",
        "NEGATED": "lightcoral",
        "UNCERTAIN": "gold",
    }

    colors = {}
    new_ents = []
    for span in doc.ents:
        if span._.is_negated:
            context_label = "NEGATED"
        elif span._.is_uncertain:
            context_label = "UNCERTAIN"
        else:
            context_label = "PRESENT"
        span._.context_label = context_label
        # Update label to include context
        span.label_ = f"{span.label_} ({context_label})"
        colors[span.label_] = context_colors[context_label]
        new_ents.append(span)

    doc.ents = new_ents

    options = {"ents": list(colors), "colors": colors}

    # Visualize in Jupyter
    return displacy.render(doc, style="ent", options=options, jupyter=True)

In [19]:
# Render the document at index 2 with its annotations for visual review.
visualize(examples[2])

[32m2025-09-17 11:58:03.514[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=4] [doc 0] Token 0 'FINAL' marked as sentence start (span begin)[0m
[32m2025-09-17 11:58:03.515[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=4] [doc 0] Token 11 '
 
 ' marked as sentence start (span end whitespace)[0m
[32m2025-09-17 11:58:03.515[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=4] [doc 0] GAP DETECTED: tokens 11-11 (idx 47-47) between spans 47-50[0m
[32m2025-09-17 11:58:03.515[0m | [34m[1mDEBUG   [0m | [36mPyRuSH.PyRuSHSentencizer[0m:[36mpredict[0m:[36m100[0m - [34m[1m[cpredict_split_gaps|call_id=4] [doc 0] Token 11 '
 
 ' marked as sentence start (whitespace in gap between spans)[0m
[32m2025-09-17 11:58:03.516[0m | [

In [None]:
# Example: add or fix one annotation manually.
# The membership check keeps the cell idempotent: re-running it does not
# append the same annotation a second time.
manual_annotation = {
    "start_offset": 10,
    "end_offset": 15,
    "label": "ANATOMY"
}
if manual_annotation not in examples[2]["annotations"]:
    examples[2]["annotations"].append(manual_annotation)

In [None]:
# Remove the first annotation of the document at index 0; the removed item is
# shown as the cell output.
# NOTE: not idempotent — every re-run of this cell removes one more annotation.
first_doc_annotations = examples[0]["annotations"]
first_doc_annotations.pop(0)

In [None]:
# Re-render the document at index 0 to check the effect of the manual edits.
visualize(examples[0])

In [None]:
# Write every example, including any manual corrections made in earlier
# cells, to a new file with one JSON document per line.
with open("corrected_doccano_input.jsonl", "w") as f:
    f.writelines(json.dumps(ex) + "\n" for ex in examples)

print("✅ Saved corrected file to corrected_doccano_input.jsonl")