In [1]:
import json
from tqdm import tqdm
import spacy
import medspacy
from medspacy.context import ConText
from medspacy.target_matcher import TargetMatcher
from cxr_target_rules import rules
from typing import Iterable, List, Tuple, Counter
import math
import csv

In [2]:
# Load spaCy's small English pipeline as the base that the medspaCy
# components are attached to.
nlp = spacy.load('en_core_web_sm')
# Components are added by their registered string names. The first one is
# inserted before the parser; the other two are appended to the end of the
# pipeline (the printed pipe_names below shows the resulting order).
nlp.add_pipe("medspacy_pyrush", before="parser")  # No need to import class
nlp.add_pipe("medspacy_target_matcher")
nlp.add_pipe("medspacy_context")
# Print the final component order as a sanity check.
print(nlp.pipe_names)

['tok2vec', 'tagger', 'medspacy_pyrush', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'medspacy_target_matcher', 'medspacy_context']


In [3]:
# Fetch the target-matcher component that was added to the pipeline by name.
target_matcher = nlp.get_pipe("medspacy_target_matcher")

# The rules are imported from the project-local cxr_target_rules module.
target_rules = rules

# NOTE(review): every run of this cell calls add() again on the same
# component; confirm that re-adding the same rules is harmless before
# re-running this cell without reloading `nlp`.
target_matcher.add(target_rules) #type: ignore

In [178]:
def ngrams(tokens: List[str], n: int) -> Iterable[Tuple[str, ...]]:
    """Return every run of ``n`` consecutive items of ``tokens`` as a tuple.

    When ``tokens`` holds fewer than ``n`` items an empty list is returned;
    otherwise the result is a lazy ``zip`` iterator yielding the n-tuples in
    left-to-right order.
    """
    if n > len(tokens):
        return []
    # One suffix of the sequence per offset: zipping them lines up position i
    # with positions i+1 .. i+n-1 and stops when the shortest suffix runs out.
    shifted = []
    for offset in range(n):
        shifted.append(tokens[offset:])
    return zip(*shifted)

In [None]:
def llr_2x2(k11, k12, k21, k22) -> float:
    """Log-likelihood ratio statistic for a 2x2 contingency table.

    Computes ``2 * (sum(k ln k) - sum(row ln row) - sum(col ln col) + N ln N)``
    over the four cells, the two row totals, the two column totals and the
    grand total ``N``, using the convention ``0 * ln 0 == 0``.

    Args:
        k11, k12, k21, k22: Cell counts of the table (row-major).

    Returns:
        The statistic as a float.

    Raises:
        ValueError: If a cell or a marginal total is negative (``math.log``
            rejects it).
    """
    def _xlogx(x):
        if x == 0:
            return 0.0
        return x * math.log(x)

    row_1 = k11 + k12
    row_2 = k21 + k22
    col_1 = k11 + k21
    col_2 = k12 + k22
    N = k11 + k12 + k21 + k22

    # Terms are combined strictly left to right so the floating-point result
    # does not depend on how the expression is laid out.
    cell_term = _xlogx(k11) + _xlogx(k12) + _xlogx(k21) + _xlogx(k22)
    total = (
        cell_term
        - _xlogx(row_1) - _xlogx(row_2)
        - _xlogx(col_1) - _xlogx(col_2)
        + _xlogx(N)
    )
    return 2.0 * total

In [180]:
def extract_counts(doc, unigram_counts: Counter, ngram_counts, max_n=5):
    """Accumulate unigram and n-gram counts from the entity-bearing sentences of ``doc``.

    Only sentences that fully contain at least one span of ``doc.ents`` are
    counted. Inside such a sentence the tokens kept are those that are not
    whitespace, punctuation or stop words and whose ``pos_`` is NOUN, ADJ or
    PROPN. N-grams are built over that filtered sequence, so a gram may join
    words that were separated by filtered-out tokens in the original text.

    Args:
        doc: Parsed document exposing ``sents`` and ``ents``; spans need
            ``start_char``/``end_char`` and tokens need ``text``, ``lower_``,
            ``pos_``, ``is_space``, ``is_punct`` and ``is_stop``.
        unigram_counts: Counter updated in place with lower-cased token texts.
        ngram_counts: Mapping ``n -> Counter`` updated in place with tuples of
            lower-cased token texts; it must have an entry for every ``n`` in
            ``2..max_n``.
        max_n: Largest n-gram order to collect.

    Returns:
        The number of tokens this document added to ``unigram_counts``,
        summed over all kept sentences.
    """
    content_pos = {"NOUN", "ADJ", "PROPN"}
    banned_first_words = {"the", "a", "an", "is", "was", "are", "there", "to"}

    def normalize(tok):
        return tok.text.lower()

    # Materialise the entities once instead of re-reading doc.ents per sentence.
    entities = list(doc.ents)
    total_tokens = 0

    for sent in doc.sents:
        has_entity = any(
            ent.start_char >= sent.start_char and ent.end_char <= sent.end_char
            for ent in entities
        )
        if not has_entity:
            continue

        valid_unigrams = [
            tok for tok in sent
            if not (tok.is_space or tok.is_punct or tok.is_stop)
            and tok.pos_ in content_pos
        ]
        unigram_counts.update(normalize(tok) for tok in valid_unigrams)
        # Accumulate across sentences: assigning here would make the return
        # value reflect only the last kept sentence of the document.
        total_tokens += len(valid_unigrams)

        for n in range(2, max_n + 1):
            for gram in ngrams(valid_unigrams, n):
                # Every token in a gram has already passed the stop-word and
                # POS filters above, so "all stop words" / "no content POS"
                # checks could never fire; only the leading-word check is kept.
                if gram[0].lower_ in banned_first_words:  # type: ignore
                    continue

                phrase = tuple(normalize(tok) for tok in gram)
                ngram_counts[n].update([phrase])
    return total_tokens

In [181]:
def score_bigrams(unigram_counts: Counter, bigram_counts: Counter, total_tokens: int, min_freq=3):
    """Score each sufficiently frequent bigram with the 2x2 log-likelihood ratio.

    For a bigram ``(w1, w2)`` seen ``k11`` times the contingency table is
    built from the unigram counts:

        k12 = count(w1) - k11      (w1 not followed by w2)
        k21 = count(w2) - k11      (w2 not preceded by w1)
        k22 = total_tokens - (k11 + k12 + k21)

    Args:
        unigram_counts: Counter of single-word frequencies.
        bigram_counts: Counter keyed by ``(w1, w2)`` tuples.
        total_tokens: Total number of counted tokens, used as the table total.
        min_freq: Bigrams seen fewer than this many times are skipped.

    Returns:
        Dict mapping ``(w1, w2)`` to the LLR score rounded to 3 decimals.

    Raises:
        ValueError: Propagated from ``llr_2x2`` if a unigram count is smaller
            than the bigram count it should contain.
    """
    bigram_llr = {}
    for (w1, w2), k11 in bigram_counts.items():
        if k11 < min_freq:
            continue
        k12 = unigram_counts[w1] - k11
        k21 = unigram_counts[w2] - k11
        # The token total only approximates the number of bigram positions,
        # and for a repeated word (w1 == w2) the two marginals overlap, so
        # the remainder can dip below zero. Clamp it: a negative cell would
        # make llr_2x2 fail with "math domain error" after the whole corpus
        # has already been processed.
        k22 = max(total_tokens - (k11 + k12 + k21), 0)
        score = llr_2x2(k11, k12, k21, k22)
        bigram_llr[(w1, w2)] = round(score, 3)
    return bigram_llr

In [182]:
def write_ngrams_to_csv(
    output_csv: str,
    unigram_counts: Counter,
    ngram_counts: dict,
    bigram_llr: dict,
    min_freq=3
):
    """Write the unigram and n-gram frequency tables to one CSV file.

    The file has the header ``phrase,n,freq,llr``. Unigrams come first, then
    each n-gram order in the iteration order of ``ngram_counts``; within a
    group rows are ordered by descending frequency. Entries seen fewer than
    ``min_freq`` times are left out. The ``llr`` column is filled only for
    bigrams that have a score in ``bigram_llr``; otherwise it is empty.

    Args:
        output_csv: Path of the CSV file to create or overwrite.
        unigram_counts: Counter of single-word frequencies.
        ngram_counts: Mapping ``n -> Counter`` keyed by tuples of words.
        bigram_llr: Mapping ``(w1, w2) -> score`` for bigrams.
        min_freq: Minimum frequency for a row to be written.
    """
    with open(output_csv, "w", newline="", encoding="utf-8") as handle:
        out = csv.writer(handle)
        out.writerow(["phrase", "n", "freq", "llr"])

        for word, freq in unigram_counts.most_common():
            if freq >= min_freq:
                out.writerow([word, 1, freq, ""])

        for order, grams in ngram_counts.items():
            scored = order == 2  # only bigrams carry an LLR value
            for gram, freq in grams.most_common():
                if freq < min_freq:
                    continue
                score = bigram_llr.get(gram, "") if scored else ""
                out.writerow([" ".join(gram), order, freq, score])

    print(f"Saved n-gram data to {output_csv}")

In [187]:
def process_and_save(jsonl_path, output_csv, max_n=3, min_freq=5):
    """Build an n-gram vocabulary from a JSONL corpus and save it as CSV.

    Each line of ``jsonl_path`` is expected to be a JSON object; the value of
    its ``"text"`` key is run through the module-level ``nlp`` pipeline.
    Counts are gathered with ``extract_counts``, bigrams are scored with
    ``score_bigrams`` and everything is written with ``write_ngrams_to_csv``.

    Args:
        jsonl_path: Path of the input JSONL file (read as UTF-8).
        output_csv: Path of the CSV file to write.
        max_n: Largest n-gram order to collect.
        min_freq: Minimum frequency for a phrase to be scored and written.

    Raises:
        json.JSONDecodeError: If a line mentioning "text" is not valid JSON.
    """
    texts = []
    with open(jsonl_path, "r", encoding="utf-8") as infile:
        for line in infile:
            # Cheap pre-filter: blank lines and lines that cannot hold the key
            # are skipped without being parsed.
            if "text" not in line:
                continue
            record = json.loads(line)
            # The substring test alone also matches "text" inside a value or
            # another key name; check the parsed record so such lines are
            # skipped instead of raising KeyError.
            if isinstance(record, dict) and "text" in record:
                texts.append(record["text"])

    unigram_counts = Counter()
    ngram_counts = {n: Counter() for n in range(2, max_n + 1)}
    total_tokens = 0

    # nlp.pipe is lazy and has no length, so hand tqdm the total explicitly
    # to get a real progress bar rather than a bare iteration counter.
    docs = nlp.pipe(texts, batch_size=32)
    for doc in tqdm(docs, total=len(texts), desc="🔍 Processing docs"):
        total_tokens += extract_counts(doc, unigram_counts, ngram_counts, max_n=max_n)

    # With max_n < 2 no bigram counter exists; score an empty one instead of
    # failing on the missing key.
    bigram_counts = ngram_counts.get(2, Counter())
    bigram_llr = score_bigrams(unigram_counts, bigram_counts, total_tokens, min_freq=min_freq)
    write_ngrams_to_csv(output_csv, unigram_counts, ngram_counts, bigram_llr, min_freq=min_freq)

In [188]:
# Input corpora (JSONL files) and output vocabulary tables (CSV files).
# NOTE(review): full_reports and output_csv are not used in this cell —
# presumably kept for a full-corpus run; confirm or remove.
full_reports = "./mimic_cxr_reports.jsonl" 
train_reports = "./mimic_cxr_train_val.jsonl"
output_csv = "./mimic_cxr_vocab.csv"
training_vocab = "./mimic_cxr_training_labelled_vocab.csv"
# Build the vocabulary from the train/val reports using process_and_save's
# default max_n and min_freq.
process_and_save(train_reports, training_vocab)

🔍 Processing docs: 1000it [00:19, 52.40it/s]

Saved n-gram data to ./mimic_cxr_training_labelled_vocab.csv



