In [None]:
### TODO: write a prompt here that takes the guidelines and converts them into a format suitable for each model

# TODO
# - do I need to update the NER spans again?
# - get GLiNER (with bio backbone) and NuExtract working
# - scores per report as per standard eval
# - scores per entity/20 reports (x=models, y=entity type)

In [None]:
### Convert the QA dataset into the plain-text format used by the NER app.

from src.utils.json import load_json, save_json

# Report records; each one is indexed below by 'microscopy_section' and
# 'conclusion_section'.
input_json = load_json("src/renal_biopsy/data/output_report_first100.json")
# Build one string per report: microscopy section followed by conclusion section.
# NOTE: the backslash continuation sits *inside* the string literal, so the
# characters between "\n" and "CONCLUSION SECTION" (one space plus the next
# source line's leading indentation) are part of every generated string.
# Re-indenting these lines changes the output text.
# NOTE(review): presumably the NER annotations use character offsets into this
# text — confirm before changing the layout.
ner_json = [f"MICROSCOPY SECTION: {report_dict['microscopy_section']} \n \
              CONCLUSION SECTION: {report_dict['conclusion_section']}"
            for report_dict in input_json]

# Writing the converted reports to disk is currently disabled.
# save_json(ner_json, "src/ner/example_output/data/ner_output_report_first100.json")

In [2]:
from src.utils.json import load_json
# Load, in order: the report inputs, the NER annotations for the first 20
# reports, and the QA-format report file. The paths are plain literals.
input_json, ner_gt_json, qa_gt_json = (
    load_json(json_path)
    for json_path in (
        "src/ner/example_output/data/ner_output_report_first100.json",
        "src/ner/annotations_output/annotations_first20.json",
        "src/renal_biopsy/data/output_report_first100.json",
    )
)

In [None]:
import spacy
from spacy.tokens import Span
from spacy.language import Language
import re

@Language.component("custom_entity_ruler")
def custom_entity_ruler(doc):
    """Replace ``doc.ents`` with regex-derived renal-biopsy entities.

    Each pattern below is run over ``doc.text``; every match that maps onto
    whole tokens and does not overlap an already accepted entity becomes a
    ``Span`` labelled with the pattern's key. Patterns are tried in dict
    order, so on overlap the earlier pattern wins.

    Any entities set by earlier pipeline components are discarded, because
    ``doc.ents`` is overwritten.

    Limitations visible in the patterns: absence is only recognised when the
    negation follows the term ("cortex absent"), not when it precedes it
    ("no cortex").

    Args:
        doc: The spaCy ``Doc`` being processed.

    Returns:
        The same ``Doc`` with ``doc.ents`` set to the matched spans.
    """
    patterns = {
        "cortex": r"(?i)cortex(?!\s+(?:absent|missing|not\s+(?:present|identified|seen)))",
        "medulla": r"(?i)medulla(?!\s+(?:absent|missing|not\s+(?:present|identified|seen)))",
        "cortex_absent": r"(?i)cortex\s+(?:absent|missing|not\s+(?:present|identified|seen))",
        "medulla_absent": r"(?i)medulla\s+(?:absent|missing|not\s+(?:present|identified|seen))",
        # The ending is spelled out as i / us / s so that the singular
        # "glomerulus" is matched as a whole word. A character class [ius]
        # stops at "glomerulu", which is not a token boundary, and
        # doc.char_span then returns None for the match.
        "n_total": r"(?i)(\d+)\s*glomerul(?:i|us|s)",
        "n_global": r"(?i)(\d+)\s*(?:globally|complete(?:ly)?)\s*sclerosed\s*glomerul(?:i|us|s)",
        "n_segmental": r"(?i)(\d+)\s*(?:segment(?:al(?:ly)?)|partial(?:ly)?)\s*sclerosed\s*glomerul(?:i|us|s)",
        "abnormal_glomeruli": r"(?i)(?:glomerul(?:ar|i))?\s*(?:(?P<count>\d+)\s+)?(?:scarring|isch(?:a)?emic\s+changes|thickening\s+of\s+bowman(?:'s)?\s+capsule|thickening\s+of\s+(?:capillary\s+)?basement\s+membrane|mesangial\s+(?:expansion|thickening)|changes?\s+in\s+(?:size|shape)|irregular\s+(?:size|shape))",
        "chronic_change_adj": r"(?i)(mild|moderate|severe|minimal|marked|extensive)\s+(?:tubular\s+atrophy|interstitial\s+fibrosis|chronic\s+changes?)",
        "chronic_change_percentage": r"(?i)(\d+(?:\.\d+)?)\s*%\s*(?:of\s*)?(?:tubular\s+atrophy|interstitial\s+fibrosis|chronic\s+changes?)",
        "transplant": r"(?i)(transplant(?:ation)?|infiltrate[sd]?|infection)",
        "diagnosis": r"(?i)(?:final\s*)?diagnosis\s*[:;]\s*(.*?)(?:\.|$)"
    }

    text = doc.text
    entities = []
    for ent_type, pattern in patterns.items():
        for match in re.finditer(pattern, text):
            start, end = match.span()

            # Several patterns can swallow surrounding whitespace (e.g. the
            # optional prefix + \s* at the start of "abnormal_glomeruli").
            # Offsets that begin or end on whitespace never align with token
            # boundaries, so trim them before asking spaCy for a span.
            while start < end and text[start].isspace():
                start += 1
            while end > start and text[end - 1].isspace():
                end -= 1
            if start == end:
                continue

            # doc.ents must not contain overlapping spans; first match wins.
            if any(start < e.end_char and end > e.start_char for e in entities):
                continue

            # char_span returns None when the offsets fall inside a token.
            span = doc.char_span(start, end, label=ent_type)
            if span is not None:
                entities.append(span)

    doc.ents = entities
    return doc

def _results_from_entities(ents):
    """Fold labelled entity spans into one flat result dict for a report.

    Entities are applied in the order given, so a later mention of the same
    field overrides an earlier one. A percentage for chronic change takes
    precedence over a severity adjective when both are found.
    """
    results = {
        "cortex_present": True,
        "medulla_present": False,
        "n_total": 0,
        "n_segmental": 0,
        "n_global": 0,
        "abnormal_glomeruli": False,
        "chronic_change": None,
        "transplant": False,
        "diagnosis": None
    }

    chronic_change_perc = None
    chronic_change_adj = None

    for ent in ents:
        label = ent.label_
        if label == "cortex":
            results["cortex_present"] = True
        elif label == "cortex_absent":
            results["cortex_present"] = False
        elif label == "medulla":
            results["medulla_present"] = True
        elif label == "medulla_absent":
            results["medulla_present"] = False
        elif label == "transplant":
            results["transplant"] = True
        elif label == "abnormal_glomeruli":
            results["abnormal_glomeruli"] = True
        elif label == "chronic_change_percentage":
            chronic_change_perc = float(re.search(r"(\d+(?:\.\d+)?)", ent.text).group(1))
        elif label == "chronic_change_adj":
            chronic_change_adj = re.search(r"(mild|moderate|severe|minimal|marked|extensive)",
                                           ent.text, re.I).group(1).lower()
        elif label in ("n_total", "n_segmental", "n_global"):
            results[label] = int(re.search(r"(\d+)", ent.text).group(1))
        elif label == "diagnosis":
            # The entity pattern accepts either ':' or ';' after "diagnosis",
            # so split on both; splitting on ':' alone would leave the
            # "diagnosis;" heading in the value.
            results["diagnosis"] = re.split(r"[:;]", ent.text, maxsplit=1)[-1].strip()

    results["chronic_change"] = chronic_change_perc if chronic_change_perc is not None else chronic_change_adj
    return results


def process_reports(texts, n_prototype=40):
    """Run the rule-based extractor over the first ``n_prototype`` texts.

    Loads the ``en_core_web_sm`` pipeline on every call and appends the
    ``custom_entity_ruler`` component, which must already be registered.

    Args:
        texts: Iterable of report strings.
        n_prototype: Number of texts to process before stopping.

    Returns:
        A list with one result dict per processed text, with keys
        ``cortex_present``, ``medulla_present``, ``n_total``, ``n_segmental``,
        ``n_global``, ``abnormal_glomeruli``, ``chronic_change``,
        ``transplant`` and ``diagnosis``. Fields with no matching entity keep
        their defaults (cortex present, medulla absent, zero counts,
        ``None`` for chronic change and diagnosis).
    """
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("custom_entity_ruler")

    all_results = []
    for i, text in enumerate(texts):
        if i == n_prototype:
            break
        all_results.append(_results_from_entities(nlp(text).ents))

    return all_results

# Hand-written snippets for trying the rule-based extractor; the calls that
# use them are commented out, so `texts` is currently unused.
texts = [
    "Cortex present with 20% chronic changes. Total of 31 glomeruli.",
    "Severe chronic changes noted. No rejection."
]
# json_output = process_reports(texts)
# print(json.dumps(json_output, indent=2))

# Multi-sentence sample report; also unused while the call below stays
# commented out.
# NOTE(review): "not cortex present" puts the negation before the term —
# presumably a probe for negation handling; confirm the intended wording.
sample_report = """
Kidney biopsy shows not cortex present with just medulla present.
Total of 18 glomeruli identified, with 3 globally sclerosed glomeruli.
Moderate tubular atrophy noted.
Severe interstitial fibrosis with 40% chronic changes.
Mild chronic changes.
This appears to be a transplant kidney.
Final diagnosis: Acute cellular rejection, Banff grade IA.
"""
# results = process_reports([sample_report])
# print(results)

# Rule-based extraction over the first 10 entries of `input_json`; the bare
# name on the last line makes the notebook display the result.
all_results = process_reports(input_json, n_prototype=10)
all_results

In [None]:
import json
import re

import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, pipeline

class MedicalReportQA:
    """Extract structured report fields by asking a QA model fixed questions.

    One question per field is put to a Hugging Face ``question-answering``
    pipeline with the report text as context; the answer span is then
    converted to the field's value (int, float, bool or text).
    """

    # Answers scoring below this are replaced by the field's default value.
    CONFIDENCE_THRESHOLD = 0.1

    _BOOLEAN_FIELDS = ('cortex_present', 'medulla_present', 'abnormal_glomeruli', 'transplant')
    _SEVERITY_ADJECTIVES = ('mild', 'moderate', 'severe', 'minimal', 'marked', 'extensive')

    # Whole-word negation cues. Substring checks would treat "noted",
    # "normal" or "abnormal" as negative because they contain "no".
    _NEGATION_RE = re.compile(r"\b(?:no|not|absent|missing)\b")
    # A number immediately followed by a percent sign, e.g. "20%" or "10.5 %".
    _PERCENT_RE = re.compile(r"(\d+(?:\.\d+)?)\s*%")
    _INT_RE = re.compile(r"\d+")

    # Values returned when the model's answer is below the confidence threshold.
    # NOTE(review): 'chronic_change' defaults to 0 here, while an unparseable
    # chronic-change answer yields None in process_answer — confirm which is intended.
    _DEFAULTS = {
        'cortex_present': True,
        'medulla_present': False,
        'n_total': 0,
        'n_segmental': 0,
        'n_global': 0,
        'abnormal_glomeruli': False,
        'chronic_change': 0,
        'transplant': False,
        'diagnosis': None
    }

    def __init__(self, model_name="dmis-lab/biobert-large-cased-v1.1-squad"):
        """Load tokenizer and model for ``model_name`` and build the QA pipeline."""
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModelForQuestionAnswering.from_pretrained(model_name)
        self.qa_pipeline = pipeline("question-answering", model=self.model, tokenizer=self.tokenizer)

        # Field name -> question asked of the model.
        self.questions = {
            "cortex_present": "Is cortex present in the sample?",
            "medulla_present": "Is medulla present in the sample?",
            "n_total": "How many total glomeruli are there?",
            "n_segmental": "How many segmentally sclerosed glomeruli are there?",
            "n_global": "How many globally sclerosed glomeruli are there?",
            "abnormal_glomeruli": "Are there any abnormal glomeruli features like scarring, thickening, or irregular shape?",
            "chronic_change": "What percentage or degree of chronic changes are present?",
            "transplant": "Is this a transplant case or are there infiltrates/infection?",
            "diagnosis": "What is the final diagnosis?"
        }

    def process_answer(self, answer, entity_type):
        """Convert one pipeline answer into the value for ``entity_type``.

        Args:
            answer: Mapping with at least ``'score'`` and ``'answer'`` keys.
            entity_type: One of the field names in ``self.questions``.

        Returns:
            * the field default when the score is below ``CONFIDENCE_THRESHOLD``;
            * for ``n_*`` fields, the first integer in the answer, or 0;
            * for ``chronic_change``, the percentage as a float, else the
              lower-cased answer text if it contains a severity adjective,
              else ``None``;
            * for boolean fields, ``False`` if the answer contains a whole-word
              negation cue, otherwise ``True``;
            * otherwise the lower-cased, stripped answer text.
        """
        if answer['score'] < self.CONFIDENCE_THRESHOLD:
            return self._get_default_value(entity_type)

        text = answer['answer'].lower().strip()

        if entity_type.startswith('n_'):
            # Take the first number only; concatenating every digit would
            # turn an answer such as "3 of 18" into 318.
            first_int = self._INT_RE.search(text)
            return int(first_int.group()) if first_int else 0

        if entity_type == 'chronic_change':
            # Parse the number attached to the percent sign instead of
            # joining all digits and dots, which fails on "10.5%." and
            # merges ranges.
            percent = self._PERCENT_RE.search(text)
            if percent:
                return float(percent.group(1))
            if any(adj in text for adj in self._SEVERITY_ADJECTIVES):
                return text
            return None

        if entity_type in self._BOOLEAN_FIELDS:
            return self._NEGATION_RE.search(text) is None

        return text

    def _get_default_value(self, entity_type):
        """Return the fallback value for ``entity_type`` (KeyError if unknown)."""
        return self._DEFAULTS[entity_type]

    def process_report(self, text):
        """Ask every question against ``text`` and return ``{field: value}``."""
        results = {}
        for entity, question in self.questions.items():
            qa_result = self.qa_pipeline(question=question, context=text)
            results[entity] = self.process_answer(qa_result, entity)
        return results

    def process_reports(self, texts):
        """Return ``process_report`` results for each text, in order."""
        return [self.process_report(text) for text in texts]

# Example usage
def main(reports):
    """Build a ``MedicalReportQA`` with its default model and run it over ``reports``.

    Returns the list produced by ``MedicalReportQA.process_reports``.
    """
    return MedicalReportQA().process_reports(reports)
    

# Hand-written sample reports. They are not passed to `main` below, which
# runs on `input_json` instead. The indentation inside the triple-quoted
# strings is part of each string's value.
sample_reports = [
        """
        Kidney biopsy shows cortex present with medulla noted.
        Total of 18 glomeruli identified, with 3 globally sclerosed glomeruli.
        Approximately 20% chronic change observed.
        This appears to be a transplant kidney.
        Final diagnosis: Acute cellular rejection, Banff grade IA.
        """,
        """
        Cortex present. No medulla seen.
        12 glomeruli, 2 showing segmental sclerosis.
        Moderate chronic changes noted.
        """
    ]

# Run the QA extractor on the first two entries of `input_json`; the bare
# name on the last line makes the notebook display the result.
results = main(input_json[0:2])
results

In [None]:
# GLiNER: predict 'diagnosis' entities on a single example conclusion line.
from gliner import GLiNER

# Alternative checkpoint: "urchade/gliner_large_bio-v0.1"
model_name = "urchade/gliner_mediumv2.1"
model = GLiNER.from_pretrained(model_name)
model.eval()

text = """
    PERCUTANEOUS RENAL BIOPSY; FEATURES SUGGESTIVE OF ACUTE URINARY TRACT INFECTION ON A BACKGROUND OF MODERATE CHRONIC DAMAGE
"""

labels = ['diagnosis']

entities = model.predict_entities(text, labels, threshold=0.4)

# One "<matched text> => <label>" line per predicted entity.
for entity in entities:
    print(f'{entity["text"]} => {entity["label"]}')

In [None]:
# nuextract