Note: the file paths in this notebook point to the author's own model runs; change them to match your own run directories before executing the cells.

In [1]:
import sys
from pathlib import Path
# Put the parent of the current working directory on sys.path so the
# `src.*` imports used in later cells can be resolved.
# NOTE(review): assumes the notebook is started from a folder directly under
# the repository root — confirm.
true_root_dir = Path().resolve().parent
sys.path.append(str(true_root_dir))

In [None]:
# see which ollama models you have available

import ollama
from src.utils.general import wrap_text
# Print every entry of ollama.list()['models'], each followed by a blank line.
# 200 is the second argument handed to wrap_text (presumably the wrap width).
for model_info in ollama.list()['models']:
    print(wrap_text(str(model_info), 200))
    print()

### Tweaking LAAJ

In [None]:
import json
from functools import partial
from src.evaluate.tests.single_laaj_experiment  import LAAJExperiment

import ollama
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Loaded SentenceTransformer instances, keyed by model name, so that repeated
# comparisons do not re-load the weights on every call.
_BERT_MODEL_CACHE = {}


def _get_bert_model(model_name="bert-base-nli-mean-tokens"):
    """Return the SentenceTransformer for *model_name*, loading it only once."""
    if model_name not in _BERT_MODEL_CACHE:
        _BERT_MODEL_CACHE[model_name] = SentenceTransformer(model_name)
    return _BERT_MODEL_CACHE[model_name]


def use_bert_to_compare(entity1, entity2, threshold=0.8):
    """Judge two phrases as equivalent when their embedding similarity is high.

    Both phrases are encoded with the "bert-base-nli-mean-tokens" sentence
    transformer and compared by cosine similarity; the pair and its score are
    printed for inspection.

    Args:
        entity1: First phrase.
        entity2: Second phrase.
        threshold: Minimum cosine similarity for the phrases to count as a match.

    Returns:
        True if the cosine similarity is at least ``threshold``, else False.
    """
    model = _get_bert_model()
    embeddings = model.encode([entity1, entity2])
    similarity = cosine_similarity(
        embeddings[0].reshape(1, -1),
        embeddings[1].reshape(1, -1),
    )[0][0]
    print(f"{entity1}, {entity2}: {similarity}")
    # Convert the NumPy boolean to a plain bool so callers (and json.dump) get
    # a native Python value.
    return bool(similarity >= threshold)

def use_llm_to_compare(entity1: str, entity2: str, model: str = 'gemma2:2b', provider: str = 'ollama') -> bool:
    """Compare two medical entities using the specified LLM as a judge.

    Both values are converted to text, stripped and lower-cased. Placeholder
    spellings for "no value" (none / null / nan, in any casing) are mapped to
    "0". Identical values match without calling the model, and a "0" paired
    with anything else never matches. Only the remaining pairs are sent to the
    LLM with a True/False question.

    Args:
        entity1: First entity phrase.
        entity2: Second entity phrase.
        model: Model tag passed to ``ollama.generate``.
        provider: Backend to use; only ``'ollama'`` is supported.

    Returns:
        True if the entities are judged equivalent, otherwise False.

    Raises:
        ValueError: If the model has to be consulted and ``provider`` is not
            ``'ollama'``.
    """
    # Placeholder values are checked AFTER normalisation so that spellings such
    # as "NONE" or "NULL" are recognised as well.
    default_false_phrases = {"none", "null", "nan"}

    # str() guards against None / NaN / numeric values, which would otherwise
    # fail on .lower().
    entity1 = str(entity1).strip().lower()
    entity2 = str(entity2).strip().lower()

    # TODO: safe option would be to go to default value for entity if any of these seen
    if entity1 in default_false_phrases:
        entity1 = "0"
    if entity2 in default_false_phrases:
        entity2 = "0"

    if entity1 == entity2:
        return True
    # The values differ at this point, so a "0" on either side is a mismatch.
    if entity1 == "0" or entity2 == "0":
        return False

    # Prompt variant in use: "full" (expert persona + noun/adjective/number hint).
    query = f"""
    You are a renal biopsy expert.
    Are the phrases "{entity1}" and "{entity2}" describing equivalent or similar concepts? Only answer True or False.
    Answer based on the nouns, adjectives, or numbers.
    """

    # Alternative prompt variants kept for the prompt comparison:
    # just adj
    #query = f"""
    #Are the phrases "{entity1}" and "{entity2}" describing equivalent or similar concepts? Only answer True or False.
    #Answer based on the nouns, adjectives, or numbers.
    #"""

    # just expert
    #query = f"""
    #You are a renal biopsy expert.
    #Are the phrases "{entity1}" and "{entity2}" describing equivalent or similar concepts? Only answer True or False.
    #"""

    # simple
    #query = f"""
    #Are the phrases "{entity1}" and "{entity2}" describing equivalent or similar concepts? Only answer True or False.
    #"""

    if provider == 'ollama':
        response = ollama.generate(
            model=model,
            prompt=query,
            options={'temperature': 0, 'num_predict': 2, 'num_ctx': 1024}
        )
        # Case-insensitive so an answer of "true" / "TRUE" is not read as False.
        return "true" in response['response'].lower()

    else:
        raise ValueError(f"Unsupported provider: {provider}")

def run_experiment(model_name='qwen2.5:1.5b-instruct-fp16', size="small", save_files=False):
    """Run a single-trial LAAJ experiment with an LLM judge.

    Args:
        model_name: Model tag bound to ``use_llm_to_compare`` as its ``model``.
        size: Size argument forwarded to ``LAAJExperiment``.
        save_files: When true, write the results CSV, the metrics JSON and the
            figure PNG into the current working directory.

    Returns:
        Tuple ``(results_df, metrics)`` from the experiment.
    """
    judge = partial(use_llm_to_compare, model=model_name)

    # bert struggles with abbreviations, and doesn't always know what opposites are. it is symmetric though
    #judge = use_bert_to_compare

    experiment = LAAJExperiment(judge, size, n_trials=1)

    results_df = experiment.run_trials()
    metrics = experiment.analyse_results(results_df)
    fig = experiment.plot_results(metrics, results_df)

    if not save_files:
        return results_df, metrics

    # Persist the table, the metrics and the plot.
    results_df.to_csv('llm_judge_results.csv')
    with open('llm_judge_metrics.json', 'w') as f:
        json.dump(metrics, f, indent=2)
    fig.savefig('llm_judge_results.png')

    return results_df, metrics

# Candidate model tags (trailing figures are presumably run times — confirm):
#   smollm:360m-instruct-v0.2-fp16
#   llama3.2:3b-instruct-q8_0
#   qwen2.5:1.5b-instruct-fp16   # 18 mins
#   gemma2:2b-instruct-fp16      # 27 mins
# Run the "small" experiment with the chosen judge model; nothing is saved
# because save_files keeps its default of False.
results_df, metrics = run_experiment('llama3.2:3b-instruct-fp16', "small")

In [None]:
# Show only the rows whose 'category' column equals 'exact'.
results_df[results_df['category'] == 'exact']

### Redoing evaluation for LLM methods

In [None]:
from src.utils.json import load_json
from src.preprocessing.guidelines import EntityGuidelines
from src.evaluate.alt_models import evaluate, calculate_entity_accuracy

# NOTE(review): these paths have no "../" prefix, whereas other cells in this
# notebook use "../src/..." — confirm the working directory before running.
eg = EntityGuidelines('src/renal_biopsy/data/guidelines.xlsx')
qa_gt_json = load_json("src/renal_biopsy/data/output_report_first100.json")

# Stored predictions of each LLM run.
gemma_100_json = load_json("src/renal_biopsy/data/runs/main/gemma 2 True 1 20241209_190736/predicted.json")
llama_q8_100_json = load_json("src/renal_biopsy/data/runs/main/llama q8 2 True 6 20241210_013000/predicted.json")
phi_q4_100_json = load_json("src/renal_biopsy/data/runs/main/phi q4 2 True 9 20241213_170353/predicted.json")
phi_q8_100_json = load_json("src/renal_biopsy/data/runs/main/phi q8 2 True 14 20241213_233231/predicted.json")
qwen_100_json = load_json("src/renal_biopsy/data/runs/main/qwen 2 True 5 20241210_004135/predicted.json")

# Evaluate every run against the same ground truth. Results are kept per run
# in `llm_results` so one evaluation no longer overwrites the previous one;
# the plain names (all_scores, ...) still end up holding the last run (qwen).
llm_results = {}
for run_name, predicted_json in (
    ("gemma", gemma_100_json),
    ("llama_q8", llama_q8_100_json),
    ("phi_q4", phi_q4_100_json),
    ("phi_q8", phi_q8_100_json),
    ("qwen", qwen_100_json),
):
    all_scores, scores_per_report, final_score = evaluate(qa_gt_json, predicted_json, eg, 100)
    entity_scores = calculate_entity_accuracy(all_scores, eg)
    llm_results[run_name] = {
        "all_scores": all_scores,
        "scores_per_report": scores_per_report,
        "final_score": final_score,
        "entity_scores": entity_scores,
    }

In [14]:
import ast
import os
from src.utils.general import write_metadata_file
from src.utils.json import load_json, save_json
from src.renal_biopsy.qa import RenalBiopsyOllamaQA
from src.evaluate.alt_models import calculate_entity_accuracy
from src.preprocessing.guidelines import EntityGuidelines

def get_all_folders_from_root(root_path="../src/renal_biopsy/data/runs/main"):
    """List the names of the sub-directories directly inside ``root_path``.

    Args:
        root_path: Directory to scan.

    Returns:
        Sorted list of sub-directory names (names only, not full paths). If
        the directory is missing or unreadable, a message is printed and an
        empty list is returned so callers can always iterate the result.
    """
    try:
        # The context manager closes the scandir iterator promptly.
        with os.scandir(root_path) as entries:
            # Sorted so that runs are processed in a reproducible order.
            return sorted(entry.name for entry in entries if entry.is_dir())
    except FileNotFoundError:
        print(f"Directory not found: {root_path}")
    except PermissionError:
        print(f"Permission denied accessing: {root_path}")
    return []

def create_args(results_dir, metadata_filename='metadata_v4.txt'):
    """Rebuild the args dictionary stored on the first line of a run's metadata file.

    The first line is expected to look like ``args: {...}``, with the part
    after the prefix being a Python literal.

    Args:
        results_dir: Directory of the model run.
        metadata_filename: Name of the metadata file inside ``results_dir``.

    Returns:
        The value parsed from the first line with ``ast.literal_eval``.
    """
    with open(f'{results_dir}/{metadata_filename}', 'r') as file:
        first_line = file.readline()  # only the first line carries the args

    # Remove only the leading label; a blanket replace() would also delete
    # "args: " if it happened to occur inside one of the stored values.
    dict_str = first_line.strip().removeprefix('args: ')
    return ast.literal_eval(dict_str)

def rerun_via_args(args, results_dir):
    """Re-evaluate the stored predictions of one run and save the new scores.

    Loads the annotated reports and the run's ``predicted.json``, evaluates
    them with ``RenalBiopsyOllamaQA.evaluate`` and writes
    ``evaluation_scores_redo.json`` and ``metadata_redo.txt`` into
    ``results_dir``.

    Args:
        args: Run arguments; the keys ``'root_dir'``, ``'model_name'`` and
            ``'n_prototype'`` are read.
        results_dir: Directory of the run being re-evaluated.
    """
    # Looked up outside the f-string: reusing the same quote character inside
    # an f-string expression is a SyntaxError before Python 3.12.
    data_root = args['root_dir']
    annotated_json = load_json(f'../{data_root}/data/output_report_first100.json')

    model = RenalBiopsyOllamaQA(model_path=args['model_name'], root_dir="../src/renal_biopsy")
    predicted_json = load_json(f"{results_dir}/predicted.json")

    # Prepare metadata storage; the timing fields stay None because nothing is
    # timed during a re-evaluation.
    metadata = {
        "args": args,
        "annotation_start_time": None,
        "annotation_end_time": None,
        "evaluation_start_time": None,
        "evaluation_end_time": None,
        "score_per_report": None,
        "final_score": None
    }

    # Evaluate model
    eg = EntityGuidelines('../src/renal_biopsy/data/guidelines.xlsx')
    all_scores, score_per_report, final_score = model.evaluate(annotated_json, predicted_json, n_prototypes=args['n_prototype'])
    # NOTE(review): the returned value is not used further in this function;
    # the call is kept in case it reports per-entity accuracy — confirm.
    entity_scores = calculate_entity_accuracy(all_scores, eg)

    # Save evaluation results
    scores_path = os.path.join(results_dir, "evaluation_scores_redo.json")
    save_json(all_scores, scores_path)

    # Save metadata to a text file
    metadata["score_per_report"] = score_per_report
    metadata["final_score"] = final_score
    metadata_path = os.path.join(results_dir, "metadata_redo.txt")
    write_metadata_file(metadata_path, metadata)
    print("--- Finished ---")

def rerun_with_new_laaj_function(root_path):
    """Re-evaluate every model run stored directly under ``root_path``.

    For each sub-directory the saved args are read with ``create_args``,
    printed, and passed to ``rerun_via_args``.

    Args:
        root_path: Directory that contains one sub-directory per model run.
    """
    # The folder lookup may yield nothing usable when the directory cannot be
    # read; fall back to an empty list so the loop is skipped instead of failing.
    folder_names = get_all_folders_from_root(root_path) or []
    for folder_name in folder_names:
        results_dir = f"{root_path}/{folder_name}"
        args = create_args(results_dir)
        print(args)
        rerun_via_args(args, results_dir)

In [None]:
# Re-evaluate one model run: read the args saved in the run's metadata file,
# print them, then redo the evaluation (rerun_via_args writes
# evaluation_scores_redo.json and metadata_redo.txt into results_dir).
results_dir = "../src/renal_biopsy/data/runs/main/llama q8 2 True 6 20241210_013000"
args = create_args(results_dir)
print(args)
rerun_via_args(args, results_dir)

In [None]:
# Re-evaluate all model runs in a given folder: every sub-directory of
# root_path is treated as one run.
root_path = "../src/renal_biopsy/data/runs/main"
rerun_with_new_laaj_function(root_path)

### Redoing evaluation for alternative methods

In [None]:
from src.utils.json import load_json, convert_to_strings
from src.preprocessing.guidelines import EntityGuidelines
from src.evaluate.alt_models import evaluate, calculate_entity_accuracy

# NOTE(review): these paths have no "../" prefix, whereas other cells in this
# notebook use "../src/..." — confirm the working directory before running.
eg = EntityGuidelines('src/renal_biopsy/data/guidelines.xlsx')

# Ground-truth annotations.
ner_gt_json = load_json("src/ner/annotations_output/annotations_first20.json")
qa_gt_json = load_json("src/renal_biopsy/data/output_report_first100.json")

# Stored predictions of the alternative (non-LLM) methods.
spacy20_json = convert_to_strings(load_json("src/renal_biopsy/data/runs/alt/spacy_first20.json"))
spacy100_json = convert_to_strings(load_json("src/renal_biopsy/data/runs/alt/spacy_first100.json"))

biobert20_json = convert_to_strings(load_json("src/renal_biopsy/data/runs/alt/biobert_squad_first20.json"))
biobert100_json = convert_to_strings(load_json("src/renal_biopsy/data/runs/alt/biobert_squad_first100.json"))
roberta20_json = convert_to_strings(load_json("src/renal_biopsy/data/runs/alt/roberta_squad_first20.json"))
roberta100_json = convert_to_strings(load_json("src/renal_biopsy/data/runs/alt/roberta_squad_first100.json"))

gliner_03_json = convert_to_strings(load_json("src/renal_biopsy/data/runs/alt/gliner/gliner_first100_03_processed.json"))

nuextract_json = load_json("src/renal_biopsy/data/runs/alt/nuextract_first20_transformed.json")

# Evaluate the 100-report predictions against the QA ground truth. Results are
# kept per method in `alt_results` so one evaluation no longer overwrites the
# previous one; the plain names (all_scores, ...) still end up holding the
# last method (spacy100).
alt_results = {}
for method_name, predicted_json in (
    ("biobert100", biobert100_json),
    ("roberta100", roberta100_json),
    ("gliner_03", gliner_03_json),
    ("spacy100", spacy100_json),
):
    all_scores, scores_per_report, final_score = evaluate(qa_gt_json, predicted_json, eg, 100)
    entity_scores = calculate_entity_accuracy(all_scores, eg)
    alt_results[method_name] = {
        "all_scores": all_scores,
        "scores_per_report": scores_per_report,
        "final_score": final_score,
        "entity_scores": entity_scores,
    }

### Qualitative error analysis

In [None]:
# gemma_100_json = load_json(f"src/renal_biopsy/data/runs/main/gemma 2 True 1 20241209_190736/predicted.json")
# llama_q8_100_json = load_json(f"src/renal_biopsy/data/runs/main/llama q8 2 True 6 20241210_013000/predicted.json")
# phi_q4_100_json = load_json(f"src/renal_biopsy/data/runs/main/phi q4 2 True 9 20241213_170353/predicted.json")
# phi_q8_100_json = load_json(f"src/renal_biopsy/data/runs/main/phi q8 2 True 14 20241213_233231/predicted.json")
# qwen_100_json = load_json(f"src/renal_biopsy/data/runs/main/qwen 2 True 5 20241210_004135/predicted.json") 

# Run to inspect, and its stored evaluation scores, annotations and predictions.
root_dir = "src/renal_biopsy/data/runs/main/gemma 2 True 1 20241209_190736"
answers, anno, pred = (
    load_json(f"{root_dir}/{file_name}")
    for file_name in ("evaluation_scores_v6.json", "annotated.json", "predicted.json")
)

# Entity to look at.
k = 'diagnosis'
#k = 'chronic_change'

# Print annotation / prediction / stored score for every report and collect
# the (annotation, prediction) pairs for the judge cell that follows.
list_to_use = []
for gold_report, predicted_report, score_row in zip(anno, pred, answers):
    gold_value, predicted_value = gold_report[k], predicted_report[k]
    print(f"anno: {gold_value}, pred: {predicted_value}, answer: {score_row[k]}")
    list_to_use.append((gold_value, predicted_value))

In [None]:
import ollama
# from src.evaluate.laaj import use_llm_to_compare

def use_llm_to_compare(entity1: str, entity2: str, model: str = 'gemma2:2b', provider: str = 'ollama') -> bool:
    """Compare two medical entities using the specified LLM as a judge.

    Both values are converted to text, stripped and lower-cased. Placeholder
    spellings for "no value" (none / null / nan, in any casing) are mapped to
    "0". Identical values match without calling the model, and a "0" paired
    with anything else never matches. Only the remaining pairs are sent to the
    LLM with a True/False question.

    Args:
        entity1: First entity phrase.
        entity2: Second entity phrase.
        model: Model tag passed to ``ollama.generate``.
        provider: Backend to use; only ``'ollama'`` is supported.

    Returns:
        True if the entities are judged equivalent, otherwise False.

    Raises:
        ValueError: If the model has to be consulted and ``provider`` is not
            ``'ollama'``.
    """
    # Placeholder values are checked AFTER normalisation so that spellings such
    # as "NONE" or "NULL" are recognised as well.
    default_false_phrases = {"none", "null", "nan"}

    # str() guards against None / NaN / numeric values, which would otherwise
    # fail on .lower().
    entity1 = str(entity1).strip().lower()
    entity2 = str(entity2).strip().lower()

    # TODO: safe option would be to go to default value for entity if any of these seen
    if entity1 in default_false_phrases:
        entity1 = "0"
    if entity2 in default_false_phrases:
        entity2 = "0"

    if entity1 == entity2:
        return True
    # The values differ at this point, so a "0" on either side is a mismatch.
    if entity1 == "0" or entity2 == "0":
        return False

    query = f"""
    You are a renal biopsy expert.
    Are the phrases "{entity1}" and "{entity2}" describing equivalent or similar concepts? Only answer True or False.
    Answer based on the nouns, adjectives, or numbers.
    """

    if provider == 'ollama':
        response = ollama.generate(
            model=model,
            prompt=query,
            options={'temperature': 0, 'num_predict': 2, 'num_ctx': 1024}
        )
        # print(response['response'])
        # Case-insensitive so an answer of "true" / "TRUE" is not read as False.
        return "true" in response['response'].lower()

    else:
        raise ValueError(f"Unsupported provider: {provider}")

# Ask the LLM judge about every (annotation, prediction) pair collected in
# list_to_use and print its verdict next to the pair.
for e1, e2 in list_to_use:
    # Other model tags noted for this check:
    # llama3.2:3b-instruct-fp16
    # qwen2:7b-instruct-q6_K
    truth_value = use_llm_to_compare(e1, e2, "llama3.2:3b-instruct-fp16")
    print(f"{e1} ||| {e2} -> {truth_value}")
