In [None]:
import ollama
from src.utils.general import wrap_text
# Print every entry reported by ollama.list()['models'], passing 200 as
# wrap_text's second argument; the doubled newline leaves a blank line
# between consecutive entries.
for model_info in ollama.list()['models']:
    print(wrap_text(str(model_info), 200), end="\n\n")

### EDA

In [None]:
from src.preprocessing.guidelines import EntityGuidelines
from src.renal_biopsy.preprocessor import RenalBiopsyProcessor

# All spreadsheet inputs and the generated JSON for this section live here.
root_data_dir = "src/renal_biopsy/data"
# Guidelines are read from the spreadsheet and handed to the processor.
guidelines = EntityGuidelines(f'{root_data_dir}/guidelines.xlsx')
processor = RenalBiopsyProcessor(guidelines=guidelines)

# Build the input JSON from the full spreadsheet; save_path points at input.json.
# NOTE(review): the meaning of full=True is defined in RenalBiopsyProcessor — confirm there.
input_json = processor.create_input_json(
    data_path=f"{root_data_dir}/full_data.xlsx",
    save_path=f"{root_data_dir}/input.json",
    full=True
)

# Segment every report, then extract the MICROSCOPY and CONCLUSION sections.
# NOTE(review): presumably only reports containing both required sections are kept — verify.
segmented_reports = processor.process_all_reports_real(f"{root_data_dir}/full_data.xlsx")
filtered_reports, microscopy_sections, conclusion_sections = processor.extract_valid_sections(
    segmented_reports, 
    required_sections=['MICROSCOPY', 'CONCLUSION']
)

In [None]:
from src.preprocessing.eda import MedicalReportEDA

eda = MedicalReportEDA()

# NOTE: `stats` is rebound by the next call, so this first return value is
# discarded; only whatever analyse_section_lengths itself displays survives.
stats = eda.analyse_section_lengths(segmented_reports, exclude_keys=['entity_key'])

# Statistics restricted to the two sections named in section_keys.
stats = eda.calculate_report_statistics(
    reports=segmented_reports,
    section_keys=['MICROSCOPY', 'CONCLUSION']
)
# all sections
# stats = eda.calculate_report_statistics(reports=segmented_reports)

In [None]:
# Run the word-distribution analysis once per extracted section. The second
# call previously repeated the microscopy analysis, leaving the conclusion
# sections (extracted alongside the microscopy ones) unexamined.
eda.analyse_word_distributions(microscopy_sections, f'Microscopy Section (n={len(microscopy_sections)})')
eda.analyse_word_distributions(conclusion_sections, f'Conclusion Section (n={len(conclusion_sections)})')


In [None]:
# number of patients
import pandas as pd
sample_data = pd.read_excel(f"{root_data_dir}/full_data.xlsx")
# Count of distinct project_id values. unique() keeps a missing id (NaN) as
# one value, so blank ids would add one to this count.
len(sample_data['project_id'].unique())

In [None]:
import Levenshtein as lev

# Terms passed as highlight_words below; the misspelling check in the next
# cell iterates over the same list.
highlight_words = ['glomeruli', 'medulla', 'cortex', 'fibrosis', 'sclerosed', 'chronic', 'interstitial', 'tubular', 'atrophy']
# The returned frame is indexed by its 'Word' column in the misspelling check.
# NOTE(review): n_terms=40 presumably caps the number of reported terms — confirm in MedicalReportEDA.
word_freq_df = eda.analyse_word_frequencies_spacy(
    segmented_reports, 
    'MICROSCOPY',
    highlight_words=highlight_words,
    n_terms=40
)

In [None]:
# Check for misspellings of main words
# For each reference term, build a boolean mask over word_freq_df marking every
# word whose Levenshtein distance to that term is at most 3.
misspellings = {
    correct_word: word_freq_df['Word'].apply(
        lambda candidate, target=correct_word: lev.distance(candidate, target) <= 3
    )
    for correct_word in highlight_words
}

# Print misspellings
for correct_word, matches in misspellings.items():
    print(f"\nPossible misspellings of '{correct_word}':")
    print(word_freq_df[matches])

In [None]:
# Extra stop words handed to the TF-IDF analysis via custom_stop_words.
custom_stop_words = ['and', 'are', 'but', 'in', 'is', 'no', 'of', 'the', 'there', 'with', 'seen', 'show', 'shows', 'to', 'which']
# Only the first 40 microscopy sections are analysed here.
eda.analyse_tfidf(microscopy_sections[:40], n_terms=30, custom_stop_words=custom_stop_words)

### In case of JSON parsing errors

In [30]:
# if issues in processing

import os
from datetime import datetime
from src.preprocessing.guidelines import EntityGuidelines
from src.utils.general import write_metadata_file
from src.utils.json import load_json, save_json
from src.renal_biopsy.preprocessor import RenalBiopsyProcessor
from src.renal_biopsy.qa import RenalBiopsyOllamaQA

# Re-parse previously generated answers from an existing run directory and
# re-run the evaluation, without regenerating anything with the LLM.
model_path = "phi3.5:3.8b-mini-instruct-q8"
# src\renal_biopsy\data\runs\20241214_010916
# src\renal_biopsy\data\runs\20241220_185154
results_dir = "src/renal_biopsy/data/runs/20241220_185154"
n_prototype = 1

# NOTE(review): args['model_name'] differs from model_path above, and `args` is
# what gets written to metadata.txt — confirm which model produced this run.
args = {'root_dir': 'src/renal_biopsy', 'model_name': 'llama3.2:3b-instruct-q8_0', 'n_shots': 2, 'n_prototype': n_prototype, 'include_guidelines': True}

# The outer quotes must differ from the quotes used for the dict key: reusing
# the same quote character inside an f-string is a SyntaxError before Python 3.12.
eg = EntityGuidelines(f"{args['root_dir']}/data/guidelines.xlsx")
processor = RenalBiopsyProcessor(guidelines=eg)
input_json = processor.create_input_json(
    data_path=f"{args['root_dir']}/data/full_data.xlsx",
    save_path=f"{args['root_dir']}/data/real_input.json",
    full=True
)
annotated_json = load_json(f"{args['root_dir']}/data/output_report_first100.json")

# Use the root directory recorded in args rather than repeating the literal.
model = RenalBiopsyOllamaQA(model_path=model_path, root_dir=args['root_dir'])
generated_answers_json = load_json(f"{results_dir}/generated_answers.json") # generated_answers_modified.json"
predicted_json = model.convert_generated_answers_to_json(generated_answers=generated_answers_json, input_json=input_json, n_prototype=args['n_prototype'])

# save predicted JSON
predicted_json_path = os.path.join(results_dir, "predicted2.json")
save_json(predicted_json, predicted_json_path)
print(f"Predicted JSON saved to {predicted_json_path}")

# Prepare metadata storage
# The annotation timestamps stay None: this cell only re-runs the evaluation.
metadata = {
    "args": args,
    "annotation_start_time": None,
    "annotation_end_time": None,
    "evaluation_start_time": None,
    "evaluation_end_time": None,
    "score_per_report": None,
    "final_score": None
}

# Evaluate model
metadata["evaluation_start_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
all_scores, score_per_report, final_score = model.evaluate(annotated_json, predicted_json, n_prototypes=args['n_prototype'])
metadata["evaluation_end_time"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

# Save evaluation results
scores_path = os.path.join(results_dir, "evaluation_scores.json")
save_json(all_scores, scores_path)
print(f"Evaluation results saved to {scores_path}")


# Save final scores to metadata
metadata["score_per_report"] = score_per_report
metadata["final_score"] = final_score
# Save metadata to a text file
metadata_path = os.path.join(results_dir, "metadata.txt")
write_metadata_file(metadata_path, metadata)
print(f"Metadata saved to {metadata_path}")

# time: 33 mins
# errors = 1

Number of renal biopsy histopathology reports: 2462
Number of reports after SPECIMEN keyword filtering: 2128
Predicted JSON saved to src/renal_biopsy/data/runs/20241220_185154\predicted2.json


Evaluating predictions: 100%|███████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.34s/it]

Evaluation results saved to src/renal_biopsy/data/runs/20241220_185154\evaluation_scores.json
Metadata saved to src/renal_biopsy/data/runs/20241220_185154\metadata.txt





### Further Automation

In [None]:
import ollama
from src.automated_annotation.prompts import GUIDELINE_CREATION_TASK, GUIDELINE_EXAMPLE
from src.automated_annotation.prompts import ENTITY_IDENTIFICATION_TASK
from src.automated_annotation.prompts import SECTION_HEADER_IDENTIFIER_TASK
from src.automated_annotation.prompts import create_few_shots_prompt

# Build one text block per report for the first three entries of input_json.
report_strings_list = []
for i, report in enumerate(input_json):
    if i == 3:
        break
    report_strings_list.append(
        f"""--- PATIENT {i}'S REPORT ---
        "MICROSCOPY SECTION: {report['microscopy_section']}
        CONCLUSION SECTION: {report['conclusion_section']}"
        """
    )

report_strings = "\n".join(report_strings_list)


guideline_creation_prompt = f"{GUIDELINE_CREATION_TASK} \n {report_strings}"
# entity_identification_prompt = f"{ENTITY_IDENTIFICATION_TASK}\n --- REPORTS ---\n {report_strings}"
# section_identifier_prompt = f"{SECTION_HEADER_IDENTIFIER_TASK}\n --- REPORTS ---\n {report_strings}"
# The few-shot variant is disabled like the two prompts above. It used to be a
# bare `name = # ...` assignment, which is a SyntaxError and stopped the cell.
# few_shot_generation_prompt = create_few_shots_prompt(3) # num_predict=1500

# gemma2:2b-instruct-fp16
# llama3.2:3b-instruct-q8_0
answer = ollama.generate(
    model="llama3.2:3b-instruct-q8_0",
    prompt=guideline_creation_prompt,
    options={'temperature': 0, 'num_predict': 800}
)

from src.utils.general import wrap_text
# print(wrap_text(guideline_creation_prompt, 100))
print(wrap_text(answer['response'], 100))

In [None]:
### finding common words across all reports
# - could probably do this with a prompt in some way (and produce fig along the way)

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from collections import Counter
import spacy

def plot_word_frequencies(segmented_reports, n_most_common=40):
    """Count alphabetic, non-stop-word tokens across reports and plot the top ones.

    Args:
        segmented_reports: Iterable of report texts. Entries that are not
            strings (for example NaN from blank spreadsheet cells) are skipped
            instead of raising on ``.lower()``.
        n_most_common: Number of most frequent words to keep and plot.

    Returns:
        DataFrame with columns 'Word' and 'Frequency', holding the
        ``n_most_common`` most frequent words.
    """
    # Load spaCy model
    nlp = spacy.load("en_core_web_sm")

    # Custom stop words plus spaCy defaults
    stop_words = set(nlp.Defaults.stop_words).union({
        # standard
        'and', 'are', 'but', 'in', 'is', 'no', 'of',
        'the', 'there', 'with', 'seen', 'show', 'shows',
        'to', 'which',

        # medical NOTE: issue here is they might be section headers
        'biopsy', 'paediatric', 'clinical', 'pathology', 'consultant',
        'dr', 'measuring', 'changes',
    })

    # Tokenise each text and accumulate counts directly.
    word_freq = Counter()
    for text in segmented_reports:
        if not isinstance(text, str):
            # Non-text entries have no tokens to contribute.
            continue
        word_freq.update(
            token.text
            for token in nlp(text.lower())
            if token.is_alpha and token.text not in stop_words
        )

    # Convert to DataFrame
    df = pd.DataFrame(word_freq.most_common(n_most_common),
                      columns=['Word', 'Frequency'])

    # Plot
    plt.figure(figsize=(10, 6))
    sns.barplot(data=df, x='Frequency', y='Word', color='blue')
    # The title reports the number of entries passed in, including skipped ones.
    plt.title(f"Top {n_most_common} Words (Total Reports: {len(segmented_reports)})")
    plt.tight_layout()
    plt.show()

    return df

# Read and process data
# NOTE: this rebinds `segmented_reports` (here: the raw values of the 'content'
# column) and `word_freq_df`, both of which are also assigned by the EDA cells
# earlier in the notebook; re-running those cells afterwards sees these values.
sample_data = pd.read_excel("src/renal_biopsy/data/full_data.xlsx")
segmented_reports = sample_data['content'].tolist()

# Generate plot
word_freq_df = plot_word_frequencies(segmented_reports, n_most_common=40)

In [None]:
### analyse post-disagreement analysis
# - less relevant because if I make the comments, I can probably see the pattern. want llm to do itself without comments

import json
from collections import defaultdict

def analyse_json_by_entity(data):
    """Group commented prediction comparisons by entity name.

    Args:
        data: Mapping of report id -> mapping of entity name -> comparison
            fields. An entry is collected only when it has a "comment" key, in
            which case "pred1", "pred2" and "match" are read as well.

    Returns:
        defaultdict(list) mapping each entity name to a list of records with
        keys "report_id", "comment", "pred1", "pred2" and "match", in the
        order the reports were visited.
    """
    copied_fields = ("comment", "pred1", "pred2", "match")
    grouped = defaultdict(list)

    for rid, entities in data.items():
        for name, fields in entities.items():
            # Entries without a reviewer comment carry nothing to analyse.
            if "comment" not in fields:
                continue
            record = {"report_id": rid}
            record.update({key: fields[key] for key in copied_fields})
            grouped[name].append(record)

    return grouped

def generate_analysis_prompt(entity_comments):
    """Render the per-entity mismatch comments as a single LLM prompt.

    Args:
        entity_comments: Mapping of entity name -> list of records with keys
            "report_id", "comment", "pred1", "pred2" and "match".

    Returns:
        The prompt string: a fixed header followed, for every entity, by its
        listed issues and the request for error patterns, validation rules and
        guideline refinements. The request is repeated once per entity.
    """
    # The embedded runs of spaces are part of the prompt text and are kept as-is.
    closing_request = (
        "\nPlease analyse these patterns and suggest:\n\n"
        "        1. Common error patterns in the predictions\n\n"
        "        2. Specific rules or validation checks that could be implemented\n\n"
        "        3. Data quality or annotation guidelines that might need refinement\n\n\n"
        "        "
    )

    pieces = ["Based on the analysis of prediction mismatches, here are the issues by entity type:\n\n"]
    for entity, comments in entity_comments.items():
        pieces.append(f"Entity: {entity}\n\n        Issues observed:\n")
        for item in comments:
            pieces.append(f"- Report {item['report_id']}: {item['comment']}\n")
            pieces.append(f"  Pred1: {item['pred1']}, Pred2: {item['pred2']}, Match: {item['match']}\n")
        pieces.append(closing_request)

    return "".join(pieces)

# Example usage
# Hand-written sample showing the input shape (report id -> entity -> fields).
# NOTE: it is replaced by the load_json call below, so it never reaches the analysis.
data = {
    "report_0": {
        "medulla_present": {
            "pred1": "False",
            "pred2": "True",
            "match": False,
            "comment": "Cortex is mentioned and medulla not mentioned. Therefore this should be false"
        }
    },
    "report_1": {
        "n_total": {
            "pred1": "1",
            "pred2": "null",
            "match": False,
            "comment": "Wrong null"
        },
        "n_segmental": {
            "pred1": "0",
            "pred2": "null",
            "match": False,
            "comment": "Should put 0"
        }
    }
}

# NOTE: load_json and ollama are not imported in this cell; they must already
# be in scope from earlier cells.
data = load_json(f"src/renal_biopsy/data/runs/20241215_164609/comparison_comments_initial.json")

# Process the data
entity_comments = analyse_json_by_entity(data)

# Generate analysis prompt
entity_mismatch_analysis_prompt = generate_analysis_prompt(entity_comments)
print(entity_mismatch_analysis_prompt)

# gemma2:2b-instruct-fp16
# llama3.2:3b-instruct-q8_0
# temperature 0 with at most 800 generated tokens (num_predict).
answer = ollama.generate(
    model="llama3.2:3b-instruct-q8_0",
    prompt=entity_mismatch_analysis_prompt,
    options={'temperature': 0, 'num_predict': 800}
)

from src.utils.general import wrap_text
print(wrap_text(answer['response'], 100))

### LLM-as-a-Judge (LAAJ)
- TODO: move this section into a standalone script once it is finalised.

In [None]:
from functools import partial
from src.evaluate.single_laaj_experiment import LAAJExperiment
from src.evaluate.utils import use_llm_to_compare


def run_experiment(model_name='qwen2.5:1.5b-instruct-fp16', save_files=False):
    """Run the single-judge LAAJ experiment with the given model as judge.

    Args:
        model_name: Model identifier bound as the ``model`` argument of
            ``use_llm_to_compare``.
        save_files: When True, write the trial results (CSV), the metrics
            (JSON) and the figure (PNG) into the current working directory.

    Returns:
        Tuple ``(results_df, metrics)`` as produced by ``run_trials`` and
        ``analyse_results``.
    """
    # Imported here because this cell does not import json itself; relying on
    # an import from an unrelated cell raises NameError on a fresh kernel.
    import json

    experiment = LAAJExperiment(partial(use_llm_to_compare, model=model_name))
    results_df = experiment.run_trials()
    metrics = experiment.analyse_results(results_df)
    fig = experiment.plot_results(metrics)

    # Save results
    if save_files:
        results_df.to_csv('llm_judge_results.csv')
        with open('llm_judge_metrics.json', 'w') as f:
            json.dump(metrics, f, indent=2)
        fig.savefig('llm_judge_results.png')

    return results_df, metrics

results_df, metrics = run_experiment('gemma2:2b-instruct-fp16')

In [None]:
import ollama

def use_llm_to_compare(entity1: str, entity2: str, model: str = 'gemma2:2b', provider: str = 'ollama') -> bool:
    """Ask an LLM whether two phrases are the same, synonyms, or similar.

    Args:
        entity1: First phrase.
        entity2: Second phrase.
        model: Model identifier passed to the provider.
        provider: Backend to use; only 'ollama' is supported.

    Returns:
        True when the model's reply contains the text "True".

    Raises:
        ValueError: If ``provider`` is anything other than 'ollama'.
    """
    query = f"""Are the phrases "{entity1}" and "{entity2}" the exact same, synonyms, or similar phrases? 
    Allow some deviation in phrasing if necessary. Only answer True or False."""

    if provider != 'ollama':
        raise ValueError(f"Unsupported provider: {provider}")

    # num_predict=2 keeps the reply to a couple of tokens.
    reply_text = ollama.generate(
        model=model,
        prompt=query,
        options={'temperature': 0, 'num_predict': 2}
    )['response']
    print(reply_text)
    return "True" in reply_text

# Smoke call with two identical phrases; the returned bool is the cell's output.
use_llm_to_compare("moderate", "moderate", 'gemma2:2b-instruct-fp16')

In [None]:
from functools import partial
from src.evaluate.multi_laaj_experiment import MultiLAAJExperiment
from src.evaluate.utils import use_llm_to_compare

def run_experiment(llm_judges, save_files=False):
    """Run the multi-judge LAAJ experiment.

    Args:
        llm_judges: Passed straight to ``MultiLAAJExperiment``.
        save_files: When True, write the trial results (CSV) and the figure
            (PNG) into the current working directory.

    Returns:
        Tuple ``(results_df, metrics)`` as produced by ``run_trials`` and
        ``analyse_results``.
    """
    multi_experiment = MultiLAAJExperiment(llm_judges)
    trial_results = multi_experiment.run_trials()
    summary = multi_experiment.analyse_results(trial_results)
    figure = multi_experiment.plot_results(summary)

    if save_files:
        # Persist the raw trials and the rendered figure side by side.
        trial_results.to_csv('multi_llm_results.csv')
        figure.savefig('multi_llm_results.png')

    return trial_results, summary

# Define model configurations
# Each judge is use_llm_to_compare with its `model` argument pre-bound, so the
# experiment only has to supply the two phrases.
llm_judges = {
    'smol0.36b': partial(use_llm_to_compare, model='smollm:360m-instruct-v0.2-fp16'), 
    'qwen0.5b': partial(use_llm_to_compare, model='qwen2.5:0.5b-instruct-fp16'),
    'qwen1.5b': partial(use_llm_to_compare, model='qwen2.5:1.5b-instruct-fp16'),
    'gemma2b': partial(use_llm_to_compare, model='gemma2:2b-instruct-fp16')
}

# save_files=True writes multi_llm_results.csv / .png to the working directory.
results_df, metrics = run_experiment(llm_judges, save_files=True)

In [None]:
## maybe i should check llm as a judge
# "Mild chronic allograft nephropathy" vs "Mild chronic changes only"
from src.evaluate.laaj import use_llm_to_compare

# (real_string, predicted_string)
# Trailing comments record the judge's observed verdict and whether it was right.
chronic_change_pairs = [
    ("0", "mild"), # false... correct
    ("Marked", "mild") # false... correct
]

# (real_string, predicted_string)
# NOTE: this list is defined but not evaluated by the loop at the end of the cell.
diagnosis_string_pairs = [
    ("Mild chronic allograft nephropathy", "Mild chronic changes only"), # true
    ( "MARKED CHRONIC CHANGES WITH SEVERE CHRONIC VASCULAR CHANGE", "Marked chronic changes with severe chronic vasc   ular change"), # true
    ("No rejection, mild chronic allograft nephropathy", "Mild chronic allograft nephropathy"), # this is true... should it be?
    ("Borderline rejection", "Borderline rejection changes"), # this is false... wrong
    ( "Borderline rejection, mild chronic allograft nephropathy", "Borderline acute rejection and mild chronic allograft nephropathy"), # false... wrong
    ("CHRONIC ALLOGRAFT NEPHROPATHY", "Severe chronic allograft nephropathy"), # true... should it be?
    ("Acute rejection 1A", "Mild chronic changes with superimposed acute cellular rejection, grade 1A"), # true... correct
    ("No rejection, mild chronic allograft nephropathy", "mild chronic allograft nephropathy and mild cyclosporin effect"), # true... should it be?
    ("Very mild borderline rejection", "Very mild borderline acute rejection changes only") # true
]
# a lot of this is making me think we need separate strings for rejection type and final diagnosis with them left blank if issues

# NOTE(review): llama_cpp=False is not a parameter of the use_llm_to_compare
# defined earlier in this notebook (it takes model/provider); this call relies on
# the src.evaluate.laaj variant imported at the top of this cell — confirm its signature.
for real_string, pred_string in chronic_change_pairs:
    print(f"{use_llm_to_compare(real_string, pred_string, llama_cpp=False)} for \t \"{real_string}\" similar to \"{pred_string}\"")

### Corpus Statistics

In [None]:
# plotting summary statistics for the corpus

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List, Dict

def plot_corpus_statistics(data: List[Dict]):
    """Plot a 2x2 overview of the extracted entities.

    Panels: glomeruli-count histograms, boolean feature counts, chronic-change
    counts and diagnosis counts.

    Args:
        data: List of per-report dicts containing the keys 'n_total',
            'n_segmental', 'n_global', 'cortex_present', 'medulla_present',
            'abnormal_glomeruli', 'transplant', 'chronic_change' and 'diagnosis'.

    Returns:
        The matplotlib Figure holding the four panels.
    """
    df = pd.DataFrame(data)
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))

    # Numeric distributions
    counts_ax = axes[0, 0]
    numeric_cols = ['n_total', 'n_segmental', 'n_global']
    for col in numeric_cols:
        # Label each histogram itself: legend(list_of_labels) would attach the
        # labels to the first few bar patches instead of to the three series.
        sns.histplot(data=df, x=col, ax=counts_ax, label=col)
    counts_ax.set_title('Distribution of Glomeruli Counts')
    # The three series share one axis, so a single column name would mislead.
    counts_ax.set_xlabel('Number of glomeruli')
    counts_ax.legend()

    # Boolean distributions
    bool_cols = ['cortex_present', 'medulla_present', 'abnormal_glomeruli', 'transplant']
    # value_counts per column yields a frame (values x features); transpose so
    # each feature is one bar group and fill values a feature never takes with 0.
    bool_df = df[bool_cols].apply(lambda column: column.value_counts()).T.fillna(0)
    bool_df.plot(kind='bar', ax=axes[0, 1])
    axes[0, 1].set_title('Distribution of Binary Features')
    axes[0, 1].set_ylabel('Count')

    # Chronic change distribution
    sns.countplot(data=df, x='chronic_change', ax=axes[1, 0])
    axes[1, 0].set_title('Distribution of Chronic Change')
    axes[1, 0].tick_params(axis='x', labelrotation=45)

    # Diagnosis distribution
    sns.countplot(data=df, y='diagnosis', ax=axes[1, 1])
    axes[1, 1].set_title('Distribution of Diagnoses')

    plt.tight_layout()
    return fig

def generate_summary_stats(data: List[Dict]) -> tuple:
    """Compute tabular summaries of the extracted entities.

    Args:
        data: List of per-report dicts containing the keys 'n_total',
            'n_segmental', 'n_global', 'chronic_change', 'diagnosis',
            'cortex_present', 'medulla_present', 'abnormal_glomeruli' and
            'transplant'.

    Returns:
        A 3-tuple ``(numeric_stats, categorical_counts, boolean_counts)``:
        ``numeric_stats`` is the ``describe()`` frame of the three count
        columns; the other two map column name -> {value: count}.
    """
    df = pd.DataFrame(data)

    def _value_counts(columns):
        # One {value: count} dict per requested column.
        return {col: df[col].value_counts().to_dict() for col in columns}

    numeric_stats = df[['n_total', 'n_segmental', 'n_global']].describe()
    categorical_counts = _value_counts(['chronic_change', 'diagnosis'])
    boolean_counts = _value_counts(
        ['cortex_present', 'medulla_present', 'abnormal_glomeruli', 'transplant']
    )

    return numeric_stats, categorical_counts, boolean_counts

# Usage
def analyse_corpus(data: List[Dict]):
    """Produce the overview figure and the summary tables for a corpus.

    Args:
        data: List of per-report dicts, forwarded unchanged to
            ``plot_corpus_statistics`` and ``generate_summary_stats``.

    Returns:
        Tuple ``(fig, numeric_stats, cat_counts, bool_counts)``.
    """
    figure = plot_corpus_statistics(data)
    # Figure first, then the three summary objects in their original order.
    return (figure, *generate_summary_stats(data))

# NOTE: predicted_json is not defined in this cell; it must already be in scope
# from an earlier cell. Only its first 10 entries are analysed here.
fig, nums, cats, bools = analyse_corpus(predicted_json[0:10])
# The figure is written to the current working directory.
fig.savefig('corpus_stats.png')
print("Numeric statistics:\n", nums)
print("\nCategory distributions:\n", cats)
print("\nBoolean distributions:\n", bools)

### Classifier

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import pandas as pd
import numpy as np
from typing import List, Dict
import ollama

class KidneyBiopsyPredictor:
    """Random-forest classifier predicting transplant status from extracted entities."""

    # Accepted string spellings for boolean fields, compared case-insensitively.
    _BOOL_STRINGS = {'true': True, 'false': False}

    def __init__(self):
        self.label_encoder = LabelEncoder()
        self.rf_model = RandomForestClassifier(random_state=42)
        self.feature_cols = [
            'cortex_present', 'medulla_present', 'n_total',
            'n_segmental', 'n_global', 'abnormal_glomeruli',
            'chronic_change_encoded'
        ]

    @classmethod
    def _to_bool(cls, value):
        """Map real booleans and 'True'/'False' strings to bool; anything else to None."""
        if isinstance(value, (bool, np.bool_)):
            return bool(value)
        if isinstance(value, str):
            return cls._BOOL_STRINGS.get(value.strip().lower())
        return None

    def prepare_features(self, data: List[Dict]) -> pd.DataFrame:
        """Convert the raw entity dicts into a typed feature frame.

        Args:
            data: List of per-report dicts with the keys 'cortex_present',
                'medulla_present', 'abnormal_glomeruli', 'n_total',
                'n_segmental', 'n_global' and 'chronic_change'.

        Returns:
            DataFrame of the input with boolean columns parsed, count columns
            numeric (unparseable values such as "null" become NaN rather than
            raising) and an added 'chronic_change_encoded' column.
        """
        df = pd.DataFrame(data)

        # Accept real booleans as well as their string spellings.
        bool_cols = ['cortex_present', 'medulla_present', 'abnormal_glomeruli']
        for col in bool_cols:
            df[col] = df[col].map(self._to_bool)

        # Convert numeric strings to numbers; coerce keeps bad cells as NaN.
        num_cols = ['n_total', 'n_segmental', 'n_global']
        for col in num_cols:
            df[col] = pd.to_numeric(df[col], errors='coerce')

        # Handle chronic_change as categorical. Casting to str keeps the
        # encoder from failing on a mix of strings and missing values.
        df['chronic_change_encoded'] = self.label_encoder.fit_transform(
            df['chronic_change'].astype(str)
        )

        return df

    def train_transplant_predictor(self, data: List[Dict]):
        """Fit the random forest on an 80/20 split and report test performance.

        Rows with a missing feature or an unparseable 'transplant' value are
        dropped before splitting so they never reach the classifier.

        Args:
            data: List of per-report dicts; see ``prepare_features`` for the
                required keys, plus 'transplant'.

        Returns:
            The text report produced by ``classification_report`` on the test split.

        Raises:
            ValueError: If fewer than two usable rows remain.
        """
        df = self.prepare_features(data)
        df['transplant'] = df['transplant'].map(self._to_bool)

        usable = df.dropna(subset=self.feature_cols + ['transplant'])
        if len(usable) < 2:
            raise ValueError(
                f"Need at least 2 complete rows to train and test, got {len(usable)}"
            )

        X = usable[self.feature_cols].astype(float)
        y = usable['transplant'].astype(bool)

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        self.rf_model.fit(X_train, y_train)
        y_pred = self.rf_model.predict(X_test)

        return classification_report(y_test, y_pred)
class LLMDiagnosisPredictor:
    """Ask an Ollama model for a diagnosis given the extracted biopsy entities."""

    def __init__(self, model: str = 'gemma2:2b'):
        """Store the model identifier and the prompt template filled in by ``predict``."""
        self.model = model
        self.prompt_template = """Given a kidney biopsy report with the following findings:
        - Cortex present: {cortex}
        - Medulla present: {medulla}
        - Total glomeruli: {total}
        - Segmental sclerosis: {segmental}
        - Global sclerosis: {n_global}
        - Abnormal glomeruli: {abnormal}
        - Chronic change score: {chronic}
        - Transplant status: {transplant}

        What is the most likely diagnosis? Provide only the diagnosis, no explanation."""

    def predict(self, case: Dict) -> str:
        """Fill the template from ``case`` and return the model's stripped reply.

        Args:
            case: Dict with the keys 'cortex_present', 'medulla_present',
                'n_total', 'n_segmental', 'n_global', 'abnormal_glomeruli',
                'chronic_change' and 'transplant'.

        Returns:
            The generated response text with surrounding whitespace removed.
        """
        # Template placeholder -> key in the case dict.
        placeholder_keys = {
            'cortex': 'cortex_present',
            'medulla': 'medulla_present',
            'total': 'n_total',
            'segmental': 'n_segmental',
            'n_global': 'n_global',
            'abnormal': 'abnormal_glomeruli',
            'chronic': 'chronic_change',
            'transplant': 'transplant',
        }
        prompt = self.prompt_template.format(
            **{placeholder: case[key] for placeholder, key in placeholder_keys.items()}
        )

        reply = ollama.generate(
            model=self.model,
            prompt=prompt,
            options={'temperature': 0}
        )
        return reply['response'].strip()

    def evaluate(self, test_cases: List[Dict]) -> Dict:
        """Predict every case and score exact string matches against 'diagnosis'.

        Args:
            test_cases: List of case dicts, each also carrying a 'diagnosis' key.

        Returns:
            Dict with 'accuracy' (mean of exact-equality matches),
            'predictions' and 'actuals' (lists in input order).
        """
        # Predict first, then read the label, one case at a time.
        pairs = [(self.predict(case), case['diagnosis']) for case in test_cases]
        predictions = [predicted for predicted, _ in pairs]
        actuals = [actual for _, actual in pairs]

        return {
            'accuracy': np.mean([predicted == actual for predicted, actual in pairs]),
            'predictions': predictions,
            'actuals': actuals
        }

# Usage example
def run_prediction_experiments(data: List[Dict], n_samples: int = 10):
    """Run the transplant classifier and the LLM diagnosis predictor on a subset.

    Args:
        data: List of per-report entity dicts.
        n_samples: Number of leading entries used for both experiments.
            Defaults to 10; pass None to use every entry.

    Returns:
        Tuple ``(transplant_results, diagnosis_results)``: the classification
        report from ``KidneyBiopsyPredictor`` and the evaluation dict from
        ``LLMDiagnosisPredictor``.
    """
    subset = data if n_samples is None else data[:n_samples]

    # Traditional classifier for transplant prediction
    transplant_predictor = KidneyBiopsyPredictor()
    transplant_results = transplant_predictor.train_transplant_predictor(subset)

    # LLM classifier for diagnosis prediction
    llm_predictor = LLMDiagnosisPredictor()
    diagnosis_results = llm_predictor.evaluate(subset)  # Test on subset

    return transplant_results, diagnosis_results

from src.utils.json import load_json
# NOTE: this rebinds `results_dir` and `predicted_json`, which earlier cells
# also assign; cells re-run afterwards will see this run's values.
results_dir = "src/renal_biopsy/data/runs/20241214_010916" 
predicted_json = load_json(f"{results_dir}/predicted.json")
transplant_results, diagnosis_results = run_prediction_experiments(predicted_json)