In [1]:

from transformers import AutoModelForCausalLM, AutoTokenizer

import subprocess
import json
import torch
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from datasets import load_from_disk
from transformers import pipeline, StoppingCriteria

from datasets import load_from_disk
from transformers import pipeline, StoppingCriteria
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import evaluate
from collections import defaultdict
import json

In [2]:
# Cell 2: Create comparative generation function
def generate_comparative_answers(findings):
    """Run the same chat prompt through the base and the fine-tuned model.

    The models, tokenizers and the ``EosListStoppingCriteria`` class are read
    from notebook globals (``base_model``, ``base_tokenizer``, ``ft_model``,
    ``ft_tokenizer``), so they must already exist when this is called.

    Args:
        findings: Text placed in the user turn of the chat prompt.

    Returns:
        A dict with the keys ``'base'`` and ``'finetuned'``; each value is the
        whitespace-stripped text generated by the corresponding model.
    """
    system_prompt = """You are an expert radiologist assistant specializing in generating accurate and concise medical impressions from radiology
       findings.
    
      Your task is to:
      1. **Analyze the findings**: Carefully review all clinical findings, history, and technique information
      2. **Generate focused impressions**: Create clear, prioritized conclusions that directly address the clinical question
      3. **Maintain clinical accuracy**: Ensure all significant findings are appropriately characterized
      4. **Use appropriate medical terminology**: Follow standard radiological reporting conventions
      5. **Adapt communication style**: Match the institutional reporting style and level of detail expected
    
      Generate only the IMPRESSION section based on the provided clinical information."""

    chat = [
        dict(role="system", content=system_prompt),
        dict(role="user", content=findings),
    ]

    # One text-generation pipeline per model; the base pipeline is built
    # first, then the fine-tuned one (dicts keep insertion order).
    model_specs = (
        ("base", base_model, base_tokenizer),
        ("finetuned", ft_model, ft_tokenizer),
    )
    pipes = {
        label: pipeline("text-generation", model=model, tokenizer=tokenizer)
        for label, model, tokenizer in model_specs
    }

    # Decoding settings shared by both models (one stopping-criteria
    # instance is reused for both calls).
    decoding_kwargs = dict(
        max_new_tokens=300,
        return_full_text=False,
        temperature=0.0,
        do_sample=False,
        stopping_criteria=[EosListStoppingCriteria()],
    )

    # Generate with every pipeline before post-processing any of the outputs.
    raw_outputs = {label: pipe(chat, **decoding_kwargs) for label, pipe in pipes.items()}

    return {
        label: outputs[0]["generated_text"].strip()
        for label, outputs in raw_outputs.items()
    }

In [6]:
class ModalityBasedEvaluator:
    """ROUGE-based evaluation of generated radiology impressions, grouped by modality.

    On construction the evaluator loads a causal language model with its
    tokenizer and the ``test`` split of a dataset stored on disk. Test rows
    are read through the columns ``findings``, ``impression``, ``modality``,
    ``clinic_id`` and, for :meth:`evaluate_by_clinic_modality`,
    ``clinic_modality``.
    """

    # Lazily created caches. They are class-level defaults so the attributes
    # exist even on an instance whose ``__init__`` did not run.
    _pipe = None
    _rouge = None
    _stopping_criteria = None

    # Token-id sequence that ends generation.
    # NOTE(review): presumably the end-of-turn token id of the loaded
    # tokenizer -- confirm against the tokenizer's vocabulary.
    EOS_SEQUENCE = (32007,)

    def __init__(self,
                 dataset_path="./data/processed/radiology_datasets"
            ):
        """Load the model, then the dataset found at ``dataset_path``.

        Args:
            dataset_path: Directory passed to ``load_from_disk``; the loaded
                object must provide a ``'test'`` split.
        """

        self.load_model()
        self.load_dataset(dataset_path)
        self.results = defaultdict(list)

    def load_model(self):
        """Load the model and tokenizer used by :meth:`generate_impression`.

        NOTE(review): the progress message says "fine-tuned", but the weights
        come straight from the ``microsoft/MediPhi-Instruct`` checkpoint via
        ``AutoModelForCausalLM``; no adapter is applied here. Confirm whether
        a fine-tuned/adapter checkpoint was meant to be loaded instead.
        """
        print("Loading fine-tuned model...")
        base_model_name = "microsoft/MediPhi-Instruct"
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # model repository to run locally -- confirm it is required here.
        self.ft_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.bfloat16,
            device_map="auto",
            trust_remote_code=True
        )
        self.ft_tokenizer = AutoTokenizer.from_pretrained(
            base_model_name,
            trust_remote_code=True,
            padding_side="right"
        )
        # A pipeline built around a previously loaded model would be stale.
        self._pipe = None

    def load_dataset(self, dataset_path):
        """Load the dataset from disk and keep its ``test`` split.

        Sets ``self.dataset``, ``self.test_data`` (the raw split) and
        ``self.test_df`` (the same rows as a pandas DataFrame), then prints
        the sample count and the distinct modalities and clinics.

        Args:
            dataset_path: Directory passed to ``load_from_disk``.
        """
        print("Loading dataset...")
        self.dataset = load_from_disk(dataset_path)
        self.test_data = self.dataset['test']

        # Convert to pandas for easier analysis
        self.test_df = pd.DataFrame(self.test_data)

        print(f"Test dataset: {len(self.test_df)} samples")
        print(f"Modalities: {self.test_df['modality'].unique()}")
        print(f"Clinics: {self.test_df['clinic_id'].unique()}")

    def get_modality_distribution(self):
        """Summarise the test set per modality.

        Returns:
            A DataFrame indexed by modality with the columns ``sample_count``
            (non-null ``findings`` per modality) and ``clinic_count``
            (distinct ``clinic_id`` values per modality).
        """
        modality_stats = self.test_df.groupby('modality').agg({
            'findings': 'count',
            'clinic_id': 'nunique'
        }).rename(columns={'findings': 'sample_count', 'clinic_id': 'clinic_count'})

        return modality_stats

    def _get_pipeline(self):
        """Return the text-generation pipeline, building it on first use.

        Building the pipeline once instead of once per sample avoids
        repeating the setup work for every generated impression.
        """
        if self._pipe is None:
            self._pipe = pipeline(
                "text-generation",
                model=self.ft_model,
                tokenizer=self.ft_tokenizer,
            )
        return self._pipe

    def _get_stopping_criteria(self):
        """Return the stopping criterion, creating it on first use."""
        if self._stopping_criteria is None:
            # Declared lazily so the ``StoppingCriteria`` base class is only
            # looked up once generation is actually requested.
            class EosListStoppingCriteria(StoppingCriteria):
                """Stop as soon as any row of ``input_ids`` ends with ``eos_sequence``."""

                def __init__(self, eos_sequence):
                    # Stored as a list because it is compared against the
                    # nested lists produced by ``tolist()`` below.
                    self.eos_sequence = list(eos_sequence)

                def __call__(self, input_ids: "torch.LongTensor", scores: "torch.FloatTensor", **kwargs) -> bool:
                    last_ids = input_ids[:, -len(self.eos_sequence):].tolist()
                    return self.eos_sequence in last_ids

            self._stopping_criteria = EosListStoppingCriteria(self.EOS_SEQUENCE)
        return self._stopping_criteria

    def _get_rouge(self):
        """Return the ROUGE metric, loading it on first use."""
        if self._rouge is None:
            self._rouge = evaluate.load("rouge")
        return self._rouge

    def generate_impression(self, findings):
        """Generate an impression for ``findings`` with the loaded model.

        Args:
            findings: Text sent as the user turn of the chat prompt.

        Returns:
            The generated text (``return_full_text=False`` is requested, at
            most 300 new tokens) with surrounding whitespace stripped.
        """
        system_message = """You are an expert radiologist assistant specializing in generating accurate and concise medical impressions from radiology findings.

        Your task is to:
        1. **Analyze the findings**: Carefully review all clinical findings
        2. **Generate focused impressions**: Create clear, prioritized conclusions
        3. **Maintain clinical accuracy**: Ensure all significant findings are characterized
        4. **Use appropriate medical terminology**: Follow standard radiological conventions

        Generate only the IMPRESSION section based on the provided clinical information."""

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": findings},
        ]

        generation_args = {
            "max_new_tokens": 300,
            "return_full_text": False,
            "temperature": 0.0,
            "do_sample": False,
            "stopping_criteria": [self._get_stopping_criteria()]
        }

        output = self._get_pipeline()(messages, **generation_args)
        return output[0]['generated_text'].strip()

    def evaluate_sample(self, idx):
        """Generate an impression for one test row and score it with ROUGE.

        Args:
            idx: Position of the row in ``self.test_data``; NumPy integers
                are accepted and converted to ``int``.

        Returns:
            A dict with the row's metadata (``sample_idx``, ``modality``,
            ``clinic_id``), the three texts (``findings``, ``reference``,
            ``prediction``), the scores ``rouge1``/``rouge2``/``rougeL`` and
            the character lengths of the three texts.
        """
        # Convert numpy int64 to Python int
        idx = int(idx)
        sample = self.test_data[idx]

        findings = sample['findings']
        reference = sample['impression']
        modality = sample['modality']
        clinic_id = sample['clinic_id']

        # Generate prediction
        prediction = self.generate_impression(findings)

        # Compute ROUGE scores (the metric object is loaded once and reused)
        rouge_scores = self._get_rouge().compute(
            predictions=[prediction],
            references=[reference],
            rouge_types=["rouge1", "rouge2", "rougeL"]
        )

        return {
            'sample_idx': idx,
            'modality': modality,
            'clinic_id': clinic_id,
            'findings': findings,
            'reference': reference,
            'prediction': prediction,
            'rouge1': rouge_scores['rouge1'],
            'rouge2': rouge_scores['rouge2'],
            'rougeL': rouge_scores['rougeL'],
            'findings_length': len(findings),
            'reference_length': len(reference),
            'prediction_length': len(prediction)
        }

    def _evaluate_groups(self, column, samples_per_group, random_seed, banner, skip_empty):
        """Sample rows per distinct value of ``column`` and evaluate them.

        Args:
            column: Name of the ``self.test_df`` column to group by.
            samples_per_group: Upper bound on rows drawn per group.
            random_seed: Seed passed to ``np.random.seed`` before sampling.
            banner: Text printed in front of each group value.
            skip_empty: When true, groups with nothing to sample get no
                entry in the result; otherwise they map to an empty list.

        Returns:
            Dict mapping each group value to a list of
            :meth:`evaluate_sample` results.
        """
        np.random.seed(random_seed)

        group_results = {}
        column_values = self.test_df[column]

        for group in column_values.unique():
            print(f"\n{banner}: {group}")

            # Row positions (not index labels) of this group; they double as
            # indices into ``self.test_data``. One vectorised comparison
            # replaces a per-row ``iloc`` scan.
            group_positions = np.flatnonzero((column_values == group).to_numpy())

            n_samples = min(samples_per_group, len(group_positions))
            if skip_empty and n_samples == 0:
                continue

            sampled_indices = np.random.choice(
                group_positions,
                size=n_samples,
                replace=False
            )

            group_results[group] = []

            for i, idx in enumerate(sampled_indices):
                print(f"  Processing sample {i+1}/{n_samples}...")
                group_results[group].append(self.evaluate_sample(idx))

        return group_results

    def evaluate_by_modality(self, samples_per_modality=10, random_seed=42):
        """Evaluate a random subset of test rows for every modality.

        Args:
            samples_per_modality: Maximum rows drawn (without replacement)
                per modality; smaller modalities contribute all their rows.
            random_seed: Seed for NumPy's global random state.

        Returns:
            Dict mapping modality to a list of :meth:`evaluate_sample` results.
        """
        return self._evaluate_groups(
            'modality',
            samples_per_modality,
            random_seed,
            banner="🔬 Evaluating modality",
            skip_empty=False,
        )

    def evaluate_by_clinic_modality(self, samples_per_combination=5, random_seed=42):
        """Evaluate a random subset of test rows for every clinic-modality value.

        Args:
            samples_per_combination: Maximum rows drawn (without replacement)
                per distinct ``clinic_modality`` value.
            random_seed: Seed for NumPy's global random state (default 42,
                the value that used to be hard-coded).

        Returns:
            Dict mapping each combination to a list of
            :meth:`evaluate_sample` results; combinations with nothing to
            sample are omitted.
        """
        return self._evaluate_groups(
            'clinic_modality',
            samples_per_combination,
            random_seed,
            banner="🏥 Evaluating combination",
            skip_empty=True,
        )

    def compute_aggregate_metrics(self, results):
        """Aggregate per-sample results into per-category statistics.

        Args:
            results: Dict mapping a category to a list of
                :meth:`evaluate_sample` result dicts.

        Returns:
            Dict mapping each non-empty category to its ``sample_count``,
            mean and (population) standard deviation of ``rouge1``,
            ``rouge2`` and ``rougeL``, and the mean text lengths.
            Categories with no samples are left out.
        """
        aggregated = {}

        for category, samples in results.items():
            if not samples:
                continue

            metrics = {'sample_count': len(samples)}
            for rouge_key in ('rouge1', 'rouge2', 'rougeL'):
                scores = [s[rouge_key] for s in samples]
                metrics[f'{rouge_key}_mean'] = np.mean(scores)
                metrics[f'{rouge_key}_std'] = np.std(scores)
            for text_key in ('findings', 'reference', 'prediction'):
                metrics[f'avg_{text_key}_length'] = np.mean([s[f'{text_key}_length'] for s in samples])

            aggregated[category] = metrics

        return aggregated

    @staticmethod
    def _mean_or_none(values):
        """Return ``np.mean(values)``, or ``None`` when ``values`` is empty."""
        return np.mean(values) if values else None

    @staticmethod
    def _print_metrics_table(metrics):
        """Print the headline columns of an aggregate-metrics dict."""
        if not metrics:
            # An empty dict yields a frame without columns, so selecting the
            # headline columns would raise KeyError.
            print("No samples were evaluated.")
            return
        metrics_df = pd.DataFrame(metrics).T
        metrics_df = metrics_df.round(4)
        print(metrics_df[['sample_count', 'rouge1_mean', 'rouge2_mean', 'rougeL_mean']])

    def generate_evaluation_report(self, modality_results, combination_results=None):
        """Print a summary of the evaluation and return it as a dict.

        Args:
            modality_results: Output of :meth:`evaluate_by_modality`.
            combination_results: Optional output of
                :meth:`evaluate_by_clinic_modality`. When given, its
                aggregate metrics are printed and returned as well.

        Returns:
            Dict with ``'modality_metrics'``, ``'overall_metrics'`` and
            ``'detailed_results'``; ``'combination_metrics'`` is added only
            when ``combination_results`` is provided. With no evaluated
            samples the overall ROUGE values are ``None``.
        """

        print("\n" + "="*80)
        print("📊 SYSTEMATIC EVALUATION REPORT")
        print("="*80)

        # Modality-based metrics
        modality_metrics = self.compute_aggregate_metrics(modality_results)

        print("\n🔬 MODALITY-BASED PERFORMANCE")
        print("-" * 50)
        self._print_metrics_table(modality_metrics)

        # Overall performance
        all_samples = [s for samples in modality_results.values() for s in samples]

        overall_metrics = {
            'total_samples': len(all_samples),
            'overall_rouge1': self._mean_or_none([s['rouge1'] for s in all_samples]),
            'overall_rouge2': self._mean_or_none([s['rouge2'] for s in all_samples]),
            'overall_rougeL': self._mean_or_none([s['rougeL'] for s in all_samples])
        }

        print(f"\n📈 OVERALL PERFORMANCE")
        print("-" * 30)
        print(f"Total samples evaluated: {overall_metrics['total_samples']}")
        if all_samples:
            print(f"Average ROUGE-1: {overall_metrics['overall_rouge1']:.4f}")
            print(f"Average ROUGE-2: {overall_metrics['overall_rouge2']:.4f}")
            print(f"Average ROUGE-L: {overall_metrics['overall_rougeL']:.4f}")

        # Best and worst performing modalities (undefined without any metrics)
        if modality_metrics:
            rouge1_by_modality = {mod: metrics['rouge1_mean'] for mod, metrics in modality_metrics.items()}
            best_modality = max(rouge1_by_modality, key=rouge1_by_modality.get)
            worst_modality = min(rouge1_by_modality, key=rouge1_by_modality.get)

            print(f"\n🏆 Best performing modality: {best_modality} (ROUGE-1: {rouge1_by_modality[best_modality]:.4f})")
            print(f"⚠️  Lowest performing modality: {worst_modality} (ROUGE-1: {rouge1_by_modality[worst_modality]:.4f})")

        report = {
            'modality_metrics': modality_metrics,
            'overall_metrics': overall_metrics,
            'detailed_results': modality_results
        }

        if combination_results is not None:
            combination_metrics = self.compute_aggregate_metrics(combination_results)
            print("\n🏥 CLINIC-MODALITY PERFORMANCE")
            print("-" * 50)
            self._print_metrics_table(combination_metrics)
            report['combination_metrics'] = combination_metrics

        return report

    @staticmethod
    def _json_default(value):
        """Convert NumPy scalars/arrays that ``json`` cannot encode natively."""
        if isinstance(value, np.generic):
            return value.item()
        if isinstance(value, np.ndarray):
            return value.tolist()
        raise TypeError(f"Object of type {type(value).__name__} is not JSON serializable")

    def save_results(self, results, filename="systematic_evaluation_results.json"):
        """Write ``results`` to ``filename`` as indented JSON.

        NumPy scalars and arrays inside ``results`` are converted to plain
        Python values; other unsupported types still raise ``TypeError``.

        Args:
            results: JSON-compatible structure, e.g. the dict returned by
                :meth:`generate_evaluation_report`.
            filename: Destination path; an existing file is overwritten.
        """
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, default=self._json_default)
        print(f"\n💾 Results saved to {filename}")


In [9]:
# Driver cell: build the evaluator, run the per-modality evaluation, then
# report and persist the results.
# Constructing the evaluator loads the model and the test split from the
# default dataset path.
evaluator = ModalityBasedEvaluator()
# Show dataset distribution (sample and clinic counts per modality)
print("\n📊 Dataset Distribution:")
print(evaluator.get_modality_distribution())

# Run systematic evaluation: up to 20 randomly drawn samples per modality
# (modalities with fewer rows use all of them); the default random_seed=42 applies.
print("\n🚀 Starting systematic evaluation...")
modality_results = evaluator.evaluate_by_modality(samples_per_modality=20)

# Generate report: prints the per-modality and overall ROUGE summary and
# returns the same numbers together with the per-sample results.
final_results = evaluator.generate_evaluation_report(modality_results)

# Save results as JSON under the default filename
# (systematic_evaluation_results.json in the working directory).
evaluator.save_results(final_results)

Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cuda:0


Loading dataset...
Test dataset: 1915 samples
Modalities: ['MR' 'CT' 'XR' 'CR' 'US' 'NM' 'nan' 'OTHER']
Clinics: ['clinic_1' 'clinic_3' 'clinic_6' 'clinic_5' 'clinic_4' 'clinic_2' 'nan']

📊 Dataset Distribution:
          sample_count  clinic_count
modality                            
CR                 214             4
CT                 243             6
MR                1194             6
NM                  16             3
OTHER                2             2
US                  38             4
XR                 104             2
nan                104             1

🚀 Starting systematic evaluation...

🔬 Evaluating modality: MR
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: CT
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: XR
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: CR
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: US
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: NM
  Processing sample 1/16...


Device set to use cuda:0


  Processing sample 2/16...


Device set to use cuda:0


  Processing sample 3/16...


Device set to use cuda:0


  Processing sample 4/16...


Device set to use cuda:0


  Processing sample 5/16...


Device set to use cuda:0


  Processing sample 6/16...


Device set to use cuda:0


  Processing sample 7/16...


Device set to use cuda:0


  Processing sample 8/16...


Device set to use cuda:0


  Processing sample 9/16...


Device set to use cuda:0


  Processing sample 10/16...


Device set to use cuda:0


  Processing sample 11/16...


Device set to use cuda:0


  Processing sample 12/16...


Device set to use cuda:0


  Processing sample 13/16...


Device set to use cuda:0


  Processing sample 14/16...


Device set to use cuda:0


  Processing sample 15/16...


Device set to use cuda:0


  Processing sample 16/16...


Device set to use cuda:0



🔬 Evaluating modality: nan
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: OTHER
  Processing sample 1/2...


Device set to use cuda:0


  Processing sample 2/2...

📊 SYSTEMATIC EVALUATION REPORT

🔬 MODALITY-BASED PERFORMANCE
--------------------------------------------------
       sample_count  rouge1_mean  rouge2_mean  rougeL_mean
MR             20.0       0.4642       0.2927       0.3490
CT             20.0       0.2836       0.1013       0.2101
XR             20.0       0.2859       0.1521       0.2437
CR             20.0       0.3283       0.1694       0.2754
US             20.0       0.3073       0.1230       0.2542
NM             16.0       0.3440       0.1825       0.2849
nan            20.0       0.4186       0.2490       0.3082
OTHER           2.0       0.2745       0.0833       0.1331

📈 OVERALL PERFORMANCE
------------------------------
Total samples evaluated: 138
Average ROUGE-1: 0.3465
Average ROUGE-2: 0.1800
Average ROUGE-L: 0.2727

🏆 Best performing modality: MR (ROUGE-1: 0.4642)
⚠️  Lowest performing modality: OTHER (ROUGE-1: 0.2745)

💾 Results saved to systematic_evaluation_results.json
