In [1]:
import subprocess
import json
import torch
import numpy as np
import pandas as pd
from datasets import Dataset, DatasetDict
from datasets import load_from_disk
from transformers import pipeline, StoppingCriteria

from datasets import load_from_disk
from transformers import pipeline, StoppingCriteria
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer
import evaluate
from collections import defaultdict
import json

In [2]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

ft_model = AutoPeftModelForCausalLM.from_pretrained(
    "./lora_adapter",
    torch_dtype="auto",
    device_map="auto"
)
ft_tokenizer = AutoTokenizer.from_pretrained("./lora_adapter")

`torch_dtype` is deprecated! Use `dtype` instead!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
# dataset = load_from_disk("./dataset")
processed_data_path = "./data/processed/radiology_datasets"
dataset = load_from_disk(processed_data_path)

In [4]:
dataset

DatasetDict({
    train: Dataset({
        features: ['findings', 'impression', 'text', 'clinic_id', 'modality', 'clinic_modality'],
        num_rows: 8865
    })
    validation: Dataset({
        features: ['findings', 'impression', 'text', 'clinic_id', 'modality', 'clinic_modality'],
        num_rows: 1901
    })
    test: Dataset({
        features: ['findings', 'impression', 'text', 'clinic_id', 'modality', 'clinic_modality'],
        num_rows: 1915
    })
})

In [5]:
findings = dataset['test'][0]['findings']
impression = dataset['test'][0]['impression']

print(f"Findings: {findings}\n")
print(f"Impressions: {impression}")

Findings: [CLINIC: clinic_1] [MODALITY: MR] FINDINGS: No abnormality along the sacral plexus presacral. Left sciatic nerve is normal in the greater sciatic foramen to the mid thigh. There is no mass or compression or edema along this nerve. Beginning distal mid thigh axial 36 and 35, there is intense edema of the peroneal branch of the sciatic nerve extending to the inferior margin lateral femoral condyle level left side only. There is no soft tissue mass or cyst along this nerve. No notable edema at the level of the fibular head. There is muscle denervation edema anterior and peroneal muscle compartments of the proximal leg left side only. No abnormality along the tibiofibular joint and specifically no marginating cyst. There is no other muscle denervation edema. The tibial branch of this left sciatic nerve is normal. Remaining muscles are normal. Left hamstring origin is intact with no marginating inflammation. Sacrum and sacroiliac joints are normal. Pubic symphysis has no marginati

In [11]:
def generate_answer(findings):
    # https://huggingface.co/microsoft/MediPhi-Instruct
    system_message = """You are an expert radiologist assistant specializing in generating accurate and concise medical impressions from radiology
       findings.
    
      Your task is to:
      1. **Analyze the findings**: Carefully review all clinical findings, history, and technique information
      2. **Generate focused impressions**: Create clear, prioritized conclusions that directly address the clinical question
      3. **Maintain clinical accuracy**: Ensure all significant findings are appropriately characterized
      4. **Use appropriate medical terminology**: Follow standard radiological reporting conventions
      5. **Adapt communication style**: Match the institutional reporting style and level of detail expected
    
      Generate only the IMPRESSION section based on the provided clinical information."""
    
    # Hugging Face pipeline for text generation does apply apply_chat_template under the hood. 
    # So we do not need to process for the text generation
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": findings},
    ]
    
    pipe = pipeline(
        "text-generation",
        model=ft_model,
        tokenizer=ft_tokenizer,
    )
    
    #  stops generation when the model generates token ID 32007
    class EosListStoppingCriteria(StoppingCriteria):
      def __init__(self, eos_sequence = [32007]):
          self.eos_sequence = eos_sequence
    
      def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
          last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
          return self.eos_sequence in last_ids
    
    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.0,
        "do_sample": False,
        "stopping_criteria": [EosListStoppingCriteria()]
    }
    output = pipe(messages, **generation_args)
    return output

In [12]:
import numpy as np

def print_test_result(iterations=10):
    for i in range(iterations):
        index = np.random.randint(0, len(dataset['test']))
        findings = dataset['test'][index]['findings']  # Fixed: was 'impression'
        impression = dataset['test'][index]['impression']
        
        output = generate_answer(findings)
        
        # Header with test number
        print("=" * 80)
        print(f"🧪 TEST {i+1}")
        print("=" * 80)
        
        # Question section
        print("\n🤔 Findings:")
        print("-" * 40)
        print(f"{findings}")
        
        # Original answer section
        print("\n✅ ORIGINAL IMPRESSIONS:")
        print("-" * 40)
        print(f"{impression}")
        
        # AI response section
        print("\n🤖 FINE-TUNED AI RESPONSE:")
        print("-" * 40)
        print(f"{output[0]['generated_text']}")
        
        # Separator
        print("\n" + "~" * 80 + "\n")

In [14]:
print_test_result(iterations = 10)

Device set to use cuda:0
Device set to use cuda:0


🧪 TEST 1

🤔 Findings:
----------------------------------------
[CLINIC: clinic_2] [MODALITY: MR] FINDINGS: Diffuse thoracic spondylosis. Exaggerated diffuse kyphosis. Multiple nodular areas of low T1 signal exhibiting mostly increased T2 signal intensity are seen at all imaged vertebral body levels, compatible with osseous metastatic disease. T5, T8 and T9 are most severely involved. There is expansion of the T5 posterior elements, not severe enough to cause neural compromise. Scout images show metastatic involvement in the cervical spine as well. Visualized portions of the spinal cord exhibit normal signal intensity. IMPRESSION:

✅ ORIGINAL IMPRESSIONS:
----------------------------------------
1. Diffuse osseous metastatic disease of the cervical, thoracic, and lumbar spine, not causing spinal/foraminal stenosis or fracture. 2. Spondylosis, kyphosis and multilevel disc bulges, causing spinal stenosis at T12-L1, L1-2, and L2-3. 3. 9 cm multicystic mass in the upper abdomen near or in t

Device set to use cuda:0


🧪 TEST 2

🤔 Findings:
----------------------------------------
[CLINIC: clinic_1] [MODALITY: MR] FINDINGS: There is a moderate effusion with mild synovial irregularity and thickening suggesting synovitis. There is a small medial popliteal cyst. There is mild quadriceps tendinosis and moderate patellar tendinosis. There is moderate to severe proximal popliteus tendinosis with small adjacent subcortical cysts and mild surrounding bone marrow edema in the lateral femoral condyle. The severity of the popliteus tendinosis is likely due to involvement of the popliteus tendon with gout. There is moderate fatty infiltration and atrophy of the semimembranosus muscle. The cruciate and collateral ligaments appear intact. In the medial compartment there is mild cartilage loss in the inferior medial aspect of the medial femoral condyle and medial aspect of the medial tibial plateau. There is no evidence of a medial meniscal tear. In the lateral compartment the articular cartilage appears intact. Th

Device set to use cuda:0


🧪 TEST 3

🤔 Findings:
----------------------------------------
[CLINIC: clinic_6] [MODALITY: CT] FINDINGS: Mild fatty liver infiltration. Diffuse fatty atrophy of the pancreas. The liver, spleen, pancreas, kidneys and adrenals are otherwise within normal limits. The gallbladder is present without intra/extrahepatic biliary dilatation. IMPRESSION:

✅ ORIGINAL IMPRESSIONS:
----------------------------------------
1. Subcutaneous strandy changes ventral abdominal wall especially on the right side, can be seen with postsurgical changes/trocar site. 2. Normal appendix. 3. Mild fatty liver infiltration.

🤖 FINE-TUNED AI RESPONSE:
----------------------------------------
 Mild fatty liver infiltration.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



Device set to use cuda:0


🧪 TEST 4

🤔 Findings:
----------------------------------------
[CLINIC: clinic_1] [MODALITY: MR] FINDINGS: Comparison is made with prior exam performed on June 4, 2018. There is no interval tear of the medial meniscus. Medial compartment articular surfaces remain well preserved. There is interval degeneration, maceration and tearing of the lateral meniscus body and posterior horn, coronal images 11 and sagittal images 8. There is high-grade cartilage loss and fissuring along the lateral tibial plateau, present on prior study. There is mild chondral softening of the lateral femoral condyle. Cruciate and collateral ligaments are intact. There is no acute ligament injury noted. There is mild distal quadriceps and mild proximal patellar tendinosis. Patellofemoral compartment articular surfaces are preserved. Tibial tubercle trochlear groove distance is normal. There is moderate knee joint effusion, mild synovitis and ruptured popliteal cyst. Edema is noted within the medial head of the gas

Device set to use cuda:0


🧪 TEST 5

🤔 Findings:
----------------------------------------
[CLINIC: clinic_1] [MODALITY: MR] FINDINGS: Medial meniscus, destabilizing radial tear of the posterior root, with loss of circumferential hoop containment of the meniscus and partial extrusion of body segment from medial joint line, sagittal image 9, coronal image 16. Anterior and posterior cruciate ligament, medial and lateral collateral ligament, ligamentous structures of the posterior lateral corner, iliotibial band, quadriceps and patellar tendons, are within normal limits. Diffuse grade 3-4 chondral loss medial femoral-tibial compartment, more prominent on the femoral side of the articulation, sagittal image 7, coronal image 13. Grade 2 chondral loss lateral tibial plateau adjacent to the intraspinous region of the tibia, coronal image 14-15. Diffuse grade 4 chondral loss median patellar ridge extending onto medial and lateral facet of patella, axial image 9. Grade 4 chondral loss medial trochlear ridge, axial image 1

Device set to use cuda:0


🧪 TEST 6

🤔 Findings:
----------------------------------------
[CLINIC: clinic_1] [MODALITY: CR] FINDINGS: There is varus alignment. There is bone-on-bone arthropathy in the medial compartment with large marginal osteophytes. There is bone-on-bone arthropathy with marginal osteophytes in the patellofemoral compartment. Lateral compartment joint spaces well preserved. IMPRESSION:

✅ ORIGINAL IMPRESSIONS:
----------------------------------------
1. Advanced medial and patellofemoral compartment arthrosis with varus alignment.

🤖 FINE-TUNED AI RESPONSE:
----------------------------------------
 Osteoarthritis with bone-on-bone arthropathy in the medial and patellofemoral compartments.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



Device set to use cuda:0


🧪 TEST 7

🤔 Findings:
----------------------------------------
[CLINIC: clinic_5] [MODALITY: CR] FINDINGS: No fracture, dislocation or erosion. Normal joint spaces. There are small osteophytes. There are some chronic opacification of the quadriceps insertion. IMPRESSION:

✅ ORIGINAL IMPRESSIONS:
----------------------------------------
No acute findings. Mild OA. Signed on 08/13/2021 12:59 PM by Mark E 42edd445 c42e7b65

🤖 FINE-TUNED AI RESPONSE:
----------------------------------------
 No acute findings

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



Device set to use cuda:0


🧪 TEST 8

🤔 Findings:
----------------------------------------
[CLINIC: clinic_1] [MODALITY: MR] FINDINGS: Trace physiologic knee joint fluid is present. There is a thin shelf-like medial plica. The medial meniscus and medial compartment cartilage are intact. The lateral meniscus and lateral compartment cartilage are intact. There are areas of edema-like signal of the anterior aspect of the medial and lateral tibial plateaus. Clinically correlate for bone contusion and history of trauma. The iliotibial band is intact. The patellofemoral joint space cartilage is intact. The extensor mechanism is intact. IMPRESSION:

✅ ORIGINAL IMPRESSIONS:
----------------------------------------
1. Areas of edema-like signal anterior aspect of the medial and lateral tibial plateaus. Clinically correlate for bone contusion and history of trauma. 2. No evidence of meniscus or ligament tear. Intact chondral surfaces.

🤖 FINE-TUNED AI RESPONSE:
----------------------------------------
 1. Areas of edema-li

Device set to use cuda:0


🧪 TEST 9

🤔 Findings:
----------------------------------------
[CLINIC: nan] [MODALITY: nan] FINDINGS: No fracture or dislocation is noted. Mild medial joint space narrowing is noted. A suprapatellar spur is noted. No destructive osseous lesions noted. No joint effusion is noted. IMPRESSION:

✅ ORIGINAL IMPRESSIONS:
----------------------------------------
No fracture or dislocation. Mild arthritic change as described above.

🤖 FINE-TUNED AI RESPONSE:
----------------------------------------
 No fracture or dislocation is noted.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

🧪 TEST 10

🤔 Findings:
----------------------------------------
[CLINIC: clinic_1] [MODALITY: MR] FINDINGS: There is a small knee joint effusion. There is a trace ruptured popliteal cyst extending superiorly. There is a horizontal tear of the posterior horn and body segment medial meniscus extending to the meniscal free edge and tibial articular surface. There is a very small dis

### Modality evaluation

In [3]:
class ModalityBasedEvaluator:
    def __init__(self, model_path="./lora_adapter", dataset_path="./data/processed/radiology_datasets"):
        """Initialize the systematic evaluator"""
        self.load_model(model_path)
        self.load_dataset(dataset_path)
        self.results = defaultdict(list)

    def load_model(self, model_path):
        """Load fine-tuned model"""
        print("Loading fine-tuned model...")
        self.ft_model = AutoPeftModelForCausalLM.from_pretrained(
            model_path,
            torch_dtype="auto",
            device_map="auto"
        )
        self.ft_tokenizer = AutoTokenizer.from_pretrained(model_path)

    def load_dataset(self, dataset_path):
        """Load evaluation dataset"""
        print("Loading dataset...")
        self.dataset = load_from_disk(dataset_path)
        self.test_data = self.dataset['test']

        # Convert to pandas for easier analysis
        self.test_df = pd.DataFrame(self.test_data)

        print(f"Test dataset: {len(self.test_df)} samples")
        print(f"Modalities: {self.test_df['modality'].unique()}")
        print(f"Clinics: {self.test_df['clinic_id'].unique()}")

    def get_modality_distribution(self):
        """Analyze modality distribution in test set"""
        modality_stats = self.test_df.groupby('modality').agg({
            'findings': 'count',
            'clinic_id': 'nunique'
        }).rename(columns={'findings': 'sample_count', 'clinic_id': 'clinic_count'})

        return modality_stats

    def generate_impression(self, findings):
        """Generate impression for given findings"""
        system_message = """You are an expert radiologist assistant specializing in generating accurate and concise medical impressions from radiology findings.

        Your task is to:
        1. **Analyze the findings**: Carefully review all clinical findings
        2. **Generate focused impressions**: Create clear, prioritized conclusions
        3. **Maintain clinical accuracy**: Ensure all significant findings are characterized
        4. **Use appropriate medical terminology**: Follow standard radiological conventions

        Generate only the IMPRESSION section based on the provided clinical information."""

        messages = [
            {"role": "system", "content": system_message},
            {"role": "user", "content": findings},
        ]

        pipe = pipeline(
            "text-generation",
            model=self.ft_model,
            tokenizer=self.ft_tokenizer,
        )

        class EosListStoppingCriteria(StoppingCriteria):
            def __init__(self, eos_sequence=[32007]):
                self.eos_sequence = eos_sequence

            def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
                last_ids = input_ids[:, -len(self.eos_sequence):].tolist()
                return self.eos_sequence in last_ids

        generation_args = {
            "max_new_tokens": 300,
            "return_full_text": False,
            "temperature": 0.0,
            "do_sample": False,
            "stopping_criteria": [EosListStoppingCriteria()]
        }

        output = pipe(messages, **generation_args)
        return output[0]['generated_text'].strip()

    def evaluate_sample(self, idx):
        """Evaluate single sample"""
        # Convert numpy int64 to Python int
        idx = int(idx)
        sample = self.test_data[idx]

        findings = sample['findings']
        reference = sample['impression']
        modality = sample['modality']
        clinic_id = sample['clinic_id']

        # Generate prediction
        prediction = self.generate_impression(findings)

        # Compute ROUGE scores
        rouge = evaluate.load("rouge")
        rouge_scores = rouge.compute(
            predictions=[prediction],
            references=[reference],
            rouge_types=["rouge1", "rouge2", "rougeL"]
        )

        return {
            'sample_idx': idx,
            'modality': modality,
            'clinic_id': clinic_id,
            'findings': findings,
            'reference': reference,
            'prediction': prediction,
            'rouge1': rouge_scores['rouge1'],
            'rouge2': rouge_scores['rouge2'],
            'rougeL': rouge_scores['rougeL'],
            'findings_length': len(findings),
            'reference_length': len(reference),
            'prediction_length': len(prediction)
        }

    def evaluate_by_modality(self, samples_per_modality=10, random_seed=42):
        """Systematic evaluation across all modalities"""
        np.random.seed(random_seed)

        modality_results = {}

        for modality in self.test_df['modality'].unique():
            print(f"\n🔬 Evaluating modality: {modality}")

            # Get samples for this modality
            modality_samples = self.test_df[self.test_df['modality'] == modality]

            # Sample random subset
            n_samples = min(samples_per_modality, len(modality_samples))
            # Get the actual dataset indices (not DataFrame indices)
            available_indices = list(range(len(self.test_df)))
            modality_indices = [i for i in available_indices if self.test_df.iloc[i]['modality'] == modality]

            sampled_indices = np.random.choice(
                modality_indices,
                size=n_samples,
                replace=False
            )

            modality_results[modality] = []

            for i, idx in enumerate(sampled_indices):
                print(f"  Processing sample {i+1}/{n_samples}...")
                result = self.evaluate_sample(idx)
                modality_results[modality].append(result)

        return modality_results

    def evaluate_by_clinic_modality(self, samples_per_combination=5):
        """Evaluate by clinic-modality combinations"""
        np.random.seed(42)

        combination_results = {}

        for combo in self.test_df['clinic_modality'].unique():
            print(f"\n🏥 Evaluating combination: {combo}")

            combo_samples = self.test_df[self.test_df['clinic_modality'] == combo]

            n_samples = min(samples_per_combination, len(combo_samples))
            if n_samples == 0:
                continue

            # Get actual dataset indices for this combination
            combo_indices = [i for i in range(len(self.test_df)) if self.test_df.iloc[i]['clinic_modality'] == combo]

            sampled_indices = np.random.choice(
                combo_indices,
                size=n_samples,
                replace=False
            )

            combination_results[combo] = []

            for i, idx in enumerate(sampled_indices):
                print(f"  Processing sample {i+1}/{n_samples}...")
                result = self.evaluate_sample(idx)
                combination_results[combo].append(result)

        return combination_results

    def compute_aggregate_metrics(self, results):
        """Compute aggregate metrics from evaluation results"""
        aggregated = {}

        for category, samples in results.items():
            if not samples:
                continue

            metrics = {
                'sample_count': len(samples),
                'rouge1_mean': np.mean([s['rouge1'] for s in samples]),
                'rouge1_std': np.std([s['rouge1'] for s in samples]),
                'rouge2_mean': np.mean([s['rouge2'] for s in samples]),
                'rouge2_std': np.std([s['rouge2'] for s in samples]),
                'rougeL_mean': np.mean([s['rougeL'] for s in samples]),
                'rougeL_std': np.std([s['rougeL'] for s in samples]),
                'avg_findings_length': np.mean([s['findings_length'] for s in samples]),
                'avg_reference_length': np.mean([s['reference_length'] for s in samples]),
                'avg_prediction_length': np.mean([s['prediction_length'] for s in samples])
            }

            aggregated[category] = metrics

        return aggregated

    def generate_evaluation_report(self, modality_results, combination_results=None):
        """Generate comprehensive evaluation report"""

        print("\n" + "="*80)
        print("📊 SYSTEMATIC EVALUATION REPORT")
        print("="*80)

        # Modality-based metrics
        modality_metrics = self.compute_aggregate_metrics(modality_results)

        print("\n🔬 MODALITY-BASED PERFORMANCE")
        print("-" * 50)

        modality_df = pd.DataFrame(modality_metrics).T
        modality_df = modality_df.round(4)
        print(modality_df[['sample_count', 'rouge1_mean', 'rouge2_mean', 'rougeL_mean']])

        # Overall performance
        all_samples = []
        for samples in modality_results.values():
            all_samples.extend(samples)

        overall_metrics = {
            'total_samples': len(all_samples),
            'overall_rouge1': np.mean([s['rouge1'] for s in all_samples]),
            'overall_rouge2': np.mean([s['rouge2'] for s in all_samples]),
            'overall_rougeL': np.mean([s['rougeL'] for s in all_samples])
        }

        print(f"\n📈 OVERALL PERFORMANCE")
        print("-" * 30)
        print(f"Total samples evaluated: {overall_metrics['total_samples']}")
        print(f"Average ROUGE-1: {overall_metrics['overall_rouge1']:.4f}")
        print(f"Average ROUGE-2: {overall_metrics['overall_rouge2']:.4f}")
        print(f"Average ROUGE-L: {overall_metrics['overall_rougeL']:.4f}")

        # Best and worst performing modalities
        rouge1_by_modality = {mod: metrics['rouge1_mean'] for mod, metrics in modality_metrics.items()}
        best_modality = max(rouge1_by_modality, key=rouge1_by_modality.get)
        worst_modality = min(rouge1_by_modality, key=rouge1_by_modality.get)

        print(f"\n🏆 Best performing modality: {best_modality} (ROUGE-1: {rouge1_by_modality[best_modality]:.4f})")
        print(f"⚠️  Lowest performing modality: {worst_modality} (ROUGE-1: {rouge1_by_modality[worst_modality]:.4f})")

        return {
            'modality_metrics': modality_metrics,
            'overall_metrics': overall_metrics,
            'detailed_results': modality_results
        }

    def save_results(self, results, filename="systematic_evaluation_results.json"):
        """Save evaluation results to file"""
        with open(filename, 'w') as f:
            json.dump(results, f, indent=2)
        print(f"\n💾 Results saved to {filename}")


In [4]:
evaluator = ModalityBasedEvaluator()
# Show dataset distribution
print("\n📊 Dataset Distribution:")
print(evaluator.get_modality_distribution())

# Run systematic evaluation
print("\n🚀 Starting systematic evaluation...")
modality_results = evaluator.evaluate_by_modality(samples_per_modality=20)

# Generate report
final_results = evaluator.generate_evaluation_report(modality_results)

# Save results
evaluator.save_results(final_results)

Loading fine-tuned model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading dataset...


Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


Test dataset: 1915 samples
Modalities: ['MR' 'CT' 'XR' 'CR' 'US' 'NM' 'nan' 'OTHER']
Clinics: ['clinic_1' 'clinic_3' 'clinic_6' 'clinic_5' 'clinic_4' 'clinic_2' 'nan']

📊 Dataset Distribution:
          sample_count  clinic_count
modality                            
CR                 214             4
CT                 243             6
MR                1194             6
NM                  16             3
OTHER                2             2
US                  38             4
XR                 104             2
nan                104             1

🚀 Starting systematic evaluation...

🔬 Evaluating modality: MR
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: CT
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: XR
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: CR
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: US
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: NM
  Processing sample 1/16...


Device set to use cuda:0


  Processing sample 2/16...


Device set to use cuda:0


  Processing sample 3/16...


Device set to use cuda:0


  Processing sample 4/16...


Device set to use cuda:0


  Processing sample 5/16...


Device set to use cuda:0


  Processing sample 6/16...


Device set to use cuda:0


  Processing sample 7/16...


Device set to use cuda:0


  Processing sample 8/16...


Device set to use cuda:0


  Processing sample 9/16...


Device set to use cuda:0


  Processing sample 10/16...


Device set to use cuda:0


  Processing sample 11/16...


Device set to use cuda:0


  Processing sample 12/16...


Device set to use cuda:0


  Processing sample 13/16...


Device set to use cuda:0


  Processing sample 14/16...


Device set to use cuda:0


  Processing sample 15/16...


Device set to use cuda:0


  Processing sample 16/16...


Device set to use cuda:0



🔬 Evaluating modality: nan
  Processing sample 1/20...


Device set to use cuda:0


  Processing sample 2/20...


Device set to use cuda:0


  Processing sample 3/20...


Device set to use cuda:0


  Processing sample 4/20...


Device set to use cuda:0


  Processing sample 5/20...


Device set to use cuda:0


  Processing sample 6/20...


Device set to use cuda:0


  Processing sample 7/20...


Device set to use cuda:0


  Processing sample 8/20...


Device set to use cuda:0


  Processing sample 9/20...


Device set to use cuda:0


  Processing sample 10/20...


Device set to use cuda:0


  Processing sample 11/20...


Device set to use cuda:0


  Processing sample 12/20...


Device set to use cuda:0


  Processing sample 13/20...


Device set to use cuda:0


  Processing sample 14/20...


Device set to use cuda:0


  Processing sample 15/20...


Device set to use cuda:0


  Processing sample 16/20...


Device set to use cuda:0


  Processing sample 17/20...


Device set to use cuda:0


  Processing sample 18/20...


Device set to use cuda:0


  Processing sample 19/20...


Device set to use cuda:0


  Processing sample 20/20...


Device set to use cuda:0



🔬 Evaluating modality: OTHER
  Processing sample 1/2...


Device set to use cuda:0


  Processing sample 2/2...

📊 SYSTEMATIC EVALUATION REPORT

🔬 MODALITY-BASED PERFORMANCE
--------------------------------------------------
       sample_count  rouge1_mean  rouge2_mean  rougeL_mean
MR             20.0       0.6274       0.4963       0.5641
CT             20.0       0.2978       0.1396       0.2589
XR             20.0       0.3812       0.2998       0.3754
CR             20.0       0.3970       0.2640       0.3743
US             20.0       0.3394       0.1714       0.3048
NM             16.0       0.2872       0.1666       0.2479
nan            20.0       0.5655       0.4341       0.4773
OTHER           2.0       0.2276       0.0566       0.1411

📈 OVERALL PERFORMANCE
------------------------------
Total samples evaluated: 138
Average ROUGE-1: 0.4146
Average ROUGE-2: 0.2818
Average ROUGE-L: 0.3720

🏆 Best performing modality: MR (ROUGE-1: 0.6274)
⚠️  Lowest performing modality: OTHER (ROUGE-1: 0.2276)

💾 Results saved to systematic_evaluation_results.json
