In [None]:
import json
import os
import re
from pathlib import Path
from typing import Dict, List, Tuple, Any
import pandas as pd
from gatenlp import Document
from gatenlp.corpora import ListCorpus
from GatenlpUtils import loadCorpus

In [None]:
# Configuration: folder of the pipeline run inspected by the cells below.
PIPELINE_RESULTS_FOLDER = "output/pipeline_results_20250725_111753"

# Echo the active setting, followed by a 60-character rule.
print(
    f"📁 Using pipeline results folder: {PIPELINE_RESULTS_FOLDER}",
    "=" * 60,
    sep="\n",
)

In [None]:
# Verify Path Structure
print("🔍 VERIFYING PATH STRUCTURE")
print("=" * 50)

# Resolve the configured results folder and report whether it is present.
folder_path = Path(PIPELINE_RESULTS_FOLDER)
print(f"Target folder: {folder_path}")
print(f"Folder exists: {folder_path.exists()}")

if not folder_path.exists():
    print(f"❌ Folder doesn't exist: {folder_path}")

    # Help the reader pick a valid run: list the pipeline folders in output/.
    output_dir = Path("output")
    if not output_dir.exists():
        print("❌ Output directory doesn't exist")
    else:
        print(f"\n📁 Available folders in output/:")
        for item in output_dir.iterdir():
            if item.is_dir() and "pipeline" in item.name:
                print(f"  📁 {item.name}")

else:
    print(f"\n📁 Contents of {folder_path.name}:")

    # Inventory the folder by file type (top level only, no recursion).
    json_files = [*folder_path.glob("*.json")]
    txt_files = [*folder_path.glob("*.txt")]
    csv_files = [*folder_path.glob("*.csv")]
    subdirs = [entry for entry in folder_path.iterdir() if entry.is_dir()]

    print(f"  📄 JSON files: {len(json_files)}")
    print(f"  📋 TXT files: {len(txt_files)}")
    print(f"  📊 CSV files: {len(csv_files)}")
    print(f"  📁 Subdirectories: {len(subdirs)}")

    # Result files are every JSON except system files named pipeline_results_*.
    result_jsons = [
        candidate
        for candidate in json_files
        if not candidate.name.startswith("pipeline_results_")
    ]
    print(f"\n📊 Result JSON files ({len(result_jsons)}):")
    for json_file in result_jsons[:10]:  # preview is capped at ten names
        print(f"  📄 {json_file.name}")
    if len(result_jsons) > 10:
        print(f"  ... and {len(result_jsons) - 10} more")

    # List sub-folders; the annotated corpus folder also gets a file count.
    if subdirs:
        print(f"\n📁 Subdirectories:")
        for subdir in subdirs:
            print(f"  📁 {subdir.name}")
            if "annotated_corpus" in subdir.name:
                corpus_files = list(subdir.glob("*"))
                print(f"    Contains {len(corpus_files)} files")

    # Printed whenever the folder exists; the layout itself is not validated.
    print(f"\n✅ Path structure looks correct!")
    print(f"Expected structure:")
    print(f"📁 {folder_path.name}/")
    print(f"  📄 *.json (result files)")
    print(f"  📋 *.txt (reports)")
    print(f"  📊 *.csv (data)")
    print(f"  📁 annotated_corpus_with_predictions/")

In [None]:
class LLMEventResultsEvaluator:
    """
    Evaluates LLM event annotation results by comparing predictions against 
    gold standard consensus annotations in GateNLP documents.
    """
    
    def __init__(self, output_dir: str = "output", corpus=None):
        self.output_dir = Path(output_dir)
        self.corpus = corpus if corpus is not None else loadCorpus()
        self.evaluation_results = []
        self.runtime_results = []  # Track runtime per model and document
        self.annotation_counts = []  # Track annotation counts per document and model
        
        # Event-related annotation types to evaluate based on consensus annotation set
        self.event_annotation_types = [
            "Event",      # General event annotation
            "Event_who",  # Who annotation 
            "Event_what", # What annotation
            "Event_when"  # When annotation
        ]
        
    def find_result_folders(self) -> List[Path]:
        """Find all timestamped result folders in output directory."""
        folders = []
        if self.output_dir.exists():
            # Check if output_dir itself is a timestamped folder (direct path to pipeline_results_*)
            if re.search(r'pipeline_results_\d{8}_\d{6}', self.output_dir.name):
                # We're pointing directly to a timestamped folder
                folders.append(self.output_dir)
                print(f"✅ Using direct pipeline results folder: {self.output_dir}")
            else:
                # Look for timestamped folders within the output directory
                for item in self.output_dir.iterdir():
                    if item.is_dir() and re.search(r'pipeline_results_\d{8}_\d{6}', item.name):
                        folders.append(item)
                        print(f"✅ Found pipeline results folder: {item}")
        else:
            print(f"❌ Output directory doesn't exist: {self.output_dir}")
        
        print(f"📊 Total pipeline result folders found: {len(folders)}")
        return sorted(folders)
    
    def find_result_jsons(self, folder: Path) -> List[Path]:
        """Find all result JSON files (excluding pipeline_results_*.json and other system files)."""
        jsons = []
        for json_file in folder.glob("*.json"):
            # Exclude system files like pipeline_results_*.json
            if not json_file.name.startswith("pipeline_results_"):
                jsons.append(json_file)
        
        print(f"📄 Found {len(jsons)} result JSON files in {folder.name}")
        for json_file in jsons:
            print(f"  📄 {json_file.name}")
        
        return jsons
    
    def extract_doc_name_from_path(self, file_path: str) -> str:
        """Extract document name from file path for matching with corpus."""
        return Path(file_path).stem
    
    def find_corpus_document(self, doc_identifier: str) -> Document:
        """Find corresponding document in corpus."""
        for doc in self.corpus:
            doc_name = doc.features.get("gate.SourceURL", "")
            if (doc_identifier in doc_name or 
                doc_name.endswith(f"{doc_identifier}.xml") or
                Path(doc_name).stem == doc_identifier):
                return doc
        return None
    
    def parse_llm_event_predictions(self, result_data: Dict[str, Any]) -> Dict[str, Dict[str, List[Dict]]]:
        """
        Parse LLM event predictions from result JSON.
        Maps JSON events to consensus annotation types for proper comparison:
        - "event" in JSON -> "Event" annotation with the event text as annotation span
        - "event_who" in JSON -> "Event_who" annotation with the who text as annotation span
        - "event_what" in JSON -> "Event_what" annotation with the what text as annotation span  
        - "event_when" in JSON -> "Event_when" annotation with the when text as annotation span
        - "event_type" in JSON -> type metadata of "Event" annotation (remove "event_" prefix)
        """
        predictions = {}
        
        # Check if annotations array exists
        if "annotations" not in result_data:
            print("No 'annotations' key found in result data")
            return predictions
        
        # Process each model's annotations
        for model_annotation in result_data["annotations"]:
            if not isinstance(model_annotation, dict):
                continue
                
            model_name = model_annotation.get("model_name", "unknown_model")
            predictions[model_name] = {}
            
            # Initialize all event annotation types
            for ann_type in self.event_annotation_types:
                predictions[model_name][ann_type] = []
            
            # Extract events from this model's results
            if "events" in model_annotation:
                events = model_annotation["events"]
                for event in events:
                    if isinstance(event, dict):
                        source_text = event.get("source_text", "")
                        
                        # Map "event" field to "Event" annotation
                        if "event" in event and event["event"].strip():
                            event_type = event.get("event_type", "")
                            # Remove "event_" prefix from event_type to match consensus type metadata
                            if event_type.startswith("event_"):
                                event_type = event_type[6:]  # Remove "event_" prefix
                            elif event_type.startswith("Event_"):
                                event_type = event_type[6:]  # Remove "Event_" prefix
                            
                            # Use the event text as the annotation span
                            event_text = event["event"].strip()
                            ann_dict = {
                                "text": event_text,  # The actual text to annotate
                                "start": 0,  # Will be calculated from event text
                                "end": 0,    # Will be calculated from event text
                                "features": {
                                    "source_text": source_text,
                                    "type": event_type  # This will be compared to consensus Event type metadata
                                }
                            }
                            predictions[model_name]["Event"].append(ann_dict)
                        
                        # Map "event_who" field to "Event_who" annotation
                        if "event_who" in event and event["event_who"].strip():
                            who_text = event["event_who"].strip()
                            ann_dict = {
                                "text": who_text,  # The actual text to annotate
                                "start": 0,
                                "end": 0,
                                "features": {
                                    "source_text": source_text
                                }
                            }
                            predictions[model_name]["Event_who"].append(ann_dict)
                        
                        # Map "event_what" field to "Event_what" annotation
                        if "event_what" in event and event["event_what"].strip():
                            what_text = event["event_what"].strip()
                            ann_dict = {
                                "text": what_text,  # The actual text to annotate
                                "start": 0,
                                "end": 0,
                                "features": {
                                    "source_text": source_text
                                }
                            }
                            predictions[model_name]["Event_what"].append(ann_dict)
                        
                        # Map "event_when" field to "Event_when" annotation
                        if "event_when" in event and event["event_when"].strip():
                            when_text = event["event_when"].strip()
                            ann_dict = {
                                "text": when_text,  # The actual text to annotate
                                "start": 0,
                                "end": 0,
                                "features": {
                                    "source_text": source_text
                                }
                            }
                            predictions[model_name]["Event_when"].append(ann_dict)
        
        return predictions
    
    def extract_runtime_info(self, result_data: Dict[str, Any], doc_name: str):
        """Extract runtime information from result JSON."""
        if "annotations" not in result_data:
            return
        
        # Process each model's runtime
        for model_annotation in result_data["annotations"]:
            if not isinstance(model_annotation, dict):
                continue
                
            model_name = model_annotation.get("model_name", "unknown_model")
            runtime_seconds = model_annotation.get("runtime_seconds", 0)
            
            # Store runtime information
            runtime_info = {
                "document": doc_name,
                "model": model_name,
                "runtime_seconds": runtime_seconds,
                "runtime_minutes": runtime_seconds / 60 if runtime_seconds else 0
            }
            self.runtime_results.append(runtime_info)
    
    def collect_annotation_counts(self, doc: Document, doc_name: str):
        """Collect annotation counts for consensus and all model prediction sets."""
        
        # Get consensus annotation counts
        consensus_annset = doc.annset("consensus")
        consensus_counts = {
            "document": doc_name,
            "annotation_set": "consensus",
            "model": "consensus"
        }
        
        for ann_type in self.event_annotation_types:
            ann_count = len(list(consensus_annset.with_type(ann_type)))
            consensus_counts[ann_type] = ann_count
        
        self.annotation_counts.append(consensus_counts)
        
        # Get model prediction annotation counts
        pred_annset_names = [name for name in doc.annset_names() if name.endswith("_predictions")]
        
        for annset_name in pred_annset_names:
            model_name = annset_name.replace("_predictions", "")
            pred_annset = doc.annset(annset_name)
            
            model_counts = {
                "document": doc_name,
                "annotation_set": annset_name,
                "model": model_name
            }
            
            for ann_type in self.event_annotation_types:
                ann_count = len(list(pred_annset.with_type(ann_type)))
                model_counts[ann_type] = ann_count
            
            self.annotation_counts.append(model_counts)
    
    def calculate_text_positions(self, target_text: str, document_text: str) -> Tuple[int, int]:
        """
        Calculate start and end positions of target_text within document_text.
        Returns: (start, end) positions or (0, 0) if not found.
        """
        if not target_text or not document_text:
            return (0, 0)
        
        # Clean up the target text for better matching
        cleaned_target = target_text.strip()
        
        # Try to find the text in the document (case-insensitive)
        start_pos = document_text.lower().find(cleaned_target.lower())
        if start_pos != -1:
            end_pos = start_pos + len(cleaned_target)
            return (start_pos, end_pos)
        
        # If exact match fails, try with normalized whitespace
        import re
        normalized_target = re.sub(r'\s+', ' ', cleaned_target)
        normalized_doc = re.sub(r'\s+', ' ', document_text)
        
        start_pos = normalized_doc.lower().find(normalized_target.lower())
        if start_pos != -1:
            # Find the actual positions in the original text
            # This is a simplified approach - might need refinement
            end_pos = start_pos + len(normalized_target)
            return (start_pos, end_pos)
        
        # If still not found, try partial matching with the first few words
        words = cleaned_target.split()
        if len(words) > 1:
            # Try with first 3 words
            partial_text = " ".join(words[:min(3, len(words))])
            start_pos = document_text.lower().find(partial_text.lower())
            if start_pos != -1:
                end_pos = start_pos + len(partial_text)
                return (start_pos, end_pos)
        
        return (0, 0)
    
    def extract_annotations_from_response(self, response_text: str) -> Dict[str, List[Dict]]:
        """
        Extract event annotations from LLM response text.
        Adjust this method based on your actual LLM response format.
        """
        annotations = {}
        for ann_type in self.event_annotation_types:
            annotations[ann_type] = []
        
        # Example parsing - adjust based on your actual response format
        # This assumes annotations are in JSON format within the response
        try:
            # Try to find JSON blocks in the response
            json_matches = re.findall(r'\{.*?\}', response_text, re.DOTALL)
            for json_str in json_matches:
                try:
                    parsed_json = json.loads(json_str)
                    if isinstance(parsed_json, dict):
                        for ann_type in self.event_annotation_types:
                            if ann_type in parsed_json:
                                annotations[ann_type].extend(parsed_json[ann_type])
                except json.JSONDecodeError:
                    continue
        except Exception as e:
            print(f"Error parsing annotations from response: {e}")
        
        return annotations
    
    def add_prediction_annotations(self, doc: Document, model_name: str, 
                                predictions: Dict[str, List[Dict]]):
        """Add prediction annotations to document as new annotation set."""
        # Create BOTH the temporary _predictions set (for evaluation) AND the clean set (for viewing)
        
        # 1. Create temporary prediction set for evaluation
        temp_annset_name = f"{model_name}_predictions"
        temp_annset = doc.annset(temp_annset_name)
        temp_annset.clear()
        
        # 2. Create clean permanent set for viewing/saving
        clean_annset = doc.annset(model_name)
        clean_annset.clear()
        
        # Add predicted event annotations to BOTH sets
        for ann_type, ann_list in predictions.items():
            for ann_dict in ann_list:
                try:
                    # Extract the target text and features from annotation dict
                    target_text = ann_dict.get("text", "")
                    start = ann_dict.get("start", 0)
                    end = ann_dict.get("end", 0)
                    features = ann_dict.get("features", {})
                    
                    # If start/end positions are not provided or are 0, calculate from target text
                    if (start == 0 and end == 0) and target_text:
                        start, end = self.calculate_text_positions(target_text, doc.text)
                    
                    # Only add annotation if we found valid positions
                    if start < end:
                        # Ensure boundaries are within document bounds
                        start = max(0, min(start, len(doc.text)))
                        end = max(start, min(end, len(doc.text)))
                        
                        # Features for temporary set (with extra metadata)
                        temp_features = dict(features)
                        temp_features.update({
                            "source": "llm_prediction",
                            "model": model_name
                        })
                        
                        # Features for clean set (without extra metadata)
                        clean_features = dict(features)
                        clean_features.pop("source", None)
                        clean_features.pop("model", None)
                        
                        # Add to both sets
                        temp_annset.add(start, end, ann_type, temp_features)
                        clean_annset.add(start, end, ann_type, clean_features)
                        
                        print(f"    Added {ann_type} annotation: '{doc.text[start:end]}' at {start}-{end}")
                    else:
                        print(f"    Skipped {ann_type} annotation: text '{target_text}' not found in document")
                    
                except Exception as e:
                    print(f"Error adding annotation {ann_dict}: {e}")
    
    def calculate_annotation_overlap_metrics(self, gold_annset, pred_annset, ann_type: str, document: Document = None) -> Dict[str, float]:
        """Calculate precision, recall, F1 for a specific annotation type."""
        gold_anns = list(gold_annset.with_type(ann_type))
        pred_anns = list(pred_annset.with_type(ann_type))
        
        if not gold_anns and not pred_anns:
            return {"precision": 1.0, "recall": 1.0, "f1": 1.0, "gold_count": 0, "pred_count": 0}
        
        if not pred_anns:
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "gold_count": len(gold_anns), "pred_count": 0}
        
        if not gold_anns:
            return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "gold_count": 0, "pred_count": len(pred_anns)}
        
        # Get the document text to extract annotation text
        # Use the document parameter or try to get it from annotation set
        if document is None:
            # Try to get document from annotation set's owner
            document = getattr(gold_annset, '_owner', None)
            if document is None:
                print(f"Warning: Could not access document for annotation comparison")
                return {"precision": 0.0, "recall": 0.0, "f1": 0.0, "gold_count": len(gold_anns), "pred_count": len(pred_anns)}
        
        # For Event annotations, consider both position and type metadata matching
        if ann_type == "Event":
            matches = 0
            matched_gold = set()
            matched_pred = set()
            
            for i, pred_ann in enumerate(pred_anns):
                pred_type = pred_ann.features.get("type", "")
                pred_text = document.text[pred_ann.start:pred_ann.end].strip().lower()
                
                for j, gold_ann in enumerate(gold_anns):
                    if j in matched_gold:
                        continue
                        
                    gold_type = gold_ann.features.get("type", "")
                    gold_text = document.text[gold_ann.start:gold_ann.end].strip().lower()
                    
                    # Check for text similarity and type matching
                    text_match = (pred_text == gold_text or 
                                pred_text in gold_text or 
                                gold_text in pred_text or
                                self.calculate_text_similarity(pred_text, gold_text) > 0.7)
                    
                    type_match = (pred_type == gold_type or 
                                (not pred_type and not gold_type))
                    
                    if text_match and type_match:
                        matches += 1
                        matched_gold.add(j)
                        matched_pred.add(i)
                        break
            
            precision = matches / len(pred_anns) if pred_anns else 0
            recall = matches / len(gold_anns) if gold_anns else 0
            
        else:
            # For other annotation types (Event_who, Event_what, Event_when), use text similarity
            matches = 0
            matched_gold = set()
            
            for i, pred_ann in enumerate(pred_anns):
                pred_text = document.text[pred_ann.start:pred_ann.end].strip().lower()
                
                for j, gold_ann in enumerate(gold_anns):
                    if j in matched_gold:
                        continue
                        
                    gold_text = document.text[gold_ann.start:gold_ann.end].strip().lower()
                    
                    # Check for text similarity
                    text_match = (pred_text == gold_text or 
                                pred_text in gold_text or 
                                gold_text in pred_text or
                                self.calculate_text_similarity(pred_text, gold_text) > 0.7)
                    
                    if text_match:
                        matches += 1
                        matched_gold.add(j)
                        break
            
            precision = matches / len(pred_anns) if pred_anns else 0
            recall = matches / len(gold_anns) if gold_anns else 0
        
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        
        return {
            "precision": precision,
            "recall": recall,
            "f1": f1,
            "gold_count": len(gold_anns),
            "pred_count": len(pred_anns),
            "matches": matches
        }
    
    def calculate_text_similarity(self, text1: str, text2: str) -> float:
        """Calculate simple text similarity score between two strings."""
        if not text1 or not text2:
            return 0.0
        
        # Simple word-based similarity
        words1 = set(text1.lower().split())
        words2 = set(text2.lower().split())
        
        if not words1 or not words2:
            return 0.0
        
        intersection = words1.intersection(words2)
        union = words1.union(words2)
        
        return len(intersection) / len(union) if union else 0.0
    
    def evaluate_document(self, doc: Document, doc_name: str, result_folder: str) -> List[Dict]:
        """Evaluate all model predictions for a single document against consensus annotations."""
        gold_annset = doc.annset("consensus")  # Use consensus annotation set as gold standard
        
        if not gold_annset:
            print(f"Warning: No 'consensus' annotation set found in document {doc_name}")
            return []
        
        results = []
        
        # Get all prediction annotation sets
        pred_annset_names = [name for name in doc.annset_names() if name.endswith("_predictions")]
        
        for annset_name in pred_annset_names:
            model_name = annset_name.replace("_predictions", "")
            pred_annset = doc.annset(annset_name)
            
            # Evaluate each event annotation type
            for ann_type in self.event_annotation_types:
                metrics = self.calculate_annotation_overlap_metrics(gold_annset, pred_annset, ann_type, doc)
                
                result = {
                    "result_folder": result_folder,
                    "document": doc_name,
                    "model": model_name,
                    "annotation_type": ann_type,
                    **metrics
                }
                results.append(result)
        
        return results
    
    def process_result_json(self, json_path: Path, result_folder: str):
        """Process a single result JSON file."""
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                result_data = json.load(f)
        except Exception as e:
            print(f"Error reading {json_path}: {e}")
            return
        
        # Extract document identifier
        doc_path = result_data.get("Document", "")
        doc_name = self.extract_doc_name_from_path(doc_path)
        
        # Find corresponding corpus document
        corpus_doc = self.find_corpus_document(doc_name)
        if corpus_doc is None:
            print(f"Warning: Could not find corpus document for {doc_name}")
            return
        
        # Parse LLM event predictions
        predictions = self.parse_llm_event_predictions(result_data)
        
        # Extract runtime information
        self.extract_runtime_info(result_data, doc_name)
        
        if not predictions:
            print(f"Warning: No predictions found in {json_path.name}")
            return
        
        # Debug: Print what we found
        print(f"Found predictions for models: {list(predictions.keys())}")
        for model_name, model_preds in predictions.items():
            total_events = sum(len(events) for events in model_preds.values())
            print(f"  {model_name}: {total_events} total events")
            for ann_type, events in model_preds.items():
                if events:
                    print(f"    {ann_type}: {len(events)} events")
        
        # Add prediction annotations to document
        for model_name, model_predictions in predictions.items():
            self.add_prediction_annotations(corpus_doc, model_name, model_predictions)
        
        # Collect annotation counts after adding predictions
        self.collect_annotation_counts(corpus_doc, doc_name)
        
        # Evaluate predictions against consensus annotations
        doc_results = self.evaluate_document(corpus_doc, doc_name, result_folder)
        self.evaluation_results.extend(doc_results)
        
        print(f"Processed {json_path.name}: {len(predictions)} models, {len(doc_results)} evaluations")
    
    def run_evaluation(self) -> pd.DataFrame:
        """Run complete evaluation on all result folders."""
        result_folders = self.find_result_folders()
        
        if not result_folders:
            print("No result folders found in output directory")
            return pd.DataFrame()
        
        print(f"Found {len(result_folders)} result folders")
        
        for folder in result_folders:
            print(f"\nProcessing folder: {folder.name}")
            result_jsons = self.find_result_jsons(folder)
            
            for json_path in result_jsons:
                self.process_result_json(json_path, folder.name)
        
        # Convert results to DataFrame
        results = {}
        if self.evaluation_results:
            results['evaluation'] = pd.DataFrame(self.evaluation_results)
        else:
            results['evaluation'] = pd.DataFrame()
        
        if self.runtime_results:
            results['runtime'] = pd.DataFrame(self.runtime_results)
        else:
            results['runtime'] = pd.DataFrame()
            
        if self.annotation_counts:
            results['annotation_counts'] = pd.DataFrame(self.annotation_counts)
        else:
            results['annotation_counts'] = pd.DataFrame()
        
        return results
    
    def generate_summary_report(self, results_df: pd.DataFrame) -> str:
        """Generate summary report of evaluation results."""
        if results_df.empty:
            return "No results to summarize"
        
        report = []
        report.append("LLM Event Annotation Evaluation Summary")
        report.append("=" * 60)
        
        # Overall statistics
        total_evaluations = len(results_df)
        unique_models = results_df['model'].nunique()
        unique_docs = results_df['document'].nunique()
        unique_ann_types = results_df['annotation_type'].nunique()
        
        report.append(f"Total evaluations: {total_evaluations}")
        report.append(f"Unique models: {unique_models}")
        report.append(f"Unique documents: {unique_docs}")
        report.append(f"Annotation types evaluated: {unique_ann_types}")
        report.append("")
        
        # Average F1 by model
        report.append("Average F1 Scores by Model:")
        model_f1 = results_df.groupby('model')['f1'].mean().sort_values(ascending=False)
        for model, f1 in model_f1.items():
            report.append(f"  {model}: {f1:.3f}")
        report.append("")
        
        # Average F1 by annotation type
        report.append("Average F1 Scores by Annotation Type:")
        ann_type_f1 = results_df.groupby('annotation_type')['f1'].mean().sort_values(ascending=False)
        for ann_type, f1 in ann_type_f1.items():
            report.append(f"  {ann_type}: {f1:.3f}")
        report.append("")
        
        # Best performing model-annotation combinations
        report.append("Best Model-Annotation Combinations (Top 10 F1):")
        best_combinations = results_df.groupby(['model', 'annotation_type'])['f1'].mean().sort_values(ascending=False).head(10)
        for (model, ann_type), f1 in best_combinations.items():
            report.append(f"  {model} - {ann_type}: {f1:.3f}")
        report.append("")
        
        # Detailed precision/recall breakdown
        report.append("Detailed Metrics by Annotation Type:")
        for ann_type in results_df['annotation_type'].unique():
            ann_data = results_df[results_df['annotation_type'] == ann_type]
            avg_precision = ann_data['precision'].mean()
            avg_recall = ann_data['recall'].mean()
            avg_f1 = ann_data['f1'].mean()
            total_gold = ann_data['gold_count'].sum()
            total_pred = ann_data['pred_count'].sum()
            
            report.append(f"  {ann_type}:")
            report.append(f"    Precision: {avg_precision:.3f}")
            report.append(f"    Recall: {avg_recall:.3f}")
            report.append(f"    F1: {avg_f1:.3f}")
            report.append(f"    Total gold annotations: {total_gold}")
            report.append(f"    Total predicted annotations: {total_pred}")
        
        return "\n".join(report)
    
    def generate_runtime_report(self, runtime_df: pd.DataFrame) -> str:
        """Build a plain-text runtime analysis report.

        Args:
            runtime_df: Frame providing the columns ``model``, ``document`` and
                ``runtime_seconds``.

        Returns:
            The report as a single newline-joined string, or the message
            ``"No runtime data available"`` when ``runtime_df`` is empty.
        """
        if runtime_df.empty:
            return "No runtime data available"

        # Overall runtime statistics (the mean is taken over all rows).
        seconds = runtime_df['runtime_seconds']
        total_runtime = seconds.sum()
        avg_runtime = seconds.mean()

        report = [
            "LLM Runtime Analysis",
            "=" * 40,
            f"Total processing time: {total_runtime:.2f} seconds ({total_runtime/60:.2f} minutes)",
            f"Average processing time per document: {avg_runtime:.2f} seconds",
            "",
            "Average Runtime by Model:",
        ]

        # Runtime by model
        model_runtime = runtime_df.groupby('model')['runtime_seconds'].agg(['mean', 'std', 'min', 'max']).round(2)
        for model, stats in model_runtime.iterrows():
            # The sample standard deviation is NaN for a model with a single
            # row; show "n/a" instead of letting NaN format as "nans".
            std_text = "n/a" if pd.isna(stats['std']) else f"{stats['std']:.2f}s"
            report.append(f"  {model}:")
            report.append(f"    Mean: {stats['mean']:.2f}s")
            report.append(f"    Std:  {std_text}")
            report.append(f"    Min:  {stats['min']:.2f}s")
            report.append(f"    Max:  {stats['max']:.2f}s")
        report.append("")

        # Runtime by document, slowest first
        report.append("Runtime by Document:")
        doc_runtime = runtime_df.groupby('document')['runtime_seconds'].sum().sort_values(ascending=False)
        for doc, runtime in doc_runtime.items():
            report.append(f"  {doc}: {runtime:.2f}s ({runtime/60:.2f}m)")

        return "\n".join(report)
    
    def generate_annotation_counts_report(self, counts_df: pd.DataFrame) -> str:
        """Build a plain-text comparison of annotation counts per model.

        For every document a table (annotation type x model) is rendered,
        followed by a per-type summary of total / mean / std per model.

        Args:
            counts_df: Frame providing the columns ``document`` and ``model``
                plus one column per entry of ``self.event_annotation_types``.

        Returns:
            The report as a single newline-joined string, or the message
            ``"No annotation count data available"`` when ``counts_df`` is
            empty.

        Raises:
            KeyError: If a type listed in ``self.event_annotation_types`` is
                not a column of ``counts_df``.
        """
        if counts_df.empty:
            return "No annotation count data available"

        report = ["Annotation Counts Comparison", "=" * 50]

        for doc in sorted(counts_df['document'].unique()):
            report.append(f"\nDocument: {doc}")
            report.append("-" * (len(doc) + 10))

            doc_data = counts_df[counts_df['document'] == doc]

            # One row per annotation type, one column per model. If a model
            # occurs more than once for the document, the last row wins.
            comparison_data = []
            for ann_type in self.event_annotation_types:
                row = {'annotation_type': ann_type}
                row.update(zip(doc_data['model'], doc_data[ann_type]))
                comparison_data.append(row)

            comparison_df = pd.DataFrame(comparison_data)
            if not comparison_df.empty:
                comparison_df = comparison_df.set_index('annotation_type')
                report.append(comparison_df.to_string())
            report.append("")

        # Overall summary
        report.append("Overall Annotation Count Summary:")
        report.append("-" * 40)

        for ann_type in self.event_annotation_types:
            report.append(f"\n{ann_type}:")
            ann_summary = counts_df.groupby('model')[ann_type].agg(['sum', 'mean', 'std']).round(2)
            for model, stats in ann_summary.iterrows():
                # The sample std is NaN when a model has a single row; show
                # "n/a" instead of the literal "nan".
                std_text = "n/a" if pd.isna(stats['std']) else f"{stats['std']:.1f}"
                report.append(f"  {model}: Total={int(stats['sum'])}, Avg={stats['mean']:.1f}, Std={std_text}")

        return "\n".join(report)

def main():
    """Run the evaluation, write every CSV/text artefact, and echo the reports.

    All files go to ``{PIPELINE_RESULTS_FOLDER}/runtime_analysis``, which is
    created if missing. When the evaluation frame is empty nothing is written
    and a hint is printed instead.
    """
    print("Starting LLM Event Annotation Results Evaluation...")

    # The evaluator works on the notebook-wide results folder.
    evaluator = LLMEventResultsEvaluator(output_dir=PIPELINE_RESULTS_FOLDER)
    outcome = evaluator.run_evaluation()

    results_df = outcome['evaluation']
    runtime_df = outcome['runtime']
    counts_df = outcome['annotation_counts']

    analysis_root = f"{PIPELINE_RESULTS_FOLDER}/runtime_analysis"
    Path(analysis_root).mkdir(parents=True, exist_ok=True)

    if results_df.empty:
        print("No results generated. Check if result folders and JSON files exist.")
        return

    has_runtime = not runtime_df.empty
    has_counts = not counts_df.empty

    def _export_csv(frame, filename):
        # Write ``frame`` without its index and hand back the path used.
        target = f"{analysis_root}/{filename}"
        frame.to_csv(target, index=False)
        return target

    def _export_text(text, filename):
        # Write ``text`` as UTF-8 and hand back the path used.
        target = f"{analysis_root}/{filename}"
        with open(target, 'w', encoding='utf-8') as handle:
            handle.write(text)
        return target

    # --- tabular data ---
    eval_csv = _export_csv(results_df, "llm_event_evaluation_results.csv")
    print(f"\nEvaluation results saved to: {eval_csv}")

    if has_runtime:
        runtime_csv = _export_csv(runtime_df, "llm_runtime_analysis.csv")
        print(f"Runtime analysis saved to: {runtime_csv}")

    if has_counts:
        counts_csv = _export_csv(counts_df, "llm_annotation_counts.csv")
        print(f"Annotation counts saved to: {counts_csv}")

    # --- text reports ---
    summary = evaluator.generate_summary_report(results_df)
    summary_txt = _export_text(summary, "llm_event_evaluation_summary.txt")
    print(f"Summary report saved to: {summary_txt}")

    if has_runtime:
        runtime_report = evaluator.generate_runtime_report(runtime_df)
        runtime_txt = _export_text(runtime_report, "llm_runtime_report.txt")
        print(f"Runtime report saved to: {runtime_txt}")

    if has_counts:
        counts_report = evaluator.generate_annotation_counts_report(counts_df)
        counts_txt = _export_text(counts_report, "llm_annotation_counts_report.txt")
        print(f"Annotation counts report saved to: {counts_txt}")

    # --- console echo ---
    print("\n" + summary)
    if has_runtime:
        print("\n" + runtime_report)
    if has_counts:
        print("\n" + counts_report)

    print("\nDetailed Results Preview:")
    print(results_df.head(15).to_string(index=False))


if __name__ == "__main__":
    main()

In [None]:
def save_corpus_with_annotations(self):
    """Write every document of ``self.corpus`` as GATE XML plus a summary file.

    Documents are appended to a ``DirFilesDestination`` rooted at
    ``{PIPELINE_RESULTS_FOLDER}/annotated_corpus_with_predictions``; a
    human-readable ``annotation_summary.txt`` is written next to them.

    Returns:
        The output directory as a string, or ``None`` if anything raised
        (the error and its traceback are printed, not propagated).
    """

    def _visible_annsets(document):
        # Named annotation sets that are not temporary "*_predictions" sets.
        return [
            set_name
            for set_name in document.annset_names()
            if set_name and not set_name.endswith("_predictions")
        ]

    try:
        from gatenlp.corpora import DirFilesDestination

        target_dir = Path(f"{PIPELINE_RESULTS_FOLDER}/annotated_corpus_with_predictions")
        target_dir.mkdir(parents=True, exist_ok=True)

        print(f"\n💾 Saving annotated corpus to: {target_dir}")

        with DirFilesDestination(str(target_dir), ext="xml", fmt="gatexml") as dest:
            for doc in self.corpus:
                stem = Path(doc.features.get("gate.SourceURL", "")).stem

                # Intended to name the output file after the source document.
                # NOTE(review): confirm DirFilesDestination honours "_relpath"
                # with its default path settings.
                doc.features["_relpath"] = f"{stem}.xml"
                dest.append(doc)

                # Per-document overview of the non-empty annotation sets.
                sized = ((set_name, len(doc.annset(set_name))) for set_name in _visible_annsets(doc))
                overview = [f"{set_name}({size})" for set_name, size in sized if size > 0]
                if overview:
                    print(f"  {stem}.xml: {', '.join(overview)}")

        xml_files = list(target_dir.glob("*.xml"))
        print(f"✅ Saved {len(xml_files)} annotated documents")

        summary_path = target_dir / "annotation_summary.txt"
        with open(summary_path, 'w', encoding='utf-8') as out:
            out.writelines([
                "Annotated Corpus Summary\n",
                "=" * 50 + "\n\n",
                f"Generated on: {pd.Timestamp.now()}\n",
                f"Total documents: {len(xml_files)}\n\n",
                "Annotation Sets Added:\n",
                "-" * 30 + "\n",
            ])

            # Union of the visible annotation set names over the whole corpus.
            known_sets = {set_name for doc in self.corpus for set_name in _visible_annsets(doc)}

            # The gold standard ("consensus") and the default set are not listed.
            out.writelines(
                f"- {set_name} (LLM predictions)\n"
                for set_name in sorted(known_sets)
                if set_name not in ["consensus", ""]
            )

            out.writelines([
                f"\nFiles saved to: {target_dir}\n",
                "\nTo view in Gate:\n",
                "1. Open Gate Developer\n",
                "2. Load documents from this directory\n",
                "3. View different annotation sets in the annotation sets panel\n",
            ])

        return str(target_dir)

    except Exception as e:
        # Best effort: report the failure and signal it with None.
        print(f"❌ Error saving corpus: {e}")
        import traceback
        traceback.print_exc()
        return None

In [None]:
# Visual Analysis and Better Results Overview
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def create_visual_analysis(results_df: pd.DataFrame):
    """Draw a 3x3 dashboard of the evaluation results and show it.

    Reads the columns ``model``, ``annotation_type``, ``document``,
    ``precision``, ``recall``, ``f1``, ``gold_count`` and ``pred_count``.
    Prints a notice and returns without plotting when ``results_df`` is empty.
    """
    if results_df.empty:
        print("No results to visualize")
        return

    # Global plotting style for the whole dashboard.
    plt.style.use('default')
    sns.set_palette("husl")

    metric_cols = ['precision', 'recall', 'f1']
    metric_colors = ['lightblue', 'lightgreen', 'lightcoral']

    def _mean_f1(keys):
        # Mean F1 per group for the given grouping key(s).
        return results_df.groupby(keys)['f1'].mean()

    def _metric_bars(frame, heading):
        # Grouped precision/recall/F1 bars on the current axes.
        frame.plot(kind='bar', ax=plt.gca(), color=metric_colors)
        plt.title(heading)
        plt.ylabel('Score')
        plt.xticks(rotation=45)
        plt.legend()
        plt.grid(axis='y', alpha=0.3)

    plt.figure(figsize=(20, 16))

    # Panel 1: model x annotation type heatmap of mean F1.
    plt.subplot(3, 3, 1)
    sns.heatmap(
        results_df.pivot_table(values='f1', index='model', columns='annotation_type', aggfunc='mean'),
        annot=True, fmt='.3f', cmap='RdYlGn', center=0.5,
        cbar_kws={'label': 'F1 Score'},
    )
    plt.title('F1 Scores by Model and Annotation Type')
    plt.xlabel('Annotation Type')
    plt.ylabel('Model')

    # Panels 2 and 3: horizontal bars of mean F1, lowest value first.
    bar_panels = (
        (2, 'model', 'skyblue', 'Average F1 Score by Model'),
        (3, 'annotation_type', 'lightcoral', 'Average F1 Score by Annotation Type'),
    )
    for slot, group_key, bar_color, heading in bar_panels:
        plt.subplot(3, 3, slot)
        _mean_f1(group_key).sort_values(ascending=True).plot(kind='barh', color=bar_color)
        plt.title(heading)
        plt.xlabel('F1 Score')
        plt.grid(axis='x', alpha=0.3)

    # Panel 4: precision against recall, one point cloud per model.
    plt.subplot(3, 3, 4)
    for model_name in results_df['model'].unique():
        rows = results_df[results_df['model'] == model_name]
        plt.scatter(rows['recall'], rows['precision'],
                    label=model_name, alpha=0.7, s=60)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('Precision vs Recall by Model')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.grid(alpha=0.3)

    # Panel 5: F1 spread per model.
    plt.subplot(3, 3, 5)
    results_df.boxplot(column='f1', by='model', ax=plt.gca())
    plt.title('F1 Score Distribution by Model')
    plt.suptitle('')  # blank out the figure-level title
    plt.xticks(rotation=45)
    plt.ylabel('F1 Score')

    # Panel 6: document x model heatmap of mean F1.
    plt.subplot(3, 3, 6)
    per_doc_model = (
        _mean_f1(['document', 'model'])
        .reset_index()
        .pivot(index='document', columns='model', values='f1')
    )
    sns.heatmap(per_doc_model, annot=True, fmt='.2f', cmap='RdYlGn', center=0.3,
                cbar_kws={'label': 'F1 Score'})
    plt.title('F1 Scores by Document and Model')
    plt.xlabel('Model')
    plt.ylabel('Document')

    # Panel 7: summed gold and predicted counts per model, side by side.
    plt.subplot(3, 3, 7)
    gold_by_model = results_df.groupby('model')['gold_count'].sum()
    pred_by_model = results_df.groupby('model')['pred_count'].sum()
    positions = np.arange(len(gold_by_model))
    bar_width = 0.35
    plt.bar(positions - bar_width/2, gold_by_model.values, bar_width,
            label='Gold Standard', color='gold', alpha=0.8)
    plt.bar(positions + bar_width/2, pred_by_model.values, bar_width,
            label='Predicted', color='steelblue', alpha=0.8)
    plt.xlabel('Model')
    plt.ylabel('Total Annotations')
    plt.title('Gold vs Predicted Annotation Counts')
    plt.xticks(positions, gold_by_model.index, rotation=45)
    plt.legend()
    plt.grid(axis='y', alpha=0.3)

    # Panel 8: mean metrics per annotation type.
    plt.subplot(3, 3, 8)
    _metric_bars(
        results_df.groupby('annotation_type')[metric_cols].mean(),
        'Average Metrics by Annotation Type',
    )

    # Panel 9: mean metrics per model, best F1 first.
    plt.subplot(3, 3, 9)
    _metric_bars(
        results_df.groupby('model')[metric_cols].mean().sort_values('f1', ascending=False),
        'Model Performance Ranking',
    )

    plt.tight_layout()
    plt.show()

def create_detailed_results_table(results_df: pd.DataFrame):
    """Print formatted summary tables for the evaluation results.

    Sections, in order: per-model summary, per-annotation-type summary,
    the ten highest and ten lowest F1 rows, and per-document F1 statistics.
    Prints a notice and returns when ``results_df`` is empty.
    """
    if results_df.empty:
        print("No results to display")
        return

    def _flatten(frame):
        # Collapse the two-level (column, statistic) header into "column_statistic".
        frame.columns = ['_'.join(col).strip() for col in frame.columns.values]
        return frame

    banner = "=" * 120
    print("\n" + banner)
    print("DETAILED EVALUATION RESULTS")
    print(banner)

    # The same statistics are reported per model and per annotation type.
    metric_spec = {
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'gold_count': 'sum',
        'pred_count': 'sum',
    }
    grouped_sections = (
        ("\n📊 SUMMARY BY MODEL:", 'model'),
        ("\n\n📋 SUMMARY BY ANNOTATION TYPE:", 'annotation_type'),
    )
    for heading, group_key in grouped_sections:
        print(heading)
        print("-" * 80)
        table = _flatten(results_df.groupby(group_key).agg(metric_spec).round(3))
        print(table.to_string())

    # Individual rows with the ten highest / ten lowest F1 values.
    shown_cols = ['model', 'annotation_type', 'document', 'f1', 'precision', 'recall']
    extreme_sections = (
        ("\n\n🏆 TOP PERFORMERS:", results_df.nlargest),
        ("\n\n⚠️  LOWEST PERFORMERS:", results_df.nsmallest),
    )
    for heading, select in extreme_sections:
        print(heading)
        print("-" * 50)
        print(select(10, 'f1')[shown_cols].to_string(index=False))

    # Per-document F1 statistics plus the number of rows per document.
    print("\n\n📄 DOCUMENT-LEVEL ANALYSIS:")
    print("-" * 80)
    per_document = _flatten(
        results_df.groupby('document').agg({
            'f1': ['mean', 'std', 'min', 'max'],
            'model': 'count',
        }).round(3)
    ).rename(columns={'model_count': 'num_evaluations'})
    print(per_document.to_string())

def enhanced_main():
    """Run the evaluation, save its artefacts, and print tables, plots and stats.

    Outputs are written to ``{PIPELINE_RESULTS_FOLDER}/runtime_analysis``,
    which is created here if missing so the function does not depend on
    ``main()`` having run earlier in the session. When the evaluation frame
    is empty nothing is written and a hint is printed instead.
    """
    print("Starting LLM Event Annotation Results Evaluation...")

    # Use the globally defined pipeline results folder
    evaluator = LLMEventResultsEvaluator(output_dir=PIPELINE_RESULTS_FOLDER)

    # Run evaluation
    results = evaluator.run_evaluation()

    results_df = results['evaluation']
    runtime_df = results['runtime']
    counts_df = results['annotation_counts']

    # to_csv()/open() do not create missing parent directories.
    analysis_dir = f"{PIPELINE_RESULTS_FOLDER}/runtime_analysis"
    Path(analysis_dir).mkdir(parents=True, exist_ok=True)

    if not results_df.empty:
        # Save results
        output_path = f"{analysis_dir}/llm_event_evaluation_results.csv"
        results_df.to_csv(output_path, index=False)
        print(f"\nEvaluation results saved to: {output_path}")

        # Save additional data
        if not runtime_df.empty:
            runtime_path = f"{analysis_dir}/llm_runtime_analysis.csv"
            runtime_df.to_csv(runtime_path, index=False)
            print(f"Runtime analysis saved to: {runtime_path}")

        if not counts_df.empty:
            counts_path = f"{analysis_dir}/llm_annotation_counts.csv"
            counts_df.to_csv(counts_path, index=False)
            print(f"Annotation counts saved to: {counts_path}")

        # Generate and save summary report
        summary = evaluator.generate_summary_report(results_df)
        summary_path = f"{analysis_dir}/llm_event_evaluation_summary.txt"
        with open(summary_path, 'w', encoding='utf-8') as f:
            f.write(summary)
        print(f"Summary report saved to: {summary_path}")

        # Create detailed results table
        create_detailed_results_table(results_df)

        # Create visual analysis
        print("\n\n📈 GENERATING VISUAL ANALYSIS...")
        create_visual_analysis(results_df)

        # Print basic summary
        print(f"\n\n📊 QUICK STATS:")
        print(f"Total evaluations: {len(results_df)}")
        print(f"Models evaluated: {results_df['model'].nunique()}")
        print(f"Documents processed: {results_df['document'].nunique()}")
        print(f"Annotation types: {', '.join(results_df['annotation_type'].unique())}")
        print(f"Average F1 score: {results_df['f1'].mean():.3f}")
        print(f"Best performing model: {results_df.groupby('model')['f1'].mean().idxmax()}")
        print(f"Best annotation type: {results_df.groupby('annotation_type')['f1'].mean().idxmax()}")

        # Runtime summary
        if not runtime_df.empty:
            print(f"\n⏱️  RUNTIME SUMMARY:")
            total_time = runtime_df['runtime_seconds'].sum()
            # Mean of the per-document runtime totals (summed over models).
            avg_time_per_doc = runtime_df.groupby('document')['runtime_seconds'].sum().mean()
            fastest_model = runtime_df.groupby('model')['runtime_seconds'].mean().idxmin()
            print(f"Total processing time: {total_time:.1f}s ({total_time/60:.1f}m)")
            print(f"Average time per document: {avg_time_per_doc:.1f}s")
            print(f"Fastest model on average: {fastest_model}")

        # Annotation counts summary
        if not counts_df.empty:
            print(f"\n📊 ANNOTATION COUNTS SUMMARY:")
            event_types = evaluator.event_annotation_types
            consensus_total = counts_df[counts_df['model'] == 'consensus'][event_types].sum().sum()
            print(f"Total consensus annotations: {consensus_total}")

            for model in counts_df[counts_df['model'] != 'consensus']['model'].unique():
                model_total = counts_df[counts_df['model'] == model][event_types].sum().sum()
                print(f"Total {model} annotations: {model_total}")

        print(f"\n🎯 OUTPUTS GENERATED:")
        print(f"📊 Evaluation results: {analysis_dir}/llm_event_evaluation_results.csv")
        print(f"📋 Summary report: {analysis_dir}/llm_event_evaluation_summary.txt")
        if not runtime_df.empty:
            print(f"⏱️  Runtime analysis: {analysis_dir}/llm_runtime_analysis.csv")
        if not counts_df.empty:
            print(f"📊 Annotation counts: {analysis_dir}/llm_annotation_counts.csv")
        # The corpus savers in this notebook write directly below the results
        # folder, not below runtime_analysis/.
        # NOTE(review): nothing in this function writes the corpus itself;
        # presumably an earlier step does - confirm.
        print(f"📁 Annotated corpus: {PIPELINE_RESULTS_FOLDER}/annotated_corpus_with_predictions/")
        print(f"   → Open these XML files in Gate to view model predictions!")

    else:
        print("No results generated. Check if result folders and JSON files exist.")

# Run the enhanced analysis
if __name__ == "__main__":
    enhanced_main()

In [None]:
# Enhanced Analysis with Runtime and Annotation Count Visualizations
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def create_runtime_visualizations(runtime_df: pd.DataFrame):
    """Draw a 2x2 figure of runtime plots and show it.

    Panels: runtime distribution per model (box plot), total runtime per
    document, mean runtime per model, and a model x document heatmap of mean
    runtime. Reads the columns ``model``, ``document`` and
    ``runtime_seconds``. Prints a notice and returns without plotting when
    ``runtime_df`` is empty.
    """
    if runtime_df.empty:
        print("No runtime data to visualize")
        return

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))

    # 1. Runtime by model (box plot)
    ax1 = axes[0, 0]
    runtime_df.boxplot(column='runtime_seconds', by='model', ax=ax1)
    ax1.set_title('Runtime Distribution by Model')
    ax1.set_xlabel('Model')
    ax1.set_ylabel('Runtime (seconds)')
    ax1.tick_params(axis='x', rotation=45)

    # A grouped pandas boxplot replaces the figure-level title with
    # "Boxplot grouped by model", so the real title is set only afterwards.
    fig.suptitle('Runtime Analysis', fontsize=16)

    # 2. Runtime by document (bar plot)
    ax2 = axes[0, 1]
    doc_runtime = runtime_df.groupby('document')['runtime_seconds'].sum().sort_values(ascending=True)
    doc_runtime.plot(kind='barh', ax=ax2, color='lightblue')
    ax2.set_title('Total Runtime by Document')
    ax2.set_xlabel('Runtime (seconds)')

    # 3. Average runtime by model
    ax3 = axes[1, 0]
    model_avg_runtime = runtime_df.groupby('model')['runtime_seconds'].mean().sort_values(ascending=True)
    model_avg_runtime.plot(kind='barh', ax=ax3, color='lightgreen')
    ax3.set_title('Average Runtime by Model')
    ax3.set_xlabel('Average Runtime (seconds)')

    # 4. Runtime heatmap (model vs document)
    ax4 = axes[1, 1]
    runtime_pivot = runtime_df.pivot_table(values='runtime_seconds', index='model', columns='document', aggfunc='mean')
    sns.heatmap(runtime_pivot, annot=True, fmt='.1f', cmap='YlOrRd', ax=ax4, cbar_kws={'label': 'Runtime (seconds)'})
    ax4.set_title('Runtime Heatmap (Model vs Document)')
    ax4.set_xlabel('Document')
    ax4.set_ylabel('Model')

    plt.tight_layout()
    plt.show()

def create_annotation_count_visualizations(counts_df: pd.DataFrame, event_types: List[str]):
    """Draw a 2x2 figure comparing annotation counts and show it.

    Args:
        counts_df: Frame with the columns ``document`` and ``model`` plus one
            count column per entry of ``event_types``. Rows whose ``model`` is
            ``'consensus'`` are treated as the reference counts.
        event_types: Names of the count columns to plot.

    Prints a notice and returns without plotting when ``counts_df`` is empty.
    """
    if counts_df.empty:
        print("No annotation count data to visualize")
        return

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Annotation Count Analysis', fontsize=16)
    (stacked_ax, compare_ax), (per_doc_ax, deviation_ax) = axes

    # Split the rows once: reference ("consensus") versus model rows.
    is_consensus = counts_df['model'] == 'consensus'
    gold_rows = counts_df[is_consensus]
    model_rows = counts_df[~is_consensus]

    # Top-left: stacked totals per model, one segment per annotation type.
    counts_df.groupby('model')[event_types].sum().plot(
        kind='bar', stacked=True, ax=stacked_ax, colormap='Set3'
    )
    stacked_ax.set_title('Total Annotations by Model and Type')
    stacked_ax.set_xlabel('Model')
    stacked_ax.set_ylabel('Annotation Count')
    stacked_ax.legend(title='Annotation Type', bbox_to_anchor=(1.05, 1), loc='upper left')
    stacked_ax.tick_params(axis='x', rotation=45)

    # Top-right: consensus totals next to the across-model mean of totals.
    gold_totals = gold_rows[event_types].sum()
    mean_model_totals = model_rows.groupby('model')[event_types].sum().mean()
    slots = np.arange(len(event_types))
    bar_width = 0.35
    compare_ax.bar(slots - bar_width/2, gold_totals, bar_width,
                   label='Consensus', color='gold', alpha=0.8)
    compare_ax.bar(slots + bar_width/2, mean_model_totals, bar_width,
                   label='Models (avg)', color='steelblue', alpha=0.8)
    compare_ax.set_title('Consensus vs Model Predictions (Average)')
    compare_ax.set_xlabel('Annotation Type')
    compare_ax.set_ylabel('Count')
    compare_ax.set_xticks(slots)
    compare_ax.set_xticklabels(event_types, rotation=45)
    compare_ax.legend()
    compare_ax.grid(axis='y', alpha=0.3)

    # Bottom-left: total annotations per document, one bar per model.
    # After reset_index() the unnamed totals column is labelled 0.
    per_doc_totals = (
        counts_df.groupby(['document', 'model'])[event_types]
        .sum()
        .sum(axis=1)
        .reset_index()
        .pivot(index='document', columns='model', values=0)
    )
    per_doc_totals.plot(kind='bar', ax=per_doc_ax, colormap='viridis')
    per_doc_ax.set_title('Total Annotations by Document and Model')
    per_doc_ax.set_xlabel('Document')
    per_doc_ax.set_ylabel('Total Annotations')
    per_doc_ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')
    per_doc_ax.tick_params(axis='x', rotation=45)

    # Bottom-right: mean (model - consensus) difference per annotation type,
    # using the per-document mean over models.
    gold_per_doc = gold_rows.groupby('document')[event_types].sum()
    models_per_doc = (
        model_rows.groupby(['document', 'model'])[event_types]
        .sum()
        .groupby('document')
        .mean()
    )
    differences = [
        (models_per_doc[event_type] - gold_per_doc[event_type]).mean()
        for event_type in event_types
    ]
    bar_colors = ['red' if delta < 0 else 'green' for delta in differences]
    deviation_ax.bar(event_types, differences, color=bar_colors, alpha=0.7)
    deviation_ax.set_title('Average Model Deviation from Consensus')
    deviation_ax.set_xlabel('Annotation Type')
    deviation_ax.set_ylabel('Average Difference (Model - Consensus)')
    deviation_ax.axhline(y=0, color='black', linestyle='-', alpha=0.3)
    deviation_ax.tick_params(axis='x', rotation=45)
    deviation_ax.grid(axis='y', alpha=0.3)

    plt.tight_layout()
    plt.show()

def create_comprehensive_analysis(results_df: pd.DataFrame, runtime_df: pd.DataFrame, counts_df: pd.DataFrame, event_types: List[str]):
    """Relate F1 to runtime and to prediction volume, then draw the plots.

    Args:
        results_df: Evaluation rows; ``document``, ``model`` and ``f1`` are read.
        runtime_df: Runtime rows; ``document``, ``model`` and
            ``runtime_seconds`` are read.
        counts_df: Annotation counts with ``document``, ``model`` and one
            column per entry of ``event_types``; ``model == 'consensus'``
            marks the reference rows.
        event_types: Names of the count columns in ``counts_df``.

    Each section is skipped when one of the frames it needs is empty.
    """
    print("📊 COMPREHENSIVE ANALYSIS")
    print("=" * 60)

    have_results = not results_df.empty
    have_runtime = not runtime_df.empty
    have_counts = not counts_df.empty

    # Section 1: does a longer runtime go along with a better F1?
    if have_results and have_runtime:
        print("\n🔗 PERFORMANCE vs RUNTIME CORRELATION:")

        joined = results_df.merge(runtime_df, on=['document', 'model'], how='inner')

        if not joined.empty:
            correlation = joined['f1'].corr(joined['runtime_seconds'])
            print(f"F1 Score vs Runtime correlation: {correlation:.3f}")

            # One point cloud per model.
            plt.figure(figsize=(10, 6))
            for model_name in joined['model'].unique():
                rows = joined[joined['model'] == model_name]
                plt.scatter(rows['runtime_seconds'], rows['f1'],
                            label=model_name, alpha=0.7, s=60)

            plt.xlabel('Runtime (seconds)')
            plt.ylabel('F1 Score')
            plt.title('Performance vs Runtime by Model')
            plt.legend()
            plt.grid(alpha=0.3)
            plt.show()

    # Section 2: prediction volume relative to the consensus volume.
    if have_results and have_counts:
        print("\n📈 ANNOTATION COUNT vs PERFORMANCE ANALYSIS:")

        is_consensus = counts_df['model'] == 'consensus'

        # Predictions per (document, model), summed over all event types.
        model_totals = (
            counts_df[~is_consensus]
            .groupby(['document', 'model'])[event_types]
            .sum()
            .sum(axis=1)
            .reset_index()
        )
        model_totals.columns = ['document', 'model', 'total_predictions']

        # Reference annotations per document, summed over all event types.
        consensus_totals = (
            counts_df[is_consensus]
            .groupby('document')[event_types]
            .sum()
            .sum(axis=1)
            .reset_index()
        )
        consensus_totals.columns = ['document', 'consensus_total']

        perf_counts = (
            results_df
            .merge(model_totals, on=['document', 'model'], how='inner')
            .merge(consensus_totals, on='document', how='inner')
        )

        if not perf_counts.empty:
            # Ratio of model predictions to consensus annotations per row.
            perf_counts['prediction_ratio'] = perf_counts['total_predictions'] / perf_counts['consensus_total']

            per_model = perf_counts.groupby('model').agg({
                'f1': 'mean',
                'prediction_ratio': 'mean'
            }).round(3)

            print("Model Performance vs Prediction Ratio:")
            print(per_model.to_string())

    # Section 3: the dedicated plot collections.
    if have_runtime:
        create_runtime_visualizations(runtime_df)

    if have_counts:
        create_annotation_count_visualizations(counts_df, event_types)

def run_comprehensive_analysis():
    """Run the evaluation, show the combined analysis, and save the CSV data.

    The CSV files go to ``{PIPELINE_RESULTS_FOLDER}/runtime_analysis``, which
    is created here if missing so the function also works when no earlier
    cell has created it. When the evaluation frame is empty nothing is
    written and a hint is printed instead.
    """
    print("🚀 Starting Comprehensive LLM Analysis...")

    # Use the globally defined pipeline results folder
    evaluator = LLMEventResultsEvaluator(output_dir=PIPELINE_RESULTS_FOLDER)

    # Run evaluation
    results = evaluator.run_evaluation()

    results_df = results['evaluation']
    runtime_df = results['runtime']
    counts_df = results['annotation_counts']

    if not results_df.empty:
        print(f"\n✅ Analysis completed successfully!")
        print(f"📊 Evaluation data: {len(results_df)} rows")
        print(f"⏱️  Runtime data: {len(runtime_df)} rows")
        print(f"📋 Annotation count data: {len(counts_df)} rows")

        # Create comprehensive analysis
        create_comprehensive_analysis(results_df, runtime_df, counts_df, evaluator.event_annotation_types)

        # to_csv() does not create missing parent directories.
        analysis_dir = f"{PIPELINE_RESULTS_FOLDER}/runtime_analysis"
        Path(analysis_dir).mkdir(parents=True, exist_ok=True)

        # Save all data
        results_df.to_csv(f"{analysis_dir}/llm_event_evaluation_results.csv", index=False)
        if not runtime_df.empty:
            runtime_df.to_csv(f"{analysis_dir}/llm_runtime_analysis.csv", index=False)
        if not counts_df.empty:
            counts_df.to_csv(f"{analysis_dir}/llm_annotation_counts.csv", index=False)

        # Name the directory actually written to rather than a fixed "output/".
        print(f"\n💾 All data saved to {analysis_dir}/ directory")

    else:
        print("❌ No results generated. Check if result folders and JSON files exist.")

# Run comprehensive analysis
if __name__ == "__main__":
    run_comprehensive_analysis()

In [None]:
# Save all documents with predictions using JSON format
def save_all_documents_with_predictions():
    """Save every document that carries model predictions as a ``.bdocjs`` file.

    An ``LLMEventResultsEvaluator`` on ``PIPELINE_RESULTS_FOLDER`` processes all
    result JSONs it finds (presumably this is what attaches the
    ``*_predictions`` annotation sets to the evaluator's corpus — verify against
    the evaluator). For each document having at least one ``*_predictions`` set,
    the annotations are copied into a set named after the model (the
    ``source`` and ``model`` features are dropped) and the document is written to
    ``<PIPELINE_RESULTS_FOLDER>/annotated_corpus_with_predictions/``.
    An ``annotation_summary.txt`` describing the export is written alongside.

    Returns:
        str: path of the output directory.
    """
    # Load corpus and process predictions using the global folder
    evaluator = LLMEventResultsEvaluator(PIPELINE_RESULTS_FOLDER)
    result_folders = evaluator.find_result_folders()

    if result_folders:
        for folder in result_folders:
            for json_path in evaluator.find_result_jsons(folder):
                evaluator.process_result_json(json_path, folder.name)

    output_dir = Path(f"{PIPELINE_RESULTS_FOLDER}/annotated_corpus_with_predictions")
    output_dir.mkdir(parents=True, exist_ok=True)

    saved_count = 0
    model_sets_created = set()  # model names actually exported, for the summary

    for doc in evaluator.corpus:
        doc_name = Path(doc.features.get("gate.SourceURL", "")).stem
        pred_annsets = [name for name in doc.annset_names() if name.endswith("_predictions")]

        if not pred_annsets:
            continue

        print(f"📋 Processing document: {doc_name}")

        # Create permanent annotation sets named after the model
        for annset_name in pred_annsets:
            model_name = annset_name.replace("_predictions", "")
            pred_annset = doc.annset(annset_name)
            permanent_annset = doc.annset(model_name)
            permanent_annset.clear()  # avoid duplicates when the set already exists

            for ann in pred_annset:
                features = dict(ann.features)
                # Drop evaluation-only metadata before copying
                features.pop("source", None)
                features.pop("model", None)
                permanent_annset.add(ann.start, ann.end, ann.type, features)

            model_sets_created.add(model_name)
            print(f"  ✅ Created {model_name} annotation set: {len(pred_annset)} annotations")

        # Best effort: one failing document must not abort the whole export
        try:
            output_file = output_dir / f"{doc_name}.bdocjs"
            doc.save(str(output_file), fmt="bdocjs")
            saved_count += 1
            print(f"  💾 Saved: {output_file.name}")
        except Exception as e:
            print(f"  ❌ Failed to save {doc_name}: {e}")

    # Documents are written with the .bdocjs extension, so that is what must be
    # listed here (globbing "*.json" never matched the saved files).
    files = sorted(output_dir.glob("*.bdocjs"))
    print(f"\n✅ Successfully saved {saved_count} documents with predictions!")
    print(f"\n📁 Files in {output_dir}:")
    for file in files:
        print(f"  📄 {file.name}")

    # Update summary file
    summary_file = output_dir / "annotation_summary.txt"
    with open(summary_file, 'w', encoding='utf-8') as f:
        f.write("Annotated Corpus Summary\n")
        f.write("=" * 50 + "\n\n")
        f.write(f"Generated on: {pd.Timestamp.now()}\n")
        f.write(f"Total documents: {len(files)}\n")
        f.write("Format: JSON (GateNLP BDOC format)\n\n")
        f.write("Model annotation sets included:\n")
        # List the sets that were really created instead of a fixed model list
        for model in sorted(model_sets_created):
            f.write(f"- {model}\n")
        f.write("\n")
        f.write("To view in Gate:\n")
        f.write("1. Open Gate Developer\n")
        f.write("2. Load documents from this directory\n")
        f.write("3. Select JSON/BDOC format when loading\n")
        f.write("4. View different annotation sets in the annotation sets panel\n")

    return str(output_dir)

# Run the export; `result` receives the output directory path (a string)
# returned by save_all_documents_with_predictions().
result = save_all_documents_with_predictions()
print(f"\n🎯 FINAL SUCCESS! Annotated corpus saved to: {result}")

In [None]:
# Load corpus for viewing with GateNLP CorpusViewer
from GatenlpUtils import loadCorpus
from gatenlp.visualization import CorpusViewer

# Load corpus WITH model predictions for comparison
def load_corpus_with_predictions():
    """Return a freshly loaded corpus enriched with per-model annotation sets.

    A second corpus lives inside an ``LLMEventResultsEvaluator`` that processes
    every result JSON under ``PIPELINE_RESULTS_FOLDER``. Documents of the two
    corpora are paired by position; for each pair, every ``*_predictions`` set
    of the evaluator document is copied into a set named after the model on the
    fresh document, without the ``source`` and ``model`` features.

    Returns:
        The corpus obtained from ``loadCorpus()`` with the copied sets added.
    """
    # Corpus that will be handed back to the caller
    corpus_with_preds = loadCorpus()

    # Let the evaluator ingest every result JSON from the global folder
    evaluator = LLMEventResultsEvaluator(PIPELINE_RESULTS_FOLDER)
    result_folders = evaluator.find_result_folders()
    if result_folders:
        for folder in result_folders:
            for json_path in evaluator.find_result_jsons(folder):
                evaluator.process_result_json(json_path, folder.name)

    # Transfer the predictions, pairing documents by their index
    docs_with_preds = 0
    for position, target_doc in enumerate(corpus_with_preds):
        if position >= len(evaluator.corpus):
            continue  # no counterpart in the evaluator corpus

        source_doc = evaluator.corpus[position]
        prediction_set_names = [
            set_name
            for set_name in source_doc.annset_names()
            if set_name.endswith("_predictions")
        ]
        if not prediction_set_names:
            continue

        docs_with_preds += 1
        for set_name in prediction_set_names:
            source_set = source_doc.annset(set_name)

            # Target set is named after the model and emptied before refilling
            target_set = target_doc.annset(set_name.replace("_predictions", ""))
            target_set.clear()

            for annotation in source_set:
                copied_features = dict(annotation.features)
                for metadata_key in ("source", "model"):
                    copied_features.pop(metadata_key, None)
                target_set.add(annotation.start, annotation.end, annotation.type, copied_features)

    print(f"Added predictions to {docs_with_preds} documents")
    return corpus_with_preds

# Build the prediction-enriched corpus and open it in GateNLP's CorpusViewer.
# `corpus_with_predictions` and `viewer_with_preds` are module-level names.
corpus_with_predictions = load_corpus_with_predictions()
viewer_with_preds = CorpusViewer(corpus_with_predictions)
viewer_with_preds.show()



In [None]:
# NOTE: the triple-quoted string below is disabled code kept for reference.
# Being a plain string expression, it does not save any document when run.
"""
# Save documents with predictions in BDOC format for GateNLP
output_dir = Path(f"{PIPELINE_RESULTS_FOLDER}/annotated_corpus_with_predictions")
for doc_pred in corpus_with_predictions:
    # Save each document with predictions in BDOC format
    doc_name = Path(doc_pred.features.get("gate.SourceURL", "")).stem
    output_file = output_dir / f"{doc_name}.bdocjs"
    doc_pred.save(str(output_file), fmt="bdocjs")
"""


In [None]:
def _metric_or_zero(value):
    """Render a metric with three decimals; a missing (NaN/None) value renders as "0.000"."""
    return "0.000" if pd.isna(value) else f"{value:.3f}"


def _count_or_zero(value):
    """Convert a count to ``int``; a missing (NaN/None) value becomes 0."""
    return 0 if pd.isna(value) else int(value)


def _tally_event_types(annset):
    """Count the ``Event`` annotations of ``annset`` by their ``type`` feature."""
    tally = {}
    for ann in annset.with_type("Event"):
        event_type = ann.features.get("type", "unspecified")
        tally[event_type] = tally.get(event_type, 0) + 1
    return tally


def create_document_level_performance_tables():
    """Print and save per-document precision / recall / F1 tables.

    Runs ``LLMEventResultsEvaluator`` on ``PIPELINE_RESULTS_FOLDER`` and, for each
    document in the evaluation frame:

    * prints one row per (annotation type, model) for the four Event-related
      annotation types,
    * prints the document's metrics averaged per model and per annotation type,
    * prints the five rows with the highest F1,
    * writes the table to ``performance_tables/<document>_performance_table.csv``.

    Afterwards it prints summaries across all documents, lists how often each
    ``type`` feature value occurs on ``Event`` annotations in the ``consensus``
    set and in every ``*_predictions`` set, and writes the complete evaluation
    frame to ``consolidated_document_performance_summary.csv``.

    Returns:
        None. Returns early, after printing a message, when there are no results.
    """
    print("📊 CREATING DOCUMENT-LEVEL PERFORMANCE TABLES")
    print("=" * 80)

    # Load the evaluator and run the analysis
    evaluator = LLMEventResultsEvaluator(PIPELINE_RESULTS_FOLDER)
    results = evaluator.run_evaluation()
    results_df = results['evaluation']

    if results_df.empty:
        print("❌ No evaluation results found!")
        return

    # Create a performance table output directory
    performance_table_output_dir = Path(f"{PIPELINE_RESULTS_FOLDER}/performance_tables")
    performance_table_output_dir.mkdir(parents=True, exist_ok=True)

    documents = sorted(results_df['document'].unique())

    # Annotation types to analyze
    annotation_types = ["Event", "Event_who", "Event_what", "Event_when"]

    # Aggregations shared by the per-document and the corpus-wide summaries
    mean_aggregation = {
        'precision': 'mean',
        'recall': 'mean',
        'f1': 'mean',
        'gold_count': 'sum',
        'pred_count': 'sum'
    }
    spread_aggregation = {
        'precision': ['mean', 'std'],
        'recall': ['mean', 'std'],
        'f1': ['mean', 'std'],
        'gold_count': 'sum',
        'pred_count': 'sum'
    }

    for doc_name in documents:
        print(f"\n📄 DOCUMENT: {doc_name}")
        print("=" * (len(doc_name) + 20))

        doc_data = results_df[results_df['document'] == doc_name]

        if doc_data.empty:
            print(f"No data found for document: {doc_name}")
            continue

        print(f"\n🎯 Performance Metrics for {doc_name}")
        print("-" * 100)

        # One table row per (annotation type, model) evaluation row
        table_data = []
        for ann_type in annotation_types:
            type_data = doc_data[doc_data['annotation_type'] == ann_type]
            for _, row in type_data.iterrows():
                table_data.append({
                    'Annotation_Type': ann_type,
                    'Model': row['model'],
                    'Precision': _metric_or_zero(row['precision']),
                    'Recall': _metric_or_zero(row['recall']),
                    'F1_Score': _metric_or_zero(row['f1']),
                    'Gold_Count': _count_or_zero(row['gold_count']),
                    'Pred_Count': _count_or_zero(row['pred_count']),
                    # 'matches' may be absent from the frame; treat that as 0
                    'Matches': _count_or_zero(row.get('matches', 0))
                })

        if table_data:
            doc_df = pd.DataFrame(table_data)
            print(doc_df.to_string(index=False))

            print(f"\n📈 Summary Statistics for {doc_name}:")
            print("-" * 50)

            # Overall performance by model
            model_summary = doc_data.groupby('model').agg(mean_aggregation).round(3)
            print("\nOverall Performance by Model:")
            print(model_summary.to_string())

            # Performance by annotation type
            type_summary = doc_data.groupby('annotation_type').agg(mean_aggregation).round(3)
            print(f"\nPerformance by Annotation Type:")
            print(type_summary.to_string())

            # Best performing combinations for this document
            print(f"\n🏆 Best Performing Model-Type Combinations for {doc_name}:")
            best_combos = doc_data.nlargest(5, 'f1')[['model', 'annotation_type', 'f1', 'precision', 'recall']]
            print(best_combos.to_string(index=False))

            # Save document-specific results
            output_file = performance_table_output_dir / f"{doc_name}_performance_table.csv"
            doc_df.to_csv(output_file, index=False)
            print(f"\n💾 Document table saved to: {output_file}")

        print("\n" + "=" * 100)

    # Consolidated summary tables across all documents
    print(f"\n📊 CONSOLIDATED SUMMARY ACROSS ALL DOCUMENTS")
    print("=" * 80)

    overall_model_summary = results_df.groupby('model').agg(spread_aggregation).round(3)
    print("\n🔍 Overall Performance by Model (across all documents):")
    print(overall_model_summary.to_string())

    overall_type_summary = results_df.groupby('annotation_type').agg(spread_aggregation).round(3)
    print(f"\n🎯 Overall Performance by Annotation Type (across all documents):")
    print(overall_type_summary.to_string())

    # Event type analysis, read from the documents of the evaluator's corpus
    print(f"\n🏷️  EVENT TYPE ANALYSIS:")
    print("-" * 40)

    evaluated_documents = set(documents)
    # Best effort: this section is informational, so a failure is reported
    # and the consolidated CSV below is still written.
    try:
        for doc in evaluator.corpus:
            doc_name = Path(doc.features.get("gate.SourceURL", "")).stem
            if doc_name not in evaluated_documents:
                continue

            # Event types in the gold ("consensus") annotations
            consensus_types = _tally_event_types(doc.annset("consensus"))
            if consensus_types:
                print(f"\n{doc_name} - Consensus Event Types:")
                for etype, count in sorted(consensus_types.items()):
                    print(f"  {etype}: {count}")

            # Event types in each model's predictions
            pred_annsets = [name for name in doc.annset_names() if name.endswith("_predictions")]
            for annset_name in pred_annsets:
                model_name = annset_name.replace("_predictions", "")
                pred_event_types = _tally_event_types(doc.annset(annset_name))
                if pred_event_types:
                    print(f"{doc_name} - {model_name} Event Types:")
                    for etype, count in sorted(pred_event_types.items()):
                        print(f"  {etype}: {count}")

    except Exception as e:
        print(f"Could not analyze event types: {e}")

    # Save consolidated results
    consolidated_output = f"{PIPELINE_RESULTS_FOLDER}/consolidated_document_performance_summary.csv"
    results_df.to_csv(consolidated_output, index=False)
    print(f"\n💾 Consolidated results saved to: {consolidated_output}")

    print(f"\n✅ Document-level performance analysis complete!")
    # Point at the directory the per-document tables were really written to
    print(f"📁 Individual document tables saved to: {performance_table_output_dir}/")
    print(f"🔍 Check the generated CSV files for detailed per-document analysis")

# Run the document-level analysis (prints the tables and writes the CSV files;
# the function returns nothing, so there is no value to keep).
create_document_level_performance_tables()

In [None]:
def _metric_table_by_type(doc_data, annotation_types, models, metric):
    """Build one row per annotation type holding each model's formatted ``metric`` value."""
    rows = []
    for ann_type in annotation_types:
        row = {'Annotation_Type': ann_type}
        type_data = doc_data[doc_data['annotation_type'] == ann_type]
        for model in models:
            model_type_data = type_data[type_data['model'] == model]
            if model_type_data.empty:
                row[model] = "N/A"  # this model has no row for this type
            else:
                value = model_type_data[metric].iloc[0]
                row[model] = "0.000" if pd.isna(value) else f"{value:.3f}"
        rows.append(row)
    return pd.DataFrame(rows)


def create_concise_document_performance_summary():
    """Print and save compact per-document performance tables.

    Runs ``LLMEventResultsEvaluator`` on ``PIPELINE_RESULTS_FOLDER`` and produces:

    * a master table with, per (document, model), the metrics averaged over all
      rows of that pair plus summed gold / predicted counts, saved to
      ``performance_tables/master_document_performance_summary.csv``;
    * per document, an F1, a precision, a recall and an annotation-count table
      (annotation types as rows, models as columns), saved to
      ``document_summaries/<document>_{f1_scores,precision,recall,counts}.csv``.

    Returns:
        None. Returns early, after printing a message, when there are no results.
    """
    print("📊 CONCISE DOCUMENT PERFORMANCE SUMMARY")
    print("=" * 80)

    # Load the evaluator and run the analysis
    evaluator = LLMEventResultsEvaluator(PIPELINE_RESULTS_FOLDER)
    results = evaluator.run_evaluation()
    results_df = results['evaluation']

    if results_df.empty:
        print("❌ No evaluation results found!")
        return

    documents = sorted(results_df['document'].unique())
    annotation_types = ["Event", "Event_who", "Event_what", "Event_when"]

    # Create both output folders up front so this function does not depend on
    # another cell having created them first.
    performance_dir = Path(PIPELINE_RESULTS_FOLDER) / "performance_tables"
    performance_dir.mkdir(parents=True, exist_ok=True)
    doc_output_dir = Path(f"{PIPELINE_RESULTS_FOLDER}/document_summaries")
    doc_output_dir.mkdir(parents=True, exist_ok=True)

    # Master summary table
    print("\n📋 MASTER SUMMARY TABLE - All Documents")
    print("=" * 120)

    master_data = []
    for doc_name in documents:
        doc_data = results_df[results_df['document'] == doc_name]
        if doc_data.empty:
            continue

        # Average metrics across all annotation types for each model
        for model in sorted(doc_data['model'].unique()):
            model_data = doc_data[doc_data['model'] == model]
            master_data.append({
                'Document': doc_name,
                'Model': model,
                'Avg_Precision': f"{model_data['precision'].mean():.3f}",
                'Avg_Recall': f"{model_data['recall'].mean():.3f}",
                'Avg_F1': f"{model_data['f1'].mean():.3f}",
                'Total_Gold': int(model_data['gold_count'].sum()),
                'Total_Pred': int(model_data['pred_count'].sum())
            })

    master_df = pd.DataFrame(master_data)
    print(master_df.to_string(index=False))

    master_output = performance_dir / "master_document_performance_summary.csv"
    master_df.to_csv(master_output, index=False)
    print(f"\n💾 Master summary saved to: {master_output}")

    # (heading, column in results_df, file suffix) of the three metric tables
    metric_tables = [
        ("F1 Scores", 'f1', "f1_scores"),
        ("Precision", 'precision', "precision"),
        ("Recall", 'recall', "recall"),
    ]

    # Individual document tables in a more readable format
    for doc_name in documents:
        print(f"\n\n📄 {doc_name}")
        print("=" * (len(doc_name) + 4))

        doc_data = results_df[results_df['document'] == doc_name]
        if doc_data.empty:
            continue

        models = sorted(doc_data['model'].unique())

        tables = {}
        for heading, metric, suffix in metric_tables:
            print(f"\n🎯 {heading} by Annotation Type:")
            print("-" * 60)
            tables[suffix] = _metric_table_by_type(doc_data, annotation_types, models, metric)
            print(tables[suffix].to_string(index=False))

        print(f"\n📊 Annotation Counts:")
        print("-" * 40)

        # Annotation counts table
        count_data = []
        for ann_type in annotation_types:
            row = {'Type': ann_type}
            type_data = doc_data[doc_data['annotation_type'] == ann_type]

            if not type_data.empty:
                # Gold count should be the same across all models for the same annotation type
                gold_count = type_data['gold_count'].iloc[0]
                row['Gold'] = int(gold_count) if not pd.isna(gold_count) else 0

                for model in models:
                    model_type_data = type_data[type_data['model'] == model]
                    if not model_type_data.empty:
                        pred_count = model_type_data['pred_count'].iloc[0]
                        row[f'{model}_Pred'] = int(pred_count) if not pd.isna(pred_count) else 0
                    else:
                        row[f'{model}_Pred'] = 0

            count_data.append(row)

        count_df = pd.DataFrame(count_data)
        print(count_df.to_string(index=False))

        # Save all tables for this document
        for suffix, table in tables.items():
            table.to_csv(doc_output_dir / f"{doc_name}_{suffix}.csv", index=False)
        count_df.to_csv(doc_output_dir / f"{doc_name}_counts.csv", index=False)

    print(f"\n\n✅ CONCISE SUMMARY COMPLETE!")
    print(f"📁 Master summary: {master_output}")
    print(f"📁 Individual document summaries: {PIPELINE_RESULTS_FOLDER}/document_summaries/")
    print(f"🔍 Each document has separate CSV files for F1, Precision, Recall, and Counts")

# Run the concise summary (prints the tables and writes the CSV files;
# the function returns nothing, so there is no value to keep).
create_concise_document_performance_summary()

In [None]:
def save_clean_documents_without_predictions():
    """Export documents that keep model-named sets but no ``*_predictions`` sets.

    For every document of the evaluator's corpus that has ``*_predictions``
    annotation sets, the annotations are copied into sets named after the model
    (dropping the ``source`` and ``model`` features), the ``*_predictions`` sets
    are removed, and the document is written as ``.bdocjs`` to
    ``<PIPELINE_RESULTS_FOLDER>/clean_annotated_corpus/``. A summary text file is
    written next to the documents and the first exported file is scanned for the
    string ``_predictions`` as a sanity check.

    Returns:
        str: path of the clean output directory.
    """
    print("🧹 SAVING CLEAN DOCUMENTS (WITHOUT _predictions ANNOTATION SETS)")
    print("=" * 80)

    # Evaluator on the global folder processes every result JSON
    evaluator = LLMEventResultsEvaluator(PIPELINE_RESULTS_FOLDER)
    result_folders = evaluator.find_result_folders()
    if result_folders:
        for folder in result_folders:
            for json_path in evaluator.find_result_jsons(folder):
                evaluator.process_result_json(json_path, folder.name)

    # Destination for the cleaned documents
    clean_output_dir = Path(f"{PIPELINE_RESULTS_FOLDER}/clean_annotated_corpus")
    clean_output_dir.mkdir(parents=True, exist_ok=True)

    n_saved = 0
    model_sets_created = set()
    suffix = "_predictions"

    for doc in evaluator.corpus:
        doc_name = Path(doc.features.get("gate.SourceURL", "")).stem
        prediction_sets = [s for s in doc.annset_names() if s.endswith(suffix)]
        if not prediction_sets:
            continue  # nothing to clean in this document

        print(f"📋 Cleaning document: {doc_name}")

        # Step 1: rebuild the model-named sets from the temporary ones
        for set_name in prediction_sets:
            model_name = set_name.replace(suffix, "")
            temporary_set = doc.annset(set_name)

            model_set = doc.annset(model_name)
            model_set.clear()

            for annotation in temporary_set:
                kept_features = dict(annotation.features)
                # Evaluation-specific metadata is not carried over
                for metadata_key in ("source", "model"):
                    kept_features.pop(metadata_key, None)
                model_set.add(annotation.start, annotation.end, annotation.type, kept_features)

            model_sets_created.add(model_name)
            print(f"  ✅ Clean {model_name} annotation set: {len(temporary_set)} annotations")

        # Step 2: drop every temporary set (names are looked up again here)
        for temp_annset_name in [s for s in doc.annset_names() if s.endswith(suffix)]:
            try:
                doc.remove_annset(temp_annset_name)
            except Exception as e:
                print(f"  ⚠️  Could not remove {temp_annset_name}: {e}")
            else:
                print(f"  🗑️  Removed temporary set: {temp_annset_name}")

        # Step 3: report the named sets that are left
        remaining_sets = [s for s in doc.annset_names() if s]
        print(f"  📊 Remaining annotation sets: {remaining_sets}")

        # Step 4: write the cleaned document
        try:
            clean_output_file = clean_output_dir / f"{doc_name}.bdocjs"
            doc.save(str(clean_output_file), fmt="bdocjs")
            n_saved += 1
            print(f"  💾 Saved clean document: {clean_output_file.name}")
        except Exception as e:
            print(f"  ❌ Failed to save clean {doc_name}: {e}")

    # Report what is now in the output directory
    clean_files = list(clean_output_dir.glob("*.bdocjs"))
    print(f"\n✅ Successfully saved {len(clean_files)} CLEAN documents!")
    print(f"🗑️  Removed all temporary '*_predictions' annotation sets")
    print(f"📊 Created clean annotation sets for models: {sorted(model_sets_created)}")

    print(f"\n📁 Clean files saved to: {clean_output_dir}")
    for exported in clean_files:
        print(f"  📄 {exported.name}")

    # Assemble the summary text, then write it in one go
    summary_parts = [
        "Clean Annotated Corpus Summary\n",
        "=" * 50 + "\n\n",
        f"Generated on: {pd.Timestamp.now()}\n",
        f"Total documents: {len(clean_files)}\n",
        "Format: JSON (GateNLP BDOC format)\n\n",
        "✅ CLEAN VERSION - No temporary annotation sets included\n",
        "Model annotation sets included:\n",
    ]
    summary_parts.extend(f"- {model}\n" for model in sorted(model_sets_created))
    summary_parts.extend([
        "\nRemoved annotation sets:\n",
        "- All *_predictions sets (temporary evaluation sets)\n\n",
        "To view in Gate:\n",
        "1. Open Gate Developer\n",
        "2. Load documents from this directory\n",
        "3. Select JSON/BDOC format when loading\n",
        "4. View different annotation sets in the annotation sets panel\n",
        "5. Only consensus and model prediction sets should be visible\n",
    ])
    clean_summary_file = clean_output_dir / "clean_annotation_summary.txt"
    with open(clean_summary_file, 'w', encoding='utf-8') as f:
        f.write("".join(summary_parts))

    # Sanity check on the first exported file: search its text for the suffix
    if clean_files:
        test_file = clean_files[0]
        print(f"\n🔍 VERIFICATION: Checking {test_file.name} for cleanliness...")

        with open(test_file, 'r', encoding='utf-8') as f:
            content = f.read()
        if '_predictions' not in content:
            print(f"  ✅ VERIFIED: {test_file.name} is clean - no '_predictions' found!")
        else:
            print(f"  ⚠️  WARNING: {test_file.name} still contains '_predictions' strings!")

    return str(clean_output_dir)

# Run the clean export; `clean_result` receives the output directory path
# (a string) returned by save_clean_documents_without_predictions().
clean_result = save_clean_documents_without_predictions()
print(f"\n🎯 CLEAN CORPUS SUCCESS! Clean documents saved to: {clean_result}")
# Reminder of the annotation sets expected in the exported documents.
print(f"\n📝 The clean corpus should now only contain:")
print(f"   - 'consensus' annotation sets (gold standard)")
print(f"   - Model name annotation sets (e.g., 'gemma3:1b', 'mistral:latest')")
print(f"   - NO '*_predictions' annotation sets in corpus")