In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
pip install anthropic pandas

Collecting anthropic
  Downloading anthropic-0.55.0-py3-none-any.whl.metadata (27 kB)
Downloading anthropic-0.55.0-py3-none-any.whl (289 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m289.3/289.3 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.55.0


# CLAUDE

In [None]:
"""
GROUND TRUTH Forensic Analysis Performance Evaluator using Claude Sonnet 4
Requirements: pip install anthropic pandas

🚀 GROUND TRUTH EVALUATION MODE: Evaluates ground truth forensic analyses!

✅ MODIFIED VERSION: Now evaluates ground truth data instead of technique comparisons!
🔒 GUARANTEED 100% DATA PROCESSING: Verifies all ground truth files are processed!
🤖 CLAUDE SONNET 4: Uses latest Claude Sonnet 4 (claude-sonnet-4-20250514) model!

Features:
- ✅ Processes ground truth evaluations in optimized batches
- ✅ Concurrent execution with thread pool for speed
- ✅ Batch-level checkpointing and error recovery
- ✅ Smart load balancing across API calls
- ✅ Automatic retry logic for failed evaluations
- ✅ Progress tracking at both batch and individual level
- ✅ Memory-optimized data loading per batch
- ✅ Claude Sonnet 4 with enhanced context window (300K+ chars)

⚡ PERFORMANCE BENEFITS:
- 6x faster processing through parallelization
- Better API utilization with concurrent requests
- Reduced total evaluation time from hours to minutes
- Intelligent batch sizing to respect rate limits
- Enhanced capabilities with Claude Sonnet 4

🎯 GROUND TRUTH STRATEGY: Evaluates each ground truth analysis against forensic standards
and provides comprehensive quality assessment for reference data.

🔧 MODIFICATION: Adapted for ground truth evaluation with flat file structure!
🤖 MODEL: Uses Claude Sonnet 4 (claude-sonnet-4-20250514) as primary model!
"""

import json
import os
import pandas as pd
from pathlib import Path
import anthropic
import time
from datetime import datetime
import re
import concurrent.futures
import threading
from collections import defaultdict
import logging

# Configuration for the ground-truth evaluation run.
# All paths point into a Google Drive shared drive mounted under /content/drive.
API_KEY_FILE = "/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/API-KEYS/claude.txt"  # Plain-text file read by load_api_key()
SOURCE_DIR = "/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/GroundTruth"  # presumably the directory of *GroundTruth.json inputs - confirm against the caller
OUTPUT_DIR = "/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/GROUNDTRUTH-EVALUATION-RESULTS"  # presumably the root for results/logs - confirm against the caller

# Batch processing configuration
BATCH_SIZE = 6  # Number of evaluation tasks grouped into one batch
MAX_WORKERS = 6  # Thread-pool size used by process_batch()
RETRY_ATTEMPTS = 10  # Default max_retries for evaluate_with_claude()
BATCH_RETRY_ATTEMPTS = 3  # NOTE(review): not referenced in the visible part of this file - confirm it is used
INTER_BATCH_DELAY = 10  # Seconds; retry_failed_batches() sleeps twice this between retry batches

# Shared run statistics. Worker threads in process_single_evaluation() update
# these counters while holding evaluation_lock.
evaluation_lock = threading.Lock()
success_counter = 0
failure_counter = 0
truncation_counter = 0

def setup_logging(output_dir):
    """Configure root logging to a timestamped file plus the console.

    Creates ``<output_dir>/logs`` if it does not exist and installs a file
    handler and a stream handler on the root logger at INFO level.

    Args:
        output_dir: Directory under which the ``logs`` folder is created.

    Returns:
        The path of the log file that was configured.
    """
    log_dir = os.path.join(output_dir, "logs")
    os.makedirs(log_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"groundtruth_evaluation_{timestamp}.log")

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            # Log messages in this file contain emoji, so pin the encoding
            # instead of relying on the platform default.
            logging.FileHandler(log_file, encoding="utf-8"),
            logging.StreamHandler(),
        ],
        # Module-level code in this file calls logging.info(...) while the
        # file is being loaded, which implicitly configures the root logger.
        # Without force=True this call would then be a silent no-op and the
        # log file would never receive any records.
        force=True,
    )

    return log_file

# Load the Claude API key from the configured key file
def load_api_key():
    """Return the stripped contents of API_KEY_FILE, or None if reading fails.

    Success and failure are both reported on stdout and through ``logging``.
    """
    loaded_message = "✅ Claude API key loaded successfully"
    try:
        with open(API_KEY_FILE, "r") as key_file:
            key_value = key_file.read().strip()
        print(loaded_message)
        logging.info(loaded_message)
    except Exception as load_error:
        failure_message = f"❌ Error loading API key: {load_error}"
        print(failure_message)
        logging.error(failure_message)
        key_value = None
    return key_value

# Initialize the Anthropic client.
# NOTE: this runs when the file is loaded and sends one small live request
# to verify connectivity.
ANTHROPIC_API_KEY = load_api_key()
if ANTHROPIC_API_KEY:
    # Build the client exactly once. Only the connectivity check below is
    # allowed to fail softly; a failed check keeps this same client so that
    # the evaluation run can still try to use it.
    client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
    print("✅ Claude client initialized")
    logging.info("✅ Claude client initialized")

    try:
        # Test API connection
        print("🔍 Testing API connection...")
        logging.info("🔍 Testing API connection...")
        test_response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=10,
            messages=[{"role": "user", "content": "Hi"}]
        )
        print("✅ API connection successful")
        logging.info("✅ API connection successful")
    except Exception as e:
        print(f"⚠️  API setup warning: {e}")
        logging.warning(f"⚠️  API setup warning: {e}")
        print("🔄 Continuing anyway - will test during evaluation")
        logging.info("🔄 Continuing anyway - will test during evaluation")
else:
    # No usable key: leave the client unset so callers can detect it.
    client = None
    print("❌ Failed to initialize Claude client")
    logging.error("❌ Failed to initialize Claude client")

# Scoring rubric interpolated verbatim into every evaluation prompt by
# create_groundtruth_evaluation_prompt(). It defines ten criteria worth 10
# points each. The numbered criterion titles must stay in sync with the
# regular expressions in parse_evaluation_response(), which match the same
# titles in the model's reply.
EVALUATION_CRITERIA = """
GROUND TRUTH EVALUATION CRITERIA:
Evaluate how well this ground truth analysis serves as a reference standard for forensic analysis training and evaluation.

1. Crime Classification and Intent Detection (10 points)
Forensic Question: How accurately and comprehensively does the ground truth identify and classify all criminal offenses with precise legal terminology?
Rating Scale:
10 points: Perfect identification of all crime types with expert-level legal precision, suitable as definitive reference
8-9 points: Highly accurate crime classification with minor gaps, excellent reference quality
6-7 points: Good crime identification but missing some nuances or secondary offenses
3-5 points: Basic crime classification with significant gaps or inaccuracies
0-2 points: Poor crime identification, unsuitable as reference standard

2. Temporal Forensic Reconstruction (10 points)
Forensic Question: How precisely does the ground truth establish the complete chronological sequence with forensic-quality timestamps?
Rating Scale:
10 points: Exemplary timeline reconstruction with precise timestamps, perfect reference standard
8-9 points: Excellent chronology with most critical timestamps, high-quality reference
6-7 points: Good temporal reconstruction but missing some key markers
3-5 points: Basic timeline with significant gaps
0-2 points: Poor temporal documentation, inadequate for reference use

3. Subject Identification and Behavioral Analysis (10 points)
Forensic Question: How thoroughly does the ground truth document all identifying features and behavioral patterns for training purposes?
Rating Scale:
10 points: Comprehensive subject documentation with expert-level behavioral analysis, ideal training reference
8-9 points: Detailed subject profiles with strong behavioral insights, excellent reference quality
6-7 points: Good identification details but lacks some forensic descriptors
3-5 points: Basic subject documentation with limited behavioral analysis
0-2 points: Inadequate subject identification for training purposes

4. Physical Evidence Documentation (10 points)
Forensic Question: How completely does the ground truth catalog all physical evidence with proper forensic methodology?
Rating Scale:
10 points: Complete evidence inventory with expert forensic documentation, perfect reference standard
8-9 points: Comprehensive evidence documentation with strong forensic methodology
6-7 points: Good evidence tracking but missing some items or details
3-5 points: Basic evidence documentation with gaps
0-2 points: Poor evidence documentation, unsuitable for reference

5. Violence Assessment and Weapon Analysis (10 points)
Forensic Question: How expertly does the ground truth document violence dynamics, weapons, and force patterns?
Rating Scale:
10 points: Expert-level violence and weapon analysis, definitive reference quality
8-9 points: Excellent violence documentation with detailed weapon analysis
6-7 points: Good violence assessment but missing some analytical depth
3-5 points: Basic violence documentation with limited analysis
0-2 points: Poor violence assessment, inadequate for reference use

6. Criminal Network and Coordination Analysis (10 points)
Forensic Question: How completely does the ground truth map criminal relationships and communication patterns?
Rating Scale:
10 points: Complete criminal network mapping with expert analysis, perfect reference
8-9 points: Excellent network analysis with detailed relationship documentation
6-7 points: Good coordination analysis but missing some connections
3-5 points: Basic network documentation with gaps
0-2 points: Poor network analysis, unsuitable for reference

7. Modus Operandi Documentation (10 points)
Forensic Question: How expertly does the ground truth identify and document criminal methods for pattern recognition training?
Rating Scale:
10 points: Expert MO analysis with comprehensive signature behavior documentation, ideal reference
8-9 points: Excellent MO identification with detailed criminal methodology
6-7 points: Good MO documentation but lacks some analytical depth
3-5 points: Basic MO identification with limited detail
0-2 points: Poor MO analysis, inadequate for training purposes

8. Scene Analysis and Environmental Context (10 points)
Forensic Question: How thoroughly does the ground truth document scene characteristics and environmental factors?
Rating Scale:
10 points: Complete scene analysis with expert environmental assessment, perfect reference
8-9 points: Excellent scene documentation with comprehensive context analysis
6-7 points: Good scene analysis but missing some environmental factors
3-5 points: Basic scene documentation with limited context
0-2 points: Poor scene analysis, unsuitable for reference use

9. Escape Route and Exit Strategy Analysis (10 points)
Forensic Question: How completely does the ground truth reconstruct escape routes and document exit strategies?
Rating Scale:
10 points: Complete escape analysis with expert route mapping, definitive reference quality
8-9 points: Excellent escape documentation with detailed route analysis
6-7 points: Good escape route identification but missing some details
3-5 points: Basic escape documentation with gaps
0-2 points: Poor escape analysis, inadequate for reference

10. Reference Quality and Training Utility (10 points)
Forensic Question: How well does the ground truth serve as a comprehensive reference standard for training and evaluation?
Rating Scale:
10 points: Exemplary reference quality with perfect structure, ideal for training and evaluation
8-9 points: Excellent reference standard with high training utility
6-7 points: Good reference quality but could be enhanced for training
3-5 points: Basic reference quality with limited training utility
0-2 points: Poor reference quality, unsuitable for training or evaluation purposes
"""

def load_groundtruth_files(directory):
    """Load every ``*GroundTruth.json`` file found directly in *directory*.

    The crime category is the file name with its ``-GroundTruth.json`` (or
    bare ``GroundTruth.json``) suffix removed, e.g.
    ``Abuse-GroundTruth.json`` -> ``Abuse``. Files are processed in sorted
    name order so repeated runs see the same category order.

    Args:
        directory: Path of the flat directory holding the ground truth files.

    Returns:
        A ``(groundtruth_data, stats)`` tuple. ``groundtruth_data`` maps
        category name to the parsed JSON value; ``stats`` has the integer
        keys ``total_files``, ``loaded_files`` and ``failed_files``. Files
        that are unreadable, invalid JSON, or parse to an empty/falsy value
        count as failed. A missing directory yields empty data and zeros.
    """
    print(f"🔍 Scanning ground truth directory: {directory}")
    logging.info(f"🔍 Scanning ground truth directory: {directory}")

    groundtruth_data = {}
    stats = {'total_files': 0, 'loaded_files': 0, 'failed_files': 0}

    # isdir (not exists) so that a path pointing at a regular file is reported
    # here instead of crashing in os.listdir below.
    if not os.path.isdir(directory):
        print(f"❌ Directory not found: {directory}")
        logging.error(f"❌ Directory not found: {directory}")
        return groundtruth_data, stats

    dashed_suffix = '-GroundTruth.json'
    plain_suffix = 'GroundTruth.json'

    # Every name ending in the dashed suffix also ends in the plain one, so a
    # single check is enough. os.listdir order is arbitrary, hence sorted().
    json_files = sorted(f for f in os.listdir(directory) if f.endswith(plain_suffix))

    print(f"📁 Found {len(json_files)} ground truth files")
    logging.info(f"📁 Found {len(json_files)} ground truth files")

    for json_file in json_files:
        stats['total_files'] += 1

        # Strip the suffix from the END of the name only; str.replace would
        # also remove an occurrence in the middle of the name.
        suffix = dashed_suffix if json_file.endswith(dashed_suffix) else plain_suffix
        crime_category = json_file[:-len(suffix)]

        file_path = os.path.join(directory, json_file)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"    ❌ Error loading {json_file}: {e}")
            logging.error(f"    ❌ Error loading {json_file}: {e}")
            stats['failed_files'] += 1
            continue

        # Any falsy payload ({}, [], null, "", 0) is treated as an empty file.
        if not data:
            print(f"    ⚠️  Empty file: {crime_category}")
            logging.warning(f"    ⚠️  Empty file: {crime_category}")
            stats['failed_files'] += 1
            continue

        if crime_category in groundtruth_data:
            # e.g. "Abuse-GroundTruth.json" and "AbuseGroundTruth.json".
            logging.warning(f"    ⚠️  Duplicate category {crime_category}: {json_file} replaces earlier file")

        groundtruth_data[crime_category] = data
        stats['loaded_files'] += 1
        print(f"    ✅ Loaded: {crime_category}")
        logging.info(f"    ✅ Loaded: {crime_category}")

    print(f"\n📊 Ground Truth Loading Summary:")
    print(f"  Total files found: {stats['total_files']}")
    print(f"  Successfully loaded: {stats['loaded_files']}")
    print(f"  Failed to load: {stats['failed_files']}")
    logging.info(f"📊 Ground Truth Loading Summary: {stats['loaded_files']}/{stats['total_files']} loaded successfully")

    return groundtruth_data, stats

def truncate_analysis_data(analysis_data, max_chars=300000):
    """Render *analysis_data* as prompt text, capped at *max_chars* characters.

    A dict is rendered as one titled section per key (nested dicts are
    pretty-printed as JSON, other values via ``str``). Anything else is
    rendered with ``str``. If the rendered text is longer than *max_chars*
    it is cut at exactly *max_chars* characters and a bracketed note with
    the shown and total character counts is appended. The note itself is not
    counted against *max_chars*.

    Args:
        analysis_data: Parsed ground truth content (dict or any other value).
        max_chars: Maximum number of content characters to keep.

    Returns:
        The rendered, possibly truncated, text.
    """
    if not isinstance(analysis_data, dict):
        text = str(analysis_data)
        if len(text) > max_chars:
            return text[:max_chars] + f"\n\n[NOTE: Ground truth analysis truncated. Showing first {max_chars:,} characters of {len(text):,} total characters]"
        return text

    sections = []
    for key, value in analysis_data.items():
        header = f"\n--- Ground Truth Analysis {key} ---\n"
        if isinstance(value, dict):
            sections.append(header + json.dumps(value, indent=2))
        else:
            sections.append(header + str(value) + "\n")

    total_chars = sum(len(section) for section in sections)
    if total_chars <= max_chars:
        return "".join(sections)

    # Over budget: keep whole sections while they fit, then keep the leading
    # part of the first section that does not. Dropping that section entirely
    # would return no analysis content at all when a single large section
    # exceeds the budget on its own.
    kept = []
    shown_chars = 0
    for section in sections:
        remaining = max(0, max_chars - shown_chars)
        if len(section) > remaining:
            kept.append(section[:remaining])
            shown_chars += remaining
            break
        kept.append(section)
        shown_chars += len(section)

    kept.append(
        f"\n\n[NOTE: Ground truth analysis truncated due to length. Showing first {shown_chars:,} characters of {total_chars:,} total characters]"
    )
    return "".join(kept)

def create_groundtruth_evaluation_prompt(crime_category, groundtruth_data):
    """Build the full evaluation prompt for one ground truth analysis.

    The prompt contains, in order: the evaluator role, the task statement
    naming *crime_category*, the scoring rubric (EVALUATION_CRITERIA), the
    rendered ground truth content, and the required answer format.

    Args:
        crime_category: Category label inserted into the task statement.
        groundtruth_data: Parsed ground truth content to be evaluated.

    Returns:
        The prompt string.
    """
    # Render the analysis and cap it at 300,000 characters before embedding.
    analysis_text = truncate_analysis_data(groundtruth_data, max_chars=300000)

    return f"""
You are a forensic analysis expert evaluating GROUND TRUTH reference data.

EVALUATION TASK:
Evaluate the following GROUND TRUTH forensic analysis for a {crime_category} incident. This is reference-quality data that should serve as a training standard for forensic analysts.

EVALUATION CRITERIA:
{EVALUATION_CRITERIA}

GROUND TRUTH ANALYSIS TO EVALUATE:
{analysis_text}

EVALUATION INSTRUCTIONS:
1. Carefully read the provided ground truth analysis
2. Score each of the 10 criteria on a scale of 0-10 points
3. Assess whether this analysis meets the standards expected of reference-quality ground truth data
4. Provide your evaluation in the following EXACT format:

SCORES:
1. Crime Classification and Intent Detection: [score]/10
2. Temporal Forensic Reconstruction: [score]/10
3. Subject Identification and Behavioral Analysis: [score]/10
4. Physical Evidence Documentation: [score]/10
5. Violence Assessment and Weapon Analysis: [score]/10
6. Criminal Network and Coordination Analysis: [score]/10
7. Modus Operandi Documentation: [score]/10
8. Scene Analysis and Environmental Context: [score]/10
9. Escape Route and Exit Strategy Analysis: [score]/10
10. Reference Quality and Training Utility: [score]/10

TOTAL SCORE: [sum]/100

BRIEF JUSTIFICATION:
[Provide 2-3 sentences explaining the overall assessment of this ground truth analysis quality]

Be objective and rigorous in your scoring. Ground truth data should meet the highest forensic standards and serve as excellent reference material for training and evaluation.
"""

def _truncate_prompt_analysis(prompt, max_chars, note):
    """Shorten only the analysis payload inside *prompt* to *max_chars*.

    The payload is the text between the "GROUND TRUTH ANALYSIS TO EVALUATE:"
    marker and the last "EVALUATION INSTRUCTIONS:" heading. The instructions
    and answer format that follow the payload are kept intact, because a
    reply produced without them cannot be parsed. Returns *prompt* unchanged
    when the marker is absent or the payload already fits.
    """
    marker = "GROUND TRUTH ANALYSIS TO EVALUATE:"
    head, found, rest = prompt.partition(marker)
    if not found:
        return prompt

    # rfind: the payload itself might contain this heading; the last
    # occurrence is the one that belongs to the prompt template.
    tail_start = rest.rfind("\nEVALUATION INSTRUCTIONS:")
    if tail_start == -1:
        analysis, tail = rest, ""
    else:
        analysis, tail = rest[:tail_start], rest[tail_start:]

    if len(analysis) <= max_chars:
        return prompt
    return f"{head}{found}{analysis[:max_chars]}\n\n[{note}]\n{tail}"


def evaluate_with_claude(prompt, evaluation_id, max_retries=RETRY_ATTEMPTS):
    """Send *prompt* to the Claude API and return the reply text.

    Each attempt tries the models in ``models_to_try`` in order and returns
    the first successful reply, so a reply may come from a fallback model.
    When every model fails, the attempt's errors decide what happens next:

    * "too long" errors switch on emergency truncation of the analysis
      payload for all later attempts;
    * rate-limit errors wait 30-90 seconds before the next attempt;
    * any other failure waits 5-20 seconds (progressive backoff).

    Independently of the error type, from the third attempt onward the
    analysis payload is progressively shortened (150,000 characters on the
    third attempt, down to a floor of 30,000).

    Args:
        prompt: Full evaluation prompt.
        evaluation_id: Label used in console and log messages.
        max_retries: Maximum number of attempts.

    Returns:
        The text of the first content block of the reply, or None if the
        client is unavailable or every attempt failed.
    """
    if client is None:
        # Without a client every call would fail with AttributeError and
        # burn through all retries and their delays.
        print(f"    ❌ {evaluation_id} - Claude client is not initialized")
        logging.error(f"    ❌ {evaluation_id} - Claude client is not initialized")
        return None

    # Defined before the retry loop so nothing reads it before assignment.
    models_to_try = [
        "claude-sonnet-4-20250514",      # Primary model
        "claude-3-5-sonnet-20241022",    # First fallback
        "claude-3-haiku-20240307"        # Second fallback
    ]
    max_tokens_by_model = {
        "claude-sonnet-4-20250514": 2500,
        "claude-3-5-sonnet-20241022": 2000,
        "claude-3-haiku-20240307": 1500,
    }

    # Sticky across attempts: once the API rejects the prompt as too long,
    # every later attempt must use the reduced payload.
    prompt_too_long = False

    for attempt in range(max_retries):
        print(f"    🤖 {evaluation_id} - Attempt {attempt + 1}")
        logging.debug(f"    🤖 {evaluation_id} - Attempt {attempt + 1}")

        # Work out the payload budget for this attempt (None = no truncation).
        char_budget = None
        note = ""
        if attempt >= 2:
            char_budget = max(30000, 250000 - (attempt * 50000))
            note = f"BATCH TRUNCATION: Attempt {attempt + 1}, {char_budget:,} chars"
        if prompt_too_long:
            emergency_chars = max(15000, 60000 - (attempt * 15000))
            if char_budget is None or emergency_chars < char_budget:
                char_budget = emergency_chars
                note = f"EMERGENCY TRUNCATION: {emergency_chars:,} chars"

        if char_budget is None:
            current_prompt = prompt
        else:
            current_prompt = _truncate_prompt_analysis(prompt, char_budget, note)

        attempt_errors = []
        for model in models_to_try:
            try:
                response = client.messages.create(
                    model=model,
                    max_tokens=max_tokens_by_model.get(model, 2000),
                    temperature=0.1,
                    messages=[{
                        "role": "user",
                        "content": current_prompt
                    }]
                )
                reply_text = response.content[0].text
            except Exception as model_error:
                # Broad on purpose: this is the API boundary and any failure
                # (HTTP error, empty content list, ...) means "try the next
                # model". The message is kept to classify the failure below.
                print(f"    ❌ {evaluation_id} - {model} failed: {str(model_error)[:50]}...")
                logging.warning(f"    ❌ {evaluation_id} - {model} failed: {str(model_error)[:50]}...")
                attempt_errors.append(str(model_error).lower())
                continue

            print(f"    ✅ {evaluation_id} - SUCCESS with {model}")
            logging.info(f"    ✅ {evaluation_id} - SUCCESS with {model}")
            return reply_text

        # Every model failed on this attempt.
        if attempt >= max_retries - 1:
            break

        too_long_now = any("too long" in err for err in attempt_errors)
        rate_limited = any("rate limit" in err or "rate_limit" in err for err in attempt_errors)

        if too_long_now:
            prompt_too_long = True

        if rate_limited:
            wait_time = min(30 + (attempt * 10), 90)
            print(f"    ⏳ {evaluation_id} - Rate limit, waiting {wait_time}s...")
            logging.info(f"    ⏳ {evaluation_id} - Rate limit, waiting {wait_time}s...")
        elif too_long_now:
            # The next attempt sends a smaller prompt; no need to wait.
            wait_time = 0
        else:
            # Progressive backoff
            wait_time = min(5 + (attempt * 2), 20)

        if wait_time:
            time.sleep(wait_time)

    print(f"    ❌ {evaluation_id} - Failed after {max_retries} attempts")
    logging.error(f"    ❌ {evaluation_id} - Failed after {max_retries} attempts")
    return None

def process_single_evaluation(evaluation_task):
    """Run one ground truth evaluation end to end; safe to call from worker threads.

    Builds the prompt, queries Claude, parses the reply, and updates the
    shared success/failure/truncation counters under ``evaluation_lock``.

    Args:
        evaluation_task: ``(crime_category, groundtruth_data, eval_number,
            total_evals)`` tuple. Only the first two items are used here.

    Returns:
        A result dict. On success it has ``success=True`` plus
        ``total_score``, ``detailed_scores`` and ``response``; on failure it
        has ``success=False`` and an ``error`` code (``NO_RESPONSE``,
        ``PARSE_ERROR`` or ``EXCEPTION: ...``).
    """
    global success_counter, failure_counter, truncation_counter

    crime_category, groundtruth_data, _eval_number, _total_evals = evaluation_task
    evaluation_id = f"GroundTruth-{crime_category}"

    def _record_failure(message, error_code):
        """Report *message*, bump the failure counter, and build the failure result."""
        global failure_counter
        print(message)
        logging.error(message)
        with evaluation_lock:
            failure_counter += 1
        return {
            'crime_category': crime_category,
            'success': False,
            'error': error_code,
            'evaluation_id': evaluation_id
        }

    try:
        # Rough payload size, used only for the truncation statistic.
        if isinstance(groundtruth_data, dict):
            payload_chars = sum(len(str(item)) for item in groundtruth_data.values())
        else:
            payload_chars = len(str(groundtruth_data))

        if payload_chars > 200000:
            with evaluation_lock:
                truncation_counter += 1

        prompt = create_groundtruth_evaluation_prompt(crime_category, groundtruth_data)
        response = evaluate_with_claude(prompt, evaluation_id)

        if not response:
            return _record_failure(f"    ❌ {evaluation_id} - No response", 'NO_RESPONSE')

        scores = parse_evaluation_response(response)
        if not scores:
            return _record_failure(f"    ❌ {evaluation_id} - Failed to parse evaluation", 'PARSE_ERROR')

        result = {
            'crime_category': crime_category,
            'total_score': scores['Total_Score'],
            'detailed_scores': scores,
            'success': True,
            'response': response,
            'evaluation_id': evaluation_id
        }

        with evaluation_lock:
            success_counter += 1

        score_message = f"    ✅ {evaluation_id} - Score: {scores['Total_Score']}/100"
        print(score_message)
        logging.info(score_message)
        return result

    except Exception as e:
        return _record_failure(f"    ❌ {evaluation_id} - Exception: {e}", f'EXCEPTION: {str(e)}')

def process_batch(batch_tasks, batch_number, total_batches):
    """Evaluate every task in *batch_tasks* concurrently on a thread pool.

    Args:
        batch_tasks: Task tuples accepted by process_single_evaluation.
        batch_number: Label of this batch, used only in messages.
        total_batches: Number of batches, used only in messages.

    Returns:
        One result dict per task, in completion order. A task whose future
        raises is reported as a failure with a ``FUTURE_EXCEPTION`` error.
    """
    start_message = f"🚀 BATCH {batch_number}/{total_batches} - Processing {len(batch_tasks)} ground truth evaluations"
    print("\n" + start_message)
    logging.info(start_message)

    batch_results = []
    started_at = time.time()

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as pool:
        # Remember which task each future belongs to so a crash can be
        # attributed to its crime category.
        pending = {}
        for task in batch_tasks:
            pending[pool.submit(process_single_evaluation, task)] = task

        for finished in concurrent.futures.as_completed(pending):
            source_task = pending[finished]
            try:
                batch_results.append(finished.result())
            except Exception as exc:
                crash_message = f"    ❌ Task generated exception: {exc}"
                print(crash_message)
                logging.error(crash_message)
                failed_category = source_task[0]
                batch_results.append({
                    'crime_category': failed_category,
                    'success': False,
                    'error': f'FUTURE_EXCEPTION: {str(exc)}',
                    'evaluation_id': f"GroundTruth-{failed_category}"
                })

    elapsed = time.time() - started_at
    successful_in_batch = len([entry for entry in batch_results if entry['success']])

    done_message = f"📊 BATCH {batch_number} COMPLETE - {successful_in_batch}/{len(batch_tasks)} successful ({elapsed:.1f}s)"
    print(done_message)
    logging.info(done_message)

    return batch_results

def create_evaluation_batches(groundtruth_data):
    """Turn the loaded ground truth data into batches of evaluation tasks.

    Args:
        groundtruth_data: Mapping of crime category to its parsed analysis.

    Returns:
        A list of batches. Each batch is a list of at most BATCH_SIZE tasks,
        and each task is ``(crime_category, analysis, eval_number,
        total_evals)`` with ``eval_number`` counting from 1.
    """
    print("🔧 Creating ground truth evaluation batches...")
    logging.info("🔧 Creating ground truth evaluation batches...")

    total_evals = len(groundtruth_data)

    # One task per category, numbered in the mapping's iteration order.
    all_tasks = [
        (category, analysis, position, total_evals)
        for position, (category, analysis) in enumerate(groundtruth_data.items(), start=1)
    ]

    # Consecutive slices of BATCH_SIZE tasks; the last one may be shorter.
    batches = [
        all_tasks[offset:offset + BATCH_SIZE]
        for offset in range(0, len(all_tasks), BATCH_SIZE)
    ]

    print(f"📦 Created {len(batches)} batches of {BATCH_SIZE} evaluations each")
    print(f"📊 Total ground truth evaluations: {total_evals}")
    logging.info(f"📦 Created {len(batches)} batches of {BATCH_SIZE} evaluations each, total: {total_evals}")

    return batches

def retry_failed_batches(failed_results, groundtruth_data):
    """Re-run failed evaluations in smaller batches.

    Args:
        failed_results: Failure result dicts; each needs a ``crime_category``.
        groundtruth_data: Mapping of crime category to its parsed analysis.
            Failures whose category is not in this mapping are skipped.

    Returns:
        The result dicts produced by the retry batches, or an empty list
        when there is nothing to retry.
    """
    if not failed_results:
        return []

    failed_count = len(failed_results)
    announcement = f"🔄 RETRYING {failed_count} failed ground truth evaluations..."
    print("\n" + announcement)
    logging.info(announcement)

    # Rebuild a task for every failure whose source data is still available.
    retry_tasks = [
        (entry['crime_category'], groundtruth_data[entry['crime_category']], 0, failed_count)
        for entry in failed_results
        if entry['crime_category'] in groundtruth_data
    ]
    if not retry_tasks:
        return []

    # Retries run at half the normal batch size (at least one task per batch).
    retry_batch_size = max(1, BATCH_SIZE // 2)
    task_count = len(retry_tasks)
    total_retry_batches = (task_count + retry_batch_size - 1) // retry_batch_size

    retry_results = []
    offsets = range(0, task_count, retry_batch_size)
    for batch_number, offset in enumerate(offsets, start=1):
        progress = f"🔄 RETRY BATCH {batch_number}/{total_retry_batches}"
        print(progress)
        logging.info(progress)

        chunk = retry_tasks[offset:offset + retry_batch_size]
        retry_results.extend(process_batch(chunk, f"RETRY-{batch_number}", total_retry_batches))

        # Pause twice as long as usual, except after the final retry batch.
        more_batches_follow = offset + retry_batch_size < task_count
        if more_batches_follow:
            time.sleep(INTER_BATCH_DELAY * 2)

    return retry_results

# (result key, criterion title as it appears in the prompt's answer format),
# in criterion order 1..10.
_SCORE_CRITERIA = (
    ("Crime_Classification", "Crime Classification and Intent Detection"),
    ("Temporal_Reconstruction", "Temporal Forensic Reconstruction"),
    ("Subject_Identification", "Subject Identification and Behavioral Analysis"),
    ("Physical_Evidence", "Physical Evidence Documentation"),
    ("Violence_Assessment", "Violence Assessment and Weapon Analysis"),
    ("Criminal_Network", "Criminal Network and Coordination Analysis"),
    ("Modus_Operandi", "Modus Operandi Documentation"),
    ("Scene_Analysis", "Scene Analysis and Environmental Context"),
    ("Escape_Route", "Escape Route and Exit Strategy Analysis"),
    ("Reference_Quality", "Reference Quality and Training Utility"),
)


def parse_evaluation_response(response_text):
    """Extract the per-criterion scores from Claude's evaluation reply.

    Score lines are matched case-insensitively in the form
    ``<n>. <criterion title>: <score>``. The score may be wrapped in square
    brackets (the prompt's ``[score]/10`` template) and the title or score
    may carry markdown bold markers.

    Args:
        response_text: Raw reply text.

    Returns:
        A dict with one integer entry per criterion that was found, plus
        ``Total_Score`` (sum of the found scores) and ``Justification``
        (text after "BRIEF JUSTIFICATION:", possibly empty). Returns None
        when the text is empty, when no criterion score can be found, or
        when parsing raises.
    """
    if not response_text:
        return None

    scores = {}
    total_score = 0
    justification = ""

    try:
        for number, (key, title) in enumerate(_SCORE_CRITERIA, start=1):
            # (?<!\d) keeps "1." from matching inside a longer number.
            # [\s*\[]* skips whitespace, bold markers and an opening bracket
            # between the colon and the digits.
            pattern = rf"(?<!\d){number}\.\s*\**\s*{re.escape(title)}\s*\**\s*:[\s*\[]*(\d+)"
            match = re.search(pattern, response_text, re.IGNORECASE)
            if match:
                score = int(match.group(1))
                scores[key] = score
                total_score += score

        if not scores:
            # A reply without a single score line is a parse failure, not a
            # legitimate 0/100 result; None lets the caller record it as such.
            print("  ❌ No criterion scores found in response")
            logging.error("  ❌ No criterion scores found in response")
            return None

        missing = [key for key, _ in _SCORE_CRITERIA if key not in scores]
        if missing:
            # The total is then a sum over fewer than ten criteria.
            print(f"  ⚠️  Scores missing for: {', '.join(missing)}")
            logging.warning(f"  ⚠️  Scores missing for: {', '.join(missing)}")

        # The reported total is only cross-checked; the calculated sum is
        # what gets returned.
        total_match = re.search(r"TOTAL SCORE:[\s*\[]*(\d+)", response_text, re.IGNORECASE)
        if total_match:
            reported_total = int(total_match.group(1))
            if reported_total != total_score and total_score > 0:
                print(f"  ⚠️  Total score mismatch: calculated={total_score}, reported={reported_total}")
                logging.warning(f"  ⚠️  Total score mismatch: calculated={total_score}, reported={reported_total}")

        # Justification runs up to the first blank line or the end of text.
        just_match = re.search(r"BRIEF JUSTIFICATION:\s*(.+?)(?:\n\n|\Z)", response_text, re.DOTALL | re.IGNORECASE)
        if just_match:
            # Drop bold markers left over from a "**BRIEF JUSTIFICATION:**" heading.
            justification = just_match.group(1).strip().lstrip('*').strip()

        scores['Total_Score'] = total_score
        scores['Justification'] = justification

        return scores

    except Exception as e:
        print(f"  ❌ Error parsing response: {e}")
        logging.error(f"  ❌ Error parsing response: {e}")
        return None

def verify_complete_data_processing(groundtruth_data, results):
    """Report how many ground truth categories have a numeric score in ``results``.

    A category counts as processed only when its entry in ``results`` is an
    ``int`` or ``float``. Any other value (for example an error string) or a
    missing entry marks the category as unprocessed.

    Args:
        groundtruth_data: Mapping keyed by crime category; only the keys are
            consulted here.
        results: Mapping of category to either a numeric score or some
            non-numeric failure marker.

    Returns:
        dict with the keys ``total_data_files``, ``processed_files``,
        ``unprocessed_files``, ``completion_percentage`` and
        ``missing_evaluations``. The percentage stays ``0`` when
        ``groundtruth_data`` is empty.
    """
    print("\n🔍 VERIFYING COMPLETE GROUND TRUTH DATA PROCESSING...")
    logging.info("🔍 VERIFYING COMPLETE GROUND TRUTH DATA PROCESSING...")

    # Pair each category with whatever was recorded for it (or the
    # "NOT_ATTEMPTED" placeholder when nothing was recorded at all).
    outcomes = [(name, results.get(name, "NOT_ATTEMPTED")) for name in groundtruth_data]
    pending = [
        (name, outcome)
        for name, outcome in outcomes
        if not isinstance(outcome, (int, float))
    ]

    total_count = len(outcomes)
    done_count = total_count - len(pending)
    pending_names = [name for name, _ in pending]

    verification_report = {
        'total_data_files': total_count,
        'processed_files': done_count,
        'unprocessed_files': pending_names,
        'completion_percentage': (done_count / total_count * 100) if total_count > 0 else 0,
        'missing_evaluations': [
            {
                'category': name,
                'result': outcome,
                # Only string markers are meaningful as a failure reason.
                'reason': outcome if isinstance(outcome, str) else "UNKNOWN"
            }
            for name, outcome in pending
        ]
    }
    completion = verification_report['completion_percentage']

    # Console summary first, then the same figures in the log.
    print("📊 GROUND TRUTH DATA PROCESSING VERIFICATION:")
    print(f"  Total ground truth files available: {total_count}")
    print(f"  Files successfully processed: {done_count}")
    print(f"  Files unprocessed: {len(pending_names)}")
    print(f"  Completion rate: {completion:.2f}%")

    logging.info("📊 GROUND TRUTH DATA PROCESSING VERIFICATION:")
    logging.info(f"  Total ground truth files: {total_count}")
    logging.info(f"  Successfully processed: {done_count}")
    logging.info(f"  Unprocessed: {len(pending_names)}")
    logging.info(f"  Completion rate: {completion:.2f}%")

    if not pending_names:
        print("🎉 ALL GROUND TRUTH FILES HAVE BEEN PROCESSED!")
        logging.info("🎉 ALL GROUND TRUTH FILES HAVE BEEN PROCESSED!")
        return verification_report

    print("⚠️  UNPROCESSED GROUND TRUTH FILES:")
    logging.warning("⚠️  UNPROCESSED GROUND TRUTH FILES:")
    for file_name in pending_names:
        print(f"    - {file_name}")
        logging.warning(f"    - {file_name}")

    return verification_report

def _apply_emergency_truncation(prompt, max_prompt_chars=100000, max_analysis_chars=30000):
    """Cap the analysis portion of an oversized evaluation prompt.

    Prompts at or below ``max_prompt_chars`` are returned untouched. For longer
    prompts, everything after the first line containing
    "GROUND TRUTH ANALYSIS TO EVALUATE:" is cut to ``max_analysis_chars``
    characters and a truncation notice is appended. If the marker is missing,
    or the text after it is already short enough, the prompt is returned as is.

    Args:
        prompt: Full prompt text.
        max_prompt_chars: Prompt length above which truncation is attempted.
        max_analysis_chars: Maximum characters kept after the marker line.

    Returns:
        The original or the shortened prompt string.
    """
    if len(prompt) <= max_prompt_chars:
        return prompt

    lines = prompt.split('\n')
    marker_index = next(
        (idx for idx, line in enumerate(lines)
         if "GROUND TRUTH ANALYSIS TO EVALUATE:" in line),
        None,
    )
    if marker_index is None:
        return prompt

    analysis_section = '\n'.join(lines[marker_index + 1:])
    if len(analysis_section) <= max_analysis_chars:
        return prompt

    pre_analysis = '\n'.join(lines[:marker_index + 1])
    # The two halves came from a split on '\n', so the separator has to be put
    # back; otherwise the marker line runs straight into the analysis text.
    # NOTE(review): anything the prompt places after the analysis text is cut
    # along with it - confirm the prompt template has no trailing instructions.
    return (
        pre_analysis
        + '\n'
        + analysis_section[:max_analysis_chars]
        + "\n\n[FORCE PROCESSING TRUNCATION]"
    )


def force_process_remaining_data(groundtruth_data, results, detailed_results):
    """Re-attempt, one at a time, every category that still lacks a numeric score.

    Categories whose entry in ``results`` is not an ``int``/``float`` (or is
    missing) are evaluated sequentially in groups of three, with a 5 second
    pause after each evaluation and a 20 second pause between groups.

    Args:
        groundtruth_data: Mapping of crime category to the ground truth data
            passed to ``create_groundtruth_evaluation_prompt``.
        results: Mapping of category to score or failure marker. Updated in
            place: the score on success, ``"FORCE_PARSE_FAILED"`` or
            ``"FORCE_API_FAILED"`` otherwise.
        detailed_results: List of per-evaluation row dicts. A row is appended
            in place for every successful force evaluation.

    Returns:
        int: number of categories successfully evaluated in this pass
        (``0`` when nothing was unprocessed).
    """
    # Declared up front so every later use of the counter is unambiguous.
    # NOTE(review): the retry phase in run_groundtruth_evaluation also lowers
    # failure_counter on success; this pass does not - confirm that is intended.
    global success_counter

    print("\n🔒 FORCE PROCESSING REMAINING GROUND TRUTH DATA...")
    logging.info("🔒 FORCE PROCESSING REMAINING GROUND TRUTH DATA...")

    # Anything without a numeric score counts as not successfully processed.
    unprocessed_tasks = [
        (category, data)
        for category, data in groundtruth_data.items()
        if not isinstance(results.get(category, "NOT_ATTEMPTED"), (int, float))
    ]

    if not unprocessed_tasks:
        print("✅ No unprocessed ground truth data found!")
        logging.info("✅ No unprocessed ground truth data found!")
        return 0

    print(f"🔧 Found {len(unprocessed_tasks)} unprocessed ground truth files")
    logging.info(f"🔧 Found {len(unprocessed_tasks)} unprocessed ground truth files")

    # Process in smaller batches with more conservative settings
    force_batch_size = 3  # Smaller batches for problematic data
    total_force_batches = (len(unprocessed_tasks) + force_batch_size - 1) // force_batch_size
    force_processed = 0

    for start in range(0, len(unprocessed_tasks), force_batch_size):
        batch = unprocessed_tasks[start:start + force_batch_size]
        batch_number = (start // force_batch_size) + 1

        print(f"🔧 FORCE BATCH {batch_number}/{total_force_batches} - Processing {len(batch)} files")
        logging.info(f"🔧 FORCE BATCH {batch_number}/{total_force_batches} - Processing {len(batch)} files")

        for category, data in batch:
            evaluation_id = f"FORCE-GroundTruth-{category}"

            # Build the prompt, shrinking the analysis text if it is very long.
            prompt = _apply_emergency_truncation(
                create_groundtruth_evaluation_prompt(category, data)
            )

            # Single-threaded processing for force mode
            response = evaluate_with_claude(prompt, evaluation_id, max_retries=15)

            if response:
                scores = parse_evaluation_response(response)

                # A zero total is treated the same as an unparseable response.
                if scores and scores.get('Total_Score', 0) > 0:
                    results[category] = scores['Total_Score']

                    detailed_entry = {
                        'Crime_Category': category,
                        'Total_Score': scores['Total_Score'],
                        'Justification': scores.get('Justification', 'Force processed'),
                        'Raw_Response': response,
                        'Batch_Number': 'FORCE',
                        'Processing_Type': 'FORCE_COMPLETE'
                    }

                    # Per-criterion scores become their own columns.
                    for criterion, score in scores.items():
                        if criterion not in ['Total_Score', 'Justification']:
                            detailed_entry[criterion] = score

                    detailed_results.append(detailed_entry)
                    force_processed += 1

                    print(f"    ✅ FORCE SUCCESS: {evaluation_id} - Score: {scores['Total_Score']}/100")
                    logging.info(f"    ✅ FORCE SUCCESS: {evaluation_id} - Score: {scores['Total_Score']}/100")

                    # Update global counters
                    with evaluation_lock:
                        success_counter += 1
                else:
                    results[category] = "FORCE_PARSE_FAILED"
                    print(f"    ❌ FORCE PARSE FAILED: {evaluation_id}")
                    logging.error(f"    ❌ FORCE PARSE FAILED: {evaluation_id}")
            else:
                results[category] = "FORCE_API_FAILED"
                print(f"    ❌ FORCE API FAILED: {evaluation_id}")
                logging.error(f"    ❌ FORCE API FAILED: {evaluation_id}")

            # Longer delay for force processing
            time.sleep(5)

        # Longer delay between force batches (skipped after the last one)
        if start + force_batch_size < len(unprocessed_tasks):
            time.sleep(20)

    still_failed = len(unprocessed_tasks) - force_processed

    print("\n📊 FORCE PROCESSING SUMMARY:")
    print(f"  Unprocessed files found: {len(unprocessed_tasks)}")
    print(f"  Successfully force processed: {force_processed}")
    print(f"  Still failed after force processing: {still_failed}")

    logging.info("📊 FORCE PROCESSING SUMMARY:")
    logging.info(f"  Unprocessed files: {len(unprocessed_tasks)}")
    logging.info(f"  Force processed: {force_processed}")
    logging.info(f"  Still failed: {still_failed}")

    return force_processed

def save_batch_checkpoint(results, detailed_results, batch_number, total_batches, output_dir):
    """Write the current progress to ``<output_dir>/checkpoints/groundtruth_checkpoint.json``.

    The snapshot holds ``results``, ``detailed_results``, batch progress, the
    module-level success/failure/truncation counters and an ISO timestamp.
    Serialisation and write errors are reported and swallowed so that a failed
    checkpoint never aborts the evaluation run; failure to create the
    checkpoint directory still raises.

    Args:
        results: Mapping of category to score or failure marker.
        detailed_results: List of per-evaluation row dicts.
        batch_number: Number of batches completed so far.
        total_batches: Total number of batches in the run.
        output_dir: Directory under which ``checkpoints/`` is created.
    """
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint_data = {
        'results': results,
        'detailed_results': detailed_results,
        'batch_progress': {
            'completed_batches': batch_number,
            'total_batches': total_batches,
            'completion_percentage': (batch_number / total_batches * 100) if total_batches > 0 else 0
        },
        'stats': {
            'successful_evaluations': success_counter,
            'failed_evaluations': failure_counter,
            'truncated_files': truncation_counter
        },
        'timestamp': datetime.now().isoformat()
    }

    checkpoint_file = os.path.join(checkpoint_dir, "groundtruth_checkpoint.json")
    # Write to a sibling temp file and swap it in afterwards, so an interrupt
    # or serialisation error mid-write cannot destroy the previous checkpoint.
    temp_file = checkpoint_file + ".tmp"
    try:
        with open(temp_file, 'w', encoding='utf-8') as f:
            json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
        os.replace(temp_file, checkpoint_file)
        print(f"    💾 Checkpoint saved ({batch_number}/{total_batches} batches)")
        logging.info(f"    💾 Checkpoint saved ({batch_number}/{total_batches} batches)")
    except Exception as e:
        # Best effort by design: report the problem and keep the run going.
        print(f"    ⚠️  Checkpoint save failed: {e}")
        logging.error(f"    ⚠️  Checkpoint save failed: {e}")
        try:
            if os.path.exists(temp_file):
                os.remove(temp_file)
        except OSError as cleanup_error:
            logging.debug(f"Could not remove temporary checkpoint file: {cleanup_error}")

def _build_detailed_entry(result, batch_label, processing_type):
    """Flatten one successful evaluation result into a row for the detailed table."""
    detailed_scores = result['detailed_scores']
    entry = {
        'Crime_Category': result['crime_category'],
        'Total_Score': result['total_score'],
        'Justification': detailed_scores.get('Justification', ''),
        'Raw_Response': result['response'],
        'Batch_Number': batch_label,
        'Processing_Type': processing_type
    }

    # Per-criterion scores become their own columns.
    for criterion, score in detailed_scores.items():
        if criterion not in ['Total_Score', 'Justification']:
            entry[criterion] = score

    return entry


def run_groundtruth_evaluation():
    """Run the full ground truth evaluation pipeline and write the result files.

    Phases, in order:
      1. Create the output directory, set up logging, check the API client.
      2. Load the ground truth files and split them into batches.
      3. Process each batch, checkpointing after every batch.
      4. Retry evaluations that failed during batch processing.
      5. Verify coverage and force-process anything still missing.
      6. Save the summary CSV, detailed CSV and complete JSON.
      7. Print the final statistics and remove the checkpoint on a >= 90%
         success rate.

    Returns early (``None``) when the API key/client is missing, when no
    ground truth data is found, or when no batches could be created.

    NOTE(review): a checkpoint is written after every batch, but nothing in
    this function reads one back - confirm whether resuming is implemented
    elsewhere.
    """
    # Declared up front; the retry phase below updates both counters.
    global success_counter, failure_counter

    print("🚀 Starting Ground Truth Forensic Analysis Performance Evaluation")
    print("🔒 GUARANTEED 100% GROUND TRUTH DATA PROCESSING - ALL FILES WILL BE EVALUATED")
    print(f"📁 Source Directory: {SOURCE_DIR}")
    print(f"💾 Output Directory: {OUTPUT_DIR}")
    print(f"📦 Batch Size: {BATCH_SIZE}")
    print(f"🧵 Max Workers: {MAX_WORKERS}")
    print("🎯 Strategy: Concurrent batch processing of ground truth with verification & force completion")

    # Setup
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    log_file = setup_logging(OUTPUT_DIR)

    # Check API key
    if not ANTHROPIC_API_KEY or not client:
        print("❌ Claude API key not loaded. Please check the API key file.")
        logging.error("❌ Claude API key not loaded. Please check the API key file.")
        return

    # Load all ground truth data
    print("\n" + "="*60)
    print("📖 LOADING GROUND TRUTH DATA")
    print("="*60)

    groundtruth_data, data_stats = load_groundtruth_files(SOURCE_DIR)

    if not groundtruth_data:
        print("❌ No ground truth data found. Exiting.")
        logging.error("❌ No ground truth data found. Exiting.")
        return

    # Create evaluation batches
    print("\n" + "="*60)
    print("📦 CREATING GROUND TRUTH EVALUATION BATCHES")
    print("="*60)

    batches = create_evaluation_batches(groundtruth_data)

    if not batches:
        print("❌ No evaluation batches created. Exiting.")
        logging.error("❌ No evaluation batches created. Exiting.")
        return

    # Initialize results
    results = {}
    detailed_results = []
    all_batch_results = []

    # Process all batches
    print("\n" + "="*60)
    print("🔍 PROCESSING GROUND TRUTH EVALUATION BATCHES")
    print("="*60)

    total_batches = len(batches)

    for batch_idx, batch in enumerate(batches, 1):
        print(f"\n📊 Overall Progress: {batch_idx}/{total_batches} batches")
        logging.info(f"📊 Overall Progress: {batch_idx}/{total_batches} batches")

        # Process batch
        batch_results = process_batch(batch, batch_idx, total_batches)
        all_batch_results.extend(batch_results)

        # Successes store the numeric score; failures store the error marker,
        # which is what the later verification step keys on.
        for result in batch_results:
            crime_category = result['crime_category']

            if result['success']:
                results[crime_category] = result['total_score']
                detailed_results.append(_build_detailed_entry(result, batch_idx, 'BATCH'))
            else:
                results[crime_category] = result['error']

        # Save checkpoint after each batch
        save_batch_checkpoint(results, detailed_results, batch_idx, total_batches, OUTPUT_DIR)

        # Inter-batch delay (except for last batch)
        if batch_idx < total_batches:
            print(f"⏳ Inter-batch delay: {INTER_BATCH_DELAY}s")
            logging.info(f"⏳ Inter-batch delay: {INTER_BATCH_DELAY}s")
            time.sleep(INTER_BATCH_DELAY)

    # Retry failed evaluations
    print("\n" + "="*60)
    print("🔄 RETRY PHASE - FAILED GROUND TRUTH EVALUATIONS")
    print("="*60)

    failed_results = [r for r in all_batch_results if not r['success']]

    if failed_results:
        print(f"🔄 Found {len(failed_results)} failed ground truth evaluations to retry")
        logging.info(f"🔄 Found {len(failed_results)} failed ground truth evaluations to retry")

        retry_results = retry_failed_batches(failed_results, groundtruth_data)

        # Update results with retry successes
        for result in retry_results:
            if not result['success']:
                continue

            results[result['crime_category']] = result['total_score']
            detailed_results.append(_build_detailed_entry(result, 'RETRY', 'RETRY'))

            # A retried success moves one evaluation from failed to successful.
            with evaluation_lock:
                success_counter += 1
                failure_counter = max(0, failure_counter - 1)
    else:
        print("🎉 No failed ground truth evaluations to retry!")
        logging.info("🎉 No failed ground truth evaluations to retry!")

    # VERIFY COMPLETE DATA PROCESSING
    print("\n" + "="*60)
    print("🔍 VERIFICATION: ENSURING ALL GROUND TRUTH DATA PROCESSED")
    print("="*60)

    verification_report = verify_complete_data_processing(groundtruth_data, results)

    # FORCE PROCESS REMAINING DATA if needed
    if verification_report['completion_percentage'] < 100:
        print("\n" + "="*60)
        print("🔒 FORCE PROCESSING - NO GROUND TRUTH DATA LEFT BEHIND")
        print("="*60)

        force_process_remaining_data(groundtruth_data, results, detailed_results)

        # Re-verify after force processing
        final_verification = verify_complete_data_processing(groundtruth_data, results)
        print(f"\n🎯 FINAL VERIFICATION: {final_verification['completion_percentage']:.2f}% complete")
        logging.info(f"🎯 FINAL VERIFICATION: {final_verification['completion_percentage']:.2f}% complete")
    else:
        print("🎉 VERIFICATION PASSED: 100% GROUND TRUTH DATA PROCESSING CONFIRMED!")
        logging.info("🎉 VERIFICATION PASSED: 100% GROUND TRUTH DATA PROCESSING CONFIRMED!")
        final_verification = verification_report

    # Save final results
    print("\n" + "="*60)
    print("💾 SAVING FINAL GROUND TRUTH RESULTS")
    print("="*60)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Get all crime categories
    crime_categories = sorted(groundtruth_data)

    # Ensure all categories have results
    for category in crime_categories:
        if category not in results:
            results[category] = "NOT_PROCESSED"

    # Save summary as series (since it's just one "technique" - ground truth)
    summary_df = pd.DataFrame({
        'Crime_Category': crime_categories,
        'Ground_Truth_Score': [results.get(cat, "NOT_PROCESSED") for cat in crime_categories]
    })
    summary_file = os.path.join(OUTPUT_DIR, f"groundtruth_summary_{timestamp}.csv")
    summary_df.to_csv(summary_file, index=False)
    print(f"✅ Summary saved: {summary_file}")
    logging.info(f"✅ Summary saved: {summary_file}")

    # Save detailed results
    if detailed_results:
        detailed_df = pd.DataFrame(detailed_results)
        detailed_file = os.path.join(OUTPUT_DIR, f"groundtruth_detailed_{timestamp}.csv")
        detailed_df.to_csv(detailed_file, index=False)
        print(f"✅ Detailed results saved: {detailed_file}")
        logging.info(f"✅ Detailed results saved: {detailed_file}")

    # Save comprehensive statistics
    total_evaluations = len(crime_categories)
    actual_data_files = final_verification.get('total_data_files', len(groundtruth_data))
    completion_percentage = final_verification.get('completion_percentage', 0)
    # The confirmation printed below claims 100% coverage, so only a true 100%
    # may set this flag (a 99% threshold would hide one missing file in 100+).
    all_data_processed = completion_percentage >= 100

    final_stats = {
        'execution_summary': {
            'total_ground_truth_files': total_evaluations,
            'actual_data_files': actual_data_files,
            'successful_evaluations': success_counter,
            'failed_evaluations': failure_counter,
            'truncated_files': truncation_counter,
            'success_rate': (success_counter / actual_data_files * 100) if actual_data_files > 0 else 0,
            'total_batches_processed': total_batches,
            'batch_size': BATCH_SIZE,
            'max_workers': MAX_WORKERS,
            'verification_completion_percentage': completion_percentage,
            'all_data_processed': all_data_processed
        },
        'data_loading': data_stats,
        'verification_report': final_verification,
        'timestamp': timestamp,
        'evaluator': 'Claude-Sonnet-4-GroundTruth',
        'primary_model': 'claude-sonnet-4-20250514'
    }

    # Save raw results as JSON
    raw_file = os.path.join(OUTPUT_DIR, f"groundtruth_complete_{timestamp}.json")
    with open(raw_file, 'w', encoding='utf-8') as f:
        json.dump({
            'summary': results,
            'detailed': detailed_results,
            'statistics': final_stats,
            'batch_results': all_batch_results
        }, f, indent=2, ensure_ascii=False)
    print(f"✅ Complete data saved: {raw_file}")
    logging.info(f"✅ Complete data saved: {raw_file}")

    # Print comprehensive summary
    print("\n" + "="*80)
    print("📊 GROUND TRUTH EVALUATION SUMMARY - 100% DATA VERIFICATION")
    print("="*80)
    print(summary_df.to_string(index=False))

    print("\n" + "="*60)
    print("📈 GROUND TRUTH PROCESSING STATISTICS - COMPLETE DATA COVERAGE")
    print("="*60)
    print(f"📦 Total batches processed: {total_batches}")
    print(f"🧵 Concurrent workers used: {MAX_WORKERS}")
    print(f"📊 Batch size: {BATCH_SIZE}")
    print(f"📁 Ground truth files found: {actual_data_files}")
    print(f"✅ Successful evaluations: {success_counter}")
    print(f"❌ Failed evaluations: {failure_counter}")
    print(f"⚠️  Truncated files: {truncation_counter}")
    print(f"🔍 Verification completion: {completion_percentage:.2f}%")

    # DEFINITIVE DATA PROCESSING CONFIRMATION
    if all_data_processed:
        print("\n🏆 CONFIRMATION: ALL AVAILABLE GROUND TRUTH DATA HAS BEEN PROCESSED!")
        print("🔒 GUARANTEE FULFILLED: 100% of ground truth files have been evaluated")
        logging.info("🏆 CONFIRMATION: ALL AVAILABLE GROUND TRUTH DATA HAS BEEN PROCESSED!")
    else:
        remaining_pct = 100 - completion_percentage
        print(f"\n⚠️  {remaining_pct:.2f}% of ground truth data could not be processed despite maximum efforts")
        print("📊 This may be due to corrupted files or API limitations")
        logging.warning(f"⚠️  {remaining_pct:.2f}% of ground truth data could not be processed")

    if actual_data_files > 0:
        success_rate = success_counter / actual_data_files * 100
        print(f"🎯 Success rate: {success_rate:.1f}%")

        if success_rate >= 95:
            print("🏆 EXCELLENT: 95%+ success rate achieved with ground truth evaluation!")
        elif success_rate >= 90:
            print("🥇 VERY GOOD: 90%+ success rate achieved with ground truth evaluation!")
        elif success_rate >= 80:
            print("🥈 GOOD: 80%+ success rate achieved with ground truth evaluation!")
        else:
            print(f"📊 COMPLETED: {success_rate:.1f}% success rate with ground truth evaluation")

    print("\n🎉 Ground Truth Evaluation Complete!")
    print(f"📁 Results saved in: {OUTPUT_DIR}")
    print(f"📝 Log file: {log_file}")

    # Clean up checkpoint if successful
    if success_counter >= actual_data_files * 0.9:  # 90% success rate
        checkpoint_file = os.path.join(OUTPUT_DIR, "checkpoints", "groundtruth_checkpoint.json")
        try:
            if os.path.exists(checkpoint_file):
                os.remove(checkpoint_file)
                print("🧹 Cleanup: Removed checkpoint file")
                logging.info("🧹 Cleanup: Removed checkpoint file")
        except OSError as e:
            # Cleanup is optional; a leftover checkpoint is harmless, but say so.
            logging.warning(f"⚠️  Could not remove checkpoint file: {e}")

if __name__ == "__main__":
    # Entry point: run the whole evaluation and turn an interrupt or an
    # unexpected crash into a short console message plus a log record.
    # NOTE(review): the messages promise that re-running resumes the work, but
    # no checkpoint loading is visible in run_groundtruth_evaluation - confirm.
    try:
        run_groundtruth_evaluation()
    except KeyboardInterrupt:
        print("\n⏸️  Ground truth evaluation interrupted by user")
        print("💾 Progress has been saved - run again to resume")
        logging.info("⏸️  Ground truth evaluation interrupted by user")
    except Exception as exc:
        print(f"\n💥 Unexpected error: {exc}")
        print("💾 Progress has been saved - run again to resume")
        # logging.exception logs at ERROR level and attaches the traceback.
        logging.exception(f"💥 Unexpected error: {exc}")

✅ Claude API key loaded successfully
✅ Claude client initialized
🔍 Testing API connection...
✅ API connection successful
🚀 Starting Ground Truth Forensic Analysis Performance Evaluation
🔒 GUARANTEED 100% GROUND TRUTH DATA PROCESSING - ALL FILES WILL BE EVALUATED
📁 Source Directory: /content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/GroundTruth
💾 Output Directory: /content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/GROUNDTRUTH-EVALUATION-RESULTS
📦 Batch Size: 6
🧵 Max Workers: 6
🎯 Strategy: Concurrent batch processing of ground truth with verification & force completion

📖 LOADING GROUND TRUTH DATA
🔍 Scanning ground truth directory: /content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/GroundTruth
📁 Found 11 ground truth files
    ✅ Loaded: Vandalism
    ✅ Loaded: Fighting
    ✅ Loaded: Assault
    ✅ Loaded: Shoplifting
    ✅ Loaded: Arson
    ✅ Loaded: Explosion
    ✅ Loaded: Robbery
    ✅ Loaded: Abuse
    ✅ Loaded: Stealing
    ✅ Loaded: Shooting
    ✅ Loa

/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/RESULTS/CLAUDE-METRICS