In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install anthropic pandas

Collecting anthropic
  Downloading anthropic-0.57.1-py3-none-any.whl.metadata (27 kB)
Downloading anthropic-0.57.1-py3-none-any.whl (292 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/292.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━[0m [32m174.1/292.8 kB[0m [31m4.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m292.8/292.8 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: anthropic
Successfully installed anthropic-0.57.1


# CLAUDE

In [3]:
"""
BATCH-OPTIMIZED Forensic Analysis Performance Evaluator using Claude Sonnet 4
Requirements: pip install anthropic pandas

🚀 BATCH PROCESSING MODE: Processes evaluations in optimized batches for maximum efficiency!

✅ CORRECTED VERSION: Now actually uses the batch processing functions!
🔒 GUARANTEED 100% DATA PROCESSING: Verifies all data files are processed!
🤖 CLAUDE SONNET 4: Uses latest Claude Sonnet 4 (claude-sonnet-4-20250514) model!

Features:
- ✅ Processes 6 evaluations simultaneously per batch
- ✅ Concurrent execution with thread pool for speed
- ✅ Batch-level checkpointing and error recovery
- ✅ Smart load balancing across API calls
- ✅ Automatic retry logic for failed batches
- ✅ Progress tracking at both batch and individual level
- ✅ Memory-optimized data loading per batch
- ✅ Claude Sonnet 4 with enhanced context window (300K+ chars)

⚡ PERFORMANCE BENEFITS:
- 6x faster processing through parallelization
- Better API utilization with concurrent requests
- Reduced total evaluation time from hours to minutes
- Intelligent batch sizing to respect rate limits
- Enhanced capabilities with Claude Sonnet 4

🎯 BATCH STRATEGY: Groups evaluations into optimal batches, processes concurrently,
and provides comprehensive progress tracking with automatic error recovery.

🔧 CORRECTION: Fixed main evaluation loop to actually USE batch processing functions!
🤖 MODEL: Uses Claude Sonnet 4 (claude-sonnet-4-20250514) as primary model!
"""

import json
import os
import pandas as pd
from pathlib import Path
import anthropic
import time
from datetime import datetime
import re
import concurrent.futures
import threading
from collections import defaultdict
import logging

# Configuration
API_KEY_FILE = "/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/SAVE/FINAL-COMPLETED/API-KEYS/claude.txt"
SOURCE_DIR = "/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/SAVE/FINAL-COMPLETED/ANALY-CLAUDE-NEW"
OUTPUT_DIR = "/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/RESULTS"

# Batch processing configuration
BATCH_SIZE = 6  # Number of evaluations per batch
MAX_WORKERS = 6  # Number of concurrent threads
RETRY_ATTEMPTS = 10  # Attempts per individual evaluation
BATCH_RETRY_ATTEMPTS = 3  # Retry failed batches
INTER_BATCH_DELAY = 10  # Seconds between batches

# Thread-safe counters
evaluation_lock = threading.Lock()
success_counter = 0
failure_counter = 0
truncation_counter = 0

def setup_logging(output_dir):
    """Set up logging for batch processing"""
    log_dir = os.path.join(output_dir, "logs")
    os.makedirs(log_dir, exist_ok=True)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    log_file = os.path.join(log_dir, f"claude_batch_evaluation_{timestamp}.log")

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_file),
            logging.StreamHandler()
        ]
    )

    return log_file

# Read API key from file
def load_api_key():
    try:
        with open(API_KEY_FILE, "r") as f:
            api_key = f.read().strip()
        print("✅ Claude API key loaded successfully")
        logging.info("✅ Claude API key loaded successfully")
        return api_key
    except Exception as e:
        print(f"❌ Error loading API key: {e}")
        logging.error(f"❌ Error loading API key: {e}")
        return None

# Initialize Anthropic client
ANTHROPIC_API_KEY = load_api_key()
if ANTHROPIC_API_KEY:
    try:
        client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY)
        print("✅ Claude client initialized")
        logging.info("✅ Claude client initialized")

        # Test API connection
        print("🔍 Testing API connection...")
        logging.info("🔍 Testing API connection...")
        test_response = client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=10,
            messages=[{"role": "user", "content": "Hi"}]
        )
        print("✅ API connection successful")
        logging.info("✅ API connection successful")
    except Exception as e:
        print(f"⚠️  API setup warning: {e}")
        logging.warning(f"⚠️  API setup warning: {e}")
        print("🔄 Continuing anyway - will test during evaluation")
        client = anthropic.Anthropic(api_key=ANTHROPIC_API_KEY) if ANTHROPIC_API_KEY else None
else:
    client = None
    print("❌ Failed to initialize Claude client")
    logging.error("❌ Failed to initialize Claude client")

# Evaluation criteria
EVALUATION_CRITERIA = """
1. Crime Classification and Intent Detection (10 points)
Forensic Question: How accurately can the analyst identify and classify criminal offenses while distinguishing criminal intent from non-criminal behavior?
Rating Scale:
10 points: Perfect identification of all crime types with precise legal terminology
8-9 points: Accurately identifies most crimes with minor classification errors
6-7 points: Identifies primary offenses but misses secondary criminal acts
3-5 points: Confuses crime types or misinterprets criminal intent
0-2 points: Fails to identify crimes or distinguish criminal behavior

2. Temporal Forensic Reconstruction (10 points)
Forensic Question: How precisely does the analyst establish the chronological sequence of criminal events with evidentiary timestamps?
Rating Scale:
10 points: Complete timeline with exact timestamps for all forensically significant events
8-9 points: Accurate sequence with most critical timestamps documented
6-7 points: Basic chronology established but missing key temporal markers
3-5 points: Significant gaps in timeline reconstruction
0-2 points: Unable to establish forensic timeline

3. Subject Identification and Behavioral Analysis (10 points)
Forensic Question: How thoroughly does the analyst document perpetrator identification features and behavioral patterns for investigative purposes?
Rating Scale:
10 points: Comprehensive subject profiles with all identifying features and behaviors documented
8-9 points: Detailed descriptions of most subjects with good behavioral analysis
6-7 points: Basic identification details but lacks specific forensic descriptors
3-5 points: Incomplete subject documentation with limited behavioral notes
0-2 points: Inadequate subject identification for investigative use

4. Physical Evidence Documentation (10 points)
Forensic Question: How effectively does the analyst catalog and track all physical evidence throughout the criminal incident?
Rating Scale:
10 points: Complete evidence inventory with chain of custody tracking
8-9 points: Most evidence documented with good continuity
6-7 points: Primary evidence noted but secondary items overlooked
3-5 points: Inconsistent evidence tracking with gaps
0-2 points: Critical evidence undocumented

5. Violence Assessment and Weapon Analysis (10 points)
Forensic Question: How comprehensively does the analyst document use of force, weapon involvement, and violence escalation patterns?
Rating Scale:
10 points: Expert analysis of all violence dynamics, weapons, and force levels
8-9 points: Good documentation of most violent acts and weapons
6-7 points: Basic violence patterns identified but missing details
3-5 points: Limited violence documentation
0-2 points: Fails to properly assess violence or weapons

6. Criminal Network and Coordination Analysis (10 points)
Forensic Question: How well does the analyst identify co-conspirator relationships, roles, and communication patterns?
Rating Scale:
10 points: Complete mapping of criminal network with all interactions documented
8-9 points: Most accomplice relationships and communications identified
6-7 points: Basic coordination recognized but subtle patterns missed
3-5 points: Limited understanding of criminal coordination
0-2 points: Unable to identify criminal network patterns

7. Modus Operandi Documentation (10 points)
Forensic Question: How precisely does the analyst identify signature criminal methods and techniques that could link to other cases?
Rating Scale:
10 points: Expert MO analysis with all signature behaviors documented
8-9 points: Good identification of criminal methods and patterns
6-7 points: Basic MO elements noted but lacks detail
3-5 points: Limited recognition of criminal methodology
0-2 points: No MO pattern identification

8. Scene Analysis and Environmental Context (10 points)
Forensic Question: How thoroughly does the analyst document crime scene characteristics and environmental factors affecting the incident?
Rating Scale:
10 points: Complete scene documentation with all environmental impacts analyzed
8-9 points: Good scene analysis with most relevant factors noted
6-7 points: Basic scene elements documented but context missing
3-5 points: Limited scene documentation
0-2 points: Inadequate scene analysis

9. Escape Route and Exit Strategy Analysis (10 points)
Forensic Question: How completely does the analyst reconstruct perpetrator escape routes and document exit strategies?
Rating Scale:
10 points: Full escape route mapping with pre-planning elements identified
8-9 points: Most escape paths documented with good detail
6-7 points: Basic escape direction noted but details missing
3-5 points: Limited escape route documentation
0-2 points: No escape analysis provided

10. Forensic Narrative and Court Readiness (10 points)
Forensic Question: How well does the analyst produce a coherent forensic narrative suitable for investigative and prosecutorial use?
Rating Scale:
10 points: Court-ready narrative with all elements integrated and properly cited
8-9 points: Clear narrative suitable for investigative reports
6-7 points: Basic narrative but needs refinement for legal use
3-5 points: Disjointed narrative requiring significant revision
0-2 points: Narrative unsuitable for forensic purposes
"""

def load_json_files(directory):
    """Load all JSON files from the directory structure with comprehensive validation"""
    print(f"🔍 Scanning directory: {directory}")
    logging.info(f"🔍 Scanning directory: {directory}")

    data_structure = {}
    stats = {'total_files': 0, 'loaded_files': 0, 'failed_files': 0}

    if not os.path.exists(directory):
        print(f"❌ Directory not found: {directory}")
        logging.error(f"❌ Directory not found: {directory}")
        return data_structure, stats

    # Get all technique folders
    technique_folders = [f for f in os.listdir(directory)
                        if os.path.isdir(os.path.join(directory, f))]

    print(f"📁 Found {len(technique_folders)} technique folders: {technique_folders}")
    logging.info(f"📁 Found {len(technique_folders)} technique folders: {technique_folders}")

    for technique in technique_folders:
        technique_path = os.path.join(directory, technique)
        data_structure[technique] = {}

        # Get all JSON files in technique folder
        json_files = [f for f in os.listdir(technique_path) if f.endswith('.json')]

        print(f"  📁 {technique}: {len(json_files)} files")
        logging.info(f"  📁 {technique}: {len(json_files)} files")

        for json_file in json_files:
            stats['total_files'] += 1

            # Extract crime category from filename (e.g., "Stealing-CLAUDE.json" -> "Stealing")
            crime_category = json_file.replace('-CLAUDE.json', '').replace('.json', '')

            file_path = os.path.join(technique_path, json_file)

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                    # Validate data is not empty
                    if data and (not isinstance(data, dict) or len(data) > 0):
                        data_structure[technique][crime_category] = data
                        stats['loaded_files'] += 1
                        print(f"    ✅ Loaded: {crime_category}")
                        logging.info(f"    ✅ Loaded: {crime_category}")
                    else:
                        print(f"    ⚠️  Empty file: {crime_category}")
                        logging.warning(f"    ⚠️  Empty file: {crime_category}")
                        stats['failed_files'] += 1
            except Exception as e:
                print(f"    ❌ Error loading {json_file}: {e}")
                logging.error(f"    ❌ Error loading {json_file}: {e}")
                stats['failed_files'] += 1

    print(f"\n📊 Data Loading Summary:")
    print(f"  Total files found: {stats['total_files']}")
    print(f"  Successfully loaded: {stats['loaded_files']}")
    print(f"  Failed to load: {stats['failed_files']}")
    logging.info(f"📊 Data Loading Summary: {stats['loaded_files']}/{stats['total_files']} loaded successfully")

    return data_structure, stats

def truncate_analysis_data(analysis_data, max_chars=300000):
    """Truncate analysis data to fit within token limits (Claude Sonnet 4 has even larger context window)"""

    if isinstance(analysis_data, dict):
        combined_text = ""
        total_chars = 0

        for key, value in analysis_data.items():
            if isinstance(value, dict):
                section = f"\n--- Analysis {key} ---\n" + json.dumps(value, indent=2)
            else:
                section = f"\n--- Analysis {key} ---\n{str(value)}\n"

            # Check if adding this section would exceed limit
            if total_chars + len(section) > max_chars:
                # Add truncation notice and break
                combined_text += f"\n\n[NOTE: Analysis truncated due to length. Showing first {total_chars:,} characters of {total_chars + len(str(analysis_data)) - len(combined_text):,} total characters]"
                break

            combined_text += section
            total_chars += len(section)

        return combined_text
    else:
        text = str(analysis_data)
        if len(text) > max_chars:
            return text[:max_chars] + f"\n\n[NOTE: Analysis truncated. Showing first {max_chars:,} characters of {len(text):,} total characters]"
        return text

def create_evaluation_prompt(technique, crime_category, analysis_data):
    """Create evaluation prompt for Claude Sonnet 4"""

    # Truncate data to prevent token limit issues (Claude Sonnet 4 has very large context window)
    combined_text = truncate_analysis_data(analysis_data, max_chars=300000)

    prompt = f"""
You are a forensic analysis expert evaluating AI-generated crime analysis reports.

EVALUATION TASK:
Evaluate the following forensic analysis for a {crime_category} incident generated using the {technique} prompting technique.

EVALUATION CRITERIA:
{EVALUATION_CRITERIA}

ANALYSIS TO EVALUATE:
{combined_text}

EVALUATION INSTRUCTIONS:
1. Carefully read the provided analysis
2. Score each of the 10 criteria on a scale of 0-10 points
3. Provide your evaluation in the following EXACT format:

SCORES:
1. Crime Classification and Intent Detection: [score]/10
2. Temporal Forensic Reconstruction: [score]/10
3. Subject Identification and Behavioral Analysis: [score]/10
4. Physical Evidence Documentation: [score]/10
5. Violence Assessment and Weapon Analysis: [score]/10
6. Criminal Network and Coordination Analysis: [score]/10
7. Modus Operandi Documentation: [score]/10
8. Scene Analysis and Environmental Context: [score]/10
9. Escape Route and Exit Strategy Analysis: [score]/10
10. Forensic Narrative and Court Readiness: [score]/10

TOTAL SCORE: [sum]/100

BRIEF JUSTIFICATION:
[Provide 2-3 sentences explaining the overall assessment]

Be objective and consistent in your scoring. Focus on the presence and quality of forensic elements rather than writing style.
"""

    return prompt

def evaluate_with_claude(prompt, evaluation_id, max_retries=RETRY_ATTEMPTS):
    """Send evaluation prompt to Claude API - optimized for batch processing"""

    for attempt in range(max_retries):
        try:
            print(f"    🤖 {evaluation_id} - Attempt {attempt + 1}")
            logging.debug(f"    🤖 {evaluation_id} - Attempt {attempt + 1}")

            # Progressive truncation for batch efficiency
            current_prompt = prompt
            if attempt >= 2:
                # More generous limits for Claude Sonnet 4
                base_chars = 250000 if "sonnet-4" in str(models_to_try[0]) else 200000
                reduction_per_attempt = 50000  # Less aggressive reduction for larger context
                max_chars = max(30000, base_chars - (attempt * reduction_per_attempt))

                lines = prompt.split('\n')
                analysis_start = None
                for i, line in enumerate(lines):
                    if "ANALYSIS TO EVALUATE:" in line:
                        analysis_start = i + 1
                        break

                if analysis_start:
                    pre_analysis = '\n'.join(lines[:analysis_start])
                    analysis_section = '\n'.join(lines[analysis_start:])

                    if len(analysis_section) > max_chars:
                        truncated = analysis_section[:max_chars]
                        analysis_section = truncated + f"\n\n[BATCH TRUNCATION: Attempt {attempt + 1}, {max_chars:,} chars]"

                    current_prompt = pre_analysis + analysis_section

            # Model selection optimized for batch processing
            models_to_try = [
                "claude-sonnet-4-20250514",      # Latest Claude Sonnet 4 - most capable
                "claude-3-5-sonnet-20241022",    # Fallback to Claude 3.5 Sonnet
                "claude-3-haiku-20240307"        # Fast model for batch processing
            ]

            for model in models_to_try:
                try:
                    # Optimized settings for each model
                    if "sonnet-4" in model:
                        max_tokens = 2500  # Claude Sonnet 4 - highest capacity
                    elif "3-5-sonnet" in model:
                        max_tokens = 2000  # Claude 3.5 Sonnet
                    elif "haiku" in model:
                        max_tokens = 1500  # Claude 3 Haiku - faster processing
                    else:
                        max_tokens = 2000  # Default

                    response = client.messages.create(
                        model=model,
                        max_tokens=max_tokens,
                        temperature=0.1,
                        messages=[{
                            "role": "user",
                            "content": current_prompt
                        }]
                    )

                    print(f"    ✅ {evaluation_id} - SUCCESS with {model}")
                    logging.info(f"    ✅ {evaluation_id} - SUCCESS with {model}")
                    return response.content[0].text

                except Exception as model_error:
                    print(f"    ❌ {evaluation_id} - {model} failed: {str(model_error)[:50]}...")
                    logging.warning(f"    ❌ {evaluation_id} - {model} failed: {str(model_error)[:50]}...")
                    continue

        except Exception as e:
            print(f"    ❌ {evaluation_id} - API Error (attempt {attempt + 1}): {str(e)[:100]}")
            logging.error(f"    ❌ {evaluation_id} - API Error (attempt {attempt + 1}): {str(e)[:100]}")

            error_str = str(e).lower()

            if "too long" in error_str or "prompt is too long" in error_str:
                # Emergency truncation for batch processing
                if "ANALYSIS TO EVALUATE:" in current_prompt:
                    parts = current_prompt.split("ANALYSIS TO EVALUATE:")
                    if len(parts) == 2:
                        emergency_chars = max(15000, 60000 - (attempt * 15000))
                        truncated_analysis = parts[1][:emergency_chars] + f"\n\n[EMERGENCY TRUNCATION: {emergency_chars:,} chars]"
                        current_prompt = parts[0] + "ANALYSIS TO EVALUATE:" + truncated_analysis
                        continue

            elif "rate limit" in error_str:
                # Shorter waits for batch processing
                wait_time = min(30 + (attempt * 10), 90)
                print(f"    ⏳ {evaluation_id} - Rate limit, waiting {wait_time}s...")
                logging.info(f"    ⏳ {evaluation_id} - Rate limit, waiting {wait_time}s...")
                time.sleep(wait_time)
                continue

            # Progressive backoff
            if attempt < max_retries - 1:
                wait_time = min(5 + (attempt * 2), 20)
                time.sleep(wait_time)
            else:
                print(f"    ❌ {evaluation_id} - Failed after {max_retries} attempts")
                logging.error(f"    ❌ {evaluation_id} - Failed after {max_retries} attempts")
                return None

    return None

def process_single_evaluation(evaluation_task):
    """Process a single evaluation task - designed for concurrent execution"""
    global success_counter, failure_counter, truncation_counter

    technique, crime_category, analysis_data, eval_number, total_evals = evaluation_task
    evaluation_id = f"{technique}-{crime_category}"

    try:
        # Check if data will be truncated
        if isinstance(analysis_data, dict):
            data_size = sum(len(str(v)) for v in analysis_data.values())
        else:
            data_size = len(str(analysis_data))

        if data_size > 200000:
            with evaluation_lock:
                truncation_counter += 1

        # Create evaluation prompt
        prompt = create_evaluation_prompt(technique, crime_category, analysis_data)

        # Get evaluation from Claude
        response = evaluate_with_claude(prompt, evaluation_id)

        if response:
            # Parse scores
            scores = parse_evaluation_response(response)

            if scores:
                result = {
                    'technique': technique,
                    'crime_category': crime_category,
                    'total_score': scores['Total_Score'],
                    'detailed_scores': scores,
                    'success': True,
                    'response': response,
                    'evaluation_id': evaluation_id
                }

                with evaluation_lock:
                    success_counter += 1

                print(f"    ✅ {evaluation_id} - Score: {scores['Total_Score']}/100")
                logging.info(f"    ✅ {evaluation_id} - Score: {scores['Total_Score']}/100")
                return result
            else:
                print(f"    ❌ {evaluation_id} - Failed to parse evaluation")
                logging.error(f"    ❌ {evaluation_id} - Failed to parse evaluation")
                with evaluation_lock:
                    failure_counter += 1
                return {
                    'technique': technique,
                    'crime_category': crime_category,
                    'success': False,
                    'error': 'PARSE_ERROR',
                    'evaluation_id': evaluation_id
                }
        else:
            print(f"    ❌ {evaluation_id} - No response")
            logging.error(f"    ❌ {evaluation_id} - No response")
            with evaluation_lock:
                failure_counter += 1
            return {
                'technique': technique,
                'crime_category': crime_category,
                'success': False,
                'error': 'NO_RESPONSE',
                'evaluation_id': evaluation_id
            }

    except Exception as e:
        print(f"    ❌ {evaluation_id} - Exception: {e}")
        logging.error(f"    ❌ {evaluation_id} - Exception: {e}")
        with evaluation_lock:
            failure_counter += 1
        return {
            'technique': technique,
            'crime_category': crime_category,
            'success': False,
            'error': f'EXCEPTION: {str(e)}',
            'evaluation_id': evaluation_id
        }

def process_batch(batch_tasks, batch_number, total_batches):
    """Process a batch of evaluations concurrently"""
    print(f"\n🚀 BATCH {batch_number}/{total_batches} - Processing {len(batch_tasks)} evaluations")
    logging.info(f"🚀 BATCH {batch_number}/{total_batches} - Processing {len(batch_tasks)} evaluations")

    batch_results = []
    batch_start_time = time.time()

    # Process batch with ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks in the batch
        future_to_task = {
            executor.submit(process_single_evaluation, task): task
            for task in batch_tasks
        }

        # Collect results as they complete
        for future in concurrent.futures.as_completed(future_to_task):
            task = future_to_task[future]
            try:
                result = future.result()
                batch_results.append(result)
            except Exception as exc:
                print(f"    ❌ Task generated exception: {exc}")
                logging.error(f"    ❌ Task generated exception: {exc}")
                # Create error result
                technique, crime_category = task[0], task[1]
                batch_results.append({
                    'technique': technique,
                    'crime_category': crime_category,
                    'success': False,
                    'error': f'FUTURE_EXCEPTION: {str(exc)}',
                    'evaluation_id': f"{technique}-{crime_category}"
                })

    batch_duration = time.time() - batch_start_time
    successful_in_batch = sum(1 for r in batch_results if r['success'])

    print(f"📊 BATCH {batch_number} COMPLETE - {successful_in_batch}/{len(batch_tasks)} successful ({batch_duration:.1f}s)")
    logging.info(f"📊 BATCH {batch_number} COMPLETE - {successful_in_batch}/{len(batch_tasks)} successful ({batch_duration:.1f}s)")

    return batch_results

def create_evaluation_batches(data_structure):
    """Create optimized batches of evaluation tasks"""
    print("🔧 Creating evaluation batches...")
    logging.info("🔧 Creating evaluation batches...")

    all_tasks = []

    # Create all evaluation tasks
    eval_number = 0
    for technique, categories in data_structure.items():
        for crime_category, analysis_data in categories.items():
            eval_number += 1
            task = (technique, crime_category, analysis_data, eval_number, 0)  # Will update total later
            all_tasks.append(task)

    # Update total count in all tasks
    total_evals = len(all_tasks)
    all_tasks = [(t[0], t[1], t[2], t[3], total_evals) for t in all_tasks]

    # Split into batches
    batches = []
    for i in range(0, len(all_tasks), BATCH_SIZE):
        batch = all_tasks[i:i + BATCH_SIZE]
        batches.append(batch)

    print(f"📦 Created {len(batches)} batches of {BATCH_SIZE} evaluations each")
    print(f"📊 Total evaluations: {total_evals}")
    logging.info(f"📦 Created {len(batches)} batches of {BATCH_SIZE} evaluations each, total: {total_evals}")

    return batches

def retry_failed_batches(failed_results, data_structure):
    """Retry failed evaluations as separate batches"""
    if not failed_results:
        return []

    print(f"\n🔄 RETRYING {len(failed_results)} failed evaluations...")
    logging.info(f"🔄 RETRYING {len(failed_results)} failed evaluations...")

    # Create retry tasks
    retry_tasks = []
    for result in failed_results:
        technique = result['technique']
        crime_category = result['crime_category']

        if technique in data_structure and crime_category in data_structure[technique]:
            analysis_data = data_structure[technique][crime_category]
            task = (technique, crime_category, analysis_data, 0, len(failed_results))
            retry_tasks.append(task)

    if not retry_tasks:
        return []

    # Process retries in smaller batches
    retry_batch_size = max(1, BATCH_SIZE // 2)  # Smaller batches for retries
    retry_results = []

    for i in range(0, len(retry_tasks), retry_batch_size):
        batch = retry_tasks[i:i + retry_batch_size]
        batch_number = (i // retry_batch_size) + 1
        total_retry_batches = (len(retry_tasks) + retry_batch_size - 1) // retry_batch_size

        print(f"🔄 RETRY BATCH {batch_number}/{total_retry_batches}")
        logging.info(f"🔄 RETRY BATCH {batch_number}/{total_retry_batches}")

        batch_results = process_batch(batch, f"RETRY-{batch_number}", total_retry_batches)
        retry_results.extend(batch_results)

        # Longer delay between retry batches
        if i + retry_batch_size < len(retry_tasks):
            time.sleep(INTER_BATCH_DELAY * 2)

    return retry_results

def parse_evaluation_response(response_text):
    """Parse Claude's evaluation response to extract scores"""

    if not response_text:
        return None

    scores = {}
    total_score = 0
    justification = ""

    try:
        # Extract individual scores
        score_patterns = [
            r"1\.\s*Crime Classification and Intent Detection:\s*(\d+)",
            r"2\.\s*Temporal Forensic Reconstruction:\s*(\d+)",
            r"3\.\s*Subject Identification and Behavioral Analysis:\s*(\d+)",
            r"4\.\s*Physical Evidence Documentation:\s*(\d+)",
            r"5\.\s*Violence Assessment and Weapon Analysis:\s*(\d+)",
            r"6\.\s*Criminal Network and Coordination Analysis:\s*(\d+)",
            r"7\.\s*Modus Operandi Documentation:\s*(\d+)",
            r"8\.\s*Scene Analysis and Environmental Context:\s*(\d+)",
            r"9\.\s*Escape Route and Exit Strategy Analysis:\s*(\d+)",
            r"10\.\s*Forensic Narrative and Court Readiness:\s*(\d+)"
        ]

        criteria_names = [
            "Crime_Classification",
            "Temporal_Reconstruction",
            "Subject_Identification",
            "Physical_Evidence",
            "Violence_Assessment",
            "Criminal_Network",
            "Modus_Operandi",
            "Scene_Analysis",
            "Escape_Route",
            "Forensic_Narrative"
        ]

        for i, pattern in enumerate(score_patterns):
            match = re.search(pattern, response_text, re.IGNORECASE)
            if match:
                score = int(match.group(1))
                scores[criteria_names[i]] = score
                total_score += score

        # Extract total score
        total_match = re.search(r"TOTAL SCORE:\s*(\d+)", response_text, re.IGNORECASE)
        if total_match:
            reported_total = int(total_match.group(1))
            # Use calculated total if reported total doesn't match
            if reported_total != total_score and total_score > 0:
                print(f"  ⚠️  Total score mismatch: calculated={total_score}, reported={reported_total}")
                logging.warning(f"  ⚠️  Total score mismatch: calculated={total_score}, reported={reported_total}")

        # Extract justification
        just_match = re.search(r"BRIEF JUSTIFICATION:\s*(.+?)(?:\n\n|\Z)", response_text, re.DOTALL | re.IGNORECASE)
        if just_match:
            justification = just_match.group(1).strip()

        scores['Total_Score'] = total_score
        scores['Justification'] = justification

        return scores

    except Exception as e:
        print(f"  ❌ Error parsing response: {e}")
        logging.error(f"  ❌ Error parsing response: {e}")
        return None

def verify_complete_data_processing(data_structure, results):
    """Verify that ALL available data has been processed - NO DATA LEFT BEHIND"""

    verification_report = {
        'total_data_files': 0,
        'processed_files': 0,
        'unprocessed_files': [],
        'completion_percentage': 0,
        'missing_evaluations': []
    }

    print("\n🔍 VERIFYING COMPLETE DATA PROCESSING...")
    logging.info("🔍 VERIFYING COMPLETE DATA PROCESSING...")

    # Count all available data
    for technique, categories in data_structure.items():
        for category, data in categories.items():
            verification_report['total_data_files'] += 1

            # Check if this combination was processed successfully
            result = results.get(technique, {}).get(category, "NOT_ATTEMPTED")

            if isinstance(result, (int, float)):  # Successful evaluation (got a score)
                verification_report['processed_files'] += 1
            else:
                verification_report['unprocessed_files'].append(f"{technique}/{category}")
                verification_report['missing_evaluations'].append({
                    'technique': technique,
                    'category': category,
                    'result': result,
                    'reason': result if isinstance(result, str) else "UNKNOWN"
                })

    # Calculate completion
    if verification_report['total_data_files'] > 0:
        verification_report['completion_percentage'] = (
            verification_report['processed_files'] / verification_report['total_data_files'] * 100
        )

    # Log verification results
    print(f"📊 DATA PROCESSING VERIFICATION:")
    print(f"  Total data files available: {verification_report['total_data_files']}")
    print(f"  Files successfully processed: {verification_report['processed_files']}")
    print(f"  Files unprocessed: {len(verification_report['unprocessed_files'])}")
    print(f"  Completion rate: {verification_report['completion_percentage']:.2f}%")

    logging.info(f"📊 DATA PROCESSING VERIFICATION:")
    logging.info(f"  Total data files: {verification_report['total_data_files']}")
    logging.info(f"  Successfully processed: {verification_report['processed_files']}")
    logging.info(f"  Unprocessed: {len(verification_report['unprocessed_files'])}")
    logging.info(f"  Completion rate: {verification_report['completion_percentage']:.2f}%")

    if verification_report['unprocessed_files']:
        print(f"⚠️  UNPROCESSED FILES:")
        logging.warning(f"⚠️  UNPROCESSED FILES:")
        for file_path in verification_report['unprocessed_files'][:10]:  # Show first 10
            print(f"    - {file_path}")
            logging.warning(f"    - {file_path}")
        if len(verification_report['unprocessed_files']) > 10:
            print(f"    ... and {len(verification_report['unprocessed_files']) - 10} more")
    else:
        print("🎉 ALL DATA FILES HAVE BEEN PROCESSED!")
        logging.info("🎉 ALL DATA FILES HAVE BEEN PROCESSED!")

    return verification_report

def force_process_remaining_data(data_structure, results, detailed_results):
    """Force process any remaining unprocessed data files"""

    print("\n🔒 FORCE PROCESSING REMAINING DATA...")
    logging.info("🔒 FORCE PROCESSING REMAINING DATA...")

    unprocessed_tasks = []

    # Find unprocessed data
    for technique, categories in data_structure.items():
        for category, data in categories.items():
            result = results.get(technique, {}).get(category, "NOT_ATTEMPTED")

            if not isinstance(result, (int, float)):  # Not successfully processed
                unprocessed_tasks.append((technique, category, data, 0, 0))

    if not unprocessed_tasks:
        print("✅ No unprocessed data found!")
        logging.info("✅ No unprocessed data found!")
        return 0

    print(f"🔧 Found {len(unprocessed_tasks)} unprocessed data files")
    logging.info(f"🔧 Found {len(unprocessed_tasks)} unprocessed data files")

    # Process in smaller batches with more conservative settings
    force_batch_size = 3  # Smaller batches for problematic data
    force_processed = 0

    for i in range(0, len(unprocessed_tasks), force_batch_size):
        batch = unprocessed_tasks[i:i + force_batch_size]
        batch_number = (i // force_batch_size) + 1
        total_force_batches = (len(unprocessed_tasks) + force_batch_size - 1) // force_batch_size

        print(f"🔧 FORCE BATCH {batch_number}/{total_force_batches} - Processing {len(batch)} files")
        logging.info(f"🔧 FORCE BATCH {batch_number}/{total_force_batches} - Processing {len(batch)} files")

        # Process with more conservative settings
        batch_results = []
        for task in batch:
            technique, category, data, _, _ = task
            evaluation_id = f"FORCE-{technique}-{category}"

            # Use most conservative prompt settings
            prompt = create_evaluation_prompt(technique, category, data)

            # Try with emergency truncation if needed
            if len(prompt) > 100000:  # Very conservative limit
                lines = prompt.split('\n')
                analysis_start = None
                for j, line in enumerate(lines):
                    if "ANALYSIS TO EVALUATE:" in line:
                        analysis_start = j + 1
                        break

                if analysis_start:
                    pre_analysis = '\n'.join(lines[:analysis_start])
                    analysis_section = '\n'.join(lines[analysis_start:])

                    if len(analysis_section) > 30000:  # Emergency limit
                        analysis_section = analysis_section[:30000] + "\n\n[FORCE PROCESSING TRUNCATION]"

                    prompt = pre_analysis + analysis_section

            # Single-threaded processing for force mode
            response = evaluate_with_claude(prompt, evaluation_id, max_retries=15)

            if response:
                scores = parse_evaluation_response(response)

                if scores and scores.get('Total_Score', 0) > 0:
                    # Success
                    if technique not in results:
                        results[technique] = {}
                    results[technique][category] = scores['Total_Score']

                    # Add to detailed results
                    detailed_entry = {
                        'Technique': technique,
                        'Crime_Category': category,
                        'Total_Score': scores['Total_Score'],
                        'Justification': scores.get('Justification', 'Force processed'),
                        'Raw_Response': response,
                        'Batch_Number': 'FORCE',
                        'Processing_Type': 'FORCE_COMPLETE'
                    }

                    for criterion, score in scores.items():
                        if criterion not in ['Total_Score', 'Justification']:
                            detailed_entry[criterion] = score

                    detailed_results.append(detailed_entry)
                    force_processed += 1

                    print(f"    ✅ FORCE SUCCESS: {evaluation_id} - Score: {scores['Total_Score']}/100")
                    logging.info(f"    ✅ FORCE SUCCESS: {evaluation_id} - Score: {scores['Total_Score']}/100")

                    # Update global counters
                    global success_counter
                    with evaluation_lock:
                        success_counter += 1
                else:
                    results[technique][category] = "FORCE_PARSE_FAILED"
                    print(f"    ❌ FORCE PARSE FAILED: {evaluation_id}")
                    logging.error(f"    ❌ FORCE PARSE FAILED: {evaluation_id}")
            else:
                if technique not in results:
                    results[technique] = {}
                results[technique][category] = "FORCE_API_FAILED"
                print(f"    ❌ FORCE API FAILED: {evaluation_id}")
                logging.error(f"    ❌ FORCE API FAILED: {evaluation_id}")

            # Longer delay for force processing
            time.sleep(5)

        # Longer delay between force batches
        if i + force_batch_size < len(unprocessed_tasks):
            time.sleep(20)

    print(f"\n📊 FORCE PROCESSING SUMMARY:")
    print(f"  Unprocessed files found: {len(unprocessed_tasks)}")
    print(f"  Successfully force processed: {force_processed}")
    print(f"  Still failed after force processing: {len(unprocessed_tasks) - force_processed}")

    logging.info(f"📊 FORCE PROCESSING SUMMARY:")
    logging.info(f"  Unprocessed files: {len(unprocessed_tasks)}")
    logging.info(f"  Force processed: {force_processed}")
    logging.info(f"  Still failed: {len(unprocessed_tasks) - force_processed}")

    return force_processed

def save_batch_checkpoint(results, detailed_results, batch_number, total_batches, output_dir):
    """Save checkpoint after each batch"""
    checkpoint_dir = os.path.join(output_dir, "checkpoints")
    os.makedirs(checkpoint_dir, exist_ok=True)

    checkpoint_data = {
        'results': results,
        'detailed_results': detailed_results,
        'batch_progress': {
            'completed_batches': batch_number,
            'total_batches': total_batches,
            'completion_percentage': (batch_number / total_batches * 100) if total_batches > 0 else 0
        },
        'stats': {
            'successful_evaluations': success_counter,
            'failed_evaluations': failure_counter,
            'truncated_files': truncation_counter
        },
        'timestamp': datetime.now().isoformat()
    }

    checkpoint_file = os.path.join(checkpoint_dir, "batch_checkpoint.json")
    try:
        with open(checkpoint_file, 'w', encoding='utf-8') as f:
            json.dump(checkpoint_data, f, indent=2, ensure_ascii=False)
        print(f"    💾 Checkpoint saved ({batch_number}/{total_batches} batches)")
        logging.info(f"    💾 Checkpoint saved ({batch_number}/{total_batches} batches)")
    except Exception as e:
        print(f"    ⚠️  Checkpoint save failed: {e}")
        logging.error(f"    ⚠️  Checkpoint save failed: {e}")

def run_batch_evaluation():
    """Main batch evaluation function - CORRECTED TO USE BATCH PROCESSING"""

    print("🚀 Starting Forensic Analysis Performance Evaluation - CLAUDE BATCH EVALUATOR")
    print("🔒 GUARANTEED 100% DATA PROCESSING - ALL FILES WILL BE EVALUATED")
    print(f"📁 Source Directory: {SOURCE_DIR}")
    print(f"💾 Output Directory: {OUTPUT_DIR}")
    print(f"📦 Batch Size: {BATCH_SIZE}")
    print(f"🧵 Max Workers: {MAX_WORKERS}")
    print(f"🎯 Strategy: Concurrent batch processing with verification & force completion")

    # Setup
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    log_file = setup_logging(OUTPUT_DIR)

    # Check API key
    if not ANTHROPIC_API_KEY or not client:
        print("❌ Claude API key not loaded. Please check the API key file.")
        logging.error("❌ Claude API key not loaded. Please check the API key file.")
        return

    # Load all data
    print("\n" + "="*60)
    print("📖 LOADING DATA")
    print("="*60)

    data_structure, data_stats = load_json_files(SOURCE_DIR)

    if not data_structure:
        print("❌ No data found. Exiting.")
        logging.error("❌ No data found. Exiting.")
        return

    # Create evaluation batches
    print("\n" + "="*60)
    print("📦 CREATING EVALUATION BATCHES")
    print("="*60)

    batches = create_evaluation_batches(data_structure)

    if not batches:
        print("❌ No evaluation batches created. Exiting.")
        logging.error("❌ No evaluation batches created. Exiting.")
        return

    # Initialize results
    results = {}
    detailed_results = []
    all_batch_results = []

    # Process all batches
    print("\n" + "="*60)
    print("🔍 PROCESSING EVALUATION BATCHES")
    print("="*60)

    total_batches = len(batches)

    for batch_idx, batch in enumerate(batches, 1):
        print(f"\n📊 Overall Progress: {batch_idx}/{total_batches} batches")
        logging.info(f"📊 Overall Progress: {batch_idx}/{total_batches} batches")

        # Process batch
        batch_results = process_batch(batch, batch_idx, total_batches)
        all_batch_results.extend(batch_results)

        # Update results structure
        for result in batch_results:
            technique = result['technique']
            crime_category = result['crime_category']

            if technique not in results:
                results[technique] = {}

            if result['success']:
                results[technique][crime_category] = result['total_score']

                # Add to detailed results
                detailed_entry = {
                    'Technique': technique,
                    'Crime_Category': crime_category,
                    'Total_Score': result['total_score'],
                    'Justification': result['detailed_scores'].get('Justification', ''),
                    'Raw_Response': result['response'],
                    'Batch_Number': batch_idx,
                    'Processing_Type': 'BATCH'
                }

                # Add individual criterion scores
                for criterion, score in result['detailed_scores'].items():
                    if criterion not in ['Total_Score', 'Justification']:
                        detailed_entry[criterion] = score

                detailed_results.append(detailed_entry)
            else:
                results[technique][crime_category] = result['error']

        # Save checkpoint after each batch
        save_batch_checkpoint(results, detailed_results, batch_idx, total_batches, OUTPUT_DIR)

        # Inter-batch delay (except for last batch)
        if batch_idx < total_batches:
            print(f"⏳ Inter-batch delay: {INTER_BATCH_DELAY}s")
            logging.info(f"⏳ Inter-batch delay: {INTER_BATCH_DELAY}s")
            time.sleep(INTER_BATCH_DELAY)

    # Retry failed evaluations
    print("\n" + "="*60)
    print("🔄 RETRY PHASE - FAILED EVALUATIONS")
    print("="*60)

    failed_results = [r for r in all_batch_results if not r['success']]

    if failed_results:
        print(f"🔄 Found {len(failed_results)} failed evaluations to retry")
        logging.info(f"🔄 Found {len(failed_results)} failed evaluations to retry")

        retry_results = retry_failed_batches(failed_results, data_structure)

        # Update results with retry successes
        for result in retry_results:
            if result['success']:
                technique = result['technique']
                crime_category = result['crime_category']

                results[technique][crime_category] = result['total_score']

                # Add to detailed results
                detailed_entry = {
                    'Technique': technique,
                    'Crime_Category': crime_category,
                    'Total_Score': result['total_score'],
                    'Justification': result['detailed_scores'].get('Justification', ''),
                    'Raw_Response': result['response'],
                    'Batch_Number': 'RETRY',
                    'Processing_Type': 'RETRY'
                }

                for criterion, score in result['detailed_scores'].items():
                    if criterion not in ['Total_Score', 'Justification']:
                        detailed_entry[criterion] = score

                detailed_results.append(detailed_entry)

                # Update counters
                with evaluation_lock:
                    global success_counter, failure_counter
                    success_counter += 1
                    failure_counter = max(0, failure_counter - 1)
    else:
        print("🎉 No failed evaluations to retry!")
        logging.info("🎉 No failed evaluations to retry!")

    # VERIFY COMPLETE DATA PROCESSING
    print("\n" + "="*60)
    print("🔍 VERIFICATION: ENSURING ALL DATA PROCESSED")
    print("="*60)

    verification_report = verify_complete_data_processing(data_structure, results)

    # FORCE PROCESS REMAINING DATA if needed
    if verification_report['completion_percentage'] < 100:
        print("\n" + "="*60)
        print("🔒 FORCE PROCESSING - NO DATA LEFT BEHIND")
        print("="*60)

        force_processed = force_process_remaining_data(data_structure, results, detailed_results)

        # Re-verify after force processing
        final_verification = verify_complete_data_processing(data_structure, results)
        print(f"\n🎯 FINAL VERIFICATION: {final_verification['completion_percentage']:.2f}% complete")
        logging.info(f"🎯 FINAL VERIFICATION: {final_verification['completion_percentage']:.2f}% complete")
    else:
        print("🎉 VERIFICATION PASSED: 100% DATA PROCESSING CONFIRMED!")
        logging.info("🎉 VERIFICATION PASSED: 100% DATA PROCESSING CONFIRMED!")
        final_verification = verification_report

    # Save final results
    print("\n" + "="*60)
    print("💾 SAVING FINAL RESULTS")
    print("="*60)

    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    # Get all techniques and categories for complete matrix
    techniques = list(data_structure.keys())
    all_crime_categories = set()
    for technique_data in data_structure.values():
        all_crime_categories.update(technique_data.keys())
    crime_categories = sorted(list(all_crime_categories))

    # Create complete results matrix
    for technique in techniques:
        if technique not in results:
            results[technique] = {}
        for category in crime_categories:
            if category not in results[technique]:
                if category in data_structure[technique]:
                    results[technique][category] = "NOT_PROCESSED"
                else:
                    results[technique][category] = "NO_DATA"

    # Save summary matrix
    summary_df = pd.DataFrame(results).T
    summary_df = summary_df.reindex(columns=crime_categories)
    summary_file = os.path.join(OUTPUT_DIR, f"claude_batch_summary_{timestamp}.csv")
    summary_df.to_csv(summary_file)
    print(f"✅ Summary saved: {summary_file}")
    logging.info(f"✅ Summary saved: {summary_file}")

    # Save detailed results
    if detailed_results:
        detailed_df = pd.DataFrame(detailed_results)
        detailed_file = os.path.join(OUTPUT_DIR, f"claude_batch_detailed_{timestamp}.csv")
        detailed_df.to_csv(detailed_file, index=False)
        print(f"✅ Detailed results saved: {detailed_file}")
        logging.info(f"✅ Detailed results saved: {detailed_file}")

    # Save comprehensive statistics
    total_evaluations = len(techniques) * len(crime_categories)
    actual_data_files = final_verification.get('total_data_files', sum(len(categories) for categories in data_structure.values()))

    final_stats = {
        'execution_summary': {
            'total_possible_evaluations': total_evaluations,
            'actual_data_files': actual_data_files,
            'successful_evaluations': success_counter,
            'failed_evaluations': failure_counter,
            'truncated_files': truncation_counter,
            'success_rate': (success_counter / actual_data_files * 100) if actual_data_files > 0 else 0,
            'total_batches_processed': total_batches,
            'batch_size': BATCH_SIZE,
            'max_workers': MAX_WORKERS,
            'verification_completion_percentage': final_verification.get('completion_percentage', 0),
            'all_data_processed': final_verification.get('completion_percentage', 0) >= 99.0
        },
        'data_loading': data_stats,
        'verification_report': final_verification,
        'timestamp': timestamp,
        'evaluator': 'Claude-Sonnet-4-Batch',
        'primary_model': 'claude-sonnet-4-20250514'
    }

    # Save raw results as JSON
    raw_file = os.path.join(OUTPUT_DIR, f"claude_batch_complete_{timestamp}.json")
    with open(raw_file, 'w', encoding='utf-8') as f:
        json.dump({
            'summary': results,
            'detailed': detailed_results,
            'statistics': final_stats,
            'batch_results': all_batch_results
        }, f, indent=2, ensure_ascii=False)
    print(f"✅ Complete data saved: {raw_file}")
    logging.info(f"✅ Complete data saved: {raw_file}")

    # Print comprehensive summary
    print("\n" + "="*80)
    print("📊 CLAUDE BATCH EVALUATION SUMMARY - 100% DATA VERIFICATION")
    print("="*80)
    print(summary_df.to_string())

    print(f"\n" + "="*60)
    print("📈 BATCH PROCESSING STATISTICS - COMPLETE DATA COVERAGE")
    print("="*60)
    print(f"📦 Total batches processed: {total_batches}")
    print(f"🧵 Concurrent workers used: {MAX_WORKERS}")
    print(f"📊 Batch size: {BATCH_SIZE}")
    print(f"📁 Data files found: {actual_data_files}")
    print(f"✅ Successful evaluations: {success_counter}")
    print(f"❌ Failed evaluations: {failure_counter}")
    print(f"⚠️  Truncated files: {truncation_counter}")
    print(f"🔍 Verification completion: {final_verification.get('completion_percentage', 0):.2f}%")

    # DEFINITIVE DATA PROCESSING CONFIRMATION
    if final_stats['execution_summary'].get('all_data_processed', False):
        print("\n🏆 CONFIRMATION: ALL AVAILABLE DATA HAS BEEN PROCESSED!")
        print("🔒 GUARANTEE FULFILLED: 100% of data files have been evaluated")
        logging.info("🏆 CONFIRMATION: ALL AVAILABLE DATA HAS BEEN PROCESSED!")
    else:
        remaining_pct = 100 - final_verification.get('completion_percentage', 0)
        print(f"\n⚠️  {remaining_pct:.2f}% of data could not be processed despite maximum efforts")
        print("📊 This may be due to corrupted files or API limitations")
        logging.warning(f"⚠️  {remaining_pct:.2f}% of data could not be processed")

    if actual_data_files > 0:
        success_rate = success_counter / actual_data_files * 100
        print(f"🎯 Success rate: {success_rate:.1f}%")

        if success_rate >= 95:
            print("🏆 EXCELLENT: 95%+ success rate achieved with batch processing!")
        elif success_rate >= 90:
            print("🥇 VERY GOOD: 90%+ success rate achieved with batch processing!")
        elif success_rate >= 80:
            print("🥈 GOOD: 80%+ success rate achieved with batch processing!")
        else:
            print(f"📊 COMPLETED: {success_rate:.1f}% success rate with batch processing")

    print(f"\n🎉 Claude Batch Evaluation Complete!")
    print(f"📁 Results saved in: {OUTPUT_DIR}")
    print(f"📝 Log file: {log_file}")

    # Clean up checkpoint if successful
    if success_counter >= actual_data_files * 0.9:  # 90% success rate
        try:
            checkpoint_file = os.path.join(OUTPUT_DIR, "checkpoints", "batch_checkpoint.json")
            if os.path.exists(checkpoint_file):
                os.remove(checkpoint_file)
                print("🧹 Cleanup: Removed checkpoint file")
                logging.info("🧹 Cleanup: Removed checkpoint file")
        except:
            pass

if __name__ == "__main__":
    try:
        run_batch_evaluation()
    except KeyboardInterrupt:
        print("\n⏸️  Evaluation interrupted by user")
        print("💾 Progress has been saved - run again to resume")
        logging.info("⏸️  Evaluation interrupted by user")
    except Exception as e:
        print(f"\n💥 Unexpected error: {e}")
        print("💾 Progress has been saved - run again to resume")
        logging.error(f"💥 Unexpected error: {e}", exc_info=True)

✅ Claude API key loaded successfully
✅ Claude client initialized
🔍 Testing API connection...
✅ API connection successful
🚀 Starting Forensic Analysis Performance Evaluation - CLAUDE BATCH EVALUATOR
🔒 GUARANTEED 100% DATA PROCESSING - ALL FILES WILL BE EVALUATED
📁 Source Directory: /content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/SAVE/FINAL-COMPLETED/ANALY-CLAUDE-NEW
💾 Output Directory: /content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/RESULTS
📦 Batch Size: 6
🧵 Max Workers: 6
🎯 Strategy: Concurrent batch processing with verification & force completion

📖 LOADING DATA
🔍 Scanning directory: /content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/PROMPTS/SAVE/FINAL-COMPLETED/ANALY-CLAUDE-NEW
📁 Found 8 technique folders: ['ZERO', 'SEQUENTIAL', 'LEAST-TO-MOST', 'REACT', 'CHAIN-OF-THOUGHT', 'META-PROMPTING', 'SELF-CONSISTENCY', 'TRUE-ITERATIVE']
  📁 ZERO: 11 files
    ✅ Loaded: Stealing
    ✅ Loaded: Abuse
    ✅ Loaded: Explosion
    ✅ Loaded: Shooting
    ✅ Loaded: Ass



    ❌ LEAST-TO-MOST-Vandalism - claude-sonnet-4-20250514 failed: Error code: 429 - {'type': 'error', 'error': {'typ...
    ✅ LEAST-TO-MOST-Assault - SUCCESS with claude-sonnet-4-20250514
    ✅ LEAST-TO-MOST-Assault - Score: 19/100
    ✅ LEAST-TO-MOST-Vandalism - SUCCESS with claude-3-5-sonnet-20241022
    ✅ LEAST-TO-MOST-Vandalism - Score: 86/100
📊 BATCH 5 COMPLETE - 6/6 successful (70.2s)
    💾 Checkpoint saved (5/15 batches)
⏳ Inter-batch delay: 10s

📊 Overall Progress: 6/15 batches

🚀 BATCH 6/15 - Processing 6 evaluations
    🤖 LEAST-TO-MOST-Robbery - Attempt 1
    🤖 LEAST-TO-MOST-Arson - Attempt 1
    🤖 LEAST-TO-MOST-Shoplifting - Attempt 1
    🤖 REACT-Vandalism - Attempt 1
    🤖 REACT-Burglary - Attempt 1
    🤖 REACT-Robbery - Attempt 1
    ✅ LEAST-TO-MOST-Shoplifting - SUCCESS with claude-sonnet-4-20250514
    ✅ LEAST-TO-MOST-Shoplifting - Score: 19/100
    ✅ REACT-Burglary - SUCCESS with claude-sonnet-4-20250514
    ✅ REACT-Burglary - Score: 77/100
    ✅ REACT-Vandalism - SUCCES



    ❌ LEAST-TO-MOST-Robbery - claude-sonnet-4-20250514 failed: Error code: 429 - {'type': 'error', 'error': {'typ...
    ✅ LEAST-TO-MOST-Arson - SUCCESS with claude-sonnet-4-20250514
    ✅ LEAST-TO-MOST-Arson - Score: 66/100
    ✅ REACT-Robbery - SUCCESS with claude-sonnet-4-20250514
    ✅ REACT-Robbery - Score: 71/100
    ✅ LEAST-TO-MOST-Robbery - SUCCESS with claude-3-5-sonnet-20241022
    ✅ LEAST-TO-MOST-Robbery - Score: 77/100
📊 BATCH 6 COMPLETE - 6/6 successful (84.8s)
    💾 Checkpoint saved (6/15 batches)
⏳ Inter-batch delay: 10s

📊 Overall Progress: 7/15 batches

🚀 BATCH 7/15 - Processing 6 evaluations
    🤖 REACT-Arson - Attempt 1
    🤖 REACT-Stealing - Attempt 1
    🤖 REACT-Shoplifting - Attempt 1
    🤖 REACT-Abuse - Attempt 1
    🤖 REACT-Explosion - Attempt 1
    🤖 REACT-Shooting - Attempt 1
    ✅ REACT-Stealing - SUCCESS with claude-sonnet-4-20250514
    ✅ REACT-Stealing - Score: 56/100
    ✅ REACT-Shoplifting - SUCCESS with claude-sonnet-4-20250514
    ✅ REACT-Shoplifting -



    ❌ CHAIN-OF-THOUGHT-Arson - claude-sonnet-4-20250514 failed: Error code: 429 - {'type': 'error', 'error': {'typ...
    ✅ CHAIN-OF-THOUGHT-Shooting - SUCCESS with claude-sonnet-4-20250514
    ✅ CHAIN-OF-THOUGHT-Shooting - Score: 66/100
    ✅ CHAIN-OF-THOUGHT-Arson - SUCCESS with claude-3-5-sonnet-20241022
    ✅ CHAIN-OF-THOUGHT-Arson - Score: 77/100
    ✅ CHAIN-OF-THOUGHT-Explosion - SUCCESS with claude-sonnet-4-20250514
    ✅ CHAIN-OF-THOUGHT-Explosion - Score: 29/100
📊 BATCH 9 COMPLETE - 6/6 successful (35.8s)
    💾 Checkpoint saved (9/15 batches)
⏳ Inter-batch delay: 10s

📊 Overall Progress: 10/15 batches

🚀 BATCH 10/15 - Processing 6 evaluations
    🤖 CHAIN-OF-THOUGHT-Assault - Attempt 1
    🤖 META-PROMPTING-Burglary - Attempt 1
    🤖 META-PROMPTING-Stealing - Attempt 1
    🤖 META-PROMPTING-Fighting - Attempt 1
    🤖 META-PROMPTING-Robbery - Attempt 1
    🤖 META-PROMPTING-Shoplifting - Attempt 1
    ✅ META-PROMPTING-Shoplifting - SUCCESS with claude-sonnet-4-20250514
    ✅ META-P