In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
"""
Multi-Model ReAct Ablation Study
Tests ReAct phase importance across Gemini, GPT-4, and Claude
Processes every 90th frame from crime video data
"""

import os
import json
import base64
import requests
import time
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from collections import defaultdict
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive if in Colab
try:
    from google.colab import drive
    if not os.path.exists('/content/drive'):
        drive.mount('/content/drive')
        print("‚úì Google Drive mounted")
except:
    pass

# Set style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (18, 12)

# Configuration
API_KEYS_DIR = "/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/COMPLETED/PROMPTS/SAVE/FINAL-COMPLETED/API-KEYS"
DATA_DIR = "/content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/COMPLETED/PROMPTS/crime-data"
SAVE_DIR = "/content/drive/MyDrive/multi_model_ablation_results"
FRAME_SKIP = 90  # Process every 90th frame
CHUNK_SIZE = 10  # Frames per API call


class MultiModelReActAgent:
    """ReAct Agent that works with multiple models"""

    def __init__(self, model_name: str, api_key: str, phases: List[str], config_name: str):
        """
        Initialize agent for specific model

        Args:
            model_name: 'gemini', 'gpt', or 'claude'
            api_key: API key for the model
            phases: Enabled ReAct phases ['thought', 'decision', 'observation']
            config_name: Configuration name (e.g., 'Full ReAct')
        """
        self.model_name = model_name
        self.api_key = api_key
        self.phases = phases
        self.config_name = config_name
        self.history = []

        # Model-specific setup
        if model_name == 'gemini':
            self.model_id = "gemini-2.0-flash-exp"
            self.base_url = f"https://generativelanguage.googleapis.com/v1beta/models/{self.model_id}:generateContent"
        elif model_name == 'gpt':
            self.model_id = "gpt-4o"
            self.base_url = "https://api.openai.com/v1/chat/completions"
        elif model_name == 'claude':
            self.model_id = "claude-sonnet-4-20250514"
            self.base_url = "https://api.anthropic.com/v1/messages"

        self.prompt_template = self._create_prompt_template()

    def _create_prompt_template(self) -> str:
        """Create phase-specific prompt template"""
        base_intro = """Analyze these frames from a crime surveillance video. """

        phase_instructions = []

        if 'thought' in self.phases:
            phase_instructions.append("""
1. THOUGHT/REASONING: Carefully analyze what you observe:
   - Identify people, their appearances, and positions
   - Note actions, behaviors, and movements
   - Observe objects, items, and spatial relationships
   - Consider temporal sequence of events
   - Reason about what these observations might indicate
""")

        if 'decision' in self.phases:
            phase_instructions.append("""
2. DECISION/ACTION: Based on your reasoning, decide:
   - What specific aspects require closer analysis?
   - What actions or focus areas are most important?
   - What should be prioritized in the investigation?
   - Rate severity/priority level (HIGH/MEDIUM/LOW)
""")

        if 'observation' in self.phases:
            phase_instructions.append("""
3. OBSERVATION/FEEDBACK: Provide detailed observations:
   - Document specific evidence you notice
   - Note confirming or contradicting details
   - Describe environmental context
   - Record temporal progression of events
""")

        conclusion = """
FINAL ANALYSIS:
- What crime or incident appears to be occurring?
- Who are the individuals involved (describe without names)?
- What evidence supports your conclusion?
- Confidence level (HIGH/MEDIUM/LOW)

Analyzing frames {frame_range} of {total_frames}.
"""

        return base_intro + "".join(phase_instructions) + conclusion

    def process_frames(self, frames_data: List[str], frame_range: str, total_frames: int) -> Dict:
        """Process frames with the model"""
        prompt = self.prompt_template.format(
            frame_range=frame_range,
            total_frames=total_frames
        )

        try:
            if self.model_name == 'gemini':
                return self._process_gemini(frames_data, prompt)
            elif self.model_name == 'gpt':
                return self._process_gpt(frames_data, prompt)
            elif self.model_name == 'claude':
                return self._process_claude(frames_data, prompt)
        except Exception as e:
            return {"error": str(e), "analysis": None}

    def _process_gemini(self, frames_data: List[str], prompt: str) -> Dict:
        """Process with Gemini"""
        parts = [{"text": prompt}]

        for frame in frames_data:
            parts.append({
                "inline_data": {
                    "mime_type": "image/png",
                    "data": frame
                }
            })

        payload = {
            "contents": [{"parts": parts}],
            "generationConfig": {
                "temperature": 0.1,
                "maxOutputTokens": 4096,
                "topP": 0.8,
                "topK": 10
            }
        }

        url = f"{self.base_url}?key={self.api_key}"
        response = requests.post(url, headers={"Content-Type": "application/json"}, json=payload)

        if response.status_code != 200:
            return {"error": f"API Error {response.status_code}", "analysis": None}

        result = response.json()
        if "candidates" in result and result["candidates"]:
            if "content" in result["candidates"][0]:
                analysis = result["candidates"][0]["content"]["parts"][0]["text"]
                return {"analysis": analysis, "phases_used": self.phases}

        return {"error": "No valid response", "analysis": None}

    def _process_gpt(self, frames_data: List[str], prompt: str) -> Dict:
        """Process with GPT-4"""
        messages = [
            {
                "role": "system",
                "content": f"You are an expert crime analyst using {self.config_name} approach. Enabled phases: {', '.join(self.phases).upper()}."
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt}
                ]
            }
        ]

        # Add images
        for frame in frames_data:
            messages[1]["content"].append({
                "type": "image_url",
                "image_url": {
                    "url": f"data:image/png;base64,{frame}",
                    "detail": "high"
                }
            })

        payload = {
            "model": self.model_id,
            "messages": messages,
            "max_tokens": 4096,
            "temperature": 0.1
        }

        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {self.api_key}"
        }

        response = requests.post(self.base_url, headers=headers, json=payload)

        if response.status_code != 200:
            return {"error": f"API Error {response.status_code}", "analysis": None}

        result = response.json()
        if "choices" in result and result["choices"]:
            analysis = result["choices"][0]["message"]["content"]
            return {"analysis": analysis, "phases_used": self.phases}

        return {"error": "No valid response", "analysis": None}

    def _process_claude(self, frames_data: List[str], prompt: str) -> Dict:
        """Process with Claude"""
        content = [{"type": "text", "text": prompt}]

        # Add images
        for frame in frames_data:
            content.append({
                "type": "image",
                "source": {
                    "type": "base64",
                    "media_type": "image/png",
                    "data": frame
                }
            })

        payload = {
            "model": self.model_id,
            "max_tokens": 4096,
            "temperature": 0.1,
            "system": f"You are an expert crime analyst using {self.config_name} approach. Enabled phases: {', '.join(self.phases).upper()}.",
            "messages": [
                {
                    "role": "user",
                    "content": content
                }
            ]
        }

        headers = {
            "Content-Type": "application/json",
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01"
        }

        response = requests.post(self.base_url, headers=headers, json=payload)

        if response.status_code != 200:
            return {"error": f"API Error {response.status_code}", "analysis": None}

        result = response.json()
        if "content" in result and result["content"]:
            analysis = result["content"][0]["text"]
            return {"analysis": analysis, "phases_used": self.phases}

        return {"error": "No valid response", "analysis": None}


class CrimeDataLoader:
    """Load crime video frames"""

    def __init__(self, data_path: str, frame_skip: int = 90):
        self.data_path = Path(data_path)
        self.frame_skip = frame_skip

    def discover_all_videos(self) -> Dict[str, Dict]:
        """Discover all videos"""
        print(f"\n=== DISCOVERING VIDEOS ===")
        print(f"Scanning: {self.data_path}")

        all_videos = {}

        try:
            crime_types = [d for d in os.listdir(self.data_path)
                          if os.path.isdir(os.path.join(self.data_path, d))]

            print(f"Found {len(crime_types)} crime types: {crime_types}")

            for crime_type in crime_types:
                crime_dir = os.path.join(self.data_path, crime_type)
                all_files = os.listdir(crime_dir)

                video_groups = defaultdict(list)

                for filename in all_files:
                    if not any(filename.lower().endswith(ext)
                             for ext in ['.png', '.jpg', '.jpeg', '.bmp']):
                        continue

                    video_id = self._extract_video_id(filename)
                    if video_id:
                        video_groups[video_id].append(filename)

                print(f"  {crime_type}: {len(video_groups)} videos")

                for video_id, frames in video_groups.items():
                    all_videos[f"{crime_type}_{video_id}"] = {
                        'crime_type': crime_type,
                        'video_id': video_id,
                        'frames': sorted(frames, key=self._extract_frame_number),
                        'crime_dir': crime_dir
                    }

        except Exception as e:
            print(f"Error: {str(e)}")

        print(f"Total videos: {len(all_videos)}")
        return all_videos

    def _extract_video_id(self, filename: str) -> str:
        """Extract video ID from filename"""
        import re
        name_without_ext = os.path.splitext(filename)[0]

        if '_frame_' in name_without_ext:
            return name_without_ext.split('_frame_')[0]

        parts = name_without_ext.split('_')
        if len(parts) >= 2:
            try:
                int(parts[-1])
                return '_'.join(parts[:-1])
            except ValueError:
                pass

        video_id = re.sub(r'_?\d+$', '', name_without_ext)
        if video_id and video_id != name_without_ext:
            return video_id

        return name_without_ext

    def _extract_frame_number(self, filename: str) -> int:
        """Extract frame number for sorting"""
        import re
        try:
            if '_frame_' in filename:
                parts = filename.split('_frame_')
                if len(parts) > 1:
                    return int(parts[1].split('.')[0])
            numbers = re.findall(r'\d+', filename)
            if numbers:
                return int(numbers[-1])
        except:
            pass
        return 0

    def load_video_frames(self, video_info: Dict) -> List[str]:
        """Load frames (every 90th frame)"""
        crime_dir = video_info['crime_dir']
        all_frames = video_info['frames']

        selected_frames = all_frames[::self.frame_skip]

        frames_data = []
        for frame_file in selected_frames:
            frame_path = os.path.join(crime_dir, frame_file)
            try:
                with open(frame_path, 'rb') as f:
                    frame_data = base64.b64encode(f.read()).decode('utf-8')
                    frames_data.append(frame_data)
            except Exception as e:
                print(f"  Error loading {frame_file}: {str(e)}")

        return frames_data


class MultiModelAblationStudy:
    """Conduct ablation study across multiple models"""

    def __init__(self, api_keys: Dict[str, str], data_path: str, frame_skip: int = 90):
        self.api_keys = api_keys
        self.data_loader = CrimeDataLoader(data_path, frame_skip)
        self.frame_skip = frame_skip
        self.chunk_size = CHUNK_SIZE

        # Create agents for each model and configuration
        self.models = {}
        for model_name, api_key in api_keys.items():
            self.models[model_name] = {
                'Full ReAct': MultiModelReActAgent(
                    model_name, api_key,
                    ['thought', 'decision', 'observation'],
                    'Full ReAct'
                ),
                'No Decision': MultiModelReActAgent(
                    model_name, api_key,
                    ['thought', 'observation'],
                    'No Decision'
                ),
                'No Observation': MultiModelReActAgent(
                    model_name, api_key,
                    ['thought', 'decision'],
                    'No Observation'
                ),
                'Only Thought': MultiModelReActAgent(
                    model_name, api_key,
                    ['thought'],
                    'Only Thought'
                )
            }

        self.results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
        os.makedirs(SAVE_DIR, exist_ok=True)
        print(f"Results will be saved to: {SAVE_DIR}")

    def run_study(self, max_videos: int = None) -> Dict:
        """Run ablation study across all models"""
        print("\n" + "="*80)
        print("MULTI-MODEL REACT ABLATION STUDY")
        print(f"Testing {len(self.models)} models √ó 4 configurations")
        print(f"Processing every {self.frame_skip}th frame")
        print("="*80 + "\n")

        # Discover videos
        all_videos = self.data_loader.discover_all_videos()

        if not all_videos:
            print("No videos found!")
            return {}

        videos_to_process = list(all_videos.items())
        if max_videos:
            videos_to_process = videos_to_process[:max_videos]
            print(f"‚ö† Processing {max_videos} videos for testing\n")

        # Process each video with each model and configuration
        for video_key, video_info in tqdm(videos_to_process, desc="Videos"):
            print(f"\n--- {video_key} ---")
            print(f"Crime type: {video_info['crime_type']}")

            # Load frames
            frames_data = self.data_loader.load_video_frames(video_info)

            if not frames_data:
                print("  No frames loaded")
                continue

            print(f"  Loaded {len(frames_data)} frames")

            # Split into chunks
            frame_chunks = [frames_data[i:i+self.chunk_size]
                          for i in range(0, len(frames_data), self.chunk_size)]

            # Test each model
            for model_name in self.models.keys():
                print(f"\n  Model: {model_name.upper()}")

                # Test each configuration
                for config_name, agent in self.models[model_name].items():
                    print(f"    Config: {config_name}", end=" ")

                    chunk_analyses = []
                    processing_times = []

                    for chunk_idx, chunk in enumerate(frame_chunks):
                        frame_start = chunk_idx * self.chunk_size + 1
                        frame_end = min((chunk_idx + 1) * self.chunk_size, len(frames_data))
                        frame_range = f"{frame_start}-{frame_end}"

                        start_time = time.time()
                        result = agent.process_frames(chunk, frame_range, len(frames_data))
                        processing_time = time.time() - start_time
                        processing_times.append(processing_time)

                        if result.get('analysis'):
                            chunk_analyses.append(result['analysis'])
                        elif result.get('error'):
                            print(f"‚ö†", end=" ")

                        time.sleep(2)  # Rate limiting

                    # Extract prediction
                    prediction = self._extract_prediction(chunk_analyses)
                    ground_truth = self._get_ground_truth(video_info['crime_type'])

                    # Store results
                    self.results[video_key][model_name][config_name] = {
                        'prediction': prediction,
                        'ground_truth': ground_truth,
                        'chunk_analyses': chunk_analyses,
                        'num_chunks': len(frame_chunks),
                        'avg_processing_time': np.mean(processing_times) if processing_times else 0,
                        'phases_used': agent.phases
                    }

                    match = "‚úì" if prediction == ground_truth else "‚úó"
                    print(f"‚Üí {match} {prediction}")

        print(f"\n‚úì Processed {len(self.results)} videos")
        return dict(self.results)

    def _extract_prediction(self, analyses: List[str]) -> str:
        """Extract crime prediction from analyses"""
        if not analyses:
            return 'uncertain'

        full_text = " ".join(analyses).lower()

        crime_keywords = ['crime', 'theft', 'assault', 'robbery', 'suspicious',
                         'incident', 'illegal', 'criminal', 'violence', 'shoplifting']

        crime_count = sum(1 for keyword in crime_keywords if keyword in full_text)
        high_conf = any(word in full_text for word in ['high confidence', 'clearly', 'definitely'])

        if crime_count >= 3 or (crime_count >= 2 and high_conf):
            return 'crime'
        elif 'normal' in full_text or 'no crime' in full_text:
            return 'normal'
        return 'uncertain'

    def _get_ground_truth(self, crime_type: str) -> str:
        """Get ground truth label"""
        normal_categories = ['normal', 'regular', 'ordinary', 'safe', 'benign']
        if any(cat in crime_type.lower() for cat in normal_categories):
            return 'normal'
        return 'crime'

    def _calculate_metrics(self, predictions: List[str], ground_truths: List[str]) -> Dict:
        """Calculate performance metrics"""
        uncertain_count = sum(1 for p in predictions if p == 'uncertain')

        valid_pairs = [(p, g) for p, g in zip(predictions, ground_truths)
                      if p in ['crime', 'normal']]

        if not valid_pairs:
            return {
                'accuracy': 0.0, 'precision': 0.0, 'recall': 0.0, 'f1': 0.0,
                'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0,
                'uncertain_count': uncertain_count
            }

        predictions, ground_truths = zip(*valid_pairs)

        tp = sum(1 for p, g in zip(predictions, ground_truths) if p == 'crime' and g == 'crime')
        fp = sum(1 for p, g in zip(predictions, ground_truths) if p == 'crime' and g == 'normal')
        tn = sum(1 for p, g in zip(predictions, ground_truths) if p == 'normal' and g == 'normal')
        fn = sum(1 for p, g in zip(predictions, ground_truths) if p == 'normal' and g == 'crime')

        total = tp + fp + tn + fn
        accuracy = (tp + tn) / total if total > 0 else 0
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        return {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': f1,
            'tp': tp, 'fp': fp, 'tn': tn, 'fn': fn,
            'uncertain_count': uncertain_count
        }

    def generate_report(self):
        """Generate comprehensive multi-model report"""
        print("\n" + "="*80)
        print("GENERATING MULTI-MODEL ABLATION REPORT")
        print("="*80 + "\n")

        # Calculate metrics for each model and configuration
        all_metrics = {}

        for model_name in self.models.keys():
            all_metrics[model_name] = {}

            for config_name in ['Full ReAct', 'No Decision', 'No Observation', 'Only Thought']:
                predictions = []
                ground_truths = []
                processing_times = []

                for video_key, models_results in self.results.items():
                    if model_name in models_results and config_name in models_results[model_name]:
                        result = models_results[model_name][config_name]
                        predictions.append(result['prediction'])
                        ground_truths.append(result['ground_truth'])
                        processing_times.append(result['avg_processing_time'])

                metrics = self._calculate_metrics(predictions, ground_truths)
                metrics['avg_processing_time'] = np.mean(processing_times) if processing_times else 0
                all_metrics[model_name][config_name] = metrics

        # Create summary table
        self._create_summary_table(all_metrics)

        # Create visualizations
        self._create_visualizations(all_metrics)

        # Create detailed analysis
        self._create_detailed_analysis(all_metrics)

        # Save raw results
        results_file = os.path.join(SAVE_DIR, "raw_results_all_models.json")
        with open(results_file, 'w') as f:
            json.dump(dict(self.results), f, indent=2)

        print(f"\n‚úì Report saved to: {SAVE_DIR}")

    def _create_summary_table(self, all_metrics: Dict):
        """Create summary table"""
        rows = []

        for model_name, configs in all_metrics.items():
            for config_name, metrics in configs.items():
                rows.append({
                    'Model': model_name.upper(),
                    'Configuration': config_name,
                    'Accuracy': f"{metrics['accuracy']:.4f}",
                    'Precision': f"{metrics['precision']:.4f}",
                    'Recall': f"{metrics['recall']:.4f}",
                    'F1': f"{metrics['f1']:.4f}",
                    'Avg Time (s)': f"{metrics['avg_processing_time']:.2f}",
                    'Uncertain': metrics['uncertain_count']
                })

        df = pd.DataFrame(rows)
        summary_path = os.path.join(SAVE_DIR, "summary_all_models.csv")
        df.to_csv(summary_path, index=False)

        print(f"‚úì Summary table: {summary_path}")
        print("\nSummary:")
        print(df.to_string(index=False))

    def _create_visualizations(self, all_metrics: Dict):
        """Create comparison visualizations"""
        fig, axes = plt.subplots(3, 3, figsize=(20, 16))
        fig.suptitle(f'Multi-Model ReAct Ablation Study\nEvery {self.frame_skip}th Frame',
                    fontsize=18, fontweight='bold')

        models = list(all_metrics.keys())
        configs = ['Full ReAct', 'No Decision', 'No Observation', 'Only Thought']
        colors = {'gemini': '#4285F4', 'gpt': '#10A37F', 'claude': '#6B46C1'}

        # Row 1: F1 Score Comparison
        for idx, model in enumerate(models):
            ax = axes[0, idx]
            f1_scores = [all_metrics[model][c]['f1'] for c in configs]
            bars = ax.bar(range(len(configs)), f1_scores, color=colors.get(model, '#999'),
                         alpha=0.8, edgecolor='black')
            ax.set_title(f'{model.upper()} - F1 Score', fontsize=14, fontweight='bold')
            ax.set_xticks(range(len(configs)))
            ax.set_xticklabels(configs, rotation=45, ha='right', fontsize=9)
            ax.set_ylim([0, 1.0])
            ax.set_ylabel('F1 Score', fontweight='bold')
            ax.grid(axis='y', alpha=0.3)

            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height,
                       f'{height:.3f}', ha='center', va='bottom', fontsize=9)

        # Row 2: Processing Time Comparison
        for idx, model in enumerate(models):
            ax = axes[1, idx]
            times = [all_metrics[model][c]['avg_processing_time'] for c in configs]
            bars = ax.bar(range(len(configs)), times, color=colors.get(model, '#999'),
                         alpha=0.8, edgecolor='black')
            ax.set_title(f'{model.upper()} - Processing Time', fontsize=14, fontweight='bold')
            ax.set_xticks(range(len(configs)))
            ax.set_xticklabels(configs, rotation=45, ha='right', fontsize=9)
            ax.set_ylabel('Time (seconds)', fontweight='bold')
            ax.grid(axis='y', alpha=0.3)

            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height,
                       f'{height:.1f}s', ha='center', va='bottom', fontsize=9)

        # Row 3: Phase Impact Comparison
        for idx, model in enumerate(models):
            ax = axes[2, idx]
            baseline_f1 = all_metrics[model]['Full ReAct']['f1']
            impacts = {
                'Decision': baseline_f1 - all_metrics[model]['No Decision']['f1'],
                'Observation': baseline_f1 - all_metrics[model]['No Observation']['f1'],
                'Both': baseline_f1 - all_metrics[model]['Only Thought']['f1']
            }

            phases = list(impacts.keys())
            values = list(impacts.values())
            bars = ax.bar(phases, values, color=['#e74c3c', '#3498db', '#9b59b6'],
                         alpha=0.8, edgecolor='black')
            ax.set_title(f'{model.upper()} - Phase Impact', fontsize=14, fontweight='bold')
            ax.set_ylabel('F1 Impact (drop)', fontweight='bold')
            ax.axhline(y=0, color='black', linestyle='--', linewidth=1)
            ax.grid(axis='y', alpha=0.3)

            for bar in bars:
                height = bar.get_height()
                ax.text(bar.get_x() + bar.get_width()/2., height,
                       f'{height:.3f}', ha='center',
                       va='bottom' if height > 0 else 'top', fontsize=10)

        plt.tight_layout()
        viz_path = os.path.join(SAVE_DIR, "multi_model_comparison.png")
        plt.savefig(viz_path, dpi=300, bbox_inches='tight')
        plt.close()

        print(f"‚úì Visualizations: {viz_path}")

    def _create_detailed_analysis(self, all_metrics: Dict):
        """Create detailed text analysis"""
        report_path = os.path.join(SAVE_DIR, "detailed_analysis_all_models.txt")

        with open(report_path, 'w') as f:
            f.write("="*80 + "\n")
            f.write("MULTI-MODEL REACT ABLATION STUDY - DETAILED ANALYSIS\n")
            f.write(f"Frame Sampling: Every {self.frame_skip}th frame\n")
            f.write("="*80 + "\n\n")

            f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n")
            f.write(f"Models Tested: {', '.join([m.upper() for m in all_metrics.keys()])}\n")
            f.write(f"Videos Analyzed: {len(self.results)}\n\n")

            # Overall findings
            f.write("-"*80 + "\n")
            f.write("OVERALL FINDINGS\n")
            f.write("-"*80 + "\n\n")

            # Find best model and configuration
            best_f1 = 0
            best_model = None
            best_config = None

            for model_name, configs in all_metrics.items():
                for config_name, metrics in configs.items():
                    if metrics['f1'] > best_f1:
                        best_f1 = metrics['f1']
                        best_model = model_name
                        best_config = config_name

            f.write(f"üèÜ BEST OVERALL: {best_model.upper()} - {best_config}\n")
            f.write(f"   F1 Score: {best_f1:.4f}\n\n")

            # Per-model analysis
            f.write("-"*80 + "\n")
            f.write("PER-MODEL ANALYSIS\n")
            f.write("-"*80 + "\n\n")

            for model_name, configs in all_metrics.items():
                f.write(f"\n{model_name.upper()}:\n")
                f.write("-" * 40 + "\n")

                baseline_f1 = configs['Full ReAct']['f1']
                decision_impact = baseline_f1 - configs['No Decision']['f1']
                observation_impact = baseline_f1 - configs['No Observation']['f1']

                f.write(f"  Best Configuration: ")
                best_config_f1 = max(configs.items(), key=lambda x: x[1]['f1'])
                f.write(f"{best_config_f1[0]} (F1={best_config_f1[1]['f1']:.4f})\n\n")

                f.write(f"  Phase Importance:\n")
                f.write(f"    Decision Impact:    {decision_impact:+.4f}\n")
                f.write(f"    Observation Impact: {observation_impact:+.4f}\n\n")

                f.write(f"  Processing Efficiency:\n")
                for config_name, metrics in sorted(configs.items(),
                                                   key=lambda x: x[1]['avg_processing_time']):
                    f.write(f"    {config_name:20s}: {metrics['avg_processing_time']:.2f}s\n")

                f.write("\n")

            # Comparative insights
            f.write("-"*80 + "\n")
            f.write("COMPARATIVE INSIGHTS\n")
            f.write("-"*80 + "\n\n")

            f.write("Which model benefits most from each phase?\n\n")

            for phase in ['Decision', 'Observation']:
                impacts = {}
                for model_name, configs in all_metrics.items():
                    baseline = configs['Full ReAct']['f1']
                    if phase == 'Decision':
                        ablated = configs['No Decision']['f1']
                    else:
                        ablated = configs['No Observation']['f1']
                    impacts[model_name] = baseline - ablated

                most_dependent = max(impacts.items(), key=lambda x: abs(x[1]))
                f.write(f"{phase} Phase:\n")
                f.write(f"  Most dependent: {most_dependent[0].upper()} (impact: {most_dependent[1]:+.4f})\n")
                f.write(f"  All impacts: {', '.join([f'{m.upper()}={v:+.3f}' for m, v in impacts.items()])}\n\n")

            f.write("\n" + "="*80 + "\n")
            f.write("END OF REPORT\n")
            f.write("="*80 + "\n")

        print(f"‚úì Detailed analysis: {report_path}")


def load_api_keys(keys_dir: str) -> Dict[str, str]:
    """Load API keys for all models"""
    print("\n=== LOADING API KEYS ===")

    api_keys = {}
    key_files = {
        'gemini': 'Gemini.txt',
        'gpt': 'chatgpt.txt',
        'claude': 'claude.txt'
    }

    for model_name, filename in key_files.items():
        filepath = os.path.join(keys_dir, filename)
        try:
            with open(filepath, 'r') as f:
                key = f.read().strip()
                if key:
                    api_keys[model_name] = key
                    print(f"‚úì {model_name.upper()}: Loaded")
                else:
                    print(f"‚úó {model_name.upper()}: Empty file")
        except Exception as e:
            print(f"‚úó {model_name.upper()}: {str(e)}")

    print(f"\nLoaded {len(api_keys)}/{len(key_files)} API keys\n")
    return api_keys


def main():
    """Main execution"""
    print("\n" + "‚ïî" + "="*78 + "‚ïó")
    print("‚ïë" + " "*15 + "MULTI-MODEL REACT ABLATION STUDY" + " "*31 + "‚ïë")
    print("‚ïë" + " "*10 + "Testing Gemini, GPT-4, and Claude with Every 90th Frame" + " "*13 + "‚ïë")
    print("‚ïö" + "="*78 + "‚ïù\n")

    # Load API keys
    api_keys = load_api_keys(API_KEYS_DIR)

    if not api_keys:
        print("‚úó No API keys loaded. Exiting.")
        return {}, None

    # Verify data directory
    if not os.path.exists(DATA_DIR):
        print(f"‚úó Data directory not found: {DATA_DIR}")
        return {}, None

    print(f"‚úì Data directory: {DATA_DIR}")

    # Initialize study
    study = MultiModelAblationStudy(
        api_keys=api_keys,
        data_path=DATA_DIR,
        frame_skip=FRAME_SKIP
    )

    # Run study
    print(f"\nüöÄ Starting multi-model ablation study...")
    results = study.run_study(max_videos=5)  # Set to None for all videos

    if not results:
        print("\n‚ö† No results generated")
        return {}, None

    # Generate report
    study.generate_report()

    print("\n" + "="*80)
    print("‚úì MULTI-MODEL ABLATION STUDY COMPLETED")
    print("="*80)
    print(f"\nüìÅ Results: {SAVE_DIR}/")
    print("\nGenerated files:")
    print("  ‚Ä¢ summary_all_models.csv - Metrics for all models")
    print("  ‚Ä¢ multi_model_comparison.png - Visual comparison")
    print("  ‚Ä¢ detailed_analysis_all_models.txt - Complete analysis")
    print("  ‚Ä¢ raw_results_all_models.json - Raw data")
    print("\n" + "="*80 + "\n")

    return results, study


if __name__ == "__main__":
    main()


‚ïë               MULTI-MODEL REACT ABLATION STUDY                               ‚ïë
‚ïë          Testing Gemini, GPT-4, and Claude with Every 90th Frame             ‚ïë


=== LOADING API KEYS ===
‚úì GEMINI: Loaded
‚úì GPT: Loaded
‚úì CLAUDE: Loaded

Loaded 3/3 API keys

‚úì Data directory: /content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/COMPLETED/PROMPTS/crime-data
Results will be saved to: /content/drive/MyDrive/multi_model_ablation_results

üöÄ Starting multi-model ablation study...

MULTI-MODEL REACT ABLATION STUDY
Testing 3 models √ó 4 configurations
Processing every 90th frame


=== DISCOVERING VIDEOS ===
Scanning: /content/drive/Shareddrives/DR KOFI RESEARCH/RESEARCH/COMPLETED/PROMPTS/crime-data
Found 11 crime types: ['Shoplifting', 'Fighting', 'Shooting', 'Stealing', 'Explosion', 'Arson', 'Vandalism', 'Abuse', 'Robbery', 'Burglary', 'Assault']
  Shoplifting: 2 videos
  Fighting: 2 videos
  Shooting: 2 videos
  Stealing: 2 videos
  Explosion: 2 videos
  Arson: 2 videos


Videos:   0%|          | 0/5 [00:00<?, ?it/s]


--- Shoplifting_Shoplifting003_x264 ---
Crime type: Shoplifting
  Loaded 11 frames

  Model: GEMINI
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úì crime

  Model: GPT
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úì crime

  Model: CLAUDE
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought 

Videos:  20%|‚ñà‚ñà        | 1/5 [05:29<21:56, 329.22s/it]

‚Üí ‚úì crime

--- Shoplifting_Shoplifting004_x264 ---
Crime type: Shoplifting
  Loaded 8 frames

  Model: GEMINI
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úì crime

  Model: GPT
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úó uncertain
    Config: No Observation ‚Üí ‚úó uncertain
    Config: Only Thought ‚Üí ‚úó uncertain

  Model: CLAUDE
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úó normal
    Config: No Observation ‚Üí ‚úó normal
    Config: Only Thought 

Videos:  40%|‚ñà‚ñà‚ñà‚ñà      | 2/5 [08:45<12:33, 251.09s/it]

‚Üí ‚úì crime

--- Fighting_Fighting003_x264 ---
Crime type: Fighting
  Loaded 3 frames

  Model: GEMINI
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úó normal

  Model: GPT
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úó uncertain
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úì crime

  Model: CLAUDE
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úó normal
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought 

Videos:  60%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà    | 3/5 [11:42<07:14, 217.23s/it]

‚Üí ‚úì crime

--- Fighting_Fighting016_x264 ---
Crime type: Fighting
  Loaded 3 frames

  Model: GEMINI
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úì crime

  Model: GPT
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úì crime

  Model: CLAUDE
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought 

Videos:  80%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà  | 4/5 [14:37<03:20, 200.44s/it]

‚Üí ‚úì crime

--- Shooting_Shooting005_x264 ---
Crime type: Shooting
  Loaded 2 frames

  Model: GEMINI
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úì crime

  Model: GPT
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought ‚Üí ‚úì crime

  Model: CLAUDE
    Config: Full ReAct ‚Üí ‚úì crime
    Config: No Decision ‚Üí ‚úì crime
    Config: No Observation ‚Üí ‚úì crime
    Config: Only Thought 

Videos: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [17:14<00:00, 206.93s/it]

‚Üí ‚úì crime

‚úì Processed 5 videos

GENERATING MULTI-MODEL ABLATION REPORT

‚úì Summary table: /content/drive/MyDrive/multi_model_ablation_results/summary_all_models.csv

Summary:
 Model  Configuration Accuracy Precision Recall     F1 Avg Time (s)  Uncertain
GEMINI     Full ReAct   1.0000    1.0000 1.0000 1.0000         7.18          0
GEMINI    No Decision   1.0000    1.0000 1.0000 1.0000         7.87          0
GEMINI No Observation   1.0000    1.0000 1.0000 1.0000         7.08          0
GEMINI   Only Thought   0.8000    1.0000 0.8000 0.8889         4.32          0
   GPT     Full ReAct   1.0000    1.0000 1.0000 1.0000        15.34          0
   GPT    No Decision   1.0000    1.0000 1.0000 1.0000        13.11          2
   GPT No Observation   1.0000    1.0000 1.0000 1.0000        13.79          1
   GPT   Only Thought   1.0000    1.0000 1.0000 1.0000        14.33          1
CLAUDE     Full ReAct   1.0000    1.0000 1.0000 1.0000        18.98          0
CLAUDE    No Decision   0.6




‚úì Visualizations: /content/drive/MyDrive/multi_model_ablation_results/multi_model_comparison.png
‚úì Detailed analysis: /content/drive/MyDrive/multi_model_ablation_results/detailed_analysis_all_models.txt

‚úì Report saved to: /content/drive/MyDrive/multi_model_ablation_results

‚úì MULTI-MODEL ABLATION STUDY COMPLETED

üìÅ Results: /content/drive/MyDrive/multi_model_ablation_results/

Generated files:
  ‚Ä¢ summary_all_models.csv - Metrics for all models
  ‚Ä¢ multi_model_comparison.png - Visual comparison
  ‚Ä¢ detailed_analysis_all_models.txt - Complete analysis
  ‚Ä¢ raw_results_all_models.json - Raw data


