### A bit of Ollama integration notes.

- Ollama runs as a separate service/process outside your Python environment.
- It exposes an HTTP API on port 11434 (by default).
- Any Python environment (virtual or not) can make HTTP requests to this API.



In [19]:


import requests
try:
    response = requests.get("http://localhost:11434/api/tags")
    print(f"Ollama status: {response.status_code}")
    print(response.json())
except Exception as e:
    print(f"Ollama connection error: {e}")


print('\n\n')

def check_ollama_running():
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=5)
        return response.status_code == 200
    except:
        return False

if not check_ollama_running():
    print("Ollama service not running. Please start it manually with 'ollama serve'")
    sys.exit(1)
    
    

Ollama status: 200
{'models': [{'name': 'dolphin-llama3:8b', 'model': 'dolphin-llama3:8b', 'modified_at': '2025-04-13T05:23:36.9187971-04:00', 'size': 4661235994, 'digest': '613f068e29f863bb900e568f920401b42678efca873d7a7c87b0d6ef4945fadd', 'details': {'parent_model': '', 'format': 'gguf', 'family': 'llama', 'families': ['llama'], 'parameter_size': '8B', 'quantization_level': 'Q4_0'}}, {'name': 'qwen2.5:14b', 'model': 'qwen2.5:14b', 'modified_at': '2025-04-13T05:19:09.914628-04:00', 'size': 8988124069, 'digest': '7cdf5a0187d5c58cc5d369b255592f7841d1c4696d45a8c8a9489440385b22f6', 'details': {'parent_model': '', 'format': 'gguf', 'family': 'qwen2', 'families': ['qwen2'], 'parameter_size': '14.8B', 'quantization_level': 'Q4_K_M'}}, {'name': 'yi:9b', 'model': 'yi:9b', 'modified_at': '2025-04-13T05:10:58.5931043-04:00', 'size': 5037006897, 'digest': '3af70141e8ebd597c62c01a8521b8833656e9a734bfb258803bc2ea21987587a', 'details': {'parent_model': '', 'format': 'gguf', 'family': 'llama', 'famil

## OllamaDebateManager Class
- Downloading, Model Run Management, Trimming Think Tags, etc. Will interact later on with the MultiAgent pipeline but minimally. ( as client.run_method().. )
- restarts if needed, sample compare prompts etc. 

In [None]:
import os
import subprocess
import requests
import time


## --> didnt work. 
# custom path for Ollama models
# model_path = r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\models"
# os.environ["OLLAMA_MODELS"] = model_path
# Note: You should start the Ollama server with the environment variable set before running this script


def restart_ollama_service():
    try:
        # Stop the ollama service if it's running
        subprocess.run(["taskkill", "/f", "/im", "ollama.exe"], 
                      stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        time.sleep(2)
        
        # Start ollama service again with the new environment variable
        subprocess.Popen(["ollama", "serve"], 
                        stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        time.sleep(5)  # Give it time to start up
        print("Ollama service restarted with custom models path")
    except Exception as e:
        print(f"Error restarting Ollama service: {e}")

# restart_ollama_service()

## -------------------------------------------------------------------------------------------------------------------------------------------------
## -------------------------------------------------------------------------------------------------------------------------------------------------


class OllamaDebateManager:
    def __init__(self, base_url="http://localhost:11434"):
        self.base_url = base_url

        self.models = {
            "mixtral": "mixtral:8x7b",
            "gemma3": "gemma3:27b",
            "phi4": "phi4:latest", 
            "phi3": "phi3:14b",                         
            "deepseek_r1": "deepseek-r1:14b",  # original, non distilled. 

            ## "llama3.1": "llama3.1:8b",
            ## "deepseek_llm": "deepseek-llm:7b",
            ## "deepseek_r1_14b": "deepseek-r1:14b",
            ## "deepseek_r1_7b": "deepseek-r1:7b"
            "yi": "yi:9b",
            "qwen2.5": "qwen2.5:14b",
            "dolphin": "dolphin-llama3:8b"
        }
        


    
    def check_model_exists(self, model_name):
        """Check if a model is already downloaded"""
        try:
            # Force fresh data by clearing any cache
            response = requests.get(f"{self.base_url}/api/tags", 
                                   headers={"Cache-Control": "no-cache"})
            
            if response.status_code == 200:
                models = response.json().get("models", [])
                model_names = [m.get('name', '') for m in models]
                print(f"Available models: {model_names}")
                print(f"Looking for: {self.models[model_name]}")
                return self.models[model_name] in model_names
            return False
        except Exception as e:
            print(f"Error checking models: {e}")
            return False

    
                
    def download_model(self, model_name):
        """Download a model using Ollama pull command"""
        if model_name not in self.models:
            print(f"Unknown model: {model_name}")
            return False
            
        if self.check_model_exists(model_name):
            print(f"Model {self.models[model_name]} already exists.")
            return True
                
        print(f"Downloading {self.models[model_name]}...")
        
        try:
            # Use requests instead of subprocess for better control
            response = requests.post(
                f"{self.base_url}/api/pull",
                json={"name": self.models[model_name]},
                timeout=300  # 5 minute timeout per model
            )
            
            if response.status_code == 200:
                print(f"Successfully downloaded {self.models[model_name]}")
                return True
            else:
                print(f"Failed to download {self.models[model_name]}: {response.status_code}")
                print(response.text)
                return False
        except Exception as e:
            print(f"Error downloading model: {e}")
            return False
    

            
    def download_all_models(self):
        """Download all models defined in self.models"""
        for model_name in self.models:
            print(f"Processing model: {model_name}")
            result = self.download_model(model_name)
            if result:
                # delay to allow model to be registered
                time.sleep(2)
            else:
                print(f"Skipping to next model due to error with {model_name}")
            print("-" * 30)  #  for clarity




            
    def run_model(self, model_name, prompt):
        """Run a query against a model with focused error diagnostics"""
        if model_name not in self.models:
            print(f"ERROR: Unknown model: {model_name}")
            return None
                
        # Check if model exists, download if not
        if not self.check_model_exists(model_name):
            print(f"Model {self.models[model_name]} not found. Downloading...")
            if not self.download_model(model_name):
                return None
        
        # Simple prompt length check
        prompt_length = len(prompt)
        if prompt_length > 15000:
            print(f"WARNING: Large prompt ({prompt_length} chars) may exceed context window for {model_name}")
        
        # Run the query with focused error handling
        try:
            print(f"Sending request to {self.models[model_name]}...")
            
            response = requests.post(
                f"{self.base_url}/api/generate",
                json={
                    "model": self.models[model_name],
                    "prompt": prompt,
                    "stream": False
                }
            )
            
            if response.status_code == 200:
                raw_response = response.json().get("response", "")
                
                # Check if response is empty
                if not raw_response or len(raw_response.strip()) < 10:
                    print(f"WARNING: Model returned empty or very short response")
                    return None
                
                # Clean thinking tags and return
                cleaned_response = self._clean_thinking_tags(raw_response)
                return cleaned_response
                
            elif response.status_code == 400:
                print(f"ERROR: Bad request - likely context length exceeded for {model_name}")
                return None
                
            elif response.status_code != 200:
                print(f"ERROR: API returned status code {response.status_code}")
                print(f"Error details: {response.text[:200]}")
                return None
                    
        except Exception as e:
            print(f"ERROR running model: {e}")
            return None
    
            

    
    def _clean_thinking_tags(self, text):
        """Remove content within <think> tags and any standalone <think> tags"""
        import re
        
        # First check if there are any thinking tags
        if "<think>" not in text:
            return text
        
        # Remove complete thinking sections with their content
        cleaned = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)
        
        # Remove any standalone opening or closing tags
        cleaned = re.sub(r'<think>', '', cleaned)
        cleaned = re.sub(r'</think>', '', cleaned)
        
        # Remove "Final Answer:" or similar prefixes often used after thinking sections
        cleaned = re.sub(r'^(Final Answer:|Final Response:|Answer:)\s*', '', cleaned.strip())
        
        # Clean up any double newlines created by removing sections
        cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
        
        return cleaned.strip()

    
    def compare_models_on_debate(self, topic, stance="FOR"):
        """Compare all models on a debate topic"""
        results = {}
        
        debate_prompt = f"""You are an expert debater participating in a formal debate.
        Please provide a strong, well-reasoned argument {stance} the following topic:
        
        TOPIC: {topic}
        
        Your argument should be logical, factual, and persuasive.
        Structure it with a clear thesis, supporting points, and a conclusion.
        Anticipate and address potential counterarguments.
        """
        
        for model_name in self.models:
            print(f"Getting {stance} argument from {model_name}...")
            start_time = time.time()
            response = self.run_model(model_name, debate_prompt)
            end_time = time.time()
            
            if response:
                results[model_name] = {
                    "response": response,
                    "time_taken": end_time - start_time
                }
                print(f"Response received in {end_time - start_time:.2f} seconds")
            else:
                print(f"Failed to get response from {model_name}")
        
        return results



    
## ------------------------------------------------------------------------------------------------------------------------------------------------------
## ------------------------------------------------------------------------------------------------------------------------------------------------------

if __name__ == "__main__":
    debate_manager = OllamaDebateManager()
    
    # Option 1: Download all models (uncomment to run)
    # debate_manager.download_all_models()

    
    # Option 2: Download specific models
    ## print("Downloading selected models...")
    ## for model in ["mixtral", "llama3", "deepseek_llm"]:
    ##     debate_manager.download_model(model)
    
    # Option 3: Test a specific model
    test_prompt = "What are the three strongest arguments in favor of renewable energy?"
    response = debate_manager.run_model("deepseek_r1", test_prompt)

    print("\n\nSample response:")
    print(response)


    # test_prompt = "What are three key aspects of effective debate strategy?"
    # response = debate_manager.run_model("yi", test_prompt)    
    # print("\n\nSample response from Yi 9B:")
    # print(response)

    test_prompt = "What are three key aspects of effective debate strategy?"
    response = debate_manager.run_model("qwen2.5", test_prompt)    
    print("\n\nSample response from Qwen:")
    print(response)

    
    # test_prompt = "What are three key aspects of effective debate strategy?"
    # response = debate_manager.run_model("dolphin", test_prompt)    
    # print("\n\nSample response from Dolphin:")
    # print(response)


    ## Gemma 3 27B vs Phi-3 14B
    ## Mixtral 8x7B vs DeepSeek-R1 14B

Available models: ['dolphin-llama3:8b', 'qwen2.5:14b', 'gemma3:27b', 'phi4:latest', 'deepseek-r1:14b', 'phi3:14b', 'mixtral:8x7b']
Looking for: deepseek-r1:14b
Sending request to deepseek-r1:14b...


### Quick Checks:

In [21]:
import requests

def check_model_sizes():
    try:
        response = requests.get("http://localhost:11434/api/tags")
        
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            return
            
        models = response.json().get("models", [])
        
        print(f"{'Model Name':<30} {'Size (GB)':<10} {'Modified Date':<20}")
        print("-" * 60)
        
        total_size = 0
        for model in models:
            name = model.get("name", "Unknown")
            size_bytes = model.get("size", 0)
            size_gb = round(size_bytes / (1024 * 1024 * 1024), 2)
            modified = model.get("modified", "Unknown")
            
            total_size += size_bytes
            print(f"{name:<30} {size_gb:<10.2f} {modified:<20}")
            
        total_size_gb = round(total_size / (1024 * 1024 * 1024), 2)
        print("-" * 60)
        print(f"Total size: {total_size_gb} GB")
        
    except Exception as e:
        print(f"Error: {e}")

check_model_sizes()

Model Name                     Size (GB)  Modified Date       
------------------------------------------------------------
dolphin-llama3:8b              4.34       Unknown             
qwen2.5:14b                    8.37       Unknown             
yi:9b                          4.69       Unknown             
gemma3:27b                     16.20      Unknown             
phi4:latest                    8.43       Unknown             
deepseek-r1:14b                8.37       Unknown             
phi3:14b                       7.35       Unknown             
mixtral:8x7b                   24.63      Unknown             
------------------------------------------------------------
Total size: 82.39 GB


### Debator Ideas, Analysis:

- In a real debate, participants don't deliver separate arguments from different rhetorical angles - they deliver one cohesive argument that integrates multiple persuasive elements.
- We cannot fragment what should be a unified argument into separate specialized components. This doesn't mirror how skilled debaters actually function.
- A strong debater naturally incorporates logical reasoning, emotional appeals, expert positioning, and compelling examples within a single, well-rounded argument.


- **Debates are responsive conversations, not isolated speeches**. Each response needs to directly engage with what came before it.
- debating isn't about delivering perfect individual arguments - it's about dynamic responsive argumentation that effectively engages with the opponent's points while advancing your own position.

- We need Logical consistency (avoid fallacies), Factual accuracy (use verifiable claims), Persuasive rhetoric (engage audience effectively), compelling case argument, persuasive writing, exceptionally convincing argument, all in a single unified prompt.
- When giving rebuttal, we need ALL of the above, an also expert debate critic, CRITIC/ADVERSARIAL PROMPTS, Logical fallacies (identify the specific type) Factual inaccuracies (explain the correct information), Weak or missing evidence, Unstated assumptions, Rhetorical weaknesses etc. focused. We also need strongest possible counter-position with steel manning.

```
    You are an expert in formal logic and critical thinking. Create a methodically reasoned argument {stance} the position: "{topic}".
    
    Your argument should:
    - Begin with clearly defined terms and parameters
    - Present premises in a careful, sequential order
    - Use valid logical structures (avoiding all fallacies)
    - Anticipate logical objections and address them systematically
    - Draw conclusions that necessarily follow from your premises

    Use a strategic mix of:
    - Ethos: Establish credibility and ethical appeal
    - Pathos: Connect emotionally with the audience
    - Logos: Provide strong logical reasoning and evidence
    
    Your expertise allows you to anticipate objections, frame issues effectively, and use language that resonates deeply with readers. Draw on historical precedents, relevant research, and powerful narratives to make your case irrefutable.

```



#### Improved Ideas:
 Opponent Argument → Critique Generation → Critique-Informed Rebuttal 


## Initial draft of DS; later improved.

```
debate_data {
    "debate_id": "debate_1234567890",
    "topic": "Universal basic income should be implemented nationwide",
    "timestamp": "2025-04-10T15:30:45.123456",
    "status": "in_progress",
    "config": {
        "num_rounds": 3, "word_limit": [300, 500], "date": "2025-04-10"
    },

    "debaters": {
        "for": { "model": "mixtral", "stance": "FOR" },
        "against": { "model": "llama3.1", "stance": "AGAINST" }
    },

   "rounds": [
        {
            "round_number": 0, "round_type": "opening",
            "exchanges": {
                "for": {
                    "response": "UBI would reduce poverty and stimulate...", "timestamp": "2025-04-10T15:32:10.123456",
                    "word_count": 450, "responding_to": null
                },
                "against": {
                    "response": "UBI would be too expensive and reduce...", "timestamp": "2025-04-10T15:33:25.123456",
                    "word_count": 422, "responding_to": null
                }
            }
        },

        {
            "round_number": 1, "round_type": "rebuttal",
            "critiques": {
                "for": "1. LOGICAL GAPS: The opponent claims UBI would...",
                "against": "1. FACTUAL ERRORS: The opponent's claim about..."
            },

            "exchanges": {
                "for": {
                    "response": "My opponent argues that UBI would be...", "timestamp": "2025-04-10T15:35:40.123456",
                    "word_count": 485, "responding_to": "UBI would be too expensive and reduce..."
                },
                "against": {
                    "response": "The proponent claims that UBI would...", "timestamp": "2025-04-10T15:37:15.123456",
                    "word_count": 432, "responding_to": "UBI would reduce poverty and stimulate..."
                }
            }
        }
    ],

    "full_transcript": "DEBATE TOPIC: Universal basic income...[full text here]"
}

```

### Items to look into:


In round 10, should agent 2 critique:
- Agent 1's round 9 rebuttal (lagged response)
- Agent 1's round 10 rebuttal (immediate response)

    
    - "synchronous updating" or "simultaneous turns" in game theory and multi-agent systems.
    - sequential updating.
    - Synchronous (current system): More like formal debate where each side has equal preparation time
    - Sequential: More like conversation, with more recency but less symmetry
    


### Implementing the debate exchange code; agent vs agent.

In [22]:

import os
import json
import yaml
import time
import datetime
from typing import Dict, List, Any, Optional

class PromptManager:
    """Manages the loading and formatting of debate prompts from YAML files"""

    # Class variable for default word limit
    DEFAULT_WORD_LIMIT = 270

    def __init__(self, prompt_file: str = "prompts/debate_prompts.yml"):
        """Initialize the prompt manager with a YAML file"""
        self.prompts = self._load_prompts(prompt_file)
    
    def _load_prompts(self, prompt_file: str) -> Dict[str, Any]:
        """Load prompts from a YAML file"""
        try:
            if os.path.exists(prompt_file):
                with open(prompt_file, "r", encoding="utf-8") as f:
                    prompts = yaml.safe_load(f)
                print(f"Loaded prompts from {prompt_file}")
                return prompts
            else:
                print(f"Warning: Prompt file not found at {prompt_file}")
                return {"debater": {}}
        except Exception as e:
            print(f"Error loading prompts: {e}")
            return {"debater": {}}

    
    
    def get_debater_prompt(self, round_type: str, params: Dict[str, Any]) -> str:
        """
        Generate a complete debate prompt based on round type
        
        Args:
            round_type: Type of debate round ('opening', 'rebuttal', 'closing')
            params: Dictionary of parameters for prompt formatting
                    Must include: stance, topic, round_num, total_rounds, word_limit
        
        Returns:
            Formatted prompt string
        """
        # Get the base unified prompt
        base_prompt = self.prompts.get("debater", {}).get("unified_base", "")
        
        # Get round-specific extensions
        round_prompt = self.prompts.get("debater", {}).get(round_type, "")
        
        # Get evidence and citation guidance
        evidence_guidance = self.prompts.get("evidence_and_citation_guidance", "")
        
        # Combine prompts
        combined_prompt = f"{base_prompt}\n\n{round_prompt}\n\n{evidence_guidance}"
        
        # Apply parameters
        try:
            return combined_prompt.format(**params)
        except KeyError as e:
            print(f"Missing parameter in prompt template: {e}")
            return combined_prompt  # Return unformatted prompt on error


    ### format(**params) approach is used consistently across all prompt methods (even when a specific template might not have placeholders)
    def get_critique_prompt(self, params: Dict[str, Any]) -> str:
        """
        Generate an adversarial critique prompt
        
        Args:
            params: Dictionary with keys: argument, stance, topic
        
        Returns:
            Formatted critique prompt
        """
        critique_template = self.prompts.get("debater", {}).get("adversarial_critique", "")
        
        # Add word_limit parameter if missing but needed
        if "word_limit" not in params and "{word_limit}" in critique_template:
            params = params.copy()  # Create a copy to avoid modifying the original
            params["word_limit"] = self.DEFAULT_WORD_LIMIT
        
        try:
            return critique_template.format(**params)
        except KeyError as e:
            print(f"Missing parameter in critique template: {e}")
            return critique_template

    def get_evidence_check_prompt(self, params: Dict[str, Any]) -> str:
        """Generate a self-check prompt for evidence and citation verification"""
        
        # Get the template from the YAML file
        ##. evidence_template = self.prompts.get("evidence_check_prompt", "")
        evidence_template = self.prompts.get("debater", {}).get("evidence_check_prompt", "")

        
        # Add word_limit parameter if missing but needed
        if "word_limit" not in params and "{word_limit}" in evidence_template:
            params = params.copy()  # Create a copy to avoid modifying the original
            params["word_limit"] = self.DEFAULT_WORD_LIMIT
        
        # The params should already contain: argument, stance, topic
        try:
            return evidence_template.format(**params)
        except KeyError as e:
            print(f"Missing parameter in evidence check template: {e}")
            return evidence_template





            
class DebateStorage:
    """Manages the storage and retrieval of debate data"""
    
    def __init__(self, debate_id: Optional[str] = None, topic: Optional[str] = None):
        """Initialize storage for a debate"""
        self.debate_id = debate_id or f"debate_{int(time.time())}"
        self.topic = topic
        self.debate_data = {
            "debate_id": self.debate_id,
            "topic": self.topic,
            "timestamp": datetime.datetime.now().isoformat(),
            "status": "in_progress",
            "config": {},
            "debaters": {},
            "rounds": [],
            "full_transcript": ""
        }
    
    def set_config(self, config: Dict[str, Any]) -> None:
        """Set debate configuration"""
        self.debate_data["config"] = config
    
    def add_debater(self, key: str, model: str, stance: str) -> None:
        """Add a debater to the debate"""
        self.debate_data["debaters"][key] = {
            "model": model,
            "stance": stance
        }

    
    ## -> acts as a flexible foundation that can handle any type of preparation step (evidence checks, critiques, or future types like summarization, 
    ## fact-checking, etc.)
    def add_preparation(self, round_num: int, debater_key: str, 
                       prep_type: str, content: str) -> None:
        """
        Add a preparation step for a debate round
        
        Args:
            round_num: Round number
            debater_key: Identifier for the debater
            prep_type: Type of preparation ('evidence_check', 'critique', etc.)
            content: Content of the preparation
        """
        # Find or create the round data
        round_exists = False
        for r in self.debate_data["rounds"]:
            if r["round_number"] == round_num:
                round_exists = True
                round_data = r
                break
        
        if not round_exists:
            round_data = {
                "round_number": round_num,
                "round_type": "unknown",  # .. be set when exchange is added
                "exchanges": {},
                "preparations": {}
            }
            self.debate_data["rounds"].append(round_data)
            
            # Sort rounds by number
            self.debate_data["rounds"] = sorted(
                self.debate_data["rounds"], key=lambda r: r["round_number"]
            )
        
        # Ensure preparations dict exists
        if "preparations" not in round_data:
            round_data["preparations"] = {}
        
        # Ensure debater key exists in preparations
        if debater_key not in round_data["preparations"]:
            round_data["preparations"][debater_key] = {}
        
        # Add the preparation content
        round_data["preparations"][debater_key][prep_type] = {
            "content": content,
            "timestamp": datetime.datetime.now().isoformat()
        }


    ## wrappers -> centralizes the storage logic in one place
    def add_critique(self, round_num: int, debater_key: str, critique: str) -> None:
        """Add a critique to storage (wrapper around add_preparation)"""
        self.add_preparation(round_num, debater_key, "critique", critique)
    
    def add_evidence_check(self, round_num: int, debater_key: str, check: str) -> None:
        """Add an evidence check to storage (wrapper around add_preparation)"""
        self.add_preparation(round_num, debater_key, "evidence_check", check)
    

    def get_preparation(self, round_num: int, debater_key: str, 
                        prep_type: str) -> Optional[str]:
        """Get a preparation content by type"""
        for r in self.debate_data["rounds"]:
            if r["round_number"] == round_num:
                if "preparations" in r and debater_key in r["preparations"]:
                    if prep_type in r["preparations"][debater_key]:
                        return r["preparations"][debater_key][prep_type]["content"]
        return None
    

    def set_round_type(self, round_num: int, round_type: str) -> None:
        """Set or update the round type for a specific round"""
        for r in self.debate_data["rounds"]:
            if r["round_number"] == round_num:
                r["round_type"] = round_type
                return
        
        # Round not found, create a new one with the right type
        self.debate_data["rounds"].append({
            "round_number": round_num,
            "round_type": round_type,
            "exchanges": {},
            "preparations": {}
        })
        
        # Sort rounds by number
        self.debate_data["rounds"] = sorted(
            self.debate_data["rounds"], key=lambda r: r["round_number"]
        )

    ## -- removing org version. 
    # ddd add_critique(self, round_num: int, debater_key: str, critique: str) -> None:
    #     """Add a critique to storage"""
    #     # Find the round data
    #     for r in self.debate_data["rounds"]:
    #         if r["round_number"] == round_num:
    #             # Add critiques dict if it doesn't exist
    #             if "critiques" not in r:
    #                 r["critiques"] = {}
                
    #             # Add the critique
    #             r["critiques"][debater_key] = critique
    #             return
        
    #     # If we get here, round wasn't found
    #     print(f"Warning: Couldn't find round {round_num} to add critique")

        
    def add_exchange(self, round_num: int, round_type: str, debater_key: str, 
                     response: str, responding_to: Optional[str] = None) -> None:
        """Add a debate exchange to storage"""
        # Find or create the round data
        round_exists = False
        for r in self.debate_data["rounds"]:
            if r["round_number"] == round_num:
                round_exists = True
                round_data = r
                break
        
        if not round_exists:
            round_data = {
                "round_number": round_num,
                "round_type": round_type,
                "exchanges": {},
                "preparations": {}  
            }
            self.debate_data["rounds"].append(round_data)
            
            # Sort rounds by number to maintain chronological order
            self.debate_data["rounds"] = sorted( self.debate_data["rounds"], key=lambda r: r["round_number"] )
        
        # Add the exchange
        round_data["exchanges"][debater_key] = {
            "response": response,
            "timestamp": datetime.datetime.now().isoformat(),
            "word_count": len(response.split()),
            "responding_to": responding_to
        }
        
        # Update the full transcript
        self._update_transcript()



    
    ## if "critiques" in round_data and round_data["critiques"]: -> being replaced.
    def _update_transcript(self) -> None:
        """Update the full chronological transcript of the debate"""
        transcript = f"DEBATE TOPIC: {self.topic}\n\n"
        
        # Sort rounds by number to maintain chronological order
        sorted_rounds = sorted(self.debate_data["rounds"], key=lambda r: r["round_number"])
        
        for round_data in sorted_rounds:
            round_num = round_data["round_number"]
            round_type = round_data.get("round_type", "unknown")
            
            transcript += f"ROUND {round_num + 1} ({round_type.upper()}):\n\n"
            
            # Include preparations if they exist for this round and debugging is enabled
            if self.debate_data["config"].get("show_preparations", False):
                if "preparations" in round_data and round_data["preparations"]:
                    transcript += "PREPARATION STEPS:\n\n"
                    
                    for debater_key, preps in round_data["preparations"].items():
                        stance = self.debate_data["debaters"][debater_key]["stance"] if debater_key in self.debate_data["debaters"] else "Unknown"
                        
                        for prep_type, prep_data in preps.items():
                            transcript += f"{stance} ({prep_type.upper()}):\n{prep_data['content']}\n\n"
            
            # Include critiques if they exist for this round (using the new preparations structure)
            if "preparations" in round_data:
                critiques_found = False
                critique_section = "CRITIQUES OF PREVIOUS ARGUMENTS:\n\n"
                
                for debater_key, preps in round_data["preparations"].items():
                    if "critique" in preps:
                        critiques_found = True
                        # Find the stance of this debater
                        stance = self.debate_data["debaters"][debater_key]["stance"] if debater_key in self.debate_data["debaters"] else "Unknown"
                        critique_section += f"Critique for {stance} debater to consider:\n{preps['critique']['content']}\n\n"
                
                if critiques_found:
                    transcript += critique_section
            
            # Always print FOR first, then AGAINST
            for position in ["FOR", "AGAINST"]:
                # Find the debater with this position
                for debater_key, debater_info in self.debate_data["debaters"].items():
                    if debater_info["stance"] == position and debater_key in round_data.get("exchanges", {}):
                        model = debater_info["model"]
                        response = round_data["exchanges"][debater_key]["response"]
                        
                        transcript += f"{position} ({model}):\n{response}\n\n"
        
        self.debate_data["full_transcript"] = transcript


    
    def save_to_file(self, filename: Optional[str] = None) -> str:
        """Save debate data to a JSON file in the results/agent_records directory"""
        # Create the results/agent_records directory if it doesn't exist
        results_dir = "results"
        agent_records_dir = os.path.join(results_dir, "agent_records")
        
        # Ensure the directories exist
        os.makedirs(agent_records_dir, exist_ok=True)
        
        if filename is None:
            # Create filename from topic and timestamp
            safe_topic = "".join(c if c.isalnum() else "_" for c in self.topic or "")
            base_filename = f"{self.debate_id}_{safe_topic}.json"
            filename = os.path.join(agent_records_dir, base_filename)
        elif not os.path.isabs(filename):
            # If a relative path is provided, make it relative to agent_records_dir
            filename = os.path.join(agent_records_dir, filename)
        
        # Update status if all rounds are complete
        if len(self.debate_data["rounds"]) == self.debate_data["config"].get("num_rounds", 0):
            self.debate_data["status"] = "completed"
        
        try:
            with open(filename, "w", encoding="utf-8") as f:
                json.dump(self.debate_data, f, indent=2)
            print(f"Debate saved to {filename}")
            return filename
        except Exception as e:
            print(f"Error saving debate: {e}")
            return ""
    

        
    @staticmethod
    def load_from_file(filename: str) -> Optional[Dict[str, Any]]:
        """Load debate data from a JSON file"""
        # If filename doesn't include a path, assume it's in results/agent_records
        if not os.path.dirname(filename):
            results_dir = "results"
            agent_records_dir = os.path.join(results_dir, "agent_records")
            filename = os.path.join(agent_records_dir, filename)
        
        try:
            with open(filename, "r", encoding="utf-8") as f:
                return json.load(f)
        except Exception as e:
            print(f"Error loading debate: {e}")
            return None

        


    
class MultiAgentDebate:
    """Main class for managing multi-agent debates"""
    
    def __init__(self, topic: str, num_rounds: int = 3, word_limit: int = 270, prompt_file: str = None, 
                use_evidence_check: bool = True, use_critiques: bool = True ):
        """
        Initialize a new debate
        """
        self.topic = topic
        self.num_rounds = num_rounds
        self.current_round = 0
        self.word_limit = word_limit

        self.use_evidence_check = use_evidence_check
        self.use_critiques = use_critiques

        
        # Use dictionaries for debaters instead of lists
        self.debaters = {}
        
        # Initialize prompt manager
        self.prompt_manager = PromptManager(prompt_file or "prompts/debate_prompts.yml")
        
        # Initialize storage
        self.storage = DebateStorage(topic=topic)
        
        self.storage.set_config({
            "num_rounds": num_rounds,
            "word_limit": word_limit,
            "date": datetime.datetime.now().strftime("%Y-%m-%d"),
            "use_evidence_check": use_evidence_check,
            "use_critiques": use_critiques,
            "show_preparations": False  # Set to True for debugging
        })



        
    def add_debater(self, key: str, model: str, stance: str) -> None:
        """
        Add a debater to the debate with a meaningful key
        
        Args:
            key: Unique identifier for the debater (e.g., 'for', 'against')
            model: The LLM model to use for this debater
            stance: Debate position ('FOR' or 'AGAINST')
        """

        
        self.debaters[key] = {
            "model": model,
            "stance": stance
        }
        # Add to storage
        self.storage.add_debater(key, model, stance)
    
    def get_opponent_key(self, debater_key: str) -> Optional[str]:
        """Get the key of the opponent for a given debater"""
        # This assumes a binary debate with two participants
        keys = list(self.debaters.keys())
        if len(keys) != 2:
            return None
        return keys[1] if keys[0] == debater_key else keys[0]
    
    def get_round_type(self) -> str:
        """Determine the type of the current round"""
        if self.current_round == 0:
            return "opening"
        elif self.current_round == self.num_rounds - 1:
            return "closing"
        else:
            return "rebuttal"
    
    def get_previous_argument(self, debater_key: str) -> Optional[str]:
        """Get the opponent's argument from the previous round"""
        if self.current_round == 0:
            return None
            
        opponent_key = self.get_opponent_key(debater_key)
        if not opponent_key:
            return None
            
        # Find opponent's response in the previous round
        for round_data in self.storage.debate_data["rounds"]:
            if round_data["round_number"] == self.current_round - 1:
                if opponent_key in round_data["exchanges"]:
                    return round_data["exchanges"][opponent_key]["response"]
        
        return None
    
    def get_debate_prompt(self, debater_key: str) -> str:
        """
        Generate a prompt for the current round for a given debater
        
        Args:
            debater_key: Key of the debater to generate prompt for
            
        Returns:
            Formatted debate prompt
        """
        if debater_key not in self.debaters:
            raise ValueError(f"Unknown debater key: {debater_key}")
        
        # Get round type
        round_type = self.get_round_type()
        
        # Build parameters dictionary
        params = {
            "stance": self.debaters[debater_key]["stance"],
            "topic": self.topic,
            "round_num": self.current_round + 1,  # 1-indexed for prompts
            "total_rounds": self.num_rounds,
            "word_limit": self.word_limit  # Now passing single value
        }
        
        # Add opponent's argument for rebuttal rounds
        if round_type == "rebuttal":
            opponent_argument = self.get_previous_argument(debater_key)
            if opponent_argument:
                params["opponent_argument"] = opponent_argument
        
        # Generate prompt using prompt manager
        return self.prompt_manager.get_debater_prompt(round_type, params)
    

    def _check_word_count(self, response: str) -> str:
        """
        Check if response exceeds word limit and log a warning
        
        Args:
            response: Generated response text
            
        Returns:
            Original response (with warning logged if word count exceeds limit)
        """
        word_count = len(response.split())
        if word_count > self.word_limit:
            print(f"WARNING: Response has {word_count} words, exceeding the {self.word_limit} word limit!")
        return response
    
    
        

    
    def generate_evidence_check(self, argument: str, stance: str, debater_key: str, client) -> str:
        """
        Generate an evidence and citation self-check that analyzes both the 
        debater's previous argument and the opponent's recent argument
        
        Args:
            argument: The debater's own current argument text to check
            stance: The stance of the argument ('FOR' or 'AGAINST')
            debater_key: Key of the debater performing the check
            client: Client object with run_model method
                
        Returns:
            Evidence check analysis text
        """
        
        # Use the debater's own model for self-checking
        model = self.debaters[debater_key]["model"]
        
        # Get opponent's key and latest argument
        opponent_key = self.get_opponent_key(debater_key)
        opponent_argument = self.get_previous_argument(debater_key) or "No opponent argument available yet."
        
        # Build parameters including both arguments
        params = {
            "previous_argument": argument,  # The debater's own argument
            "opponent_argument": opponent_argument,  # The opponent's argument
            "stance": stance, "topic": self.topic, "word_limit": self.word_limit
        }
        
        # Use the correct prompt method - this was using the wrong method name
        check_prompt = self.prompt_manager.get_evidence_check_prompt(params)
        print(f"\nGenerating evidence check for {debater_key} using {model}...")
        
        try:
            start_time = time.time()
            print(f"Evidence Method, Before Client.Run(): Evidence check prompt length: {len(check_prompt)} characters")
            evidence_check = client.run_model(model, check_prompt)
            end_time = time.time()
            print(f"Evidence check generated in {end_time - start_time:.2f} seconds")

            if evidence_check is None: print(f"Evidence Check has failed, Client.Run() has returned None. Revise Prompt.")
            return evidence_check or "[No evidence check generated]"  # Return default if None
        except Exception as e:
            print(f"Error generating evidence check: {e}")
            return f"[Error generating evidence check: {e}]"

    
    
            

    
    def generate_critique(self, argument: str, stance: str, client, model: Optional[str] = None) -> str:
        """
        Generate an adversarial critique of an argument
        
        Args:
            argument: The argument text to critique
            stance: The stance of the argument ('FOR' or 'AGAINST')
            client: Client object with run_model method
            model: The model to use for critique (defaults to debater's own model)
            
        Returns:
            Critique text
        """
        # If no model specified, use a default from one of the debaters
        critique_model = model or next(iter(self.debaters.values()))["model"]
        
        # Build parameters for critique prompt
        params = { "argument": argument, "stance": stance, "topic": self.topic, "word_limit": self.word_limit }
        
        # Get critique prompt
        critique_prompt = self.prompt_manager.get_critique_prompt(params)
        
        print(f"\nGenerating critique using {critique_model}...")
        
        try:
            start_time = time.time()
            critique = client.run_model(critique_model, critique_prompt)
            end_time = time.time()
            print(f"Critique generated in {end_time - start_time:.2f} seconds")
            return critique
        except Exception as e:
            print(f"Error generating critique: {e}")
            return f"[Error generating critique: {e}]"
    
            
    
    def generate_critique_informed_rebuttal(self, debater_key, opponent_argument, critique, client):
        """
        Generate a debate rebuttal informed by critique analysis
        
        Args:
            debater_key: Key of the debater to generate response from
            opponent_argument: The opponent's argument being responded to
            critique: The critique of the opponent's argument
            client: Client object with run_model method
            
        Returns:
            Generated rebuttal text
        """
        if debater_key not in self.debaters:
            raise ValueError(f"Unknown debater key: {debater_key}")
        
        model = self.debaters[debater_key]["model"]
        
        # Build parameters for standard debate prompt
        params = {
            "stance": self.debaters[debater_key]["stance"],
            "topic": self.topic,
            "round_num": self.current_round + 1,  # 1-indexed for prompts
            "total_rounds": self.num_rounds,
            "word_limit": self.word_limit,
            "opponent_argument": opponent_argument
        }
        
        # Get standard rebuttal prompt
        base_prompt = self.prompt_manager.get_debater_prompt("rebuttal", params)
        
        # Enhance with critique
        enhanced_prompt = f"{base_prompt}\n\nHere is a critical analysis of your opponent's argument that you should consider when crafting your rebuttal:\n\n{critique}\n\nUse this analysis to strengthen your rebuttal, addressing the identified weaknesses while maintaining a coherent narrative."
        
        print(f"\nGenerating critique-informed rebuttal for {model} ({self.debaters[debater_key]['stance']})...")
        
        try:
            start_time = time.time()
            response = client.run_model(model, enhanced_prompt)
            end_time = time.time()
            
            # Same processing as in generate_debate_response
            if response:
                import re
                response = re.sub(r'\n{3,}', '\n\n', response)
                word_count = len(response.split())
                print(f"Critique-informed rebuttal generated: {word_count} words in {end_time - start_time:.2f} seconds")
                return response
            else:
                print("Error: No response generated")
                return "[No response generated]"
                
        except Exception as e:
            print(f"Error generating critique-informed rebuttal: {e}")
            return f"[Error generating critique-informed rebuttal: {e}]"


    
    def generate_debate_response(self, debater_key: str, client) -> str:
        """
        Generate a debate response from the specified debater
        
        Args:
            debater_key: Key of the debater to generate response from
            client: Client object with run_model method to call LLMs
            
        Returns:
            Generated response text
        """
        if debater_key not in self.debaters:
            raise ValueError(f"Unknown debater key: {debater_key}")
        
        model = self.debaters[debater_key]["model"]
        prompt = self.get_debate_prompt(debater_key)
        
        print(f"\nGenerating response for {model} ({self.debaters[debater_key]['stance']})...")
        
        try:
            start_time = time.time()
            response = client.run_model(model, prompt)
            end_time = time.time()
            
            if response:
                # Just do minimal formatting - clean up excessive newlines
                import re
                response = re.sub(r'\n{3,}', '\n\n', response)
                
                word_count = len(response.split())
                print(f"Response generated: {word_count} words in {end_time - start_time:.2f} seconds")
                
                # Report if word count exceeds suggested limit, but don't truncate
                if word_count > self.word_limit:
                    print(f"Note: Response exceeds suggested word limit ({word_count} > {self.word_limit} words)")
                
                return response
            else:
                print("Error: No response generated")
                return "[No response generated]"
                
        except Exception as e:
            print(f"Error generating response: {e}")
            return f"[Error generating response: {e}]"
    

    ## -----------------------------------------------------------------------------------------------------------------------------------------
            
    
        
    def _prepare_debate_materials(self, debater_key: str, round_type: str, client, use_critiques: bool = True) -> dict:
        """
        Prepare debate materials (critiques and evidence checks) for a debater
        
        Args:
            debater_key: Key of the debater
            round_type: Type of debate round
            client: Client object with run_model method
            use_critiques: Whether to generate critiques (can override class setting)
            
        Returns:
            Dictionary of prepared materials
        """

        ## print(f"Entered _prepare_debate_materials method.")
        materials = {}
        
        # Get previous argument from opponent (for rebuttals)
        opponent_key = self.get_opponent_key(debater_key)
        prev_opponent_argument = self.get_previous_argument(debater_key)
        
        # For rebuttal rounds, generate critiques if enabled
        if use_critiques and (round_type == "rebuttal" or round_type == "closing") and prev_opponent_argument:
            opponent_stance = self.debaters[opponent_key]["stance"]

            # add word_limit to params for critique prompt
            critique_params = {
                "argument": prev_opponent_argument,
                "stance": opponent_stance,
                "topic": self.topic,
                "word_limit": self.word_limit 
            }

            ## not needed - critique_params
            critique = self.generate_critique(
                prev_opponent_argument, 
                opponent_stance, client, self.debaters[debater_key]["model"]
            )
            
            materials["critique"] = critique
            self.storage.add_critique(self.current_round, debater_key, critique)
            print(f"Critique generated for {debater_key}")
        
        # For all rounds after the first, do evidence check on previous round's argument
        if self.use_evidence_check and self.current_round > 0:
            # Find this debater's argument from previous round
            for round_data in self.storage.debate_data["rounds"]:
                if round_data["round_number"] == self.current_round - 1 and debater_key in round_data.get("exchanges", {}):
                    prev_self_argument = round_data["exchanges"][debater_key]["response"]
                    evidence_check = self.generate_evidence_check(
                        prev_self_argument, 
                        self.debaters[debater_key]["stance"],
                        debater_key,
                        client
                    )
                    materials["evidence_check"] = evidence_check
                    self.storage.add_evidence_check(self.current_round, debater_key, evidence_check)
                    print(f"Evidence check generated for {debater_key}")
                    break
        
        return materials

    
    
    
    def _build_enhanced_prompt(self, debater_key: str, materials: dict) -> str:
        """
        Build an enhanced prompt incorporating preparation materials
        
        Args:
            debater_key: Key of the debater
            materials: Dictionary of preparation materials
            
        Returns:
            Enhanced prompt string
        """

        ## print(f"Entered _build_enhanced_prompt method.")
        
        # Start with the base debate prompt
        prompt = self.get_debate_prompt(debater_key)
        
        # Add critique if available
        if "critique" in materials:
            prompt += f"\n\nHere is a critical analysis of your opponent's argument that you should consider:\n\n{materials['critique']}"
        
        # Add evidence check if available
        if "evidence_check" in materials:
            prompt += f"\n\nHere is an analysis of your previous argument's evidence and citations. Consider these points to strengthen your current argument:\n\n{materials['evidence_check']}"
        
        return prompt

    
        
    def _generate_debate_response_with_materials(self, debater_key: str, prompt: str, client) -> str:
        """
        Generate a debate response using prepared materials
        
        Args:
            debater_key: Key of the debater
            prompt: Enhanced prompt with preparation materials
            client: Client object with run_model method
            
        Returns:
            Generated response text
        """

        ## print(f"Entered _generate_debate_response_with_materials method.")

        model = self.debaters[debater_key]["model"]
        stance = self.debaters[debater_key]["stance"]
        
        print(f"Generating comprehensive response for {model} ({stance})...")

        # Add some model validation before proceeding
        if not hasattr(client, 'check_model_exists') or not client.check_model_exists(model):
            print(f"Error: Model '{model}' is not available. Please check model configuration.")
            return f"[Error: Model '{model}' is not available]"

        try:
            start_time = time.time()
            response = client.run_model(model, prompt)
            end_time = time.time()
            
            if response:
                import re
                response = re.sub(r'\n{3,}', '\n\n', response)
                word_count = len(response.split())
                print(f"Response generated: {word_count} words in {end_time - start_time:.2f} seconds")
                
                # Report if word count exceeds suggested limit, but don't truncate
                if word_count > self.word_limit:
                    print(f"Note: Response exceeds suggested word limit ({word_count} > {self.word_limit} words)")
                
                return response
            else:
                print("Error: No response generated")
                return "[No response generated]"
                
        except Exception as e:
            print(f"Error generating response: {e}")
            return f"[Error generating response: {e}]"


        
    def run_debate_round(self, client, with_critique: bool = True) -> bool:
        """
        Run a single round of debate with combined preparation steps
        
        Args:
            client: Client object with run_model method
            with_critique: Whether to generate critiques to inform rebuttals
                (overridden by self.use_critiques if False)
            
        Returns:
            True if debate can continue, False if debate is complete
        """
        if self.current_round >= self.num_rounds:
            print(f"Debate already completed ({self.num_rounds} rounds)")
            return False
        
        # Apply parameter override only if with_critique is False
        # This preserves backwards compatibility while allowing critique disabling
        use_critiques = self.use_critiques and with_critique
        
        round_type = self.get_round_type()
        print(f"\n===================================== ROUND {self.current_round + 1}: {round_type.upper()} ===================================== ")
        
        self.storage.set_round_type(self.current_round, round_type)
        
        # Process each debater
        for debater_key in self.debaters:

            print(f"\n{'-' * 60}")
            print(f"Preparing arguments for {debater_key.upper()} ({self.debaters[debater_key]['stance']})")
            print(f"{'-' * 60}")

            # 1. Prepare materials (critiques and evidence checks)
            materials = self._prepare_debate_materials(debater_key, round_type, client, use_critiques)

            # 2. Build enhanced prompt with preparation materials
            enhanced_prompt = self._build_enhanced_prompt(debater_key, materials)

            if materials is None: materials = {}  # materials is at least an empty dict

            # Print preparation summary before generating final response
            evidence_length = len(materials.get("evidence_check", "")) if "evidence_check" in materials else 0
            critique_length = len(materials.get("critique", "")) if "critique" in materials else 0
    
            print(f"\n{'-' * 80}")
            print(f"PREPARATION COMPLETE FOR {debater_key.upper()} ({self.debaters[debater_key]['stance']})")
            print(f"Evidence and citation module - generated {evidence_length} chars")
            print(f"Critique and adversarial attack module - generated {critique_length} chars")
            print(f"{'-' * 80}\n")

        
            # 3. Generate response using enhanced prompt
            response = self._generate_debate_response_with_materials(debater_key, enhanced_prompt, client)

            print(f"\n1. Prepare materials, 2. Build enhanced prompt, 3. Generate response -> are complete.")
                  
            # 4. Store response in debate storage
            prev_argument = self.get_previous_argument(debater_key)
            
            self.storage.add_exchange( round_num=self.current_round, round_type=round_type, 
                                      debater_key=debater_key, response=response, responding_to=prev_argument )

            print(f"\n{'-' * 80}")
            print(f"FINISHED RESPONSE FOR {debater_key.upper()} ({self.debaters[debater_key]['stance']})")
            print(f"{'-' * 80}\n")

        # Save the debate after each round
        self.storage.save_to_file()
        
        # Move to next round
        self.current_round += 1
        
        # Check if debate is complete
        if self.current_round >= self.num_rounds:
            print("\n===== DEBATE COMPLETED =====")
            return False
            
        return True
        
        
        
    def run_full_debate(self, client, with_critique: bool = False) -> Dict[str, Any]:
        """
        Run the complete debate from start to finish
        
        Args:
            client: Client object with run_model method
            with_critique: Whether to generate critiques between rounds
            
        Returns:
            Complete debate data
        """
        print(f"\n===== STARTING DEBATE: {self.topic} =====")
        print(f"Debaters: {[(k, d['model'] + ' (' + d['stance'] + ')') for k, d in self.debaters.items()]}")
        print(f"Rounds: {self.num_rounds}")
        
        start_time = time.time()
        
        while self.current_round < self.num_rounds:
            if not self.run_debate_round(client, with_critique):
                break
        
        end_time = time.time()
        debate_duration = end_time - start_time
        print(f"Debate completed in {debate_duration:.2f} seconds")
        
        # Save debate to file
        self.storage.save_to_file()
        
        return self.storage.debate_data
    
    def get_debate_transcript(self) -> str:
        """Get the full debate transcript"""
        return self.storage.debate_data["full_transcript"]
    
    def get_debate_data(self) -> Dict[str, Any]:
        """Get the complete debate data"""
        return self.storage.debate_data
    
    def save_debate(self, filename: Optional[str] = None) -> str:
        """Save the debate to a file"""
        return self.storage.save_to_file(filename)




## --------------------------------------------------------------------------------------------------------------------------------------------------
## --------------------------------------------------------------------------------------------------------------------------------------------------





    
## Gemma 3 27B vs Phi-3 14B
## Mixtral 8x7B vs DeepSeek-R1 14B


if __name__ == "__main__":

    # # --> not needed now, already created. 
    # # Create example YAML file if needed
    # if not os.path.exists("prompts/debate_prompts.yml"):
    #     create_example_yaml_file()


    debate_prompt_yml_path = r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\debator_prompts.yml"
    prompt_file_path = r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\debator_prompts.yml"
     
    # Initialize the Ollama client
    client = OllamaDebateManager()

    # Initialize a debate
    debate_topic = "The humanities are as valuable as STEM in modern education and should receive equal funding."
    
    debate = MultiAgentDebate(
        topic=debate_topic,
        num_rounds=6, 
        word_limit=300,
        prompt_file=prompt_file_path
    )

    ## hard - large models.
    # debate.add_debater("for", "mixtral", "FOR")
    # debate.add_debater("against", "deepseek_r1", "AGAINST")
    
    # debate.add_debater("for", "phi4", "FOR")
    # debate.add_debater("against", "llama3.1", "AGAINST")

    debate.add_debater("for", "phi4", "FOR")
    debate.add_debater("against", "mixtral", "AGAINST")
    
    
    # run the full debate
    print("Running the full debate...")

    # runs with or without adversarial critique training. 
    # debate_data = debate.run_full_debate(client)
    debate_data = debate.run_full_debate(client, with_critique=True)

    # it PERFORMS the self.storage.save_to_file(), automatically. 

    # Display the debate transcript
    print("\n==== DEBATE TRANSCRIPT ====\n")
    print(debate.get_debate_transcript())


Loaded prompts from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\debator_prompts.yml
Running the full debate...

===== STARTING DEBATE: The humanities are as valuable as STEM in modern education and should receive equal funding. =====
Debaters: [('for', 'phi4 (FOR)'), ('against', 'mixtral (AGAINST)')]
Rounds: 6


------------------------------------------------------------
Preparing arguments for FOR (FOR)
------------------------------------------------------------

--------------------------------------------------------------------------------
PREPARATION COMPLETE FOR FOR (FOR)
Evidence and citation module - generated 0 chars
Critique and adversarial attack module - generated 0 chars
--------------------------------------------------------------------------------

Generating comprehensive response for phi4 (FOR)...
Available models: ['deepseek-r1:7b', 'gemma3:27b', 'phi4:latest', 'deepseek-r1:14b', 'deepseek-llm:7b', 'phi3:14b', 'llama3.1:8

### Judgement Analysis;
```
1. JudgePromptManager
   - Load and format judge prompts from YAML
   - Support for all judge types (logical, factual, rhetorical, etc.)
   - Parameter substitution in prompts

2. DebateResultsLoader
   - Load debate JSON files
   - Extract arguments by stance across rounds
   - Prepare combined arguments
   - Extract metadata

3. JudgmentResult (dataclass)
   - Judge type
   - Score
   - Critique
   - Metadata (timestamp, etc.)

4. JudgeEvaluator
   - Run individual judge evaluations
   - Process judge responses 
   - Track scores and critiques

5. DebateJudgingSystem
   - Coordinate the entire evaluation pipeline
   - Configure which judges to run
   - Aggregate results
   - Run meta-judge consensus

6. JudgmentStorage
   - Save judgment results to file
   - Generate reports
   - Support different output formats
```

## Judgement Classes and Storage.

In [29]:
import os
import yaml
import json
from typing import Dict, List, Any, Optional
from dataclasses import dataclass

@dataclass
class JudgmentResult:
    """Data class for storing judgment results"""
    judge_type: str
    score: float
    critique: str
    raw_evaluation: str
    metadata: Dict[str, Any]
    timestamp: str = ""  # Will be filled automatically


class JudgePromptManager:
    """Manages loading and formatting of judge prompts from YAML files"""
    
    def __init__(self, prompt_file: str = "prompts/judge_prompts.yml"):
        """Initialize the judge prompt manager with a YAML file"""
        self.prompts = self._load_prompts(prompt_file)
        self.available_judges = self._get_available_judges()
    
    def _load_prompts(self, prompt_file: str) -> Dict[str, Any]:
        """Load prompts from a YAML file"""
        try:
            if os.path.exists(prompt_file):
                with open(prompt_file, "r", encoding="utf-8") as f:
                    prompts = yaml.safe_load(f)
                print(f"Loaded judge prompts from {prompt_file}")
                return prompts
            else:
                print(f"Warning: Judge prompt file not found at {prompt_file}")
                return {"judge": {}}
        except Exception as e:
            print(f"Error loading judge prompts: {e}")
            return {"judge": {}}

    def _get_available_judges(self) -> List[str]:
        """Get a list of available judge types from the loaded prompts"""
        judges = []
        # Check if 'judge' key exists
        if "judge" in self.prompts:
            # Look for prompt keys ending with '_judge_prompt'
            for key in self.prompts["judge"]:
                if key.endswith("_judge_prompt"):
                    judge_type = key.replace("_judge_prompt", "")
                    judges.append(judge_type)
        return judges


    
    def get_judge_prompt(self, judge_type: str, params: Dict[str, Any]) -> str:
        """
        Get a formatted judge prompt of the specified type
        
        Args:
            judge_type: Type of judge (e.g., 'factual', 'logical')
            params: Dictionary with formatting parameters (stance, topic, combined_arguments, word_limit)
        
        Returns:
            Formatted judge prompt
        """
        prompt_key = f"{judge_type}_judge_prompt"
        
        # Check if the requested judge type exists
        if "judge" not in self.prompts or prompt_key not in self.prompts["judge"]:
            print(f"Warning: Judge prompt '{prompt_key}' not found")
            return f"Error: No prompt found for judge type '{judge_type}'"
        
        # Get the prompt template
        prompt_template = self.prompts["judge"].get(prompt_key, "")
        
        # Apply parameters
        try:
            return prompt_template.format(**params)
        except KeyError as e:
            print(f"Missing parameter in judge prompt template: {e}")
            return prompt_template  # Return unformatted prompt on error
    
    def list_available_judges(self) -> List[str]:
        """Return a list of available judge types"""
        return self.available_judges


class DebateResultsLoader:
    """Loads and prepares debate results for judging"""
    
    def __init__(self, debate_file_path: str):
        """Initialize with a path to a debate JSON file"""
        self.debate_file_path = debate_file_path
        self.debate_data = self._load_debate_data()
        self.combined_arguments = self._prepare_combined_arguments()
        self.metadata = self._extract_metadata()
    
    def _load_debate_data(self) -> Dict[str, Any]:
        """Load debate data from a JSON file"""
        try:
            if not os.path.exists(self.debate_file_path):
                raise FileNotFoundError(f"Debate file not found: {self.debate_file_path}")
                
            with open(self.debate_file_path, "r", encoding="utf-8") as f:
                debate_data = json.load(f)
            
            print(f"Loaded debate from {self.debate_file_path}")
            return debate_data
        except Exception as e:
            print(f"Error loading debate data: {e}")
            return {}
    
    def _prepare_combined_arguments(self) -> Dict[str, str]:
        """
        Extract and combine arguments for each debater across all rounds
        
        Returns:
            Dictionary with keys as stance ("FOR", "AGAINST") and values as combined arguments
        """
        combined = {}
        
        # Check if debate data is valid
        if not self.debate_data or "rounds" not in self.debate_data:
            print("Warning: No valid rounds found in debate data")
            return combined
        
        # Get list of debaters and their stances
        debaters = self.debate_data.get("debaters", {})
        
        # Create a mapping from debater_key to stance
        key_to_stance = {k: v["stance"] for k, v in debaters.items()}
        
        # Initialize combined arguments dictionary with empty strings for each stance
        for stance in set(key_to_stance.values()):
            combined[stance] = ""
        
        # Sort rounds by round number to maintain chronological order
        rounds = sorted(self.debate_data.get("rounds", []), key=lambda r: r.get("round_number", 0))
        
        # Extract and combine arguments for each debater
        for round_data in rounds:
            round_num = round_data.get("round_number", 0) + 1  # 1-indexed for display
            round_type = round_data.get("round_type", "unknown").upper()
            
            # Process each exchange in this round
            for debater_key, exchange in round_data.get("exchanges", {}).items():
                if debater_key in key_to_stance:
                    stance = key_to_stance[debater_key]
                    response = exchange.get("response", "")
                    
                    # Add round header and response to combined arguments
                    combined[stance] += f"\n\n--- ROUND {round_num} ({round_type}) ---\n\n"
                    combined[stance] += response
        
        # Trim leading whitespace
        for stance in combined:
            combined[stance] = combined[stance].lstrip()
        
        return combined
    
    def _extract_metadata(self) -> Dict[str, Any]:
        """Extract relevant metadata from debate data"""
        metadata = {
            "debate_id": self.debate_data.get("debate_id", "unknown"),
            "topic": self.debate_data.get("topic", "unknown"),
            "timestamp": self.debate_data.get("timestamp", ""),
            "num_rounds": len(self.debate_data.get("rounds", [])),
            "debaters": {}
        }
        
        # Extract debater info
        for key, debater in self.debate_data.get("debaters", {}).items():
            metadata["debaters"][key] = {
                "model": debater.get("model", "unknown"),
                "stance": debater.get("stance", "unknown")
            }
        
        # Extract debate configuration
        if "config" in self.debate_data:
            metadata["config"] = self.debate_data["config"]
        
        return metadata
    
    def get_combined_arguments(self, stance: str) -> str:
        """Get combined arguments for a specific stance"""
        return self.combined_arguments.get(stance, "")
    
    def get_metadata(self) -> Dict[str, Any]:
        """Get debate metadata"""
        return self.metadata
    
    def get_stances(self) -> List[str]:
        """Get list of debate stances"""
        return list(self.combined_arguments.keys())




if __name__ == "__main__":

    # # No trailing backslash in raw string
    db_file_path_prefix = r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\results\perfect_debate_transcripts"  
    db_file_name = r"debate_1744506254_The_humanities_are_as_valuable_as_STEM_in_modern_education_and_should_receive_equal_funding_.json"
    debate_file_path = os.path.join(db_file_path_prefix, db_file_name)  


    judge_prompt_path = r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\judge_prompts.yml"
    
    # Test JudgePromptManager
    print("\n===== Testing JudgePromptManager =====")
    judge_prompt_manager = JudgePromptManager(judge_prompt_path)
    available_judges = judge_prompt_manager.list_available_judges()
    print(f"Available judges: {available_judges}")
    
    # Test DebateResultsLoader
    print("\n===== Testing DebateResultsLoader =====")
    debate_loader = DebateResultsLoader(debate_file_path)
    
    # Check metadata
    metadata = debate_loader.get_metadata()
    print("\n--- Debate Metadata ---")
    print(f"Debate ID: {metadata.get('debate_id')}")
    print(f"Topic: {metadata.get('topic')}")
    print(f"Number of rounds: {metadata.get('num_rounds')}")
    print("\nDebaters:")
    for key, debater in metadata.get('debaters', {}).items():
        print(f"  {key}: {debater.get('model')} - Stance: {debater.get('stance')}")
    
    # Check combined arguments
    stances = debate_loader.get_stances()
    print("\n--- Combined Arguments Stats ---")
    for stance in stances:
        combined_args = debate_loader.get_combined_arguments(stance)
        word_count = len(combined_args.split())
        char_count = len(combined_args)
        print(f"{stance} argument: {word_count} words, {char_count} characters")
        
        # Preview the first 200 characters
        preview = combined_args[:200].replace('\n', ' ')
        print(f"Preview: {preview}...")

        # Preview the last 600 characters
        last_preview = combined_args[-600:].replace('\n', ' ')
        print(f"\nLast 600 chars: ...{last_preview}")
        print("-" * 80)
        
        # print(f"\n")
        # print(combined_args)
        


===== Testing JudgePromptManager =====
Loaded judge prompts from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\judge_prompts.yml
Available judges: ['factual', 'logical', 'rhetorical', 'belief_shift', 'audience', 'strategic', 'ethical']

===== Testing DebateResultsLoader =====
Loaded debate from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\results\perfect_debate_transcripts\debate_1744506254_The_humanities_are_as_valuable_as_STEM_in_modern_education_and_should_receive_equal_funding_.json

--- Debate Metadata ---
Debate ID: debate_1744506254
Topic: The humanities are as valuable as STEM in modern education and should receive equal funding.
Number of rounds: 6

Debaters:
  for: phi4 - Stance: FOR
  against: mixtral - Stance: AGAINST

--- Combined Arguments Stats ---
AGAINST argument: 1662 words, 12158 characters
Preview: --- ROUND 1 (OPENING) ---  1 In the contest of equal funding between humanities and STEM fie

## Core Judge Classes.

In [36]:
import os
import yaml
import json
import time
import datetime
from typing import Dict, List, Any, Optional, Tuple
from dataclasses import dataclass, asdict


class JudgeEvaluator:
    """Runs judge evaluations on debate arguments"""
    
    def __init__(self, client, judge_prompt_manager: JudgePromptManager, word_limit: int = 300):
        """
        Initialize the judge evaluator
        
        Args:
            client: Client object with run_model method for calling LLMs
            judge_prompt_manager: JudgePromptManager instance
            word_limit: Word limit for judge critiques
        """
        self.client = client
        self.judge_prompt_manager = judge_prompt_manager
        self.word_limit = word_limit
        self.model = "phi4"  # Default model for judges
    
    def evaluate_argument(self, judge_type: str, stance: str, topic: str, 
                         combined_arguments: str) -> Tuple[float, str]:
        """
        Evaluate an argument using a specific judge type
        
        Args:
            judge_type: Type of judge to use
            stance: Stance being evaluated ('FOR' or 'AGAINST')
            topic: Debate topic
            combined_arguments: Combined argument text to evaluate
            
        Returns:
            Tuple of (score, critique text)
        """
        # Prepare parameters for the judge prompt
        params = {
            "stance": stance,
            "topic": topic,
            "combined_arguments": combined_arguments,
            "word_limit": self.word_limit
        }
        
        # Get the formatted judge prompt
        judge_prompt = self.judge_prompt_manager.get_judge_prompt(judge_type, params)
        
        # Log what we're doing
        print(f"\nRunning {judge_type.upper()} judge on {stance} argument...")
        
        try:
            # Call the model
            start_time = time.time()
            response = self.client.run_model(self.model, judge_prompt)
            end_time = time.time()
            
            print(f"Judge evaluation completed in {end_time - start_time:.2f} seconds")
            
            if not response:
                print(f"Warning: Empty response from {judge_type} judge")
                return 0.0, "Error: No response generated"
            
            # Extract score and critique
            score, critique = self._parse_judge_response(response, judge_type)
            
            return score, critique, response
            
        except Exception as e:
            print(f"Error during judge evaluation: {e}")
            return 0.0, f"Error during evaluation: {e}"

    
        
    def _parse_judge_response(self, response: str, judge_type: str) -> Tuple[float, str]:
        """
        Parse the response from a judge to extract score and critique
        
        Args:
            response: Raw response from the judge
            judge_type: Type of judge used
            
        Returns:
            Tuple of (score, critique text)
        """
        # Default values
        score = 0.0
        critique = response
        
        try:
            import re
            
            # Expanded patterns to catch more score formats
            score_patterns = [
                # Standard patterns with score label
                r"(?i)SCORE:\s*(\d+(?:\.\d+)?)",
                r"(?i)" + judge_type.upper() + r".*SCORE:\s*(\d+(?:\.\d+)?)",
                
                # Patterns with X/10 format
                r"(?i)(\d+(?:\.\d+)?)\s*/\s*10",
                
                # Simple standalone number format (must be careful with this)
                r"(?i)SCORE[^\d]*?(\d+(?:\.\d+)?)",
                
                # Score with label + number format
                r"(?i)" + judge_type.upper() + r".*?(\d+(?:\.\d+)?)\s*/\s*10"
            ]
            
            # Look for score in the response
            for pattern in score_patterns:
                score_match = re.search(pattern, response)
                if score_match:
                    try:
                        extracted_score = float(score_match.group(1))
                        # Validate the score is in a reasonable range (0-10)
                        if 0 <= extracted_score <= 10:
                            score = extracted_score
                            break
                    except ValueError:
                        continue
            
            # Look for critique section
            critique_patterns = [
                r"(?i)CRITIQUE:\s*([\s\S]+)",
                r"(?i)ASSESSMENT:\s*([\s\S]+)",
                r"(?i)EVALUATION:\s*([\s\S]+)"
            ]
            
            for pattern in critique_patterns:
                critique_match = re.search(pattern, response)
                if critique_match:
                    critique = critique_match.group(1).strip()
                    break
            
            return score, critique
                
        except Exception as e:
            print(f"Error parsing judge response: {e}")
            return 0.0, response
    
    
            



class JudgmentStorage:
    """Handles storage and retrieval of judgment results"""
    
    def __init__(self, base_dir: str = "results/judge_records"):
        """Initialize storage with base directory"""
        self.base_dir = base_dir
        os.makedirs(self.base_dir, exist_ok=True)
    
    def save_judgment(self, judgment_data: Dict[str, Any], filename: Optional[str] = None) -> str:
        """
        Save judgment data to a JSON file
        
        Args:
            judgment_data: Dictionary of judgment data
            filename: Optional specific filename
            
        Returns:
            Path to saved file
        """
        if filename is None:
            # Generate filename from metadata
            debate_id = judgment_data.get("metadata", {}).get("debate_id", "unknown")
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            filename = f"judgment_{debate_id}_{timestamp}.json"
        
        filepath = os.path.join(self.base_dir, filename)
        
        try:
            with open(filepath, "w", encoding="utf-8") as f:
                json.dump(judgment_data, f, indent=2)
            print(f"Judgment saved to {filepath}")
            return filepath
        except Exception as e:
            print(f"Error saving judgment: {e}")
            return ""





## ----------------------------------------------------------------------------------------------------------------------------------------------
## ----------------------------------------------------------------------------------------------------------------------------------------------

def run_judging_pipeline(debate_file_path: str, client, judge_types: Optional[List[str]] = None,
                       judge_model: str = "phi4", word_limit: int = 300, 
                       print_opt: str = 'preview') -> Dict[str, Any]:
    """
    Run the complete judging pipeline on a debate
    
    Args:
        debate_file_path: Path to debate JSON file
        client: Client object with run_model method
        judge_types: List of judge types to run (defaults to all available)
        judge_model: Model to use for judging
        word_limit: Word limit for judge critiques
        print_opt: Controls output verbosity ('preview', 'full', or 'none')
        
    Returns:
        Complete judgment data
    """
    # Initialize components
    judge_prompt_path = r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\judge_prompts.yml"
    judge_prompt_manager = JudgePromptManager(judge_prompt_path)
    debate_loader = DebateResultsLoader(debate_file_path)
    judge_evaluator = JudgeEvaluator(client, judge_prompt_manager, word_limit)
    judge_evaluator.model = judge_model
    
    # Set judge types to all available if not specified
    if judge_types is None:
        judge_types = judge_prompt_manager.list_available_judges()
    
    # Get debate metadata and stances
    metadata = debate_loader.get_metadata()
    stances = debate_loader.get_stances()
    topic = metadata.get("topic", "unknown")
    
    print(f"\n===== STARTING DEBATE JUDGING =====")
    print(f"Topic: {topic}")
    print(f"Judges: {judge_types}")
    print(f"Judge model: {judge_model}")

    # Prepare results structure
    results = {
        "metadata": metadata, 
        "timestamp": datetime.datetime.now().isoformat(), 
        "judgments": {}
    }
    
    # Run judgments for each stance
    for stance in stances:
        print(f"\n----- Evaluating {stance} Arguments -----")
        combined_arguments = debate_loader.get_combined_arguments(stance)
        
        # Skip if no arguments for this stance
        if not combined_arguments:
            print(f"No arguments found for {stance} stance, skipping...")
            continue
        
        stance_results = {}
        
        # Run each judge type
        for judge_type in judge_types:
            print(f"\nRunning {judge_type.upper()} judge...")
            score, critique, raw_evaluation = judge_evaluator.evaluate_argument( judge_type, stance, topic, combined_arguments )
            
            # Store results
            stance_results[judge_type] = {
                "score": score, "critique": critique, "raw_evaluation": raw_evaluation,
                "timestamp": datetime.datetime.now().isoformat()
            }
            
            # Print score and critique based on print_opt
            print(f"{judge_type.upper()} Score: {score}/10")
            
            if print_opt == 'full':
                ## print(f"Critique:\n{'-' * 80}\n{critique}\n{'-' * 80}")
                print(f"Raw Evaluation:\n{'-' * 80}\n{raw_evaluation}\n{'-' * 80}")
            elif print_opt == 'preview':
                critique_preview = critique[:100] + "..." if len(critique) > 100 else critique
                print(f"Critique preview: {critique_preview}")
        
        # Store all judgments for this stance
        results["judgments"][stance] = stance_results
    
    # Save results
    storage = JudgmentStorage(r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\results\judge_records")
    filepath = storage.save_judgment(results)
    
    print(f"\n===== JUDGING COMPLETE =====")
    print(f"Results saved to: {filepath}")
    
    return results


In [37]:
if __name__ == "__main__":
    
    # Create client
    client = OllamaDebateManager()
    
    # Test file path
    db_file_path_prefix = r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\results\perfect_debate_transcripts"
    db_file_name = r"debate_1744506254_The_humanities_are_as_valuable_as_STEM_in_modern_education_and_should_receive_equal_funding_.json"
    debate_file_path = os.path.join(db_file_path_prefix, db_file_name)
    
    # Configure which judges to run

    # subset. bleh.
    # judge_types = ["logical", "factual", "rhetorical", "belief_shift"] 

    judge_types = None  # Run all available judges
    
    # Run the judging pipeline
    judgment_results = run_judging_pipeline(
        debate_file_path=debate_file_path,
        client=client,
        judge_types=judge_types,
        judge_model="deepseek_r1",  # Using phi4 for judging
        word_limit=300, 
        print_opt='full'
    )
    
    # Print summary
    print("\n===== JUDGMENT SUMMARY =====")
    for stance, judges in judgment_results["judgments"].items():
        print(f"\n{stance} Argument Scores:")
        for judge_type, data in judges.items():
            print(f"  {judge_type.upper()}: {data['score']}/10")

Loaded judge prompts from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\judge_prompts.yml
Loaded debate from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\results\perfect_debate_transcripts\debate_1744506254_The_humanities_are_as_valuable_as_STEM_in_modern_education_and_should_receive_equal_funding_.json

===== STARTING DEBATE JUDGING =====
Topic: The humanities are as valuable as STEM in modern education and should receive equal funding.
Judges: ['factual', 'logical', 'rhetorical', 'belief_shift', 'audience', 'strategic', 'ethical']
Judge model: deepseek_r1

----- Evaluating AGAINST Arguments -----

Running FACTUAL judge...

Running FACTUAL judge on AGAINST argument...
Available models: ['dolphin-llama3:8b', 'qwen2.5:14b', 'yi:9b', 'gemma3:27b', 'phi4:latest', 'deepseek-r1:14b', 'phi3:14b', 'mixtral:8x7b']
Looking for: deepseek-r1:14b
Sending request to deepseek-r1:14b...
Judge evaluation completed in 20.60 se

## Multi Judge Setup on Older Cell. 

In [38]:
import pandas as pd
from IPython.display import display, HTML
from typing import List, Dict, Any, Optional
import time

def run_multiple_judges_pipeline(debate_file_path: str, client,
                               judge_types: Optional[List[str]] = None,
                               judge_models: List[str] = ["deepseek_r1", "phi4", "mixtral"],
                               word_limit: int = 300,
                               print_opt: str = 'preview') -> Dict[str, Any]:
    """
    Run debate judging with multiple judge models and collect results
    
    Args:
        debate_file_path: Path to debate JSON file
        client: Client object with run_model method
        judge_types: List of judge types to run (defaults to all available)
        judge_models: List of models to use as judges
        word_limit: Word limit for judge critiques
        print_opt: Controls output verbosity ('preview', 'full', or 'none')
        
    Returns:
        Combined results from all judge models
    """
    all_results = {}
    all_scores = {}
    
    # Run each judge model in sequence
    for judge_model in judge_models:
        print(f"\n\n{'=' * 80}")
        print(f"RUNNING JUDGMENTS WITH MODEL: {judge_model}")
        print(f"{'=' * 80}")
        
        try:
            # Run the standard pipeline with this judge model
            results = run_judging_pipeline(
                debate_file_path=debate_file_path,
                client=client,
                judge_types=judge_types,
                judge_model=judge_model,
                word_limit=word_limit,
                print_opt=print_opt
            )
            
            # Store results
            all_results[judge_model] = results
            
            # Extract and store scores
            scores = {}
            for stance, judges in results.get("judgments", {}).items():
                if stance not in scores:
                    scores[stance] = {}
                for judge_type, data in judges.items():
                    scores[stance][f"{judge_model}_{judge_type}"] = data.get("score", 0)
            
            all_scores.update(scores)
            
            # Small delay between models to avoid rate limiting issues
            time.sleep(2)
            
        except Exception as e:
            print(f"Error running {judge_model} as judge: {e}")
    
    # Display summary scores in a nice table
    display_judgment_table(all_results)
    
    # Return combined results
    return {
        "all_results": all_results,
        "all_scores": all_scores
    }


def display_judgment_table(all_results: Dict[str, Any]):
    """
    Create and display an HTML table of judgment scores
    
    Args:
        all_results: Combined results from all judge models
    """
    # Check if we have any results
    if not all_results:
        print("No judgment results to display")
        return
    
    # Get list of stances and judge types from the first result
    first_model = next(iter(all_results.values()))
    stances = list(first_model.get("judgments", {}).keys())
    
    # Create a structured data frame
    table_data = []
    
    for stance in stances:
        # For each judge model
        for judge_model, results in all_results.items():
            if stance not in results.get("judgments", {}):
                continue
                
            # For each judge type
            for judge_type, data in results.get("judgments", {}).get(stance, {}).items():
                score = data.get("score", 0)
                row = {
                    "Stance": stance,
                    "Judge Model": judge_model,
                    "Judge Type": judge_type.capitalize(),
                    "Score": f"{score}/10"
                }
                table_data.append(row)
    
    # Create DataFrame and display
    df = pd.DataFrame(table_data)
    
    # Calculate average scores per stance and judge model
    summary_data = []
    for stance in stances:
        for judge_model in all_results.keys():
            scores = [
                data.get("score", 0) 
                for data in all_results.get(judge_model, {}).get("judgments", {}).get(stance, {}).values()
            ]
            if scores:
                avg_score = sum(scores) / len(scores)
                summary_data.append({
                    "Stance": stance,
                    "Judge Model": judge_model,
                    "Judge Type": "AVERAGE",
                    "Score": f"{avg_score:.1f}/10"
                })
    
    # Add summary rows
    summary_df = pd.DataFrame(summary_data)
    df = pd.concat([df, summary_df]).reset_index(drop=True)
    
    # Create pivoted view for easier comparison
    pivot_df = df.pivot_table(
        index=["Stance", "Judge Type"],
        columns="Judge Model",
        values="Score",
        aggfunc="first"
    ).reset_index()
    
    # Style the table
    styled_table = pivot_df.style.set_table_styles([
        {'selector': 'th', 'props': [('background-color', '#f2f2f2'), 
                                     ('color', 'black'),
                                     ('font-weight', 'bold'),
                                     ('text-align', 'center'),
                                     ('padding', '8px')]},
        {'selector': 'td', 'props': [('padding', '8px'), ('text-align', 'center')]},
        {'selector': 'caption', 'props': [('caption-side', 'top'), 
                                          ('font-weight', 'bold'),
                                          ('font-size', '1.2em'),
                                          ('padding', '8px')]}
    ]).set_caption("Debate Judgment Scores by Model and Judge Type")
    
    # Highlight AVERAGE rows
    def highlight_average(row):
        is_average = row["Judge Type"] == "AVERAGE"
        return ['background-color: #fff3e0' if is_average else '' for _ in row]
    
    styled_table = styled_table.apply(highlight_average, axis=1)
    
    # Display the table
    display(HTML(styled_table.to_html()))
    
    # Also show a model comparison table (average scores per judge model)
    model_comparison = []
    for stance in stances:
        for judge_model in all_results.keys():
            model_data = all_results.get(judge_model, {}).get("judgments", {}).get(stance, {})
            if model_data:
                avg_score = sum(data.get("score", 0) for data in model_data.values()) / len(model_data)
                model_comparison.append({
                    "Stance": stance,
                    "Judge Model": judge_model,
                    "Average Score": f"{avg_score:.1f}/10"
                })
    
    model_df = pd.DataFrame(model_comparison)
    print("\nModel Comparison (Average Scores):")
    display(model_df.pivot(index="Stance", columns="Judge Model", values="Average Score"))



## ---------------------------------------------------------------------------------------------------------------------------------------
## ---------------------------------------------------------------------------------------------------------------------------------------


# Create client
client = OllamaDebateManager()

# Test file path
db_file_path_prefix = r"D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\results\perfect_debate_transcripts"
db_file_name = r"debate_1744506254_The_humanities_are_as_valuable_as_STEM_in_modern_education_and_should_receive_equal_funding_.json"
debate_file_path = os.path.join(db_file_path_prefix, db_file_name)

# Run multiple judge models
judgment_results = run_multiple_judges_pipeline(
    debate_file_path=debate_file_path,
    client=client,
    ## - judge_types=["logical", "factual", "rhetorical", "belief_shift"],
    judge_types=None,
    judge_models=["deepseek_r1", "qwen2.5", "yi", "phi4", "gemma3" ],  # multiple models
    word_limit=270,
    print_opt='full' 
)




RUNNING JUDGMENTS WITH MODEL: deepseek_r1
Loaded judge prompts from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\judge_prompts.yml
Loaded debate from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\results\perfect_debate_transcripts\debate_1744506254_The_humanities_are_as_valuable_as_STEM_in_modern_education_and_should_receive_equal_funding_.json

===== STARTING DEBATE JUDGING =====
Topic: The humanities are as valuable as STEM in modern education and should receive equal funding.
Judges: ['factual', 'logical', 'rhetorical', 'belief_shift', 'audience', 'strategic', 'ethical']
Judge model: deepseek_r1

----- Evaluating AGAINST Arguments -----

Running FACTUAL judge...

Running FACTUAL judge on AGAINST argument...
Available models: ['dolphin-llama3:8b', 'qwen2.5:14b', 'yi:9b', 'gemma3:27b', 'phi4:latest', 'deepseek-r1:14b', 'phi3:14b', 'mixtral:8x7b']
Looking for: deepseek-r1:14b
Sending request to deepseek-r1:1

Judge Model,Stance,Judge Type,deepseek_r1,gemma3,phi4,qwen2.5,yi
0,AGAINST,AVERAGE,6.4/10,6.6/10,6.1/10,7.1/10,3.3/10
1,AGAINST,Audience,7.5/10,7.0/10,7.0/10,7.5/10,0.0/10
2,AGAINST,Belief_shift,7.0/10,6.5/10,6.0/10,7.0/10,0.0/10
3,AGAINST,Ethical,7.0/10,6.0/10,6.0/10,7.0/10,0.0/10
4,AGAINST,Factual,4.0/10,6.0/10,5.0/10,6.0/10,6.0/10
5,AGAINST,Logical,7.0/10,6.0/10,5.0/10,7.0/10,8.0/10
6,AGAINST,Rhetorical,6.0/10,7.5/10,7.0/10,7.0/10,9.0/10
7,AGAINST,Strategic,6.0/10,7.0/10,7.0/10,8.0/10,0.0/10
8,FOR,AVERAGE,7.4/10,7.6/10,7.5/10,7.9/10,3.6/10
9,FOR,Audience,8.5/10,8.0/10,7.5/10,8.0/10,9.5/10



Model Comparison (Average Scores):


Judge Model,deepseek_r1,gemma3,phi4,qwen2.5,yi
Stance,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
AGAINST,6.4/10,6.6/10,6.1/10,7.1/10,3.3/10
FOR,7.4/10,7.6/10,7.5/10,7.9/10,3.6/10


### Recreating table; with same output as above.
### Debate Judgment Scores by Model and Judge Type


```

Loaded judge prompts from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\prompts\judge_prompts.yml
Loaded debate from D:\JoelDesktop folds_24\NEU SPRING25 - DL, HCI\Projects\Multi-Agent LLM Debator\results\perfect_debate_transcripts\debate_1744506254_The_humanities_are_as_valuable_as_STEM_in_modern_education_and_should_receive_equal_funding_.json


FOR averaged 7.4-7.9 across models
AGAINST averaged 6.1-7.1 across models

This consistent gap suggests the models are genuinely detecting qualitative differences between the two sides, not just randomly assigning scores. The fact that they've independently arrived at similar conclusions reinforces the reliability of these evaluations.
```

| Judge Model | Stance  | Judge Type   | deepseek_r1 | gemma3 | phi4  | qwen2.5 | yi     |
|-------------|---------|--------------|-------------|--------|-------|---------|--------|
| 0           | AGAINST | AVERAGE      | 6.4/10      | 6.6/10 | 6.1/10| 7.1/10  | 3.3/10 |
| 1           | AGAINST | Audience     | 7.5/10      | 7.0/10 | 7.0/10| 7.5/10  | 0.0/10 |
| 2           | AGAINST | Belief_shift | 7.0/10      | 6.5/10 | 6.0/10| 7.0/10  | 0.0/10 |
| 3           | AGAINST | Ethical      | 7.0/10      | 6.0/10 | 6.0/10| 7.0/10  | 0.0/10 |
| 4           | AGAINST | Factual      | 4.0/10      | 6.0/10 | 5.0/10| 6.0/10  | 6.0/10 |
| 5           | AGAINST | Logical      | 7.0/10      | 6.0/10 | 5.0/10| 7.0/10  | 8.0/10 |
| 6           | AGAINST | Rhetorical   | 6.0/10      | 7.5/10 | 7.0/10| 7.0/10  | 9.0/10 |
| 7           | AGAINST | Strategic    | 6.0/10      | 7.0/10 | 7.0/10| 8.0/10  | 0.0/10 |
| 8           | FOR     | AVERAGE      | 7.4/10      | 7.6/10 | 7.5/10| 7.9/10  | 3.6/10 |
| 9           | FOR     | Audience     | 8.5/10      | 8.0/10 | 7.5/10| 8.0/10  | 9.5/10 |
| 10          | FOR     | Belief_shift | 8.0/10      | 7.0/10 | 7.0/10| 8.0/10  | 0.0/10 |
| 11          | FOR     | Ethical      | 7.0/10      | 7.5/10 | 8.0/10| 8.0/10  | 9.0/10 |
| 12          | FOR     | Factual      | 6.0/10      | 7.0/10 | 7.0/10| 7.0/10  | 7.0/10 |
| 13          | FOR     | Logical      | 7.0/10      | 7.0/10 | 7.0/10| 8.0/10  | 8.0/10 |
| 14          | FOR     | Rhetorical   | 8.0/10      | 8.5/10 | 7.0/10| 8.0/10  | 9.0/10 |
| 15          | FOR     | Strategic    | 7.0/10      | 8.0/10 | 8.0/10| 8.0/10  | 0.0/10 |
