# Multi-Model Behavioral Validation - Study 4

This notebook refactors the original Study 4 moral-reasoning and risk-taking simulations to run against multiple LLMs through the unified `portal.py` interface.

## Models to Test
- GPT-4
- GPT-4o  
- Llama-3.3-70B-Instruct
- DeepSeek-V3

## Simulations
1. **Moral Reasoning**: 5 ethical dilemma scenarios
2. **Risk Taking**: Risk assessment scenarios

## Data Flow
1. Load personality-assigned participants data
2. Run moral reasoning simulations across all models
3. Run risk-taking simulations across all models
4. Save results for comparative analysis

In [ ]:
import pandas as pd
import sys
from pathlib import Path

# Add shared modules to path
sys.path.append('../shared')

from simulation_utils import (
    SimulationConfig, 
    run_moral_simulation,
    run_risk_simulation,
    retry_failed_participants,
    save_simulation_results
)

## Data Loading

In [ ]:
# Load the personality data (from Study 4 original data).
# The path is relative, so it is resolved against the current working directory.
data_path = Path('../../study_4/data/york_data_clean.csv')
if not data_path.exists():
    # Fail fast with guidance: the cells below all read `df`.
    print(f"Data file not found at {data_path}")
    # Show the absolute location so a wrong working directory is easy to spot.
    print("Expected path:", data_path.resolve())
    print("Please ensure the Study 4 data is available.")
    raise FileNotFoundError(f"Data file not found: {data_path}")

df = pd.read_csv(data_path)
print(f"Loaded data shape: {df.shape}")
# Quick sanity check: list at most the first 10 column names containing 'bfi'.
print(f"Columns with 'bfi' in name: {[col for col in df.columns if 'bfi' in col.lower()][:10]}")

# Keep this as the last top-level expression of the cell: Jupyter renders only
# a trailing top-level expression, so a `df.head()` nested inside an `else:`
# branch is evaluated and silently discarded.
df.head()

In [None]:
# Check for the personality combination column.
# A name containing 'bfi_combined' also contains 'combined', so a single
# case-insensitive substring test covers both cases.
personality_columns = [col for col in df.columns if 'combined' in col.lower()]
print(f"Available personality columns: {personality_columns}")

# Use the appropriate personality column: prefer the exact 'bfi_combined'
# name, otherwise fall back to the first column that looks like a combined one.
if 'bfi_combined' in df.columns:
    personality_key = 'bfi_combined'
elif personality_columns:
    personality_key = personality_columns[0]
else:
    # Deliberately non-fatal: keep the default key so later cells still have a
    # value, but warn that the column does not exist yet.
    print("Warning: No combined personality column found. You may need to create it first.")
    personality_key = 'bfi_combined'

print(f"Using personality key: {personality_key}")

# Preview a personality description.
# Guard against an empty frame (iloc[0] would raise IndexError) and coerce to
# str so a missing/NaN or otherwise non-string cell cannot break the slice.
if personality_key in df.columns and len(df) > 0:
    print("\nSample personality description:")
    print(str(df.iloc[0][personality_key])[:500] + "...")

In [None]:
# Turn the DataFrame into a list with one dict per row (column name -> value);
# this list is what the simulation and retry calls receive as participants_data.
participants_data = df.to_dict(orient='records')
print(
    f"Prepared {len(participants_data)} participants for simulation"
)

## Simulation Configuration

In [None]:
# Experiment grid: every model below is run at every temperature below.
models_to_test = [
    'gpt-4',
    'gpt-4o',
    'llama',
    'deepseek',
]
# Test both deterministic (0.0) and stochastic (1.0) responses.
temperatures = [0.0, 1.0]
# Conservative batch size for complex scenarios.
batch_size = 20

# Results storage, filled by the simulation cells and keyed by
# "<model>_temp<temperature>".
moral_results, risk_results = {}, {}

## Moral Reasoning Simulations

In [ ]:
print("Starting Moral Reasoning Simulations")
print("=" * 60)

# Sweep the full model x temperature grid, storing each run in `moral_results`.
for model in models_to_test:
    for temperature in temperatures:
        print(f"\nRunning moral simulation: {model} with temperature {temperature}")

        # Built once so the success and failure paths always agree on the key.
        # The retry cell parses this "<model>_temp<temperature>" format back.
        key = f"{model}_temp{temperature}"

        config = SimulationConfig(
            model=model,
            temperature=temperature,
            batch_size=batch_size,
            max_workers=8
        )

        try:
            results = run_moral_simulation(
                participants_data=participants_data,
                config=config,
                output_dir="study_4_results/moral"
            )

            # Store results
            moral_results[key] = results

            # Check for failures: a failed participant is a dict carrying an 'error' key.
            failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
            total = len(results)
            # Guard the division: an empty result list would otherwise raise
            # ZeroDivisionError, and the handler below would then overwrite the
            # stored results with an error entry.
            success_rate = ((total - failed_count) / total) * 100 if total else 0.0

            print(f"Completed: {total} participants, {failed_count} failed, {success_rate:.1f}% success rate")

        except Exception as e:
            # Top-level boundary: one failing model/temperature combination
            # must not abort the rest of the sweep, so record it and move on.
            print(f"Error in moral simulation {model} temp {temperature}: {str(e)}")
            moral_results[key] = {"error": str(e)}

print(f"\nMoral simulations completed. Results: {list(moral_results.keys())}")

## Risk-Taking Simulations

In [None]:
print("Starting Risk-Taking Simulations")
print("=" * 60)

# Sweep the full model x temperature grid, storing each run in `risk_results`.
for model in models_to_test:
    for temperature in temperatures:
        print(f"\nRunning risk simulation: {model} with temperature {temperature}")

        # Built once so the success and failure paths always agree on the key.
        # The retry cell parses this "<model>_temp<temperature>" format back.
        key = f"{model}_temp{temperature}"

        config = SimulationConfig(
            model=model,
            temperature=temperature,
            batch_size=batch_size,
            max_workers=8
        )

        try:
            results = run_risk_simulation(
                participants_data=participants_data,
                config=config,
                output_dir="study_4_results/risk"
            )

            # Store results
            risk_results[key] = results

            # Check for failures: a failed participant is a dict carrying an 'error' key.
            failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
            total = len(results)
            # Guard the division: an empty result list would otherwise raise
            # ZeroDivisionError, and the handler below would then overwrite the
            # stored results with an error entry.
            success_rate = ((total - failed_count) / total) * 100 if total else 0.0

            print(f"Completed: {total} participants, {failed_count} failed, {success_rate:.1f}% success rate")

        except Exception as e:
            # Top-level boundary: one failing model/temperature combination
            # must not abort the rest of the sweep, so record it and move on.
            print(f"Error in risk simulation {model} temp {temperature}: {str(e)}")
            risk_results[key] = {"error": str(e)}

print(f"\nRisk simulations completed. Results: {list(risk_results.keys())}")

## Retry Failed Participants

In [ ]:
# Retry failed participants if needed
def _retry_failed_runs(results_by_key, label, prompt_module):
    """Re-run failed participants for every stored run, updating the dict in place.

    Args:
        results_by_key: Mapping of "<model>_temp<temperature>" to either a list
            of per-participant results or an {"error": ...} dict for a run that
            failed as a whole. Only list entries are retried.
        label: Scenario name used in the progress message ("moral" or "risk").
        prompt_module: Name of the module whose ``get_prompt`` function is
            passed to ``retry_failed_participants`` as the prompt generator.
    """
    import importlib

    # Loaded lazily so the prompt module is imported only if a retry is needed,
    # and resolved once rather than on every loop iteration.
    prompt_generator = None

    # Iterate over a snapshot because entries are reassigned inside the loop.
    for key, results in list(results_by_key.items()):
        if not isinstance(results, list):
            continue

        failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
        if failed_count == 0:
            continue

        print(f"Retrying {failed_count} failed participants for {label} {key}")

        # Split on the LAST '_temp' so a model name that itself contains
        # '_temp' is still parsed correctly.
        model, _, temperature_text = key.rpartition('_temp')

        # NOTE(review): unlike the initial runs, no max_workers is passed here,
        # so SimulationConfig's own default applies - confirm this is intended.
        config = SimulationConfig(
            model=model,
            temperature=float(temperature_text),
            batch_size=batch_size
        )

        if prompt_generator is None:
            prompt_generator = importlib.import_module(prompt_module).get_prompt

        results_by_key[key] = retry_failed_participants(
            results=results,
            participants_data=participants_data,
            prompt_generator=prompt_generator,
            config=config,
            personality_key=personality_key
        )


print("Retrying failed participants...")

# Retry moral reasoning failures
_retry_failed_runs(moral_results, "moral", "moral_stories")

# Retry risk-taking failures
_retry_failed_runs(risk_results, "risk", "risk_taking")

print("Retry process completed")

# Save simulation results
import json
from datetime import datetime


def _summarize_runs(results_by_key):
    """Return per-run success statistics for every run that produced a result list.

    Args:
        results_by_key: Mapping of run key to either a list of per-participant
            results or an {"error": ...} dict. Non-list entries are skipped.

    Returns:
        Dict mapping each run key to its "total", "successful" and
        "success_rate" (a percentage; 0.0 when the run has no results).
    """
    summary = {}
    for key, results in results_by_key.items():
        if not isinstance(results, list):
            continue
        total = len(results)
        # A participant counts as failed when its result is a dict with an 'error' key.
        successful = sum(1 for r in results if not (isinstance(r, dict) and 'error' in r))
        summary[key] = {
            "total": total,
            "successful": successful,
            # Guard against an empty result list instead of raising ZeroDivisionError.
            "success_rate": (successful / total) * 100 if total else 0.0,
        }
    return summary


results_dir = Path("study_4_results")
results_dir.mkdir(exist_ok=True)

# Save basic summary: the experiment configuration plus per-run statistics.
experiment_summary = {
    "timestamp": datetime.now().isoformat(),
    "study": "Study 4 - Behavioral Validation",
    "models_tested": models_to_test,
    "temperatures": temperatures,
    "batch_size": batch_size,
    "total_participants": len(participants_data),
    "personality_key": personality_key,
    "moral_results_summary": _summarize_runs(moral_results),
    "risk_results_summary": _summarize_runs(risk_results),
}

# Save summary. The encoding is explicit so the file does not depend on the
# platform's default locale.
with open(results_dir / "study4_experiment_summary.json", "w", encoding="utf-8") as f:
    json.dump(experiment_summary, f, indent=4)

print(f"Study 4 simulation completed. Results saved to {results_dir}/")