In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import json
from datetime import datetime

# Add shared modules to path
sys.path.append('../shared')

from simulation_utils import (
    SimulationConfig, 
    run_batch_simulation,
    run_enhanced_bfi_to_minimarker_simulation
)
from binary_baseline_prompt import (
    generate_binary_personality_description,
    get_binary_prompt,
    create_binary_participant_data
)

# Confirm the environment is ready and record when this run began.
print("Setup complete!")
print(f"Analysis started at: {datetime.now():%Y-%m-%d %H:%M:%S}")


Setup complete!
Analysis started at: 2025-07-15 09:53:48


In [2]:
# Read the simulated BFI-2 dataset; stop early with guidance when the CSV is absent.
data_path = Path('facet_lvl_simulated_data.csv')
if not data_path.exists():
    print(f"Simulated data file not found at {data_path}")
    print("Please run bfi2_facet_level_parameter_extraction_and_simulation.py first to generate the data")
    raise FileNotFoundError(f"Data file not found: {data_path}")

data = pd.read_csv(data_path)

# The five domain-score columns are selected once and reused for the range report.
domain_cols = ['bfi_e', 'bfi_a', 'bfi_c', 'bfi_n', 'bfi_o']
domain_scores = data[domain_cols]
detected_domain_cols = [col for col in data.columns if col.startswith('bfi_') and len(col) == 5]

print(f"Loaded simulated data shape: {data.shape}")
print(f"Columns: {list(data.columns[:10])}...")  # only the first 10 column names
print(f"Domain score columns: {detected_domain_cols}")
print(f"Domain score range: {domain_scores.min().min():.2f} to {domain_scores.max().max():.2f}")
data.head()


Loaded simulated data shape: (200, 80)
Columns: ['bfi1', 'bfi16', 'bfi31', 'bfi46', 'bfi6', 'bfi21', 'bfi36', 'bfi51', 'bfi11', 'bfi26']...
Domain score columns: ['bfi_e', 'bfi_a', 'bfi_c', 'bfi_n', 'bfi_o']
Domain score range: 1.33 to 5.00


Unnamed: 0,bfi1,bfi16,bfi31,bfi46,bfi6,bfi21,bfi36,bfi51,bfi11,bfi26,...,bfi_n_depression,bfi_n_emotional_volatility,bfi_o_intellectual_curiosity,bfi_o_aesthetic_sensitivity,bfi_o_creative_imagination,bfi_e,bfi_a,bfi_c,bfi_n,bfi_o
0,3.0,2.0,4.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,...,2.0,4.5,4.0,4.5,3.25,2.666667,3.833333,2.333333,3.75,3.916667
1,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,...,2.75,4.5,2.5,3.25,3.0,3.25,2.833333,4.083333,3.5,2.916667
2,3.0,3.0,4.0,1.0,1.0,2.0,3.0,4.0,2.0,3.0,...,2.0,4.5,3.5,4.75,4.25,2.666667,2.75,3.833333,3.833333,4.166667
3,5.0,1.0,2.0,5.0,5.0,5.0,1.0,1.0,2.0,2.0,...,4.25,2.25,4.75,4.25,3.75,4.583333,3.166667,2.916667,3.583333,4.25
4,3.0,4.0,3.0,3.0,1.0,2.0,4.0,3.0,2.0,4.0,...,3.0,1.75,4.5,4.0,4.5,2.75,4.333333,2.5,3.0,4.333333


In [3]:
# Build one record per row of `data`, renaming the bfi_* domain columns to the
# bfi2_* keys used by the binary baseline helpers.
print("Preparing participant data for binary baseline format...")

domain_key_map = {
    'bfi_e': 'bfi2_e',
    'bfi_a': 'bfi2_a',
    'bfi_c': 'bfi2_c',
    'bfi_n': 'bfi2_n',
    'bfi_o': 'bfi2_o',
}
participants_data = [
    {
        'participant_id': idx,
        **{target_key: row[source_col] for source_col, target_key in domain_key_map.items()},
    }
    for idx, row in data.iterrows()
]

print(f"Created {len(participants_data)} participant records")

# Preview up to three of the raw records.
print("\nSample participant data:")
for number, record in enumerate(participants_data[:3], start=1):
    print(f"Participant {number}: {record}")

# Attach the binary (high/low) personality description to every record.
# NOTE(review): threshold=2.5 sits below 3.0, the midpoint of the observed
# 1.33-5.00 domain score range, so a score such as 2.67 is labelled "high".
# Confirm this cutoff is intended before comparing against other formats.
print("\n=== GENERATING BINARY BASELINE DESCRIPTIONS ===")
participants_with_binary = create_binary_participant_data(participants_data, threshold=2.5)

# Preview up to three records together with their generated descriptions.
print("\nSample binary baseline personality descriptions:")
for number, p in enumerate(participants_with_binary[:3], start=1):
    print(f"\nParticipant {number}:")
    print(f"Domain scores: E={p['bfi2_e']:.2f}, A={p['bfi2_a']:.2f}, C={p['bfi2_c']:.2f}, N={p['bfi2_n']:.2f}, O={p['bfi2_o']:.2f}")
    print(f"Binary description: {p['binary_personality']}")


Preparing participant data for binary baseline format...
Created 200 participant records

Sample participant data:
Participant 1: {'participant_id': 0, 'bfi2_e': 2.6666666666666665, 'bfi2_a': 3.8333333333333335, 'bfi2_c': 2.3333333333333335, 'bfi2_n': 3.75, 'bfi2_o': 3.9166666666666665}
Participant 2: {'participant_id': 1, 'bfi2_e': 3.25, 'bfi2_a': 2.8333333333333335, 'bfi2_c': 4.083333333333333, 'bfi2_n': 3.5, 'bfi2_o': 2.9166666666666665}
Participant 3: {'participant_id': 2, 'bfi2_e': 2.6666666666666665, 'bfi2_a': 2.75, 'bfi2_c': 3.8333333333333335, 'bfi2_n': 3.8333333333333335, 'bfi2_o': 4.166666666666667}

=== GENERATING BINARY BASELINE DESCRIPTIONS ===

Sample binary baseline personality descriptions:

Participant 1:
Domain scores: E=2.67, A=3.83, C=2.33, N=3.75, O=3.92
Binary description: You are high in Extraversion. You are high in Agreeableness. You are low in Conscientiousness. You are high in Neuroticism. You are high in Openness.

Participant 2:
Domain scores: E=3.25, A=2.8

In [4]:
# Render the full prompt for the first participant so it can be inspected by eye.
if not participants_with_binary:
    print("No participants available for testing")
else:
    sample_personality = participants_with_binary[0]['binary_personality']
    sample_prompt = get_binary_prompt(sample_personality)
    rule = "=" * 80

    print(rule)
    print("COMPLETE BINARY BASELINE PROMPT SENT TO LLM")
    print(rule)
    print(sample_prompt)
    print(rule)
    print(f"Prompt length: {len(sample_prompt)} characters")
    print(f"Prompt word count: {len(sample_prompt.split())} words")


COMPLETE BINARY BASELINE PROMPT SENT TO LLM
### Your Assigned Personality ### 
Based on your personality profile below, please rate yourself on the following traits.
You are high in Extraversion. You are high in Agreeableness. You are low in Conscientiousness. You are high in Neuroticism. You are high in Openness.

### Context and Objective ###
You are participating in a study to help us understand human personality.

Your job is to fill out a personality questionnaire below. Your questionnaire answers should be reflective of your assigned personalities.

### Response Format ###
IMPORTANT: You must provide ratings for ALL 40 traits listed below.
Return ONLY a JSON object where:
- Keys are the exact trait names (e.g., "Bashful", "Bold", etc.)
- Values are numbers from 1-9 based on the rating scale
- Include ALL 40 traits - no more, no less
- Do NOT include personality domains like "Extraversion" or "Agreeableness"
- Do NOT add any text outside the JSON

Example format:
{
    "Bashful": 

In [5]:
# Models, temperatures and batch size shared by every simulation cell below.
models_to_test = ['openai-gpt-3.5-turbo-0125', 'gpt-4', 'gpt-4o', 'llama', 'deepseek']
temperatures = [1]  # temperature 1 gives stochastic responses
batch_size = 25  # kept small for stability across the different APIs

# Echo the configuration so it is recorded in the notebook output.
print(
    "Simulation Configuration:",
    f"  Models to test: {models_to_test}",
    f"  Temperatures: {temperatures}",
    f"  Batch size: {batch_size}",
    f"  Total combinations: {len(models_to_test) * len(temperatures)}",
    f"  Participants: {len(participants_with_binary)}",
    sep="\n",
)

# Sanity-check the shape of the first participant record.
first_participant = participants_with_binary[0]
print(f"\nSample participant data keys: {list(first_participant)}")
print(f"Has 'binary_personality' key: {'binary_personality' in first_participant}")


Simulation Configuration:
  Models to test: ['openai-gpt-3.5-turbo-0125', 'gpt-4', 'gpt-4o', 'llama', 'deepseek']
  Temperatures: [1]
  Batch size: 25
  Total combinations: 5
  Participants: 200

Sample participant data keys: ['participant_id', 'bfi2_e', 'bfi2_a', 'bfi2_c', 'bfi2_n', 'bfi2_o', 'binary_personality']
Has 'binary_personality' key: True


In [6]:
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Lock shared by every safe_print call so concurrent callers do not interleave output.
log_lock = threading.Lock()

def safe_print(message, prefix="INFO"):
    """Print ``[HH:MM:SS] <prefix>: <message>`` while holding ``log_lock``.

    Args:
        message: Text (or any printable object) to emit.
        prefix: Label placed before the message; defaults to ``"INFO"``.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    line = f"[{stamp}] {prefix}: {message}"
    with log_lock:
        print(line)

def run_simulation(model, temperature):
    """Run the binary-baseline batch simulation for one model/temperature pair.

    Args:
        model: Model identifier passed through to ``SimulationConfig``.
        temperature: Sampling temperature passed through to ``SimulationConfig``.

    Returns:
        A ``(simulation_id, results)`` tuple, where ``simulation_id`` is
        ``"<model>_temp<temperature>"``. ``results`` is whatever
        ``run_batch_simulation`` returned, or ``{"error": <message>}`` when
        the run raised an exception.
    """
    simulation_id = f"{model}_temp{temperature}"
    safe_print(f"Starting simulation: {model} (temp={temperature})", "START")

    config = SimulationConfig(
        model=model,
        temperature=temperature,
        batch_size=batch_size,
        max_workers=10,
        max_retries=5,
        base_wait_time=2.0,
        max_wait_time=60.0,
    )

    started_at = time.time()
    try:
        # 'binary_personality' is the record key holding the binary description.
        results = run_batch_simulation(
            participants_data=participants_with_binary,
            prompt_generator=get_binary_prompt,
            config=config,
            personality_key='binary_personality',
            output_dir="study_3_binary_results",
            output_filename="bfi_to_minimarker_binary",
        )

        # A failed participant is represented by a dict carrying an 'error' key.
        n_failed = len([r for r in results if isinstance(r, dict) and 'error' in r])
        elapsed = time.time() - started_at

        if n_failed == 0:
            safe_print(f"Completed {simulation_id} in {elapsed:.1f}s - All participants successful", "SUCCESS")
        else:
            safe_print(f"Completed {simulation_id} in {elapsed:.1f}s - WARNING: {n_failed} participants failed", "WARN")

        return simulation_id, results

    except Exception as exc:
        # Report the failure and hand back an error payload instead of raising,
        # so one failing model does not abort the other simulations.
        elapsed = time.time() - started_at
        safe_print(f"Failed {simulation_id} after {elapsed:.1f}s - Error: {str(exc)}", "ERROR")
        return simulation_id, {"error": str(exc)}

# Launch every model/temperature combination in parallel and gather the results.
banner = "=" * 80
print(banner)
print("STARTING ENHANCED BINARY BASELINE SIMULATIONS FOR STUDY 3")
print(f"Models: {models_to_test}")
print(f"Temperatures: {temperatures}")
print(f"Total combinations: {len(models_to_test) * len(temperatures)}")
print(f"Participants: {len(participants_with_binary)}")
print(banner)

all_results = {}
start_time = time.time()

# One worker thread per model; each job is a single run_simulation call.
with ThreadPoolExecutor(max_workers=len(models_to_test)) as executor:
    job_args = [
        (model, temperature)
        for model in models_to_test
        for temperature in temperatures
    ]
    futures = [executor.submit(run_simulation, *args) for args in job_args]

    completed_count = 0
    total_jobs = len(futures)

    # Store each result under its simulation id as soon as its job finishes.
    for completed_count, future in enumerate(as_completed(futures), start=1):
        key, result = future.result()
        all_results[key] = result
        safe_print(f"Progress: {completed_count}/{total_jobs} simulations completed", "PROGRESS")

total_duration = time.time() - start_time
print("\n" + banner)
print(f"ALL SIMULATIONS COMPLETED IN {total_duration:.1f} SECONDS")
print(f"Results keys: {list(all_results.keys())}")
print(banner)


STARTING ENHANCED BINARY BASELINE SIMULATIONS FOR STUDY 3
Models: ['openai-gpt-3.5-turbo-0125', 'gpt-4', 'gpt-4o', 'llama', 'deepseek']
Temperatures: [1]
Total combinations: 5
Participants: 200
[09:53:48] START: Starting simulation: openai-gpt-3.5-turbo-0125 (temp=1)
Starting simulation for 200 participants using openai-gpt-3.5-turbo-0125
Temperature: 1, Batch size: 25
Processing participants 0 to 24
[09:53:48] START: Starting simulation: gpt-4 (temp=1)
Starting simulation for 200 participants using gpt-4
Temperature: 1, Batch size: 25
Processing participants 0 to 24
[09:53:48] START: Starting simulation: gpt-4o (temp=1)
Starting simulation for 200 participants using gpt-4o
Temperature: 1, Batch size: 25
Processing participants 0 to 24
[09:53:48] START: Starting simulation: llama (temp=1)
Starting simulation for 200 participants using llama
Temperature: 1, Batch size: 25
Processing participants 0 to 24
[09:53:48] START: Starting simulation: deepseek (temp=1)
Starting simulation for 200

In [7]:
# Retry any failed participants
print("Checking for failed participants and retrying if necessary...")

from simulation_utils import retry_failed_participants, save_simulation_results


def _parse_simulation_key(key):
    """Split a ``"<model>_temp<temperature>"`` key into ``(model, temperature)``.

    The split is taken at the LAST ``_temp`` so a model name that itself
    contains ``_temp`` is not truncated. The temperature is restored as an
    ``int`` when its text is integral (``"1"`` -> ``1``) and as a ``float``
    otherwise (``"0.7"`` -> ``0.7``), so the value matches the one the key was
    originally formatted from instead of always becoming a float.

    Raises:
        ValueError: If ``key`` has no ``_temp`` separator or the temperature
            text is not numeric.
    """
    model, separator, temperature_text = key.rpartition('_temp')
    if not separator:
        raise ValueError(f"Unrecognised simulation key: {key!r}")
    try:
        temperature = int(temperature_text)
    except ValueError:
        temperature = float(temperature_text)
    return model, temperature


retry_count = 0
# Iterate over a snapshot because entries are replaced inside the loop.
for key, results in list(all_results.items()):
    # Non-list entries are whole-simulation error payloads; nothing to retry per participant.
    if not isinstance(results, list):
        continue

    failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
    if failed_count == 0:
        continue

    print(f"Retrying {failed_count} failed participants for {key}")
    retry_count += 1

    # Recover the model and temperature of the original run from the key.
    # NOTE(review): coercing the temperature to float (1 -> 1.0) appears to
    # change the saved filename (a retry was written as "..._temp1_0.json");
    # verify against save_simulation_results that the original file is the
    # one being overwritten now.
    model, temperature = _parse_simulation_key(key)

    config = SimulationConfig(
        model=model,
        temperature=temperature,
        batch_size=batch_size
    )

    updated_results = retry_failed_participants(
        results=results,
        participants_data=participants_with_binary,
        prompt_generator=get_binary_prompt,  # Use binary-specific prompt function
        config=config,
        personality_key='binary_personality'
    )

    all_results[key] = updated_results

    # Surface participants that are still failing so they are not missed.
    if isinstance(updated_results, list):
        still_failed = sum(1 for r in updated_results if isinstance(r, dict) and 'error' in r)
        if still_failed > 0:
            print(f"WARNING: {still_failed} participants still failing for {key} after retry")

    # Save updated results
    save_simulation_results(updated_results, "study_3_binary_results", "bfi_to_minimarker_binary", config)

if retry_count == 0:
    print("No failed participants found - all simulations successful!")
else:
    print(f"Retry process completed for {retry_count} model(s)")


Checking for failed participants and retrying if necessary...
Retrying 5 failed participants for llama_temp1
Retrying participant 23
Llama error: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Error in get_personality_response: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Llama error: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Error in get_personality_response: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Successfully retried participant 23
Retrying participant 68
Successfully retried participant 68
Retrying participant 133
Successfully retried participant 133
Retrying participant 161
Successfully retried participant 161
Retrying participant 167
Successfully retried participant 167
Results saved to study_3_binary_results/bfi_to_minimarker_binary_llama_temp1_0.json
Retry process

In [8]:
# Summary of all results
print("=" * 80)
print("STUDY 3 BINARY BASELINE SIMULATION RESULTS SUMMARY")
print("=" * 80)

for key, results in all_results.items():
    # A non-list entry means the whole simulation failed; show its payload as-is.
    if not isinstance(results, list):
        print(f"\n{key}: FAILED - {results}")
        continue

    # Filter once; the same list feeds both the counts and the sample preview.
    successful_responses = [r for r in results if isinstance(r, dict) and 'error' not in r]
    total_participants = len(results)
    successful_participants = len(successful_responses)
    failed_participants = total_participants - successful_participants

    print(f"\n{key}:")
    print(f"  Total participants: {total_participants}")
    print(f"  Successful: {successful_participants}")
    print(f"  Failed: {failed_participants}")
    if total_participants > 0:
        print(f"  Success rate: {(successful_participants/total_participants)*100:.1f}%")
    else:
        # An empty result list would otherwise raise ZeroDivisionError here.
        print("  Success rate: n/a (no participants)")

    # Sample a successful response for validation
    if successful_responses:
        sample_response = successful_responses[0]
        print(f"  Sample response keys: {list(sample_response.keys())[:10]}...")  # Show first 10 keys
        print(f"  Sample response length: {len(sample_response)} traits")

print("\n" + "=" * 80)


STUDY 3 BINARY BASELINE SIMULATION RESULTS SUMMARY

openai-gpt-3.5-turbo-0125_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response keys: ['Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep', 'Disorganized', 'Efficient']...
  Sample response length: 40 traits

gpt-4o_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response keys: ['Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep', 'Disorganized', 'Efficient']...
  Sample response length: 40 traits

deepseek_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response keys: ['Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep', 'Disorganized', 'Efficient']...
  Sample response length: 40 traits

gpt-4_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response k

In [9]:
# Persist the prepared participant records and a metadata file describing this run.
output_path = Path('study_3_binary_results')
output_path.mkdir(exist_ok=True)

# Participant records, including the generated binary descriptions, as CSV.
preprocessed_csv = output_path / 'study3_binary_preprocessed_data.csv'
binary_data = pd.DataFrame(participants_with_binary)
binary_data.to_csv(preprocessed_csv, index=False)
print(f"Preprocessed data saved to {preprocessed_csv}")

# Per-simulation totals; entries that are not lists are recorded as 0 / 0.
results_summary = {}
for sim_key, sim_results in all_results.items():
    if isinstance(sim_results, list):
        n_successful = sum(1 for r in sim_results if isinstance(r, dict) and 'error' not in r)
        results_summary[sim_key] = {'total': len(sim_results), 'successful': n_successful}
    else:
        results_summary[sim_key] = {'total': 0, 'successful': 0}

metadata = {
    'simulation_type': 'study_3_binary_baseline',
    'models_tested': models_to_test,
    'temperatures': temperatures,
    'batch_size': batch_size,
    'total_participants': len(participants_with_binary),
    'simulation_date': datetime.now().isoformat(),
    'data_source': 'statistically_simulated_bfi2',
    'format': 'binary_baseline',
    'threshold': 2.5,
    'key_differences_from_other_formats': [
        'Uses binary high/low classifications instead of continuous descriptions',
        'Threshold-based conversion (2.5 cutoff) from continuous domain scores',
        'Simplified personality descriptions for baseline comparison',
        'Uses get_binary_prompt instead of get_expanded_prompt or get_likert_prompt',
    ],
    'results_summary': results_summary,
}

metadata_json = output_path / 'simulation_metadata.json'
with open(metadata_json, 'w') as f:
    json.dump(metadata, f, indent=2)

print(f"Simulation metadata saved to {metadata_json}")

# Closing banner with the suggested follow-up steps.
closing_rule = "=" * 60
print("\n" + closing_rule)
print("STUDY 3 BINARY BASELINE SIMULATION COMPLETE!")
for closing_line in (
    "\nNext steps:",
    "1. Run convergent validity analysis on the results",
    "2. Compare with Study 3 Expanded and Likert results for format differences",
    "3. Compare with Study 2 binary baseline results for study differences",
    "4. Results are saved in study_3_binary_results/ directory",
    "5. Binary baseline serves as simplified comparison to complex formats",
):
    print(closing_line)
print(closing_rule)


Preprocessed data saved to study_3_binary_results/study3_binary_preprocessed_data.csv
Simulation metadata saved to study_3_binary_results/simulation_metadata.json

STUDY 3 BINARY BASELINE SIMULATION COMPLETE!

Next steps:
1. Run convergent validity analysis on the results
2. Compare with Study 3 Expanded and Likert results for format differences
3. Compare with Study 2 binary baseline results for study differences
4. Results are saved in study_3_binary_results/ directory
5. Binary baseline serves as simplified comparison to complex formats
