In [1]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path
import json

# Add shared utilities to path.
# NOTE: '../shared' is a relative path, so Python resolves it against the
# kernel's current working directory at import time.
# Presumably simulation_utils and mini_marker_prompt live in that folder - TODO confirm.
sys.path.append('../shared')

# Updated imports to use the unified prompt system
from simulation_utils import (
    SimulationConfig, 
    run_batch_simulation,
    run_enhanced_bfi_to_minimarker_simulation
)
from mini_marker_prompt import (
    generate_binary_personality_description, 
    get_binary_prompt,
    create_binary_participant_data,
    validate_minimarker_response
)

# Reaching this line means both project imports above succeeded.
print("Binary baseline simulation utilities loaded successfully")

Binary baseline simulation utilities loaded successfully


In [2]:
# validate_minimarker_response is imported from mini_marker_prompt in the
# first cell, so this notebook does not define its own copy.

# Informational message only; this cell performs no computation.
print("Using unified prompt system from mini_marker_prompt.py")

Using unified prompt system from mini_marker_prompt.py


In [3]:
# Load the Soto dataset and print a quick overview of its column layout.
print("=== DEBUGGING DATA STRUCTURE ===")

# Load the original Soto dataset
data_path = Path('../../raw_data/Soto_data.xlsx')
if not data_path.exists():
    # Raise instead of calling exit(): inside a notebook exit() stops/kills the
    # kernel rather than reporting a readable error for this cell.
    raise FileNotFoundError(f"Error: Data file not found at {data_path}")

data = pd.read_excel(data_path)
print(f"Loaded {len(data)} participants from Soto dataset")

# Check ALL available columns
print(f"\nTotal columns: {len(data.columns)}")
print(f"All columns: {list(data.columns)}")

# Check specifically for BFI columns
bfi_cols = [col for col in data.columns if 'bfi' in str(col).lower()]
print(f"\nBFI-related columns found: {bfi_cols}")

# Check for domain score columns specifically.
# Match on the suffix only: a plain substring test also picks up unrelated
# columns such as 'rel_acquaintance' ('_a') or 'rel_other' ('_o').
domain_suffixes = ('_e', '_a', '_c', '_n', '_o')
domain_cols = [col for col in data.columns if str(col).lower().endswith(domain_suffixes)]
print(f"\nDomain score columns found: {domain_cols}")

# Show sample data for BFI columns, or fall back to the first few columns
if len(bfi_cols) > 0:
    print(f"\nSample BFI data:")
    print(data[bfi_cols].head())
else:
    print("\nNo BFI columns found - showing first 10 columns:")
    print(data.iloc[:5, :10])


=== DEBUGGING DATA STRUCTURE ===
Loaded 470 participants from Soto dataset

Total columns: 704
All columns: ['case_id', 'age', 'sex', 'ethnicity', 'rel_acquaintance', 'rel_friend', 'rel_roommate', 'rel_boygirlfriend', 'rel_relative', 'rel_other', 'rel_description', 'bfi1', 'bfi2', 'bfi3', 'bfi4', 'bfi5', 'bfi6', 'bfi7', 'bfi8', 'bfi9', 'bfi10', 'bfi11', 'bfi12', 'bfi13', 'bfi14', 'bfi15', 'bfi16', 'bfi17', 'bfi18', 'bfi19', 'bfi20', 'bfi21', 'bfi22', 'bfi23', 'bfi24', 'bfi25', 'bfi26', 'bfi27', 'bfi28', 'bfi29', 'bfi30', 'bfi31', 'bfi32', 'bfi33', 'bfi34', 'bfi35', 'bfi36', 'bfi37', 'bfi38', 'bfi39', 'bfi40', 'bfi41', 'bfi42', 'bfi43', 'bfi44', 'bfi45', 'bfi46', 'bfi47', 'bfi48', 'bfi49', 'bfi50', 'bfi51', 'bfi52', 'bfi53', 'bfi54', 'bfi55', 'bfi56', 'bfi57', 'bfi58', 'bfi59', 'bfi60', 'pbfi1', 'pbfi2', 'pbfi3', 'pbfi4', 'pbfi5', 'pbfi6', 'pbfi7', 'pbfi8', 'pbfi9', 'pbfi10', 'pbfi11', 'pbfi12', 'pbfi13', 'pbfi14', 'pbfi15', 'pbfi16', 'pbfi17', 'pbfi18', 'pbfi19', 'pbfi20', 'pbfi21', 'p

In [4]:
# Generate column names for TDA (Mini-Marker) and BFI-2 items
tda_columns = [f"tda{i}" for i in range(1, 41)]
sbfi_columns = [f"bfi{i}" for i in range(1, 61)]
selected_columns = tda_columns + sbfi_columns

print(f"Original data shape: {data.shape}")

# Remove rows with missing values in the selected columns - SAME AS LIKERT/EXPANDED
data = data.dropna(subset=selected_columns)
print(f"Data shape after removing missing values: {data.shape}")

# Create participant data with correct column names
print("=== CREATING PARTICIPANT DATA ===")

# domain_columns maps the key expected by the binary baseline format
# ('bfi2_<letter>') to the column name actually present in the data.
domain_letters = ('e', 'a', 'c', 'n', 'o')
if 'bfi2_e' in data.columns:
    # Use BFI-2 format
    domain_columns = {f'bfi2_{letter}': f'bfi2_{letter}' for letter in domain_letters}
    print("Using BFI-2 format columns")
elif 'bfi_e' in data.columns:
    # Use alternative BFI format
    domain_columns = {f'bfi2_{letter}': f'bfi_{letter}' for letter in domain_letters}
    print("Using alternative BFI format columns")
else:
    # Neither naming scheme is present: list likely candidates, then stop
    print("ERROR: Cannot find standard BFI domain columns")
    print("Available columns that might be domain scores:")
    potential_cols = [col for col in data.columns if any(x in col.lower() for x in ['extra', 'agree', 'consc', 'neuro', 'open'])]
    print(potential_cols)

    raise ValueError("Cannot identify BFI domain columns. Please check the data structure above.")

# The scheme was chosen from the '_e' column alone, so verify the other four
# once up front instead of warning on every row and failing later on a
# missing key.
missing_domain_columns = [col for col in domain_columns.values() if col not in data.columns]
if missing_domain_columns:
    raise ValueError(f"BFI domain columns missing from data: {missing_domain_columns}")

# Build one record per remaining row; participant_id is the DataFrame index label
participants_data = []
for idx, row in data.iterrows():
    participant = {'participant_id': idx}
    for expected_col, actual_col in domain_columns.items():
        participant[expected_col] = row[actual_col]
    participants_data.append(participant)

print(f"Created {len(participants_data)} participant records")

# Show sample participant data
print("\nSample participant data:")
for i, sample in enumerate(participants_data[:3], start=1):
    print(f"Participant {i}: {sample}")

# Generate binary personality descriptions
print("\n=== GENERATING BINARY DESCRIPTIONS ===")
participants_with_binary = create_binary_participant_data(participants_data)

# Display sample binary descriptions
print("\nSample binary personality descriptions:")
for i, p in enumerate(participants_with_binary[:2]):
    print(f"\nParticipant {i+1}:")
    print(f"Domain scores: E={p['bfi2_e']:.2f}, A={p['bfi2_a']:.2f}, C={p['bfi2_c']:.2f}, N={p['bfi2_n']:.2f}, O={p['bfi2_o']:.2f}")
    print(f"Binary description: {p['binary_personality'][:200]}...")

Original data shape: (470, 704)
Data shape after removing missing values: (438, 704)
=== CREATING PARTICIPANT DATA ===
Using BFI-2 format columns
Created 438 participant records

Sample participant data:
Participant 1: {'participant_id': 0, 'bfi2_e': 4.083333333333333, 'bfi2_a': 4.583333333333333, 'bfi2_c': 3.1666666666666665, 'bfi2_n': 2.4166666666666665, 'bfi2_o': 3.1666666666666665}
Participant 2: {'participant_id': 1, 'bfi2_e': 2.9166666666666665, 'bfi2_a': 3.1666666666666665, 'bfi2_c': 3.0, 'bfi2_n': 3.0, 'bfi2_o': 3.3333333333333335}
Participant 3: {'participant_id': 2, 'bfi2_e': 2.0833333333333335, 'bfi2_a': 4.083333333333333, 'bfi2_c': 3.8333333333333335, 'bfi2_n': 3.1666666666666665, 'bfi2_o': 4.416666666666667}

=== GENERATING BINARY DESCRIPTIONS ===

Sample binary personality descriptions:

Participant 1:
Domain scores: E=4.08, A=4.58, C=3.17, N=2.42, O=3.17
Binary description: You are high in Extraversion. You are outgoing, sociable, assertive, and energetic. You are high i

In [5]:
# Simulation settings shared by the cells below, plus a prompt smoke test.
print("=== SIMULATION CONFIGURATION ===")

# Candidate models (the original note says these names follow portal.py)
models_to_test = [
    "openai-gpt-3.5-turbo-0125",
    "gpt-4",
    "gpt-4o",
    "llama",
    "deepseek",
]

# Sampling and batching settings
temperature = 1.0
max_workers = 8
batch_size = 25  # kept small for stability across different APIs

print(f"Models to test: {models_to_test}")
print(f"Temperature: {temperature}")
print(f"Batch size: {batch_size}")
print(f"Max workers: {max_workers}")

print(f"Using {len(participants_with_binary)} participants for simulation")

# Exercise the prompt generator on the first participant, when there is one
if not participants_with_binary:
    print("No participants available for testing")
else:
    sample_personality = participants_with_binary[0]['binary_personality']
    sample_prompt = get_binary_prompt(sample_personality)
    print(f"\nSample prompt length: {len(sample_prompt)} characters")
    print("Sample prompt preview:")
    print(sample_prompt)

=== SIMULATION CONFIGURATION ===
Models to test: ['openai-gpt-3.5-turbo-0125', 'llama']
Temperature: 1.0
Batch size: 25
Max workers: 8
Using 438 participants for simulation

Sample prompt length: 2463 characters
Sample prompt preview:
### Your Assigned Personality ### 
Based on your personality profile below, please rate yourself on the following traits.
You are high in Extraversion. You are outgoing, sociable, assertive, and energetic. You are high in Agreeableness. You are compassionate, cooperative, trusting, and kind to others. You are high in Conscientiousness. You are organized, responsible, hardworking, and reliable. You are low in Neuroticism. You are emotionally stable, calm, and resilient under stress. You are high in Openness. You are curious, creative, open to new experiences, and intellectually engaged.

### Context and Objective ###
You are participating in a study to help us understand human personality.

Your job is to fill out a personality questionnaire below. Your qu

In [6]:
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
import time

# Lock shared by every safe_print call so one line is written at a time
log_lock = threading.Lock()

def safe_print(message, prefix="INFO"):
    """Print ``[HH:MM:SS] prefix: message`` while holding ``log_lock``.

    Args:
        message: Text (or any printable object) to emit.
        prefix: Tag placed before the message; defaults to "INFO".
    """
    timestamp = f"{datetime.now():%H:%M:%S}"
    log_lock.acquire()
    try:
        print(f"[{timestamp}] {prefix}: {message}")
    finally:
        log_lock.release()

def run_simulation(model, temperature):
    """Run the binary-baseline batch simulation for one model and log the outcome.

    Args:
        model: Model identifier passed to SimulationConfig.
        temperature: Sampling temperature passed to SimulationConfig.

    Returns:
        A ``(simulation_id, results)`` tuple where ``simulation_id`` is
        ``"<model>_temp<temperature>"``. ``results`` is whatever
        run_batch_simulation returned, or ``{"error": <message>}`` when an
        exception was raised while running the simulation.
    """
    simulation_id = f"{model}_temp{temperature}"
    
    # Start message
    safe_print(f"Starting simulation: {model} (temp={temperature})", "START")
    
    config = SimulationConfig(
        model=model,
        temperature=temperature,
        batch_size=batch_size,
        # Honour the notebook-level max_workers setting (the value printed in
        # the configuration cell) instead of a separate hard-coded number.
        max_workers=max_workers,
        max_retries=5,  # Enhanced retry logic
        base_wait_time=2.0,
        max_wait_time=60.0
    )
    
    start_time = time.time()
    
    try:
        # Use run_batch_simulation with the correct personality_key for binary format
        results = run_batch_simulation(
            participants_data=participants_with_binary,
            prompt_generator=get_binary_prompt,
            config=config,
            personality_key='binary_personality',  # Correct key for binary format
            # output_dir="study_2_binary_results",
            output_dir="study_2_elaborated_binary_results",
            output_filename="bfi_to_minimarker_binary"
        )
        
        # Count per-participant entries that are dicts carrying an 'error' key
        failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
        duration = time.time() - start_time
        
        if failed_count > 0:
            safe_print(f"Completed {simulation_id} in {duration:.1f}s - WARNING: {failed_count} participants failed", "WARN")
        else:
            safe_print(f"Completed {simulation_id} in {duration:.1f}s - All participants successful", "SUCCESS")
        
        return (simulation_id, results)
        
    except Exception as e:
        # Report the failure as data so the caller can summarise it alongside
        # the other models instead of aborting the whole run.
        duration = time.time() - start_time
        safe_print(f"Failed {simulation_id} after {duration:.1f}s - Error: {str(e)}", "ERROR")
        return (simulation_id, {"error": str(e)})

# Main execution: run one simulation per model in parallel, then summarise.
print("="*80)
print("STARTING ENHANCED BINARY BASELINE SIMULATIONS")
print(f"Models: {models_to_test}")
print(f"Temperature: {temperature}")
print(f"Total models: {len(models_to_test)}")
print("="*80)

all_results = {}
start_time = time.time()

# One worker thread per model. ThreadPoolExecutor raises ValueError for
# max_workers <= 0, so clamp to 1 to keep an empty model list from crashing.
with ThreadPoolExecutor(max_workers=max(1, len(models_to_test))) as executor:
    # Submit all jobs
    futures = [
        executor.submit(run_simulation, model, temperature)
        for model in models_to_test
    ]
    
    # Collect results as they complete
    completed_count = 0
    total_jobs = len(futures)
    
    for future in as_completed(futures):
        key, result = future.result()
        all_results[key] = result
        completed_count += 1
        
        # Progress update
        safe_print(f"Progress: {completed_count}/{total_jobs} simulations completed", "PROGRESS")

total_duration = time.time() - start_time

# Final summary
print("\n" + "="*80)
print("BINARY BASELINE SIMULATION SUMMARY")
print("="*80)
print(f"Total time: {total_duration:.1f} seconds")
print(f"Completed simulations: {len(all_results)}")

# Categorize results: a dict with an 'error' key marks a failed simulation;
# anything else counts as successful.
successful = []
failed = []

for key, result in all_results.items():
    if isinstance(result, dict) and 'error' in result:
        failed.append(key)
        continue

    successful.append(key)
    # A list result may still contain per-participant failures
    if isinstance(result, list):
        failed_participants = sum(1 for r in result if isinstance(r, dict) and 'error' in r)
        if failed_participants > 0:
            print(f"  {key}: SUCCESS (with {failed_participants} failed participants)")
        else:
            print(f"  {key}: SUCCESS")

if failed:
    print(f"\nFailed simulations ({len(failed)}):")
    for key in failed:
        print(f"  {key}: {all_results[key].get('error', 'Unknown error')}")

print("="*80)

STARTING ENHANCED BINARY BASELINE SIMULATIONS
Models: ['openai-gpt-3.5-turbo-0125', 'llama']
Temperature: 1.0
Total models: 2
[18:06:09] START: Starting simulation: openai-gpt-3.5-turbo-0125 (temp=1.0)
Starting simulation for 438 participants using openai-gpt-3.5-turbo-0125
Temperature: 1.0, Batch size: 25
Processing participants 0 to 24
[18:06:09] START: Starting simulation: llama (temp=1.0)
Starting simulation for 438 participants using llama
Temperature: 1.0, Batch size: 25
Processing participants 0 to 24
Completed batch 0 to 24
Processing participants 25 to 49
Completed batch 25 to 49
Processing participants 50 to 74
Completed batch 50 to 74
Processing participants 75 to 99
Completed batch 75 to 99
Processing participants 100 to 124
Completed batch 100 to 124
Processing participants 125 to 149
Completed batch 125 to 149
Processing participants 150 to 174
Completed batch 150 to 174
Processing participants 175 to 199
Completed batch 175 to 199
Processing participants 200 to 224
Compl