In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import json
from datetime import datetime

# Add shared modules to path
sys.path.append('../shared')

from simulation_utils import (
    SimulationConfig, 
    run_bfi_to_minimarker_simulation,
    retry_failed_participants,
    run_enhanced_bfi_to_minimarker_simulation
)
from schema_bfi2 import likert_scale
from mini_marker_prompt import get_likert_prompt

print("Setup complete!")
print(f"Analysis started at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


Setup complete!
Analysis started at: 2025-07-14 01:07:04


In [2]:
# Load the statistically simulated BFI-2 data
data_path = Path('facet_lvl_simulated_data.csv')
if not data_path.exists():
    print(f"Simulated data file not found at {data_path}")
    print("Please run bfi2_facet_level_parameter_extraction_and_simulation.py first to generate the data")
    raise FileNotFoundError(f"Data file not found: {data_path}")

data = pd.read_csv(data_path)
print(f"Loaded simulated data shape: {data.shape}")
print(f"Columns: {list(data.columns[:10])}...")  # Show first 10 columns
print(f"BFI columns: {[col for col in data.columns if col.startswith('bfi') and col.replace('bfi', '').replace('_', '').isdigit()][:10]}...")  # Show BFI item columns
print(f"Value range: {data[[col for col in data.columns if col.startswith('bfi') and col.replace('bfi', '').replace('_', '').isdigit()]].min().min()} to {data[[col for col in data.columns if col.startswith('bfi') and col.replace('bfi', '').replace('_', '').isdigit()]].max().max()}")
data.head()


Loaded simulated data shape: (200, 80)
Columns: ['bfi1', 'bfi16', 'bfi31', 'bfi46', 'bfi6', 'bfi21', 'bfi36', 'bfi51', 'bfi11', 'bfi26']...
BFI columns: ['bfi1', 'bfi16', 'bfi31', 'bfi46', 'bfi6', 'bfi21', 'bfi36', 'bfi51', 'bfi11', 'bfi26']...
Value range: 1.0 to 5.0


Unnamed: 0,bfi1,bfi16,bfi31,bfi46,bfi6,bfi21,bfi36,bfi51,bfi11,bfi26,...,bfi_n_depression,bfi_n_emotional_volatility,bfi_o_intellectual_curiosity,bfi_o_aesthetic_sensitivity,bfi_o_creative_imagination,bfi_e,bfi_a,bfi_c,bfi_n,bfi_o
0,3.0,2.0,4.0,3.0,2.0,2.0,3.0,5.0,3.0,2.0,...,2.0,4.5,4.0,4.5,3.25,2.666667,3.833333,2.333333,3.75,3.916667
1,4.0,2.0,3.0,3.0,3.0,3.0,3.0,2.0,3.0,4.0,...,2.75,4.5,2.5,3.25,3.0,3.25,2.833333,4.083333,3.5,2.916667
2,3.0,3.0,4.0,1.0,1.0,2.0,3.0,4.0,2.0,3.0,...,2.0,4.5,3.5,4.75,4.25,2.666667,2.75,3.833333,3.833333,4.166667
3,5.0,1.0,2.0,5.0,5.0,5.0,1.0,1.0,2.0,2.0,...,4.25,2.25,4.75,4.25,3.75,4.583333,3.166667,2.916667,3.583333,4.25
4,3.0,4.0,3.0,3.0,1.0,2.0,4.0,3.0,2.0,4.0,...,3.0,1.75,4.5,4.0,4.5,2.75,4.333333,2.5,3.0,4.333333


In [3]:
# Since variables are already named 'bfi1', 'bfi2', etc., no column mapping needed
# Create a copy for likert processing (maintaining same structure)
data_for_likert = data.copy()

print(f"BFI columns ready for likert processing:")
bfi_columns = sorted([col for col in data_for_likert.columns if col.startswith('bfi')])
print(f"Found {len(bfi_columns)} BFI items: {bfi_columns}")

print("BFI data ready for analysis - no reverse coding needed")

BFI columns ready for likert processing:
Found 80 BFI items: ['bfi1', 'bfi10', 'bfi11', 'bfi12', 'bfi13', 'bfi14', 'bfi15', 'bfi16', 'bfi17', 'bfi18', 'bfi19', 'bfi2', 'bfi20', 'bfi21', 'bfi22', 'bfi23', 'bfi24', 'bfi25', 'bfi26', 'bfi27', 'bfi28', 'bfi29', 'bfi3', 'bfi30', 'bfi31', 'bfi32', 'bfi33', 'bfi34', 'bfi35', 'bfi36', 'bfi37', 'bfi38', 'bfi39', 'bfi4', 'bfi40', 'bfi41', 'bfi42', 'bfi43', 'bfi44', 'bfi45', 'bfi46', 'bfi47', 'bfi48', 'bfi49', 'bfi5', 'bfi50', 'bfi51', 'bfi52', 'bfi53', 'bfi54', 'bfi55', 'bfi56', 'bfi57', 'bfi58', 'bfi59', 'bfi6', 'bfi60', 'bfi7', 'bfi8', 'bfi9', 'bfi_a', 'bfi_a_compassion', 'bfi_a_respectfulness', 'bfi_a_trust', 'bfi_c', 'bfi_c_organization', 'bfi_c_productiveness', 'bfi_c_responsibility', 'bfi_e', 'bfi_e_assertiveness', 'bfi_e_energy_level', 'bfi_e_sociability', 'bfi_n', 'bfi_n_anxiety', 'bfi_n_depression', 'bfi_n_emotional_volatility', 'bfi_o', 'bfi_o_aesthetic_sensitivity', 'bfi_o_creative_imagination', 'bfi_o_intellectual_curiosity']
BFI dat

In [4]:
# Generate BFI column list for processing
bfi_columns = [f"bfi{i}" for i in range(1, 61)]
print(f"Expected BFI columns: {len(bfi_columns)}")
print(f"Available BFI columns: {len([col for col in data_for_likert.columns if col in bfi_columns])}")

# Verify all expected columns are present
missing_cols = [col for col in bfi_columns if col not in data_for_likert.columns]
if missing_cols:
    print(f"Warning: Missing columns: {missing_cols}")
else:
    print("All BFI columns are present!")


Expected BFI columns: 60
Available BFI columns: 60
All BFI columns are present!


In [5]:
# Map numeric values to Likert format descriptions
def convert_values_to_string(series, mapping):
    """Convert numeric BFI values to likert format strings."""
    series_converted = series.copy()
    if series.name in mapping:
        series_converted = series_converted.apply(lambda x: f"{mapping[series.name]} {x};")
    return series_converted

print("Converting BFI values to Likert format descriptions...")

# Apply the mapping function to each BFI column
mapped_data = data_for_likert[bfi_columns].apply(lambda df: convert_values_to_string(df, likert_scale))

# Combine all BFI descriptions into a single personality description
mapped_data['combined_bfi2'] = mapped_data[bfi_columns].apply(lambda row: ' '.join(row), axis=1)

# Add combined description to both original data AND corrected data
data['combined_bfi2'] = mapped_data['combined_bfi2']
data_for_likert['combined_bfi2'] = mapped_data['combined_bfi2']

print("Likert format personality descriptions created successfully")
print(f"Final data shape: {data.shape}")
print(f"Corrected data shape: {data_for_likert.shape}")


Converting BFI values to Likert format descriptions...
Likert format personality descriptions created successfully
Final data shape: (200, 81)
Corrected data shape: (200, 81)


In [6]:
# Preview a personality description
print("Sample Likert format personality description:")
print("=" * 80)
sample_description = data.iloc[0]['combined_bfi2']
print(sample_description[:500] + "...")
print("=" * 80)
print(f"Full description length: {len(sample_description)} characters")


Sample Likert format personality description:
Is outgoing, sociable: 3.0; Is compassionate, has a soft heart: 4.0; Tends to be disorganized: 5.0; Is relaxed, handles stress well: 1.0; Has few artistic interests: 1.0; Has an assertive personality: 2.0; Is respectful, treats others with respect: 4.0; Tends to be lazy: 3.0; Stays optimistic after experiencing a setback: 5.0; Is curious about many different things: 4.0; Rarely feels excited or eager: 3.0; Tends to find fault with others: 2.0; Is dependable, steady: 3.0; Is moody, has up and dow...
Full description length: 2229 characters


In [7]:
# Test prompt generation with first participant
first_participant = data.iloc[0]
sample_prompt = get_likert_prompt(first_participant['combined_bfi2'])

print("=" * 80)
print("COMPLETE LIKERT FORMAT PROMPT SENT TO LLM")
print("=" * 80)
print(sample_prompt)
print("=" * 80)
print(f"Prompt length: {len(sample_prompt)} characters")
print(f"Prompt word count: {len(sample_prompt.split())} words")


COMPLETE LIKERT FORMAT PROMPT SENT TO LLM
### Your Assigned Personality ### 
The number indicates the extent to which you agree or disagree with that statement. 1 means 'Disagree Strongly', 3 means 'Neutral', and 5 means 'Agree Strongly'.

Is outgoing, sociable: 3.0; Is compassionate, has a soft heart: 4.0; Tends to be disorganized: 5.0; Is relaxed, handles stress well: 1.0; Has few artistic interests: 1.0; Has an assertive personality: 2.0; Is respectful, treats others with respect: 4.0; Tends to be lazy: 3.0; Stays optimistic after experiencing a setback: 5.0; Is curious about many different things: 4.0; Rarely feels excited or eager: 3.0; Tends to find fault with others: 2.0; Is dependable, steady: 3.0; Is moody, has up and down mood swings: 5.0; Is inventive, finds clever ways to do things: 3.0; Tends to be quiet: 2.0; Feels little sympathy for others: 2.0; Is systematic, likes to keep things in order: 2.0; Can be tense: 5.0; Is fascinated by art, music, or literature: 4.0; Is domi

In [8]:
# Configuration for different models and temperatures
models_to_test = [
    # 'openai-gpt-3.5-turbo-0125',
    'gpt-4',
    'gpt-4o',
    'llama',
    'deepseek'
]
temperatures = [1]  # Use temperature 1 for stochastic responses
batch_size = 25  # Smaller batch size for stability across different APIs

# Create participant data list from corrected DataFrame
participants_data = data_for_likert.to_dict('records')

print(f"Simulation Configuration:")
print(f"  Models to test: {models_to_test}")
print(f"  Temperatures: {temperatures}")
print(f"  Batch size: {batch_size}")
print(f"  Total combinations: {len(models_to_test) * len(temperatures)}")
print(f"  Participants: {len(participants_data)}")

# Verify participant data structure
print(f"\nSample participant data keys: {list(participants_data[0].keys())}")
print(f"Has 'combined_bfi2' key: {'combined_bfi2' in participants_data[0]}")


Simulation Configuration:
  Models to test: ['gpt-4', 'gpt-4o', 'llama', 'deepseek']
  Temperatures: [1]
  Batch size: 25
  Total combinations: 4
  Participants: 200

Sample participant data keys: ['bfi1', 'bfi16', 'bfi31', 'bfi46', 'bfi6', 'bfi21', 'bfi36', 'bfi51', 'bfi11', 'bfi26', 'bfi41', 'bfi56', 'bfi2', 'bfi17', 'bfi32', 'bfi47', 'bfi7', 'bfi22', 'bfi37', 'bfi52', 'bfi12', 'bfi27', 'bfi42', 'bfi57', 'bfi3', 'bfi18', 'bfi33', 'bfi48', 'bfi8', 'bfi23', 'bfi38', 'bfi53', 'bfi13', 'bfi28', 'bfi43', 'bfi58', 'bfi4', 'bfi19', 'bfi34', 'bfi49', 'bfi9', 'bfi24', 'bfi39', 'bfi54', 'bfi14', 'bfi29', 'bfi44', 'bfi59', 'bfi10', 'bfi25', 'bfi40', 'bfi55', 'bfi5', 'bfi20', 'bfi35', 'bfi50', 'bfi15', 'bfi30', 'bfi45', 'bfi60', 'bfi_e_sociability', 'bfi_e_assertiveness', 'bfi_e_energy_level', 'bfi_a_compassion', 'bfi_a_respectfulness', 'bfi_a_trust', 'bfi_c_organization', 'bfi_c_productiveness', 'bfi_c_responsibility', 'bfi_n_anxiety', 'bfi_n_depression', 'bfi_n_emotional_volatility', 'bfi_o_in

In [9]:
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Thread-safe logging
log_lock = threading.Lock()

def safe_print(message, prefix="INFO"):
    """Thread-safe printing with timestamp and prefix"""
    timestamp = datetime.now().strftime("%H:%M:%S")
    with log_lock:
        print(f"[{timestamp}] {prefix}: {message}")

def run_simulation(model, temperature):
    """Run simulation for a single model-temperature combination."""
    simulation_id = f"{model}_temp{temperature}"
    
    # Start message
    safe_print(f"Starting simulation: {model} (temp={temperature})", "START")
    
    config = SimulationConfig(
        model=model,
        temperature=temperature,
        batch_size=batch_size,
        max_workers=10,
        max_retries=5,  # Enhanced retry logic
        base_wait_time=2.0,
        max_wait_time=60.0
    )
    
    start_time = time.time()
    
    try:
        # Use enhanced simulation with format validation and auto-retry
        results = run_enhanced_bfi_to_minimarker_simulation(
            participants_data=participants_data,
            config=config,
            output_dir="study_3_likert_results",
            use_enhanced=True,  # Enable enhanced validation
            prompt_generator=get_likert_prompt  # Use Likert-specific prompts
        )
        
        # Check for failures
        failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
        duration = time.time() - start_time
        
        if failed_count > 0:
            safe_print(f"Completed {simulation_id} in {duration:.1f}s - WARNING: {failed_count} participants failed", "WARN")
        else:
            safe_print(f"Completed {simulation_id} in {duration:.1f}s - All participants successful", "SUCCESS")
        
        return (simulation_id, results)
        
    except Exception as e:
        duration = time.time() - start_time
        safe_print(f"Failed {simulation_id} after {duration:.1f}s - Error: {str(e)}", "ERROR")
        return (simulation_id, {"error": str(e)})

# Main execution
print("=" * 80)
print("STARTING ENHANCED PARALLEL SIMULATIONS FOR STUDY 2b")
print(f"Models: {models_to_test}")
print(f"Temperatures: {temperatures}")
print(f"Total combinations: {len(models_to_test) * len(temperatures)}")
print(f"Participants: {len(participants_data)}")
print("=" * 80)

all_results = {}
start_time = time.time()

# Use ThreadPoolExecutor for parallel execution
with ThreadPoolExecutor(max_workers=len(models_to_test)) as executor:
    # Submit all jobs
    futures = [
        executor.submit(run_simulation, model, temperature)
        for model in models_to_test
        for temperature in temperatures
    ]
    
    # Collect results as they complete
    completed_count = 0
    total_jobs = len(futures)
    
    for future in as_completed(futures):
        key, result = future.result()
        all_results[key] = result
        completed_count += 1
        
        # Progress update
        safe_print(f"Progress: {completed_count}/{total_jobs} simulations completed", "PROGRESS")

total_duration = time.time() - start_time
print("\n" + "=" * 80)
print(f"ALL SIMULATIONS COMPLETED IN {total_duration:.1f} SECONDS")
print(f"Results keys: {list(all_results.keys())}")
print("=" * 80)


STARTING ENHANCED PARALLEL SIMULATIONS FOR STUDY 2b
Models: ['gpt-4', 'gpt-4o', 'llama', 'deepseek']
Temperatures: [1]
Total combinations: 4
Participants: 200
[01:07:05] START: Starting simulation: gpt-4 (temp=1)
Starting simulation for 200 participants using gpt-4
Temperature: 1, Batch size: 25
Processing participants 0 to 24
[01:07:05] START: Starting simulation: gpt-4o (temp=1)
Starting simulation for 200 participants using gpt-4o
Temperature: 1, Batch size: 25
Processing participants 0 to 24
[01:07:05] START: Starting simulation: llama (temp=1)
Starting simulation for 200 participants using llama
Temperature: 1, Batch size: 25
Processing participants 0 to 24
[01:07:05] START: Starting simulation: deepseek (temp=1)
Starting simulation for 200 participants using deepseek
Temperature: 1, Batch size: 25
Processing participants 0 to 24
Completed batch 0 to 24
Processing participants 25 to 49
Completed batch 0 to 24
Processing participants 25 to 49
Completed batch 0 to 24
Processing part

In [10]:
# Retry any failed participants
print("Checking for failed participants and retrying if necessary...")

retry_count = 0
for key, results in all_results.items():
    if isinstance(results, list):
        failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
        if failed_count > 0:
            print(f"Retrying {failed_count} failed participants for {key}")
            retry_count += 1
            
            # Extract model and temperature from key
            model = key.split('_temp')[0]
            temperature = float(key.split('_temp')[1])
            
            config = SimulationConfig(
                model=model,
                temperature=temperature,
                batch_size=batch_size
            )
            
            updated_results = retry_failed_participants(
                results=results,
                participants_data=participants_data,
                prompt_generator=get_likert_prompt,  # Use likert-specific prompt function
                config=config,
                personality_key='combined_bfi2'
            )
            
            all_results[key] = updated_results
            
            # Save updated results
            from simulation_utils import save_simulation_results
            save_simulation_results(updated_results, "study_3_likert_results", "bfi_to_minimarker", config)

if retry_count == 0:
    print("No failed participants found - all simulations successful!")
else:
    print(f"Retry process completed for {retry_count} model(s)")


Checking for failed participants and retrying if necessary...
Retrying 15 failed participants for llama_temp1
Retrying participant 58
Llama error: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Error in get_personality_response: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Successfully retried participant 58
Retrying participant 60
Successfully retried participant 60
Retrying participant 80
Llama error: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Error in get_personality_response: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Llama error: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Error in get_personality_response: ('Cannot connect to proxy.', RemoteDisconnected('Remote end closed connection without response'))
Llama error: (

In [11]:
# Summary of all results
print("=" * 80)
print("STUDY 2b LIKERT SIMULATION RESULTS SUMMARY")
print("=" * 80)

for key, results in all_results.items():
    if isinstance(results, list):
        total_participants = len(results)
        successful_participants = sum(1 for r in results if isinstance(r, dict) and 'error' not in r)
        failed_participants = total_participants - successful_participants
        
        print(f"\n{key}:")
        print(f"  Total participants: {total_participants}")
        print(f"  Successful: {successful_participants}")
        print(f"  Failed: {failed_participants}")
        print(f"  Success rate: {(successful_participants/total_participants)*100:.1f}%")
        
        # Sample a successful response for validation
        successful_responses = [r for r in results if isinstance(r, dict) and 'error' not in r]
        if successful_responses:
            sample_response = successful_responses[0]
            print(f"  Sample response keys: {list(sample_response.keys())[:10]}...")  # Show first 10 keys
            print(f"  Sample response length: {len(sample_response)} traits")
    else:
        print(f"\n{key}: FAILED - {results}")

print("\n" + "=" * 80)


STUDY 2b LIKERT SIMULATION RESULTS SUMMARY

gpt-4o_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response keys: ['Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep', 'Disorganized', 'Efficient']...
  Sample response length: 40 traits

deepseek_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response keys: ['Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep', 'Disorganized', 'Efficient']...
  Sample response length: 40 traits

gpt-4_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response keys: ['Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep', 'Disorganized', 'Efficient']...
  Sample response length: 40 traits

llama_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response keys: ['Bashful', 'Bold', 'Ca

In [12]:
# Save the preprocessed data for reference
output_path = Path('study_3_likert_results')
output_path.mkdir(exist_ok=True)

print("\n" + "=" * 60)
print("STUDY 2b LIKERT FORMAT SIMULATION COMPLETE!")
print("\nNext steps:")
print("1. Run convergent validity analysis on the results")
print("2. Compare with Study 2a results for format differences")
print("3. Results are saved in study_3_likert_results/ directory")
print("4. Enhanced validation and retry logic handled format issues")
print("=" * 60)



STUDY 2b LIKERT FORMAT SIMULATION COMPLETE!

Next steps:
1. Run convergent validity analysis on the results
2. Compare with Study 2a results for format differences
3. Results are saved in study_3_likert_results/ directory
4. Enhanced validation and retry logic handled format issues
