In [1]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path
import json
from datetime import datetime

# Add shared modules to path
sys.path.append('../shared')

from simulation_utils import (
    SimulationConfig, 
    run_bfi_to_minimarker_simulation,
    retry_failed_participants,
    run_enhanced_bfi_to_minimarker_simulation
)
from schema_bfi2 import likert_scale
from mini_marker_prompt import get_likert_prompt

# Confirm the imports succeeded and record when this run began
print("Setup complete!")
analysis_started_at = datetime.now()
print("Analysis started at: " + analysis_started_at.strftime('%Y-%m-%d %H:%M:%S'))


Setup complete!
Analysis started at: 2025-07-12 22:47:01


In [2]:
# Read the BFI-2 responses written by the statistical simulation step
data_path = Path('study3_simulated_data.csv')

# Fail fast, pointing at the generating script, when the CSV is absent
if data_path.exists():
    data = pd.read_csv(data_path)
else:
    print(f"Simulated data file not found at {data_path}")
    print("Please run study_3_statistical_simulation.py first to generate the data")
    raise FileNotFoundError(f"Data file not found: {data_path}")

# Quick sanity summary: dimensions, column names, and the global min/max over all cells
print(f"Loaded simulated data shape: {data.shape}")
print(f"Columns: {data.columns.tolist()}")
overall_min = data.min().min()
overall_max = data.max().max()
print(f"Value range: {overall_min} to {overall_max}")
data.head()


Loaded simulated data shape: (200, 60)
Columns: ['reversed_bfi1', 'reversed_bfi16', 'reversed_bfi31', 'reversed_bfi46', 'reversed_bfi6', 'reversed_bfi21', 'reversed_bfi36', 'reversed_bfi51', 'reversed_bfi11', 'reversed_bfi26', 'reversed_bfi41', 'reversed_bfi56', 'reversed_bfi2', 'reversed_bfi17', 'reversed_bfi32', 'reversed_bfi47', 'reversed_bfi7', 'reversed_bfi22', 'reversed_bfi37', 'reversed_bfi52', 'reversed_bfi12', 'reversed_bfi27', 'reversed_bfi42', 'reversed_bfi57', 'reversed_bfi3', 'reversed_bfi18', 'reversed_bfi33', 'reversed_bfi48', 'reversed_bfi8', 'reversed_bfi23', 'reversed_bfi38', 'reversed_bfi53', 'reversed_bfi13', 'reversed_bfi28', 'reversed_bfi43', 'reversed_bfi58', 'reversed_bfi4', 'reversed_bfi19', 'reversed_bfi34', 'reversed_bfi49', 'reversed_bfi9', 'reversed_bfi24', 'reversed_bfi39', 'reversed_bfi54', 'reversed_bfi14', 'reversed_bfi29', 'reversed_bfi44', 'reversed_bfi59', 'reversed_bfi10', 'reversed_bfi25', 'reversed_bfi40', 'reversed_bfi55', 'reversed_bfi5', 'rever

Unnamed: 0,reversed_bfi1,reversed_bfi16,reversed_bfi31,reversed_bfi46,reversed_bfi6,reversed_bfi21,reversed_bfi36,reversed_bfi51,reversed_bfi11,reversed_bfi26,...,reversed_bfi40,reversed_bfi55,reversed_bfi5,reversed_bfi20,reversed_bfi35,reversed_bfi50,reversed_bfi15,reversed_bfi30,reversed_bfi45,reversed_bfi60
0,3,4,2,3,2,2,3,1,3,4,...,4,4,5,4,4,5,3,3,4,3
1,4,4,3,3,3,3,3,4,3,2,...,2,3,4,4,3,2,2,3,3,4
2,3,3,2,1,1,2,3,2,4,3,...,3,3,5,5,5,4,5,4,4,4
3,5,5,4,5,5,5,5,5,4,4,...,5,5,5,5,4,3,4,4,4,3
4,3,2,3,3,1,2,2,3,4,2,...,5,4,4,4,4,4,5,5,4,4


In [3]:
# Build the rename table: every `reversed_bfi<N>` column becomes `bfi<N>`,
# the naming used when looking items up in likert_scale
bfi_mapping = {
    col: col.replace('reversed_bfi', 'bfi')
    for col in data.columns
    if col.startswith('reversed_bfi')
}

print(f"Created mapping for {len(bfi_mapping)} BFI items")
print("Sample mappings:")
for old_name, new_name in list(bfi_mapping.items())[:5]:
    print(f"  {old_name} -> {new_name}")

# Work on a renamed copy so `data` keeps its original reversed_* columns
data_for_likert = data.copy().rename(columns=bfi_mapping)

renamed_columns = sorted(col for col in data_for_likert.columns if col.startswith('bfi'))
print(f"\nRenamed columns for likert processing:")
print(f"New columns: {renamed_columns}")

# CRITICAL FIX: the statistical simulation already applied reverse coding.
# The likert_scale item texts describe the items as worded, so the items that
# were reverse coded must be flipped back before they are paired with their text.

# Item numbers reverse coded in the original Soto data (marked with 'R')
reverse_coded_items = [
    f"bfi{item_no}"
    for item_no in (
        3, 4, 5, 8, 9, 11, 12, 16, 17,
        22, 23, 24, 25, 26, 28, 29, 30,
        31, 36, 37, 42, 44, 45, 47, 48,
        49, 50, 51, 55, 58,
    )
]

print(f"\nApplying reverse coding correction for {len(reverse_coded_items)} items...")
# 6 - x mirrors a response on the 1..5 scale (1<->5, 2<->4, 3 stays);
# items absent from the frame are skipped
flipped_columns = {
    item: 6 - data_for_likert[item]
    for item in reverse_coded_items
    if item in data_for_likert.columns
}
data_for_likert = data_for_likert.assign(**flipped_columns)

print("Reverse coding correction applied successfully")


Created mapping for 60 BFI items
Sample mappings:
  reversed_bfi1 -> bfi1
  reversed_bfi16 -> bfi16
  reversed_bfi31 -> bfi31
  reversed_bfi46 -> bfi46
  reversed_bfi6 -> bfi6

Renamed columns for likert processing:
New columns: ['bfi1', 'bfi10', 'bfi11', 'bfi12', 'bfi13', 'bfi14', 'bfi15', 'bfi16', 'bfi17', 'bfi18', 'bfi19', 'bfi2', 'bfi20', 'bfi21', 'bfi22', 'bfi23', 'bfi24', 'bfi25', 'bfi26', 'bfi27', 'bfi28', 'bfi29', 'bfi3', 'bfi30', 'bfi31', 'bfi32', 'bfi33', 'bfi34', 'bfi35', 'bfi36', 'bfi37', 'bfi38', 'bfi39', 'bfi4', 'bfi40', 'bfi41', 'bfi42', 'bfi43', 'bfi44', 'bfi45', 'bfi46', 'bfi47', 'bfi48', 'bfi49', 'bfi5', 'bfi50', 'bfi51', 'bfi52', 'bfi53', 'bfi54', 'bfi55', 'bfi56', 'bfi57', 'bfi58', 'bfi59', 'bfi6', 'bfi60', 'bfi7', 'bfi8', 'bfi9']

Applying reverse coding correction for 30 items...
Reverse coding correction applied successfully


In [4]:
# The 60 expected item columns in item-number order: bfi1 ... bfi60
bfi_columns = ["bfi" + str(item_no) for item_no in range(1, 61)]
available_count = sum(1 for col in data_for_likert.columns if col in bfi_columns)
print(f"Expected BFI columns: {len(bfi_columns)}")
print(f"Available BFI columns: {available_count}")

# Report any expected item column that the renamed frame lacks
missing_cols = [col for col in bfi_columns if col not in data_for_likert.columns]
print(f"Warning: Missing columns: {missing_cols}" if missing_cols else "All BFI columns are present!")


Expected BFI columns: 60
Available BFI columns: 60
All BFI columns are present!


In [5]:
# Pair each numeric item response with that item's text
def convert_values_to_string(series, mapping):
    """Prefix every value of one BFI column with that item's description.

    Args:
        series: pandas Series whose ``name`` identifies the item.
        mapping: Lookup from column name to the text placed before each value.

    Returns:
        A new Series. When ``series.name`` is in ``mapping`` each element is the
        string ``"{mapping[series.name]} {value};"``; otherwise the result is an
        unchanged copy of ``series``.
    """
    # Columns without a known description pass through untouched
    if series.name not in mapping:
        return series.copy()

    item_text = mapping[series.name]
    return series.copy().apply(lambda value: f"{item_text} {value};")

print("Converting BFI values to Likert format descriptions...")

# Column-wise pass: each item column becomes "<item text> <value>;" strings
mapped_data = data_for_likert[bfi_columns].apply(convert_values_to_string, args=(likert_scale,))

# Row-wise pass: join the item strings (in bfi_columns order) into one description per participant
mapped_data['combined_bfi2'] = mapped_data[bfi_columns].apply(
    lambda item_strings: ' '.join(item_strings),
    axis=1,
)

# Attach the description to the original frame and to the corrected frame alike
for frame in (data, data_for_likert):
    frame['combined_bfi2'] = mapped_data['combined_bfi2']

print("Likert format personality descriptions created successfully")
print(f"Final data shape: {data.shape}")
print(f"Corrected data shape: {data_for_likert.shape}")


Converting BFI values to Likert format descriptions...
Likert format personality descriptions created successfully
Final data shape: (200, 61)
Corrected data shape: (200, 61)


In [6]:
# Show the first 500 characters of the first participant's description
separator = "=" * 80
print("Sample Likert format personality description:")
print(separator)
sample_description = data['combined_bfi2'].iloc[0]
print(f"{sample_description[:500]}...")
print(separator)
print(f"Full description length: {len(sample_description)} characters")


Sample Likert format personality description:
Is outgoing, sociable: 3; Is compassionate, has a soft heart: 4; Tends to be disorganized: 5; Is relaxed, handles stress well: 1; Has few artistic interests: 1; Has an assertive personality: 2; Is respectful, treats others with respect: 4; Tends to be lazy: 3; Stays optimistic after experiencing a setback: 5; Is curious about many different things: 4; Rarely feels excited or eager: 3; Tends to find fault with others: 2; Is dependable, steady: 3; Is moody, has up and down mood swings: 5; Is inven...
Full description length: 2109 characters


In [7]:
# Build the prompt for the first participant and print it in full for inspection
first_participant = data.iloc[0]
sample_prompt = get_likert_prompt(first_participant['combined_bfi2'])

banner = "=" * 80
print(
    banner,
    "COMPLETE LIKERT FORMAT PROMPT SENT TO LLM",
    banner,
    sample_prompt,
    banner,
    sep="\n",
)
# Whitespace-delimited token count as a rough size indicator
prompt_words = sample_prompt.split()
print(f"Prompt length: {len(sample_prompt)} characters")
print(f"Prompt word count: {len(prompt_words)} words")


COMPLETE LIKERT FORMAT PROMPT SENT TO LLM
### Your Assigned Personality ### 
The number indicates the extent to which you agree or disagree with that statement. 1 means 'Disagree Strongly', 3 means 'Neutral', and 5 means 'Agree Strongly'.

Is outgoing, sociable: 3; Is compassionate, has a soft heart: 4; Tends to be disorganized: 5; Is relaxed, handles stress well: 1; Has few artistic interests: 1; Has an assertive personality: 2; Is respectful, treats others with respect: 4; Tends to be lazy: 3; Stays optimistic after experiencing a setback: 5; Is curious about many different things: 4; Rarely feels excited or eager: 3; Tends to find fault with others: 2; Is dependable, steady: 3; Is moody, has up and down mood swings: 5; Is inventive, finds clever ways to do things: 3; Tends to be quiet: 2; Feels little sympathy for others: 2; Is systematic, likes to keep things in order: 2; Can be tense: 5; Is fascinated by art, music, or literature: 4; Is dominant, acts as a leader: 2; Starts argume

In [8]:
# Which models and temperatures to run.
# Other candidates, currently disabled: 'gpt-4', 'gpt-4o', 'llama', 'deepseek'
models_to_test = ['openai-gpt-3.5-turbo-0125']
temperatures = [1]  # Use temperature 1 for stochastic responses
batch_size = 25  # Smaller batch size for stability across different APIs

# One dict per participant, taken from the corrected (un-reversed) frame
participants_data = data_for_likert.to_dict(orient='records')

print("Simulation Configuration:")
for label, value in (
    ("Models to test", models_to_test),
    ("Temperatures", temperatures),
    ("Batch size", batch_size),
    ("Total combinations", len(models_to_test) * len(temperatures)),
    ("Participants", len(participants_data)),
):
    print(f"  {label}: {value}")

# Confirm each record carries the description under 'combined_bfi2'
first_record = participants_data[0]
print(f"\nSample participant data keys: {list(first_record)}")
print(f"Has 'combined_bfi2' key: {'combined_bfi2' in first_record}")


Simulation Configuration:
  Models to test: ['openai-gpt-3.5-turbo-0125']
  Temperatures: [1]
  Batch size: 25
  Total combinations: 1
  Participants: 200

Sample participant data keys: ['bfi1', 'bfi16', 'bfi31', 'bfi46', 'bfi6', 'bfi21', 'bfi36', 'bfi51', 'bfi11', 'bfi26', 'bfi41', 'bfi56', 'bfi2', 'bfi17', 'bfi32', 'bfi47', 'bfi7', 'bfi22', 'bfi37', 'bfi52', 'bfi12', 'bfi27', 'bfi42', 'bfi57', 'bfi3', 'bfi18', 'bfi33', 'bfi48', 'bfi8', 'bfi23', 'bfi38', 'bfi53', 'bfi13', 'bfi28', 'bfi43', 'bfi58', 'bfi4', 'bfi19', 'bfi34', 'bfi49', 'bfi9', 'bfi24', 'bfi39', 'bfi54', 'bfi14', 'bfi29', 'bfi44', 'bfi59', 'bfi10', 'bfi25', 'bfi40', 'bfi55', 'bfi5', 'bfi20', 'bfi35', 'bfi50', 'bfi15', 'bfi30', 'bfi45', 'bfi60', 'combined_bfi2']
Has 'combined_bfi2' key: True


In [9]:
import threading
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Lock that serializes console writes coming from worker threads
log_lock = threading.Lock()

def safe_print(message, prefix="INFO"):
    """Print ``[HH:MM:SS] <prefix>: <message>`` while holding ``log_lock``.

    Args:
        message: Text (or any formattable object) to print.
        prefix: Label printed before the message; defaults to ``"INFO"``.
    """
    stamp = datetime.now().strftime("%H:%M:%S")
    line = f"[{stamp}] {prefix}: {message}"
    with log_lock:
        print(line)

def run_simulation(model, temperature):
    """Run one model/temperature simulation over all participants and log the outcome.

    Args:
        model: Model identifier handed to ``SimulationConfig``.
        temperature: Sampling temperature handed to ``SimulationConfig``.

    Returns:
        A ``(simulation_id, payload)`` tuple, where ``simulation_id`` is
        ``"{model}_temp{temperature}"``. ``payload`` is the value returned by
        the simulation call, or ``{"error": <message>}`` if any exception was
        raised while running or reporting it.
    """
    simulation_id = f"{model}_temp{temperature}"
    safe_print(f"Starting simulation: {model} (temp={temperature})", "START")

    config = SimulationConfig(
        model=model,
        temperature=temperature,
        batch_size=batch_size,
        max_workers=10,
        max_retries=5,  # Enhanced retry logic
        base_wait_time=2.0,
        max_wait_time=60.0,
    )

    started_at = time.time()
    try:
        # Enhanced run: format validation and auto-retry, with Likert-specific prompts
        results = run_enhanced_bfi_to_minimarker_simulation(
            participants_data=participants_data,
            config=config,
            output_dir="study_3_likert_results",
            use_enhanced=True,
            prompt_generator=get_likert_prompt,
        )

        # A participant counts as failed when its result dict carries an 'error' key
        failures = [r for r in results if isinstance(r, dict) and 'error' in r]
        failed_count = len(failures)
        duration = time.time() - started_at

        if failed_count > 0:
            outcome, level = f"WARNING: {failed_count} participants failed", "WARN"
        else:
            outcome, level = "All participants successful", "SUCCESS"
        safe_print(f"Completed {simulation_id} in {duration:.1f}s - {outcome}", level)

        return (simulation_id, results)

    except Exception as e:
        # Convert any failure into an error payload so the caller still gets a keyed result
        duration = time.time() - started_at
        safe_print(f"Failed {simulation_id} after {duration:.1f}s - Error: {str(e)}", "ERROR")
        return (simulation_id, {"error": str(e)})

# Main execution: run every model/temperature combination and collect results by simulation id
print("=" * 80)
print("STARTING ENHANCED PARALLEL SIMULATIONS FOR STUDY 3")
print(f"Models: {models_to_test}")
print(f"Temperatures: {temperatures}")
print(f"Total combinations: {len(models_to_test) * len(temperatures)}")
print(f"Participants: {len(participants_data)}")
print("=" * 80)

all_results = {}
start_time = time.time()

# One worker per model, so at most len(models_to_test) simulations run at once.
# ThreadPoolExecutor raises ValueError for max_workers <= 0, so clamp to 1:
# an empty model list then runs zero jobs instead of crashing, matching what
# already happens with an empty temperature list.
pool_size = max(1, len(models_to_test))

with ThreadPoolExecutor(max_workers=pool_size) as executor:
    # Submit all jobs
    futures = [
        executor.submit(run_simulation, model, temperature)
        for model in models_to_test
        for temperature in temperatures
    ]

    # Collect results as they complete
    completed_count = 0
    total_jobs = len(futures)

    for future in as_completed(futures):
        # run_simulation returns (simulation_id, payload) and catches its own exceptions
        key, result = future.result()
        all_results[key] = result
        completed_count += 1

        # Progress update
        safe_print(f"Progress: {completed_count}/{total_jobs} simulations completed", "PROGRESS")

total_duration = time.time() - start_time
print("\n" + "=" * 80)
print(f"ALL SIMULATIONS COMPLETED IN {total_duration:.1f} SECONDS")
print(f"Results keys: {list(all_results.keys())}")
print("=" * 80)


STARTING ENHANCED PARALLEL SIMULATIONS FOR STUDY 3
Models: ['openai-gpt-3.5-turbo-0125']
Temperatures: [1]
Total combinations: 1
Participants: 200
[22:47:02] START: Starting simulation: openai-gpt-3.5-turbo-0125 (temp=1)
Starting simulation for 200 participants using openai-gpt-3.5-turbo-0125
Temperature: 1, Batch size: 25
Processing participants 0 to 24
Completed batch 0 to 24
Processing participants 25 to 49
Completed batch 25 to 49
Processing participants 50 to 74
Completed batch 50 to 74
Processing participants 75 to 99
Completed batch 75 to 99
Processing participants 100 to 124
Completed batch 100 to 124
Processing participants 125 to 149
Completed batch 125 to 149
Processing participants 150 to 174
Completed batch 150 to 174
Processing participants 175 to 199
Completed batch 175 to 199
Results saved to study_3_likert_results/bfi_to_minimarker_openai_gpt_3.5_turbo_0125_temp1.json
[22:50:04] SUCCESS: Completed openai-gpt-3.5-turbo-0125_temp1 in 182.4s - All participants successful


In [10]:
# Retry any failed participants, run by run, and re-save the updated results
from simulation_utils import save_simulation_results

print("Checking for failed participants and retrying if necessary...")

# Result keys were built as f"{model}_temp{temperature}". Rebuilding that table
# from the configured lists recovers the exact original values, so an int
# temperature of 1 stays 1 instead of becoming 1.0 via float() parsing.
# NOTE(review): the saved file name appears to embed the temperature
# ("..._temp1.json"); a float 1.0 would presumably target a different file.
run_params = {
    f"{model}_temp{temperature}": (model, temperature)
    for model in models_to_test
    for temperature in temperatures
}

retry_count = 0
for key, results in all_results.items():
    # A non-list payload means the whole run failed; there is nothing to retry per participant
    if not isinstance(results, list):
        continue

    failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
    if failed_count == 0:
        continue

    print(f"Retrying {failed_count} failed participants for {key}")
    retry_count += 1

    if key in run_params:
        model, temperature = run_params[key]
    else:
        # Fallback when the config lists changed since the run: split on the
        # LAST '_temp' so model names that contain '_temp' still parse
        model, _, temperature_text = key.rpartition('_temp')
        temperature = float(temperature_text)

    config = SimulationConfig(
        model=model,
        temperature=temperature,
        batch_size=batch_size
    )

    updated_results = retry_failed_participants(
        results=results,
        participants_data=participants_data,
        prompt_generator=get_likert_prompt,  # Use likert-specific prompt function
        config=config,
        personality_key='combined_bfi2'
    )

    # Replacing the value of an existing key is safe while iterating over items()
    all_results[key] = updated_results

    # Save updated results
    save_simulation_results(updated_results, "study_3_likert_results", "bfi_to_minimarker", config)

if retry_count == 0:
    print("No failed participants found - all simulations successful!")
else:
    print(f"Retry process completed for {retry_count} model(s)")


Checking for failed participants and retrying if necessary...
No failed participants found - all simulations successful!


In [11]:
# Summary of all results: per-run counts, success rate, and a peek at one successful response
print("=" * 80)
print("STUDY 3 LIKERT SIMULATION RESULTS SUMMARY")
print("=" * 80)

for key, results in all_results.items():
    # A non-list payload is the error dict recorded when the whole run failed
    if not isinstance(results, list):
        print(f"\n{key}: FAILED - {results}")
        continue

    # Filter once; both the counts and the sample below are derived from this list
    successful_responses = [r for r in results if isinstance(r, dict) and 'error' not in r]
    total_participants = len(results)
    successful_participants = len(successful_responses)
    failed_participants = total_participants - successful_participants

    print(f"\n{key}:")
    print(f"  Total participants: {total_participants}")
    print(f"  Successful: {successful_participants}")
    print(f"  Failed: {failed_participants}")
    # An empty result list would otherwise raise ZeroDivisionError here
    if total_participants:
        print(f"  Success rate: {(successful_participants/total_participants)*100:.1f}%")
    else:
        print("  Success rate: n/a (no participants)")

    # Sample a successful response for validation
    if successful_responses:
        sample_response = successful_responses[0]
        print(f"  Sample response keys: {list(sample_response.keys())[:10]}...")  # Show first 10 keys
        print(f"  Sample response length: {len(sample_response)} traits")

print("\n" + "=" * 80)


STUDY 3 LIKERT SIMULATION RESULTS SUMMARY

openai-gpt-3.5-turbo-0125_temp1:
  Total participants: 200
  Successful: 200
  Failed: 0
  Success rate: 100.0%
  Sample response keys: ['Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep', 'Disorganized', 'Efficient']...
  Sample response length: 40 traits



In [12]:
# Persist the preprocessed inputs and a run summary alongside the simulation outputs
output_path = Path('study_3_likert_results')
output_path.mkdir(exist_ok=True)

# Preprocessed data (with reverse coding corrections)
preprocessed_csv = output_path / 'study3_likert_preprocessed_data.csv'
data_for_likert.to_csv(preprocessed_csv, index=False)
print(f"Preprocessed data saved to {preprocessed_csv}")


def _summarize_run(results):
    """Return total/successful counts for one run; both are 0 when the payload is not a list."""
    if not isinstance(results, list):
        return {'total': 0, 'successful': 0}
    succeeded = sum(1 for r in results if isinstance(r, dict) and 'error' not in r)
    return {'total': len(results), 'successful': succeeded}


# Simulation metadata: configuration used plus per-run success counts
metadata = {
    'simulation_type': 'study_3_likert',
    'models_tested': models_to_test,
    'temperatures': temperatures,
    'batch_size': batch_size,
    'total_participants': len(participants_data),
    'simulation_date': datetime.now().isoformat(),
    'data_source': 'statistically_simulated_bfi2',
    'format': 'likert',
    'results_summary': {key: _summarize_run(results) for key, results in all_results.items()},
}

metadata_file = output_path / 'simulation_metadata.json'
with metadata_file.open('w') as f:
    json.dump(metadata, f, indent=2)

print(f"Simulation metadata saved to {metadata_file}")

closing_lines = [
    "\n" + "=" * 60,
    "STUDY 3 LIKERT FORMAT SIMULATION COMPLETE!",
    "\nNext steps:",
    "1. Run convergent validity analysis on the results",
    "2. Compare with Study 2 results for format differences",
    "3. Results are saved in study_3_likert_results/ directory",
    "4. Enhanced validation and retry logic handled format issues",
    "=" * 60,
]
for closing_line in closing_lines:
    print(closing_line)


Preprocessed data saved to study_3_likert_results/study3_likert_preprocessed_data.csv
Simulation metadata saved to study_3_likert_results/simulation_metadata.json

STUDY 3 LIKERT FORMAT SIMULATION COMPLETE!

Next steps:
1. Run convergent validity analysis on the results
2. Compare with Study 2 results for format differences
3. Results are saved in study_3_likert_results/ directory
4. Enhanced validation and retry logic handled format issues
