# Multi-Model Facet-Level Personality Simulation - Study 3

This notebook refactors the original Study 3 facet-level parameter extraction and simulation so that it runs against multiple LLMs through the unified `portal.py` interface.

## Models to Test
- GPT-4
- GPT-4o  
- Llama-3.3-70B-Instruct
- DeepSeek-V3

## Study 3 Components
1. **Facet-Level Parameter Extraction**: Extract personality parameters at the facet level
2. **BFI-2 to Mini-Marker Simulation**: Similar to Study 2 but with facet-level focus
3. **Statistical Modeling**: Advanced factor analysis and parameter validation

## Data Flow
1. Load and preprocess BFI-2 data
2. Extract facet-level personality parameters
3. Run simulations across multiple models
4. Validate personality structure with factor analysis

In [None]:
import pandas as pd
import numpy as np
import sys
from pathlib import Path

# Add shared modules to path
sys.path.append('../shared')

from simulation_utils import (
    SimulationConfig, 
    run_bfi_to_minimarker_simulation,
    run_batch_simulation,
    retry_failed_participants,
    save_simulation_results
)
from schema_bfi2 import expanded_scale
from mini_marker_prompt import get_prompt

## Data Loading and Preprocessing

In [None]:
# Read the raw BFI-2 responses (the same workbook Study 2 uses).
data_path = Path('../..') / 'raw_data' / 'Soto_data.xlsx'
if data_path.exists():
    data = pd.read_excel(data_path, sheet_name='data')
else:
    # Explain where the file is expected before aborting the notebook run.
    print(f"Data file not found at {data_path}")
    print("Please ensure the raw_data/Soto_data.xlsx file exists in the project root")
    raise FileNotFoundError(f"Data file not found: {data_path}")
print(f"Loaded data shape: {data.shape}")

# Column names of the item-level responses: tda1..tda40 and bfi1..bfi60.
tda_columns = [f"tda{n}" for n in range(1, 41)]
sbfi_columns = [f"bfi{n}" for n in range(1, 61)]
selected_columns = [*tda_columns, *sbfi_columns]

# Keep only participants with a value for every one of those items.
print(f"Original data shape: {data.shape}")
data = data.dropna(subset=selected_columns)
print(f"Data shape after removing missing values: {data.shape}")

In [None]:
# Reverse-key the negatively worded BFI-2 items (same key as Study 2).
# Only the item numbers listed here are flipped; every other item is left as is.
reverse_keyed_items = (
    3, 4, 5, 8, 9, 11, 12, 16, 17, 22, 23, 24, 25, 26, 28, 29, 30, 31,
    36, 37, 42, 44, 45, 47, 48, 49, 50, 51, 55, 58,
)

# Item column -> coded name; a trailing 'R' marks a reverse-keyed item.
reverse_coding_map = {
    f"bfi{n}": (f"bfi{n}R" if n in reverse_keyed_items else f"bfi{n}")
    for n in range(1, 61)
}

for item, coded_name in reverse_coding_map.items():
    if not coded_name.endswith('R'):
        continue
    # Flip the response in place: score -> 6 - score.
    data[item] = 6 - data[item]

print("Reverse coding applied successfully")

## Facet-Level Parameter Extraction

Study 3 focuses on extracting and analyzing personality at the facet level rather than just the domain level.

In [None]:
# Item groupings used in this notebook for facet-level scoring:
# 10 named groups of 6 BFI-2 items each, two groups per Big Five domain.
#
# NOTE(review): the published BFI-2 (Soto & John, 2017) is usually described as
# having 15 facets of 4 items each (three per domain; e.g. Energy Level, Trust,
# Responsibility, Emotional Volatility and Creative Imagination do not appear
# below). Confirm these groupings against the official scoring key before
# interpreting the resulting scores as BFI-2 facets.

facet_structure = {
    # Extraversion item groups
    'Sociability': ['bfi1', 'bfi6', 'bfi11', 'bfi16', 'bfi21', 'bfi26'],
    'Assertiveness': ['bfi31', 'bfi36', 'bfi41', 'bfi46', 'bfi51', 'bfi56'],
    
    # Agreeableness item groups
    'Compassion': ['bfi2', 'bfi7', 'bfi12', 'bfi17', 'bfi22', 'bfi27'],
    'Respectfulness': ['bfi32', 'bfi37', 'bfi42', 'bfi47', 'bfi52', 'bfi57'],
    
    # Conscientiousness item groups
    'Organization': ['bfi3', 'bfi8', 'bfi13', 'bfi18', 'bfi23', 'bfi28'],
    'Productiveness': ['bfi33', 'bfi38', 'bfi43', 'bfi48', 'bfi53', 'bfi58'],
    
    # Negative Emotionality item groups
    'Anxiety': ['bfi4', 'bfi9', 'bfi14', 'bfi19', 'bfi24', 'bfi29'],
    'Depression': ['bfi34', 'bfi39', 'bfi44', 'bfi49', 'bfi54', 'bfi59'],
    
    # Open-Mindedness item groups
    'Intellectual_Curiosity': ['bfi5', 'bfi10', 'bfi15', 'bfi20', 'bfi25', 'bfi30'],
    'Aesthetic_Sensitivity': ['bfi35', 'bfi40', 'bfi45', 'bfi50', 'bfi55', 'bfi60']
}

# Sanity printout: number of groups and number of items in each.
print(f"Facet structure defined with {len(facet_structure)} facets")
for facet, items in facet_structure.items():
    print(f"{facet}: {len(items)} items")

In [None]:
# Facet scores: per-participant mean of the items assigned to each facet.
facet_scores = pd.DataFrame(
    {name: data[cols].mean(axis=1) for name, cols in facet_structure.items()},
    index=data.index,
)

# Domain scores: per-participant mean of the facet scores listed for each domain.
domain_facets = {
    'Extraversion': ['Sociability', 'Assertiveness'],
    'Agreeableness': ['Compassion', 'Respectfulness'],
    'Conscientiousness': ['Organization', 'Productiveness'],
    'Negative_Emotionality': ['Anxiety', 'Depression'],
    'Open_Mindedness': ['Intellectual_Curiosity', 'Aesthetic_Sensitivity'],
}
domain_scores = pd.DataFrame(index=data.index)
for domain_name, members in domain_facets.items():
    domain_scores[domain_name] = facet_scores[members].mean(axis=1)

print("Facet and domain scores calculated")
print(f"Facet scores shape: {facet_scores.shape}")
print(f"Domain scores shape: {domain_scores.shape}")

# Item-level data with the facet and domain columns appended on the right.
data_with_facets = pd.concat([data, facet_scores, domain_scores], axis=1)
print(f"Combined data shape: {data_with_facets.shape}")

## Create Expanded Format Personality Descriptions

In [None]:
# Map numeric values to expanded format descriptions (same as Study 2)
def map_values(row):
    """Replace numeric item responses in *row* with their expanded-format text.

    For every key in ``expanded_scale`` the 1-based response stored in
    ``row[key]`` selects the entry ``expanded_scale[key][response - 1]``.
    Missing (NaN) responses are left untouched.

    Args:
        row: A pandas Series holding one participant's responses, indexed by
            item name. It must contain every key of ``expanded_scale``.

    Returns:
        A new object-dtype Series with the same index; *row* is not modified.

    Raises:
        KeyError: If a key of ``expanded_scale`` is absent from *row*.
        IndexError: If a response falls outside the item's list of labels.
    """
    # Copy as object dtype: the incoming row is numeric, and writing strings
    # into a numeric Series relies on silent upcasting, which pandas
    # deprecates (FutureWarning from 2.1, planned to become an error).
    mapped_row = row.astype(object)
    for key, labels in expanded_scale.items():
        response = row[key]
        if pd.notna(response):
            mapped_row[key] = labels[int(response) - 1]
    return mapped_row

# Translate every participant's numeric BFI-2 answers into their text form.
mapped_data = data_with_facets[sbfi_columns].apply(map_values, axis=1)

# One space-separated string per participant, items in the order bfi1..bfi60.
item_columns = [f'bfi{n}' for n in range(1, 61)]
mapped_data['combined_bfi2'] = mapped_data[item_columns].apply(
    lambda texts: ' '.join(texts), axis=1
)

# Carry the combined description over to the main dataset.
data_with_facets['combined_bfi2'] = mapped_data['combined_bfi2']

print("Expanded format personality descriptions created")
sample_text = data_with_facets.iloc[0]['combined_bfi2']
print(f"Sample description length: {len(sample_text)} characters")

## Multi-Model Simulation Configuration

In [None]:
# Sweep definition: every model below is run at every temperature below.
models_to_test = ['gpt-4', 'gpt-4o', 'llama', 'deepseek']
temperatures = [0.0, 1.0]  # sampling temperatures to compare
batch_size = 20  # kept small for stability

# Outputs of each run, filled in by the simulation cells further down.
expanded_format_results = {}
likert_format_results = {}

# One dict per participant (column name -> value).
participants_data = data_with_facets.to_dict(orient='records')
print(f"Prepared {len(participants_data)} participants for simulation")

## Expanded Format Simulations (BFI-2 to Mini-Marker)

In [None]:
print("Starting Expanded Format BFI-2 to Mini-Marker Simulations")
print("=" * 60)

# Run every model at every temperature. Each run's outcome is stored in
# expanded_format_results under "<model>_temp<temperature>": the results
# returned by the simulation on success, or {"error": message} on failure.
for model in models_to_test:
    for temperature in temperatures:
        key = f"{model}_temp{temperature}"
        print(f"\nRunning expanded format simulation: {model} with temperature {temperature}")

        config = SimulationConfig(
            model=model,
            temperature=temperature,
            batch_size=batch_size,
            max_workers=8
        )

        # Only the simulation call is guarded. Previously the reporting code
        # sat inside the same try, so an error there (e.g. dividing by zero on
        # an empty result list) replaced the stored results with an error dict.
        try:
            results = run_bfi_to_minimarker_simulation(
                participants_data=participants_data,
                config=config,
                output_dir="study_3_results/expanded_format"
            )
        except Exception as e:
            print(f"Error in expanded format simulation {model} temp {temperature}: {str(e)}")
            expanded_format_results[key] = {"error": str(e)}
        else:
            expanded_format_results[key] = results

            # An entry counts as failed when it is a dict carrying an 'error' key.
            total = len(results)
            failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
            success_rate = ((total - failed_count) / total) * 100 if total else 0.0

            print(f"Completed: {total} participants, {failed_count} failed, {success_rate:.1f}% success rate")

print(f"\nExpanded format simulations completed. Results: {list(expanded_format_results.keys())}")

## Likert Format Simulations (Alternative Format)

Study 3 also tests a more traditional Likert-scale format for comparison.

In [None]:
# Build a short first-person description from the five domain scores
def create_likert_description(row):
    """Return a five-sentence self-description derived from domain scores.

    Each domain score in *row* is turned into "I am <qualifier> <phrase>",
    where the qualifier depends on the score: "very" (>= 4.5), "quite"
    (>= 3.5), "moderately" (>= 2.5), "somewhat" (>= 1.5), otherwise
    "not very". The sentences are joined with ". " and end with a period.
    """
    domain_phrases = (
        ('Extraversion', 'outgoing and sociable'),
        ('Agreeableness', 'compassionate and cooperative'),
        ('Conscientiousness', 'organized and responsible'),
        ('Negative_Emotionality', 'emotionally reactive'),
        ('Open_Mindedness', 'curious and creative'),
    )
    # (lower bound, qualifier), checked from the highest bound downwards.
    cutoffs = (
        (4.5, "very"),
        (3.5, "quite"),
        (2.5, "moderately"),
        (1.5, "somewhat"),
    )

    sentences = []
    for domain, phrase in domain_phrases:
        score = row[domain]
        # First bound the score reaches wins; below all bounds -> "not very".
        level = next((word for floor, word in cutoffs if score >= floor), "not very")
        sentences.append(f"I am {level} {phrase}")

    return '. '.join(sentences) + '.'

# Attach one Likert-style description per participant as a new column.
likert_text = data_with_facets.apply(create_likert_description, axis=1)
data_with_facets['likert_description'] = likert_text

print("Likert-style personality descriptions created")
print("Sample description:")
first_participant = data_with_facets.iloc[0]
print(first_participant['likert_description'])

In [None]:
# Snapshot the participants again so each record carries 'likert_description'.
participants_data_likert = data_with_facets.to_dict(orient='records')

print("Starting Likert Format BFI-2 to Mini-Marker Simulations")
print("=" * 60)

# Prompt generator handed to the batch runner for the Likert format
def likert_prompt_generator(personality_description):
    """Return the Mini-Marker prompt built from a Likert-style description.

    Delegates directly to ``mini_marker_prompt.get_prompt``.
    """
    import mini_marker_prompt
    return mini_marker_prompt.get_prompt(personality_description)

# Run every model at every temperature on the Likert-style descriptions.
# Each run's outcome is stored in likert_format_results under
# "<model>_temp<temperature>": the results returned by the simulation on
# success, or {"error": message} on failure.
for model in models_to_test:
    for temperature in temperatures:
        key = f"{model}_temp{temperature}"
        print(f"\nRunning Likert format simulation: {model} with temperature {temperature}")

        config = SimulationConfig(
            model=model,
            temperature=temperature,
            batch_size=batch_size,
            max_workers=8
        )

        # Only the simulation call is guarded. Previously the reporting code
        # sat inside the same try, so an error there (e.g. dividing by zero on
        # an empty result list) replaced the stored results with an error dict.
        try:
            results = run_batch_simulation(
                participants_data=participants_data_likert,
                prompt_generator=likert_prompt_generator,
                config=config,
                personality_key='likert_description',
                output_dir="study_3_results/likert_format",
                output_filename="bfi_to_minimarker_likert"
            )
        except Exception as e:
            print(f"Error in Likert format simulation {model} temp {temperature}: {str(e)}")
            likert_format_results[key] = {"error": str(e)}
        else:
            likert_format_results[key] = results

            # An entry counts as failed when it is a dict carrying an 'error' key.
            total = len(results)
            failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
            success_rate = ((total - failed_count) / total) * 100 if total else 0.0

            print(f"Completed: {total} participants, {failed_count} failed, {success_rate:.1f}% success rate")

print(f"\nLikert format simulations completed. Results: {list(likert_format_results.keys())}")

## Retry Failed Participants

In [None]:
# Second pass over the expanded-format runs: only runs that returned a list
# and contain at least one error entry are retried.
print("Retrying failed expanded format participants...")

from mini_marker_prompt import get_prompt

for key, results in expanded_format_results.items():
    if not isinstance(results, list):
        continue  # the whole run failed; nothing to retry per participant

    failed_count = sum(isinstance(r, dict) and 'error' in r for r in results)
    if failed_count <= 0:
        continue

    print(f"Retrying {failed_count} failed participants for expanded {key}")

    # Recover the run settings from the "<model>_temp<temperature>" key.
    key_parts = key.split('_temp')
    model = key_parts[0]
    temperature = float(key_parts[1])
    config = SimulationConfig(model=model, temperature=temperature, batch_size=batch_size)

    updated_results = retry_failed_participants(
        results=results,
        participants_data=participants_data,
        prompt_generator=get_prompt,
        config=config,
        personality_key='combined_bfi2',
    )

    # Replace the stored run with the retried version and save it.
    expanded_format_results[key] = updated_results
    save_simulation_results(
        updated_results,
        "study_3_results/expanded_format",
        "bfi_to_minimarker_retried",
        config,
    )

In [None]:
# Second pass over the Likert-format runs: only runs that returned a list
# and contain at least one error entry are retried.
print("Retrying failed Likert format participants...")

for key, results in likert_format_results.items():
    if not isinstance(results, list):
        continue  # the whole run failed; nothing to retry per participant

    failed_count = sum(isinstance(r, dict) and 'error' in r for r in results)
    if failed_count <= 0:
        continue

    print(f"Retrying {failed_count} failed participants for Likert {key}")

    # Recover the run settings from the "<model>_temp<temperature>" key.
    key_parts = key.split('_temp')
    model = key_parts[0]
    temperature = float(key_parts[1])
    config = SimulationConfig(model=model, temperature=temperature, batch_size=batch_size)

    updated_results = retry_failed_participants(
        results=results,
        participants_data=participants_data_likert,
        prompt_generator=likert_prompt_generator,
        config=config,
        personality_key='likert_description',
    )

    # Replace the stored run with the retried version and save it.
    likert_format_results[key] = updated_results
    save_simulation_results(
        updated_results,
        "study_3_results/likert_format",
        "bfi_to_minimarker_likert_retried",
        config,
    )

print("Retry process completed")

## Results Summary and Comparison

In [None]:
# Summarize every run of both formats
def _print_run_summaries(results_by_key):
    """Print one status line per run in *results_by_key*.

    A run stored as a list is reported with its total, successful and failed
    participant counts (an entry is failed when it is a dict with an 'error'
    key). Any other stored value is treated as a whole-run failure and its
    'error' message is printed.
    """
    for key, results in results_by_key.items():
        if not isinstance(results, list):
            print(f"{key}: FAILED - {results.get('error', 'Unknown error')}")
            continue

        total = len(results)
        failed = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
        successful = total - failed
        # An empty run has no success rate to compute; report 0.0 instead of
        # raising ZeroDivisionError.
        success_rate = (successful / total) * 100 if total else 0.0

        print(f"{key}: Total={total}, Successful={successful}, Failed={failed}, Success Rate={success_rate:.1f}%")


print("Expanded Format Simulation Results:")
print("=" * 50)
_print_run_summaries(expanded_format_results)

print("\nLikert Format Simulation Results:")
print("=" * 50)
_print_run_summaries(likert_format_results)

In [None]:
# Analyze correlations between facet scores and simulation outcomes
def analyze_facet_correlations(results_dict, facet_scores, model_name):
    """Correlate each facet score with each simulated trait rating.

    Args:
        results_dict: Mapping of run key ("<model>_temp<temperature>") to that
            run's results. Only list-valued runs are analysed; each element is
            expected to be a dict of trait -> score, or a dict containing an
            'error' key for a failed participant.
        facet_scores: DataFrame with one row per participant and one column
            per facet, in the same participant order as the result lists.
        model_name: Model whose runs are analysed. The key must be exactly
            ``model_name`` or start with ``"<model_name>_temp"``, so 'gpt-4'
            no longer also selects 'gpt-4o' runs.

    Returns:
        Dict mapping "<run key>_<trait>" to a dict of facet -> Pearson
        correlation. Facets whose correlation is NaN are omitted, as are
        traits left without any facet.

    Runs with 10 or fewer non-error results, or whose length differs from
    ``facet_scores`` (so rows cannot be matched by position), are skipped.
    """
    correlations = {}
    # Positional index so trait columns built below line up row by row.
    facets = facet_scores.reset_index(drop=True)
    n_participants = len(facets)
    run_prefix = f"{model_name}_temp"

    for key, results in results_dict.items():
        if not isinstance(results, list):
            continue
        if key != model_name and not key.startswith(run_prefix):
            continue

        is_valid = [not (isinstance(r, dict) and 'error' in r) for r in results]
        if sum(is_valid) <= 10:  # Minimum sample size
            continue
        if len(results) != n_participants:
            continue

        # Keep every score at its participant's position. Failed participants
        # and unparseable scores stay NaN instead of being dropped, which
        # previously shifted the remaining scores out of alignment (or made
        # the lengths differ so nothing was correlated at all).
        trait_scores = {}
        for position, response in enumerate(results):
            if not is_valid[position] or not isinstance(response, dict):
                continue
            for trait, score in response.items():
                column = trait_scores.setdefault(trait, [np.nan] * n_participants)
                try:
                    column[position] = float(score)
                except (ValueError, TypeError):
                    pass  # leave NaN for non-numeric scores

        for trait, scores in trait_scores.items():
            trait_series = pd.Series(scores, dtype=float)

            trait_correlations = {}
            for facet in facets.columns:
                # Series.corr uses only rows where both values are present.
                corr = facets[facet].corr(trait_series)
                if not np.isnan(corr):
                    trait_correlations[facet] = corr

            if trait_correlations:
                correlations[f"{key}_{trait}"] = trait_correlations

    return correlations

# Example: facet-trait correlations for the GPT-4 expanded-format runs.
sample_correlations = analyze_facet_correlations(expanded_format_results, facet_scores, 'gpt-4')

if not sample_correlations:
    print("No valid correlations found - check data alignment")
else:
    print("Sample Facet-Trait Correlations (GPT-4):")
    # First three run/trait entries, five strongest facets (by |r|) for each.
    for run_trait, facet_corrs in list(sample_correlations.items())[:3]:
        print(f"\n{run_trait}:")
        strongest = sorted(facet_corrs.items(), key=lambda pair: abs(pair[1]), reverse=True)
        for facet, corr in strongest[:5]:
            print(f"  {facet}: {corr:.3f}")

# Write the processed tables next to the simulation outputs.
results_dir = Path("study_3_results")
results_dir.mkdir(exist_ok=True)

for frame, filename in (
    (data_with_facets, 'study3_data_with_facets.csv'),
    (facet_scores, 'facet_scores.csv'),
    (domain_scores, 'domain_scores.csv'),
):
    frame.to_csv(results_dir / filename, index=False)

print(f"Study 3 simulation completed. Results saved to {results_dir}/")