# Multi-Model Personality Simulation - Study 2

This notebook refactors the original Study 2 BFI-2 to Mini-Marker simulation to work with multiple LLM models using the unified portal.py interface.

## Models to Test
- GPT-4
- GPT-4o  
- Llama-3.3-70B-Instruct
- DeepSeek-V3

## Data Flow
1. Load and preprocess Soto BFI-2 data
2. Apply reverse coding to personality items
3. Map numeric responses to expanded format descriptions
4. Generate personality simulation prompts
5. Run simulations across multiple models
6. Save results for analysis

In [1]:
import pandas as pd
import sys
from pathlib import Path

# Add shared modules to path
sys.path.append('../shared')

from simulation_utils import (
    SimulationConfig, 
    run_bfi_to_minimarker_simulation,
    retry_failed_participants
)
from schema_bfi2 import expanded_scale
from mini_marker_prompt import get_prompt

## Data Loading and Preprocessing

In [2]:
# Load the Soto BFI-2 dataset
data_path = Path('../../raw_data/Soto_data.xlsx')
if not data_path.exists():
    print(f"Data file not found at {data_path}")
    print("Please ensure the raw_data/Soto_data.xlsx file exists in the project root")
    raise FileNotFoundError(f"Data file not found: {data_path}")

data = pd.read_excel(data_path, sheet_name='data')
print(f"Loaded data shape: {data.shape}")
data.head()

Loaded data shape: (470, 704)


Unnamed: 0,case_id,age,sex,ethnicity,rel_acquaintance,rel_friend,rel_roommate,rel_boygirlfriend,rel_relative,rel_other,...,tneo_n3_dep,tneo_n4_sel,tneo_n5_imp,tneo_n6_vul,tneo_o1_fan,tneo_o2_aes,tneo_o3_fee,tneo_o4_act,tneo_o5_ide,tneo_o6_val
0,1,27.0,M,2.0,,,,,,,...,51.25,40.181818,64.0,55.102041,46.639344,46.969697,66.7,57.065217,41.984127,58.039216
1,2,26.0,M,3.0,,,,,,,...,69.632353,60.636364,66.272727,65.306122,54.836066,56.439394,51.7,51.630435,51.904762,45.784314
2,3,24.0,F,4.0,,,,,,,...,60.441176,74.272727,54.909091,65.306122,75.327869,56.439394,56.7,40.76087,51.904762,58.039216
3,4,33.0,M,3.0,,1.0,,,,,...,67.794118,58.363636,64.0,52.55102,54.836066,50.757576,36.7,65.217391,63.809524,58.039216
4,5,23.0,F,5.0,,,,,,,...,62.279412,67.454545,41.272727,60.204082,50.737705,48.863636,49.2,46.195652,38.015873,38.431373


In [3]:
# Generate column names for TDA (Mini-Marker) and BFI-2 items
tda_columns = [f"tda{i}" for i in range(1, 41)]
sbfi_columns = [f"bfi{i}" for i in range(1, 61)]
selected_columns = tda_columns + sbfi_columns

print(f"Original data shape: {data.shape}")

# Remove rows with missing values in the selected columns
data = data.dropna(subset=selected_columns)
print(f"Data shape after removing missing values: {data.shape}")

Original data shape: (470, 704)
Data shape after removing missing values: (438, 704)


In [4]:
# Reverse coding map for BFI-2 items
reverse_coding_map = {
    'bfi1': 'bfi1', 'bfi2': 'bfi2', 'bfi3': 'bfi3R', 'bfi4': 'bfi4R', 'bfi5': 'bfi5R',
    'bfi6': 'bfi6', 'bfi7': 'bfi7', 'bfi8': 'bfi8R', 'bfi9': 'bfi9R', 'bfi10': 'bfi10',
    'bfi11': 'bfi11R', 'bfi12': 'bfi12R', 'bfi13': 'bfi13', 'bfi14': 'bfi14', 'bfi15': 'bfi15',
    'bfi16': 'bfi16R', 'bfi17': 'bfi17R', 'bfi18': 'bfi18', 'bfi19': 'bfi19', 'bfi20': 'bfi20',
    'bfi21': 'bfi21', 'bfi22': 'bfi22R', 'bfi23': 'bfi23R', 'bfi24': 'bfi24R', 'bfi25': 'bfi25R',
    'bfi26': 'bfi26R', 'bfi27': 'bfi27', 'bfi28': 'bfi28R', 'bfi29': 'bfi29R', 'bfi30': 'bfi30R',
    'bfi31': 'bfi31R', 'bfi32': 'bfi32', 'bfi33': 'bfi33', 'bfi34': 'bfi34', 'bfi35': 'bfi35',
    'bfi36': 'bfi36R', 'bfi37': 'bfi37R', 'bfi38': 'bfi38', 'bfi39': 'bfi39', 'bfi40': 'bfi40',
    'bfi41': 'bfi41', 'bfi42': 'bfi42R', 'bfi43': 'bfi43', 'bfi44': 'bfi44R', 'bfi45': 'bfi45R',
    'bfi46': 'bfi46', 'bfi47': 'bfi47R', 'bfi48': 'bfi48R', 'bfi49': 'bfi49R', 'bfi50': 'bfi50R',
    'bfi51': 'bfi51R', 'bfi52': 'bfi52', 'bfi53': 'bfi53', 'bfi54': 'bfi54', 'bfi55': 'bfi55R',
    'bfi56': 'bfi56', 'bfi57': 'bfi57', 'bfi58': 'bfi58R', 'bfi59': 'bfi59', 'bfi60': 'bfi60'
}

# Apply reverse coding
for key, value in reverse_coding_map.items():
    if value.endswith('R'):  # Reverse coded
        data[key] = 6 - data[key]
    # else: keep original value

print("Reverse coding applied successfully")

Reverse coding applied successfully


In [5]:
# Map numeric values to expanded format descriptions
def map_values(row):
    mapped_row = row.copy()
    for key in expanded_scale:
        if pd.notna(row[key]):  # Check if the value is not NaN
            index = int(row[key]) - 1  # Convert to 0-index
            mapped_row[key] = expanded_scale[key][index]  # Replace with corresponding string
    return mapped_row

# Apply mapping to BFI columns
mapped_data = data[sbfi_columns].apply(map_values, axis=1)

# Create combined BFI-2 description
mapped_data['combined_bfi2'] = mapped_data[[f'bfi{i}' for i in range(1, 61)]].apply(
    lambda row: ' '.join(row), axis=1
)

# Add combined description to original data
data['combined_bfi2'] = mapped_data['combined_bfi2']

print("Personality descriptions created successfully")
print(f"Final data shape: {data.shape}")

Personality descriptions created successfully
Final data shape: (438, 705)


  mapped_row[key] = expanded_scale[key][index]  # Replace with corresponding string
  mapped_row[key] = expanded_scale[key][index]  # Replace with corresponding string
  mapped_row[key] = expanded_scale[key][index]  # Replace with corresponding string
  mapped_row[key] = expanded_scale[key][index]  # Replace with corresponding string
  mapped_row[key] = expanded_scale[key][index]  # Replace with corresponding string


In [6]:
# Preview a personality description
print("Sample personality description:")
print(data.iloc[0]['combined_bfi2'][:500] + "...")

Sample personality description:
I am very outgoing, sociable. I am very compassionate almost always soft-hearted. I am fairly organized. I am somewhat relaxed handle stress somewhat well. I have some artistic interests. I am quite assertive. I am very respectful almost always treat others with respect. I am often lazy. I stay very optimistic after experiencing a setback. I am curious about few things. I often feel excited or eager. I rarely find fault with others. I am very dependable steady. I am fairly moody often have up an...


## Multi-Model Simulation Configuration

In [8]:
# Configuration for different models and temperatures
models_to_test = ['gpt-4', 'gpt-4o', 'llama', 'deepseek']
temperatures = [0.0, 1.0]  # Test both deterministic and stochastic responses
batch_size = 25  # Smaller batch size for stability across different APIs

# Create participant data list from DataFrame
participants_data = data.to_dict('records')
print(f"Prepared {len(participants_data)} participants for simulation")


## temp for testing 
models_to_test = ['deepseek']
## take the first 30 participants for testing
participants_data = participants_data[:30]

Prepared 438 participants for simulation


## Run Simulations for All Models

In [9]:
# Run simulations for all model-temperature combinations
all_results = {}

for model in models_to_test:
    for temperature in temperatures:
        print(f"\n{'='*60}")
        print(f"Starting simulation: {model} with temperature {temperature}")
        print(f"{'='*60}")
        
        config = SimulationConfig(
            model=model,
            temperature=temperature,
            batch_size=batch_size,
            max_workers=10
        )
        
        try:
            results = run_bfi_to_minimarker_simulation(
                participants_data=participants_data,
                config=config,
                output_dir="study_2_results"
            )
            
            # Store results
            key = f"{model}_temp{temperature}"
            all_results[key] = results
            
            # Check for any failed participants
            failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
            if failed_count > 0:
                print(f"Warning: {failed_count} participants failed. Consider retrying.")
                
            print(f"Completed simulation: {model} with temperature {temperature}")
            
        except Exception as e:
            print(f"Error in simulation {model} temp {temperature}: {str(e)}")
            all_results[f"{model}_temp{temperature}"] = {"error": str(e)}

print(f"\nCompleted all simulations. Results keys: {list(all_results.keys())}")


Starting simulation: deepseek with temperature 0.0
Starting simulation for 30 participants using deepseek
Temperature: 0.0, Batch size: 25
Processing participants 0 to 24
Completed batch 0 to 24
Processing participants 25 to 29
Completed batch 25 to 29
Results saved to study_2_results/bfi_to_minimarker_deepseek_temp0_0.json
Completed simulation: deepseek with temperature 0.0

Starting simulation: deepseek with temperature 1.0
Starting simulation for 30 participants using deepseek
Temperature: 1.0, Batch size: 25
Processing participants 0 to 24
Completed batch 0 to 24
Processing participants 25 to 29
Completed batch 25 to 29
Results saved to study_2_results/bfi_to_minimarker_deepseek_temp1_0.json
Completed simulation: deepseek with temperature 1.0

Completed all simulations. Results keys: ['deepseek_temp0.0', 'deepseek_temp1.0']


## Retry Failed Participants (if any)

In [11]:
# Retry any failed participants
for key, results in all_results.items():
    if isinstance(results, list):
        failed_count = sum(1 for r in results if isinstance(r, dict) and 'error' in r)
        if failed_count > 0:
            print(f"Retrying {failed_count} failed participants for {key}")
            
            # Extract model and temperature from key
            model = key.split('_temp')[0]
            temperature = float(key.split('_temp')[1])
            
            config = SimulationConfig(
                model=model,
                temperature=temperature,
                batch_size=batch_size
            )
            
            updated_results = retry_failed_participants(
                results=results,
                participants_data=participants_data,
                prompt_generator=get_prompt,  # Use imported get_prompt function
                config=config,
                personality_key='combined_bfi2'
            )
            
            all_results[key] = updated_results
            
            # Save updated results
            from simulation_utils import save_simulation_results
            save_simulation_results(updated_results, "study_2_results", "bfi_to_minimarker_retried", config)

print("Retry process completed")

Retry process completed


## Results Summary

In [12]:
# Analyze results summary
print("Simulation Results Summary:")
print("=" * 50)

for key, results in all_results.items():
    if isinstance(results, list):
        total_participants = len(results)
        successful = sum(1 for r in results if not (isinstance(r, dict) and 'error' in r))
        failed = total_participants - successful
        success_rate = (successful / total_participants) * 100
        
        print(f"{key}:")
        print(f"  Total: {total_participants}, Successful: {successful}, Failed: {failed}")
        print(f"  Success Rate: {success_rate:.1f}%")
        print()
    else:
        print(f"{key}: FAILED - {results.get('error', 'Unknown error')}")
        print()

Simulation Results Summary:
deepseek_temp0.0:
  Total: 30, Successful: 30, Failed: 0
  Success Rate: 100.0%

deepseek_temp1.0:
  Total: 30, Successful: 30, Failed: 0
  Success Rate: 100.0%



## Save Preprocessed Data

In [13]:
# Save the preprocessed data for reference
output_path = Path('study_2_results')
output_path.mkdir(exist_ok=True)

data.to_csv(output_path / 'study2_preprocessed_data.csv', index=False)
print(f"Preprocessed data saved to {output_path / 'study2_preprocessed_data.csv'}")

Preprocessed data saved to study_2_results/study2_preprocessed_data.csv


# Multi-Model Results Analysis

## Overview
This section provides comprehensive analysis tools for comparing results across multiple LLM models. The analysis includes:

1. **Data Processing**: Convert JSON results to structured DataFrames
2. **Descriptive Statistics**: Compare response patterns across models and temperatures
3. **Correlation Analysis**: Assess agreement between models and with empirical data
4. **Visualization**: Create comprehensive plots for model comparison
5. **Reliability Assessment**: Evaluate consistency within and across models
6. **Psychometric Validation**: Test personality structure preservation

## Usage
- Run all previous cells to generate simulation results
- Results should be available in `all_results` dictionary
- Empirical Mini-Marker data available in original dataset (TDA columns)

In [2]:
## 7. Export Results for Further Analysis

def export_results_for_analysis(results_df, summary_df, trait_stats_df, 
                               corr_matrix, validation_results):
    """Export all analysis results in formats suitable for further research."""
    
    output_dir = Path('study_2_results')
    output_dir.mkdir(exist_ok=True)
    
    print("Exporting analysis results...")
    
    # 1. Export main results DataFrames
    if not results_df.empty:
        results_df.to_csv(output_dir / 'results_long_format.csv', index=False)
        print(f"✓ Long format results: {len(results_df)} rows")
    
    if not summary_df.empty:
        summary_df.to_csv(output_dir / 'results_wide_format.csv', index=False)
        print(f"✓ Wide format results: {len(summary_df)} rows")
    
    if not trait_stats_df.empty:
        trait_stats_df.to_csv(output_dir / 'trait_statistics.csv', index=False)
        print(f"✓ Trait statistics: {len(trait_stats_df)} rows")
    
    # 2. Export correlation matrices
    if corr_matrix is not None and not corr_matrix.empty:
        corr_matrix.to_csv(output_dir / 'inter_model_correlations.csv')
        print(f"✓ Correlation matrix: {corr_matrix.shape}")
    
    # 3. Export validation results
    if validation_results:
        # Flatten validation results for CSV export
        validation_flat = []
        for model_key, validation in validation_results.items():
            base_record = {
                'model_temp': model_key,
                'mean_correlation': validation['mean_correlation'],
                'median_correlation': validation['median_correlation'],
                'std_correlation': validation['std_correlation'],
                'n_traits': validation['n_traits']
            }
            
            # Add trait-level details
            for trait, details in validation['trait_details'].items():
                record = base_record.copy()
                record.update({
                    'trait': trait,
                    'trait_correlation': details['correlation'],
                    'trait_p_value': details['p_value'],
                    'trait_n_participants': details['n_participants'],
                    'empirical_mean': details['empirical_mean'],
                    'llm_mean': details['llm_mean'],
                    'empirical_std': details['empirical_std'],
                    'llm_std': details['llm_std']
                })
                validation_flat.append(record)
        
        validation_df = pd.DataFrame(validation_flat)
        validation_df.to_csv(output_dir / 'empirical_validation_details.csv', index=False)
        print(f"✓ Validation details: {len(validation_df)} rows")
    
    # 4. Create analysis summary for R/other tools
    import json
    summary_stats = {
        'analysis_timestamp': pd.Timestamp.now().isoformat(),
        'n_participants': len(summary_df) if not summary_df.empty else 0,
        'n_models': len(results_df['model'].unique()) if not results_df.empty else 0,
        'n_temperatures': len(results_df['temperature'].unique()) if not results_df.empty else 0,
        'n_traits': len([col for col in results_df.columns if col not in ['participant_id', 'model', 'temperature']]) if not results_df.empty else 0
    }
    
    with open(output_dir / 'analysis_metadata.json', 'w') as f:
        json.dump(summary_stats, f, indent=2)
    print(f"✓ Analysis metadata saved")
    
    # 5. Create R-ready format
    if not results_df.empty:
        # Reshape for R analysis
        r_format = results_df.melt(id_vars=['participant_id', 'model', 'temperature'],
                                  var_name='trait', value_name='response')
        r_format.to_csv(output_dir / 'results_r_format.csv', index=False)
        print(f"✓ R-ready format: {len(r_format)} rows")
    
    print(f"\n📁 All results exported to: {output_dir}/")
    print("Files available for further analysis:")
    for file in sorted(output_dir.glob('*.csv')):
        print(f"  - {file.name}")
    for file in sorted(output_dir.glob('*.json')):
        print(f"  - {file.name}")

# Export all results
export_results_for_analysis(results_df, summary_df, trait_stats_df, 
                           corr_matrix, validation_results)

print("\n🎉 MULTI-MODEL ANALYSIS COMPLETE! 🎉")
print("\nNext steps:")
print("1. Review the comprehensive report: study_2_results/comprehensive_analysis_report.md")
print("2. Examine visualizations: study_2_results/multi_model_analysis.png")
print("3. Use exported CSV files for statistical analysis in R or Python")
print("4. Scale up to full dataset with all models when satisfied with results")
print("5. Compare with Studies 3 and 4 using the same analysis framework")

NameError: name 'results_df' is not defined

In [None]:
## 6. Comprehensive Report Generation

def generate_comprehensive_report(all_results, results_df, trait_stats_df, 
                                 corr_matrix, temp_consistency, trait_agreement,
                                 validation_results, participants_data):
    \"\"\"Generate a comprehensive analysis report.\"\"\"
    
    report = []\n    report.append(\"# Multi-Model Personality Simulation Analysis Report\")\n    report.append(f\"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\")\n    report.append(\"\\n\" + \"=\"*60)\n    \n    # Executive Summary\n    report.append(\"\\n## Executive Summary\")\n    report.append(f\"- **Participants Analyzed**: {len(participants_data)}\")\n    report.append(f\"- **Models Tested**: {list(results_df['model'].unique()) if not results_df.empty else 'None'}\")\n    report.append(f\"- **Temperature Settings**: {list(results_df['temperature'].unique()) if not results_df.empty else 'None'}\")\n    \n    if not results_df.empty:\n        total_responses = len(results_df)\n        successful_responses = len(results_df[results_df.notna().any(axis=1)])\n        success_rate = (successful_responses / total_responses) * 100 if total_responses > 0 else 0\n        report.append(f\"- **Overall Success Rate**: {success_rate:.1f}%\")\n    \n    # Model Performance Summary\n    if not trait_stats_df.empty:\n        report.append(\"\\n## Model Performance Summary\")\n        \n        performance_summary = trait_stats_df.groupby(['model', 'temperature']).agg({\n            'count': 'mean',\n            'mean': ['mean', 'std'],\n            'std': 'mean'\n        }).round(3)\n        \n        report.append(\"\\n### Response Completeness and Quality\")\n        for (model, temp), group in trait_stats_df.groupby(['model', 'temperature']):\n            avg_completeness = group['count'].mean() / len(participants_data)\n            avg_response = group['mean'].mean()\n            avg_variability = group['std'].mean()\n            \n            report.append(f\"- **{model} (temp={temp})**: {avg_completeness:.2f} completeness, \"\n                         f\"{avg_response:.2f} avg response, {avg_variability:.2f} variability\")\n    \n    # Temperature Consistency\n    if temp_consistency is not None and not temp_consistency.empty:\n        report.append(\"\\n### Temperature Consistency\")\n        for model, stats in temp_consistency.iterrows():\n            report.append(f\"- **{model}**: r = {stats['mean_consistency']:.3f} \"\n                         f\"(±{stats['std_consistency']:.3f}, n={stats['n_traits']})\")\n    \n    # Inter-Model Agreement\n    if corr_matrix is not None and not corr_matrix.empty:\n        report.append(\"\\n### Inter-Model Agreement\")\n        \n        # Calculate average correlations (excluding diagonal)\n        avg_correlations = []\n        for i, col1 in enumerate(corr_matrix.columns):\n            for j, col2 in enumerate(corr_matrix.columns):\n                if i != j and not pd.isna(corr_matrix.loc[col1, col2]):\n                    avg_correlations.append(corr_matrix.loc[col1, col2])\n        \n        if avg_correlations:\n            overall_agreement = np.mean(avg_correlations)\n            report.append(f\"- **Overall Inter-Model Agreement**: r = {overall_agreement:.3f}\")\n            \n            # Find best and worst agreements\n            max_corr = np.max(avg_correlations)\n            min_corr = np.min(avg_correlations)\n            report.append(f\"- **Range**: {min_corr:.3f} to {max_corr:.3f}\")\n    \n    # Empirical Validation\n    if validation_results:\n        report.append(\"\\n## Empirical Validation Results\")\n        \n        best_model = None\n        best_correlation = -1\n        \n        for model_key, validation in validation_results.items():\n            mean_corr = validation['mean_correlation']\n            n_traits = validation['n_traits']\n            \n            report.append(f\"- **{model_key}**: r = {mean_corr:.3f} \"\n                         f\"(across {n_traits} traits)\")\n            \n            if mean_corr > best_correlation:\n                best_correlation = mean_corr\n                best_model = model_key\n        \n        if best_model:\n            report.append(f\"\\n**Best Empirical Match**: {best_model} (r = {best_correlation:.3f})\")\n    \n    # Trait-Level Insights\n    if trait_agreement is not None and not trait_agreement.empty:\n        report.append(\"\\n## Trait-Level Analysis\")\n        \n        # Most reliable traits\n        top_traits = trait_agreement.sort_values('mean_agreement', ascending=False).head(5)\n        report.append(\"\\n### Most Reliable Traits (High Inter-Model Agreement):\")\n        for trait, stats in top_traits.iterrows():\n            report.append(f\"- **{trait}**: r = {stats['mean_agreement']:.3f} \"\n                         f\"(±{stats['std_agreement']:.3f})\")\n        \n        # Most variable traits\n        bottom_traits = trait_agreement.sort_values('mean_agreement', ascending=True).head(5)\n        report.append(\"\\n### Most Variable Traits (Low Inter-Model Agreement):\")\n        for trait, stats in bottom_traits.iterrows():\n            report.append(f\"- **{trait}**: r = {stats['mean_agreement']:.3f} \"\n                         f\"(±{stats['std_agreement']:.3f})\")\n    \n    # Recommendations\n    report.append(\"\\n## Recommendations\")\n    \n    if not results_df.empty:\n        models = results_df['model'].unique()\n        if len(models) > 1:\n            report.append(\"\\n### Model Selection:\")\n            \n            # Based on empirical validation\n            if validation_results:\n                best_empirical = max(validation_results.keys(), \n                                   key=lambda x: validation_results[x]['mean_correlation'])\n                report.append(f\"- For **empirical accuracy**: Use {best_empirical}\")\n            \n            # Based on consistency\n            if temp_consistency is not None and not temp_consistency.empty:\n                most_consistent = temp_consistency['mean_consistency'].idxmax()\n                report.append(f\"- For **consistency**: Use {most_consistent}\")\n        \n        # Temperature recommendations\n        if len(results_df['temperature'].unique()) > 1:\n            report.append(\"\\n### Temperature Settings:\")\n            report.append(\"- Use **temperature = 0.0** for deterministic, consistent responses\")\n            report.append(\"- Use **temperature = 1.0** for diverse, creative responses\")\n            report.append(\"- Consider ensemble approaches combining both temperatures\")\n    \n    # Technical Notes\n    report.append(\"\\n## Technical Notes\")\n    report.append(\"- All correlations computed using Pearson's correlation coefficient\")\n    report.append(\"- Statistical significance testing performed where applicable\")\n    report.append(\"- Missing data handled through pairwise deletion\")\n    report.append(\"- Results based on Mini-Marker 40-item personality assessment\")\n    \n    # Limitations\n    report.append(\"\\n## Limitations\")\n    report.append(f\"- Analysis based on {len(participants_data)} participants (subset for testing)\")\n    report.append(\"- Results may vary with different personality descriptions\")\n    report.append(\"- Model performance may depend on prompt engineering\")\n    report.append(\"- Cross-cultural validity not assessed\")\n    \n    return \"\\n\".join(report)\n\n# Generate comprehensive report\nfinal_report = generate_comprehensive_report(\n    all_results, results_df, trait_stats_df, \n    corr_matrix, temp_consistency, trait_agreement,\n    validation_results, participants_data\n)\n\nprint(final_report)\n\n# Save report to file\nwith open('study_2_results/comprehensive_analysis_report.md', 'w') as f:\n    f.write(final_report)\n\nprint(\"\\n\" + \"=\"*60)\nprint(\"ANALYSIS COMPLETE\")\nprint(\"Report saved to: study_2_results/comprehensive_analysis_report.md\")\nprint(\"Visualizations saved to: study_2_results/multi_model_analysis.png\")\nprint(\"=\"*60)

In [None]:
## 5. Empirical Validation Against Human Data

def validate_against_empirical_data(results_df, data, participants_data):
    \"\"\"Compare LLM responses with empirical human Mini-Marker data.\"\"\"
    
    if results_df.empty:
        print("No LLM results for empirical validation")
        return None
    
    # TDA columns represent empirical Mini-Marker responses (1-9 scale)
    tda_columns = [f"tda{i}" for i in range(1, 41)]
    
    # Check if we have empirical data
    available_tda = [col for col in tda_columns if col in data.columns]
    if not available_tda:
        print("No empirical TDA (Mini-Marker) data available for validation")
        return None
    
    print(f"=== EMPIRICAL VALIDATION ===")
    print(f"Available empirical data: {len(available_tda)} Mini-Marker traits")
    
    # Mini-Marker trait names corresponding to TDA columns
    trait_names = [
        'Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep',
        'Disorganized', 'Efficient', 'Energetic', 'Envious', 'Extraverted', 'Fretful', 'Harsh',
        'Imaginative', 'Inefficient', 'Intellectual', 'Jealous', 'Kind', 'Moody', 'Organized',
        'Philosophical', 'Practical', 'Quiet', 'Relaxed', 'Rude', 'Shy', 'Sloppy', 'Sympathetic',
        'Systematic', 'Talkative', 'Temperamental', 'Touchy', 'Uncreative', 'Unenvious',
        'Unintellectual', 'Unsympathetic', 'Warm', 'Withdrawn'
    ]
    
    # Get empirical data for participants used in simulation
    empirical_subset = data.iloc[:len(participants_data)][available_tda]
    
    validation_results = {}
    
    # For each model-temperature combination
    for model in results_df['model'].unique():
        for temp in results_df['temperature'].unique():
            subset = results_df[(results_df['model'] == model) & (results_df['temperature'] == temp)]
            
            if subset.empty:
                continue
                
            key = f'{model}_temp{temp}'
            trait_correlations = []
            trait_details = {}
            
            # Compare each trait
            for i, trait in enumerate(trait_names[:len(available_tda)]):
                if trait in subset.columns:
                    # Get LLM responses
                    llm_responses = subset[['participant_id', trait]].dropna()
                    
                    # Get corresponding empirical data
                    empirical_values = []
                    llm_values = []
                    
                    for _, row in llm_responses.iterrows():
                        participant_id = int(row['participant_id'])
                        if participant_id < len(empirical_subset):
                            emp_value = empirical_subset.iloc[participant_id, i]
                            if pd.notna(emp_value) and pd.notna(row[trait]):
                                empirical_values.append(emp_value)
                                llm_values.append(row[trait])
                    
                    # Calculate correlation if we have enough data
                    if len(empirical_values) > 2:
                        corr, p_value = pearsonr(empirical_values, llm_values)
                        
                        if not np.isnan(corr):
                            trait_correlations.append(corr)
                            trait_details[trait] = {
                                'correlation': corr,
                                'p_value': p_value,
                                'n_participants': len(empirical_values),
                                'empirical_mean': np.mean(empirical_values),
                                'llm_mean': np.mean(llm_values),
                                'empirical_std': np.std(empirical_values),
                                'llm_std': np.std(llm_values)
                            }
            
            # Store validation results
            if trait_correlations:
                validation_results[key] = {
                    'mean_correlation': np.mean(trait_correlations),
                    'median_correlation': np.median(trait_correlations),
                    'std_correlation': np.std(trait_correlations),
                    'n_traits': len(trait_correlations),
                    'trait_details': trait_details
                }
    
    # Display results
    validation_df = pd.DataFrame({k: {
        'mean_corr': v['mean_correlation'],
        'median_corr': v['median_correlation'],
        'std_corr': v['std_correlation'],
        'n_traits': v['n_traits']
    } for k, v in validation_results.items()}).T
    
    if not validation_df.empty:
        print("\\nValidation against empirical data:")
        print(validation_df.round(3))
        
        # Find best performing model
        best_model = validation_df['mean_corr'].idxmax()
        print(f"\\nBest performing model: {best_model} (r = {validation_df.loc[best_model, 'mean_corr']:.3f})\")\n        
        # Show trait-level details for best model
        if best_model in validation_results:
            print(f\"\\nTrait-level correlations for {best_model}:\")\n            trait_details = validation_results[best_model]['trait_details']\n            trait_corr_df = pd.DataFrame({\n                trait: {\n                    'correlation': details['correlation'],\n                    'p_value': details['p_value'],\n                    'n_participants': details['n_participants']\n                } for trait, details in trait_details.items()\n            }).T\n            \n            # Sort by correlation strength\n            trait_corr_sorted = trait_corr_df.sort_values('correlation', ascending=False)\n            print(trait_corr_sorted.head(10).round(3))\n    \n    return validation_results, validation_df\n\n# Run empirical validation\nvalidation_results, validation_summary = validate_against_empirical_data(results_df, data, participants_data)

In [None]:
## 4. Correlation Analysis and Model Agreement

def analyze_model_correlations(results_df, summary_df):
    \"\"\"Comprehensive correlation analysis between models and with empirical data.\"\"\"
    
    if results_df.empty:
        print("No results for correlation analysis")
        return None
    
    trait_names = [col for col in results_df.columns if col not in ['participant_id', 'model', 'temperature']]
    
    print("=== MODEL CORRELATION ANALYSIS ===")
    
    # 1. Inter-model correlations
    model_combinations = []
    correlations_matrix = {}
    
    models = results_df['model'].unique()
    temps = results_df['temperature'].unique()
    
    for model1 in models:
        for temp1 in temps:
            key1 = f'{model1}_temp{temp1}'
            subset1 = results_df[(results_df['model'] == model1) & (results_df['temperature'] == temp1)]
            
            if subset1.empty:
                continue
                
            correlations_matrix[key1] = {}
            
            for model2 in models:
                for temp2 in temps:
                    key2 = f'{model2}_temp{temp2}'
                    subset2 = results_df[(results_df['model'] == model2) & (results_df['temperature'] == temp2)]
                    
                    if subset2.empty:
                        correlations_matrix[key1][key2] = np.nan
                        continue
                    
                    # Calculate correlations for each trait
                    trait_correlations = []
                    for trait in trait_names:
                        if trait in subset1.columns and trait in subset2.columns:
                            # Merge on participant_id to align responses
                            merged = pd.merge(subset1[['participant_id', trait]], 
                                            subset2[['participant_id', trait]], 
                                            on='participant_id', suffixes=('_1', '_2'))
                            
                            if len(merged) > 1:
                                # Remove rows with NaN values
                                clean_data = merged.dropna()
                                if len(clean_data) > 1:
                                    corr, _ = pearsonr(clean_data[f'{trait}_1'], clean_data[f'{trait}_2'])
                                    if not np.isnan(corr):
                                        trait_correlations.append(corr)
                    
                    # Average correlation across traits
                    if trait_correlations:
                        avg_correlation = np.mean(trait_correlations)
                        correlations_matrix[key1][key2] = avg_correlation
                    else:
                        correlations_matrix[key1][key2] = np.nan
    
    # Convert to DataFrame for visualization
    corr_df = pd.DataFrame(correlations_matrix)
    
    print(f"Inter-model correlation matrix ({len(corr_df)}x{len(corr_df.columns)}):")
    print(corr_df.round(3))
    
    # 2. Temperature consistency within models
    print("\\n=== TEMPERATURE CONSISTENCY ===")
    temp_consistency = {}
    
    for model in models:
        temp0_data = results_df[(results_df['model'] == model) & (results_df['temperature'] == '0.0')]
        temp1_data = results_df[(results_df['model'] == model) & (results_df['temperature'] == '1.0')]
        
        if not temp0_data.empty and not temp1_data.empty:
            trait_consistencies = []
            
            for trait in trait_names:
                merged = pd.merge(temp0_data[['participant_id', trait]], 
                                temp1_data[['participant_id', trait]], 
                                on='participant_id', suffixes=('_t0', '_t1'))
                
                clean_data = merged.dropna()
                if len(clean_data) > 1:
                    corr, _ = pearsonr(clean_data[f'{trait}_t0'], clean_data[f'{trait}_t1'])
                    if not np.isnan(corr):
                        trait_consistencies.append(corr)
            
            if trait_consistencies:
                temp_consistency[model] = {
                    'mean_consistency': np.mean(trait_consistencies),
                    'std_consistency': np.std(trait_consistencies),
                    'n_traits': len(trait_consistencies)
                }
    
    consistency_df = pd.DataFrame(temp_consistency).T
    if not consistency_df.empty:
        print("Temperature consistency by model:")
        print(consistency_df.round(3))
    
    # 3. Trait-level analysis
    print("\\n=== TRAIT-LEVEL AGREEMENT ===")
    trait_agreement = {}
    
    for trait in trait_names[:10]:  # Analyze first 10 traits
        trait_corrs = []
        
        # Get all model-temperature combinations for this trait
        trait_data = {}
        for model in models:
            for temp in temps:
                subset = results_df[(results_df['model'] == model) & (results_df['temperature'] == temp)]
                if not subset.empty and trait in subset.columns:
                    trait_data[f'{model}_t{temp}'] = subset[['participant_id', trait]].dropna()
        
        # Calculate pairwise correlations
        combinations = list(trait_data.keys())
        for i in range(len(combinations)):
            for j in range(i+1, len(combinations)):
                data1 = trait_data[combinations[i]]
                data2 = trait_data[combinations[j]]
                
                merged = pd.merge(data1, data2, on='participant_id', suffixes=('_1', '_2'))
                if len(merged) > 1:
                    corr, _ = pearsonr(merged[f'{trait}_1'], merged[f'{trait}_2'])
                    if not np.isnan(corr):
                        trait_corrs.append(corr)
        
        if trait_corrs:
            trait_agreement[trait] = {
                'mean_agreement': np.mean(trait_corrs),
                'std_agreement': np.std(trait_corrs),
                'n_comparisons': len(trait_corrs)
            }
    
    trait_agreement_df = pd.DataFrame(trait_agreement).T
    if not trait_agreement_df.empty:
        print("Top traits by inter-model agreement:")
        sorted_traits = trait_agreement_df.sort_values('mean_agreement', ascending=False)
        print(sorted_traits.head().round(3))
    
    return corr_df, consistency_df, trait_agreement_df

# Run correlation analysis
corr_matrix, temp_consistency, trait_agreement = analyze_model_correlations(results_df, summary_df)

In [None]:
## 3. Visualization Suite

def create_comprehensive_plots(results_df, trait_stats_df, summary_df):
    \"\"\"Create a comprehensive set of visualization plots.\"\"\"
    
    if results_df.empty:
        print("No data to plot")
        return
    
    # Set up the plotting layout
    fig = plt.figure(figsize=(20, 16))
    
    # Get trait names for plotting
    trait_names = [col for col in results_df.columns if col not in ['participant_id', 'model', 'temperature']]
    
    # 1. Response Distribution Comparison
    plt.subplot(3, 3, 1)
    if not results_df.empty:
        for model in results_df['model'].unique():
            for temp in results_df['temperature'].unique():
                subset = results_df[(results_df['model'] == model) & (results_df['temperature'] == temp)]
                if not subset.empty:
                    # Calculate mean response across all traits for each participant
                    trait_means = subset[trait_names].mean(axis=1)
                    plt.hist(trait_means, alpha=0.6, bins=20, 
                            label=f'{model}_temp{temp}', density=True)
    
    plt.xlabel('Mean Trait Response')
    plt.ylabel('Density')
    plt.title('Distribution of Mean Responses by Model')
    plt.legend()
    plt.grid(True, alpha=0.3)
    
    # 2. Response Range Comparison
    plt.subplot(3, 3, 2)
    if not trait_stats_df.empty:
        range_data = []
        labels = []
        for model in trait_stats_df['model'].unique():
            for temp in trait_stats_df['temperature'].unique():
                subset = trait_stats_df[(trait_stats_df['model'] == model) & 
                                      (trait_stats_df['temperature'] == temp)]
                if not subset.empty:
                    ranges = subset['max'] - subset['min']
                    range_data.append(ranges)
                    labels.append(f'{model}\\ntemp{temp}')
        
        if range_data:
            plt.boxplot(range_data, labels=labels)
            plt.ylabel('Response Range (Max - Min)')
            plt.title('Response Range by Model')
            plt.xticks(rotation=45)
    
    # 3. Model Agreement Heatmap
    plt.subplot(3, 3, 3)
    if len(results_df['model'].unique()) > 1:
        # Calculate correlations between models
        correlations = []
        model_pairs = []
        
        models = results_df['model'].unique()
        temps = results_df['temperature'].unique()
        
        for i, (model1, temp1) in enumerate([(m, t) for m in models for t in temps]):
            subset1 = results_df[(results_df['model'] == model1) & (results_df['temperature'] == temp1)]
            if subset1.empty:
                continue
                
            for j, (model2, temp2) in enumerate([(m, t) for m in models for t in temps]):
                if i >= j:
                    continue
                    
                subset2 = results_df[(results_df['model'] == model2) & (results_df['temperature'] == temp2)]
                if subset2.empty:
                    continue
                
                # Calculate correlation across all traits
                corr_values = []
                for trait in trait_names:
                    if trait in subset1.columns and trait in subset2.columns:
                        values1 = subset1[trait].dropna()
                        values2 = subset2[trait].dropna()
                        if len(values1) > 0 and len(values2) > 0:
                            # Align by participant_id
                            merged = pd.merge(subset1[['participant_id', trait]], 
                                            subset2[['participant_id', trait]], 
                                            on='participant_id', suffixes=('_1', '_2'))
                            if len(merged) > 1:
                                corr, _ = pearsonr(merged[f'{trait}_1'], merged[f'{trait}_2'])
                                if not np.isnan(corr):
                                    corr_values.append(corr)
                
                if corr_values:
                    avg_corr = np.mean(corr_values)
                    correlations.append(avg_corr)
                    model_pairs.append(f'{model1}_t{temp1} vs\\n{model2}_t{temp2}')
        
        if correlations:
            y_pos = np.arange(len(model_pairs))
            plt.barh(y_pos, correlations)
            plt.yticks(y_pos, model_pairs)
            plt.xlabel('Average Correlation')
            plt.title('Inter-Model Agreement')
    else:
        plt.text(0.5, 0.5, 'Need multiple models\\nfor comparison', 
                ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Inter-Model Agreement')
    
    # 4. Trait Profile Comparison
    plt.subplot(3, 3, 4)
    if not trait_stats_df.empty and len(trait_names) > 5:
        # Select top 10 most variable traits
        trait_vars = trait_stats_df.groupby('trait')['std'].mean().sort_values(ascending=False).head(10)
        selected_traits = trait_vars.index.tolist()
        
        for model in trait_stats_df['model'].unique():
            for temp in trait_stats_df['temperature'].unique():
                subset = trait_stats_df[(trait_stats_df['model'] == model) & 
                                      (trait_stats_df['temperature'] == temp)]
                trait_means = []
                for trait in selected_traits:
                    trait_data = subset[subset['trait'] == trait]
                    if not trait_data.empty:
                        trait_means.append(trait_data['mean'].iloc[0])
                    else:
                        trait_means.append(np.nan)
                
                plt.plot(range(len(selected_traits)), trait_means, 
                        marker='o', label=f'{model}_temp{temp}')
        
        plt.xticks(range(len(selected_traits)), selected_traits, rotation=45, ha='right')
        plt.ylabel('Mean Rating')
        plt.title('Trait Profiles (Top 10 Variable Traits)')
        plt.legend()
        plt.grid(True, alpha=0.3)
    
    # 5. Response Consistency (Temperature Effect)
    plt.subplot(3, 3, 5)
    if len(results_df['temperature'].unique()) > 1:
        consistency_data = []
        models = []
        
        for model in results_df['model'].unique():
            temp0_data = results_df[(results_df['model'] == model) & (results_df['temperature'] == '0.0')]
            temp1_data = results_df[(results_df['model'] == model) & (results_df['temperature'] == '1.0')]
            
            if not temp0_data.empty and not temp1_data.empty:
                # Calculate consistency across traits
                trait_consistencies = []
                for trait in trait_names:
                    if trait in temp0_data.columns and trait in temp1_data.columns:
                        merged = pd.merge(temp0_data[['participant_id', trait]], 
                                        temp1_data[['participant_id', trait]], 
                                        on='participant_id', suffixes=('_t0', '_t1'))
                        if len(merged) > 1:
                            corr, _ = pearsonr(merged[f'{trait}_t0'], merged[f'{trait}_t1'])
                            if not np.isnan(corr):
                                trait_consistencies.append(corr)
                
                if trait_consistencies:
                    avg_consistency = np.mean(trait_consistencies)
                    consistency_data.append(avg_consistency)
                    models.append(model)
        
        if consistency_data:
            plt.bar(models, consistency_data)
            plt.ylabel('Temperature Consistency (r)')
            plt.title('Model Consistency Across Temperatures')
            plt.ylim(0, 1)
    else:
        plt.text(0.5, 0.5, 'Need multiple temperatures\\nfor comparison', 
                ha='center', va='center', transform=plt.gca().transAxes)
        plt.title('Temperature Consistency')
    
    # 6. Score Distribution by Model
    plt.subplot(3, 3, 6)
    if not results_df.empty:
        all_scores = []
        model_labels = []
        
        for model in results_df['model'].unique():
            for temp in results_df['temperature'].unique():
                subset = results_df[(results_df['model'] == model) & (results_df['temperature'] == temp)]
                if not subset.empty:
                    # Flatten all trait scores
                    scores = subset[trait_names].values.flatten()
                    scores = scores[~np.isnan(scores)]
                    if len(scores) > 0:
                        all_scores.append(scores)
                        model_labels.append(f'{model}_t{temp}')
        
        if all_scores:
            plt.boxplot(all_scores, labels=model_labels)
            plt.ylabel('Response Values')
            plt.title('Score Distributions by Model')
            plt.xticks(rotation=45)
    
    # 7. Trait Reliability Heatmap
    plt.subplot(3, 3, 7)
    if not trait_stats_df.empty and len(trait_names) > 10:
        # Create reliability matrix (coefficient of variation)
        reliability_matrix = []
        model_temp_combos = []
        
        for model in trait_stats_df['model'].unique():
            for temp in trait_stats_df['temperature'].unique():
                subset = trait_stats_df[(trait_stats_df['model'] == model) & 
                                      (trait_stats_df['temperature'] == temp)]
                if not subset.empty:
                    cv_values = []
                    for trait in trait_names[:15]:  # Limit to first 15 traits for readability
                        trait_data = subset[subset['trait'] == trait]
                        if not trait_data.empty and trait_data['mean'].iloc[0] != 0:
                            cv = trait_data['std'].iloc[0] / trait_data['mean'].iloc[0]
                            cv_values.append(cv)
                        else:
                            cv_values.append(np.nan)
                    
                    reliability_matrix.append(cv_values)
                    model_temp_combos.append(f'{model}_t{temp}')
        
        if reliability_matrix:
            reliability_array = np.array(reliability_matrix)
            im = plt.imshow(reliability_array, cmap='viridis', aspect='auto')
            plt.colorbar(im, label='Coefficient of Variation')
            plt.yticks(range(len(model_temp_combos)), model_temp_combos)
            plt.xticks(range(min(15, len(trait_names))), trait_names[:15], rotation=90)
            plt.title('Trait Reliability by Model')
    
    # 8. Model Performance Summary
    plt.subplot(3, 3, 8)
    if not trait_stats_df.empty:
        # Calculate performance metrics
        performance_metrics = []
        model_names = []
        
        for model in trait_stats_df['model'].unique():
            for temp in trait_stats_df['temperature'].unique():
                subset = trait_stats_df[(trait_stats_df['model'] == model) & 
                                      (trait_stats_df['temperature'] == temp)]
                if not subset.empty:
                    # Use response completeness as performance metric
                    completeness = subset['count'].mean() / len(participants_data)
                    avg_range = (subset['max'] - subset['min']).mean()
                    
                    performance_metrics.append([completeness, avg_range])
                    model_names.append(f'{model}_t{temp}')
        
        if performance_metrics:
            metrics_array = np.array(performance_metrics)
            x = np.arange(len(model_names))
            width = 0.35
            
            plt.bar(x - width/2, metrics_array[:, 0], width, label='Completeness', alpha=0.8)
            plt.bar(x + width/2, metrics_array[:, 1]/9, width, label='Range (scaled)', alpha=0.8)  # Scale range to 0-1
            
            plt.xlabel('Model')
            plt.ylabel('Score')
            plt.title('Model Performance Metrics')
            plt.xticks(x, model_names, rotation=45)
            plt.legend()
    
    # 9. Response Time Trend (if applicable)
    plt.subplot(3, 3, 9)
    plt.text(0.5, 0.5, 'Response patterns\\nacross participants', 
            ha='center', va='center', transform=plt.gca().transAxes,
            fontsize=12, bbox=dict(boxstyle='round', facecolor='lightgray'))
    plt.title('Future: Response Patterns')
    
    plt.tight_layout()
    plt.savefig('study_2_results/multi_model_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

# Create the comprehensive visualization
create_comprehensive_plots(results_df, trait_stats_df, summary_df)

In [None]:
## 2. Descriptive Statistics and Model Comparison

def create_descriptive_analysis(results_df, trait_stats_df):
    \"\"\"Create comprehensive descriptive statistics for model comparison.\"\"\"
    
    if results_df.empty:
        print("No results to analyze")
        return
    
    # Overall statistics by model and temperature
    print("=== SIMULATION OVERVIEW ===")
    model_summary = results_df.groupby(['model', 'temperature']).agg({
        'participant_id': 'count'
    }).rename(columns={'participant_id': 'n_participants'})
    print(model_summary)
    
    # Trait-level statistics
    if not trait_stats_df.empty:
        print("\\n=== TRAIT STATISTICS BY MODEL ===")
        
        # Calculate overall means by model-temperature
        overall_means = trait_stats_df.groupby(['model', 'temperature'])['mean'].agg(['mean', 'std']).round(3)
        overall_means.columns = ['avg_trait_mean', 'std_trait_mean']
        print("Average trait ratings:")
        print(overall_means)
        
        # Response range analysis
        print("\\n=== RESPONSE RANGE ANALYSIS ===")
        range_stats = trait_stats_df.copy()
        range_stats['range'] = range_stats['max'] - range_stats['min']
        range_summary = range_stats.groupby(['model', 'temperature'])['range'].agg(['mean', 'std']).round(3)
        range_summary.columns = ['avg_range', 'std_range']
        print("Response range statistics:")
        print(range_summary)
        
        # Most variable traits
        print("\\n=== MOST VARIABLE TRAITS ===")
        trait_variability = trait_stats_df.groupby('trait')['std'].mean().sort_values(ascending=False)
        print("Traits with highest variability across models:")
        print(trait_variability.head(10).round(3))
    
    return model_summary, overall_means if not trait_stats_df.empty else None

# Run descriptive analysis
model_summary, overall_means = create_descriptive_analysis(results_df, trait_stats_df)

In [None]:
## 1. Data Processing and Structure

def process_simulation_results(all_results, participants_data):
    """
    Process simulation results into structured DataFrames for analysis.
    
    Returns:
    - results_df: Long format DataFrame with all responses
    - summary_df: Wide format DataFrame with model comparisons
    - trait_stats: Summary statistics by trait and model
    """
    
    # Mini-Marker trait names in order
    trait_names = [
        'Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep',
        'Disorganized', 'Efficient', 'Energetic', 'Envious', 'Extraverted', 'Fretful', 'Harsh',
        'Imaginative', 'Inefficient', 'Intellectual', 'Jealous', 'Kind', 'Moody', 'Organized',
        'Philosophical', 'Practical', 'Quiet', 'Relaxed', 'Rude', 'Shy', 'Sloppy', 'Sympathetic',
        'Systematic', 'Talkative', 'Temperamental', 'Touchy', 'Uncreative', 'Unenvious',
        'Unintellectual', 'Unsympathetic', 'Warm', 'Withdrawn'
    ]
    
    # Initialize storage
    results_list = []
    summary_data = {'participant_id': range(len(participants_data))}
    
    # Process each model-temperature combination
    for model_temp, results in all_results.items():
        if not isinstance(results, list):
            print(f"Skipping {model_temp}: {results}")
            continue
            
        model_name = model_temp.split('_temp')[0]
        temperature = model_temp.split('_temp')[1]
        
        # Extract responses for each participant
        model_responses = []
        for i, result in enumerate(results):
            if isinstance(result, dict) and 'error' not in result:
                # Convert to standard format and ensure all traits are present
                response_dict = {'participant_id': i, 'model': model_name, 'temperature': temperature}
                
                for trait in trait_names:
                    if trait in result:
                        try:
                            value = float(result[trait])
                            response_dict[trait] = value
                        except (ValueError, TypeError):
                            response_dict[trait] = np.nan
                    else:
                        response_dict[trait] = np.nan
                
                results_list.append(response_dict)
                model_responses.append([response_dict.get(trait, np.nan) for trait in trait_names])
            else:
                # Handle failed responses
                model_responses.append([np.nan] * len(trait_names))
        
        # Add to summary DataFrame
        model_responses_array = np.array(model_responses)
        for j, trait in enumerate(trait_names):
            summary_data[f'{model_name}_temp{temperature}_{trait}'] = model_responses_array[:, j]
    
    # Create DataFrames
    results_df = pd.DataFrame(results_list)
    summary_df = pd.DataFrame(summary_data)
    
    # Calculate trait statistics
    trait_stats = []
    if not results_df.empty:
        for trait in trait_names:
            if trait in results_df.columns:
                trait_data = results_df.groupby(['model', 'temperature'])[trait].agg([
                    'count', 'mean', 'std', 'min', 'max'
                ]).reset_index()
                trait_data['trait'] = trait
                trait_stats.append(trait_data)
    
    trait_stats_df = pd.concat(trait_stats, ignore_index=True) if trait_stats else pd.DataFrame()
    
    return results_df, summary_df, trait_stats_df

# Process the results
print("Processing simulation results...")
results_df, summary_df, trait_stats_df = process_simulation_results(all_results, participants_data)

print(f"Results DataFrame shape: {results_df.shape}")
print(f"Summary DataFrame shape: {summary_df.shape}")
print(f"Trait statistics shape: {trait_stats_df.shape}")

if not results_df.empty:
    print(f"\\nAvailable models: {results_df['model'].unique()}")
    print(f"Available temperatures: {results_df['temperature'].unique()}")
else:
    print("No valid results to process")

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10