# Multi-Model Facet-Level Personality Analysis - Study 3

This notebook analyzes the results from Study 3 facet-level parameter extraction and simulation across multiple LLM models.

## Prerequisites
- Run `study_3_multi_model_simulation.ipynb` first to generate simulation results
- Results should be saved in `study_3_results/` directory

## Analysis Overview
1. **Data Loading**: Load saved simulation results, facet scores, and empirical data
2. **Format Comparison**: Compare expanded vs. Likert format effectiveness
3. **Facet-Level Analysis**: Analyze personality structure at the facet level
4. **Cross-Model Validation**: Assess consistency across different LLMs
5. **Factor Analysis**: Validate personality structure preservation
6. **Empirical Validation**: Test against human personality data
7. **Comprehensive Reporting**: Generate detailed analysis report
8. **Export Results**: Save all analysis outputs for further research

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr, spearmanr
from scipy.cluster.hierarchy import dendrogram, linkage
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA, FactorAnalysis
import json
from pathlib import Path
import warnings
# Keep notebook output readable by hiding library warnings
warnings.filterwarnings(action='ignore')

# Plot appearance: matplotlib defaults, seaborn "husl" palette, large figures
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams.update({
    'figure.figsize': (15, 10),
    'font.size': 10,
})

print("Analysis environment setup complete")

## 1. Data Loading and Preparation

In [None]:
# Locate the directory written by the simulation notebook
results_dir = Path('study_3_results')
if not results_dir.exists():
    raise FileNotFoundError("Results directory not found. Please run study_3_multi_model_simulation.ipynb first.")

expanded_dir = results_dir / 'expanded_format'
likert_dir = results_dir / 'likert_format'
expanded_results = {}
likert_results = {}

# One entry per prompt format:
#   (label used in log lines, folder, glob pattern,
#    position of the model token in the "_"-split file stem, destination dict)
# The two tokens after the model token are read as "temp<int>" and "<frac>".
# NOTE(review): a model name containing "_" would shift these positions.
load_plan = [
    ('expanded', expanded_dir, 'bfi_to_minimarker_*.json', 3, expanded_results),
    ('Likert', likert_dir, 'bfi_to_minimarker_likert_*.json', 4, likert_results),
]

for label, folder, pattern, model_pos, store in load_plan:
    if not folder.exists():
        continue

    for result_path in folder.glob(pattern):
        tokens = result_path.stem.split('_')
        model_token = tokens[model_pos]
        whole_part = tokens[model_pos + 1]
        fraction_part = tokens[model_pos + 2]

        # Rebuild the decimal temperature, e.g. "temp0" + "0" -> "0.0"
        run_key = f"{model_token}_temp{whole_part.replace('temp', '')}.{fraction_part}"

        with open(result_path, 'r') as handle:
            store[run_key] = json.load(handle)

        print(f"Loaded {label} {run_key}: {len(store[run_key])} participants")

print(f"\nTotal expanded result sets: {len(expanded_results)}")
print(f"Total Likert result sets: {len(likert_results)}")

In [None]:
# Load processed data with facet scores
data_with_facets_path = results_dir / 'study3_data_with_facets.csv'
facet_scores_path = results_dir / 'facet_scores.csv'
domain_scores_path = results_dir / 'domain_scores.csv'

# The processed participant table is mandatory: nothing downstream works without it.
if data_with_facets_path.exists():
    data_with_facets = pd.read_csv(data_with_facets_path)
    print(f"Loaded processed data: {data_with_facets.shape}")
else:
    raise FileNotFoundError("Processed data not found. Please run simulation first.")

# Facet and domain scores are optional. Always bind the names so later cells
# can test `.empty` instead of failing with NameError when a file is missing.
if facet_scores_path.exists():
    facet_scores = pd.read_csv(facet_scores_path)
    print(f"Loaded facet scores: {facet_scores.shape}")
    print(f"Facets: {list(facet_scores.columns)}")
else:
    facet_scores = pd.DataFrame()
    print("Facet scores not found - facet-level analysis will be skipped")

if domain_scores_path.exists():
    domain_scores = pd.read_csv(domain_scores_path)
    print(f"Loaded domain scores: {domain_scores.shape}")
    print(f"Domains: {list(domain_scores.columns)}")
else:
    domain_scores = pd.DataFrame()
    print("Domain scores not found - domain-level analysis will be skipped")

# Load experiment summary (optional metadata; an empty dict means "not available")
summary_path = results_dir / 'study3_experiment_summary.json'
if summary_path.exists():
    with open(summary_path, 'r') as f:
        experiment_summary = json.load(f)
    # .get() keeps a partially written summary from aborting the notebook
    print(f"\nExperiment conducted: {experiment_summary.get('timestamp', 'Unknown')}")
    print(f"Models tested: {experiment_summary.get('models_tested', [])}")
    print(f"Total participants: {experiment_summary.get('total_participants', 'Unknown')}")
else:
    experiment_summary = {}
    print("\nExperiment summary not found - report metadata will be limited")

In [None]:
# Human reference data used by the empirical validation step
empirical_data_path = Path('../../raw_data/Soto_data.xlsx')
if not empirical_data_path.exists():
    print("Empirical data not found - some validation analyses will be limited")
    empirical_data = None
    available_tda = []
else:
    empirical_data = pd.read_excel(empirical_data_path, sheet_name='data')
    print(f"Loaded empirical data: {empirical_data.shape}")

    # Candidate item columns tda1..tda40; keep only those present in the sheet
    tda_columns = [f"tda{i}" for i in range(1, 41)]
    available_tda = [name for name in tda_columns if name in empirical_data.columns]
    print(f"Available Mini-Marker traits for validation: {len(available_tda)}")

## 2. Data Processing and Structure

In [None]:
def process_study3_results(results_dict, format_name):
    """
    Turn raw Study 3 simulation output into tidy tables.

    Parameters:
    - results_dict: mapping of "<model>_temp<temperature>" keys to a list of
      per-participant result dicts; non-list values are reported and skipped
    - format_name: label stored in the 'format' column of every row

    Returns:
    - results_df: one row per usable participant response (entries that are
      not dicts or that carry an 'error' key are dropped); a participant's
      id is its position in the original list
    - trait_stats: count/mean/std/min/max of every trait per
      model/temperature/format group
    """

    # Mini-Marker trait names in order
    trait_names = [
        'Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep',
        'Disorganized', 'Efficient', 'Energetic', 'Envious', 'Extraverted', 'Fretful', 'Harsh',
        'Imaginative', 'Inefficient', 'Intellectual', 'Jealous', 'Kind', 'Moody', 'Organized',
        'Philosophical', 'Practical', 'Quiet', 'Relaxed', 'Rude', 'Shy', 'Sloppy', 'Sympathetic',
        'Systematic', 'Talkative', 'Temperamental', 'Touchy', 'Uncreative', 'Unenvious',
        'Unintellectual', 'Unsympathetic', 'Warm', 'Withdrawn'
    ]

    def _as_float(raw):
        # Anything that cannot be read as a number counts as missing
        try:
            return float(raw)
        except (ValueError, TypeError):
            return np.nan

    rows = []
    for model_temp, results in results_dict.items():
        if not isinstance(results, list):
            print(f"Skipping {model_temp}: {results}")
            continue

        key_parts = model_temp.split('_temp')
        model_name = key_parts[0]
        temperature = key_parts[1]

        for position, entry in enumerate(results):
            if not isinstance(entry, dict) or 'error' in entry:
                continue

            row = {
                'participant_id': position,
                'model': model_name,
                'temperature': temperature,
                'format': format_name,
            }
            row.update({
                trait: _as_float(entry[trait]) if trait in entry else np.nan
                for trait in trait_names
            })
            rows.append(row)

    results_df = pd.DataFrame(rows)

    # Per-trait descriptive statistics for every model/temperature/format group
    per_trait = []
    if not results_df.empty:
        grouped = results_df.groupby(['model', 'temperature', 'format'])
        for trait in trait_names:
            if trait not in results_df.columns:
                continue
            summary = grouped[trait].agg(['count', 'mean', 'std', 'min', 'max']).reset_index()
            summary['trait'] = trait
            per_trait.append(summary)

    if per_trait:
        trait_stats_df = pd.concat(per_trait, ignore_index=True)
    else:
        trait_stats_df = pd.DataFrame()

    return results_df, trait_stats_df

# Build the tidy tables for each prompt format
print("Processing expanded format results...")
expanded_df, expanded_stats = process_study3_results(expanded_results, 'expanded')

print("Processing Likert format results...")
likert_df, likert_stats = process_study3_results(likert_results, 'likert')

# Stack the two formats: responses first, then the per-trait statistics
combined_results_df, combined_stats_df = (
    pd.concat(pair, ignore_index=True)
    for pair in ([expanded_df, likert_df], [expanded_stats, likert_stats])
)

print(f"\nExpanded format results: {expanded_df.shape}")
print(f"Likert format results: {likert_df.shape}")
print(f"Combined results: {combined_results_df.shape}")
print(f"Combined statistics: {combined_stats_df.shape}")

# Only list the available factor levels when at least one response was loaded
if not combined_results_df.empty:
    print(f"\nAvailable models: {combined_results_df['model'].unique()}")
    print(f"Available temperatures: {combined_results_df['temperature'].unique()}")
    print(f"Available formats: {combined_results_df['format'].unique()}")

## 3. Format Comparison Analysis

In [None]:
def analyze_format_comparison(combined_results_df, combined_stats_df):
    """
    Contrast the expanded and Likert prompt formats.

    Prints response counts, the average number of answered traits and the
    mean per-trait standard deviation for each format, then saves and shows
    a two-panel boxplot figure.

    Returns the per-run quality table (one row per format/model/temperature
    combination that has data), or None when there are no results at all.
    """
    if combined_results_df.empty:
        print("No results to compare")
        return None

    print("=== FORMAT COMPARISON ANALYSIS ===")

    # 1. Number of responses in every format/model/temperature cell
    print("\n1. Response Completion Rates:")
    response_counts = combined_results_df.groupby(['format', 'model', 'temperature']).agg(
        {'participant_id': 'count'}
    )
    response_counts = response_counts.rename(columns={'participant_id': 'n_responses'})
    print(response_counts)

    # 2. Share of traits that actually received a numeric answer
    print("\n2. Response Quality (Average Non-Missing Traits):")
    id_columns = ('participant_id', 'model', 'temperature', 'format')
    trait_columns = [name for name in combined_results_df.columns if name not in id_columns]

    quality_rows = []
    for format_name in combined_results_df['format'].unique():
        in_format = combined_results_df[combined_results_df['format'] == format_name]
        model_values = in_format['model'].unique()
        temp_values = in_format['temperature'].unique()

        for model, temp in ((m, t) for m in model_values for t in temp_values):
            cell = in_format[(in_format['model'] == model) & (in_format['temperature'] == temp)]
            if cell.empty:
                continue

            # Mean count of answered traits per participant in this cell
            answered = cell[trait_columns].notna().sum(axis=1).mean()
            quality_rows.append({
                'format': format_name,
                'model': model,
                'temperature': temp,
                'avg_non_missing_traits': answered,
                'completion_percentage': (answered / len(trait_columns)) * 100
            })

    quality_df = pd.DataFrame(quality_rows)
    has_quality = not quality_df.empty
    has_stats = not combined_stats_df.empty

    if has_quality:
        by_format = quality_df.groupby('format')[['avg_non_missing_traits', 'completion_percentage']].mean()
        print(by_format.round(2))

    # 3. Spread of the responses
    print("\n3. Response Variability by Format:")
    if has_stats:
        print(combined_stats_df.groupby('format')['std'].mean().round(3))

    # 4. Side-by-side boxplots: completion on the left, variability on the right
    if has_quality:
        fig, (completion_ax, spread_ax) = plt.subplots(1, 2, figsize=(15, 6))

        sns.boxplot(data=quality_df, x='format', y='completion_percentage', ax=completion_ax)
        completion_ax.set_title('Response Completion by Format')
        completion_ax.set_ylabel('Completion Percentage')

        if has_stats:
            sns.boxplot(data=combined_stats_df, x='format', y='std', ax=spread_ax)
            spread_ax.set_title('Response Variability by Format')
            spread_ax.set_ylabel('Standard Deviation')

        plt.tight_layout()
        plt.savefig('study_3_results/format_comparison.png', dpi=300, bbox_inches='tight')
        plt.show()

    return quality_df

# Run format comparison; format_quality holds the per-run quality table,
# or None when there were no results to compare
format_quality = analyze_format_comparison(combined_results_df, combined_stats_df)

## 4. Facet-Level Analysis

In [None]:
def analyze_facet_level_performance(combined_results_df, facet_scores):
    """
    Analyze personality structure at the facet level.

    For every model/temperature/format run, each empirical facet score is
    correlated (Pearson) with the LLM ratings of the Mini-Marker traits
    mapped to that facet; the facet's value is the mean of those trait
    correlations. Results are printed and drawn as a heatmap that is saved
    to study_3_results/facet_correlations_heatmap.png.

    Parameters:
    - combined_results_df: LLM responses with 'participant_id', 'model',
      'temperature', 'format' and one column per trait
    - facet_scores: empirical facet scores; only columns named in the
      facet-to-trait mapping below are used

    Returns:
    - DataFrame of correlations (rows: "<model>_<temperature>_<format>",
      columns: facets), or None when there is no data or no correlation
      could be computed
    """

    if combined_results_df.empty or facet_scores.empty:
        print("Insufficient data for facet-level analysis")
        return None

    print("=== FACET-LEVEL ANALYSIS ===")

    # Facet -> related Mini-Marker traits
    facet_structure = {
        'Sociability': ['Talkative', 'Extraverted', 'Bold'],
        'Assertiveness': ['Energetic', 'Efficient'],
        'Compassion': ['Sympathetic', 'Warm', 'Kind'],
        'Respectfulness': ['Cooperative'],
        'Organization': ['Organized', 'Systematic'],
        'Productiveness': ['Efficient'],
        'Anxiety': ['Moody', 'Temperamental', 'Touchy'],
        'Depression': ['Withdrawn', 'Quiet'],
        'Intellectual_Curiosity': ['Intellectual', 'Philosophical', 'Complex'],
        'Aesthetic_Sensitivity': ['Creative', 'Imaginative']
    }

    n_empirical = len(facet_scores)
    facet_correlations = {}

    for model in combined_results_df['model'].unique():
        for temp in combined_results_df['temperature'].unique():
            for format_type in combined_results_df['format'].unique():
                subset = combined_results_df[
                    (combined_results_df['model'] == model) &
                    (combined_results_df['temperature'] == temp) &
                    (combined_results_df['format'] == format_type)
                ]

                if subset.empty:
                    continue

                key = f"{model}_{temp}_{format_type}"

                # Pair every LLM row with its empirical row through participant_id,
                # used as a row position in facet_scores. Positional slicing would
                # shift all later pairs as soon as one participant is missing, and
                # pandas index labels of the two tables are unrelated.
                # NOTE(review): assumes facet_scores row i belongs to participant i -
                # confirm against the simulation notebook.
                participant_ids = subset['participant_id'].to_numpy(dtype=int)
                in_range = (participant_ids >= 0) & (participant_ids < n_empirical)
                matched_ids = participant_ids[in_range]

                facet_corr = {}
                for facet_name in facet_scores.columns:
                    related_traits = facet_structure.get(facet_name)
                    if related_traits is None:
                        continue

                    empirical_vals = facet_scores[facet_name].to_numpy(dtype=float)[matched_ids]
                    trait_correlations = []

                    for trait in related_traits:
                        if trait not in subset.columns:
                            continue

                        llm_vals = subset[trait].to_numpy(dtype=float)[in_range]

                        # Keep only pairs where both sides are present
                        valid = ~(np.isnan(empirical_vals) | np.isnan(llm_vals))
                        if valid.sum() > 5:  # need more than 5 complete pairs
                            corr, _ = pearsonr(empirical_vals[valid], llm_vals[valid])
                            # pearsonr yields NaN for constant input; skip those
                            if not np.isnan(corr):
                                trait_correlations.append(corr)

                    if trait_correlations:
                        facet_corr[facet_name] = np.mean(trait_correlations)

                if facet_corr:
                    facet_correlations[key] = facet_corr

    if not facet_correlations:
        return None

    # Display the strongest facets for the first three runs
    print("\nTop Facet-Level Correlations:")
    for key, corrs in list(facet_correlations.items())[:3]:
        print(f"\n{key}:")
        sorted_corrs = sorted(corrs.items(), key=lambda x: abs(x[1]), reverse=True)
        for facet, corr in sorted_corrs[:5]:
            print(f"  {facet}: {corr:.3f}")

    # Convert to a run-by-facet table for the heatmap
    all_facets = set()
    for corrs in facet_correlations.values():
        all_facets.update(corrs.keys())

    corr_matrix = pd.DataFrame(index=list(facet_correlations.keys()),
                               columns=sorted(all_facets))

    for model_key, corrs in facet_correlations.items():
        for facet, corr in corrs.items():
            corr_matrix.loc[model_key, facet] = corr

    # Plot heatmap
    plt.figure(figsize=(12, 8))
    sns.heatmap(corr_matrix.astype(float),
                annot=True, cmap='RdBu_r', center=0,
                fmt='.2f', cbar_kws={'label': 'Correlation'})
    plt.title('Facet-Level Correlations Across Models and Formats')
    plt.ylabel('Model_Temperature_Format')
    plt.xlabel('BFI-2 Facets')
    plt.xticks(rotation=45, ha='right')
    plt.yticks(rotation=0)
    plt.tight_layout()
    plt.savefig('study_3_results/facet_correlations_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

    return corr_matrix

# Run facet-level analysis; the result is the run-by-facet correlation table,
# or None when the inputs are empty or nothing could be correlated
facet_correlation_matrix = analyze_facet_level_performance(combined_results_df, facet_scores)

## 5. Cross-Model Validation

In [None]:
def _paired_trait_correlations(first, second, trait_names):
    """Return per-trait Pearson correlations between two runs matched on participant_id."""
    correlations = []
    for trait in trait_names:
        if trait not in first.columns or trait not in second.columns:
            continue

        merged = pd.merge(first[['participant_id', trait]],
                          second[['participant_id', trait]],
                          on='participant_id', suffixes=('_1', '_2'))

        clean_data = merged.dropna()
        if len(clean_data) > 3:  # need more than 3 complete pairs
            corr, _ = pearsonr(clean_data[f'{trait}_1'], clean_data[f'{trait}_2'])
            # pearsonr yields NaN for constant input; skip those
            if not np.isnan(corr):
                correlations.append(corr)
    return correlations


def analyze_cross_model_consistency(combined_results_df):
    """
    Assess consistency across different LLMs.

    Within each format, every pair of model/temperature runs is compared by
    the mean per-trait Pearson correlation of their responses (matched on
    participant_id). Separately, each model's temperature '0.0' run is
    compared with its '1.0' run. A two-panel bar chart is saved to
    study_3_results/cross_model_consistency.png when inter-model results exist.

    Returns:
    - format_correlations: {format: {'correlations', 'pairs',
      'mean_correlation', 'std_correlation'}}
    - temp_consistency: {"<format>_<model>": {'mean_consistency',
      'std_consistency', 'n_traits'}}
    Both dicts are empty when there is nothing to compare, so the result can
    always be unpacked into two names.
    """

    if combined_results_df.empty:
        print("No results for cross-model analysis")
        return {}, {}

    print("=== CROSS-MODEL CONSISTENCY ANALYSIS ===")

    trait_names = [col for col in combined_results_df.columns
                   if col not in ['participant_id', 'model', 'temperature', 'format']]

    # 1. Inter-model correlations by format
    print("\n1. Inter-Model Correlations by Format:")

    format_correlations = {}

    for format_type in combined_results_df['format'].unique():
        format_data = combined_results_df[combined_results_df['format'] == format_type]

        # Build every non-empty model/temperature run once
        runs = []
        for model in format_data['model'].unique():
            for temp in format_data['temperature'].unique():
                rows = format_data[(format_data['model'] == model) &
                                   (format_data['temperature'] == temp)]
                if not rows.empty:
                    runs.append((model, temp, rows))

        correlations = []
        model_pairs = []

        # Each unordered pair of runs is visited exactly once
        for i, (model1, temp1, subset1) in enumerate(runs):
            for model2, temp2, subset2 in runs[i + 1:]:
                trait_correlations = _paired_trait_correlations(subset1, subset2, trait_names)
                if trait_correlations:
                    correlations.append(np.mean(trait_correlations))
                    model_pairs.append(f'{model1}_t{temp1} vs {model2}_t{temp2}')

        if correlations:
            format_correlations[format_type] = {
                'correlations': correlations,
                'pairs': model_pairs,
                'mean_correlation': np.mean(correlations),
                'std_correlation': np.std(correlations)
            }

            print(f"\n{format_type.upper()} Format:")
            print(f"  Mean inter-model correlation: {np.mean(correlations):.3f}")
            print(f"  Std inter-model correlation: {np.std(correlations):.3f}")
            print(f"  Range: {np.min(correlations):.3f} to {np.max(correlations):.3f}")

    # 2. Temperature consistency within models
    print("\n2. Temperature Consistency Within Models:")

    temp_consistency = {}

    for format_type in combined_results_df['format'].unique():
        format_data = combined_results_df[combined_results_df['format'] == format_type]

        for model in format_data['model'].unique():
            model_data = format_data[format_data['model'] == model]
            # Temperatures are stored as strings; only the '0.0' and '1.0' runs are compared
            temp0_data = model_data[model_data['temperature'] == '0.0']
            temp1_data = model_data[model_data['temperature'] == '1.0']

            if temp0_data.empty or temp1_data.empty:
                continue

            trait_consistencies = _paired_trait_correlations(temp0_data, temp1_data, trait_names)
            if trait_consistencies:
                temp_consistency[f"{format_type}_{model}"] = {
                    'mean_consistency': np.mean(trait_consistencies),
                    'std_consistency': np.std(trait_consistencies),
                    'n_traits': len(trait_consistencies)
                }

                print(f"  {format_type}_{model}: {np.mean(trait_consistencies):.3f} "
                      f"(±{np.std(trait_consistencies):.3f})")

    # 3. Visualization
    if format_correlations:
        fig, axes = plt.subplots(1, 2, figsize=(15, 6))

        # Inter-model correlations
        format_names = list(format_correlations.keys())
        mean_corrs = [format_correlations[f]['mean_correlation'] for f in format_names]
        std_corrs = [format_correlations[f]['std_correlation'] for f in format_names]

        axes[0].bar(format_names, mean_corrs, yerr=std_corrs, capsize=5)
        axes[0].set_title('Inter-Model Agreement by Format')
        axes[0].set_ylabel('Mean Correlation')
        axes[0].set_ylim(0, 1)

        # Temperature consistency
        if temp_consistency:
            models = list(temp_consistency.keys())
            consistencies = [temp_consistency[m]['mean_consistency'] for m in models]

            axes[1].bar(range(len(models)), consistencies)
            axes[1].set_title('Temperature Consistency')
            axes[1].set_ylabel('Mean Correlation')
            axes[1].set_xticks(range(len(models)))
            axes[1].set_xticklabels(models, rotation=45, ha='right')
            axes[1].set_ylim(0, 1)

        plt.tight_layout()
        plt.savefig('study_3_results/cross_model_consistency.png', dpi=300, bbox_inches='tight')
        plt.show()

    return format_correlations, temp_consistency

# Run cross-model analysis; the call is unpacked into the inter-model
# correlations per format and the temperature consistency per format/model
format_consistency, temp_consistency = analyze_cross_model_consistency(combined_results_df)

## 6. Empirical Validation

In [None]:
def validate_against_empirical_mini_marker(combined_results_df, empirical_data, available_tda):
    """
    Compare LLM responses with empirical human Mini-Marker data.

    For every format/model/temperature run, each trait's LLM ratings are
    correlated (Pearson) with the matching empirical item column. Summary
    tables are printed and a two-panel boxplot is saved to
    study_3_results/empirical_validation.png.

    Parameters:
    - combined_results_df: LLM responses with 'participant_id', 'model',
      'temperature', 'format' and one column per trait
    - empirical_data: DataFrame of human data, or None when unavailable
    - available_tda: empirical item column names to validate against

    Returns:
    - (validation_results, validation_df): per-run details keyed by
      "<format>_<model>_temp<temperature>", and the summary table with
      'index', 'mean_corr', 'median_corr', 'std_corr', 'n_traits',
      'format' and 'model' columns
    - (None, None) when inputs are missing or no correlation could be
      computed, so the result can always be unpacked into two names

    NOTE(review): participant_id is used as a row position in the empirical
    table *after* rows with missing items were dropped; if any row was
    dropped, later ids point at a different person than in the raw sheet.
    Confirm this matches how the simulation numbered participants.
    """

    if combined_results_df.empty or empirical_data is None or not available_tda:
        print("Insufficient data for empirical validation")
        return None, None

    print("=== EMPIRICAL VALIDATION ANALYSIS ===")
    print(f"Available empirical Mini-Marker traits: {len(available_tda)}")

    # Mini-Marker trait names
    trait_names = [
        'Bashful', 'Bold', 'Careless', 'Cold', 'Complex', 'Cooperative', 'Creative', 'Deep',
        'Disorganized', 'Efficient', 'Energetic', 'Envious', 'Extraverted', 'Fretful', 'Harsh',
        'Imaginative', 'Inefficient', 'Intellectual', 'Jealous', 'Kind', 'Moody', 'Organized',
        'Philosophical', 'Practical', 'Quiet', 'Relaxed', 'Rude', 'Shy', 'Sloppy', 'Sympathetic',
        'Systematic', 'Talkative', 'Temperamental', 'Touchy', 'Uncreative', 'Unenvious',
        'Unintellectual', 'Unsympathetic', 'Warm', 'Withdrawn'
    ]

    def _item_number(column):
        # "tda7" -> 7; None for any other naming scheme
        name = str(column)
        digits = name[3:]
        if name[:3].lower() == 'tda' and digits.isdigit():
            return int(digits)
        return None

    # Pair each trait with its empirical column. With a complete item set the
    # k-th trait goes with "tda<k>"; pairing by item number keeps that true when
    # some columns are missing, whereas pairing by list position would mis-pair
    # every trait after the first gap. Other column names fall back to position.
    # NOTE(review): the trait order behind tda1..tda40 is taken from this list -
    # confirm against the data codebook.
    item_numbers = [_item_number(column) for column in available_tda]
    if all(number is not None and 1 <= number <= len(trait_names) for number in item_numbers):
        trait_columns = {trait_names[number - 1]: column
                         for number, column in zip(item_numbers, available_tda)}
    else:
        trait_columns = dict(zip(trait_names, available_tda))

    # Clean empirical data
    empirical_clean = empirical_data.dropna(subset=available_tda)
    n_empirical = len(empirical_clean)

    validation_results = {}
    run_labels = {}  # key -> (format, model); avoids re-parsing keys that contain "_"

    # For each model-format-temperature combination
    for format_type in combined_results_df['format'].unique():
        for model in combined_results_df['model'].unique():
            for temp in combined_results_df['temperature'].unique():
                subset = combined_results_df[
                    (combined_results_df['format'] == format_type) &
                    (combined_results_df['model'] == model) &
                    (combined_results_df['temperature'] == temp)
                ]

                if subset.empty:
                    continue

                key = f'{format_type}_{model}_temp{temp}'
                trait_correlations = []
                trait_details = {}

                # Compare each trait
                for trait in trait_names:
                    column = trait_columns.get(trait)
                    if column is None or trait not in subset.columns:
                        continue

                    # LLM responses that actually carry a value for this trait
                    llm_responses = subset[['participant_id', trait]].dropna()
                    ids = llm_responses['participant_id'].to_numpy(dtype=int)
                    in_range = ids < n_empirical

                    # Look up the empirical value of each participant by row position
                    empirical_values = empirical_clean[column].to_numpy()[ids[in_range]]
                    llm_values = llm_responses[trait].to_numpy(dtype=float)[in_range]

                    present = pd.notna(empirical_values)
                    empirical_values = empirical_values[present].astype(float)
                    llm_values = llm_values[present]

                    # Calculate correlation if we have enough data
                    if len(empirical_values) > 5:
                        corr, p_value = pearsonr(empirical_values, llm_values)

                        if not np.isnan(corr):
                            trait_correlations.append(corr)
                            trait_details[trait] = {
                                'correlation': corr,
                                'p_value': p_value,
                                'n_participants': len(empirical_values),
                                'empirical_mean': np.mean(empirical_values),
                                'llm_mean': np.mean(llm_values),
                                'empirical_std': np.std(empirical_values),
                                'llm_std': np.std(llm_values)
                            }

                # Store validation results
                if trait_correlations:
                    validation_results[key] = {
                        'mean_correlation': np.mean(trait_correlations),
                        'median_correlation': np.median(trait_correlations),
                        'std_correlation': np.std(trait_correlations),
                        'n_traits': len(trait_correlations),
                        'trait_details': trait_details
                    }
                    run_labels[key] = (format_type, model)

    if not validation_results:
        return None, None

    # Display results
    print("\nEmpirical Validation Results:")

    validation_df = pd.DataFrame({k: {
        'mean_corr': v['mean_correlation'],
        'median_corr': v['median_correlation'],
        'std_corr': v['std_correlation'],
        'n_traits': v['n_traits']
    } for k, v in validation_results.items()}).T

    print(validation_df.round(3))

    # Find best performing combinations
    best_overall = validation_df['mean_corr'].idxmax()
    print(f"\nBest overall performance: {best_overall} (r = {validation_df.loc[best_overall, 'mean_corr']:.3f})")

    # Compare formats
    format_performance = validation_df.reset_index()
    format_performance['format'] = format_performance['index'].map(lambda k: run_labels[k][0])
    format_comparison = format_performance.groupby('format')['mean_corr'].agg(['mean', 'std'])
    print("\nFormat Comparison:")
    print(format_comparison.round(3))

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))

    # Performance by model-format
    validation_df.reset_index(inplace=True)
    validation_df['format'] = validation_df['index'].map(lambda k: run_labels[k][0])
    validation_df['model'] = validation_df['index'].map(lambda k: run_labels[k][1])

    sns.boxplot(data=validation_df, x='format', y='mean_corr', ax=axes[0])
    axes[0].set_title('Empirical Validation by Format')
    axes[0].set_ylabel('Mean Correlation with Human Data')

    sns.boxplot(data=validation_df, x='model', y='mean_corr', ax=axes[1])
    axes[1].set_title('Empirical Validation by Model')
    axes[1].set_ylabel('Mean Correlation with Human Data')
    axes[1].tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.savefig('study_3_results/empirical_validation.png', dpi=300, bbox_inches='tight')
    plt.show()

    return validation_results, validation_df

# Run empirical validation; the call is unpacked into the per-run detail
# dict and the summary table
validation_results, validation_summary = validate_against_empirical_mini_marker(
    combined_results_df, empirical_data, available_tda
)

## 7. Comprehensive Report Generation

In [None]:
def generate_study3_comprehensive_report(
    combined_results_df, combined_stats_df, format_quality,
    facet_correlation_matrix, format_consistency, temp_consistency,
    validation_results, experiment_summary
):
    """Build the Study 3 analysis report as a single Markdown string.

    Every data-driven section is emitted only when its input is present and
    non-empty, so the function can be called with ``None``/empty inputs for
    analyses that were skipped upstream.

    Args:
        combined_results_df: DataFrame of simulation results (or ``None``).
            Only the ``model``, ``temperature`` and ``format`` columns are
            read, to count distinct combinations.
        combined_stats_df: Accepted for interface compatibility; not read.
        format_quality: DataFrame with ``format`` and
            ``completion_percentage`` columns (or ``None``).
        facet_correlation_matrix: DataFrame whose columns are facets; the
            per-column mean is reported (or ``None``).
        format_consistency: Mapping of format name to a dict holding
            ``mean_correlation`` and ``std_correlation``.
        temp_consistency: Mapping of label to a dict holding
            ``mean_consistency``.
        validation_results: Mapping of combination key to a dict holding
            ``mean_correlation``. Keys starting with ``expanded`` / ``likert``
            are grouped for the format comparison.
        experiment_summary: Optional dict; ``total_participants``,
            ``models_tested`` and ``temperatures`` are read when present.

    Returns:
        str: The report lines joined with newlines.
    """

    report = []
    report.append("# Study 3: Multi-Model Facet-Level Personality Analysis Report")
    report.append(f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}")
    report.append("\n" + "="*80)

    # Executive Summary
    report.append("\n## Executive Summary")
    if experiment_summary:
        report.append("- **Study Focus**: Facet-level personality parameter extraction and validation")
        report.append(f"- **Participants Analyzed**: {experiment_summary.get('total_participants', 'Unknown')}")
        report.append(f"- **Models Tested**: {experiment_summary.get('models_tested', [])}")
        report.append("- **Formats Compared**: Expanded BFI-2 descriptions vs. Likert-style descriptions")
        report.append(f"- **Temperature Settings**: {experiment_summary.get('temperatures', [])}")

    # Guard against None as well as empty, matching how the other inputs are checked.
    if combined_results_df is not None and not combined_results_df.empty:
        total_combinations = len(combined_results_df.groupby(['model', 'temperature', 'format']))
        report.append(f"- **Total Model-Format Combinations**: {total_combinations}")

    # Format Comparison Results
    report.append("\n## Format Comparison: Expanded vs. Likert")

    has_format_quality = format_quality is not None and not format_quality.empty
    if has_format_quality:
        format_summary = format_quality.groupby('format')[['completion_percentage']].mean()
        report.append("\n### Response Quality by Format:")
        for format_name, row in format_summary.iterrows():
            report.append(f"- **{format_name.title()} Format**: {row['completion_percentage']:.1f}% average completion")

        best_format = format_summary['completion_percentage'].idxmax()
        report.append(f"\n**Best Performing Format**: {best_format.title()} (higher completion rate)")

    # Facet-Level Insights
    report.append("\n## Facet-Level Analysis")

    if facet_correlation_matrix is not None and not facet_correlation_matrix.empty:
        # Drop facets without a usable mean BEFORE ranking: sort_values places
        # NaN last, so without this the tail() below would select NaN rows and
        # the "lowest" list would silently lose entries.
        mean_facet_corrs = (
            facet_correlation_matrix.mean(axis=0, skipna=True)
            .dropna()
            .sort_values(ascending=False)
        )

        report.append("\n### Most Reliably Captured Facets:")
        for facet, corr in mean_facet_corrs.head(5).items():
            report.append(f"- **{facet}**: Average correlation = {corr:.3f}")

        # NOTE(review): this list is the facets with the LOWEST mean correlation,
        # not a variance measure; the heading text is kept unchanged for output
        # compatibility.
        report.append("\n### Most Variable Facets:")
        for facet, corr in mean_facet_corrs.tail(3).items():
            report.append(f"- **{facet}**: Average correlation = {corr:.3f}")

    # Cross-Model Consistency
    report.append("\n## Cross-Model Consistency")

    if format_consistency:
        report.append("\n### Inter-Model Agreement by Format:")
        for format_type, agreement in format_consistency.items():
            mean_corr = agreement['mean_correlation']
            std_corr = agreement['std_correlation']
            report.append(f"- **{format_type.title()} Format**: r = {mean_corr:.3f} (±{std_corr:.3f})")

    if temp_consistency:
        report.append("\n### Temperature Consistency:")
        for model_format, consistency in temp_consistency.items():
            mean_cons = consistency['mean_consistency']
            report.append(f"- **{model_format}**: r = {mean_cons:.3f}")

    # Empirical Validation
    if validation_results:
        report.append("\n## Empirical Validation Results")

        # Rank only combinations with a defined score: max() over values that
        # include NaN depends on iteration order and can crown a NaN entry.
        scored = {
            combination: result['mean_correlation']
            for combination, result in validation_results.items()
            if not pd.isna(result['mean_correlation'])
        }
        if scored:
            best_combination = max(scored, key=scored.get)
            report.append(
                f"\n**Best Empirical Match**: {best_combination} (r = {scored[best_combination]:.3f})"
            )

        # Format comparison (grouped by key prefix)
        expanded_scores = [
            result['mean_correlation']
            for combination, result in validation_results.items()
            if combination.startswith('expanded')
        ]
        likert_scores = [
            result['mean_correlation']
            for combination, result in validation_results.items()
            if combination.startswith('likert')
        ]

        if expanded_scores and likert_scores:
            expanded_mean = np.mean(expanded_scores)
            likert_mean = np.mean(likert_scores)

            report.append("\n**Format Performance Comparison**:")
            report.append(f"- Expanded Format: r = {expanded_mean:.3f}")
            report.append(f"- Likert Format: r = {likert_mean:.3f}")

            if expanded_mean > likert_mean:
                report.append(f"- **Winner**: Expanded format (+{expanded_mean - likert_mean:.3f})")
            elif likert_mean > expanded_mean:
                report.append(f"- **Winner**: Likert format (+{likert_mean - expanded_mean:.3f})")
            else:
                # Reached on an exact tie or when either mean is NaN; do not
                # declare a winner in that case.
                report.append("- **Winner**: None (formats tied or correlation unavailable)")

    # Key Findings
    report.append("\n## Key Findings")

    findings = []

    # Format effectiveness (only meaningful when more than one format is present)
    if has_format_quality:
        format_means = format_quality.groupby('format')['completion_percentage'].mean()
        if len(format_means) > 1:
            best_format = format_means.idxmax()
            findings.append(f"**Format Effectiveness**: {best_format.title()} format shows superior response completion")

    # Model consistency
    if format_consistency:
        all_corrs = [agreement['mean_correlation'] for agreement in format_consistency.values()]
        if all_corrs:
            overall_consistency = np.mean(all_corrs)
            findings.append(f"**Model Consistency**: Average inter-model agreement of r = {overall_consistency:.3f}")

    # Empirical validation
    if validation_results:
        all_emp_corrs = [result['mean_correlation'] for result in validation_results.values()]
        if all_emp_corrs:
            overall_empirical = np.mean(all_emp_corrs)
            findings.append(f"**Empirical Validity**: Average correlation with human data of r = {overall_empirical:.3f}")

    for finding in findings:
        report.append(f"\n- {finding}")

    # Recommendations (static text, not derived from the inputs)
    report.append("\n## Recommendations")

    recommendations = [
        "**Personality Assessment**: Use expanded format descriptions for more comprehensive personality capture",
        "**Model Selection**: Consider ensemble approaches combining multiple models for enhanced reliability",
        "**Temperature Settings**: Use temperature = 0.0 for consistent personality assessment applications",
        "**Facet-Level Analysis**: Focus on consistently captured facets for reliable personality insights",
        "**Future Research**: Investigate cultural and demographic factors in personality simulation"
    ]

    for rec in recommendations:
        report.append(f"\n- {rec}")

    # Technical Notes (static text)
    report.append("\n## Technical Notes")
    technical_notes = [
        "Facet-level analysis based on BFI-2 10-facet structure",
        "Correlations computed using Pearson's correlation coefficient",
        "Statistical significance testing performed where applicable",
        "Missing data handled through pairwise deletion",
        "Empirical validation against Mini-Marker 40-item assessment"
    ]

    for note in technical_notes:
        report.append(f"- {note}")

    return "\n".join(report)

# Generate comprehensive report
study3_report = generate_study3_comprehensive_report(
    combined_results_df, combined_stats_df, format_quality,
    facet_correlation_matrix, format_consistency, temp_consistency,
    validation_results, experiment_summary
)

print(study3_report)

# Save report to file.
# The encoding is pinned to UTF-8 because the report can contain non-ASCII
# characters (for example the "±" used in the consistency section); relying on
# the platform default encoding makes the write fail or garble on some systems.
with open('study_3_results/comprehensive_analysis_report.md', 'w', encoding='utf-8') as f:
    f.write(study3_report)

print("\n" + "="*80)
print("STUDY 3 ANALYSIS COMPLETE")
print("Report saved to: study_3_results/comprehensive_analysis_report.md")
print("="*80)

## 8. Export Results for Further Analysis

In [None]:
def export_study3_results(
    combined_results_df, combined_stats_df, format_quality,
    facet_correlation_matrix, validation_results
):
    """Export all Study 3 analysis results in formats suitable for further research.

    Writes CSV/JSON files into ``study_3_results/`` (created if missing) and
    prints a progress line per artefact. Any input that is ``None`` or empty is
    skipped rather than raising.

    Args:
        combined_results_df: DataFrame with ``participant_id``, ``model``,
            ``temperature`` and ``format`` columns; every other column is
            treated as a trait/response column.
        combined_stats_df: DataFrame of trait statistics, written as-is.
        format_quality: DataFrame of format comparison metrics, written as-is.
        facet_correlation_matrix: DataFrame written with its index.
        validation_results: Mapping of ``format_model_temperature`` keys to
            dicts holding ``mean_correlation``, ``median_correlation``,
            ``std_correlation``, ``n_traits`` and a ``trait_details`` mapping
            (one output row is produced per trait).

    Returns:
        None. All output is written to disk / stdout.
    """

    output_dir = Path('study_3_results')
    output_dir.mkdir(parents=True, exist_ok=True)

    print("Exporting Study 3 analysis results...")

    # Identifier columns; everything else in the results frame is a trait column.
    id_columns = ['participant_id', 'model', 'temperature', 'format']
    # Evaluate availability once, treating None like empty (as the other inputs are).
    has_results = combined_results_df is not None and not combined_results_df.empty
    has_stats = combined_stats_df is not None and not combined_stats_df.empty
    trait_columns = (
        [col for col in combined_results_df.columns if col not in id_columns]
        if has_results else []
    )

    # 1. Export main results DataFrames
    if has_results:
        combined_results_df.to_csv(output_dir / 'results_combined_format.csv', index=False)
        print(f"✓ Combined results: {len(combined_results_df)} rows")

        # Separate by format
        for format_type in combined_results_df['format'].unique():
            format_data = combined_results_df[combined_results_df['format'] == format_type]
            format_data.to_csv(output_dir / f'results_{format_type}_format.csv', index=False)
            print(f"✓ {format_type.title()} format results: {len(format_data)} rows")

    if has_stats:
        combined_stats_df.to_csv(output_dir / 'trait_statistics_combined.csv', index=False)
        print(f"✓ Combined trait statistics: {len(combined_stats_df)} rows")

    # 2. Export format comparison results
    if format_quality is not None and not format_quality.empty:
        format_quality.to_csv(output_dir / 'format_comparison_metrics.csv', index=False)
        print(f"✓ Format comparison metrics: {len(format_quality)} rows")

    # 3. Export facet correlation matrix
    if facet_correlation_matrix is not None and not facet_correlation_matrix.empty:
        facet_correlation_matrix.to_csv(output_dir / 'facet_correlations_matrix.csv')
        print(f"✓ Facet correlation matrix: {facet_correlation_matrix.shape}")

    # 4. Export validation results (one row per combination x trait)
    if validation_results:
        validation_flat = []
        for combination, validation in validation_results.items():
            # Split once; pad with None so a key with fewer than three
            # '_'-separated parts yields blank fields instead of an IndexError
            # that would abort the whole export. Keys with three or more parts
            # are parsed exactly as before (first three tokens).
            key_format, key_model, key_temperature = (
                combination.split('_') + [None, None, None]
            )[:3]
            base_record = {
                'combination': combination,
                'format': key_format,
                'model': key_model,
                'temperature': key_temperature,
                'mean_correlation': validation['mean_correlation'],
                'median_correlation': validation['median_correlation'],
                'std_correlation': validation['std_correlation'],
                'n_traits': validation['n_traits']
            }

            # Add trait-level details
            for trait, details in validation['trait_details'].items():
                validation_flat.append({
                    **base_record,
                    'trait': trait,
                    'trait_correlation': details['correlation'],
                    'trait_p_value': details['p_value'],
                    'trait_n_participants': details['n_participants'],
                    'empirical_mean': details['empirical_mean'],
                    'llm_mean': details['llm_mean'],
                    'empirical_std': details['empirical_std'],
                    'llm_std': details['llm_std']
                })

        validation_df = pd.DataFrame(validation_flat)
        validation_df.to_csv(output_dir / 'empirical_validation_details.csv', index=False)
        print(f"✓ Validation details: {len(validation_df)} rows")

    # 5. Create analysis summary
    def _distinct(column):
        # Number of distinct values in a results column; 0 when there are no results.
        return len(combined_results_df[column].unique()) if has_results else 0

    summary_stats = {
        'analysis_timestamp': pd.Timestamp.now().isoformat(),
        'study': 'Study 3 - Facet-Level Parameter Extraction',
        'n_participants': _distinct('participant_id'),
        'n_models': _distinct('model'),
        'n_temperatures': _distinct('temperature'),
        'n_formats': _distinct('format'),
        'n_traits': len(trait_columns)
    }

    with open(output_dir / 'analysis_metadata.json', 'w', encoding='utf-8') as f:
        json.dump(summary_stats, f, indent=2)
    print("✓ Analysis metadata saved")

    # 6. Create R-ready format (long layout: one row per participant x trait)
    if has_results:
        r_format = combined_results_df.melt(
            id_vars=id_columns,
            value_vars=trait_columns,
            var_name='trait', value_name='response'
        )
        r_format.to_csv(output_dir / 'results_r_format.csv', index=False)
        print(f"✓ R-ready format: {len(r_format)} rows")

    print(f"\n📁 All Study 3 results exported to: {output_dir}/")
    print("Files available for further analysis:")
    # Listed by type, in the same CSV -> JSON -> PNG order as before.
    for pattern in ('*.csv', '*.json', '*.png'):
        for file in sorted(output_dir.glob(pattern)):
            print(f"  - {file.name}")

# Write every analysis artefact to disk via the export helper
export_study3_results(
    combined_results_df,
    combined_stats_df,
    format_quality,
    facet_correlation_matrix,
    validation_results,
)

# Closing banner, then the suggested follow-up steps emitted as one
# newline-joined print so the console output stays line-for-line the same
print("\n🎉 STUDY 3 MULTI-MODEL ANALYSIS COMPLETE! 🎉")
print("\n".join((
    "\nNext steps:",
    "1. Review the comprehensive report: study_3_results/comprehensive_analysis_report.md",
    "2. Examine visualizations in study_3_results/",
    "3. Use exported CSV files for statistical analysis in R or Python",
    "4. Compare with Studies 2 and 4 using similar analysis frameworks",
    "5. Conduct factor analysis using R scripts for personality structure validation",
)))