In [None]:
import pandas as pd
df_poe = pd.concat([
    pd.read_csv(f'data/results_poe_{d}.csv')
    for d in ['commonsenseqa', 'socialqa', 'socialsupport', 'strategyqa']
])
df_identity = pd.read_csv("data/agent_identities.csv")


Columns (11) have mixed types. Specify dtype option on import or set low_memory=False.



## Extracting one profile for role

In [98]:
import os

roles = ["psychologist", "project-manager", "final_decisor"]

# Create output directory
output_dir = "expert_outputs"
os.makedirs(output_dir, exist_ok=True)

# Print unique expert IDs
print(df_identity["expert_id"].unique())

for i, role in enumerate(roles):
    print(f"\n--- Expert ID: {role} ---")
    
    subset = df_identity[df_identity["role"] == role]
    if subset.empty:
        continue
        
    row = subset.sample(n=1, random_state=42 + i).iloc[0]
    
    # Print to console
    print(row["description"])
    
    # Write to file
    file_path = os.path.join(output_dir, f"{role}.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(f"Role: {row['role']}\n")
        f.write(f"Model: {row.get('model', 'N/A')}\n")
        f.write(f"Framework: {row.get('desc_framework', 'N/A')}\n")
        f.write(f"Dataset: {row.get('dataset', 'N/A')}\n")
        f.write("\nDescription:\n")
        f.write(row["description"])

# Handle other experts
df_experts = df_identity[~df_identity["role"].isin(roles)]

if not df_experts.empty:
    row = df_experts.sample(n=1, random_state=42).iloc[0]
    
    print("\n--- Random Other Expert ---")
    print(row["description"])
    
    file_path = os.path.join(output_dir, "other_expert.txt")
    with open(file_path, "w", encoding="utf-8") as f:
        f.write(f"Role: {row['role']}\n")
        f.write(f"Model: {row.get('model', 'N/A')}\n")
        f.write(f"Framework: {row.get('desc_framework', 'N/A')}\n")
        f.write(f"Dataset: {row.get('dataset', 'N/A')}\n")
        f.write("\nDescription:\n")
        f.write(row["description"])

[-3 -2 -1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20
 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39]

--- Expert ID: psychologist ---
Dr. Alistair Finch is a 58-year-old forensic psychologist specializing in behavioral analysis and individual profiling. His career began with a strong emphasis on the ‘Initiative vs. Guilt’ stage (Erikson’s Stage 3), fostered during his early work with juvenile delinquents. He internalized a deep-seated belief in the potential for rehabilitation, a drive to instill purpose and direction, often pushing himself to aggressively pursue solutions, sometimes bordering on self-righteousness. This early experience left him with a persistent, though carefully managed, fear of failure and a tendency to hold himself and others to impossibly high standards – a residue of that initial, fervent belief. nnAs he progressed through ‘Industry vs. Inferiority’ (Stage 4), Finch honed his skills through meticulous observation and data analysis.

## Generate unique identifier and merge

In [99]:
import hashlib

# Function to hash a row based on key columns   
key_columns = ['exp', 'dataset', 'desc_framework', 'model', 'expert_id']

def make_hash(row):
    row_str = "|".join(str(row[col]) for col in key_columns)    
    return hashlib.md5(row_str.encode()).hexdigest()

df_identity['synthetic_id'] = df_identity.apply(make_hash, axis=1)
cols = ["synthetic_id"] + [c for c in df_identity.columns if c != "synthetic_id"]
df_identity = df_identity[cols]

df_poe['synthetic_id'] = df_poe.apply(make_hash, axis=1)
cols = ["synthetic_id"] + [c for c in df_poe.columns if c != "synthetic_id"]
df_poe = df_poe[cols]

In [100]:
# only experts, average accuracy
df_experts = df_poe[df_poe["expert_id"] != -3].copy()
df_experts["match"] = df_experts["final_answer_norm"].astype(str) == df_experts["golden_answer_norm"].astype(str)
df_experts = df_experts.groupby(["synthetic_id","model", "dataset", "desc_framework"])["match"].mean().reset_index()

In [101]:
# add identities info
df_merged = pd.merge(df_experts, df_identity, on="synthetic_id", how="left", suffixes=("", "_drop"))
df_merged = df_merged.filter(regex="^(?!.*_drop)")
df_merged.shape[0]

5185

In [102]:
df_merged.head()

Unnamed: 0,synthetic_id,model,dataset,desc_framework,match,exp,expert_id,role,name,description,file_path,cleaned_name,first_name,last_name,title_name,predicted_gender,predicted_country,predicted_age
0,0000cf8793c955db7984ce9fe8559328,llama-3.1-8b-instruct,SocialSupport,Dual-Process Theory,0.42,Experiments,8.0,Psychology of Communication,Dr. Rachel Kim,A renowned expert in the field of Psychology o...,experts.json,Rachel Kim,Rachel,Kim,Dr.,female,KR,52.0
1,00020fadf292f792fd2b369a1206d147,llama-3.3-70b-instruct,StrategyQA,Myers-Briggs Type Indicator,0.76,Experiments,3.0,Philosophy,Dr. Elena Vasquez,"The Philosopher is an INTP (Introverted, Intui...",experts.json,Elena Vasquez,Elena,Vasquez,Dr.,female,PE,52.0
2,000b67dafef78e4c64510a2e0370132f,gemma-3-12b-it,CommonsenseQA,Erikson's Psychosocial Stages,0.83,Experiments,0.0,Linguistics,Dr. Vivian Holloway,"Dr. Vivian Holloway, a renowned historical lin...",experts.json,Vivian Holloway,Vivian,Holloway,Dr.,female,GB,66.0
3,002944c7a31dd5b1e28e1bdf3943e838,gemma-3-27b-it,CommonsenseQA,no-description,0.86,,,,,,,,,,,,,
4,0033f77bbe92c81036b3fd52a0d96fd4,gemma-3-4b-it,StrategyQA,no-description,0.67,,,,,,,,,,,,,


In [103]:
df_merged["expert_id"].unique()

array([ 8.,  3.,  0., nan,  4.,  9.,  7., 16., 10.,  2.,  1., 12.,  5.,
       19.,  6., 11., 22., 13., 36., 14., 25., 18., 26., 15., 35., 17.,
       24., 34., 28., 30., 20., 39., 21., 32., 23., 31., 29., 27., 37.,
       33., 38.])

In [104]:
df_merged["name"].isna().sum()
df_merged[df_merged["description"].isna()][["model", "dataset", "desc_framework"]].value_counts()
df_merged[df_merged["name"].isna()][["model", "dataset", "desc_framework"]].value_counts()   

model                   dataset        desc_framework
gemma-3-12b-it          SocialQA       no-description    16
gemma-3-4b-it           CommonsenseQA  no-description    16
gemma-3-27b-it          StrategyQA     no-description    16
gemma-3-12b-it          StrategyQA     no-description    15
                        SocialSupport  no-description    15
llama-3.1-8b-instruct   StrategyQA     no-description    15
gemma-3-27b-it          SocialSupport  no-description    15
gemma-3-4b-it           StrategyQA     no-description    15
gemma-3-27b-it          SocialQA       no-description    14
llama-3.1-8b-instruct   SocialQA       no-description    14
gemma-3-4b-it           SocialQA       no-description    13
llama-3.1-8b-instruct   SocialSupport  no-description    12
gemma-3-27b-it          CommonsenseQA  no-description    12
llama-3.2-3b-instruct   SocialSupport  no-description    12
gemma-3-4b-it           SocialSupport  no-description    11
llama-3.2-3b-instruct   SocialQA       no-desc

In [105]:
df_merged["dataset"].unique()

array(['SocialSupport', 'StrategyQA', 'CommonsenseQA', 'SocialQA'],
      dtype=object)

In [106]:
# KEEP ONLY SUBSET OF MODELS
models_to_keep = [
    'gemma-3-12b-it',
    'gemma-3-27b-it', 
    'gemma-3-4b-it',
    'llama-3.1-8b-instruct',
    'llama-3.2-3b-instruct',
    'llama-3.3-70b-instruct',
    'llama-4-scout',
    'mistral-nemo',
    'nova-micro-v1',
    # 'qwen3-32b',
    # 'qwen3-8b'
]

df_merged = df_merged[df_merged["model"].isin(models_to_keep)]
df_merged.shape[0]

5185

In [107]:
df_merged.head()

Unnamed: 0,synthetic_id,model,dataset,desc_framework,match,exp,expert_id,role,name,description,file_path,cleaned_name,first_name,last_name,title_name,predicted_gender,predicted_country,predicted_age
0,0000cf8793c955db7984ce9fe8559328,llama-3.1-8b-instruct,SocialSupport,Dual-Process Theory,0.42,Experiments,8.0,Psychology of Communication,Dr. Rachel Kim,A renowned expert in the field of Psychology o...,experts.json,Rachel Kim,Rachel,Kim,Dr.,female,KR,52.0
1,00020fadf292f792fd2b369a1206d147,llama-3.3-70b-instruct,StrategyQA,Myers-Briggs Type Indicator,0.76,Experiments,3.0,Philosophy,Dr. Elena Vasquez,"The Philosopher is an INTP (Introverted, Intui...",experts.json,Elena Vasquez,Elena,Vasquez,Dr.,female,PE,52.0
2,000b67dafef78e4c64510a2e0370132f,gemma-3-12b-it,CommonsenseQA,Erikson's Psychosocial Stages,0.83,Experiments,0.0,Linguistics,Dr. Vivian Holloway,"Dr. Vivian Holloway, a renowned historical lin...",experts.json,Vivian Holloway,Vivian,Holloway,Dr.,female,GB,66.0
3,002944c7a31dd5b1e28e1bdf3943e838,gemma-3-27b-it,CommonsenseQA,no-description,0.86,,,,,,,,,,,,,
4,0033f77bbe92c81036b3fd52a0d96fd4,gemma-3-4b-it,StrategyQA,no-description,0.67,,,,,,,,,,,,,


## Start Analysis

In [108]:
"""
RQ3 Profile Analysis Script
Analyzes Psychologist Agent generated profiles for:
- RQ3.3: Demographic patterns (gender, age, nationality)
- RQ3.4: Correlation with performance
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from collections import Counter

# Set style
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette("husl")

def load_data(filepath):
    """Load the profile dataframe."""
    df = pd.read_csv(filepath)
    print(f"Loaded {len(df)} profiles")
    print(f"Columns: {df.columns.tolist()}")
    return df


# =============================================================================
# RQ3.3: DEMOGRAPHIC ANALYSIS
# =============================================================================

def analyze_gender_distribution(df):
    """Analyze gender distribution overall and by framework."""
    
    print("\n" + "="*60)
    print("GENDER DISTRIBUTION ANALYSIS")
    print("="*60)
    
    # Overall distribution
    gender_counts = df['predicted_gender'].value_counts()
    gender_pct = df['predicted_gender'].value_counts(normalize=True) * 100
    
    print("\n--- Overall Gender Distribution ---")
    for gender in gender_counts.index:
        print(f"  {gender}: {gender_counts[gender]} ({gender_pct[gender]:.1f}%)")
    
    # By framework
    print("\n--- Gender Distribution by Framework ---")
    gender_by_fw = pd.crosstab(df['desc_framework'], df['predicted_gender'], normalize='index') * 100
    print(gender_by_fw.round(1))
    
    # Chi-square test for independence
    contingency = pd.crosstab(df['desc_framework'], df['predicted_gender'])
    chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
    print(f"\nChi-square test: χ² = {chi2:.2f}, p = {p_value:.4f}, dof = {dof}")
    if p_value < 0.05:
        print("→ Significant association between framework and gender distribution")
    else:
        print("→ No significant association between framework and gender distribution")
    
    return gender_by_fw, (chi2, p_value)


def analyze_age_distribution(df):
    """Analyze age distribution overall and by framework."""
    
    print("\n" + "="*60)
    print("AGE DISTRIBUTION ANALYSIS")
    print("="*60)
    
    # Overall statistics
    age_stats = df['predicted_age'].describe()
    print("\n--- Overall Age Statistics ---")
    print(f"  Mean: {age_stats['mean']:.1f}")
    print(f"  Std:  {age_stats['std']:.1f}")
    print(f"  Min:  {age_stats['min']:.0f}")
    print(f"  Max:  {age_stats['max']:.0f}")
    print(f"  Median: {age_stats['50%']:.0f}")
    
    # By framework
    print("\n--- Age by Framework ---")
    age_by_fw = df.groupby('desc_framework')['predicted_age'].agg(['mean', 'std', 'median'])
    age_by_fw = age_by_fw.sort_values('mean', ascending=False)
    print(age_by_fw.round(1))
    
    # ANOVA test
    frameworks = df['desc_framework'].unique()
    groups = [df[df['desc_framework'] == fw]['predicted_age'].dropna() for fw in frameworks]
    f_stat, p_value = stats.f_oneway(*groups)
    print(f"\nANOVA test: F = {f_stat:.2f}, p = {p_value:.4f}")
    if p_value < 0.05:
        print("→ Significant difference in age across frameworks")
    else:
        print("→ No significant difference in age across frameworks")
    
    return age_by_fw, (f_stat, p_value)


def analyze_nationality_distribution(df, top_n=15):
    """Analyze nationality distribution."""
    
    print("\n" + "="*60)
    print("NATIONALITY DISTRIBUTION ANALYSIS")
    print("="*60)
    
    # Overall distribution (top N)
    nationality_counts = df['predicted_country'].value_counts()
    nationality_pct = df['predicted_country'].value_counts(normalize=True) * 100
    
    print(f"\n--- Top {top_n} Nationalities ---")
    for i, (nat, count) in enumerate(nationality_counts.head(top_n).items()):
        print(f"  {i+1}. {nat}: {count} ({nationality_pct[nat]:.1f}%)")
    
    # Concentration metric
    top5_pct = nationality_pct.head(5).sum()
    print(f"\nTop 5 nationalities account for {top5_pct:.1f}% of all profiles")
    
    # US dominance by framework
    df['is_us'] = df['predicted_country'] == 'United States'
    us_by_fw = df.groupby('desc_framework')['is_us'].mean() * 100
    print("\n--- US Representation by Framework ---")
    print(us_by_fw.sort_values(ascending=False).round(1))
    
    return nationality_counts, us_by_fw


def analyze_gender_by_model(df):
    """Analyze if different models show different gender biases."""
    
    print("\n" + "="*60)
    print("GENDER DISTRIBUTION BY MODEL")
    print("="*60)
    
    gender_by_model = pd.crosstab(df['model'], df['predicted_gender'], normalize='index') * 100
    print(gender_by_model.round(1))
    
    # Chi-square test
    contingency = pd.crosstab(df['model'], df['predicted_gender'])
    chi2, p_value, dof, expected = stats.chi2_contingency(contingency)
    print(f"\nChi-square test: χ² = {chi2:.2f}, p = {p_value:.4f}")
    
    return gender_by_model


def analyze_gender_by_dataset(df):
    """Analyze if different datasets show different gender distributions."""
    
    print("\n" + "="*60)
    print("GENDER DISTRIBUTION BY DATASET")
    print("="*60)
    
    gender_by_dataset = pd.crosstab(df['dataset'], df['predicted_gender'], normalize='index') * 100
    print(gender_by_dataset.round(1))
    
    return gender_by_dataset


# =============================================================================
# RQ3.4: PERFORMANCE CORRELATION
# =============================================================================

def analyze_gender_performance(df):
    """Analyze correlation between gender and task performance."""
    
    print("\n" + "="*60)
    print("GENDER vs PERFORMANCE ANALYSIS")
    print("="*60)
    
    # Overall accuracy by gender
    acc_by_gender = df.groupby('predicted_gender')['match'].agg(['mean', 'std', 'count'])
    acc_by_gender.columns = ['accuracy', 'std', 'n']
    acc_by_gender['accuracy'] = acc_by_gender['accuracy'] * 100
    acc_by_gender['std'] = acc_by_gender['std'] * 100
    print("\n--- Accuracy by Gender ---")
    print(acc_by_gender.round(2))
    
    # Statistical test
    male_acc = df[df['predicted_gender'] == 'male']['match']
    female_acc = df[df['predicted_gender'] == 'female']['match']
    
    if len(male_acc) > 0 and len(female_acc) > 0:
        t_stat, p_value = stats.ttest_ind(male_acc, female_acc)
        print(f"\nT-test (male vs female): t = {t_stat:.3f}, p = {p_value:.4f}")
        
        # Effect size (Cohen's d)
        cohens_d = (male_acc.mean() - female_acc.mean()) / np.sqrt(
            ((len(male_acc)-1)*male_acc.std()**2 + (len(female_acc)-1)*female_acc.std()**2) / 
            (len(male_acc) + len(female_acc) - 2)
        )
        print(f"Cohen's d: {cohens_d:.3f}")
    
    # By dataset
    print("\n--- Accuracy by Gender × Dataset ---")
    acc_gender_dataset = df.groupby(['dataset', 'predicted_gender'])['match'].mean().unstack() * 100
    print(acc_gender_dataset.round(1))
    
    return acc_by_gender, acc_gender_dataset


def analyze_age_performance(df):
    """Analyze correlation between age and task performance."""
    
    print("\n" + "="*60)
    print("AGE vs PERFORMANCE ANALYSIS")
    print("="*60)
    
    # Correlation
    corr, p_value = stats.pearsonr(df['predicted_age'].dropna(), 
                                    df.loc[df['predicted_age'].notna(), 'match'])
    print(f"\nPearson correlation (age vs accuracy): r = {corr:.3f}, p = {p_value:.4f}")
    
    # Bin age into groups
    df['age_group'] = pd.cut(df['predicted_age'], 
                              bins=[0, 35, 45, 55, 65, 100],
                              labels=['<35', '35-44', '45-54', '55-64', '65+'])
    
    acc_by_age = df.groupby('age_group')['match'].agg(['mean', 'count'])
    acc_by_age.columns = ['accuracy', 'n']
    acc_by_age['accuracy'] = acc_by_age['accuracy'] * 100
    print("\n--- Accuracy by Age Group ---")
    print(acc_by_age.round(1))
    
    # ANOVA
    groups = [df[df['age_group'] == ag]['match'].dropna() for ag in df['age_group'].dropna().unique()]
    groups = [g for g in groups if len(g) > 0]
    if len(groups) > 1:
        f_stat, p_value_anova = stats.f_oneway(*groups)
        print(f"\nANOVA (age groups): F = {f_stat:.2f}, p = {p_value_anova:.4f}")
    
    return corr, acc_by_age


def analyze_nationality_performance(df, top_n=10):
    """Analyze correlation between nationality and task performance."""
    
    print("\n" + "="*60)
    print("NATIONALITY vs PERFORMANCE ANALYSIS")
    print("="*60)
    
    # Get top nationalities
    top_nationalities = df['predicted_country'].value_counts().head(top_n).index.tolist()
    df_top = df[df['predicted_country'].isin(top_nationalities)]
    
    acc_by_nat = df_top.groupby('predicted_country')['match'].agg(['mean', 'count'])
    acc_by_nat.columns = ['accuracy', 'n']
    acc_by_nat['accuracy'] = acc_by_nat['accuracy'] * 100
    acc_by_nat = acc_by_nat.sort_values('accuracy', ascending=False)
    
    print(f"\n--- Accuracy by Nationality (Top {top_n}) ---")
    print(acc_by_nat.round(1))
    
    # ANOVA
    groups = [df_top[df_top['predicted_country'] == nat]['match'] for nat in top_nationalities]
    f_stat, p_value = stats.f_oneway(*groups)
    print(f"\nANOVA: F = {f_stat:.2f}, p = {p_value:.4f}")
    
    return acc_by_nat


# =============================================================================
# VISUALIZATION FUNCTIONS
# =============================================================================

def plot_gender_by_framework(df, save_path=None):
    """Plot gender distribution by framework."""
    
    fig, ax = plt.subplots(figsize=(12, 6))
    
    gender_by_fw = pd.crosstab(df['desc_framework'], df['predicted_gender'], normalize='index') * 100
    gender_by_fw = gender_by_fw.sort_values('female', ascending=True)
    
    gender_by_fw.plot(kind='barh', stacked=True, ax=ax, color=['#4C72B0', '#DD8452'])
    
    ax.set_xlabel('Percentage (%)')
    ax.set_ylabel('Framework')
    ax.set_title('Gender Distribution by Psychological Framework')
    ax.legend(title='Gender', loc='lower right')
    ax.axvline(x=50, color='black', linestyle='--', alpha=0.5, label='Parity')
    
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    return fig


def plot_age_by_framework(df, save_path=None):
    """Plot age distribution by framework."""
    
    fig, ax = plt.subplots(figsize=(12, 6))
    
    # Order by mean age
    order = df.groupby('desc_framework')['predicted_age'].mean().sort_values().index
    
    sns.boxplot(data=df, y='desc_framework', x='predicted_age', order=order, ax=ax)
    ax.set_xlabel('Predicted Age')
    ax.set_ylabel('Framework')
    ax.set_title('Age Distribution by Psychological Framework')
    
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    return fig


def plot_gender_performance_by_dataset(df, save_path=None):
    """Plot accuracy by gender across datasets."""
    
    fig, ax = plt.subplots(figsize=(10, 6))
    
    acc_data = df.groupby(['dataset', 'predicted_gender'])['match'].mean().unstack() * 100
    
    x = np.arange(len(acc_data.index))
    width = 0.35
    
    bars1 = ax.bar(x - width/2, acc_data['female'], width, label='Female', color='#DD8452')
    bars2 = ax.bar(x + width/2, acc_data['male'], width, label='Male', color='#4C72B0')
    
    ax.set_xlabel('Dataset')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Task Performance by Profile Gender')
    ax.set_xticks(x)
    ax.set_xticklabels(acc_data.index, rotation=45, ha='right')
    ax.legend()
    ax.set_ylim(0, 100)
    
    # Add value labels
    for bar in bars1:
        height = bar.get_height()
        ax.annotate(f'{height:.1f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords="offset points",
                    ha='center', va='bottom', fontsize=8)
    for bar in bars2:
        height = bar.get_height()
        ax.annotate(f'{height:.1f}',
                    xy=(bar.get_x() + bar.get_width() / 2, height),
                    xytext=(0, 3), textcoords="offset points",
                    ha='center', va='bottom', fontsize=8)
    
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    return fig


def plot_age_performance_scatter(df, save_path=None):
    """Plot age vs performance scatter with regression line."""
    
    fig, ax = plt.subplots(figsize=(10, 6))
    
    # Aggregate by age (to avoid overplotting)
    age_acc = df.groupby('predicted_age')['match'].mean() * 100
    age_counts = df.groupby('predicted_age')['match'].count()
    
    scatter = ax.scatter(age_acc.index, age_acc.values, 
                         s=age_counts.values/10, alpha=0.6, c='#4C72B0')
    
    # Regression line
    z = np.polyfit(df['predicted_age'].dropna(), 
                   df.loc[df['predicted_age'].notna(), 'match'] * 100, 1)
    p = np.poly1d(z)
    x_line = np.linspace(df['predicted_age'].min(), df['predicted_age'].max(), 100)
    ax.plot(x_line, p(x_line), "r--", alpha=0.8, label=f'Trend line')
    
    ax.set_xlabel('Predicted Age')
    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Task Performance by Profile Age')
    ax.legend()
    
    # Add correlation annotation
    corr, p_val = stats.pearsonr(df['predicted_age'].dropna(), 
                                  df.loc[df['predicted_age'].notna(), 'match'])
    ax.annotate(f'r = {corr:.3f}, p = {p_val:.3f}', 
                xy=(0.05, 0.95), xycoords='axes fraction',
                fontsize=10, ha='left', va='top',
                bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))
    
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    return fig


def plot_demographic_summary(df, save_path=None):
    """Create a summary figure with key demographic findings."""
    
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    
    # 1. Overall gender distribution (pie)
    ax1 = axes[0, 0]
    gender_counts = df['predicted_gender'].value_counts()
    ax1.pie(gender_counts.values, labels=gender_counts.index, autopct='%1.1f%%',
            colors=['#DD8452', '#4C72B0'], startangle=90)
    ax1.set_title('Overall Gender Distribution')
    
    # 2. Age histogram
    ax2 = axes[0, 1]
    ax2.hist(df['predicted_age'].dropna(), bins=20, color='#4C72B0', edgecolor='white', alpha=0.7)
    ax2.axvline(df['predicted_age'].mean(), color='red', linestyle='--', label=f"Mean: {df['predicted_age'].mean():.1f}")
    ax2.set_xlabel('Age')
    ax2.set_ylabel('Count')
    ax2.set_title('Age Distribution of Generated Profiles')
    ax2.legend()
    
    # 3. Top nationalities
    ax3 = axes[1, 0]
    top_nat = df['predicted_country'].value_counts().head(10)
    top_nat.plot(kind='barh', ax=ax3, color='#4C72B0')
    ax3.set_xlabel('Count')
    ax3.set_title('Top 10 Nationalities')
    ax3.invert_yaxis()
    
    # 4. Gender by framework (heatmap style)
    ax4 = axes[1, 1]
    gender_by_fw = pd.crosstab(df['desc_framework'], df['predicted_gender'], normalize='index') * 100
    female_pct = gender_by_fw['female'].sort_values()
    ax4.barh(female_pct.index, female_pct.values, color='#DD8452')
    ax4.axvline(x=50, color='black', linestyle='--', alpha=0.5)
    ax4.set_xlabel('Female %')
    ax4.set_title('Female Representation by Framework')
    ax4.set_xlim(0, 100)
    
    plt.tight_layout()
    if save_path:
        plt.savefig(save_path, dpi=300, bbox_inches='tight')
    plt.show()
    
    return fig


# =============================================================================
# MAIN ANALYSIS RUNNER
# =============================================================================

def run_full_analysis(df):
    """Run the complete RQ3 analysis pipeline."""
    
    print("\n" + "="*70)
    print("RQ3 PROFILE ANALYSIS - FULL REPORT")
    print("="*70)
    
    results = {}
    
    # RQ3.3: Demographic Analysis
    print("\n\n>>> RQ3.3: DEMOGRAPHIC PATTERNS AND BIAS <<<\n")
    
    results['gender_by_fw'], results['gender_chi2'] = analyze_gender_distribution(df)
    results['age_by_fw'], results['age_anova'] = analyze_age_distribution(df)
    results['nationality'], results['us_by_fw'] = analyze_nationality_distribution(df)
    results['gender_by_model'] = analyze_gender_by_model(df)
    results['gender_by_dataset'] = analyze_gender_by_dataset(df)
    
    # RQ3.4: Performance Correlation
    print("\n\n>>> RQ3.4: PROFILE CHARACTERISTICS AND PERFORMANCE <<<\n")
    
    results['gender_performance'], results['gender_perf_dataset'] = analyze_gender_performance(df)
    results['age_corr'], results['age_performance'] = analyze_age_performance(df)
    results['nationality_performance'] = analyze_nationality_performance(df)
    
    # Summary
    print("\n" + "="*70)
    print("SUMMARY OF KEY FINDINGS")
    print("="*70)
    
    gender_pct = df['predicted_gender'].value_counts(normalize=True) * 100
    print(f"\n1. Gender Imbalance: {gender_pct.get('male', 0):.1f}% male vs {gender_pct.get('female', 0):.1f}% female")
    
    print(f"2. Mean Age: {df['predicted_age'].mean():.1f} years (SD = {df['predicted_age'].std():.1f})")
    
    us_pct = (df['predicted_country'] == 'United States').mean() * 100
    print(f"3. US Dominance: {us_pct:.1f}% of profiles are American")
    
    chi2, p = results['gender_chi2']
    print(f"4. Framework-Gender Association: χ² = {chi2:.2f}, p = {p:.4f}")
    
    return results


# =============================================================================
# USAGE EXAMPLE
# =============================================================================

if __name__ == "__main__":
    # Load your data
    df = df_merged.copy()
    df = df[~df["desc_framework"].isin(["no-description"])]
    run_full_analysis(df)
    # Or create sample data for testing
    print("To run the analysis, load your dataframe and call:")
    print("  results = run_full_analysis(df)")
    print("\nFor visualizations:")
    print("  plot_gender_by_framework(df, 'gender_by_framework.png')")
    print("  plot_age_by_framework(df, 'age_by_framework.png')")
    print("  plot_gender_performance_by_dataset(df, 'gender_performance.png')")
    print("  plot_age_performance_scatter(df, 'age_performance.png')")
    print("  plot_demographic_summary(df, 'demographic_summary.png')")


RQ3 PROFILE ANALYSIS - FULL REPORT


>>> RQ3.3: DEMOGRAPHIC PATTERNS AND BIAS <<<


GENDER DISTRIBUTION ANALYSIS

--- Overall Gender Distribution ---
  female: 3099 (64.4%)
  male: 1409 (29.3%)
  uncertain: 215 (4.5%)
  nonbinary: 90 (1.9%)

--- Gender Distribution by Framework ---
predicted_gender                 female  male  nonbinary  uncertain
desc_framework                                                     
Big Five Personality Traits        43.1  54.8        1.6        0.5
Cognitive Behavioral Theory        60.4  33.0        2.6        4.0
Cognitive Load Theory              65.2  25.7        1.9        7.2
Dual-Process Theory                80.7  15.7        0.3        3.3
Enneagram of Personality Traits    49.7  40.9        3.8        5.6
Erikson's Psychosocial Stages      63.6  30.2        2.7        3.4
Flow Theory                        83.5   8.8        3.0        4.7
Freudian Psychoanalysis            64.5  30.2        1.4        3.9
Mental Models                      7





## Agent Nationality Distribution

In [109]:
import sys
import math
from collections import Counter

import plotly.express as px
import pandas as pd
import pycountry


def iso2_to_iso3(iso2: str) -> str:
    """Convert ISO-2 country code to ISO-3."""
    try:
        country = pycountry.countries.get(alpha_2=iso2.upper())
        if country:
            return country.alpha_3
    except:
        pass
    return None


def iso2_to_name(iso2: str) -> str:
    """Convert ISO-2 country code to country name."""
    try:
        country = pycountry.countries.get(alpha_2=iso2.upper())
        if country:
            return country.name
    except:
        pass
    return iso2


def generate_map(df, output_html, use_log_scale=True):
    """Generate choropleth map from a DataFrame with 'predicted_country' column."""
    print("=" * 50)
    print("Generate Choropleth Map from Predicted Countries")
    print("=" * 50)

    country_series = df["predicted_country"].dropna().str.strip()
    country_series = country_series[country_series != "ERROR"]

    country_counts = Counter(country_series)
    total_names = len(country_series)

    print(f"Total names: {total_names}")
    print(f"Unique countries: {len(country_counts)}")

    if not country_counts:
        print("ERROR: No valid country data found.")
        sys.exit(1)

    # Show top 10 countries
    print("\nTop 10 countries:")
    for iso2, count in country_counts.most_common(10):
        name = iso2_to_name(iso2)
        pct = count / total_names * 100
        print(f"  {iso2} ({name}): {count} ({pct:.1f}%)")

    # Create DataFrame with ISO-3 codes for plotly
    data = []
    skipped = []

    for iso2, count in country_counts.items():
        iso3 = iso2_to_iso3(iso2)
        if iso3:
            entry = {
                "iso2": iso2,
                "iso3": iso3,
                "country_name": iso2_to_name(iso2),
                "count": count,
                "percentage": count / total_names * 100,
            }
            if use_log_scale:
                entry["log_count"] = math.log10(count + 1)
            data.append(entry)
        else:
            skipped.append(iso2)

    if skipped:
        print(f"\nSkipped (invalid ISO-2 codes): {', '.join(skipped)}")

    plot_df = pd.DataFrame(data)

    color_col = "log_count" if use_log_scale else "count"
    hover_data = {
        "iso3": False,
        "iso2": True,
        "count": True,
        "percentage": ":.1f",
    }
    if use_log_scale:
        hover_data["log_count"] = False

    fig = px.choropleth(
        plot_df,
        locations="iso3",
        locationmode="ISO-3",
        color=color_col,
        hover_name="country_name",
        hover_data=hover_data,
#       color_continuous_scale="Blues",
        color_continuous_scale="YlOrRd",
        title=f"Distribution of Agent Nationalities",
        labels={
            "count": "Number of Names",
            "percentage": "Percentage (%)",
            "iso2": "ISO Code",
        },
    )

    if use_log_scale:
        max_count = plot_df["count"].max()
        tick_values = [1, 10, 100, 1000, 10000, 100000]
        tick_values = [v for v in tick_values if v <= max_count + 10]
        tickvals = [math.log10(v + 1) for v in tick_values]

        fig.update_layout(
            coloraxis_colorbar=dict(
                title="Count",
                tickvals=tickvals,
                ticktext=[str(v) for v in tick_values],
            )
        )

    fig.update_layout(
        geo=dict(
            showframe=False,
            showcoastlines=True,
            projection_type="natural earth",
            bgcolor="white",
            landcolor="lightgray",
            showland=True,
        ),
        title_x=0.5,
        margin=dict(l=0, r=0, t=50, b=0),
        paper_bgcolor="white",
    )

    fig.update_traces(
        marker_line_color="darkgray",
        marker_line_width=0.5,
    )

    fig.write_html(output_html)
    print(f"\nMap saved: {output_html}")

    try:
        fig.show(renderer="browser")
    except:
        print("(Could not open browser, but HTML file is saved)")

    print("\nDone!")

In [110]:
# Linear scale
generate_map(df, "map_linear.html", use_log_scale=False)

Generate Choropleth Map from Predicted Countries
Total names: 4784
Unique countries: 62

Top 10 countries:
  GB (United Kingdom): 1758 (36.7%)
  KR (Korea, Republic of): 1404 (29.3%)
  IE (Ireland): 348 (7.3%)
  NZ (New Zealand): 241 (5.0%)
  UA (Ukraine): 214 (4.5%)
  ES (Spain): 164 (3.4%)
  CA (Canada): 133 (2.8%)
  PT (Portugal): 59 (1.2%)
  AU (Australia): 54 (1.1%)
  TW (Taiwan, Province of China): 50 (1.0%)

Map saved: map_linear.html
Opening in existing browser session.

Done!


In [111]:
# Log scale (default)
generate_map(df, "map_log.html", use_log_scale=True,)

Generate Choropleth Map from Predicted Countries
Total names: 4784
Unique countries: 62

Top 10 countries:
  GB (United Kingdom): 1758 (36.7%)
  KR (Korea, Republic of): 1404 (29.3%)
  IE (Ireland): 348 (7.3%)
  NZ (New Zealand): 241 (5.0%)
  UA (Ukraine): 214 (4.5%)
  ES (Spain): 164 (3.4%)
  CA (Canada): 133 (2.8%)
  PT (Portugal): 59 (1.2%)
  AU (Australia): 54 (1.1%)
  TW (Taiwan, Province of China): 50 (1.0%)

Map saved: map_log.html
Opening in existing browser session.

Done!


## Biased Models

In [112]:
def find_biased_models(df: pd.DataFrame, variable:str, category_col: str, primary_group: str, secondary_group: str, metric_col: str = "match") -> list[str]:

    def check_bias(group):
        """Helper function to apply to each model group."""
        try:
            # Get the match rate for the primary group (e.g., 'male')
            primary_match = group[group[category_col] == primary_group][metric_col].iloc[0]
            # Get the match rate for the secondary group (e.g., 'female')
            secondary_match = group[group[category_col] == secondary_group][metric_col].iloc[0]

            # Check if primary_group performance is greater than secondary_group performance
            return primary_match > secondary_match
        except IndexError:
            # Handle cases where one or both groups might be missing for a model
            return False

    group = df.groupby([variable, category_col])[metric_col].mean().reset_index()

    # Apply the filtering function across all unique models
    biased_df = group.groupby(variable).filter(check_bias)

    # Return the unique list of model names from the filtered DataFrame
    return biased_df[variable].unique().tolist()

In [113]:
# 1. Use the function to find models biased towards 'male'
biased_to_male = find_biased_models(
    df=df_merged,
    variable="model",
    category_col="predicted_gender",
    primary_group="male",
    secondary_group="female",
    metric_col="match"
)

print("\n--- Results: Models where Male Match > Female Match ---")
for model in biased_to_male:
    print(model) # Expected: A


--- Results: Models where Male Match > Female Match ---
gemma-3-12b-it
gemma-3-27b-it
gemma-3-4b-it
llama-3.1-8b-instruct
llama-3.3-70b-instruct
llama-4-scout
nova-micro-v1


In [114]:
group = df_merged.groupby(["model", "predicted_gender"])["match"].mean().reset_index()
group

Unnamed: 0,model,predicted_gender,match
0,gemma-3-12b-it,female,0.691773
1,gemma-3-12b-it,male,0.720842
2,gemma-3-12b-it,uncertain,0.73
3,gemma-3-27b-it,female,0.688418
4,gemma-3-27b-it,male,0.716952
5,gemma-3-27b-it,nonbinary,0.5625
6,gemma-3-27b-it,uncertain,0.748
7,gemma-3-4b-it,female,0.612042
8,gemma-3-4b-it,male,0.639449
9,gemma-3-4b-it,nonbinary,0.61875


## Performance by age

In [115]:
# group by age bracket
# create age brackets
df_merged['age_bracket'] = pd.cut(df_merged['predicted_age'], bins=[0, 18, 30, 45, 60, 100], labels=['<18', '18-29', '30-44', '45-59', '60+'])
df_merged.groupby('age_bracket')['match'].mean()





age_bracket
<18           NaN
18-29    0.708000
30-44    0.657572
45-59    0.638573
60+      0.679286
Name: match, dtype: float64

In [116]:
df_merged["age_bracket"].value_counts()

age_bracket
45-59    2285
60+      1848
30-44     622
18-29      15
<18         0
Name: count, dtype: int64

## Nationality

In [117]:
df_count = df_merged["predicted_country"].value_counts().reset_index()
df_count
# filter those nationality with count > 10
freq_nat = df_count[df_count["count"] > 10]["predicted_country"].unique()

df_freq_nat = df_merged[df_merged["predicted_country"].isin(freq_nat)]
df_freq_nat.groupby('predicted_country')['match'].mean().sort_values(ascending=False)

predicted_country
PE    0.747692
AU    0.715000
FR    0.707368
IE    0.700172
DE    0.698182
GB    0.682793
UA    0.681963
NZ    0.674025
PT    0.663559
TT    0.662500
DK    0.661290
JM    0.635833
UY    0.632766
TW    0.630000
KR    0.621054
CA    0.611579
CH    0.576667
ES    0.568537
IN    0.556154
Name: match, dtype: float64

## Rachel Kim Analysis

In [139]:
df_kor = df_merged[df_merged["predicted_country"] == "KR"]
df_kor["name"].value_counts()

name
Dr. Rachel Kim             1232
Rachel Kim                  106
Dr. Jonathan Lee             44
Dr. Elara Lee                 8
Dr. Ethan Kim                 4
Dr. Carl Jung                 2
Dr. Eleanor Jung              1
Dr. Benjamin Lee              1
The name is: Rachel Kim       1
Carl Jung                     1
Jung                          1
Elara Lee                     1
Dr. Kim                       1
Dr. Rachel Lee                1
Name: count, dtype: int64

In [141]:
df_kor = df_merged[df_merged["predicted_country"] == "KR"]
# 95 % of the names are "Rachel Kim" or "Dr. Rachel Kim"
(df_kor["name"].value_counts()["Dr. Rachel Kim"] + df_kor["name"].value_counts()["Rachel Kim"]) / df_kor.shape[0]

# df[(df_kor["name"] == "Dr. Rachel Kim") | (df_kor["name"] == "Rachel Kim")]
df_kor[(df_kor["name"] == "Dr. Rachel Kim") | (df_kor["name"] == "Rachel Kim")]["model"].value_counts()

model
llama-3.2-3b-instruct     463
llama-3.1-8b-instruct     318
llama-3.3-70b-instruct    311
llama-4-scout             246
Name: count, dtype: int64

In [None]:
llama_models = ["llama-3.1-8b-instruct", "llama-3.2-3b-instruct", "llama-3.3-70b-instruct", "llama-4-scout"]
df_llama = df_merged[df_merged["model"].isin(llama_models)]

df_llama[(df_llama["name"] == "Dr. Rachel Kim") | (df_llama["name"] == "Rachel Kim")].shape[0] / df_llama.shape[0]
# 0.5967885816235504, around 60% of the llama profiles are "Rachel Kim" or "Dr. Rachel Kim"


0.5967885816235504

In [144]:
df_au = df_merged[df_merged["predicted_country"] == "GB"]
df_au["name"].value_counts()


name
Dr. Alistair Finch             678
Dr. Vivian Holloway            260
Dr. Emma Taylor                 85
Professor Silas Blackwood       70
Dr. Silas Blackwood             67
                              ... 
Oliver 'Ollie' Kent              1
Dr. Benjamin 'Ben' Sinclair      1
Dr. Eleanor Thornfield           1
Dr. Silas Vance                  1
Dr. Eleanor Hartfield            1
Name: count, Length: 115, dtype: int64

In [148]:
df_au[(df_au["name"] == "Dr. Alistair Finch") | (df_au["name"] == "Alistair Finch")]["model"].value_counts()
df_au[(df_au["name"] == "Dr. Vivian Holloway") | (df_au["name"] == "Vivian Holloway")]["model"].value_counts()

model
gemma-3-12b-it    141
gemma-3-4b-it     106
gemma-3-27b-it     19
Name: count, dtype: int64

In [154]:
df_merged["name"].value_counts().head(20)
df_merged[df_merged["name"] == "Dr. Eleanor Vance"]["model"].value_counts()

model
gemma-3-12b-it    131
gemma-3-27b-it     31
Name: count, dtype: int64