# Analyze scores for each trait

In [1]:
import json
import os
import numpy as np
import pandas as pd

In [2]:
# Load the per-trait score files from data/extract_scores.
score_dir = "/root/git/persona-subspace/traits/data/extract_scores"

# Read every *.json file in the directory into `scores`, keyed by the file
# name without its ".json" suffix. os.listdir returns entries in arbitrary
# order, so the listing is sorted to make the trait order (and the
# "first trait" examples printed by later cells) reproducible.
json_suffix = ".json"
scores = {}
for file in sorted(os.listdir(score_dir)):
    if not file.endswith(json_suffix):
        continue
    # Strip only the trailing suffix; str.replace would also remove ".json"
    # occurring in the middle of a file name.
    trait_name = file[: -len(json_suffix)]
    with open(os.path.join(score_dir, file), "r", encoding="utf-8") as f:
        scores[trait_name] = json.load(f)

print(f"Found {len(scores)} traits with scores")


Found 240 traits with scores


In [4]:
# Analyze refusals and clean data.
#
# Every value in a trait's score object is either a numeric score or the
# literal string "REFUSAL". Refusals are recorded per trait and stored as NaN
# in the cleaned scores, so that the nan-aware statistics computed in later
# cells skip them instead of counting them as a genuine score of 0.
refusal_info = {}
scores_clean = {}

for trait, score_obj in scores.items():
    # Keys (e.g. "neg_4") whose score was a refusal
    refusals = [key for key, value in score_obj.items() if value == "REFUSAL"]

    # float() raises ValueError for any non-numeric value other than "REFUSAL",
    # which surfaces unexpected entries instead of silently keeping them.
    scores_clean[trait] = {
        key: np.nan if value == "REFUSAL" else float(value)
        for key, value in score_obj.items()
    }
    refusal_info[trait] = {
        "refusals": refusals,
        "refusal_count": len(refusals)
    }

# Show refusal statistics
total_refusals = sum(info["refusal_count"] for info in refusal_info.values())
traits_with_refusals = sum(1 for info in refusal_info.values() if info["refusal_count"] > 0)

print("Refusal Statistics:")
print(f"Total refusals across all traits: {total_refusals}")
print(f"Traits with refusals: {traits_with_refusals}")

if total_refusals > 0:
    # Most-refused traits first; only traits with at least one refusal are listed
    sorted_refusals = sorted(refusal_info.items(), key=lambda x: x[1]["refusal_count"], reverse=True)
    print("\nTop 10 traits with most refusals:")
    for trait, info in sorted_refusals[:10]:
        if info["refusal_count"] > 0:
            print(f"  {trait}: {info['refusal_count']} refusals - {info['refusals']}")

Refusal Statistics:
Total refusals across all traits: 1
Traits with refusals: 1

Top 10 traits with most refusals:
  generalist: 1 refusals - ['neg_4']


In [5]:
# Build one (3, 20) numpy array per trait from the cleaned scores.
# Row 0 holds the "pos_*" scores, row 1 the "neg_*" scores and row 2 the
# "default_*" scores; column q is question q (0-19).
scores_np = {
    trait: np.array(
        [
            [trait_scores[f"{prompt_kind}_{q}"] for q in range(20)]
            for prompt_kind in ("pos", "neg", "default")
        ]
    )
    for trait, trait_scores in scores_clean.items()
}

# Report how many arrays were built and show the first one as an example
print(f"Created numpy arrays for {len(scores_np)} traits")
first_trait = next(iter(scores_np))
print(f"Shape of each array: {scores_np[first_trait].shape}")
print(f"Example (first trait): {first_trait}")
print(scores_np[first_trait])

Created numpy arrays for 240 traits
Shape of each array: (3, 20)
Example (first trait): absolutist
[[ 0. 10. 10.  0.  0. 10.  0.  0. 10.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]
 [ 0. 10.  0. 10.  0. 10.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0. 10.]
 [ 0.  0.  0. 10.  0. 10.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
   0.  0.]]


In [6]:
# Summarise each trait's (3, 20) score array.
# Every reduction uses numpy's nan-aware variant, so NaN entries are skipped.
stats = {}

for trait, trait_matrix in scores_np.items():
    # Rows are ordered [pos, neg, default], one column per question
    pos_row, neg_row, default_row = trait_matrix

    # Per-question gap between the pos scores and each of the other two rows
    gap_vs_neg = pos_row - neg_row
    gap_vs_default = pos_row - default_row

    stats[trait] = {
        # Mean / std of each prompt type across questions
        "pos_mean": np.nanmean(pos_row),
        "pos_std": np.nanstd(pos_row),
        "neg_mean": np.nanmean(neg_row),
        "neg_std": np.nanstd(neg_row),
        "default_mean": np.nanmean(default_row),
        "default_std": np.nanstd(default_row),
        # Mean of the per-question gaps
        "pos_minus_default_mean": np.nanmean(gap_vs_default),
        "pos_minus_neg_mean": np.nanmean(gap_vs_neg),
        # Largest single-question gap and the index of the question it came from
        "max_pos_minus_neg_single": np.nanmax(gap_vs_neg),
        "max_pos_minus_neg_question": np.nanargmax(gap_vs_neg),
        "max_pos_minus_default_single": np.nanmax(gap_vs_default),
        "max_pos_minus_default_question": np.nanargmax(gap_vs_default),
    }

# Show example statistics for first trait
example_trait = list(stats)[0]
print(f"Example statistics for '{example_trait}':")
for key, value in stats[example_trait].items():
    # Floats are shown with two decimals; the integer question indices as-is
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

print(f"\nCalculated statistics for {len(stats)} traits")

Example statistics for 'absolutist':
  pos_mean: 2.00
  pos_std: 4.00
  neg_mean: 2.00
  neg_std: 4.00
  default_mean: 1.00
  default_std: 3.00
  pos_minus_default_mean: 1.00
  pos_minus_neg_mean: 0.00
  max_pos_minus_neg_single: 10.00
  max_pos_minus_neg_question: 2
  max_pos_minus_default_single: 10.00
  max_pos_minus_default_question: 1

Calculated statistics for 240 traits


In [None]:

# Export the analysis results as CSV files.
# Create results directory if it doesn't exist
results_dir = '/root/git/persona-subspace/traits/results'
os.makedirs(results_dir, exist_ok=True)

# 1. Export statistics to CSV (one row per trait, indexed by trait name)
stats_df = pd.DataFrame.from_dict(stats, orient='index')
stats_df.index.name = 'trait'
stats_df.to_csv(os.path.join(results_dir, 'trait_statistics.csv'))

# 1b. Export the same statistics sorted by each mean difference, largest first.
# The summary printed at the end of this cell lists both files, so they must
# actually be written here.
sorted_exports = (
    ('pos_minus_neg_mean', 'trait_statistics_pos_neg.csv'),
    ('pos_minus_default_mean', 'trait_statistics_pos_default.csv'),
)
for sort_column, filename in sorted_exports:
    stats_df.sort_values(sort_column, ascending=False).to_csv(
        os.path.join(results_dir, filename)
    )

# 2. Export refusal information to CSV
# Columns are passed explicitly so the header is written even when
# refusal_info is empty.
refusal_df = pd.DataFrame(
    [
        {
            'trait': trait,
            'refusal_count': info['refusal_count'],
            'refusals': ', '.join(info['refusals']),
        }
        for trait, info in refusal_info.items()
    ],
    columns=['trait', 'refusal_count', 'refusals'],
)
refusal_df.to_csv(os.path.join(results_dir, 'refusal_info.csv'), index=False)

# 3. Export raw scores to CSV (all scores in one file, one row per trait)
raw_scores_data = []
for trait, score_obj in scores.items():
    row = {'trait': trait}
    row.update(score_obj)
    raw_scores_data.append(row)

raw_scores_df = pd.DataFrame(raw_scores_data)
raw_scores_df.to_csv(os.path.join(results_dir, 'raw_scores.csv'), index=False)

print("Exported files to traits/results/:")
print("- trait_statistics.csv: Mean, std, and difference statistics for each trait")
print("- trait_statistics_pos_neg.csv: Sorted by largest pos-neg difference")
print("- trait_statistics_pos_default.csv: Sorted by largest pos-default difference")
print("- refusal_info.csv: Refusal counts and details per trait")
print("- raw_scores.csv: All original scores data")
print(f"\nStatistics shape: {stats_df.shape}")
print(f"Refusal info shape: {refusal_df.shape}")
print(f"Raw scores shape: {raw_scores_df.shape}")



Exported files to traits/results/:
- trait_statistics.csv: Mean, std, and difference statistics for each trait
- trait_statistics_pos_neg.csv: Sorted by largest pos-neg difference
- trait_statistics_pos_default.csv: Sorted by largest pos-default difference
- refusal_info.csv: Refusal counts and details per trait
- raw_scores.csv: All original scores data

Statistics shape: (240, 8)
Refusal info shape: (240, 3)
Raw scores shape: (240, 61)

Top 5 traits by pos-neg difference:
              pos_minus_neg_mean
trait                           
cruel                     100.00
conciliatory              100.00
metaphorical               99.75
melodramatic               99.75
calm                       99.50

Top 5 traits by pos-default difference:
              pos_minus_default_mean
trait                               
cruel                         100.00
nihilistic                     99.25
melodramatic                   98.25
misanthropic                   98.00
bitter                     

In [None]:
# TODO: also sort by the largest pos-neg and pos-default difference on any single question



## PCA

In [None]:
# TODO: run PCA on mean activations