# Analyze scores for each trait

In [1]:
import json
import os
import torch
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.subplots as sp
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
from pathlib import Path
from tqdm import tqdm


In [2]:
# Short model identifier; it is interpolated into the input and output paths below.
model_short = "gemma-3-27b"

# Per-model run directory, joined under /workspace when the score files are loaded.
# NOTE(review): this name shadows the builtin `dir`; it is kept because later
# cells read it under this name.
dir = f"{model_short}/traits_240"

## Score statistics

In [3]:
# Read every per-trait score file found in the extract_scores directory.
score_dir = f"/workspace/{dir}/extract_scores"

# Map: trait name (file name with ".json" removed) -> parsed JSON content.
scores = {}
json_names = [name for name in os.listdir(score_dir) if name.endswith(".json")]
for name in json_names:
    json_path = os.path.join(score_dir, name)
    with open(json_path, "r") as handle:
        scores[name.replace(".json", "")] = json.load(handle)

print(f"Found {len(scores.keys())} traits with scores")


Found 240 traits with scores


In [4]:
# Analyze refusals and clean data.
# A raw score is either a numeric value or the literal string "REFUSAL".
refusal_info = {}   # trait -> {"refusals": [score keys], "refusal_count": int}
scores_clean = {}   # trait -> {score key: float, NaN where the score was a refusal}

for trait, score_obj in scores.items():
    refusals = []
    cleaned_scores = {}

    # Check each score for refusals
    for key, value in score_obj.items():
        if value == "REFUSAL":
            refusals.append(key)
            # Mark refusals as missing (NaN) rather than 0: a 0 would be treated
            # as a genuine score by the threshold counts and nan-aware means
            # computed later, whereas NaN entries are skipped.
            cleaned_scores[key] = np.nan
        else:
            cleaned_scores[key] = float(value)  # Ensure numeric

    scores_clean[trait] = cleaned_scores
    refusal_info[trait] = {
        "refusals": refusals,
        "refusal_count": len(refusals)
    }

# Show refusal statistics
total_refusals = sum(info["refusal_count"] for info in refusal_info.values())
traits_with_refusals = sum(1 for info in refusal_info.values() if info["refusal_count"] > 0)

print(f"Refusal Statistics:")
print(f"Total refusals across all traits: {total_refusals}")
print(f"Traits with refusals: {traits_with_refusals}")

if total_refusals > 0:
    # List the worst offenders first; traits without refusals are not printed.
    sorted_refusals = sorted(refusal_info.items(), key=lambda x: x[1]["refusal_count"], reverse=True)
    print(f"\nTop 10 traits with most refusals:")
    for trait, info in sorted_refusals[:10]:
        if info["refusal_count"] > 0:
            print(f"  {trait}: {info['refusal_count']} refusals - {info['refusals']}")

Refusal Statistics:
Total refusals across all traits: 0
Traits with refusals: 0


In [5]:
# Pack the cleaned scores of every trait into a dense array.
# Layout: (3 types, 5 prompts, 240 questions); axis 0 is ordered pos, neg, default.
# Any (type, prompt, question) combination without an entry stays NaN.
scores_np = {}
score_types = ("pos", "neg", "default")

for trait, cleaned_scores in scores_clean.items():
    scores_3d = np.full((3, 5, 240), np.nan)

    for type_idx, type_name in enumerate(score_types):
        for prompt_idx in range(5):
            for question_idx in range(240):
                # Keys look like "pos_p0_q17"
                score_key = f"{type_name}_p{prompt_idx}_q{question_idx}"
                if score_key in cleaned_scores:
                    scores_3d[type_idx, prompt_idx, question_idx] = cleaned_scores[score_key]

    scores_np[trait] = scores_3d

# Quick sanity report on the first trait
print(f"Created numpy arrays for {len(scores_np)} traits")
print(f"Shape of each array: {next(iter(scores_np.values())).shape}")
print(f"Example (first trait): {list(scores_np.keys())[0]}")
example_trait = list(scores_np.keys())[0]
example_array = scores_np[example_trait]
print(f"Pos scores for first 2 prompts, 5 questions:\n{example_array[0, :2, :5]}")
print(f"Neg scores for first 2 prompts, 5 questions:\n{example_array[1, :2, :5]}")

Created numpy arrays for 240 traits
Shape of each array: (3, 5, 240)
Example (first trait): zealous
Pos scores for first 2 prompts, 5 questions:
[[ 98.  95.  95.  95. 100.]
 [ 95.  95.  95.  95. 100.]]
Neg scores for first 2 prompts, 5 questions:
[[20. 10. 10. 10. 20.]
 [20. 10. 10. 40. 20.]]


In [6]:
# Per-trait counts of pos scores falling into two bands:
#   pos_70_count    -> score >= 70
#   pos_40_70_count -> 40 <= score < 70
stats = {}

for trait, scores_3d in scores_np.items():
    pos_scores = scores_3d[0]  # index 0 on the first axis holds the pos scores

    # Discard NaN entries, then count each band over all remaining
    # (prompt, question) entries in one vectorized pass.
    observed = pos_scores[~np.isnan(pos_scores)]
    in_high_band = observed >= 70
    in_mid_band = (observed >= 40) & (observed < 70)

    stats[trait] = {
        "pos_70_count": int(np.count_nonzero(in_high_band)),
        "pos_40_70_count": int(np.count_nonzero(in_mid_band)),
    }


# Export to CSV (one row per trait, indexed by trait name)
stats_df = pd.DataFrame.from_dict(stats, orient="index").rename_axis("trait")

stats_df.to_csv(f"./results/{model_short}/pos.csv")
print(f"\nExported statistics to pos.csv")
print(f"Shape: {stats_df.shape}")


Exported statistics to pos.csv
Shape: (240, 2)


In [7]:
# Number of traits with pos_70_count >= 10
print(sum(1 for trait_stats in stats.values() if trait_stats["pos_70_count"] >= 10))

# Number of traits with pos_40_70_count >= 10
print(sum(1 for trait_stats in stats.values() if trait_stats["pos_40_70_count"] >= 10))

# Number of traits meeting both thresholds at once
print(sum(
    1
    for trait_stats in stats.values()
    if trait_stats["pos_70_count"] >= 10 and trait_stats["pos_40_70_count"] >= 10
))


240
45
45


In [8]:
# Compare pos and neg scores for every trait, pairing entries that share the
# same prompt index and question index.
pos_neg_stats = {}

for trait, scores_3d in scores_np.items():
    pos_scores, neg_scores = scores_3d[0], scores_3d[1]

    # Mean of (pos - neg) over all pairs, ignoring NaN differences
    pos_minus_neg_mean = np.nanmean(pos_scores - neg_scores)

    # Keep only the pairs where both values are present
    both_present = ~(np.isnan(pos_scores) | np.isnan(neg_scores))
    pos_valid = pos_scores[both_present]
    neg_valid = neg_scores[both_present]

    # Pairs with pos above 50 while neg is below 50
    separated = (pos_valid > 50) & (neg_valid < 50)
    # Pairs whose absolute gap exceeds 40
    far_apart = np.abs(pos_valid - neg_valid) > 40

    pos_neg_stats[trait] = {
        "pos_minus_neg_mean": pos_minus_neg_mean,
        "high_pos_low_neg_count": int(np.count_nonzero(separated)),
        "large_diff_count": int(np.count_nonzero(far_apart)),
    }

# Show example statistics for the first trait
example_trait = list(pos_neg_stats.keys())[0]
print(f"Example pos-neg statistics for '{example_trait}':")
for key, value in pos_neg_stats[example_trait].items():
    # Floats are shown with two decimals, counts as-is
    shown = f"{value:.2f}" if isinstance(value, float) else f"{value}"
    print(f"  {key}: {shown}")

print(f"\nCalculated pos-neg statistics for {len(pos_neg_stats)} traits")

# Summarize how the two counts are distributed across traits
high_pos_counts = [entry["high_pos_low_neg_count"] for entry in pos_neg_stats.values()]
large_diff_counts = [entry["large_diff_count"] for entry in pos_neg_stats.values()]
print(f"\nHigh pos, low neg count distribution: min={min(high_pos_counts)}, max={max(high_pos_counts)}, mean={np.mean(high_pos_counts):.1f}")
print(f"Large diff count distribution: min={min(large_diff_counts)}, max={max(large_diff_counts)}, mean={np.mean(large_diff_counts):.1f}")

# Export to CSV (one row per trait, indexed by trait name)
pos_neg_df = pd.DataFrame.from_dict(pos_neg_stats, orient="index").rename_axis("trait")
pos_neg_df.to_csv(f"./results/{model_short}/pos_neg.csv")
print(f"\nExported pos-neg statistics to pos_neg.csv")
print(f"Shape: {pos_neg_df.shape}")

Example pos-neg statistics for 'zealous':
  pos_minus_neg_mean: 81.24
  high_pos_low_neg_count: 1191
  large_diff_count: 1191

Calculated pos-neg statistics for 240 traits

High pos, low neg count distribution: min=147, max=1200, mean=1086.0
Large diff count distribution: min=152, max=1200, mean=1087.4

Exported pos-neg statistics to pos_neg.csv
Shape: (240, 3)


In [10]:
# Calculate pos - default statistics, mirroring the pos - neg comparison:
# entries are paired by identical prompt index and question index.
pos_default_stats = {}

for trait, scores_3d in scores_np.items():
    pos_scores = scores_3d[0]      # pos slice, shape (prompts, questions)
    default_scores = scores_3d[2]  # default slice, same shape

    # Mean difference between pos and default across all samples.
    # np.nanmean emits a RuntimeWarning ("Mean of empty slice") when every
    # difference is NaN, e.g. when no default scores exist, so that case is
    # handled explicitly; the result is NaN either way.
    pos_minus_default = pos_scores - default_scores
    if np.isnan(pos_minus_default).all():
        pos_minus_default_mean = np.nan
    else:
        pos_minus_default_mean = np.nanmean(pos_minus_default)

    # Count over ALL (prompt, question) pairs where both values are present.
    # The question range is taken from the array itself rather than a
    # hard-coded 20, so the counts cover the same samples as the mean above.
    both_present = ~(np.isnan(pos_scores) | np.isnan(default_scores))
    pos_valid = pos_scores[both_present]
    default_valid = default_scores[both_present]

    # High pos, low default cases
    high_pos_low_default_count = int(np.count_nonzero((pos_valid > 50) & (default_valid < 50)))

    # Large difference cases
    large_diff_count = int(np.count_nonzero(np.abs(pos_valid - default_valid) > 40))

    pos_default_stats[trait] = {
        "pos_minus_default_mean": pos_minus_default_mean,
        "high_pos_low_default_count": high_pos_low_default_count,
        "large_diff_count": large_diff_count
    }

# Show example statistics for first trait
example_trait = list(pos_default_stats.keys())[0]
print(f"Example pos-default statistics for '{example_trait}':")
for key, value in pos_default_stats[example_trait].items():
    if isinstance(value, float):
        print(f"  {key}: {value:.2f}")
    else:
        print(f"  {key}: {value}")

print(f"\nCalculated pos-default statistics for {len(pos_default_stats)} traits")

# Show summary of counts
high_pos_counts = [s["high_pos_low_default_count"] for s in pos_default_stats.values()]
large_diff_counts = [s["large_diff_count"] for s in pos_default_stats.values()]
print(f"\nHigh pos, low default count distribution: min={min(high_pos_counts)}, max={max(high_pos_counts)}, mean={np.mean(high_pos_counts):.1f}")
print(f"Large diff count distribution: min={min(large_diff_counts)}, max={max(large_diff_counts)}, mean={np.mean(large_diff_counts):.1f}")

# Export to CSV
pos_default_df = pd.DataFrame.from_dict(pos_default_stats, orient='index')
pos_default_df.index.name = 'trait'
pos_default_df.to_csv(f'./results/{model_short}/pos_default.csv')
print(f"\nExported pos-default statistics to pos_default.csv")
print(f"Shape: {pos_default_df.shape}")

Example pos-default statistics for 'zealous':
  pos_minus_default_mean: nan
  high_pos_low_default_count: 0
  large_diff_count: 0

Calculated pos-default statistics for 240 traits

High pos, low default count distribution: min=0, max=0, mean=0.0
Large diff count distribution: min=0, max=0, mean=0.0

Exported pos-default statistics to pos_default.csv
Shape: (240, 3)


  pos_minus_default_mean = np.nanmean(pos_scores - default_scores)
