In [16]:
import json
from collections import Counter

def compare_scores_multiple(paths1, paths2, remove_indices=(6, 19, 22, 40, 47, 26)):
    """Compare the ``score`` fields of paired JSON result files.

    Each file is loaded with ``json.load`` and must contain a sequence of
    objects that each have a ``'score'`` key. For every pair the scores are
    compared position by position (up to the length of the shorter file), a
    per-pair accuracy is printed, and the accuracy pooled over all pairs is
    printed and returned.

    Args:
        paths1: Paths of the first set of JSON files.
        paths2: Paths of the second set; must have the same length as
            ``paths1``. Files are paired by position.
        remove_indices: Positions left out of the accuracy computation
            (neither counted as matches nor in the denominator). Duplicates
            and positions beyond the end of a file are ignored. ``None`` or
            an empty collection excludes nothing. Mismatches at these
            positions are still listed in the printed "Incorrect indices".

    Returns:
        Total matches divided by total scored positions across all pairs
        that could be loaded, or 0 if no position was scored.

    Raises:
        ValueError: If ``paths1`` and ``paths2`` differ in length.
    """
    if len(paths1) != len(paths2):
        raise ValueError("The number of files in both lists should be the same")

    # A set gives O(1) membership tests and collapses duplicate indices, which
    # would otherwise be subtracted from the denominator more than once.
    excluded = set(remove_indices) if remove_indices else set()

    total_matches = 0
    total_scores = 0
    all_incorrect_indices = []

    for path1, path2 in zip(paths1, paths2):
        print(f"Comparing {path1} and {path2}")

        try:
            with open(path1, 'r') as f1, open(path2, 'r') as f2:
                data1 = json.load(f1)
                data2 = json.load(f2)
        except (OSError, ValueError) as exc:
            # json.JSONDecodeError and UnicodeDecodeError are ValueError
            # subclasses. Skipping the pair is deliberate best-effort, but the
            # reason is reported instead of being swallowed.
            print(f"Failed to open or parse files: {exc}")
            continue

        scores1 = [d['score'] for d in data1]
        scores2 = [d['score'] for d in data2]
        min_length = min(len(scores1), len(scores2))

        matches = 0
        incorrect_indices = []
        # zip() stops at the shorter list, so only the first min_length
        # positions are compared.
        for i, (s1, s2) in enumerate(zip(scores1, scores2)):
            if s1 != s2:
                incorrect_indices.append(i)
            elif i not in excluded:
                matches += 1

        # Only excluded positions that actually exist in this pair reduce the
        # denominator; otherwise short files yield accuracies above 100% or
        # below 0%.
        scored = sum(1 for i in range(min_length) if i not in excluded)
        accuracy = matches / scored if scored > 0 else 0.0
        print(f"Accuracy: {accuracy:.2%}")

        if incorrect_indices:
            print(f"Incorrect indices: {incorrect_indices}")
        else:
            print("All indices matched correctly")

        all_incorrect_indices.extend(incorrect_indices)

        total_matches += matches
        total_scores += scored

    joint_accuracy = total_matches / total_scores if total_scores > 0 else 0

    # An index is reported as "consistently incorrect" when it mismatched in
    # all pairs or in all but at most two of them (tolerance kept from the
    # original threshold). An index appears at most once per pair, so its
    # count can never exceed len(paths1).
    threshold = len(paths1) - 2
    index_counts = Counter(all_incorrect_indices)
    consistently_incorrect = [
        index for index, count in index_counts.items() if count >= threshold
    ]

    print("\nOverall Results:")
    print(f"Joint Accuracy: {joint_accuracy:.2%}")
    if consistently_incorrect:
        print(f"Indices incorrect in all file pairs: {consistently_incorrect}")
    else:
        print("No indices were consistently incorrect across all file pairs")

    return joint_accuracy

In [19]:
# Multiplier values substituted into the result file names. Defined once so
# the manual and automatic path lists are always built from the same values
# in the same order (compare_scores_multiple pairs files by position).
MULTIPLIERS = [-2.0, -1.0, 0.0, 1.0, 2.0, 3.0]

# Directories holding the two sets of result files being compared.
MANUAL_DIR = '1COMPLETED_RUNS/10llama-3.1-8b-manually-scored/results/open_ended_scores/context-focus'
AUTO_DIR = '1COMPLETED_RUNS/20llama-3.1-8b-evaluation-correlations/results/context-focus/5_times/context-focus'


def _results_filename(m):
    """Return the result file name for multiplier ``m`` (same in both directories)."""
    return f'results_layer=12_multiplier={m}_behavior=context-focus_type=open_ended_use_base_model=False_model_size=7b.json'


def manual_path(m):
    """Return the path of the result file under MANUAL_DIR for multiplier ``m``."""
    return f'{MANUAL_DIR}/{_results_filename(m)}'


def auto_path(m):
    """Return the path of the result file under AUTO_DIR for multiplier ``m``."""
    return f'{AUTO_DIR}/{_results_filename(m)}'


manual_paths = [manual_path(m) for m in MULTIPLIERS]
auto_paths = [auto_path(m) for m in MULTIPLIERS]

joint_accuracy = compare_scores_multiple(manual_paths, auto_paths)
print(f"Joint Accuracy: {joint_accuracy:.2%}")

Comparing 1COMPLETED_RUNS/10llama-3.1-8b-manually-scored/results/open_ended_scores/context-focus/results_layer=12_multiplier=-2.0_behavior=context-focus_type=open_ended_use_base_model=False_model_size=7b.json and 1COMPLETED_RUNS/20llama-3.1-8b-evaluation-correlations/results/context-focus/5_times/context-focus/results_layer=12_multiplier=-2.0_behavior=context-focus_type=open_ended_use_base_model=False_model_size=7b.json
Accuracy: 97.73%
Incorrect indices: [6, 19, 22, 44, 47]
Comparing 1COMPLETED_RUNS/10llama-3.1-8b-manually-scored/results/open_ended_scores/context-focus/results_layer=12_multiplier=-1.0_behavior=context-focus_type=open_ended_use_base_model=False_model_size=7b.json and 1COMPLETED_RUNS/20llama-3.1-8b-evaluation-correlations/results/context-focus/5_times/context-focus/results_layer=12_multiplier=-1.0_behavior=context-focus_type=open_ended_use_base_model=False_model_size=7b.json
Accuracy: 95.45%
Incorrect indices: [6, 10, 19, 22, 26, 40, 41]
Comparing 1COMPLETED_RUNS/10llam