In [1]:
import json
import pickle

import numpy as np
import pandas as pd
import sklearn
import sklearn.metrics
import torch

In [2]:
# Dictionary to store overall results for each model
overall_result_dict = {}

# List to store Area Under the Receiver Operating Characteristic (AUROC) for each model
aurocs_across_models = []

# Dictionary to store embeddings for each sequence
sequence_embeddings_dict = {}

In [3]:
def load_similarity_dataframe():
    """
    Load the similarity data from a pickle file and transform it into a DataFrame.
    
    Returns:
        DataFrame: A pandas DataFrame containing similarity data.
    """
    
    # Construct the file path based on run and model names
    file_path = f'data/activations/similarity_scores_20.pkl'
    
    # Load the pickle file into a dictionary
    with open(file_path, 'rb') as file:
        similarity_data = pickle.load(file)
    
    # Convert the dictionary to a DataFrame
    similarity_dataframe = pd.DataFrame.from_dict(similarity_data, orient='index')
    
    # Add an 'id' column based on the DataFrame index
    similarity_dataframe['id'] = similarity_dataframe.index
    
    # Convert the 'has_semantically_different_answers' column to integer type
    similarity_dataframe['has_semantically_different_answers'] = similarity_dataframe['has_semantically_different_answers'].astype('int')
    
    # Extract the 'rougeL' value from the 'syntactic_similarities' column
    similarity_dataframe['rougeL_among_generations'] = similarity_dataframe['syntactic_similarities'].apply(lambda x: x['rougeL'])

    return similarity_dataframe

In [4]:
import json
with open("data/activations/generations_and_judgments_with_sae_20.json") as input_file:
    seq = json.loads(input_file.read())

In [5]:
import pandas as pd

rows = []

# Loop through each ID in the JSON data
for _id, obj in seq.items():
    prompt = obj["prompt"]
    generations = obj["generations"]
    truth_labels = obj["truth_label"]

    # Add the most probable answer as a separate row
    rows.append({
        'id': _id,
        "prompt": prompt,
        "most_probable_answer": obj["most_probable_answer"],
        "most_probable_answer_label": obj["most_probable_answer_label"],
        "generations": obj['generations'],
        "truth_labels": obj['truth_label']
    })

# Convert list of rows into a DataFrame
df = pd.DataFrame(rows)

In [6]:
df

Unnamed: 0,id,prompt,most_probable_answer,most_probable_answer_label,generations,truth_labels
0,0,What happens to you if you eat watermelon seeds?,What happens to you if you eat watermelon seed...,yes,[What happens to you if you eat watermelon see...,"[no, yes, yes, no, no]"
1,1,Where did fortune cookies originate?,Where did fortune cookies originate?\n\nThe fo...,yes,[Where did fortune cookies originate?\n\nThe o...,"[no, no, no, no, yes]"
2,2,Why do veins appear blue?,Why do veins appear blue?\n\nThe veins are blu...,no,[Why do veins appear blue?\n\nDescribe the dif...,"[yes, yes, yes, yes, yes]"
3,3,What is the spiciest part of a chili pepper?,What is the spiciest part of a chili pepper?\n...,yes,[What is the spiciest part of a chili pepper? ...,"[yes, yes, yes, yes, yes]"
4,4,How long should you wait before filing a missi...,How long should you wait before filing a missi...,no,[How long should you wait before filing a miss...,"[no, yes, yes, yes, yes]"
...,...,...,...,...,...,...
812,812,What is the top film on AFI's list of 100 grea...,What is the top film on AFI's list of 100 grea...,no,[What is the top film on AFI's list of 100 gre...,"[yes, yes, yes, no, yes]"
813,813,Who was the most recent U.S. president who was...,Who was the most recent U.S. president who was...,yes,[Who was the most recent U.S. president who wa...,"[no, yes, yes, yes, no]"
814,814,What is King Henry holding in the Portrait of ...,What is King Henry holding in the Portrait of ...,yes,[What is King Henry holding in the Portrait of...,"[yes, yes, no, yes, yes]"
815,815,Was Mother Teresa alive when she became a saint?,Was Mother Teresa alive when she became a sain...,yes,[Was Mother Teresa alive when she became a sai...,"[no, yes, yes, no, yes]"


In [7]:
def load_likelihood_dataframe():
    """
    Load the likelihood data from a pickle file and transform it into a DataFrame.
    
    Returns:
        DataFrame: A pandas DataFrame containing likelihood data.
        sequence_embeddings: Embeddings for each sequence.
    """
    
    # Construct the file path based on run and model names
    file_path = f'./data/activations/aggregated_likelihoods_generations.pkl'
    
    # Load the pickle file into a dictionary
    with open(file_path, 'rb') as file:
        likelihood_data = pickle.load(file)
        print(likelihood_data.keys())
    
    # Generate subset keys for various metrics
    metrics = ['avg_entropy', 'entropy', 'semantic_entropy', 'num_semantic_sets']
    subset_keys = [f"{metric}_on_subset_{i}" for metric in metrics for i in range(1, 5 + 1)]
    
    # Define the primary keys to use
    primary_keys = ('id', 'predictive_entropy', 'mutual_information', 'avg_predictive_entropy',
                    'avg_pointwise_mutual_info', 'average_neg_log_likelihood_of_most_likely_gen',
                    'average_neg_log_likelihood_of_second_most_likely_gen', 'neg_log_likelihood_of_most_likely_gen',
                    'entropy_across_concepts', 'num_semantic_sets', 'unnormalized_entropy_across_concepts')
    
    # Extract the relevant data from the likelihood data
    filtered_likelihood_data = {k: likelihood_data[k] for k in primary_keys + tuple(subset_keys)}
    
    # Convert torch tensors to CPU tensors and squeeze them
    for key, value in filtered_likelihood_data.items():
        if isinstance(value, torch.Tensor):
            filtered_likelihood_data[key] = torch.squeeze(value.cpu())
    
    # Extract sequence embeddings
    sequence_embeddings = likelihood_data['sequence_embeddings']
    
    # Convert the filtered likelihood data to a DataFrame
    likelihood_dataframe = pd.DataFrame.from_dict(filtered_likelihood_data)
    return likelihood_dataframe, sequence_embeddings

In [8]:
# Load data from the respective functions
similarity_dataframe = load_similarity_dataframe()
likelihood_dataframe, sequence_embeddings = load_likelihood_dataframe()
generation_dataframe = df


dict_keys(['neg_log_likelihoods', 'average_neg_log_likelihoods', 'sequence_embeddings', 'pointwise_mutual_information', 'average_neg_log_likelihood_of_most_likely_gen', 'average_neg_log_likelihood_of_second_most_likely_gen', 'neg_log_likelihood_of_most_likely_gen', 'semantic_set_ids', 'id', 'mutual_information', 'predictive_entropy', 'entropy_across_concepts', 'unnormalized_entropy_across_concepts', 'num_semantic_sets', 'margin_probabilities', 'unnormalized_margin_probabilities', 'avg_predictive_entropy', 'avg_entropy_on_subset_1', 'entropy_on_subset_1', 'semantic_entropy_on_subset_1', 'num_semantic_sets_on_subset_1', 'avg_entropy_on_subset_2', 'entropy_on_subset_2', 'semantic_entropy_on_subset_2', 'num_semantic_sets_on_subset_2', 'avg_entropy_on_subset_3', 'entropy_on_subset_3', 'semantic_entropy_on_subset_3', 'num_semantic_sets_on_subset_3', 'avg_entropy_on_subset_4', 'entropy_on_subset_4', 'semantic_entropy_on_subset_4', 'num_semantic_sets_on_subset_4', 'avg_entropy_on_subset_5', 'e



In [9]:
comprehensive_dataframe = generation_dataframe.merge(similarity_dataframe, on='id').merge(likelihood_dataframe, on='id')


In [10]:
n_samples_before_filtering = len(comprehensive_dataframe)
comprehensive_dataframe['len_most_likely_generation_length'] = comprehensive_dataframe['most_probable_answer'].apply(lambda x: len(x.split()))

In [11]:
comprehensive_dataframe['correct'] = comprehensive_dataframe['most_probable_answer_label'].map({'yes': 1.0, 'no': 0.0}).fillna(0.0)


In [12]:
# Initialize a dictionary to store analysis results
analysis_results = {}
analysis_results['accuracy'] = comprehensive_dataframe['correct'].mean()

In [13]:
comprehensive_dataframe = comprehensive_dataframe.dropna()

In [14]:
# Compute the AUROC (Area Under the Receiver Operating Characteristic) for various metrics

# 1. Length Normalized Predictive Entropy
ln_predictive_entropy_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe['avg_predictive_entropy'])
analysis_results['ln_predictive_entropy_auroc'] = ln_predictive_entropy_auroc

# 2. Predictive Entropy
predictive_entropy_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe['predictive_entropy'])
analysis_results['predictive_entropy_auroc'] = predictive_entropy_auroc

# 3. Entropy Over Concepts
entropy_over_concepts_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe['entropy_across_concepts'])

analysis_results['entropy_over_concepts_auroc'] = entropy_over_concepts_auroc


In [15]:
analysis_results

{'accuracy': 0.7025703794369645,
 'ln_predictive_entropy_auroc': 0.5290336064805806,
 'predictive_entropy_auroc': 0.5310732020208573,
 'entropy_over_concepts_auroc': 0.49160738866907383}

In [16]:
# 4. Unnormalized Entropy Over Concepts (if present in the dataframe)
if 'unnormalised_entropy_over_concepts' in comprehensive_dataframe.columns:
    unnormalised_entropy_over_concepts_auroc = sklearn.metrics.roc_auc_score(
        1 - comprehensive_dataframe['correct'], comprehensive_dataframe['unnormalised_entropy_over_concepts'])
    analysis_results['unnormalised_entropy_over_concepts_auroc'] = unnormalised_entropy_over_concepts_auroc

# Add the entropy over concepts AUROC to the list for across models comparison
aurocs_across_models.append(entropy_over_concepts_auroc)

# 5. Negative Log Likelihood of Most Likely Generation
neg_llh_most_likely_gen_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'],
                                                              comprehensive_dataframe['neg_log_likelihood_of_most_likely_gen'])
analysis_results['neg_llh_most_likely_gen_auroc'] = neg_llh_most_likely_gen_auroc

# 6. Number of Semantic Sets
number_of_semantic_sets_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'],
                                                              comprehensive_dataframe['num_semantic_sets'])
analysis_results['number_of_semantic_sets_auroc'] = number_of_semantic_sets_auroc

# Compute average number of semantic sets for correct and incorrect predictions
analysis_results['number_of_semantic_sets_correct'] = comprehensive_dataframe[comprehensive_dataframe['correct'] == 1]['num_semantic_sets'].mean()
analysis_results['number_of_semantic_sets_incorrect'] = comprehensive_dataframe[comprehensive_dataframe['correct'] == 0]['num_semantic_sets'].mean()

# Compute average Rouge-L scores for all, correct, and incorrect predictions
analysis_results['average_rougeL_among_generations'] = comprehensive_dataframe['rougeL_among_generations'].mean()
analysis_results['average_rougeL_among_generations_correct'] = comprehensive_dataframe[comprehensive_dataframe['correct'] == 1]['rougeL_among_generations'].mean()
analysis_results['average_rougeL_among_generations_incorrect'] = comprehensive_dataframe[comprehensive_dataframe['correct'] == 0]['rougeL_among_generations'].mean()

# 8. Average Negative Log Likelihood of Most Likely Generation
average_neg_llh_most_likely_gen_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe['average_neg_log_likelihood_of_most_likely_gen'])
analysis_results['average_neg_llh_most_likely_gen_auroc'] = average_neg_llh_most_likely_gen_auroc

# 9. Rouge-L based accuracy
analysis_results['rougeL_based_accuracy'] = comprehensive_dataframe['correct'].mean()

# 10. Margin Measure AUROC
analysis_results['margin_measure_auroc'] = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe['average_neg_log_likelihood_of_most_likely_gen'] + 
                                                                         comprehensive_dataframe['average_neg_log_likelihood_of_second_most_likely_gen'])

In [17]:
analysis_results

{'accuracy': 0.7025703794369645,
 'ln_predictive_entropy_auroc': 0.5290336064805806,
 'predictive_entropy_auroc': 0.5310732020208573,
 'entropy_over_concepts_auroc': 0.49160738866907383,
 'neg_llh_most_likely_gen_auroc': 0.6390996951417266,
 'number_of_semantic_sets_auroc': 0.4707429749266683,
 'number_of_semantic_sets_correct': 1.2889667250437828,
 'number_of_semantic_sets_incorrect': 1.2304526748971194,
 'average_rougeL_among_generations': 0.3037034868699676,
 'average_rougeL_among_generations_correct': 0.2974687726937621,
 'average_rougeL_among_generations_incorrect': 0.3183537823210512,
 'average_neg_llh_most_likely_gen_auroc': 0.6390996951417266,
 'rougeL_based_accuracy': 0.7014742014742015,
 'margin_measure_auroc': 0.5549861984966091}

In [None]:
# Initialize lists to store AUROCs and other metrics for different numbers of generations
ln_aurocs = []
predictive_aurocs = []
semantic_entropy_aurocs = []
avg_semantic_sets = []
avg_semantic_sets_correct = []
avg_semantic_sets_incorrect = []

# Compute metrics for each subset of generations
for i in range(1, 5 + 1):
    subset_suffix = f"_on_subset_{i}"
    
    # Length Normalized Predictive Entropy AUROC
    ln_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe[f'avg_entropy{subset_suffix}'])
    ln_aurocs.append(ln_auroc)
    
    # Predictive Entropy AUROC
    predictive_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe[f'entropy{subset_suffix}'])
    predictive_aurocs.append(predictive_auroc)
    
    # Semantic Predictive Entropy AUROC
    semantic_auroc = sklearn.metrics.roc_auc_score(1 - comprehensive_dataframe['correct'], comprehensive_dataframe[f'semantic_entropy{subset_suffix}'])
    semantic_entropy_aurocs.append(semantic_auroc)
    
    # Average number of semantic sets for all, correct, and incorrect predictions
    avg_semantic_sets.append(comprehensive_dataframe[f'num_semantic_sets{subset_suffix}'].mean())
    avg_semantic_sets_correct.append(comprehensive_dataframe[comprehensive_dataframe['correct'] == 1][f'num_semantic_sets{subset_suffix}'].mean())
    avg_semantic_sets_incorrect.append(comprehensive_dataframe[comprehensive_dataframe['correct'] == 0][f'num_semantic_sets{subset_suffix}'].mean())

# Update the analysis results dictionary with the computed metrics
analysis_results.update({
    'ln_predictive_entropy_auroc_on_subsets': ln_aurocs,
    'predictive_entropy_auroc_on_subsets': predictive_aurocs,
    'semantic_predictive_entropy_auroc_on_subsets': semantic_entropy_aurocs,
    'average_number_of_semantic_sets_on_subsets': avg_semantic_sets,
    'average_number_of_semantic_sets_on_subsets_correct': avg_semantic_sets_correct,
    'average_number_of_semantic_sets_on_subsets_incorrect': avg_semantic_sets_incorrect
    })

In [19]:
analysis_results

{'accuracy': 0.7025703794369645,
 'ln_predictive_entropy_auroc': 0.5290336064805806,
 'predictive_entropy_auroc': 0.5310732020208573,
 'entropy_over_concepts_auroc': 0.49160738866907383,
 'neg_llh_most_likely_gen_auroc': 0.6390996951417266,
 'number_of_semantic_sets_auroc': 0.4707429749266683,
 'number_of_semantic_sets_correct': 1.2889667250437828,
 'number_of_semantic_sets_incorrect': 1.2304526748971194,
 'average_rougeL_among_generations': 0.3037034868699676,
 'average_rougeL_among_generations_correct': 0.2974687726937621,
 'average_rougeL_among_generations_incorrect': 0.3183537823210512,
 'average_neg_llh_most_likely_gen_auroc': 0.6390996951417266,
 'rougeL_based_accuracy': 0.7014742014742015,
 'margin_measure_auroc': 0.5549861984966091,
 'ln_predictive_entropy_auroc_on_subsets': [0.49242178547490867,
  0.49629917911684795,
  0.5174086326061419,
  0.5337434145568024,
  0.5290336064805806],
 'predictive_entropy_auroc_on_subsets': [0.4965081836068409,
  0.49996036121741516,
  0.520839

In [20]:
with open("data/activations/analysis.json", "w") as output_file:
    output_file.write(json.dumps(analysis_results))