In [3]:
import os
import random
import numpy as np
import torch
import pickle
import json

# A CUDA-capable GPU is required: later cells place the model and tensors on this device.
device = 'cuda'

# One fixed seed shared by every random number generator used in this notebook,
# so that repeated runs produce the same results.
seed_val = 10

# Export the hash seed to the process environment.
# NOTE(review): the running interpreter presumably reads PYTHONHASHSEED only at
# startup, so this likely affects child processes rather than this kernel — confirm.
os.environ['PYTHONHASHSEED'] = f'{seed_val}'

# Seed NumPy, Python's `random` module and PyTorch with the same value.
np.random.seed(seed_val)
random.seed(seed_val)
torch.manual_seed(seed_val)

<torch._C.Generator at 0x7c552410cfb0>

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Hugging Face checkpoint shared by the model and its tokenizer.
MODEL_ID = "google/gemma-2-2b"

# Load the causal language model in half precision. `device_map="cuda"` already
# places the weights on the GPU at load time, so no extra `.to('cuda')` is needed.
# NOTE(review): Gemma-2 checkpoints are commonly run in bfloat16; confirm that
# float16 does not produce overflowing / NaN losses for this workload.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype=torch.float16,
)

# Tokenizer matching the checkpoint above; later cells read it as a module-level name.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

NameError: name 'torch' is not defined

In [None]:
# Precomputed similarity results, later indexed as
# similarities_dict[<sample id>]['semantic_set_ids'].
# SECURITY: unpickling executes arbitrary code embedded in the file — only load
# pickles that were produced by this project.
with open('data/activations/similarity_scores_20.pkl', 'rb') as infile:
    similarities_dict = pickle.load(infile)

# Generations keyed by sample id. The file is decoded as UTF-8 explicitly so the
# result does not depend on the platform's default text encoding.
with open('data/activations/generations_and_judgments_with_sae_20.json', encoding='utf-8') as infile:
    sequences = json.load(infile)

In [None]:
def _score_with_prompt_mask(model, token_ids, prompt_length):
    """Run `model` on one 1-D token sequence, excluding the first `prompt_length` tokens from the loss.

    The model output is returned unchanged; callers read its 'loss' and
    'hidden_states' entries.
    """
    labels = token_ids.clone()
    # -100 marks label positions the loss should skip, following
    # https://huggingface.co/docs/transformers/perplexity
    labels[:prompt_length] = -100
    return model(
        torch.reshape(token_ids, (1, -1)),
        labels=torch.reshape(labels, (1, -1)),
        output_hidden_states=True,
    )


def _strip_padding(token_ids, pad_token_id):
    """Return `token_ids` without padding tokens (unchanged if there is no pad id)."""
    if pad_token_id is None:
        return token_ids
    return token_ids[token_ids != pad_token_id]


def compute_neg_log_likelihoods(model, responses):
    """Compute negative log-likelihoods and related metrics for every sample in `responses`.

    Relies on the module-level names `tokenizer`, `device` and `similarities_dict`.

    Args:
        model: callable invoked as ``model(input_ids, labels=..., output_hidden_states=True)``
            whose output exposes ``'loss'`` and ``'hidden_states'``.
        responses: mapping of sample id -> sample dict. Each sample is read through the keys
            ``'prompt'`` (text), ``'generations'`` (list of texts),
            ``'most_likely_generation_ids'`` and ``'second_most_likely_generation_ids'``
            (1-D token ids, either a tensor or a plain list).
            NOTE(review): the prompt masking assumes every generation (text and id
            sequences alike) starts with the prompt tokens — confirm against the data.

    Returns:
        A list with one dict per sample holding the tokenized prompt and generations,
        per-generation (average / total, conditioned / unconditioned) negative
        log-likelihoods, pointwise mutual information, metrics of the most likely and
        second most likely generation, the semantic set ids and the sample id.
    """
    pad_token_id = tokenizer.pad_token_id
    result = []

    with torch.no_grad():
        for id_, sample in responses.items():
            # Tokenize the prompt once per sample (not once per generation).
            prompt = tokenizer(sample['prompt'], return_tensors='pt')['input_ids'][0].to(device)
            prompt = _strip_padding(prompt, pad_token_id)
            prompt_length = len(prompt)

            # Pad to a rectangular (num_generations, max_len) tensor; the padding is
            # removed again per row before scoring.
            generations = tokenizer(
                sample['generations'], padding=True, return_tensors='pt'
            )['input_ids'].to(device)
            num_generations = generations.shape[0]

            # Per-generation metric buffers.
            average_neg_log_likelihoods = torch.zeros((num_generations,))
            average_unconditioned_neg_log_likelihoods = torch.zeros((num_generations,))
            neg_log_likelihoods = torch.zeros((num_generations,))
            neg_unconditioned_log_likelihoods = torch.zeros((num_generations,))
            pointwise_mutual_information = torch.zeros((num_generations,))
            sequence_embeddings = []

            for generation_index in range(num_generations):
                generation = _strip_padding(generations[generation_index], pad_token_id)
                # Number of tokens that contribute to the (averaged) loss.
                num_scored_tokens = len(generation) - prompt_length

                # Conditioned on the prompt: prompt tokens are context only.
                model_output = _score_with_prompt_mask(model, generation, prompt_length)

                # "Unconditioned": drop the prompt except for its final token, which
                # serves as the context the first generated token is predicted from.
                generation_only = generation[max(prompt_length - 1, 0):]
                unconditioned_model_output = _score_with_prompt_mask(model, generation_only, 0)

                average_neg_log_likelihood = model_output['loss']
                average_unconditioned_neg_log_likelihood = unconditioned_model_output['loss']

                average_neg_log_likelihoods[generation_index] = average_neg_log_likelihood
                average_unconditioned_neg_log_likelihoods[generation_index] = (
                    average_unconditioned_neg_log_likelihood
                )
                # The loss is a per-token mean; scale it back up to a sequence total.
                neg_log_likelihoods[generation_index] = average_neg_log_likelihood * num_scored_tokens
                neg_unconditioned_log_likelihoods[generation_index] = (
                    average_unconditioned_neg_log_likelihood * num_scored_tokens
                )
                pointwise_mutual_information[generation_index] = (
                    -neg_log_likelihoods[generation_index]
                    + neg_unconditioned_log_likelihoods[generation_index]
                )

                # Mean over the token axis of the last layer's hidden states.
                sequence_embeddings.append(torch.mean(model_output['hidden_states'][-1], dim=1))

            # Most likely generation. `as_tensor` accepts both tensors and the plain
            # lists obtained when the samples come from JSON.
            most_likely_generation = torch.as_tensor(
                sample['most_likely_generation_ids'], device=device
            )
            model_output = _score_with_prompt_mask(model, most_likely_generation, prompt_length)
            average_neg_log_likelihood_of_most_likely_gen = model_output['loss']
            most_likely_generation_embedding = torch.mean(model_output['hidden_states'][-1], dim=1)

            # Second most likely generation.
            second_most_likely_generation = torch.as_tensor(
                sample['second_most_likely_generation_ids'], device=device
            )
            model_output = _score_with_prompt_mask(model, second_most_likely_generation, prompt_length)
            average_neg_log_likelihood_of_second_most_likely_gen = model_output['loss']

            neg_log_likelihood_of_most_likely_gen = average_neg_log_likelihood_of_most_likely_gen * (
                len(most_likely_generation) - prompt_length
            )

            # NOTE(review): the stacked per-generation embeddings are computed but not
            # returned; see the note on the 'sequence_embeddings' key below.
            sequence_embeddings = torch.stack(sequence_embeddings)

            # Look the sample up by its full id; fall back to the first element of the
            # id for stores keyed that way (the previous lookup always used `id_[0]`,
            # which for a string id is only its first character).
            similarity_key = id_ if id_ in similarities_dict else id_[0]

            result.append({
                'prompt': prompt,
                'generations': generations,
                'average_neg_log_likelihoods': average_neg_log_likelihoods,
                'neg_log_likelihoods': neg_log_likelihoods,
                # NOTE(review): these two keys keep their existing contents for
                # downstream compatibility, but the names look mismatched:
                # 'sequence_embeddings' holds the most likely generation's embedding and
                # 'most_likely_sequence_embedding' holds its token ids — confirm intent.
                'sequence_embeddings': most_likely_generation_embedding,
                'most_likely_sequence_embedding': most_likely_generation,
                'average_unconditioned_neg_log_likelihoods': average_unconditioned_neg_log_likelihoods,
                'neg_unconditioned_log_likelihoods': neg_unconditioned_log_likelihoods,
                'pointwise_mutual_information': pointwise_mutual_information,
                'average_neg_log_likelihood_of_most_likely_gen': average_neg_log_likelihood_of_most_likely_gen,
                'average_neg_log_likelihood_of_second_most_likely_gen': average_neg_log_likelihood_of_second_most_likely_gen,
                'neg_log_likelihood_of_most_likely_gen': neg_log_likelihood_of_most_likely_gen,
                'semantic_set_ids': torch.tensor(
                    similarities_dict[similarity_key]['semantic_set_ids'], device=device
                ),
                'id': id_,
            })

    return result

In [None]:
# Score every loaded sample with the model.
likelihoods = compute_neg_log_likelihoods(model, sequences)

# Persist the per-sample metrics next to the other activation artifacts.
# NOTE(review): 'semantic_set_ids' is created on `device`, so loading this pickle
# presumably needs that device to be available — confirm for downstream consumers.
with open('data/activations/likelihoods.pkl', 'wb') as pickle_file:
    pickle.dump(likelihoods, pickle_file)