In [1]:

# Hugging Face model identifier used throughout the notebook; keep exactly one line active.
# The model-loading cell dispatches on substrings of this name ('gpt-j', 'gpt2-xl', 'llama', 'vicuna').
#model_name= 'meta-llama/Meta-Llama-3.1-8B'
model_name = 'openai-community/gpt2-xl'
#model_name = 'EleutherAI/gpt-j-6b'

In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from transformers import GPTJForCausalLM, GPT2Tokenizer
from transformers import GPTNeoForCausalLM, GPT2Tokenizer
from transformers import set_seed
# from transformers import GPT2Tokenizer, OPTForCausalLM
import json
import random
import os
import numpy as np

from torch.utils.data import DataLoader
from transformers import AutoTokenizer


import pandas as pd  
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import entropy

# Fix the random seed (via transformers.set_seed) before any shuffling or sampling happens.
set_seed(1)
## TODOs ###
# only successful edits ?
# number of demos
# longer prompts
# instruction-tuned models


# Expose only the GPU with index 1 to this process.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

# Debugging aid: presumably set so CUDA errors surface at the offending call — NOTE(review): slows execution, confirm it is still wanted.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"


# Fall back to the CPU when no CUDA device is available.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The indices of nearest neighbours are stored in corpus_idx.txt:
# one line per query, each holding whitespace-separated integer indices.
with open('../corpus_idx.txt', 'r') as fIn:
    # splitlines() removes line terminators only, so a final line that lacks a
    # trailing newline keeps its last digit (slicing with [:-1] would drop it).
    lines = fIn.read().splitlines()
corpus_idx = [[int(idx) for idx in line.split()] for line in lines]


def get_probs(model, tokenizer, inputs, device, bs = 32):
    """Return the top-100 next-token probabilities and the argmax token for each prompt.

    Args:
        model: causal LM; called with ``input_ids`` and ``attention_mask`` and
            expected to return an object exposing ``.logits``.
        tokenizer: tokenizer configured for right padding.
        inputs: sequence of prompt strings.
        device: device the tokenized batch is moved to.
        bs: batch size.

    Returns:
        ``(results, answers)`` where ``results`` is an array of shape
        ``(len(inputs), 100)`` with the 100 largest next-token probabilities in
        descending order, and ``answers`` is an array with the most likely
        next-token id per prompt.

    Raises:
        ValueError: if the tokenizer does not pad on the right. The position of
            the last real token is derived from the attention-mask sum, which
            is only valid for right padding.
    """
    if tokenizer.padding_side != 'right':
        # Fail before running any forward pass instead of after the first batch.
        raise ValueError(
            f"get_probs only supports padding_side='right', got {tokenizer.padding_side!r}"
        )

    data_loader = DataLoader(inputs, batch_size = bs, shuffle= False)
    model.eval()
    results = []
    answers = []
    for b in data_loader:
        batch = tokenizer(b, return_tensors='pt', padding=True, truncation=True)
        batch = batch.to(device)
        with torch.no_grad():
            # Hidden states are not needed here, so they are not requested.
            logits = model(input_ids = batch['input_ids'], attention_mask = batch['attention_mask']).logits

        # Index of the last non-padding token of every row.
        last_index = batch['attention_mask'].sum(dim=1) - 1
        rows = torch.arange(logits.size(0), device=logits.device)
        last_logits = logits[rows, last_index]  # batch_size x vocab_size

        probs = torch.nn.functional.softmax(last_logits, dim=1)
        answers += torch.argmax(last_logits, dim=1).detach().cpu().numpy().tolist()

        reprs_b = torch.topk(probs, 100).values
        results.append(reprs_b.detach().cpu().numpy())
    results = np.concatenate(results, axis=0)
    return results, np.array(answers)


def construct_icl_examples(idx, demos, clean=False):
    """Build the in-context demonstrations for query number ``idx``.

    The neighbour ids stored in ``corpus_idx[idx]`` select records from
    ``demos`` (ids are shifted by 2000 before indexing). Every demonstration
    has the form ``'New Fact: ...\\nPrompt: ...\\n\\n'`` and is one of three kinds:

    * 0 -- the prompt repeats the new fact itself,
    * 1 -- the prompt is a randomly chosen paraphrase,
    * 2 -- the prompt is a randomly chosen neighbourhood prompt.

    With ``clean=False`` kinds 0 and 1 are completed with ``target_new`` and
    kind 2 with ``target_true``. With ``clean=True`` every prompt is completed
    with ``target_true`` (the demonstrations teach the model to ignore the
    "New Fact" line).

    Returns the list of demonstration strings in reversed neighbour order.
    """
    # One kind per demonstration slot (32 slots), shuffled on every call.
    order = [2, 1, 2, 0, 1, 2, 2, 0, 2, 2, 1, 0, 2, 1, 2, 0, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2]
    random.shuffle(order)

    neighbour_ids = corpus_idx[idx][:len(order)]
    examples = []
    for demo_id, kind in zip(neighbour_ids, order):
        record = demos[demo_id - 2000]
        rewrite = record['requested_rewrite']
        new_fact = rewrite['prompt'].format(rewrite['subject'])
        target_new = rewrite['target_new']['str']
        target_true = rewrite['target_true']['str']

        # Choose the text of the "Prompt:" line.
        if kind == 0:
            # e.g. New Fact: The mother tongue of Robert Lecourt is English\nPrompt: The mother tongue of Robert Lecourt is English
            query = new_fact
        elif kind == 1:
            # e.g. New Fact: The mother tongue of Colette Darfeuil is Russian\nPrompt: Colette Darfeuil spoke the language Russian
            query = random.choice(record['paraphrase_prompts'])
        elif kind == 2:
            # e.g. New Fact: The mother tongue of Marc-Philippe Daubresse is Russian\nPrompt: The mother tongue of Melchior de Vogüé is French
            query = random.choice(record['neighborhood_prompts'])
        else:
            continue

        # Neighbourhood prompts always keep the true answer; in the clean
        # setting every prompt does.
        answer = target_true if (clean or kind == 2) else target_new
        examples.append(f'New Fact: {new_fact} {target_new}\nPrompt: {query} {answer}\n\n')

    return examples[::-1]


def icl_lm_eval(model, tokenizer, icl_examples, targets, x):
    """Score each candidate target by its perplexity after the in-context prompt.

    Args:
        model: causal LM accepting ``input_ids`` and ``labels`` and returning
            an object with a ``.loss`` attribute.
        tokenizer: tokenizer matching ``model``.
        icl_examples: list of demonstration strings, concatenated in order.
        targets: candidate completions to score.
        x: query prompt appended after the demonstrations.

    Returns:
        A list with one perplexity (``exp`` of the model loss restricted to the
        target tokens) per entry of ``targets``.

    Uses the module-level ``device``.
    """
    context = ''.join(icl_examples)
    ppls = []
    for target in targets:
        # Count only the target's own tokens. Without add_special_tokens=False a
        # tokenizer that prepends BOS (e.g. LLaMA) would report one token too
        # many, and a context token would be scored along with the target.
        tgt_len = len(tokenizer.encode(' ' + target, add_special_tokens=False))
        encodings = tokenizer(context + f'{x} {target}', return_tensors='pt')
        input_ids = encodings['input_ids'].to(device)
        target_ids = input_ids.clone()
        # -100 labels are excluded from the loss: only the last tgt_len tokens count.
        target_ids[:, :-tgt_len] = -100
        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
        ppls.append(torch.exp(outputs.loss).item())
    return ppls

def get_final_probs(yesno_ppls, icl_ppls, orig_ppls):
    """Mix in-context and original scores, weighted by inverse yes/no perplexities.

    ``yesno_ppls[0]`` and ``yesno_ppls[1]`` are turned into weights ``1 / ppl``.
    For each of the two candidates ``k`` the result is
    ``yes_weight / icl_ppls[k] + no_weight / orig_ppls[k]``.

    Returns a two-element list.
    """
    yes_weight, no_weight = (1 / yesno_ppls[k] for k in (0, 1))
    return [yes_weight / icl_ppls[k] + no_weight / orig_ppls[k] for k in (0, 1)]

def get_match_acc(model, tok, inputs1, inputs2, device, bs = 32):
    """Fraction of positions where the model predicts the same next token for both prompt lists.

    Predictions for ``inputs1`` and ``inputs2`` are obtained with ``get_preds``
    (in that order) and compared element-wise.
    """
    #assert inputs1[0] != inputs2[0]
    first, second = (
        get_preds(model, tok, prompt_list, device, bs = bs)
        for prompt_list in (inputs1, inputs2)
    )
    return np.mean(first == second)


def get_preds(model, tok, inputs, device, bs=32):
    """Return the most likely next-token id for every prompt in ``inputs``.

    Prompts are tokenized in batches of ``bs`` with padding and truncation. For
    each row the logits at the last non-padding position (attention-mask sum
    minus one) are taken and their argmax is recorded.

    Returns a NumPy array with one token id per prompt.
    """
    predicted_ids = []

    for prompts in DataLoader(inputs, batch_size = bs, shuffle= False):
        encoded = tok(prompts, padding=True, truncation=True, return_tensors="pt").to(device)

        with torch.no_grad():
            logits = model(**encoded).logits  # batch_size x num tokens x vocab size
            last_pos = encoded["attention_mask"].sum(1) - 1  # last non-padding token per row
            # Broadcast each row's position over the vocabulary axis so gather
            # picks that position's complete logit vector.
            index = last_pos[:, None, None].expand(-1, 1, logits.size(-1))  # batch_size x 1 x vocab size
            last_logits = torch.gather(logits, 1, index).squeeze(1)  # batch_size x vocab_size
            best = torch.argmax(last_logits, dim=1)  # batch_size

        predicted_ids.extend(best.detach().cpu().numpy().tolist())

    assert len(predicted_ids) == len(inputs)
    return np.array(predicted_ids)

In [None]:
# Load the causal LM class that matches `model_name` and move it to `device`.
if 'gpt-j' in model_name:
    model = GPTJForCausalLM.from_pretrained(model_name).to(device)
elif 'gpt2-xl' in model_name:
    model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
elif 'vicuna' in model_name.lower() or 'llama' in model_name.lower():
    from transformers import LlamaForCausalLM 
    model = LlamaForCausalLM.from_pretrained(model_name).to(device)
else:
    # No loader is wired up for any other model name.
    raise ValueError


# Alternative loaders kept for reference:
# model = GPT2LMHeadModel.from_pretrained(model_name).to(device)
# model = GPTNeoXForCausalLM.from_pretrained(model_name).half().to(device)
# model = GPTNeoForCausalLM.from_pretrained(model_name).to(device)


model.eval()

tokenizer = AutoTokenizer.from_pretrained(model_name)

#tokenizer = GPT2TokenizerFast.from_pretrained(model_name)
# get_probs only handles right padding (it raises ValueError for left padding).
tokenizer.padding_side = 'right'
# Truncate from the left; the query sits at the end of each prompt built below.
tokenizer.truncation_side = 'left'

# Batched tokenization pads, so a pad token is required. If the tokenizer has
# none, register '[PAD]' and resize the embedding matrix to the new vocabulary size.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    model.resize_token_embeddings(len(tokenizer))

In [4]:
lines = []

# Parse the JSON dataset; later cells read record['requested_rewrite'],
# record['paraphrase_prompts'] and record['neighborhood_prompts'].
with open('../counterfact.json', 'r') as f:
    lines = json.load(f)
    
icl_examples = []
# Records from index 2000 onward form the demonstration pool (passed to
# construct_icl_examples); the first 2000 records are the evaluation queries.
demos = lines[2000:]
lines = lines[:2000]
# Counters / accumulators; none of them is updated in the cells visible here.
calibrate_magnitude = .0
success_cnt = 0
para_success_cnt = 0
magnitude = .0
para_magnitude = .0
orig_magnitude = .0
total_cnt = 0
para_total_cnt = 0
orig_success_cnt = 0
orig_total_cnt = 0

In [5]:
# Build one prompt per evaluation record for each experimental condition.
# icl_cnt = 0
example_idx = 0  # advances in lockstep with `i`; used as the row index into corpus_idx

ike_inputs = []          # edited demonstrations + query
ike_inputs_clean = []    # "clean" demonstrations (all answers are target_true) + query
normal_inputs = []       # query only
ike_inputs_bos = []      # edited demonstrations + BOS token + query
normal_bos_inputs = []   # BOS token + query

for i, line in enumerate(lines):

    #if i % 10 == 0:
    #    print(i, success_cnt, total_cnt, magnitude / (total_cnt + 1e-12), para_success_cnt, para_magnitude / (para_total_cnt + 1e-12), orig_success_cnt ,orig_magnitude / (i + 1e-12))
    relation = line['requested_rewrite']['relation_id'] # e.g., P103
    prompt = line['requested_rewrite']['prompt'] # e.g., 'The mother tongue of Danielle Darrieux is'
    subject = line['requested_rewrite']['subject'] # 'Danielle Darrieux'
    prompt_calibrate = prompt.format('SUBJECT') # The mother tongue of SUBJECT is
    prompt = prompt.format(subject) # 'The mother tongue of Danielle Darrieux is'
    PROMPTS = [prompt, prompt_calibrate] # ['The mother tongue of Danielle Darrieux is', 'The mother tongue of SUBJECT is']

    target_true = line['requested_rewrite']['target_true']['str'] # French
    target_new = line['requested_rewrite']['target_new']['str'] # English
    
    PPLs = []
    targets = [target_new, target_true]
    # NOTE(review): the two calls below shuffle and sample independently, so the
    # edited and clean demonstrations can differ in kind order and in the sampled
    # paraphrase / neighbourhood prompts, not only in the answers — confirm this is intended.
    icl_examples = construct_icl_examples(example_idx, demos)
    icl_examples_clean = construct_icl_examples(example_idx, demos, clean=True)

    # Final demonstration states the edit for the query itself, e.g.
    # 'New Fact: The mother tongue of Danielle Darrieux is English\nPrompt: The mother tongue of Danielle Darrieux is English\n\n'
    icl_examples.append(f'New Fact: {prompt} {target_new}\nPrompt: {prompt} {target_new}\n\n')
    # Clean counterpart: same "New Fact" line, but the prompt is completed with the true target.
    icl_examples_clean.append(f'New Fact: {prompt} {target_new}\nPrompt: {prompt} {target_true}\n\n')

    example_idx += 1

    ike_inputs.append(''.join(icl_examples) + f'{prompt}')
    ike_inputs_bos.append(''.join(icl_examples) + f'{tokenizer.bos_token} {prompt}')

    ike_inputs_clean.append(''.join(icl_examples_clean) + f'{prompt}')

    normal_inputs.append( f'{prompt}')
    normal_bos_inputs.append( f'{tokenizer.bos_token} {prompt}')

    #edit_ppls = icl_lm_eval(model, tokenizer, icl_examples, [target_new, target_true], f'New Fact: {prompt} {target_new}\nPrompt: {prompt}')


In [6]:
# GPT-2 variants run with a batch size of 16; every other model uses 8.
inf_bs = 16 if 'gpt2' in model_name else 8

# One get_probs result per prompt set: (top-100 next-token probabilities, argmax token ids).
ike = get_probs(model, tokenizer, ike_inputs, device, bs=inf_bs)
ike_clean = get_probs(model, tokenizer, ike_inputs_clean, device, bs=inf_bs)
ike_bos = get_probs(model, tokenizer, ike_inputs_bos, device, bs=inf_bs)
normal = get_probs(model, tokenizer, normal_inputs, device, bs=inf_bs)

In [7]:
# Query-only prompts prefixed with the BOS token (input of the 'null effect' comparison).
normal_bos = get_probs(model, tokenizer, normal_bos_inputs, device, bs = inf_bs)

In [None]:
# Inspect one fully assembled IKE prompt (demonstrations followed by the query).
i=1
ike_inputs[i]

In [None]:
# The "clean" prompt at the same index; relies on `i` still holding the value set in an earlier cell.
ike_inputs_clean[i]

In [10]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score


def classify(probs1, probs2, model_name, setting, ent=False):
    """Train a logistic-regression probe that separates two sets of probability vectors.

    Only the first 10 entries of every vector are used as features. With
    ``ent=True`` each 10-entry vector is replaced by its entropy (a single
    feature). Rows of ``probs1`` get label 0 and rows of ``probs2`` label 1.
    Each set is split without shuffling: first half for training, second half
    for testing. The training rows are then permuted with NumPy's global RNG.

    ``model_name`` and ``setting`` are accepted but not used.

    Prints the accuracy and the scikit-learn classification report.

    Returns:
        ``(features1, features2, accuracy, precision, recall, f1)`` with the
        four metrics expressed in percent and rounded to two decimals.
    """
    features1 = np.array([row[:10] for row in probs1])  # label 0
    features2 = np.array([row[:10] for row in probs2])  # label 1

    if ent:
        features1 = np.array([entropy(row) for row in features1]).reshape((-1,1))
        features2 = np.array([entropy(row) for row in features2]).reshape((-1,1))

    # Deterministic 50/50 split per class.
    train1, test1 = train_test_split(features1, test_size=0.5, shuffle=False)
    train2, test2 = train_test_split(features2, test_size=0.5, shuffle=False)

    X_train = np.vstack((train1, train2))
    y_train = np.hstack((np.zeros(len(train1)), np.ones(len(train2))))
    X_test = np.vstack((test1, test2))
    y_test = np.hstack((np.zeros(len(test1)), np.ones(len(test2))))

    # Interleave the two classes in the training set.
    permutation = np.random.permutation(len(X_train))
    X_train, y_train = X_train[permutation], y_train[permutation]

    # L1-regularised logistic regression.
    clf = LogisticRegression(penalty='l1', solver='liblinear')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy, pr, re, f1score = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    report = classification_report(y_test, y_pred)

    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(report)

    def as_percent(value):
        return np.round(value*100,2)

    return features1, features2, as_percent(accuracy), as_percent(pr), as_percent(re), as_percent(f1score)


In [11]:
# Comparisons to run. Each entry is
# (class-0 get_probs result, class-1 get_probs result, setting name, class-0 label, class-1 label).
pairs = [(normal, ike, 'vanilla', 
          'unedited', 'IKE-edited'), # vanilla setting: unedited prompts vs. IKE-edited

        (ike_clean, ike, 'deconfounding', 
         'IKE-corrupted', 'IKE-edited'), # deconfounding setting: IKE-corrupted vs. IKE-edited

        (ike_bos, ike, 'IKE-edited vs. BOS', 
         'IKE-edited + BOS', 'IKE-edited'),  # bos: IKE-edited w/ BOS vs. IKE-edited
         
        (normal, ike_bos, 'unedited vs. BOS',
          'unedited', 'IKE-edited + BOS'), # unedited prompts vs. IKE-edited w/ BOS

        (normal, normal_bos, 'null effect',
          'unedited', 'unedited vs. bos + unedited'), # null effect: unedited prompts vs. unedited w/ BOS
          # NOTE(review): the second label above reads like a setting name rather than a class label — confirm the intended legend text.
        ]        


In [None]:
# Every element of `pairs` is
#   (class-0 result, class-1 result, setting name, class-0 label, class-1 label)
# where a "result" is the (probabilities, predictions) tuple returned by get_probs.

entropy_options = [False]

# savefig does not create missing directories.
os.makedirs('./probs', exist_ok=True)

for c1, c2, setting, s1, s2 in pairs:

    p1, preds1 = c1[0], c1[1]
    p2, preds2 = c2[0], c2[1]

    for ent in entropy_options:
        list1, list2, accuracy, pr, re, f1score = classify(p1, p2, model_name, setting, ent=ent)

        # Percentage of prompts for which both conditions predict the same next token.
        same_preds = np.round(np.mean(preds1 == preds2)*100,2)

        # Results are appended: re-running this cell adds further rows.
        with open('./results_new_final.csv', 'a') as f:
            f.write('{},{},{},{:.2f},{:.2f},{:.2f},{:.2f},{:.2f}\n'.format(model_name, setting, str(ent), accuracy, pr, re, f1score, same_preds))

        # The density plot is only drawn for the raw-probability features.
        if ent:
            continue

        mean_probs1 = np.mean(list1, axis=0).tolist()
        mean_probs2 = np.mean(list2, axis=0).tolist()

        # Named `plot_data` (not `dict`) so the builtin is not shadowed for the
        # rest of the kernel session.
        plot_data = {
            'Probs': mean_probs1 + mean_probs2,
            'Prompts': [s1]*len(mean_probs1) + [s2]*len(mean_probs2),
        }
        df = pd.DataFrame.from_dict(plot_data)

        fig = plt.figure(figsize=(8,5), dpi=320)
        plt.rcParams.update({'font.size': 16})
        sns.kdeplot( x='Probs', hue='Prompts', data=df, palette="Set2", fill=True, bw_adjust=0.6, cumulative=False)

        plt.savefig(f'./probs/{setting}-' + model_name.split('/')[-1] + '.pdf', format='pdf')

        # Show the plot
        plt.show()
        # Release the figure so repeated iterations do not accumulate open figures.
        plt.close(fig)


In [13]:
# Hard-coded sample of demonstrations (one string per entry) for manual inspection.
# Note: this overwrites the `icl_examples` left over from the prompt-building loop.
icl_examples = [
    'New Fact: The mother tongue of Jonathan Littell is Greek\nPrompt: Jonathan Littell, speaker of Greek\n\n',
    'New Fact: The mother tongue of Michel Braudeau is Russian\nPrompt: Montesquieu, speaker of French\n\n',
    'New Fact: The mother tongue of Louis Florencie is Russian\nPrompt: The mother tongue of François Bayrou is French\n\n',
    'New Fact: The mother tongue of Rainer Maria Rilke is French\nPrompt: The mother tongue of Wilhelm Ackermann is German\n\n',
    'New Fact: The mother tongue of Robert Lecourt is English\nPrompt: The mother tongue of Robert Lecourt is English\n\n',
    'New Fact: The mother tongue of Jan Wils is Italian\nPrompt: Henk van Woerden is a native speaker of Dutch\n\n',
    'New Fact: The mother tongue of Elsa Zylberstein is German\nPrompt: The mother tongue of Georges Duhamel is French\n\n',
    'New Fact: The mother tongue of Daniel-Rops is Polish\nPrompt: The mother tongue of Daniel-Rops is Polish\n\n',
    'New Fact: The mother tongue of Jan Commelin is French\nPrompt: Jan Commelin spoke the language French\n\n',
    'New Fact: The mother tongue of Alain Marleix is Russian\nPrompt: Jean-Luc Picard, a native French\n\n',
    'New Fact: The mother tongue of Jean-Baptiste Solignac is Russian\nPrompt: The native language of Octave Mirbeau is French\n\n',
    "New Fact: The mother tongue of Michel d'Ornano is Swedish\nPrompt: Jean-Luc Picard, a native French\n\n",
    'New Fact: The mother tongue of Ewa Farna is French\nPrompt: Ewa Farna spoke the language French\n\n',
    'New Fact: The mother tongue of Patrick Chamoiseau is Dutch\nPrompt: The native language of Patrick Chamoiseau is Dutch\n\n',
    'New Fact: The mother tongue of Dominique Cabrera is Dutch\nPrompt: The mother tongue of Dominique Cabrera is Dutch\n\n',
    'New Fact: The mother tongue of Henri Diamant-Berger is English\nPrompt: Henri Diamant-Berger, a native English\n\n',
    'New Fact: The mother tongue of Dominique Zardi is Dutch\nPrompt: Dominique Zardi, a native Dutch\n\n',
    'New Fact: The mother tongue of Michel Camdessus is Russian\nPrompt: Michel Camdessus is a native speaker of Russian\n\n',
    'New Fact: The mother tongue of Catherine Picard is Dutch\nPrompt: The mother tongue of Catherine Picard is Dutch\n\n',
    'New Fact: The mother tongue of Martin Lamotte is Latin\nPrompt: Georges Duhamel, speaker of French\n\n',
    'New Fact: The mother tongue of Philippe de Mornay is Russian\nPrompt: Georges Duhamel, a native French\n\n',
    'New Fact: The mother tongue of Marie NDiaye is Russian\nPrompt: Marie NDiaye is a native speaker of Russian\n\n',
    'New Fact: The mother tongue of Jean-Antoine Chaptal is English\nPrompt: Léon Blum is a native speaker of French\n\n',
    'New Fact: The mother tongue of Catherine Deneuve is Dutch\nPrompt: Catherine Deneuve, a native Dutch\n\n',
    'New Fact: The mother tongue of Raymond Triboulet is Dutch\nPrompt: The mother tongue of Ferdinand de Saussure is French\n\n',
    'New Fact: Daniel Darc is a native speaker of Dutch\nPrompt: Daniel Darc, a native Dutch\n\n',
    'New Fact: The mother tongue of Louis Carrogis Carmontelle is Polish\nPrompt: Louis Carrogis Carmontelle spoke the language Polish\n\n',
    'New Fact: The mother tongue of Daniel Pennacchioni is Russian\nPrompt: Melchior de Vogüé, speaker of French\n\n',
    'New Fact: The mother tongue of Camille Flammarion is Dutch\nPrompt: Ferdinand de Saussure spoke the language French\n\n',
    'New Fact: The mother tongue of Bernard Cerquiglini is English\nPrompt: Jean-Luc Picard spoke the language French\n\n',
    'New Fact: The mother tongue of Marc-Philippe Daubresse is Russian\nPrompt: The mother tongue of Melchior de Vogüé is French\n\n',
    'New Fact: The mother tongue of Colette Darfeuil is Russian\nPrompt: Colette Darfeuil spoke the language Russian\n\n',
]

In [None]:
# Show how many demonstrations there are, then the "New Fact" / "Prompt" lines of the first ten.
print(len(icl_examples))
for i in range(min(10, len(icl_examples))):
    prompt_split = icl_examples[i].split('\n')
    print(prompt_split[0], prompt_split[1], sep='\n')
    print('\n')


In [None]:
# Same preview for the "clean" demonstrations, limited to the first five.
print(len(icl_examples_clean))
for i in range(min(5, len(icl_examples_clean))):
    prompt_split = icl_examples_clean[i].split('\n')
    print(prompt_split[0], prompt_split[1], sep='\n')
    print('\n')


In [16]:
#get_match_acc(model, tokenizer, normal_inputs, ike_inputs_bos, device)

In [17]:
#get_match_acc(model, tokenizer, ike_inputs[:100], ike_inputs_bos[:100], device)

In [None]:
# Rank every vocabulary entry by the cosine similarity of its input embedding
# to the BOS-token embedding.
if 'vicuna' in model_name.lower() or 'llama' in model_name.lower():
    e_mat = model.model.embed_tokens.weight
else:
    e_mat = model.transformer.wte.weight

if tokenizer.bos_token_id is None:
    # Indexing with None would add an axis instead of selecting a row.
    raise ValueError('tokenizer has no BOS token; cannot compute BOS similarities')

# The embedding matrix is a model parameter; no gradient graph is needed here.
with torch.no_grad():
    bos_embedding = e_mat[tokenizer.bos_token_id].unsqueeze(0)
    sim_list = torch.nn.functional.cosine_similarity(e_mat, bos_embedding).cpu().detach().numpy()

# Similarities as plain Python floats
float_list = sim_list.tolist()

# Step 1: Pair each element with its index
indexed_list = list(enumerate(float_list))

# Step 2: Sort the pairs based on the elements in descending order
sorted_indexed_list = sorted(indexed_list, key=lambda x: x[1], reverse=True)

# Step 3: Keep the (index, similarity) pairs, most similar first
sorted_indices = [(index,value) for index, value in sorted_indexed_list]

# Show only the head: the full list has one entry per vocabulary item.
print(sorted_indices[:20])

In [None]:
# Decode the 20 entries ranked highest by similarity to BOS: rank, token text, similarity.
for i, x in enumerate(sorted_indices[:20]):
    print(i, tokenizer.decode(x[0]), x[1])

In [None]:
# Token text of the entry at rank 10 (0-based) in the similarity ranking.
tokenizer.decode(sorted_indices[10][0])

In [None]:
# Display only the head of the ranking; the full list has one (index, similarity)
# pair per vocabulary entry and would flood the notebook output.
sorted_indices[:20]