In [1]:
import os
import json
import numpy as np
import pandas as pd

In [25]:
def get_avg_std(metric_list, percent=100, std_flag=False):
    mean_val = np.mean(metric_list)
    std_val = np.std(metric_list)
    if std_flag:
        return f"{mean_val*percent:.2f}±{std_val:.2f}"
    else:
        return np.round(mean_val*percent, 2)


def summarize_results(json_folder, std_flag=False):
    metrics_ls, metrics_ls_multi_hop = [], []

    for filename in sorted(os.listdir(json_folder)):
        if filename.endswith('.json') and 'multi_turn' not in filename:
            filepath = os.path.join(json_folder, filename)
            with open(filepath, 'r') as file:
                metrics = json.load(file)
                parts = filename.split('_')
                domain = parts[0]
                topic = ' '.join(parts[1:-1])  
                edit_method = parts[-1].replace('.json', '')  

                results = {
                    "domain": domain,
                    "topic": topic,
                    "edit_method": edit_method,
                    "efficacy_pre": get_avg_std([e['pre']['edit_acc'][0] for e in metrics]),
                    "efficacy_post": get_avg_std([e['post']['edit_acc'][0] for e in metrics]),
                    "rephrase_pre": get_avg_std([e['pre']['rephrase_acc'][0] for e in metrics]),
                    "rephrase_post": get_avg_std([e['post']['rephrase_acc'][0] for e in metrics]),
                    "yes_pre": get_avg_std([e['pre']['yes_questions']['yes_acc'][0] for e in metrics]),
                    "yes_post": get_avg_std([e['post']['yes_questions']['yes_acc'][0] for e in metrics]),
                    "no_pre": get_avg_std([e['pre']['no_questions']['no_acc'][0] for e in metrics]),
                    "no_post": get_avg_std([e['post']['no_questions']['no_acc'][0] for e in metrics]),
                    "mc_pre": get_avg_std([e['pre']['multiple_choice_questions']['multiple_choice_acc'][0] for e in metrics]),
                    "mc_post": get_avg_std([e['post']['multiple_choice_questions']['multiple_choice_acc'][0] for e in metrics]),
                    "reversed_pre": get_avg_std([e['pre']['reversed_relation_questions']['reversed_relation_acc'][0] for e in metrics]),
                    "reversed_post": get_avg_std([e['post']['reversed_relation_questions']['reversed_relation_acc'][0] for e in metrics]),
                    "locality_post": get_avg_std([e['post']['locality'][f'locality_acc'][0] for e in metrics]),
                }

                results_multi_hop = {
                    "domain": domain,
                    "topic": topic,
                    "edit_method": edit_method,
                    "questions_2hop_pre": get_avg_std([e['pre']['questions_2hop']['2hop_acc'][0] for e in metrics]),
                    "questions_2hop_post": get_avg_std([e['post']['questions_2hop']['2hop_acc'][0] for e in metrics]),
                    "questions_3hop_pre": get_avg_std([e['pre']['questions_3hop']['3hop_acc'][0] for e in metrics]),
                    "questions_3hop_post": get_avg_std([e['post']['questions_3hop']['3hop_acc'][0] for e in metrics]),
                    "questions_4hop_pre": get_avg_std([e['pre']['questions_4hop']['4hop_acc'][0] for e in metrics]),
                    "questions_4hop_post": get_avg_std([e['post']['questions_4hop']['4hop_acc'][0] for e in metrics]),
                    "questions_5hop_pre": get_avg_std([e['pre']['questions_5hop']['5hop_acc'][0] for e in metrics]),
                    "questions_5hop_post": get_avg_std([e['post']['questions_5hop']['5hop_acc'][0] for e in metrics]),
                    "questions_6hop_pre": get_avg_std([e['pre']['questions_6hop']['6hop_acc'][0] for e in metrics]),
                    "questions_6hop_post": get_avg_std([e['post']['questions_6hop']['6hop_acc'][0] for e in metrics]),
                }
                metrics_ls.append(results)
                metrics_ls_multi_hop.append(results_multi_hop)
    df = pd.DataFrame(metrics_ls)
    df_multi_hop = pd.DataFrame(metrics_ls_multi_hop)
    df = df.set_index('edit_method').loc[edit_method_order_ls].reset_index()
    df_multi_hop = df_multi_hop.set_index('edit_method').loc[edit_method_order_ls].reset_index()
    df['edit_method'] = df['edit_method'].replace('ICL', 'ICE')
    return df, df_multi_hop

model_id_ls = ['meta-llama/Meta-Llama-3-8B-Instruct', 'mistralai/Mistral-7B-Instruct-v0.3', 'meta-llama/Llama-2-7b-chat-hf']
model_id_format_ls = [e.split('/')[-1].replace('-', '_').lower() for e in model_id_ls]
model_name_ls = ["Llama2-7B", "Llama3-8B", "Mistral-v0.3-7B"]
edit_method_order_ls = ['FT-L', 'FT-M', 'MEMIT', 'ROME', 'LoRA', 'ICL', 'GRACE']

In [26]:
df_llama2, df_multi_hop_llama2 = summarize_results("../results/llama_2_7b_chat_hf")
df_llama3, df_multi_hop_llama3 = summarize_results("../results/meta_llama_3_8b_instruct")
df_mistral, df_multi_hop_mistral = summarize_results("../results/mistral_7b_instruct_v0.3")

In [28]:
df_llama3

Unnamed: 0,edit_method,domain,topic,efficacy_pre,efficacy_post,rephrase_pre,rephrase_post,yes_pre,yes_post,no_pre,no_post,mc_pre,mc_post,reversed_pre,reversed_post,locality_post
0,FT-L,art,sculpture,0.0,47.00,12.00,59.00,61.00,46.00,26.00,10.00,34.00,33.00,6.00,0.00,16.00
1,FT-L,business,brand,0.0,56.00,12.00,52.00,62.00,51.00,22.00,14.00,27.00,28.00,14.00,5.00,15.00
2,FT-L,business,corporation,0.0,37.78,5.56,34.44,62.22,45.56,14.44,6.67,27.78,32.22,4.44,0.00,26.67
3,FT-L,business,industry,0.0,52.04,18.37,53.06,88.78,41.84,26.53,7.14,31.63,38.78,17.35,13.27,12.24
4,FT-L,entertainment,anime,0.0,42.00,7.00,48.00,63.00,30.00,22.00,10.00,24.00,25.00,2.00,0.00,4.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,GRACE,places,country,0.0,100.00,19.00,2.00,61.00,10.00,42.00,32.00,51.00,1.00,51.00,21.00,91.00
178,GRACE,places,landmark,0.0,100.00,12.00,3.00,60.00,8.00,41.00,21.00,43.00,1.00,19.00,3.00,40.00
179,GRACE,technology,database,0.0,100.00,8.54,2.44,73.17,23.17,29.27,23.17,39.02,2.44,4.88,1.22,24.39
180,GRACE,technology,programming language,0.0,100.00,19.00,2.00,61.00,15.00,36.00,13.00,27.00,3.00,19.00,4.00,40.00


In [29]:
df_llama3.groupby(['domain', 'edit_method']).mean().reset_index()

Unnamed: 0,domain,edit_method,efficacy_pre,efficacy_post,rephrase_pre,rephrase_post,yes_pre,yes_post,no_pre,no_post,mc_pre,mc_post,reversed_pre,reversed_post,locality_post
0,art,FT-L,0.0,47.000000,12.000000,59.000000,61.000000,46.000000,26.000000,10.000000,34.00,33.000000,6.000000,0.000000,16.000000
1,art,FT-M,0.0,77.000000,12.000000,74.000000,61.000000,61.000000,26.000000,23.000000,34.00,33.000000,6.000000,2.000000,53.000000
2,art,GRACE,0.0,99.000000,12.000000,3.000000,61.000000,10.000000,26.000000,15.000000,34.00,2.000000,6.000000,1.000000,26.000000
3,art,ICE,0.0,90.000000,12.000000,87.000000,61.000000,87.000000,26.000000,65.000000,34.00,81.000000,6.000000,72.000000,50.000000
4,art,LoRA,0.0,88.000000,12.000000,86.000000,61.000000,63.000000,26.000000,25.000000,34.00,54.000000,6.000000,4.000000,30.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,technology,GRACE,0.0,99.666667,10.513333,2.813333,70.390000,20.723333,29.423333,20.723333,32.34,2.480000,9.626667,1.740000,38.796667
59,technology,ICE,0.0,96.706667,10.513333,92.820000,70.390000,91.226667,29.423333,78.090000,32.34,91.300000,9.626667,70.023333,65.503333
60,technology,LoRA,0.0,91.050000,10.513333,90.123333,70.390000,72.763333,29.423333,22.023333,32.34,63.876667,9.626667,7.960000,33.090000
61,technology,MEMIT,0.0,75.210000,10.513333,73.210000,70.390000,83.463333,29.423333,24.276667,32.34,65.950000,9.626667,8.700000,23.903333


In [33]:
column_ls = ['domain', 'edit_method', 'efficacy_post', 'locality_post']
df_llama3[column_ls].groupby(['domain', 'edit_method']).mean().mean().reset_index(name='overall')

Unnamed: 0,index,overall
0,efficacy_post,79.594921
1,locality_post,37.59455


In [31]:
df_llama2['model'] = 'Llama2-7B'
df_llama3['model'] = 'Llama3-8B'
df_mistral['model'] = 'Mistral-v0.3-7B'
df_combined = pd.concat([df_llama2, df_llama3, df_mistral])
df_combined_grouped = df_combined.groupby(['model', 'edit_method']).mean().reset_index()
df_combined_grouped['Avg Generalization Scores Pre'] = df_combined_grouped[['rephrase_pre', 'yes_pre', 'no_pre', 'mc_pre', 'reversed_pre']].mean(axis=1)
df_combined_grouped['Avg Generalization Scores Post'] = df_combined_grouped[['rephrase_post', 'yes_post', 'no_post', 'mc_post', 'reversed_post']].mean(axis=1)
df_combined_grouped[['model', 'edit_method', 'Avg Generalization Scores Pre', 'Avg Generalization Scores Post']]


Unnamed: 0,model,edit_method,Avg Generalization Scores Pre,Avg Generalization Scores Post
0,Llama2-7B,FT-L,35.139385,42.697308
1,Llama2-7B,FT-M,35.139385,42.877538
2,Llama2-7B,GRACE,35.139385,0.520692
3,Llama2-7B,ICE,35.139385,77.858154
4,Llama2-7B,LoRA,35.139385,47.484462
5,Llama2-7B,MEMIT,35.139385,49.981846
6,Llama2-7B,ROME,35.154769,50.924615
7,Llama3-8B,FT-L,31.888,30.153923
8,Llama3-8B,FT-M,31.888,40.305692
9,Llama3-8B,GRACE,31.888,7.981077


In [5]:
df_multi_hop_llama3

Unnamed: 0,edit_method,domain,topic,questions_2hop_pre,questions_2hop_post,questions_3hop_pre,questions_3hop_post,questions_4hop_pre,questions_4hop_post,questions_5hop_pre,questions_5hop_post,questions_6hop_pre,questions_6hop_post
0,FT-L,art,sculpture,33.00,17.00,32.00,13.00,34.00,19.00,34.00,21.00,45.00,32.00
1,FT-L,business,brand,37.00,19.00,32.00,15.00,31.00,19.00,26.00,18.00,28.00,19.00
2,FT-L,business,corporation,26.67,8.89,33.33,10.00,12.22,5.56,18.89,11.11,25.56,15.56
3,FT-L,business,industry,52.04,13.27,35.71,14.29,23.47,12.24,27.55,17.35,29.59,17.35
4,FT-L,entertainment,anime,14.00,4.00,25.00,5.00,26.00,7.00,24.00,13.00,29.00,18.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
177,GRACE,places,country,29.00,1.00,24.00,3.00,29.00,2.00,20.00,1.00,22.00,0.00
178,GRACE,places,landmark,45.00,3.00,37.00,3.00,36.00,2.00,34.00,0.00,37.00,1.00
179,GRACE,technology,database,45.12,0.00,28.05,0.00,20.73,0.00,23.17,0.00,28.05,1.22
180,GRACE,technology,programming language,41.00,4.00,28.00,0.00,25.00,0.00,32.00,0.00,28.00,1.00


In [34]:
df_multi_hop_llama3.groupby(['domain', 'edit_method']).mean().mean().reset_index(name='overall')

Unnamed: 0,index,overall
0,questions_2hop_pre,37.066667
1,questions_2hop_post,24.118783
2,questions_3hop_pre,33.833968
3,questions_3hop_post,21.868889
4,questions_4hop_pre,29.770847
5,questions_4hop_post,20.224021
6,questions_5hop_pre,29.543968
7,questions_5hop_post,21.04
8,questions_6hop_pre,34.053598
9,questions_6hop_post,24.998995


In [36]:
# robustness evaluation
def summarize_multi_turn_overall(folder_paths):
    data_list = []
    for folder_path in folder_paths:
        for filename in os.listdir(folder_path):
            if filename.endswith('.json'):
                parts = filename.split('_')
                domain = parts[0]
                topic = ' '.join(parts[1:-2])  
                edit_method = parts[-2]
                type_ = parts[-1].replace('.json', '')

                file_path = os.path.join(folder_path, filename)
                with open(file_path, 'r') as file:
                    metrics = json.load(file)
                
                multi_turn_ls_post = [e['post']['edit_acc_multi_turn'] for e in metrics]
                
                post_mean = np.mean(multi_turn_ls_post, axis=0)
                
                for turn, post in enumerate(post_mean):
                    data_list.append({
                        'model': os.path.basename(folder_path),
                        'domain': domain,
                        'topic': topic,
                        'edit_method': edit_method,
                        'type': type_,
                        'turn': turn,
                        'robustness score': post
                    })
    
    return pd.DataFrame(data_list)

folder_paths_multi_turn = [
    '../results/llama_2_7b_chat_hf_multi_turn',
    '../results/meta_llama_3_8b_instruct_multi_turn',
    '../results/mistral_7b_instruct_v0.3_multi_turn',
]
df_multi_turn = summarize_multi_turn_overall(folder_paths_multi_turn)
df_multi_turn

Unnamed: 0,model,domain,topic,edit_method,type,turn,robustness score
0,llama_2_7b_chat_hf_multi_turn,entertainment,music genre,FT-L,yes,0,0.690000
1,llama_2_7b_chat_hf_multi_turn,entertainment,music genre,FT-L,yes,1,0.410000
2,llama_2_7b_chat_hf_multi_turn,entertainment,music genre,FT-L,yes,2,0.560000
3,llama_2_7b_chat_hf_multi_turn,entertainment,music genre,FT-L,yes,3,0.330000
4,llama_2_7b_chat_hf_multi_turn,entertainment,music genre,FT-L,yes,4,0.280000
...,...,...,...,...,...,...,...
6001,mistral_7b_instruct_v0.3_multi_turn,health,medication,LoRA,yes,6,0.433333
6002,mistral_7b_instruct_v0.3_multi_turn,health,medication,LoRA,yes,7,0.466667
6003,mistral_7b_instruct_v0.3_multi_turn,health,medication,LoRA,yes,8,0.466667
6004,mistral_7b_instruct_v0.3_multi_turn,health,medication,LoRA,yes,9,0.466667


In [42]:
df_multi_turn.groupby(['model', 'edit_method', 'turn']).mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,robustness score
model,edit_method,turn,Unnamed: 3_level_1
llama_2_7b_chat_hf_multi_turn,FT-L,0,0.669753
llama_2_7b_chat_hf_multi_turn,FT-L,1,0.355590
llama_2_7b_chat_hf_multi_turn,FT-L,2,0.362630
llama_2_7b_chat_hf_multi_turn,FT-L,3,0.229536
llama_2_7b_chat_hf_multi_turn,FT-L,4,0.205715
...,...,...,...
mistral_7b_instruct_v0.3_multi_turn,ROME,6,0.740984
mistral_7b_instruct_v0.3_multi_turn,ROME,7,0.740984
mistral_7b_instruct_v0.3_multi_turn,ROME,8,0.740984
mistral_7b_instruct_v0.3_multi_turn,ROME,9,0.740984
