In [1]:
import os
import json
import numpy as np
import pandas as pd

def calc_metric(metric_list, percent=100, std_flag=False):
    mean_val = np.mean(metric_list)
    std_val = np.std(metric_list)
    if std_flag:
        return f"{mean_val*percent:.2f}±{std_val:.2f}"
    else:
        return np.round(mean_val*percent, 2)

In [2]:
def summarize_json_to_df(json_folder, std_flag=False):
    metrics_list = []
    portability_type = 'reasoning_acc', 'Subject_Aliasing_acc'

    for filename in os.listdir(json_folder):
        if filename.endswith('.json'):
            filepath = os.path.join(json_folder, filename)
            with open(filepath, 'r') as file:
                metrics = json.load(file)
                base_filename = filename.replace('_results.json', '')
                edit_method, data, model = base_filename.split('_', 2)
                # if data != 'counterfact':  # wikibio  counterfact zsre
                #     continue
                # print(edit_method, data, model)
                # for e in metrics:
                #     print(e['post'])
                    # print(e['post']['rewrite_acc'][0])
                
                if edit_method == 'IKE':
                    reliability_pre = calc_metric([e['pre']['edit_acc'] for e in metrics])
                    reliability_post = calc_metric([e['post']['edit_acc'] for e in metrics])
                else:
                    reliability_pre = calc_metric([e['pre']['edit_acc'][0] for e in metrics])
                    reliability_post = calc_metric([e['post']['edit_acc'][0] for e in metrics])
                    # locality_post: calc_metric([e['post']['locality'][0] for e in metrics])

                if data == 'wikibio':
                    portability_pre = None
                    portability_post = None
                else:
                    # portability_pre = calc_metric([e['pre']['portability'][list(e['pre']['portability'].keys())[0]][0] for e in metrics])
                    # portability_post = calc_metric([e['post']['portability'][list(e['post']['portability'].keys())[0]][0] for e in metrics])
                    ls_pre, ls_post = [], []
                    for e in metrics:
                        # print(e['pre']['portability'])
                        if e['pre']['portability'] or e['post']['portability']:
                            portability_type = list(e['pre']['portability'].keys())[0]
                            ls_pre.append(e['pre']['portability'][portability_type][0])
                            ls_post.append(e['post']['portability'][portability_type][0])
                    portability_pre = calc_metric(ls_pre)
                    portability_post = calc_metric(ls_post)

                results = {
                    "data": data,
                    "edit_method": edit_method,
                    "reliability_pre": reliability_pre,
                    "reliability_post": reliability_post,
                    # "locality": calc_metric([e['post']['locality'][0] for e in metrics]),
                    # "locality": locality_post,
                    "portability_pre": portability_pre,
                    "portability_post": portability_post,
                    # "fluency_pre": calc_metric([e['pre']['fluency']['ngram_entropy'] for e in metrics]),
                    # "fluency_post": calc_metric([e['post']['fluency']['ngram_entropy'] for e in metrics]),
                }
                metrics_list.append(results)
    df = pd.DataFrame(metrics_list)
    return df


summarize_json_to_df("../results/know_edit/new_eval").sort_values(by=['data', 'edit_method'])

Unnamed: 0,data,edit_method,reliability_pre,reliability_post,portability_pre,portability_post
19,counterfact,FT,0.24,62.34,3.02,44.49
8,counterfact,IKE,0.24,72.11,3.02,53.28
5,counterfact,MEMIT,0.24,25.15,3.02,16.54
20,counterfact,ROME,0.24,26.22,3.02,12.34
7,counterfact,SERAC,0.24,0.24,3.02,3.02
15,recent,FT,15.96,76.46,26.38,53.32
0,recent,FT-L,15.96,61.77,26.38,44.47
10,recent,ROME,15.96,70.62,26.38,44.79
6,recent,SERAC,15.96,15.96,26.38,26.38
9,wikibio,FT,6.21,6.21,,


In [2]:
with open('../results/know_edit/old_eval/ROME_counterfact_Llama-2-7b-chat-hf_results.json', 'r') as file:
    metrics = json.load(file)
e = metrics[0] 
# , e['post']['portability']
print(e['post']['portability'], '\n', list(e['post']['portability'].values()))
np.mean(list(e['post']['locality'].values()))

{'Subject_Aliasing_acc': [0.0, 0.5, 0.0, 1.0], 'reasoning_acc': [0.0, 0.0, 0.0, 0.0, 0.25, 0.3333333333333333, 1.0]} 
 [[0.0, 0.5, 0.0, 1.0], [0.0, 0.0, 0.0, 0.0, 0.25, 0.3333333333333333, 1.0]]


  arr = asanyarray(a)


array([0.41666667, 0.25      , 0.5       , 0.25      , 0.25      ,
       0.5       , 0.5       , 0.5       , 0.5       , 0.375     ])

In [4]:
e['post']

{'rewrite_acc': [1.0],
 'locality': {'Relation_Specificity_acc': [0.8333333333333334,
   0.5,
   1.0,
   0.5,
   0.5,
   1.0,
   1.0,
   1.0,
   1.0],
  'Forgetfulness_acc': [0.75]},
 'portability': {'Subject_Aliasing_acc': [0.0, 0.5, 0.0, 1.0],
  'reasoning_acc': [0.0, 0.0, 0.0, 0.0, 0.25, 0.3333333333333333, 1.0]},
 'fluency': {'ngram_entropy': 6.117325041741868}}

In [2]:
# Example metrics: {'rewrite_acc': [1.0], 'locality': {'Relation_Specificity_acc': [0.8333333333333334, 0.5, 1.0, 0.5, 0.5, 1.0, 1.0, 1.0, 1.0], 'Forgetfulness_acc': [0.75]}, 'portability': {'Subject_Aliasing_acc': [0.0, 0.5, 0.0, 1.0], 'reasoning_acc': [0.0, 0.0, 0.0, 0.0, 0.25, 0.3333333333333333, 1.0]}, 'fluency': {'ngram_entropy': 6.117325041741868}}

def summarize_json_to_df(json_folder, std_flag=False):
    metrics_list = []

    for filename in os.listdir(json_folder):
        if filename.endswith('.json'):
            with open(os.path.join(json_folder, filename), 'r') as file:
                metrics = json.load(file)
                edit_method, data, model = filename.split('_')[:3]

                if edit_method == 'IKE':
                    reliability_pre = calc_metric([e['pre']['rewrite_acc'] for e in metrics])
                    reliability_post = calc_metric([e['post']['rewrite_acc'] for e in metrics])
                else:
                    reliability_pre = calc_metric([e['pre']['rewrite_acc'][0] for e in metrics])
                    reliability_post = calc_metric([e['post']['rewrite_acc'][0] for e in metrics])

                if data == 'wikibio':
                    portability_pre = None
                    portability_post = None
                else:
                    ls_pre, ls_post = [], []
                    for e in metrics:
                        if e['pre']['portability'] or e['post']['portability']:
                            pre_values = [np.mean(v) for v in e['pre']['portability'].values()]
                            post_values = [np.mean(v) for v in e['post']['portability'].values()]
                            ls_pre.append(np.mean(pre_values))
                            ls_post.append(np.mean(post_values))
                    portability_pre = calc_metric(ls_pre)
                    portability_post = calc_metric(ls_post)

                locality_post = calc_metric([np.mean([np.mean(v) for v in e['post']['locality'].values()]) for e in metrics])

                results = {
                    "data": data,
                    "edit_method": edit_method,
                    "reliability_pre": reliability_pre,
                    "reliability_post": reliability_post,
                    "portability_pre": portability_pre,
                    "portability_post": portability_post,
                    "locality_post": locality_post,
                }
                metrics_list.append(results)
    df = pd.DataFrame(metrics_list)
    return df


summarize_json_to_df("../results/know_edit/old_eval").sort_values(by=['data', 'edit_method'])

Unnamed: 0,data,edit_method,reliability_pre,reliability_post,portability_pre,portability_post,locality_post
24,counterfact,FT,25.9,100.0,26.42,74.35,77.06
2,counterfact,FT-L,25.9,44.84,26.42,33.93,50.28
10,counterfact,IKE,25.9,100.0,26.58,88.61,66.29
22,counterfact,LoRA,25.9,100.0,26.42,72.58,65.11
7,counterfact,MEMIT,25.9,97.97,20.44,45.59,64.37
25,counterfact,ROME,25.88,98.51,20.41,43.75,68.65
9,counterfact,SERAC,50.0,100.0,41.63,63.16,100.0
18,recent,FT,47.4,100.0,40.68,65.6,64.1
0,recent,FT-L,47.4,56.3,40.68,40.47,43.78
17,recent,IKE,47.4,99.97,39.97,79.38,64.12


In [None]:
    # df['edit_method'] = pd.Categorical(df['edit_method'], ["ROME", "FT-M", "ICL"])
    # df['Reli_increase'] = df.apply(lambda x: x['Reliability_post'] - x['Reliability_pre'], axis=1)
    # df['Gene_increase'] = df.apply(lambda x: x['Generalization_post'] - x['Generalization_pre'], axis=1)
    # df['Port_increase'] = df.apply(lambda x: x['Portability_post'] - x['Portability_pre'], axis=1)
    # return df[['edit_method', 'model', 'Reliability_pre', 'Reliability_post', 'Reli_increase', 'Generalization_pre', 
    #            'Generalization_post', 'Gene_increase', 'Portability_pre', 'Portability_post', 'Port_increase']]