In [1]:
import matplotlib.pyplot as plt
%matplotlib inline
import os, sys
from os.path import join
import json
import numpy as np
import pandas as pd
from rouge_score import rouge_scorer
from tqdm.auto import tqdm
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from scipy.stats import sem

import seaborn as sns
import copy
from natsort import natsorted
import os
from os.path import join 
from pathlib import Path
from scipy.stats import ks_2samp
from matplotlib import font_manager
from matplotlib.font_manager import FontProperties
# Use the Times New Roman file that sits one directory above the notebook.
font_path = '../Times New Roman.ttf'
# Register the file with matplotlib's font manager so the family can be resolved by name.
font_manager.fontManager.addfont(font_path)
# `prop` is reused further down (seaborn theme) to look up the family name.
prop = FontProperties(fname=font_path)
plt.rcParams['font.family'] = prop.get_name()

%matplotlib inline
# Global seaborn look: white grid, enlarged text, and the font family loaded above.
sns.set_theme(
    style="whitegrid",
    font_scale=1.5,
    font=prop.get_name(),
)

# Eight colours sampled from the "magma" palette; the plotting cell indexes into this list.
colors = [*sns.color_palette("magma", n_colors=8)]
# Font size passed to axis labels and legends.
fs = 22




In [2]:
def _truth_ratio_per_sample(task_metrics):
    """Return exp(avg_paraphrased_loss - mean(average_perturb_loss)) for each sample of one task.

    `task_metrics` is one column of the evaluation frame; its 'avg_paraphrased_loss'
    and 'average_perturb_loss' cells are dicts keyed by sample id.
    """
    paraphrased = task_metrics['avg_paraphrased_loss']
    perturbed = task_metrics['average_perturb_loss']

    # Use the paraphrased-loss keys as the reference order for both arrays.
    data_indices = list(paraphrased.keys())
    paraphrased_loss = np.array([paraphrased[idx] for idx in data_indices])
    # Each sample has several perturbed answers; average over them (last axis).
    perturbed_loss = np.array([perturbed[idx] for idx in data_indices]).mean(-1)
    return np.exp(paraphrased_loss - perturbed_loss)


def get_mean_df(df):
    """Collapse a per-sample evaluation frame into one scalar per (metric, eval task).

    Parameters
    ----------
    df : pandas.DataFrame
        Columns are eval-task names (e.g. 'eval_log_forget.json'), rows are
        metric names, and every aggregated cell is a dict mapping a sample id
        to a value (or to a list of values). The rows 'generated_text' and
        'normalized_gt_loss' must exist; they are dropped from the result.
        `df` itself is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of `df` without the two dropped rows, in which
        * every ordinary metric cell is replaced by the mean of its values,
        * 'avg_gt_prob' holds the mean of exp(-avg_gt_loss) for tasks whose
          name contains 'eval_log', and otherwise the mean of the ground-truth
          probability normalised by the sum of ground-truth and perturbed
          probabilities,
        * 'truth_ratio' holds mean(min(r, 1/r)) for 'eval_log_forget.json' and
          mean(max(0, 1 - r)) for every other task, where r is
          exp(avg_paraphrased_loss - mean(average_perturb_loss)),
        * a 'forget_quality' row, if present, is left untouched.
    """
    mean_df = copy.deepcopy(df)
    # Per-sample generations and the normalised loss are not aggregated.
    mean_df = mean_df.drop(['generated_text', 'normalized_gt_loss'], axis=0)
    mean_df.loc['avg_gt_prob'] = np.zeros(len(mean_df.columns))

    # Rows that are either already scalar or are recomputed explicitly below.
    derived_rows = ('avg_gt_prob', 'forget_quality', 'truth_ratio')

    for eval_task in mean_df.columns:
        # Read from the untouched input so that writes into `mean_df` can never
        # affect what is being aggregated.
        task_metrics = df[eval_task]

        for metric in mean_df.index:
            if metric in derived_rows:
                continue
            # `.at[row, col]` writes straight into the frame. The former
            # `mean_df[col][row] = ...` chained assignment writes to a temporary
            # under pandas Copy-on-Write and leaves the frame unchanged.
            mean_df.at[metric, eval_task] = np.mean(list(task_metrics[metric].values()))

        avg_gt_loss = task_metrics['avg_gt_loss']
        if 'eval_log' in eval_task:
            # Probability of the ground-truth answer: exp(-loss), averaged over samples.
            gt_losses = np.array(list(avg_gt_loss.values()))
            mean_df.at['avg_gt_prob', eval_task] = np.mean(np.exp(-1 * gt_losses))
        else:
            # Normalise the ground-truth probability by the total probability
            # of the ground-truth answer plus all perturbed answers.
            avg_perturb_loss = task_metrics['average_perturb_loss']
            normalized_gt_prob = {}
            for idx in avg_gt_loss.keys():
                truth_prob = np.exp(-1 * avg_gt_loss[idx])
                perturb_prob = np.exp(-1 * np.array(avg_perturb_loss[idx]))
                all_prob = np.array([truth_prob, *perturb_prob])
                normalized_gt_prob[idx] = truth_prob / all_prob.sum()
            mean_df.at['avg_gt_prob', eval_task] = np.mean(np.array(list(normalized_gt_prob.values())))

        truth_ratio = _truth_ratio_per_sample(task_metrics)
        if eval_task == 'eval_log_forget.json':
            # Forget set: symmetric score, so r and 1/r are treated alike.
            adjusted_truth_ratio = np.minimum(truth_ratio, 1 / truth_ratio)
        else:
            # Every other task: 1 - r, clipped at zero.
            adjusted_truth_ratio = np.maximum(0, 1 - truth_ratio)
        mean_df.at['truth_ratio', eval_task] = np.mean(adjusted_truth_ratio)

    return mean_df

    

In [3]:
def get_forget_quality(unlearn_df, retain_df):
    """Add a 'forget_quality' row to `unlearn_df` and return it.

    The row is 0 for every eval task except 'eval_log_forget.json', where it is
    the p-value of a two-sample Kolmogorov-Smirnov test between the per-sample
    'truth_ratio' values of the retain model and of the unlearned model.

    Parameters
    ----------
    unlearn_df : pandas.DataFrame
        Evaluation frame of the model under test (columns: eval tasks, rows:
        metrics, cells: dicts keyed by sample id). Modified in place.
    retain_df : pandas.DataFrame
        Evaluation frame of the reference retain model, same layout.

    Returns
    -------
    pandas.DataFrame
        The same `unlearn_df` object, with the extra 'forget_quality' row.
    """
    forget_task = 'eval_log_forget.json'
    unlearn_df.loc['forget_quality'] = np.zeros(len(unlearn_df.columns))

    # Only the forget set has a forget-quality score; no need to scan every column.
    if forget_task not in unlearn_df.columns:
        return unlearn_df

    retain_truth_ratio = retain_df.at['truth_ratio', forget_task]
    unlearn_truth_ratio = unlearn_df.at['truth_ratio', forget_task]

    # Use the retain model's sample ids as the reference set for both models.
    data_indices = list(retain_truth_ratio.keys())
    retain_values = np.array([retain_truth_ratio[idx] for idx in data_indices])
    unlearn_values = np.array([unlearn_truth_ratio[idx] for idx in data_indices])

    pvalue = ks_2samp(retain_values, unlearn_values).pvalue
    # `.at[row, col]` writes into the frame itself. The former
    # `unlearn_df[col][row] = ...` chained assignment is silently lost under
    # pandas Copy-on-Write.
    unlearn_df.at['forget_quality', forget_task] = pvalue
    return unlearn_df



In [4]:
# Forget splits and model families for which retain-model results are loaded.
forget_rates = ['forget01', 'forget05', 'forget10']
model_family = ['llama2-7b', 'phi']

# Checkpoint directory of the retain model for every (model family, forget split).
# Per the directory names, forget01/05/10 pair with retain99/95/90.
retain_model_path_dict = {
    'llama2-7b': {
        'forget01': '/home/zhilif/tofu/paper_models/ft_epoch5_lr1e-05_llama2-7b_retain99_wd0.01/checkpoint-618',
        'forget05': '/home/zhilif/tofu/paper_models/ft_epoch5_lr1e-05_llama2-7b_retain95_wd0.01/checkpoint-593',
        'forget10': '/home/zhilif/tofu/paper_models/ft_epoch5_lr1e-05_llama2-7b_retain90_wd0.01/checkpoint-562',
    },
    'phi': {
        'forget01': '/home/zhilif/tofu/paper_models/ft_epoch5_lr2e-05_phi_retain99_wd0.01/checkpoint-618',
        'forget05': '/home/zhilif/tofu/paper_models/ft_epoch5_lr2e-05_phi_retain95_wd0.01/checkpoint-593',
        'forget10': '/home/zhilif/tofu/paper_models/ft_epoch5_lr2e-05_phi_retain90_wd0.01/checkpoint-562',
    },
}

# retain_df_dict[model][rate] -> aggregated evaluation log of that retain model.
retain_df_dict = {}
for model in model_family:
    frames_for_model = {}
    for rate in forget_rates:
        retain_model_path = retain_model_path_dict[model][rate]
        aggregated_log = join(retain_model_path, 'eval_results/ds_size300/eval_log_aggregated.json')
        frames_for_model[rate] = pd.read_json(aggregated_log)
    retain_df_dict[model] = frames_for_model
        


In [5]:
model_family = ['llama2-7b', 'phi']
# Checkpoint directory of the model fine-tuned on the full data, per model family.
# The unlearning runs are read from sub-folders of these directories.
ft_model_path = {
    'llama2-7b': '/home/zhilif/tofu/paper_models/ft_epoch5_lr1e-05_llama2-7b_full_wd0.01/checkpoint-625',
    'phi': '/home/zhilif/tofu/paper_models/ft_epoch5_lr2e-05_phi_full_wd0.01/checkpoint-625'
}
algorithms = ['grad_ascent', 'grad_diff', 'idk', 'KL']
forget_rates = ['forget01', 'forget05', 'forget10']
# NOTE(review): lr_map is never read in this cell. The unlearning folder name
# below hard-codes '1e-05' for both families, although phi is mapped to
# '2e-05' here. Confirm against the on-disk layout before switching to
# lr_map[model].
lr_map = {
    'phi': '2e-05',
    'llama2-7b': '1e-05',
}

# Location of the aggregated evaluation log relative to a checkpoint folder.
eval_log_relpath = 'eval_results/ds_size300/eval_log_aggregated.json'

# df_dict[model][algo][rate] -> frame whose cells are lists of values, one per
# checkpoint, with the fine-tuned baseline at position 0.
df_dict = {}
for model in model_family:
    df_dict[model] = {}

    # The fine-tuned baseline depends only on (model, rate), not on the
    # unlearning algorithm, so load and aggregate it once per rate and not
    # once per (algo, rate).
    mean_ft_by_rate = {}
    for rate in forget_rates:
        ft_df = pd.read_json(join(ft_model_path[model], eval_log_relpath))
        ft_df = get_forget_quality(ft_df, retain_df_dict[model][rate])
        mean_ft_by_rate[rate] = get_mean_df(ft_df)

    for algo in algorithms:
        df_dict[model][algo] = {}
        for rate in forget_rates:
            retain_df = retain_df_dict[model][rate]
            mean_ft_df = mean_ft_by_rate[rate]

            subfolder1 = join(ft_model_path[model], f'{algo}_1e-05_{rate}')
            # Collect the checkpoint sub-folders in natural (numeric) order.
            ckpt_folders = natsorted([name for name in os.listdir(subfolder1) if 'checkpoint' in name])

            # Position 0 is the baseline, followed by one frame per checkpoint.
            ckpt_df_list = [mean_ft_df]
            for ckpt in ckpt_folders:
                eval_result_path = join(subfolder1, ckpt, eval_log_relpath)
                eval_result = pd.read_json(eval_result_path)
                eval_result = get_forget_quality(eval_result, retain_df)
                mean_df = get_mean_df(eval_result)
                ckpt_df_list.append(mean_df)

            ckpt_df = pd.DataFrame(index=mean_ft_df.index, columns=mean_ft_df.columns)
            for column in ckpt_df.columns:
                for index in ckpt_df.index:
                    # Gather this cell's value across all frames into one list.
                    ckpt_df.at[index, column] = [df.at[index, column] for df in ckpt_df_list]

            df_dict[model][algo][rate] = ckpt_df

In [None]:
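# Sanity check (disabled): for every retain model and eval task, the stored truth_ratio should equal exp(avg_paraphrased_loss - mean(average_perturb_loss)).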
# for model in model_family:
#     for rate in forget_rates:
#         df = retain_df_dict[model][rate]
#         for eval_task, metrics in df.items():
#             truth_ratio = np.array(list(metrics['truth_ratio'].values()))
#             avg_paraphrased_loss = np.array(list(metrics['avg_paraphrased_loss'].values()))
#             avg_perturb_loss = np.array(list(metrics['average_perturb_loss'].values())).mean(-1)
#             truth_ratio2 = np.exp(avg_paraphrased_loss - avg_perturb_loss)
#             assert np.allclose(truth_ratio, truth_ratio2)


In [None]:
# Tick label shown for each plotted checkpoint position (0 = model before
# unlearning), per forget split.
ckpt_step_map = {
    'forget01': {
        0: 0,
        1: 1,
        2: 2,
        3: 3,
        4: 4,
        5: 5
    },
    'forget05': {
        0: 0,
        1: 6,
        2: 12,
        3: 18,
        4: 24,
        5: 30,
    },
    'forget10': {
        0: 0,
        1: 12,
        2: 24,
        3: 36,
        4: 48,
        5: 60,
    }
}
# Name used in the output file for each algorithm key.
# NOTE(review): several keys map to a different algorithm name (e.g. 'KL' -> 'oracle');
# presumably intentional for the paper's naming -- confirm.
save_alg_name = {
    'grad_ascent': 'grad_ascent',
    'grad_diff': 'KL',
    'idk': 'dpo',
    'KL': 'oracle'
}
# y-axis label for each plotted metric; one panel per metric, left to right.
label_names = {
    'rougeL_recall': 'ROUGE',
    'avg_gt_prob': 'Probability',
    'truth_ratio': 'Truth Ratio',
}
checkpoints = [0, 1, 2, 3, 4, 5]
n_ckpts = len(checkpoints)

for model in model_family:
    fig_folder = f'./figure/all_metrics/{model}'
    for algo in algorithms:
        for rate in forget_rates:
            fig = None
            try:
                ckpt_df = df_dict[model][algo][rate]

                # Both percentages are integers so the two legend entries are
                # formatted alike (previously '99.0%' next to '1%').
                forget_rate = int(rate.split('forget')[-1])
                retain_rate = 100 - forget_rate

                # (eval task, legend label, line style) in drawing order.
                series_specs = [
                    ('eval_real_world_wo_options.json', 'World Facts',
                     dict(color=colors[2], marker='o')),
                    ('eval_real_author_wo_options.json', 'Real Authors',
                     dict(color=colors[0], marker='x')),
                    ('eval_log.json', f'Retain Set ({retain_rate}%)',
                     dict(color=colors[-1], marker='*')),
                    ('eval_log_forget.json', f'Forget Set ({forget_rate}%)',
                     dict(color=colors[6], marker='^', linestyle='--')),
                ]
                ckpt_steps = [str(ckpt_step_map[rate][c]) for c in checkpoints]

                fig, ax = plt.subplots(1, 3, figsize=(15, 3), sharey=True)

                for i, m in enumerate(['rougeL_recall', 'avg_gt_prob', 'truth_ratio']):
                    for task, label, style in series_specs:
                        ax[i].plot(
                            checkpoints,
                            ckpt_df.at[m, task][:n_ckpts],
                            # Label only the first panel so the figure-level
                            # legend lists every series exactly once.
                            label=label if i == 0 else None,
                            linewidth=2,
                            markersize=8,
                            **style,
                        )

                    # Show every second checkpoint, labelled with its step number.
                    ax[i].set_xticks(checkpoints[::2], ckpt_steps[::2])
                    ax[i].set_ylim([-0.1, 1.1])
                    ax[i].set_ylabel(label_names[m], fontsize=fs)
                    for side in ('bottom', 'left', 'top', 'right'):
                        ax[i].spines[side].set_color('black')

                # Figure-level decoration and saving happen once, after all
                # three panels are drawn. Inside the panel loop they stacked
                # three legends and wrote the file three times.
                ax[1].set_xlabel('Unlearning Steps', fontsize=fs)
                fig.legend(loc='upper center', bbox_to_anchor=(0.484, 1.15), ncol=4, fontsize=fs)

                Path(fig_folder).mkdir(parents=True, exist_ok=True)
                fig.savefig(f'{fig_folder}/1GPU_{save_alg_name[algo]}_1e-05_{rate}_all3metric.pdf', format='pdf', bbox_inches='tight')
            except Exception as e:
                # Best effort: report the failing combination and carry on with the rest.
                print(f'Error in {model} {algo} {rate} ')
                print(e)
            finally:
                # Always release the figure, including when plotting failed half-way.
                if fig is not None:
                    plt.close(fig)
                


