In [None]:
import json
import os
from statistics import mean
from collections import defaultdict
import pandas as pd

# 初始化一个字典来存储每个 model 的数据
model_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

def parse_filename(file_name):
    """从文件名中解析 defender、model_name、attacker、num_prompts 和 time_str"""
    parts = file_name.split('_')
    defender = parts[0] + parts[1] + parts[2] +    parts[3]
    model_name = parts[4]
    attacker = parts[5]
    num_prompts = parts[6]
    time_str = parts[7]
    return defender, model_name, attacker, num_prompts, time_str

def process_folder(folder_path, file_name):
    gpt_scores = []
    dic_score_false_count = 0
    eval_res_label_1_count = 0
    total_entries = 0
    
    # 找到以 _safe_eval.json 结尾的文件
    for file in os.listdir(folder_path):
        if file.endswith('_safe_eval.json'):
            safe_eval_file = os.path.join(folder_path, file)
            with open(safe_eval_file, 'r') as f:
                data = json.load(f)
                
                # 确保 data 是一个列表
                if isinstance(data, list):
                    for entry in data:
                        # 统计 GPT_score 平均值
                        if "GPT_score" in entry:
                            gpt_scores.append(entry["GPT_score"])

                        # 统计 dic_score 为 false 的占比
                        if "dic_score" in entry and entry["dic_score"] is False:
                            dic_score_false_count += 1

                        # 统计 eval_res 中 label 等于 "LABEL_1" 的比例
                        if "eval_res" in entry:
                            eval_res = entry["eval_res"]

                            # 如果 eval_res 是一个列表，遍历列表中的每个元素
                            if isinstance(eval_res, list):
                                for res in eval_res:
                                    total_entries += 1
                                    if isinstance(res, dict) and res.get("label") == "LABEL_1":
                                        eval_res_label_1_count += 1

                            # 如果 eval_res 是一个字典，直接检查 label
                            elif isinstance(eval_res, dict):
                                total_entries += 1
                                if eval_res.get("label") == "LABEL_1":
                                    eval_res_label_1_count += 1
    
    # 计算结果
    avg_gpt_score = mean(gpt_scores) if gpt_scores else 0
    dic_score_false_ratio = dic_score_false_count / len(gpt_scores) if gpt_scores else 0
    label_1_ratio = eval_res_label_1_count / total_entries if total_entries else 0
    
    # 将结果保存到相应的 model、defender 和 attacker
    defender, model_name, attacker, num_prompts, time_str = parse_filename(file_name)
    model_data[model_name][defender][attacker].append({
        "num_prompts": num_prompts,
        "time_str": time_str,
        "average_GPT_score": avg_gpt_score,
        "dic_score_false_ratio": dic_score_false_ratio,
        "label_1_ratio": label_1_ratio
    })

# Function to create the summary table
def create_summary_table(data):
    # 创建一个空的数据框
    columns = set()
    rows = set()
    
    # 首先收集所有的 defender 和 attacker 名称
    for defender, attackers in data.items():
        rows.add(defender)
        for attacker in attackers.keys():
            columns.add(attacker)
    
    # 创建一个 pandas DataFrame
    table = pd.DataFrame(index=list(rows), columns=list(columns))
    
    # 填充表格数据
    for defender, attackers in data.items():
        for attacker, entries in attackers.items():
            cell_data = []
            for entry in entries:
                avg_gpt_score = entry['average_GPT_score']
                label_1_ratio = entry['label_1_ratio'] * 100  # 转换为百分比
                dic_score_false_ratio = entry['dic_score_false_ratio'] * 100  # 转换为百分比
                # 格式化成所需的字符串
                cell_data.append(f"{avg_gpt_score:.2f} ({label_1_ratio:.2f}\%) ({dic_score_false_ratio:.2f}\%)")
            
            # 将数据放到表格的单元格里
            table.at[defender, attacker] = "\n".join(cell_data)
    
    return table
# 遍历主文件夹中的所有子文件夹并处理
base_dir = '/home/kz34/Yang_Ouyang_Projects/ICLR2025/jailbreaking_related/SafeDecoding/exp_unlearning_baseline'

for folder in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder)
    if os.path.isdir(folder_path):
        if folder == 'previous_result':
            continue
        # if "GCG" not in folder and "PAIR" not in folder:
            # continue
     
        print(f"Processing folder: {folder}")
        process_folder(folder_path, folder)

# 为每个 model_name 创建并保存表格
for model_name, defenders in model_data.items():
    summary_table = create_summary_table(defenders)
    print(f"Model: {model_name}")
    display(summary_table)  # Use display to properly format and show the tables in Jupyter Notebook


Processing folder: no_random_unlearning_1_mistral_PAIR_48_2024-12-16 22:09:09
Processing folder: no_random_unlearning_1_llama2_GCG_104_2024-12-16 23:13:34
Processing folder: no_random_unlearning_1_mistral_DeepInception_50_2024-12-16 21:44:25
Processing folder: no_random_unlearning_2_llama2_AdvBench_104_2024-12-16 22:28:02
Processing folder: no_random_unlearning_1_mistral_HEx-PHI_290_2024-12-16 21:44:23
Processing folder: no_random_unlearning_2_llama2_DeepInception_50_2024-12-16 23:01:08
Processing folder: no_random_unlearning_2_mistral_GCG_104_2024-12-16 21:44:22
Processing folder: no_random_unlearning_1_llama2_AdvBench_104_2024-12-16 22:27:18
Processing folder: no_random_unlearning_2_mistral_AdvBench_104_2024-12-17 00:34:00
Processing folder: no_random_unlearning_1_llama2_DeepInception_50_2024-12-16 22:59:58
Processing folder: no_random_unlearning_1_llama2_HEx-PHI_290_2024-12-16 22:30:36
Processing folder: no_random_unlearning_2_mistral_HEx-PHI_290_2024-12-16 21:44:22
Processing folde

Unnamed: 0,AdvBench,GCG,HEx-PHI,DeepInception,PAIR
norandomunlearning2,0.00 (0.00\%),3.46 (75.00\%),2.79 (60.00\%),0.00 (0.00\%),4.21 (97.92\%)
norandomunlearning1,3.03 (73.08\%),3.96 (92.31\%),3.05 (73.45\%),4.44 (100.00\%),4.29 (95.83\%)


Model: llama2


Unnamed: 0,AdvBench,HEx-PHI,GCG,DeepInception,PAIR
norandomunlearning2,1.00 (0.00\%),1.18 (2.07\%),1.90 (24.04\%),3.18 (62.00\%),2.11 (38.64\%)
norandomunlearning1,1.00 (0.00\%),1.16 (2.07\%),1.97 (25.00\%),3.14 (62.00\%),2.14 (38.64\%)


In [26]:
import json
import os
from statistics import mean
from collections import defaultdict
import pandas as pd

# 初始化一个字典来存储每个 model 的数据
model_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

def parse_filename(file_name):
    """从文件名中解析 defender、model_name、attacker、num_prompts 和 time_str"""
    parts = file_name.split('_')
    defender = parts[0]  # 第一部分是 defender
    model_name = parts[1]
    attacker = parts[2]
    num_prompts = parts[3]
    time_str = parts[4]
    return defender, model_name, attacker, num_prompts, time_str

def process_folder(folder_path, file_name):
    gpt_scores = []
    dic_score_false_count = 0
    eval_res_label_1_count = 0
    total_entries = 0
    
    # 找到以 _safe_eval.json 结尾的文件
    for file in os.listdir(folder_path):
        if file.endswith('_safe_eval.json'):
            safe_eval_file = os.path.join(folder_path, file)
            with open(safe_eval_file, 'r') as f:
                data = json.load(f)
                
                # 确保 data 是一个列表
                if isinstance(data, list):
                    for entry in data:
                        # 统计 GPT_score 平均值
                        if "GPT_score" in entry:
                            gpt_scores.append(entry["GPT_score"])

                        # 统计 dic_score 为 false 的占比
                        if "dic_score" in entry and entry["dic_score"] is False:
                            dic_score_false_count += 1

                        # 统计 eval_res 中 label 等于 "LABEL_1" 的比例
                        if "eval_res" in entry:
                            eval_res = entry["eval_res"]

                            # 如果 eval_res 是一个列表，遍历列表中的每个元素
                            if isinstance(eval_res, list):
                                for res in eval_res:
                                    total_entries += 1
                                    if isinstance(res, dict) and res.get("label") == "LABEL_1":
                                        eval_res_label_1_count += 1

                            # 如果 eval_res 是一个字典，直接检查 label
                            elif isinstance(eval_res, dict):
                                total_entries += 1
                                if eval_res.get("label") == "LABEL_1":
                                    eval_res_label_1_count += 1
    
    # 计算结果
    avg_gpt_score = mean(gpt_scores) if gpt_scores else 0
    dic_score_false_ratio = dic_score_false_count / len(gpt_scores) if gpt_scores else 0
    label_1_ratio = eval_res_label_1_count / total_entries if total_entries else 0
    
    # 将结果保存到相应的 model、defender 和 attacker
    defender, model_name, attacker, num_prompts, time_str = parse_filename(file_name)
    model_data[model_name][defender][attacker].append({
        "num_prompts": num_prompts,
        "time_str": time_str,
        "average_GPT_score": avg_gpt_score,
        "dic_score_false_ratio": dic_score_false_ratio,
        "label_1_ratio": label_1_ratio
    })

# Function to create the summary table
def create_summary_table(data):
    # 创建一个空的数据框
    columns = set()
    rows = set()
    
    # 首先收集所有的 defender 和 attacker 名称
    for defender, attackers in data.items():
        rows.add(defender)
        for attacker in attackers.keys():
            columns.add(attacker)
    
    # 创建一个 pandas DataFrame
    table = pd.DataFrame(index=list(rows), columns=list(columns))
    
    # 填充表格数据
    for defender, attackers in data.items():
        for attacker, entries in attackers.items():
            cell_data = []
            for entry in entries:
                avg_gpt_score = entry['average_GPT_score']
                label_1_ratio = entry['label_1_ratio'] * 100  # 转换为百分比
                dic_score_false_ratio = entry['dic_score_false_ratio'] * 100  # 转换为百分比
                # 格式化成所需的字符串
                cell_data.append(f"{avg_gpt_score:.2f} ({label_1_ratio:.2f}%) ({dic_score_false_ratio:.2f}%)")
            
            # 将数据放到表格的单元格里
            table.at[defender, attacker] = "\n".join(cell_data)
    
    return table
# 遍历主文件夹中的所有子文件夹并处理
base_dir = '/home/kz34/Yang_Ouyang_Projects/ICLR2025/jailbreaking_related/SafeDecoding/exp_outputs_new'

for folder in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder)
    if os.path.isdir(folder_path):
        if folder == 'previous_result':
            continue
        if "GCG" in folder:
            continue
        if "PAIR" in folder:
            continue
        print(f"Processing folder: {folder}")
        process_folder(folder_path, folder)

# 为每个 model_name 创建并保存表格
for model_name, defenders in model_data.items():
    summary_table = create_summary_table(defenders)
    print(f"Model: {model_name}")
    display(summary_table)  # Use display to properly format and show the tables in Jupyter Notebook


Processing folder: Paraphrase_llama2_DeepInception_50_2024-10-15 14:26:36
Processing folder: PPL_llama2_DeepInception_50_2024-10-15 13:09:05
Processing folder: SafeDecoding_llama2_AdvBench_104_2024-10-15 12:35:49
Processing folder: PPL_llama2_AdvBench_104_2024-10-15 12:35:49
Processing folder: Self-Exam_llama2_DeepInception_50_2024-10-15 14:39:31
Processing folder: Retokenization_llama2_AdvBench_104_2024-10-15 12:35:47
Processing folder: Retokenization_llama2_DeepInception_50_2024-10-15 14:26:36
Processing folder: LayerBugFixer_mistral_AdvBench_104_2024-10-15 14:36:49
Processing folder: LayerBugFixer_llama2_Just-Eval_1000_2024-10-15 22:44:07
Processing folder: SafeDecoding_llama2_DeepInception_50_2024-10-15 12:36:02
Processing folder: LayerBugFixer_mistral_Just-Eval_1000_2024-10-15 14:36:55
Processing folder: Self-Exam_llama2_AdvBench_104_2024-10-15 17:27:48
Processing folder: nodefense_mistral_AdvBench_104_2024-10-15 23:09:32
Processing folder: Self-Reminder_llama2_AdvBench_104_2024-1

Unnamed: 0,DeepInception,Just-Eval,AdvBench
Retokenization,3.20 (88.00%) (88.00%),,1.33 (7.69%) (13.46%)
Self-Reminder,1.00 (0.00%) (0.00%),,1.00 (0.00%) (0.00%)
Self-Exam,1.06 (2.00%) (2.00%),,1.00 (0.00%) (0.00%)
SafeDecoding,1.04 (0.00%) (0.00%),,0.00 (0.00%) (0.00%)
LayerBugFixer,,0.00 (0.00%) (0.00%),
PPL,3.12 (62.00%) (62.00%),,1.00 (0.00%) (0.00%)
ICD,1.00 (0.00%) (0.00%),,0.00 (0.00%) (0.00%)
Paraphrase,0.00 (0.00%) (0.00%),,0.00 (0.00%) (0.00%)


Model: mistral


Unnamed: 0,Just-Eval,AdvBench
LayerBugFixer,0.00 (0.00%) (0.00%),3.27 (14.42%) (80.77%)
nodefense,,3.06 (4.81%) (74.04%)


In [14]:
model_data

defaultdict(<function __main__.<lambda>()>,
            {'mistral': defaultdict(<function __main__.<lambda>.<locals>.<lambda>()>,
                         {'LayerBugFixer': defaultdict(list,
                                      {'HEx-PHI': [{'num_prompts': '290',
                                         'time_str': '2024-10-14 18:18:40',
                                         'average_GPT_score': 3.075862068965517,
                                         'dic_score_false_ratio': 0.5896551724137931,
                                         'label_1_ratio': 0.2}],
                                       'AdvBench': [{'num_prompts': '104',
                                         'time_str': '2024-10-15 11:06:56',
                                         'average_GPT_score': 3.201923076923077,
                                         'dic_score_false_ratio': 0.7115384615384616,
                                         'label_1_ratio': 0.14423076923076922}],
                            

In [11]:
import json
import os
from statistics import mean
from collections import defaultdict
import pandas as pd

# 初始化一个字典来存储每个 model 的数据
model_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

def parse_filename(file_name):
    """从文件名中解析 defender、model_name、attacker、num_prompts 和 time_str"""
    parts = file_name.split('_')
    defender = parts[0]  # 第一部分是 defender
    model_name = parts[1]
    attacker = parts[2]
    num_prompts = parts[3]
    time_str = parts[4]
    return defender, model_name, attacker, num_prompts, time_str

def process_folder(folder_path, file_name):
    gpt_scores = []
    dic_score_false_count = 0
    eval_res_label_1_count = 0
    total_entries = 0
    
    # 找到以 _safe_eval.json 结尾的文件
    for file in os.listdir(folder_path):
        if file.endswith('_safe_eval.json'):
            safe_eval_file = os.path.join(folder_path, file)
            with open(safe_eval_file, 'r') as f:
                data = json.load(f)
                
                # 确保 data 是一个列表
                if isinstance(data, list):
                    for entry in data:
                        # 统计 GPT_score 平均值
                        if "GPT_score" in entry:
                            gpt_scores.append(entry["GPT_score"])

                        # 统计 dic_score 为 false 的占比
                        if "dic_score" in entry and entry["dic_score"] is False:
                            dic_score_false_count += 1

                        # 统计 eval_res 中 label 等于 "LABEL_1" 的比例
                        if "eval_res" in entry:
                            for res in entry["eval_res"]:
                                total_entries += 1
                                if res.get("label") == "LABEL_1":
                                    eval_res_label_1_count += 1
    
    # 计算结果
    avg_gpt_score = mean(gpt_scores) if gpt_scores else 0
    dic_score_false_ratio = dic_score_false_count / len(gpt_scores) if gpt_scores else 0
    label_1_ratio = eval_res_label_1_count / total_entries if total_entries else 0
    
    # 将结果保存到相应的 model、defender 和 attacker
    defender, model_name, attacker, num_prompts, time_str = parse_filename(file_name)
    
    # 只处理 attacker 为 "GCG" 的数据
    if attacker == "GCG":
        model_data[model_name][defender][attacker].append({
            "num_prompts": num_prompts,
            "time_str": time_str,
            "average_GPT_score": avg_gpt_score,
            "dic_score_false_ratio": dic_score_false_ratio,
            "label_1_ratio": label_1_ratio
        })

# Function to create the summary table
def create_summary_table(data):
    # 创建一个空的数据框
    columns = set()
    rows = set()
    
    # 首先收集所有的 defender 和 attacker 名称
    for defender, attackers in data.items():
        rows.add(defender)
        for attacker in attackers.keys():
            columns.add(attacker)
    
    # 创建一个 pandas DataFrame
    table = pd.DataFrame(index=list(rows), columns=list(columns))
    
    # 填充表格数据
    for defender, attackers in data.items():
        for attacker, entries in attackers.items():
            cell_data = []
            for entry in entries:
                avg_gpt_score = entry['average_GPT_score']
                label_1_ratio = entry['label_1_ratio'] * 100  # 转换为百分比
                dic_score_false_ratio = entry['dic_score_false_ratio'] * 100  # 转换为百分比
                # 格式化成所需的字符串
                cell_data.append(f"{avg_gpt_score:.2f} ({label_1_ratio:.2f}%) ({dic_score_false_ratio:.2f}%)")
            
            # 将数据放到表格的单元格里
            table.at[defender, attacker] = "\n".join(cell_data)
    
    return table

# 遍历主文件夹中的所有子文件夹并处理
base_dir = '/home/kz34/Yang_Ouyang_Projects/ICLR2025/jailbreaking_related/SafeDecoding/exp_outputs'

for folder in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder)
    if os.path.isdir(folder_path):
        if folder == 'previous_result':
            continue
        print(f"Processing folder: {folder}")
        process_folder(folder_path, folder)

# 为每个 model_name 创建并保存表格
for model_name, defenders in model_data.items():
    summary_table = create_summary_table(defenders)
    print(f"Model: {model_name}")
    print(summary_table)
    
    # 将结果保存为 excel 文件
    summary_table.to_csv(f'{model_name}_defender_attacker_GCG_summary.csv')

Processing folder: LayerBugFixer_mistral_HEx-PHI_290_2024-10-14 18:18:40
Processing folder: Self-Reminder_mistral_HEx-PHI_290_2024-10-14 20:45:20
Processing folder: Self-Exam_llama2_HEx-PHI_290_2024-10-14 16:56:00
Processing folder: Retokenization_llama2_HEx-PHI_290_2024-10-14 23:52:46
Processing folder: SafeDecoding_mistral_Just-Eval_1000_2024-10-14 16:29:58
Processing folder: SafeDecoding_mistral_Just-Eval_1000_2024-10-14 16:03:33
Processing folder: nodefense_llama2_AdvBench_104_2024-10-14 23:49:59
Processing folder: Paraphrase_mistral_DeepInception_50_2024-10-14 21:50:52
Processing folder: nodefense_mistral_DeepInception_50_2024-10-14 23:45:09
Processing folder: ICD_llama2_Just-Eval_1000_2024-10-13 11:08:45
Processing folder: SafeDecoding_llama2_HEx-PHI_290_2024-10-14 23:52:57
Processing folder: PPL_mistral_HEx-PHI_290_2024-10-14 19:19:26
Processing folder: ICD_llama2_DeepInception_50_2024-10-15 07:54:57
Processing folder: SafeDecoding_mistral_HEx-PHI_290_2024-10-14 19:18:28
Process

In [12]:
summary_table

Unnamed: 0,DeepInception,AdvBench,HEx-PHI,Just-Eval
Retokenization,3.36 (90.00%) (90.00%)\n0.00 (0.00%) (0.00%),0.00 (0.00%) (0.00%),1.76 (15.52%) (30.69%),
Self-Reminder,1.00 (0.00%) (0.00%),0.00 (0.00%) (0.00%)\n0.00 (0.00%) (0.00%),,
Self-Exam,1.06 (2.00%) (2.00%),,0.00 (0.00%) (0.00%)\n1.00 (0.00%) (0.00%),0.00 (0.00%) (0.00%)
nodefense,3.10 (62.00%) (62.00%),1.00 (0.00%) (0.00%),1.18 (0.69%) (2.07%),0.00 (0.00%) (0.00%)
SafeDecoding,1.04 (0.00%) (0.00%),,1.14 (0.34%) (1.03%),0.00 (0.00%) (0.00%)
PPL,3.10 (62.00%) (62.00%),0.00 (0.00%) (0.00%)\n0.00 (0.00%) (0.00%),0.00 (0.00%) (0.00%)\n1.18 (0.69%) (2.07%),0.00 (0.00%) (0.00%)
ICD,1.00 (0.00%) (0.00%),,1.04 (0.00%) (0.34%),0.00 (0.00%) (0.00%)
Paraphrase,2.86 (54.00%) (56.00%),,1.27 (3.10%) (6.55%),0.00 (0.00%) (0.00%)
