In [3]:
import json
import os
from statistics import mean
from collections import defaultdict
import pandas as pd

# 初始化一个字典来存储每个 model 的数据
model_data = defaultdict(lambda: defaultdict(lambda: defaultdict(list)))

def parse_filename(file_name):
    """从文件名中解析 defender、model_name、attacker、num_prompts 和 time_str"""
    parts = file_name.split('_')
    defender = parts[0]  # 第一部分是 defender
    model_name = parts[1]
    attacker = parts[2]
    num_prompts = parts[3]
    time_str = parts[4]
    return defender, model_name, attacker, num_prompts, time_str

def process_folder(folder_path, file_name):
    gpt_scores = []
    dic_score_false_count = 0
    eval_res_label_1_count = 0
    total_entries = 0
    
    # 找到以 _safe_eval.json 结尾的文件
    for file in os.listdir(folder_path):
        if file.endswith('_safe_eval.json'):
            safe_eval_file = os.path.join(folder_path, file)
            with open(safe_eval_file, 'r') as f:
                data = json.load(f)
                
                # 确保 data 是一个列表
                if isinstance(data, list):
                    for entry in data:
                        # 统计 GPT_score 平均值
                        if "GPT_score" in entry:
                            gpt_scores.append(entry["GPT_score"])

                        # 统计 dic_score 为 false 的占比
                        if "dic_score" in entry and entry["dic_score"] is False:
                            dic_score_false_count += 1

                        # 统计 eval_res 中 label 等于 "LABEL_1" 的比例
                        if "eval_res" in entry:
                            if entry.get("label") == "LABEL_1":
                                eval_res_label_1_count += 1
    
    # 计算结果
    avg_gpt_score = mean(gpt_scores) if gpt_scores else 0
    dic_score_false_ratio = dic_score_false_count / len(gpt_scores) if gpt_scores else 0
    label_1_ratio = eval_res_label_1_count / total_entries if total_entries else 0
    
    # 将结果保存到相应的 model、defender 和 attacker
    defender, model_name, attacker, num_prompts, time_str = parse_filename(file_name)
    model_data[model_name][defender][attacker].append({
        "num_prompts": num_prompts,
        "time_str": time_str,
        "average_GPT_score": avg_gpt_score,
        "dic_score_false_ratio": dic_score_false_ratio,
        "label_1_ratio": label_1_ratio
    })

# Function to create the summary table
def create_summary_table(data):
    # 创建一个空的数据框
    columns = set()
    rows = set()
    
    # 首先收集所有的 defender 和 attacker 名称
    for defender, attackers in data.items():
        rows.add(defender)
        for attacker in attackers.keys():
            columns.add(attacker)
    
    # 创建一个 pandas DataFrame
    table = pd.DataFrame(index=list(rows), columns=list(columns))
    
    # 填充表格数据
    for defender, attackers in data.items():
        for attacker, entries in attackers.items():
            cell_data = []
            for entry in entries:
                avg_gpt_score = entry['average_GPT_score']
                label_1_ratio = entry['label_1_ratio'] * 100  # 转换为百分比
                dic_score_false_ratio = entry['dic_score_false_ratio'] * 100  # 转换为百分比
                # 格式化成所需的字符串
                cell_data.append(f"{avg_gpt_score:.2f} ({label_1_ratio:.2f}%) ({dic_score_false_ratio:.2f}%)")
            
            # 将数据放到表格的单元格里
            table.at[defender, attacker] = "\n".join(cell_data)
    
    return table
# 遍历主文件夹中的所有子文件夹并处理
base_dir = '/home/kz34/Yang_Ouyang_Projects/ICLR2025/jailbreaking_related/SafeDecoding/exp_out_combination'

for folder in os.listdir(base_dir):
    folder_path = os.path.join(base_dir, folder)
    if os.path.isdir(folder_path):
        if folder == 'previous_result':
            continue
        process_folder(folder_path, folder)

# 为每个 model_name 创建并保存表格
for model_name, defenders in model_data.items():
    summary_table = create_summary_table(defenders)
    print(f"Model: {model_name}")
    print(summary_table)
    
    # 将结果保存为 excel 文件
    summary_table.to_csv(f'{model_name}_defender_attacker_summary.csv')

Model: mistral
                                                                  HEx-PHI  \
LayerBugFixer+Paraphrase       0.00 (0.00%) (0.00%)\n0.00 (0.00%) (0.00%)   
Unlearning+Self-Exam                                1.92 (0.00%) (43.10%)   
SafeDecoding+Paraphrase                             3.07 (0.00%) (67.93%)   
LayerBugFixer+Self-Exam       2.15 (0.00%) (39.66%)\n0.00 (0.00%) (0.00%)   
Unlearning+Paraphrase                                0.00 (0.00%) (0.00%)   
SafeDecoding+Self-Exam                              3.01 (0.00%) (67.93%)   
Retokenization+LayerBugFixer                                          NaN   

                                                                           MMLU  \
LayerBugFixer+Paraphrase                                                    NaN   
Unlearning+Self-Exam                                                        NaN   
SafeDecoding+Paraphrase                                                     NaN   
LayerBugFixer+Self-Exam             