In [None]:
import json
import random
import re
from typing import Optional

# Lexicon used for adversarial data augmentation.
# Maps a standard (canonical) term to the list of slang / obfuscated variants
# that may be substituted for it. Every variant list must be free of repeated
# entries: variants are drawn with a uniform random choice, so a repeated entry
# would silently double its sampling weight.
# NOTE(review): one variant is listed under two different standard words (in
# the race/nationality section), so the reverse lookup built from this table
# merges both meanings for it -- confirm that this is intended.
SLANG_DICTIONARY = {
    # --- 1. Slang and variants targeting sex / gender groups ---
    "女人": ["国女", "蝈女", "幕刃", "母人", "母狗", "母猪", "母的", "小仙女", "小仙驴", "小酰钕", "集美", "普信女", "easy girl", "eg", "lkd", "烂裤裆", "拳师", "田园女权", "女拳", "微博女厕所"],
    "男人": ["国男", "蝈男", "郭楠", "普信男", "xdz", "金针菇", "男权", "男拳", "供养者", "接盘侠"],
    "舔狗": ["龟男", "舔狗龟男"],

    # --- 2. Slang and variants targeting LGBTQ+ groups ---
    # (a repeated entry was removed from this list so sampling stays uniform)
    "同性恋": ["同杏恋", "童姓恋", "txl", "通讯录", "同志", "铜", "gay", "基佬", "给佬", "男同", "男铜", "深柜", "骗婚gay", "基蛆", "死基佬"],
    
    # --- 3. Slang and variants targeting race / nationality ---
    "黑人": ["黑鬼", "黑皮", "黑gou", "黑猴", "黑猩猩", "嘿畜狌", "尼哥", "nigger", "默", "昆仑奴", "倪哥", "黑杂", "洋垃圾"],
    "日本人": ["小日子", "小八嘎", "太君", "鬼子", "倭寇"],
    "韩国人": ["棒子", "西八"],
    "白人": ["白皮", "白左"],
    "外国人": ["洋垃圾", "洋大人", "歪果仁"],
    "犹太人": ["犹太狗", "油太人"],
    "穆斯林": ["绿绿"],
    
    # --- 4. Slang and variants targeting regions ---
    "河南人": ["井盖", "荷兰人"],
    "上海人": ["沪国人", "爱丁堡人", "上大人"],
    "东北人": ["东百太君", "东北狗"],
    "安徽人": ["白完", "皖人"],
    "福建人": ["胡建人"],
    "陕西人": ["陕蛋蛋", "陕狗"],
    "广东人": ["南蛮"],
    "台湾人": ["湾湾", "台巴子"],
    
    # --- 5. Other common slang ---
    "国家": ["郭嘉"],
    "汉奸": ["罕见", "焊奸"],
}

# Reverse lookup so that substitution also works in the other direction:
#   variant -> [standard word, sibling variants of the same standard word, ...]
# A variant listed under several standard words accumulates the candidates of
# all of them. Candidates are de-duplicated (first-seen order is kept) so that
# a repeated entry in SLANG_DICTIONARY cannot skew the uniform random choice
# made later, and a variant is never offered as a replacement for itself.
REVERSE_SLANG_MAP = {}
for standard_word, variant_list in SLANG_DICTIONARY.items():
    for variant in variant_list:
        candidates = REVERSE_SLANG_MAP.setdefault(variant, [])
        # The variant may be swapped for the standard word or any sibling.
        for candidate in [standard_word, *variant_list]:
            if candidate != variant and candidate not in candidates:
                candidates.append(candidate)


def _substitute_once(sentence, replacements, rng):
    """Replace the first occurrence of each known term in *sentence* in one pass.

    ``replacements`` maps a term to a list of candidate substitutes; one
    candidate is drawn with ``rng.choice`` for every distinct term found.
    """
    # Longest terms first, so a short term can never match inside a longer one.
    # Empty keys and empty candidate lists are ignored instead of crashing.
    terms = sorted(
        (term for term, options in replacements.items() if term and options),
        key=len,
        reverse=True,
    )
    if not terms:
        return sentence

    pattern = re.compile("|".join(map(re.escape, terms)))
    already_replaced = set()

    def swap(match):
        term = match.group(0)
        if term in already_replaced:
            # Only the first occurrence of a term is rewritten.
            return term
        already_replaced.add(term)
        return rng.choice(replacements[term])

    # A single left-to-right scan of the ORIGINAL text: text inserted by one
    # replacement is never re-scanned, so replacements cannot cascade.
    return pattern.sub(swap, sentence)


def augment_adversarial_sentence(
    sentence: str,
    augmentation_prob: float = 0.7,
    *,
    slang_dictionary: Optional[dict[str, list[str]]] = None,
    reverse_slang_map: Optional[dict[str, list[str]]] = None,
    rng: Optional[random.Random] = None,
) -> list[str]:
    """Create adversarial variants of one sentence.

    Two independent augmentations are each applied with probability
    ``augmentation_prob``:

    1. Lexicon substitution, producing up to two candidates: one rewriting
       standard words into a random variant, one rewriting variants into the
       standard word or a sibling variant. Only the first occurrence of each
       term is rewritten.
    2. Prefixing the sentence with a randomly chosen neutral filler phrase.

    Args:
        sentence: Text to augment.
        augmentation_prob: Probability of applying each of the two
            augmentations. Each is decided by its own ``rng.random()`` draw.
        slang_dictionary: ``standard word -> variants`` mapping. Defaults to
            the module-level ``SLANG_DICTIONARY``.
        reverse_slang_map: ``variant -> replacement candidates`` mapping.
            Defaults to the module-level ``REVERSE_SLANG_MAP``; it is NOT
            derived from a custom ``slang_dictionary``.
        rng: Source of randomness (for example ``random.Random(seed)``) for
            reproducible output. Defaults to the global ``random`` module.

    Returns:
        The distinct augmented sentences that differ from ``sentence``, in a
        deterministic order: forward substitution, reverse substitution, then
        the prefixed sentence. May be empty.
    """
    if slang_dictionary is None:
        slang_dictionary = SLANG_DICTIONARY
    if reverse_slang_map is None:
        reverse_slang_map = REVERSE_SLANG_MAP
    if rng is None:
        rng = random

    candidates = []

    # --- Method 1: adversarial lexicon substitution ---
    # NOTE(review): matching is plain substring matching, so very short keys
    # (single characters, short ASCII abbreviations) can also hit unrelated
    # words -- consider boundary checks or pruning such keys.
    if rng.random() < augmentation_prob:
        # standard word -> variant
        candidates.append(_substitute_once(sentence, slang_dictionary, rng))
        # variant -> standard word / sibling variant
        candidates.append(_substitute_once(sentence, reverse_slang_map, rng))

    # --- Method 2: prepend a neutral distractor phrase ---
    if rng.random() < augmentation_prob:
        distractors = ["不是我说的，", "我就问一句，", "有一说一，", "典中典，", "笑死，"]
        candidates.append(f"{rng.choice(distractors)}{sentence}")

    # dict.fromkeys de-duplicates while keeping insertion order, so the result
    # no longer depends on string hash randomisation as a set would.
    return [text for text in dict.fromkeys(candidates) if text != sentence]


# Driver: read a JSONL chat dataset, copy every record through unchanged and
# append augmented copies whose user turn has been rewritten.
# Assumes each record looks like {"messages": [m0, user_msg, m2]} with the user
# text at messages[1]["content"] -- TODO confirm against the dataset schema.
input_file = 'train_data.jsonl'
output_file = 'train_data_augmented.jsonl'
print(f"\n开始处理文件: {input_file}")

count_original = 0
count_augmented = 0

with open(input_file, 'r', encoding='utf-8') as f_in, \
     open(output_file, 'w', encoding='utf-8') as f_out:

    for line in f_in:
        # Blank lines carry no record.
        if line.strip() == "":
            continue

        data = json.loads(line)
        count_original += 1

        # The original record is always kept.
        f_out.write(f"{json.dumps(data, ensure_ascii=False)}\n")

        # Text of the user turn is what gets augmented.
        user_input_text = data["messages"][1]["content"]

        for text in augment_adversarial_sentence(user_input_text):
            # Drop empty results and anything identical to the source text.
            if not text or text == user_input_text:
                continue

            count_augmented += 1
            # Same first and third message, only the user turn is swapped.
            new_data_entry = {
                "messages": [
                    data["messages"][0],
                    {"role": "user", "content": text},
                    data["messages"][2],
                ]
            }
            serialized = json.dumps(new_data_entry, ensure_ascii=False)
            f_out.write(f"{serialized}\n")

print("\n数据增强完成！")
print(f"原始数据: {count_original} 条")
print(f"新增数据: {count_augmented} 条")
print(f"新文件已保存至: {output_file}")


开始处理文件: train_data.jsonl

数据增强完成！
原始数据: 4000 条
新增数据: 3951 条
新文件已保存至: train_data_augmented.jsonl
