# 中文诈骗场景命名实体识别(NER)训练
本 notebook 用于训练中文诈骗场景下的命名实体识别（NER）模型，以识别诈骗文本中的关键实体。

In [None]:
# Install the required libraries.
# NOTE(review): the import cell below pulls `load_metric` from `datasets`; that helper
# appears to have been dropped from recent `datasets` releases — pin an older version
# here if that import fails (TODO confirm against the installed version).
!pip install transformers datasets seaborn

In [None]:
# 导入基本库
import pandas as pd
import numpy as np
import os
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
import transformers  # 添加这个导入以支持EarlyStoppingCallback
from datasets import Dataset, load_metric
from sklearn.model_selection import train_test_split
import logging
from collections import Counter
import re
import json
import random
import io

# Configure logging at INFO level and create the logger used by this notebook
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Prefer the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
logger.info(f'使用设备: {device}')

## 数据加载
本项目使用专门针对中文诈骗场景的NER数据集，包含多种诈骗类型的文本和实体标注：
1. 中奖诈骗：涉及MONEY, LINK等标签
2. 冒充公检法：涉及NAME, PHONE, LOC等标签
3. 刷单诈骗：涉及MONEY, ACCOUNT等标签
4. 理财诈骗：涉及MONEY, PRODUCT, PERCENT等标签
5. 冒充客服：涉及PHONE, LINK等标签
6. 冒充领导：涉及NAME, MONEY, ACCOUNT等标签
7. 疫情诈骗：涉及ACCOUNT, PHONE等标签
8. 网络交友：涉及MONEY, ACCOUNT等标签

In [None]:
# 检测是否在Colab环境中
IN_COLAB = 'google.colab' in str(get_ipython())

if IN_COLAB:
    # 导入Colab专用库
    from google.colab import files
    from google.colab import drive
    import requests
    print("欢迎使用Google Colab训练NER模型！")
    
    # 挂载Google Drive
    drive.mount('/content/drive')
    
    # 下载项目数据
    !wget -q https://raw.githubusercontent.com/yourusername/fraud_bot/main/data/ner_training_data_enhanced.csv -O ner_training_data_enhanced.csv
    
    # 如果下载失败，提示上传
    if not os.path.exists('ner_training_data_enhanced.csv'):
        print("请上传NER训练数据CSV文件...")
        uploaded = files.upload()
        for filename in uploaded.keys():
            if filename.endswith('.csv'):
                !cp "{filename}" ner_training_data_enhanced.csv
                print(f"已将 {filename} 复制为 ner_training_data_enhanced.csv")
                break
else:
    print("您不在Google Colab环境中，将使用本地数据。")

In [None]:
# Load the dataset
import pandas as pd

# In Colab use the downloaded/uploaded file; locally use the repo-relative path
data_path = 'ner_training_data_enhanced.csv' if IN_COLAB else 'data/ner_training_data_enhanced.csv'


def _char_level_bio(text, entities):
    """Return a space-separated BIO tag string with exactly one tag per character.

    Args:
        text: The raw sentence.
        entities: Iterable of ``(surface, entity_type)`` pairs; the first
            occurrence of each surface string in ``text`` is tagged.

    Raises:
        ValueError: If a surface string does not occur in ``text``.
    """
    tags = ["O"] * len(text)
    for surface, entity_type in entities:
        start = text.find(surface)
        if start < 0:
            raise ValueError(f"entity {surface!r} not found in {text!r}")
        tags[start] = f"B-{entity_type}"
        for pos in range(start + 1, start + len(surface)):
            tags[pos] = f"I-{entity_type}"
    return " ".join(tags)


try:
    df = pd.read_csv(data_path)
    print(f"成功加载数据集，共{len(df)}条记录")
except Exception as e:
    # Deliberate best-effort: any load failure falls back to a tiny built-in sample
    print(f"加载数据集失败: {e}")
    print("使用备用示例数据...")
    # The fallback tags are generated from entity spans so that every sentence
    # carries one tag per character, which is what the alignment step indexes by.
    fallback_samples = [
        ("我刚刚接到一个自称是北京公安局的电话，说我涉嫌洗钱，要求我转账5万元到安全账户",
         [("北京公安局", "ORG"), ("5万元", "MONEY")]),
        ("有人自称是中国移动客服，说我的手机号码需要实名认证，让我点击链接并输入银行卡号",
         [("中国移动", "ORG")]),
        ("最近收到短信说我的微信账号在广州登录，需要点击链接确认",
         [("微信", "APP"), ("广州", "LOC")]),
        ("有快递员说我的包裹到了，让我支付关税2000元",
         [("快递员", "ROLE"), ("2000元", "MONEY")]),
        ("网站上说投资比特币可以年化收益30%，只需要投入10000元",
         [("网站", "CHANNEL"), ("比特币", "PRODUCT"), ("30%", "PERCENT"), ("10000元", "MONEY")]),
    ]
    data = [
        {"text": sample_text, "labels": _char_level_bio(sample_text, sample_entities)}
        for sample_text, sample_entities in fallback_samples
    ]
    df = pd.DataFrame(data)
    print(f"已生成{len(df)}条示例数据")

# Preview the first rows
print("\n数据预览:")
display(df.head())

## 数据预处理
处理数据格式并准备训练

In [None]:
# Show which columns the loaded frame has
print(f"数据格式: {df.columns.tolist()}")

# Fail on the first required column that is absent
required_columns = ["text", "labels"]
for col in required_columns:
    if col in df.columns:
        continue
    raise ValueError(f"数据集缺少必要的列: {col}")

# Dataset size
print(f"数据集大小: {len(df)}条记录")

# Flatten every whitespace-separated tag of every row into one list
all_labels = [tag for label_seq in df['labels'] for tag in label_seq.split()]

# Tag frequency, most common first
label_counter = Counter(all_labels)
print("标签分布:")
for label, count in label_counter.most_common():
    print(f"  {label}: {count}")

# Distinct tags in sorted order
unique_labels = sorted(set(all_labels))
print(f"唯一标签: {unique_labels}")

# Bidirectional tag <-> id maps; ids follow the sorted tag order
label_list = unique_labels
label2id = {name: idx for idx, name in enumerate(label_list)}
id2label = dict(enumerate(label_list))

print("标签映射:")
print(json.dumps(label2id, ensure_ascii=False, indent=2))

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility)
train_df, eval_df = train_test_split(df, test_size=0.2, random_state=42)
print(f"训练集: {len(train_df)}条, 验证集: {len(eval_df)}条")

# Wrap both pandas frames as `datasets.Dataset` objects
train_dataset, eval_dataset = (
    Dataset.from_pandas(frame) for frame in (train_df, eval_df)
)

# Tokenizer used by the preprocessing step
tokenizer = BertTokenizerFast.from_pretrained('bert-base-chinese')

In [None]:
# 数据预处理函数
def tokenize_and_align_labels(examples):
    """Tokenize a batch of texts and align per-character BIO tags to the tokens.

    Each token receives the tag of the character at which it starts, located
    through the tokenizer's offset mapping. This keeps the label sequence in
    step with the real ``input_ids`` (including merged digit/Latin word pieces)
    and gives it the same padded/truncated length as the inputs.

    Assumes ``examples["labels"]`` holds one whitespace-separated tag per
    character of the matching ``examples["text"]`` entry — TODO confirm against
    the CSV (texts containing spaces would shift the alignment).

    Args:
        examples: Batch dict with ``"text"`` (list of str) and ``"labels"``
            (list of space-separated tag strings).

    Returns:
        The tokenizer output with ``"labels"`` replaced by lists of label ids;
        special tokens, padding, and tokens without a tag get ``-100``.

    Raises:
        KeyError: If a tag is not present in the module-level ``label2id``.
    """
    tokenized_inputs = tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=128,
        is_split_into_words=False,
        # Character offsets are what lets us map tokens back to the tags
        return_offsets_mapping=True,
    )

    labels = []
    for offsets, label_seq in zip(tokenized_inputs["offset_mapping"], examples["labels"]):
        char_labels = label_seq.split()
        label_ids = []
        for start, end in offsets:
            # Zero-width offsets mark [CLS]/[SEP]/[PAD]; characters beyond the
            # provided tags have no gold label. Both are ignored by the loss.
            if start == end or start >= len(char_labels):
                label_ids.append(-100)
            else:
                label_ids.append(label2id[char_labels[start]])
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Run the tokenize-and-align step over both splits in batched mode
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

print("数据处理完成！")

## 模型定义与训练
选择合适的中文预训练模型并设置最优训练参数

In [None]:
# Candidate pretrained checkpoints
MODEL_CHOICES = {
    "bert": "bert-base-chinese",            # original Chinese BERT
    "roberta": "hfl/chinese-roberta-wwm-ext", # HIT/HFL Chinese RoBERTa
    "macbert": "hfl/chinese-macbert-base"    # Chinese MacBERT
}

# Which checkpoint to use; switch to "roberta" or "macbert" to try the other models
MODEL_NAME = "bert"
PRETRAINED_MODEL = MODEL_CHOICES[MODEL_NAME]
print(f"使用预训练模型: {PRETRAINED_MODEL}")

# Load the tokenizer for the selected checkpoint
# NOTE(review): the datasets were already tokenized earlier with the
# 'bert-base-chinese' tokenizer; confirm the vocabularies match before switching models.
tokenizer = BertTokenizerFast.from_pretrained(PRETRAINED_MODEL)

# Token-classification head sized to the label set, with both label maps attached
model = BertForTokenClassification.from_pretrained(
    PRETRAINED_MODEL, 
    num_labels=len(label_list), 
    id2label=id2label, 
    label2id=label2id
)
model.to(device)

# Training arguments tuned for the fraud NER task
training_args = TrainingArguments(
    output_dir=f"./{MODEL_NAME}_ner_antifraud",
    num_train_epochs=15,            # more training epochs
    per_device_train_batch_size=16, # larger batch size for a small dataset
    per_device_eval_batch_size=16,
    evaluation_strategy='steps',    # evaluate more frequently
    eval_steps=50,                  # evaluate every 50 steps
    save_strategy='steps',
    save_steps=50,
    logging_dir='./logs',
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='f1',     # select the best checkpoint by F1
    greater_is_better=True,
    warmup_ratio=0.1,               # warm up by ratio rather than a fixed step count
    weight_decay=0.01,
    learning_rate=3e-5,             # slightly higher learning rate
    label_smoothing_factor=0.1,
    fp16=torch.cuda.is_available(), # mixed precision when a GPU is available
    gradient_accumulation_steps=2,  # gradient accumulation
    save_total_limit=2              # NOTE(review): caps how many checkpoints are kept on disk, not "best 2" — confirm
)

In [None]:
# 改进的评估指标计算函数
def compute_metrics(eval_pred):
    """Turn raw evaluation output into entity-level P/R/F1 plus token accuracy.

    Args:
        eval_pred: A ``(predictions, labels)`` pair; predictions are arg-maxed
            over axis 2 and positions whose label id is ``-100`` are dropped.

    Returns:
        Dict with ``precision``, ``recall`` and ``f1`` (entity level) and
        ``token_accuracy`` (tag level).
    """
    scores, gold_ids = eval_pred
    pred_ids = np.argmax(scores, axis=2)

    # Keep only positions that carry a real label, then map ids back to tag names
    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        kept = [(p, g) for p, g in zip(pred_row, gold_row) if g != -100]
        true_predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[g] for _, g in kept])

    # Entity-level scores drive model selection; token accuracy is reported alongside
    entity_results = calculate_entity_metrics(true_labels, true_predictions)
    token_results = calculate_token_metrics(true_labels, true_predictions)

    return {
        "precision": entity_results["precision"],
        "recall": entity_results["recall"],
        "f1": entity_results["f1"],
        "token_accuracy": token_results["accuracy"],
    }

# 实体级别评估函数
def calculate_entity_metrics(true_labels, true_predictions):
    """Compute exact-match entity-level precision, recall and F1 from BIO tags.

    An entity is a ``B-X`` tag followed by zero or more ``I-X`` tags of the
    same type. Any other tag (``O``, an ``I-`` tag of a different type, or an
    unknown tag) closes the current entity, so extracted spans are always
    contiguous. ``I-`` tags with no open entity are ignored.

    Args:
        true_labels: List of gold tag sequences (lists of str).
        true_predictions: List of predicted tag sequences, parallel to
            ``true_labels``.

    Returns:
        Dict with overall ``precision``, ``recall``, ``f1`` and, for every
        entity type seen, ``<TYPE>_precision``, ``<TYPE>_recall``, ``<TYPE>_f1``.
    """
    results = {"precision": 0.0, "recall": 0.0, "f1": 0.0}

    def extract_entities(seq):
        """Return ``(entity_type, index_tuple)`` for each entity in ``seq``."""
        entities = []
        span = []
        span_type = None

        for i, tag in enumerate(seq):
            if tag.startswith("B-"):
                if span:
                    entities.append((span_type, tuple(span)))
                span = [i]
                span_type = tag[2:]
            elif tag.startswith("I-") and span and tag[2:] == span_type:
                span.append(i)
            else:
                # Anything that does not continue the open entity ends it;
                # this prevents gaps inside a span after a mismatched I- tag.
                if span:
                    entities.append((span_type, tuple(span)))
                span = []
                span_type = None

        if span:
            entities.append((span_type, tuple(span)))

        return entities

    def f1_score(precision, recall):
        """Harmonic mean of precision and recall, 0.0 when both are zero."""
        if precision + recall > 0:
            return 2 * precision * recall / (precision + recall)
        return 0.0

    true_by_type = Counter()
    pred_by_type = Counter()
    correct_by_type = Counter()

    for gold_seq, pred_seq in zip(true_labels, true_predictions):
        true_entities = extract_entities(gold_seq)
        pred_entities = extract_entities(pred_seq)

        true_by_type.update(entity_type for entity_type, _ in true_entities)
        pred_by_type.update(entity_type for entity_type, _ in pred_entities)

        # A prediction counts only if both type and exact span match
        gold_set = set(true_entities)
        correct_by_type.update(
            entity[0] for entity in pred_entities if entity in gold_set
        )

    # Overall (micro-averaged) scores
    total_true = sum(true_by_type.values())
    total_pred = sum(pred_by_type.values())
    total_correct = sum(correct_by_type.values())

    if total_pred > 0:
        results["precision"] = total_correct / total_pred
    if total_true > 0:
        results["recall"] = total_correct / total_true
    results["f1"] = f1_score(results["precision"], results["recall"])

    # Per-type scores
    for entity_type in sorted(set(true_by_type) | set(pred_by_type)):
        true_count = true_by_type[entity_type]
        pred_count = pred_by_type[entity_type]
        correct_count = correct_by_type[entity_type]

        precision = correct_count / pred_count if pred_count > 0 else 0.0
        recall = correct_count / true_count if true_count > 0 else 0.0

        results[f"{entity_type}_precision"] = precision
        results[f"{entity_type}_recall"] = recall
        results[f"{entity_type}_f1"] = f1_score(precision, recall)

    return results

# 标签级别评估函数
def calculate_token_metrics(true_labels, true_predictions):
    """Compute tag-level accuracy over paired gold/predicted tag sequences.

    Args:
        true_labels: List of gold tag sequences.
        true_predictions: List of predicted tag sequences, parallel to
            ``true_labels``.

    Returns:
        ``{"accuracy": value}`` where the value is the share of positions with
        identical tags, or ``0`` when there are no positions to compare.
    """
    compared = 0
    matched = 0

    for gold_seq, pred_seq in zip(true_labels, true_predictions):
        pairs = list(zip(gold_seq, pred_seq))
        compared += len(pairs)
        matched += sum(1 for gold_tag, pred_tag in pairs if gold_tag == pred_tag)

    if compared > 0:
        return {"accuracy": matched / compared}
    return {"accuracy": 0}

In [None]:
# Build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    # Token classification needs a collator that pads `labels` (with -100)
    # together with the inputs; the default padding collator only pads the
    # tokenizer fields and cannot batch label lists of differing lengths.
    data_collator=transformers.DataCollatorForTokenClassification(tokenizer),
    # Stop early when the monitored metric stops improving
    callbacks=[transformers.EarlyStoppingCallback(early_stopping_patience=3)]
)

# Train the model
trainer.train()

# Evaluate the model
eval_results = trainer.evaluate()
print(f"评估结果: {eval_results}")

# Print the precision/recall/F1 entries of the evaluation result
for metric, value in eval_results.items():
    if "_" in metric and ("precision" in metric or "recall" in metric or "f1" in metric):
        print(f"  {metric}: {value:.4f}")

## 模型保存和测试
保存训练好的模型并进行测试

In [None]:
# Save the fine-tuned model and tokenizer
output_dir = f"./{MODEL_NAME}_ner_antifraud_final"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print(f"模型保存在 {output_dir}")

# Label maps and training settings kept alongside the model for reference
config = {
    "id2label": id2label,
    "label2id": label2id,
    "model_name": MODEL_NAME,
    "pretrained_model": PRETRAINED_MODEL,
    "max_length": 128,
    "training_args": training_args.to_dict()
}

# Written to its own file on purpose: `model.save_pretrained` has just stored the
# model configuration as `config.json` in this directory, and overwriting it with
# this metadata dict would leave the saved model without its architecture settings.
# The model was created with id2label/label2id, so its own config carries the label maps.
with open(os.path.join(output_dir, "training_config.json"), "w", encoding="utf-8") as f:
    json.dump(config, f, ensure_ascii=False, indent=2)

# Visualize the training run from the Trainer's log history
train_results = trainer.state.log_history
eval_loss = []
train_loss = []
f1_scores = []
steps = []

for item in train_results:
    # Training log entries carry 'loss'; evaluation entries carry 'eval_loss'
    if 'loss' in item and 'epoch' in item:
        train_loss.append(item['loss'])
        steps.append(item['step'])
    if 'eval_loss' in item:
        eval_loss.append(item['eval_loss'])
        if 'eval_f1' in item:
            f1_scores.append(item['eval_f1'])

plt.figure(figsize=(10, 6))
plt.plot(steps, train_loss, 'b-', label='Training loss')
plt.xlabel('Steps')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.legend()
plt.grid(True)
plt.show()

# Only draw the F1 curve when at least one evaluation reported it
if f1_scores:
    plt.figure(figsize=(10, 6))
    plt.plot(range(len(f1_scores)), f1_scores, 'g-', label='F1 Score')
    plt.xlabel('Evaluation Step')
    plt.ylabel('F1 Score')
    plt.title('F1 Score Evolution')
    plt.legend()
    plt.grid(True)
    plt.show()

In [None]:
# 改进的实体预测函数
def predict_entities(text, visualize=False):
    """Predict BIO tags for ``text`` and group them into entities.

    Uses the module-level ``tokenizer``, ``model``, ``device`` and ``id2label``.
    A ``B-X`` tag opens an entity, following ``I-X`` tags of the same type
    extend it, and any other tag closes it.

    Args:
        text: The input sentence; it is truncated to 128 tokens.
        visualize: When true, render the tagged text as colour-coded HTML with
            a legend and print the extracted entities.

    Returns:
        Dict mapping entity type to the list of entity strings found, built by
        joining the entity's tokens and removing the ``##`` word-piece marker.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    # From here on `inputs` is a plain dict, so fields are read by key
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Make sure dropout is disabled for inference
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

    # Decode the single sequence and drop the [CLS]/[SEP] positions
    token_labels = [id2label.get(p.item(), "O") for p in predictions[0]][1:-1]
    input_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])[1:-1]
    tokens_info = list(zip(input_tokens, token_labels))

    entities = {}
    current_entity = []
    current_type = None

    def store_current():
        """Record the entity collected so far, if any, under its type."""
        if current_entity:
            entity_text = "".join(current_entity).replace("##", "")
            entities.setdefault(current_type, []).append(entity_text)

    for token, label in tokens_info:
        if label.startswith("B-"):
            store_current()
            current_entity = [token]
            current_type = label[2:]
        elif label.startswith("I-") and current_entity and current_type == label[2:]:
            current_entity.append(token)
        else:
            # "O" or an I- tag that does not continue the open entity ends it,
            # so an entity never joins tokens that are not adjacent
            store_current()
            current_entity = []
            current_type = None

    # Flush an entity that runs to the end of the sequence
    store_current()

    if visualize:
        from html import escape
        from IPython.display import HTML, display

        # Background colour per entity type
        color_map = {
            "ORG": "#FF9999",
            "MONEY": "#99FF99",
            "LINK": "#9999FF",
            "PHONE": "#FFFF99",
            "NAME": "#FF99FF",
            "ACCOUNT": "#99FFFF",
            "TIME": "#FFCC99",
            "LOC": "#99CCFF",
            "APP": "#CC99FF",
            "ROLE": "#FF6666",
            "PRODUCT": "#66FF66",
            "CHANNEL": "#6666FF",
            "PERCENT": "#CCCCFF"
        }

        parts = ["<h3>标注结果:</h3><p>"]
        current_entity_type = None
        for token, label in tokens_info:
            # Tokens come from arbitrary input text, so escape them before
            # embedding them in markup
            token_text = escape(token.replace('##', ''))

            if label.startswith("B-"):
                current_entity_type = label[2:]
                color = color_map.get(current_entity_type, "#CCCCCC")
                parts.append(f'<span style="background-color: {color};">{token_text}</span>')
            elif label.startswith("I-") and current_entity_type == label[2:]:
                color = color_map.get(current_entity_type, "#CCCCCC")
                parts.append(f'<span style="background-color: {color};">{token_text}</span>')
            else:
                current_entity_type = None
                parts.append(token_text)
        parts.append("</p>")

        # Legend limited to the entity types that were actually predicted
        present_types = {label[2:] for _, label in tokens_info if label != "O"}
        parts.append("<h3>实体类型:</h3>")
        parts.append("<ul style=\"list-style-type:none;\">")
        for entity_type, color in color_map.items():
            if entity_type in present_types:
                parts.append(
                    f'<li><span style="background-color: {color}; padding: 2px 5px;">{entity_type}</span></li>'
                )
        parts.append("</ul>")

        display(HTML("".join(parts)))

        # Plain-text summary of the extracted entities
        if entities:
            print("\n识别出的实体:")
            for entity_type, entity_values in entities.items():
                print(f"  {entity_type}: {', '.join(entity_values)}")
        else:
            print("\n未识别出任何实体")

    return entities

In [None]:
# Sample fraud-style messages used for a quick qualitative check
test_samples = [
    "我刚收到一条短信，说我的银行卡需要注销，要求我点击链接网址www.bank-verify.com或拨打电话13800138000",
    "有人自称是公安局的让我去ATM机操作转账5000元到安全账户6217001234567890",
    "刚才有个自称是中国移动的客服说我的手机号需要实名认证，让我提供身份证和银行卡",
    "小王告诉我最近有个投资项目，年化收益率30%，只需投1万元就能月入3000元",
    "网站上说投资比特币可以年化收益35%，只需要投入10000元到账户1234567890"
]

# Run each sample through the model and show the highlighted result
print("测试模型效果:")
for sample_no, sample in enumerate(test_samples, start=1):
    print(f"\n\n示例 {sample_no}: {sample}\n")
    entities = predict_entities(sample, visualize=True)

## 模型下载与部署
导出训练好的模型并下载，供后续在应用中使用

In [None]:
# 创建用于生产环境的推理脚本
inference_script = f"""
# 中文诈骗场景命名实体识别(NER)推理脚本
# 基于模型: {PRETRAINED_MODEL}
# 训练时间: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}

import torch
from transformers import BertTokenizerFast, BertForTokenClassification
import json
import os

# 配置参数
MODEL_PATH = "./model"  # 替换为实际模型路径

# 加载标签映射
with open(os.path.join(MODEL_PATH, "config.json"), "r", encoding="utf-8") as f:
    config = json.load(f)
    id2label = config["id2label"]
    # 将字符串键转换为整数键
    id2label = {{int(k): v for k, v in id2label.items()}}

# 加载模型和分词器
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
model = BertForTokenClassification.from_pretrained(MODEL_PATH)

# 设置设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def predict_entities(text):
    # 分词
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=128)
    inputs = {{k: v.to(device) for k, v in inputs.items()}}
    
    # 预测
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)
    
    # 解码预测结果
    predicted_token_labels = []
    for token_predictions in predictions:
        token_labels = []
        for prediction in token_predictions:
            label = id2label.get(prediction.item(), "O")
            token_labels.append(label)
        predicted_token_labels.append(token_labels)
    
    # 提取实体
    entities = {{}}
    current_entity = []
    current_type = None
    
    # 跳过特殊标记[CLS]和[SEP]
    token_labels = predicted_token_labels[0][1:-1]
    tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])[1:-1]
    
    for i, (token, label) in enumerate(zip(tokens, token_labels)):
        if label.startswith("B-"):
            if current_entity:
                entity_text = "".join(current_entity).replace("##", "")
                if current_type not in entities:
                    entities[current_type] = []
                entities[current_type].append(entity_text)
            current_entity = [token]
            current_type = label[2:]
        elif label.startswith("I-") and current_entity and current_type == label[2:]:
            current_entity.append(token)
        elif label == "O":
            if current_entity:
                entity_text = "".join(current_entity).replace("##", "")
                if current_type not in entities:
                    entities[current_type] = []
                entities[current_type].append(entity_text)
                current_entity = []
                current_type = None
    
    # 处理最后一个实体（如果有）
    if current_entity:
        entity_text = "".join(current_entity).replace("##", "")
        if current_type not in entities:
            entities[current_type] = []
        entities[current_type].append(entity_text)
    
    return entities

# 使用示例
if __name__ == "__main__":
    test_text = "我刚收到一条短信，说我的银行卡需要注销，要求我点击链接或拨打电话13800138000"
    result = predict_entities(test_text)
    print(f"输入文本: {{test_text}}")
    print("识别出的实体:")
    for entity_type, entity_values in result.items():
        print(f"  {{entity_type}}: {{', '.join(entity_values)}}")
"""

# 保存推理脚本
with open(os.path.join(output_dir, "inference.py"), "w", encoding="utf-8") as f:
    f.write(inference_script)

# 创建一个简单的README文件
readme = f"""
# 中文诈骗场景命名实体识别(NER)模型

## 模型信息
- 预训练模型: {PRETRAINED_MODEL}
- 训练数据集: 中文诈骗场景NER数据集
- 训练时间: {pd.Timestamp.now().strftime('%Y-%m-%d')}
- 支持实体类型: {', '.join(sorted(set([k[2:] for k in label2id.keys() if k != 'O' and k.startswith('B-')]))}

## 快速使用
1. 安装依赖: `pip install torch transformers`
2. 使用 `inference.py` 脚本进行推理

## 示例代码
```python
from inference import predict_entities

text = "我刚收到一条短信，说我的银行卡需要注销，要求我点击链接或拨打电话13800138000"
entities = predict_entities(text)
print(entities)
```
"""

with open(os.path.join(output_dir, "README.md"), "w", encoding="utf-8") as f:
    f.write(readme)

# Colab环境下，压缩并提供下载
if IN_COLAB:
    # 压缩模型和相关文件
    !zip -r "{MODEL_NAME}_ner_antifraud_model.zip" "{output_dir}/" 
    
    # 下载模型
    from google.colab import files
    files.download(f"{MODEL_NAME}_ner_antifraud_model.zip")
    print(f"模型下载链接已生成，请点击下载 {MODEL_NAME}_ner_antifraud_model.zip")
else:
    print(f"模型和相关文件已保存到 {output_dir} 目录")

## 结论与后续优化

### 已完成工作
1. 基于中文诈骗场景数据集训练了命名实体识别(NER)模型
2. 该模型可识别多种诈骗类型中的关键实体，如机构(ORG)、金额(MONEY)、地点(LOC)、电话(PHONE)等
3. 通过优化训练参数提高了模型性能
4. 提供了可视化的实体标注结果
5. 生成了便于部署的推理脚本

### 后续优化方向
1. **数据扩充**：增加更多诈骗场景数据，特别是新型网络诈骗文本
2. **模型改进**：尝试更高级的预训练模型如BERT-wwm、RoBERTa、MacBERT等
3. **多任务学习**：将NER与诈骗意图分类结合，构建多任务模型
4. **模型蒸馏**：将训练好的大模型知识蒸馏到小模型中，提高推理速度
5. **规则增强**：结合规则识别某些特定格式的实体（如电话号码、银行卡号等）

### 实际应用
本模型可集成到反诈骗系统中，用于：
1. 短信/聊天记录自动检测和分析
2. 客服系统中的风险信息提取
3. 反诈骗知识图谱构建的基础
4. 诈骗套路自动识别与预警

在实际部署时，建议结合其他模块（如意图分类）形成完整的反诈骗AI系统。