In [31]:
import pandas as pd
import numpy as np
from transformers import BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset
import torch

In [32]:
import os
import glob
import json
import random
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.utils import resample
import matplotlib.pyplot as plt

In [33]:
# 指定包含JSON文件的文件夹路径
folder_path = '../data/esg_label_result'

# 使用glob获取文件夹中所有的JSON文件
json_files = glob.glob(os.path.join(folder_path, "*.json"))

all_data = []

# 逐个读取每个JSON文件
for file in json_files:
    with open(file, 'r', encoding='utf-8') as f:
        data = json.load(f)
        all_data.extend(data)  # 将所有JSON文件的数据合并到一个列表中

# # 打印合并后的数据
# print(all_data)

In [4]:
# 读取JSON文件 
# with open('../data/esg_label_result/AF Global Limited_report_filtered.json', 'r') as f:
#     data = json.load(f)

In [34]:
data = [item for item in all_data if "error" not in item]

In [35]:
label_dict ={
        "B-ENV_GHG_AET": 0,
        "I-ENV_GHG_AET": 0,
        "B-ENV_GHG_AE1": 0,
        "I-ENV_GHG_AE1": 0,
        "B-ENV_GHG_AE2": 0,
        "I-ENV_GHG_AE2": 0,
        "B-ENV_GHG_AE3": 0,
        "I-ENV_GHG_AE3": 0,
        "B-ENV_GHG_EIT": 0,
        "I-ENV_GHG_EIT": 0,
        "B-ENV_GHG_EI1": 0,
        "I-ENV_GHG_EI1": 0,
        "B-ENV_GHG_EI2": 0,
        "I-ENV_GHG_EI2": 0,
        "B-ENV_GHG_EI3": 0,
        "I-ENV_GHG_EI3": 0,
        "B-ENV_ENC_TEC": 0,
        "I-ENV_ENC_TEC": 0,
        "B-ENV_ENC_ECI": 0,
        "I-ENV_ENC_ECI": 0,
        "B-ENV_WAC_TWC": 0,
        "I-ENV_WAC_TWC": 0,
        "B-ENV_WAC_WCI": 0,
        "I-ENV_WAC_WCI": 0,
        "B-ENV_WAG_TWG": 0,
        "I-ENV_WAG_TWG": 0,
        "B-SOC_GED_CEG_M": 0,
        "I-SOC_GED_CEG_M": 0,
        "B-SOC_GED_CEG_F": 0,
        "I-SOC_GED_CEG_F": 0,
        "B-SOC_GED_NHG_M": 0,
        "I-SOC_GED_NHG_M": 0,
        "B-SOC_GED_NHG_F": 0,
        "I-SOC_GED_NHG_F": 0,
        "B-SOC_GED_ETG_M": 0,
        "I-SOC_GED_ETG_M": 0,
        "B-SOC_GED_ETG_F": 0,
        "I-SOC_GED_ETG_F": 0,
        "B-SOC_AGD_CEA_U30": 0,
        "I-SOC_AGD_CEA_U30": 0,
        "B-SOC_AGD_CEA_B35": 0,
        "I-SOC_AGD_CEA_B35": 0,
        "B-SOC_AGD_CEA_A50": 0,
        "I-SOC_AGD_CEA_A50": 0,
        "B-SOC_AGD_NHI_U30": 0,
        "I-SOC_AGD_NHI_U30": 0,
        "B-SOC_AGD_NHI_B35": 0,
        "I-SOC_AGD_NHI_B35": 0,
        "B-SOC_AGD_NHI_A50": 0,
        "I-SOC_AGD_NHI_A50": 0,
        "B-SOC_AGD_TOR_U30": 0,
        "I-SOC_AGD_TOR_U30": 0,
        "B-SOC_AGD_TOR_B35": 0,
        "I-SOC_AGD_TOR_B35": 0,
        "B-SOC_AGD_TOR_A50": 0,
        "I-SOC_AGD_TOR_A50": 0,
        "B-SOC_DEV_ATH_M": 0,
        "I-SOC_DEV_ATH_M": 0,
        "B-SOC_DEV_ATH_F": 0,
        "I-SOC_DEV_ATH_F": 0,
        "B-SOC_OHS_FAT": 0,
        "I-SOC_OHS_FAT": 0,
        "B-SOC_OHS_HCI": 0,
        "I-SOC_OHS_HCI": 0,
        "B-SOC_OHS_REC": 0,
        "I-SOC_OHS_REC": 0,
        "B-SOC_OHS_RWI": 0,
        "I-SOC_OHS_RWI": 0,
        "B-GOV_BOC_BIN": 0,
        "I-GOV_BOC_BIN": 0,
        "B-GOV_BOC_WOB": 0,
        "I-GOV_BOC_WOB": 0,
        "B-GOV_MAD_WMT": 0,
        "I-GOV_MAD_WMT": 0,
        "B-GOV_ETB_ACD": 0,
        "I-GOV_ETB_ACD": 0,
        "B-GOV_ETB_ACT_N": 0,
        "I-GOV_ETB_ACT_N": 0,
        "B-GOV_ETB_ACT_P": 0,
        "I-GOV_ETB_ACT_P": 0,
        "B-GOV_CER_LRC": 0,
        "I-GOV_CER_LRC": 0,
        "B-GOV_ALF_AFD": 0,
        "I-GOV_ALF_AFD": 0,
        "B-GOV_ASS_ASR": 0,
        "I-GOV_ASS_ASR": 0,
        "B-VALUE": 0,
        "I-VALUE": 0,
        "B-UNIT": 0,
        "I-UNIT": 0,
        "O": 0
    }

In [36]:
# 提取BIO标注数据
texts = []
labels = []
err = []

for entry in data:
    text = entry['text']
    entity_labels = ["O"] * len(text)  # 初始化为'O'

    for entity in entry['entity']:
        start, end, label = entity['start'], entity['end'], entity['labels'][0]
        if label not in label_dict:
            continue
        # 检查字典情况
        # if end > len(entity_labels):
        #     err.append(data.index(entry))
        #     continue
        for i in range(start, end):
            entity_labels[i] = label

    texts.append(list(text))
    labels.append(entity_labels)

# 将数据转换为 DataFrame 格式
df = pd.DataFrame({"tokens": texts, "ner_tags": labels})

In [37]:
# 非实体句删除

no_entity_data = df[df['ner_tags'].apply(lambda x: all(label == "O" for label in x))]
entity_data = df[~df['ner_tags'].apply(lambda x: all(label == "O" for label in x))]

# 保留 15% 的无实体句子
no_entity_sample = no_entity_data.sample(frac=0.15, random_state=42)

# 合并数据
balanced_df = pd.concat([entity_data, no_entity_sample])

# 打乱数据集顺序
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# 检查新的数据分布
print("无实体句子数量:", len(no_entity_sample))
print("实体句子数量:", len(entity_data))
print("合并后的数据集样本数:", len(balanced_df))

无实体句子数量: 936
实体句子数量: 5153
合并后的数据集样本数: 6089


In [38]:
# 定义需要合并的标签字典，将稀有标签映射到新的标签名
merge_dict = {
    "B-SOC_AGD_TOR_U30": "B-SOC_AGD_TOR",
    "I-SOC_AGD_TOR_U30": "I-SOC_AGD_TOR",
    "B-SOC_AGD_TOR_B35": "B-SOC_AGD_TOR",
    "I-SOC_AGD_TOR_B35": "I-SOC_AGD_TOR",
    "B-SOC_AGD_TOR_A50": "B-SOC_AGD_TOR",
    "I-SOC_AGD_TOR_A50": "I-SOC_AGD_TOR",
    
    'B-SOC_AGD_NHI_B35': 'B-SOC_AGD_NHI',
    'B-SOC_AGD_NHI_A50': 'B-SOC_AGD_NHI',
    'B-SOC_AGD_NHI_U30': 'B-SOC_AGD_NHI',
    'I-SOC_AGD_NHI_U30': 'I-SOC_AGD_NHI',
    'I-SOC_AGD_NHI_B35': 'I-SOC_AGD_NHI',
    'I-SOC_AGD_NHI_A50': 'I-SOC_AGD_NHI',
    
    'B-ENV_GHG_EI1' : 'B-ENV_GHG_EI',
    'I-ENV_GHG_EI1' : 'I-ENV_GHG_EI',
    'B-ENV_GHG_EI2' : 'B-ENV_GHG_EI',
    'I-ENV_GHG_EI2' : 'I-ENV_GHG_EI',
    'B-ENV_GHG_EI3' : 'B-ENV_GHG_EI',
    'I-ENV_GHG_EI3' : 'I-ENV_GHG_EI',
    
    'B-SOC_AGD_CEA_U30' : 'B-SOC_AGD_CEA',
    'I-SOC_AGD_CEA_U30' : 'I-SOC_AGD_CEA',
    'B-SOC_AGD_CEA_B35' : 'B-SOC_AGD_CEA',
    'I-SOC_AGD_CEA_B35' : 'I-SOC_AGD_CEA',
    'B-SOC_AGD_CEA_A50' : 'B-SOC_AGD_CEA',
    'I-SOC_AGD_CEA_A50' : 'I-SOC_AGD_CEA',
    
    'B-SOC_GED_ETG_F' : 'B-SOC_GED_ETG',
    'I-SOC_GED_ETG_F' : 'I-SOC_GED_ETG',
    'B-SOC_GED_ETG_M' : 'B-SOC_GED_ETG',
    'I-SOC_GED_ETG_M' : 'I-SOC_GED_ETG',
    'B-SOC_GED_NHG_M' : 'B-SOC_GED_NHG',
    'I-SOC_GED_NHG_M' : 'I-SOC_GED_NHG',
    'B-SOC_GED_NHG_F' : 'B-SOC_GED_NHG',
    'I-SOC_GED_NHG_F' : 'I-SOC_GED_NHG'
    
    # 添加更多需要合并的标签映射
}

# 定义一个函数，用于将标签序列中的稀有标签合并
def merge_labels(label_sequence, merge_dict):
    return [merge_dict.get(label, label) for label in label_sequence]

# 应用标签合并函数到 DataFrame 的 'labels' 列
balanced_df['ner_tags'] = balanced_df['ner_tags'].apply(lambda x: merge_labels(x, merge_dict))

In [39]:
# 设置 pandas 的显示选项，防止省略
pd.set_option('display.max_rows', None)

# 检查标签合并后的分布
all_labels_flat = [item for sublist in balanced_df['ner_tags'] for item in sublist]
label_counts_after_merge = pd.Series(all_labels_flat).value_counts()

print("合并后的标签分布:")
print(label_counts_after_merge)

# 恢复默认设置（可选）
pd.reset_option('display.max_rows')


合并后的标签分布:
O                  1153555
I-GOV_ALF_AFD        29144
B-GOV_ALF_AFD        26855
B-VALUE              15533
B-GOV_ETB_ACD        10952
I-GOV_ETB_ACD         8668
B-GOV_BOC_BIN         8135
I-GOV_BOC_BIN         8107
I-SOC_DEV_ATH_M       6157
B-UNIT                5897
B-ENV_ENC_TEC         5709
B-ENV_GHG_AET         5702
B-SOC_DEV_ATH_M       5700
I-SOC_OHS_RWI         5246
B-SOC_OHS_RWI         5096
I-ENV_ENC_TEC         4947
I-ENV_GHG_AET         4775
I-UNIT                4584
I-ENV_WAG_TWG         3689
B-SOC_GED_CEG_F       3288
B-ENV_WAG_TWG         3052
I-GOV_CER_LRC         2911
I-SOC_GED_CEG_F       2553
I-VALUE               2248
B-GOV_CER_LRC         2176
I-ENV_WAC_TWC         1971
B-ENV_WAC_TWC         1678
I-GOV_ASS_ASR         1416
B-GOV_ASS_ASR         1349
I-ENV_ENC_ECI         1317
B-ENV_ENC_ECI         1258
I-ENV_GHG_EIT         1161
I-ENV_GHG_AE3         1158
B-SOC_OHS_REC         1127
I-SOC_OHS_REC         1119
I-ENV_GHG_AE2         1108
I-ENV_GHG_AE1     

In [None]:
# # 查找标签所在句子
# # 目标标签
# target_label = "B-SOC_AGD_TOR"

# # 筛选出包含目标标签的句子
# sentences_with_label = balanced_df_resampled[balanced_df_resampled['ner_tags'].apply(lambda x: target_label in x)]

# # 查看筛选结果
# print("包含标签", target_label, "的句子数量:", len(sentences_with_label))
# print(sentences_with_label[['tokens', 'ner_tags']].head())


In [None]:
# from collections import Counter

# # 假设 labels 列表中存储了每个文本的标签序列
# # 将所有标签展开为一个列表，并使用 Counter 统计每种标签的数量
# all_labels = [label for sequence in labels for label in sequence]
# label_counts = Counter(all_labels)

# # 打印每种实体的数量
# for label, count in label_counts.items():
#     print(f"实体标签 '{label}' 的数量为: {count}")

In [41]:
# 定义低频标签的阈值
low_count_threshold = 500

# 获取所有标签的数量分布
all_labels_flat = [item for sublist in balanced_df['ner_tags'] for item in sublist]
label_counts = pd.Series(all_labels_flat).value_counts()  # 假设这是一个标签-数量的字典或 Series

# 找出所有低频标签
low_frequency_labels = [label for label, count in label_counts.items() if count < low_count_threshold]

# 初始化一个新的 DataFrame 来存储过采样的句子
balanced_df_resampled = balanced_df.copy()

# 遍历每一个低频标签，筛选并过采样包含该标签的句子
for label in low_frequency_labels:
    # 筛选出包含当前标签的句子
    sentences_with_label = balanced_df[balanced_df['ner_tags'].apply(lambda x: label in x)]
    
    # 确认是否需要过采样
    if len(sentences_with_label) < low_count_threshold:
        # 过采样该标签的句子
        sentences_with_label_upsampled = resample(sentences_with_label, 
                                                  replace=True, 
                                                  n_samples=50, 
                                                  random_state=42)
        
        # 将过采样后的数据合并到主数据集中
        balanced_df_resampled = pd.concat([balanced_df_resampled, sentences_with_label_upsampled])

# 打乱数据集
balanced_df_resampled = balanced_df_resampled.sample(frac=1, random_state=42).reset_index(drop=True)

# 查看结果
print("过采样后的数据集大小:", len(balanced_df_resampled))


过采样后的数据集大小: 7089


In [42]:
# 设置 pandas 的显示选项，防止省略
pd.set_option('display.max_rows', None)

# 检查标签合并后的分布
all_labels_flat = [item for sublist in balanced_df_resampled['ner_tags'] for item in sublist]
label_counts_after_merge = pd.Series(all_labels_flat).value_counts()

print("合并后的标签分布:")
print(label_counts_after_merge)

# 恢复默认设置（可选）
pd.reset_option('display.max_rows')

合并后的标签分布:
O                  1446378
I-GOV_ALF_AFD        36557
B-GOV_ALF_AFD        30632
B-VALUE              27667
B-GOV_ETB_ACD        11621
I-UNIT               10829
B-UNIT               10115
I-GOV_ETB_ACD         9650
B-GOV_BOC_BIN         8799
I-GOV_BOC_BIN         8334
I-SOC_DEV_ATH_M       6850
I-ENV_WAG_TWG         6585
I-SOC_OHS_RWI         6541
B-SOC_DEV_ATH_M       6247
B-ENV_ENC_TEC         6170
B-SOC_OHS_RWI         6102
B-ENV_GHG_AET         5880
I-ENV_ENC_TEC         5716
I-ENV_GHG_AET         4976
B-SOC_GED_CEG_F       4473
B-ENV_WAG_TWG         4225
I-SOC_GED_CEG_F       3424
B-SOC_DEV_ATH_F       3308
B-SOC_GED_ETG         3162
I-SOC_GED_ETG         3099
I-GOV_CER_LRC         2991
I-SOC_DEV_ATH_F       2710
I-ENV_WAC_TWC         2705
B-SOC_OHS_REC         2601
B-SOC_GED_NHG         2590
I-SOC_OHS_REC         2585
B-SOC_OHS_HCI         2563
I-VALUE               2505
B-SOC_OHS_FAT         2495
I-SOC_GED_NHG         2258
B-GOV_CER_LRC         2255
I-ENV_GHG_EIT     

In [43]:
# 将数据转换为 Hugging Face 的 Dataset 格式
dataset = Dataset.from_pandas(balanced_df_resampled)

In [44]:
# 使用集合存储所有独特标签，避免重复
unique_labels = set(label_counts_after_merge.index)

# 将集合转换为列表并排序
unique_labels = sorted(list(unique_labels))

# 查看所有标签
print("所有独特标签:", unique_labels)
print(len(unique_labels))


所有独特标签: ['B-ENV_ENC_ECI', 'B-ENV_ENC_TEC', 'B-ENV_GHG_AE1', 'B-ENV_GHG_AE2', 'B-ENV_GHG_AE3', 'B-ENV_GHG_AET', 'B-ENV_GHG_EI', 'B-ENV_GHG_EIT', 'B-ENV_WAC_TWC', 'B-ENV_WAC_WCI', 'B-ENV_WAG_TWG', 'B-GOV_ALF_AFD', 'B-GOV_ASS_ASR', 'B-GOV_BOC_BIN', 'B-GOV_BOC_WOB', 'B-GOV_CER_LRC', 'B-GOV_ETB_ACD', 'B-GOV_ETB_ACT_N', 'B-GOV_ETB_ACT_P', 'B-GOV_MAD_WMT', 'B-SOC_AGD_CEA', 'B-SOC_AGD_NHI', 'B-SOC_AGD_TOR', 'B-SOC_DEV_ATH_F', 'B-SOC_DEV_ATH_M', 'B-SOC_GED_CEG_F', 'B-SOC_GED_CEG_M', 'B-SOC_GED_ETG', 'B-SOC_GED_NHG', 'B-SOC_OHS_FAT', 'B-SOC_OHS_HCI', 'B-SOC_OHS_REC', 'B-SOC_OHS_RWI', 'B-UNIT', 'B-VALUE', 'I-ENV_ENC_ECI', 'I-ENV_ENC_TEC', 'I-ENV_GHG_AE1', 'I-ENV_GHG_AE2', 'I-ENV_GHG_AE3', 'I-ENV_GHG_AET', 'I-ENV_GHG_EI', 'I-ENV_GHG_EIT', 'I-ENV_WAC_TWC', 'I-ENV_WAC_WCI', 'I-ENV_WAG_TWG', 'I-GOV_ALF_AFD', 'I-GOV_ASS_ASR', 'I-GOV_BOC_BIN', 'I-GOV_BOC_WOB', 'I-GOV_CER_LRC', 'I-GOV_ETB_ACD', 'I-GOV_ETB_ACT_N', 'I-GOV_ETB_ACT_P', 'I-GOV_MAD_WMT', 'I-SOC_AGD_CEA', 'I-SOC_AGD_NHI', 'I-SOC_AGD_TOR', 'I-S

In [45]:
train_test_split = dataset.train_test_split(test_size=0.2)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

In [46]:
model_name = "nbroad/ESG-BERT"
tokenizer = BertTokenizerFast.from_pretrained(model_name)



In [47]:
label2id = {label: idx for idx, label in enumerate(unique_labels)}
id2label = {idx: label for label, idx in label2id.items()}

In [48]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        is_split_into_words=True, 
        padding=True
    )
    labels = []

    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # 忽略位置
            elif word_idx != previous_word_idx:
                label_ids.append(label2id[label[word_idx]])  # 将标签转换为整数 ID
            else:
                # 对于当前词的子词部分，通常不需要计算损失，除非你想保持每个子词的相同标签
                label_ids.append(label2id[label[word_idx]] if label[word_idx].startswith("I-") else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


In [49]:
# Define compute_metrics function for evaluation
def compute_metrics(pred):
    # Extract predictions and labels
    predictions, labels = pred
    predictions = np.argmax(predictions, axis=2)
    
    # Remove ignored index (special tokens)
    true_labels = [[label for label, pred in zip(label_row, pred_row) if label != -100] 
                   for label_row, pred_row in zip(labels, predictions)]
    true_predictions = [[pred for label, pred in zip(label_row, pred_row) if label != -100]
                        for label_row, pred_row in zip(labels, predictions)]
    
    # Flatten lists
    true_labels = [item for sublist in true_labels for item in sublist]
    true_predictions = [item for sublist in true_predictions for item in sublist]
    
    # Calculate metrics
    accuracy = accuracy_score(true_labels, true_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(true_labels, true_predictions, average='weighted')
    
    # Return results in dictionary
    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }


In [50]:
# 定义模型
model = BertForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(unique_labels),  # 标签数量
    id2label=id2label,              # 标签ID到名称的映射
    label2id=label2id,              # 标签名称到ID的映射
    ignore_mismatched_sizes=True    # 忽略大小不匹配
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at nbroad/ESG-BERT and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([26]) in the checkpoint and torch.Size([71]) in the model instantiated
- classifier.weight: found shape torch.Size([26, 768]) in the checkpoint and torch.Size([71, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [51]:
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)
eval_dataset = eval_dataset.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/5671 [00:00<?, ? examples/s]

Map:   0%|          | 0/1418 [00:00<?, ? examples/s]

In [52]:
training_args = TrainingArguments(
    output_dir="../results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=6, #6个epoch为我研究的最佳周期
    weight_decay=0.01,
    logging_dir="../logs",
    logging_steps=10,
)



In [53]:
# Update Trainer to use compute_metrics function
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

In [54]:
trainer.train()

  0%|          | 0/4254 [00:00<?, ?it/s]

{'loss': 1.8491, 'grad_norm': 4.35384464263916, 'learning_rate': 1.99529854254819e-05, 'epoch': 0.01}
{'loss': 1.3272, 'grad_norm': 2.523653030395508, 'learning_rate': 1.99059708509638e-05, 'epoch': 0.03}
{'loss': 1.2387, 'grad_norm': 3.1073110103607178, 'learning_rate': 1.98589562764457e-05, 'epoch': 0.04}
{'loss': 1.4156, 'grad_norm': 2.16011905670166, 'learning_rate': 1.98119417019276e-05, 'epoch': 0.06}
{'loss': 1.3951, 'grad_norm': 5.637800216674805, 'learning_rate': 1.9764927127409498e-05, 'epoch': 0.07}
{'loss': 1.2785, 'grad_norm': 2.000967264175415, 'learning_rate': 1.9717912552891397e-05, 'epoch': 0.08}
{'loss': 1.4653, 'grad_norm': 3.228508472442627, 'learning_rate': 1.9670897978373297e-05, 'epoch': 0.1}
{'loss': 1.3966, 'grad_norm': 2.3546903133392334, 'learning_rate': 1.9623883403855196e-05, 'epoch': 0.11}
{'loss': 1.1566, 'grad_norm': 1.1956819295883179, 'learning_rate': 1.9576868829337095e-05, 'epoch': 0.13}
{'loss': 1.3093, 'grad_norm': 4.243281364440918, 'learning_rate

  0%|          | 0/178 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 1.0325167179107666, 'eval_accuracy': 0.7961720656100949, 'eval_precision': 0.7256847386549433, 'eval_recall': 0.7961720656100949, 'eval_f1': 0.7358020369387871, 'eval_runtime': 475.0553, 'eval_samples_per_second': 2.985, 'eval_steps_per_second': 0.375, 'epoch': 1.0}
{'loss': 1.0991, 'grad_norm': 3.1566436290740967, 'learning_rate': 1.666196520921486e-05, 'epoch': 1.0}
{'loss': 1.1199, 'grad_norm': 2.434732437133789, 'learning_rate': 1.661495063469676e-05, 'epoch': 1.02}
{'loss': 0.995, 'grad_norm': 2.9561729431152344, 'learning_rate': 1.6567936060178658e-05, 'epoch': 1.03}
{'loss': 1.263, 'grad_norm': 4.817466735839844, 'learning_rate': 1.6520921485660557e-05, 'epoch': 1.04}
{'loss': 0.879, 'grad_norm': 2.2568297386169434, 'learning_rate': 1.6473906911142457e-05, 'epoch': 1.06}
{'loss': 1.017, 'grad_norm': 4.495649337768555, 'learning_rate': 1.6426892336624353e-05, 'epoch': 1.07}
{'loss': 0.8955, 'grad_norm': 3.409916877746582, 'learning_rate': 1.6379877762106252e-05, 'ep

  0%|          | 0/178 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.8553321361541748, 'eval_accuracy': 0.8108272134446114, 'eval_precision': 0.773410919359503, 'eval_recall': 0.8108272134446114, 'eval_f1': 0.7717591447485549, 'eval_runtime': 33.245, 'eval_samples_per_second': 42.653, 'eval_steps_per_second': 5.354, 'epoch': 2.0}
{'loss': 0.7783, 'grad_norm': 3.988316297531128, 'learning_rate': 1.3323930418429714e-05, 'epoch': 2.0}
{'loss': 0.8148, 'grad_norm': 4.2208356857299805, 'learning_rate': 1.3276915843911613e-05, 'epoch': 2.02}
{'loss': 0.7797, 'grad_norm': 3.9417612552642822, 'learning_rate': 1.3229901269393512e-05, 'epoch': 2.03}
{'loss': 0.833, 'grad_norm': 6.075567722320557, 'learning_rate': 1.3182886694875412e-05, 'epoch': 2.05}
{'loss': 0.8206, 'grad_norm': 5.0611042976379395, 'learning_rate': 1.3135872120357311e-05, 'epoch': 2.06}
{'loss': 0.6727, 'grad_norm': 3.9767425060272217, 'learning_rate': 1.308885754583921e-05, 'epoch': 2.07}
{'loss': 0.7853, 'grad_norm': 4.290954113006592, 'learning_rate': 1.304184297132111e-05, '

  0%|          | 0/178 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.7758057117462158, 'eval_accuracy': 0.8198980901059423, 'eval_precision': 0.7875739579715565, 'eval_recall': 0.8198980901059423, 'eval_f1': 0.7900787026898555, 'eval_runtime': 30.4762, 'eval_samples_per_second': 46.528, 'eval_steps_per_second': 5.841, 'epoch': 3.0}
{'loss': 0.5846, 'grad_norm': 4.190857410430908, 'learning_rate': 9.98589562764457e-06, 'epoch': 3.0}
{'loss': 0.7281, 'grad_norm': 4.774765968322754, 'learning_rate': 9.938881053126471e-06, 'epoch': 3.02}
{'loss': 0.707, 'grad_norm': 9.594161987304688, 'learning_rate': 9.89186647860837e-06, 'epoch': 3.03}
{'loss': 0.6858, 'grad_norm': 4.385000228881836, 'learning_rate': 9.84485190409027e-06, 'epoch': 3.05}
{'loss': 0.6901, 'grad_norm': 3.5962817668914795, 'learning_rate': 9.797837329572169e-06, 'epoch': 3.06}
{'loss': 0.5982, 'grad_norm': 2.976015567779541, 'learning_rate': 9.750822755054068e-06, 'epoch': 3.07}
{'loss': 0.621, 'grad_norm': 4.290868282318115, 'learning_rate': 9.703808180535967e-06, 'epoch': 3.

  0%|          | 0/178 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.7251255512237549, 'eval_accuracy': 0.831262143040434, 'eval_precision': 0.7997491804066423, 'eval_recall': 0.831262143040434, 'eval_f1': 0.8053571779240766, 'eval_runtime': 26.973, 'eval_samples_per_second': 52.571, 'eval_steps_per_second': 6.599, 'epoch': 4.0}
{'loss': 0.539, 'grad_norm': 5.922379493713379, 'learning_rate': 6.647860836859427e-06, 'epoch': 4.01}
{'loss': 0.6144, 'grad_norm': 3.7024693489074707, 'learning_rate': 6.600846262341326e-06, 'epoch': 4.02}
{'loss': 0.638, 'grad_norm': 5.994492530822754, 'learning_rate': 6.5538316878232255e-06, 'epoch': 4.03}
{'loss': 0.6555, 'grad_norm': 3.4938244819641113, 'learning_rate': 6.506817113305125e-06, 'epoch': 4.05}
{'loss': 0.6657, 'grad_norm': 5.328569412231445, 'learning_rate': 6.459802538787024e-06, 'epoch': 4.06}
{'loss': 0.6585, 'grad_norm': 5.057304382324219, 'learning_rate': 6.412787964268924e-06, 'epoch': 4.08}
{'loss': 0.6704, 'grad_norm': 4.344435691833496, 'learning_rate': 6.365773389750824e-06, 'epoch':

  0%|          | 0/178 [00:00<?, ?it/s]

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'eval_loss': 0.7264559864997864, 'eval_accuracy': 0.8220568528497705, 'eval_precision': 0.8024852369518309, 'eval_recall': 0.8220568528497705, 'eval_f1': 0.8088597793265181, 'eval_runtime': 27.3946, 'eval_samples_per_second': 51.762, 'eval_steps_per_second': 6.498, 'epoch': 5.0}
{'loss': 0.5743, 'grad_norm': 4.462850093841553, 'learning_rate': 3.309826046074283e-06, 'epoch': 5.01}
{'loss': 0.6058, 'grad_norm': 6.036447048187256, 'learning_rate': 3.2628114715561827e-06, 'epoch': 5.02}
{'loss': 0.4825, 'grad_norm': 6.602058410644531, 'learning_rate': 3.215796897038082e-06, 'epoch': 5.04}
{'loss': 0.454, 'grad_norm': 3.588064193725586, 'learning_rate': 3.1687823225199813e-06, 'epoch': 5.05}
{'loss': 0.4491, 'grad_norm': 6.809605121612549, 'learning_rate': 3.121767748001881e-06, 'epoch': 5.06}
{'loss': 0.7287, 'grad_norm': 5.221611022949219, 'learning_rate': 3.0747531734837804e-06, 'epoch': 5.08}
{'loss': 0.6082, 'grad_norm': 6.4079389572143555, 'learning_rate': 3.0277385989656793e-06, 'e

  0%|          | 0/178 [00:00<?, ?it/s]

{'eval_loss': 0.7157347202301025, 'eval_accuracy': 0.8267531801019099, 'eval_precision': 0.8033165829011489, 'eval_recall': 0.8267531801019099, 'eval_f1': 0.8114657748381907, 'eval_runtime': 27.1215, 'eval_samples_per_second': 52.283, 'eval_steps_per_second': 6.563, 'epoch': 6.0}
{'train_runtime': 2747.4501, 'train_samples_per_second': 12.385, 'train_steps_per_second': 1.548, 'train_loss': 0.7495251491588777, 'epoch': 6.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=4254, training_loss=0.7495251491588777, metrics={'train_runtime': 2747.4501, 'train_samples_per_second': 12.385, 'train_steps_per_second': 1.548, 'total_flos': 8896429798115328.0, 'train_loss': 0.7495251491588777, 'epoch': 6.0})

In [None]:
eval_results = trainer.evaluate()
print(eval_results)

In [44]:
model.save_pretrained('../finetuned_model')
tokenizer.save_pretrained('../finetuned_model')

('../finetuned_model\\tokenizer_config.json',
 '../finetuned_model\\special_tokens_map.json',
 '../finetuned_model\\vocab.txt',
 '../finetuned_model\\added_tokens.json',
 '../finetuned_model\\tokenizer.json')

In [26]:
# 加载微调后的模型和分词器
model_path = "../finetuned_model"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForTokenClassification.from_pretrained(model_path)

In [30]:
training_args = TrainingArguments(
    output_dir="../results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="../logs",
    logging_steps=10,
)

# 可以进行增量训练
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics
)

trainer.train()



  0%|          | 0/704 [00:00<?, ?it/s]

{'loss': 0.4127, 'grad_norm': 4.936605930328369, 'learning_rate': 1.9715909090909092e-05, 'epoch': 0.01}
{'loss': 0.5182, 'grad_norm': 5.862741947174072, 'learning_rate': 1.9431818181818182e-05, 'epoch': 0.03}
{'loss': 0.5605, 'grad_norm': 6.257969379425049, 'learning_rate': 1.9147727272727276e-05, 'epoch': 0.04}
{'loss': 0.4638, 'grad_norm': 3.0369224548339844, 'learning_rate': 1.8863636363636366e-05, 'epoch': 0.06}
{'loss': 0.4375, 'grad_norm': 4.453832626342773, 'learning_rate': 1.8579545454545456e-05, 'epoch': 0.07}
{'loss': 0.359, 'grad_norm': 2.8654088973999023, 'learning_rate': 1.8295454545454546e-05, 'epoch': 0.09}
{'loss': 0.5475, 'grad_norm': 3.86381196975708, 'learning_rate': 1.8011363636363636e-05, 'epoch': 0.1}
{'loss': 0.4799, 'grad_norm': 4.379912376403809, 'learning_rate': 1.772727272727273e-05, 'epoch': 0.11}
{'loss': 0.4022, 'grad_norm': 7.674426078796387, 'learning_rate': 1.744318181818182e-05, 'epoch': 0.13}
{'loss': 0.4899, 'grad_norm': 8.123591423034668, 'learning

  0%|          | 0/176 [00:00<?, ?it/s]

{'eval_loss': 0.5392122268676758, 'eval_accuracy': 0.86078280389976, 'eval_precision': 0.8499407766536674, 'eval_recall': 0.86078280389976, 'eval_f1': 0.8526105924205835, 'eval_runtime': 138.3475, 'eval_samples_per_second': 10.177, 'eval_steps_per_second': 1.272, 'epoch': 1.0}
{'train_runtime': 480.4779, 'train_samples_per_second': 11.72, 'train_steps_per_second': 1.465, 'train_loss': 0.49172663959589874, 'epoch': 1.0}


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


TrainOutput(global_step=704, training_loss=0.49172663959589874, metrics={'train_runtime': 480.4779, 'train_samples_per_second': 11.72, 'train_steps_per_second': 1.465, 'total_flos': 1472279909280768.0, 'train_loss': 0.49172663959589874, 'epoch': 1.0})

In [None]:
# 检查函数：检测句子中是否包含至少一个错误标签
def contains_incorrect_label(label_sequence):
    # 如果标签序列全是 'O' 标签，则返回 False（保留该句子）
    if all(label == "O" for label in label_sequence):
        return False
    # 如果存在标签不在合法标签集中，则返回 True（表示该句子含有错误标签）
    return any(label not in label_dict for label in label_sequence)

# 检查函数：检测标签是否符合预期
def is_incorrect_label_sequence(label_sequence):
    for i, label in enumerate(label_sequence):
        # 1. 检查标签是否在合法标签集中
        if label not in label_dict:
            return True
    return False

# 找出标错的句子
incorrect_labels_df = df[df['ner_tags'].apply(contains_incorrect_label)]

In [10]:
print(incorrect_labels_df)

                                                  tokens  \
0      [i, t,  , s, y, m, b, o, l, i, z, e, s,  , t, ...   
43     [s, u, c, h,  , m, e, a, s, u, r, e, s,  , h, ...   
156    [i, n,  , a, d, d, i, t, i, o, n,  , ,,  , t, ...   
440    [w, e,  , a, r, e,  , a, w, a, r, e,  , t, h, ...   
443    [o, u, r,  , a, p, p, r, o, a, c, h,  , t, o, ...   
...                                                  ...   
11321  [v, a, l, u, e, m, a, x,  , p, u, b, l, i, c, ...   
11336  [m, a, n, a, g, e, m, e, n, t,  , t, e, a, m, ...   
11364  [b, y,  , s, y, s, t, e, m, a, t, i, c, a, l, ...   
11385  [t, o, p, i, c,  , :,  , i, n, d, i, r, e, c, ...   
11386  [g, e, n, e, r, a, l,  , s, t, a, n, d, a, r, ...   

                                                ner_tags  
0      [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
43     [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
156    [O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-S...  
440    [O, O, O, O, O, O, O, O, O, O, O, O,