每个 token 都有一个对应的标签。


# 准备数据


### 构建数据集

将原始数据处理成：{'sentence': '海钓比赛地点在厦门与金门之间的海域。', 'labels': [[7, 8, '厦门', 'LOC'], [10, 11, '金门', 'LOC']]}


In [1]:
from torch.utils.data import Dataset

categories = set()


class PeopleDaily(Dataset):
    def __init__(self, data_file):
        self.data = self.load_data(data_file)

    def load_data(self, data_file):
        Data = {}
        with open(data_file, "rt") as f:
            for idx, line in enumerate(f.read().split("\n\n")):
                if not line:
                    break
                sentence, labels = "", []
                for i, item in enumerate(line.split("\n")):
                    char, tag = item.split(" ")
                    sentence += char
                    if tag.startswith("B"):
                        labels.append([i, i, char, tag[2:]])  # Remove the B- or I-
                        categories.add(tag[2:])
                    elif tag.startswith("I"):
                        labels[-1][1] = i
                        labels[-1][2] += char
                Data[idx] = {"sentence": sentence, "labels": labels}
        return Data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

In [2]:
train_data = PeopleDaily("data/china-people-daily-ner-corpus/example.train")
vaild_data = PeopleDaily("data/china-people-daily-ner-corpus/example.dev")
test_data = PeopleDaily("data/china-people-daily-ner-corpus/example.test")
print(train_data[0])
print(categories)

{'sentence': '海钓比赛地点在厦门与金门之间的海域。', 'labels': [[7, 8, '厦门', 'LOC'], [10, 11, '金门', 'LOC']]}
{'PER', 'LOC', 'ORG'}


### 数据预处理

将实体标签从原文映射到切分出的 token 上，同时将 [CLS]、[SEP]、[PAD] 等特殊 token 对应的标签设为 -100，
如：`[-100,    1,    2,    0,    0,    0,    1,    2,    2, -100]`

上面 label 中 [7, 8, '厦门', 'LOC'] `7`,`8` 对应的是 char。所以需要 char -> token


##### 建立标签映射词典


In [3]:
id2label = {}
id2label[0] = "0"
for item in categories:
    id2label[len(id2label)] = f"B-{item}"
    id2label[len(id2label)] = f"I-{item}"

label2id = {v: k for k, v in id2label.items()}
print(id2label)
print(label2id)

{0: '0', 1: 'B-PER', 2: 'I-PER', 3: 'B-LOC', 4: 'I-LOC', 5: 'B-ORG', 6: 'I-ORG'}
{'0': 0, 'B-PER': 1, 'I-PER': 2, 'B-LOC': 3, 'I-LOC': 4, 'B-ORG': 5, 'I-ORG': 6}


In [4]:
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import numpy as np
import torch

check_point = "bert-base-chinese"
tokenizer = AutoTokenizer.from_pretrained(check_point)


def collate_fn(batch_samples):
    batch_sentence, batch_tags = [], []
    for sample in batch_samples:
        batch_sentence.append(sample["sentence"])
        batch_tags.append(sample["labels"])

    batch_inputs = tokenizer(
        batch_sentence,
        padding=True,
        truncation=True,
        return_tensors="pt",
    )

    batch_label = np.zeros(batch_inputs["input_ids"].shape, dtype=int)
    for s_idx, sentence in enumerate(batch_sentence):
        encoding = tokenizer(sentence)
        batch_label[s_idx][0] = -100
        batch_label[s_idx][len(encoding.tokens()) - 1 :] = -100
        for char_start, char_end, _, tag in batch_tags[s_idx]:
            token_start = encoding.char_to_token(char_start)
            token_end = encoding.char_to_token(char_end)
            batch_label[s_idx][token_start] = label2id[f"B-{tag}"]
            batch_label[s_idx][token_start + 1 : token_end + 1] = label2id[f"I-{tag}"]

    return batch_inputs, torch.tensor(batch_label)


train_dataloader = DataLoader(
    train_data, batch_size=4, shuffle=True, collate_fn=collate_fn
)
dev_dataloader = DataLoader(
    vaild_data, batch_size=4, shuffle=True, collate_fn=collate_fn
)
test_dataloader = DataLoader(
    test_data, batch_size=4, shuffle=True, collate_fn=collate_fn
)

batch_X, batch_y = next(iter(train_dataloader))
print(batch_X)
print(batch_y)

{'input_ids': tensor([[ 101,  852,  969, 1963,  769, 6858, 5143, 5320, 5543, 3354, 2768, 1963,
         3634, 4906, 2110, 4638, 5052, 4415, 5381, 5317, 8024, 1814, 2356, 6820,
          833, 1853, 6756,  720, 8043,  102,    0,    0,    0,    0,    0,    0,
            0,    0,    0],
        [ 101, 4510, 1213, 1814, 4638,  782, 1440, 6401, 6381, 5442, 8024, 1158,
         2456, 4510, 1213, 1814, 4638, 1325, 1380, 8024,  794, 3378, 4905, 4923,
         2428,  677, 6432, 2218, 3221,  680, 5774, 4030, 3159,  751, 4638, 1325,
         1380,  511,  102],
        [ 101, 1398, 3198, 6858, 6814, 2347, 2458, 6792, 4638, 3940, 6887, 2972,
         1220,  704, 1912, 5307, 3845, 1394,  868, 8024, 1158, 6863, 3696, 7313,
         1912,  769, 3173, 4638,  831, 1232,  511,  102,    0,    0,    0,    0,
            0,    0,    0],
        [ 101, 1266, 2785, 3777, 4696, 3221,  671,  702, 6225, 7881, 4638,  100,
         1921, 1828,  100, 8013,  102,    0,    0,    0,    0,    0,    0,    0,
           

# 训练模型


### 构建模型


In [5]:
from transformers import BertPreTrainedModel, BertModel, AutoConfig
import torch.nn as nn

device = "cuda" if torch.cuda.is_available() else "cpu"


class BertForNER(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, len(id2label))
        self.post_init()

    def forward(self, x):
        bert_output = self.bert(**x)
        sequence_output = bert_output.last_hidden_state
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)
        return logits


config = AutoConfig.from_pretrained(check_point)
model = BertForNER.from_pretrained(check_point, config=config).to(device)
print(model)

Some weights of BertForNER were not initialized from the model checkpoint at bert-base-chinese and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForNER(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

### 优化模型参数


计算损失、优化模型参数


In [6]:
def train_loop(dataloader, model, loss_fn, optimizer, lr_scheduler, epoch, total_loss):
    model.train()
    for batch_num, (X, y) in enumerate(dataloader, start=1):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred.permute(0, 2, 1), y)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        lr_scheduler.step()

        total_loss += loss.item()

    return total_loss

在验证/测试循环中评估模型性能


In [11]:
from seqeval.metrics import classification_report
from seqeval.scheme import IOB2


def test_loop(dataloader, model):
    true_labels, true_predictions = [], []

    model.eval()
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            predictions = pred.argmax(dim=-1).cpu().numpy().tolist()
            labels = y.cpu().numpy().tolist()
            true_labels += [
                [id2label[int(l)] for l in label if l != -100] for label in labels
            ]
            true_predictions += [
                [id2label[int(p)] for (p, l) in zip(prediction, label) if l != -100]
                for prediction, label in zip(predictions, labels)
            ]
    print(
        classification_report(true_labels, true_predictions, mode="strict", scheme=IOB2)
    )

组合为 epoch


In [13]:
from transformers import AdamW, get_scheduler

epoch_num = 3
learning_rate = 1e-5

loss_fn = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=learning_rate)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_dataloader) * epoch_num,
)

best_f1 = 0
total_loss = 0
for t in range(epoch_num):
    print(f"Epoch {t+1}\n-------------------------------")
    total_loss = train_loop(
        train_dataloader, model, loss_fn, optimizer, lr_scheduler, t + 1, total_loss
    )
    metrics = test_loop(dev_dataloader, model)
    valid_macro_f1, valid_micro_f1 = (
        metrics["macro avg"]["f1-score"],
        metrics["micro avg"]["f1-score"],
    )
    valid_f1 = metrics["weighted avg"]["f1-score"]

    if valid_f1 > best_f1:
        best_f1 = valid_f1
        torch.save(
            model.state_dict(),
            f"epoch_{t+1}_valid_macrof1_{(100*valid_macro_f1):0.3f}_microf1_{(100*valid_micro_f1):0.3f}_weights.bin",
        )


print("Finished!")

Epoch 1
-------------------------------


KeyboardInterrupt: 

# 测试模型


In [None]:
sentence = "日本外务省3月18日发布消息称，日本首相岸田文雄将于19至21日访问印度和柬埔寨。"

model.load_state_dict(
    torch.load(
        "epoch_3_valid_macrof1_95.878_microf1_96.049_weights.bin",
        map_location=torch.device(device),
    )
)
model.eval()
results = []

首先从输出中取出“B-”或“I-”开头的 token，然后将这些 token 组合成实体，最后将实体对应的 token 的平均概率作为实体的概率。


In [None]:
import json

model.load_state_dict(
    torch.load(
        "epoch_3_valid_macrof1_95.878_microf1_96.049_weights.bin",
        map_location=torch.device("cpu"),
    )
)
model.eval()
with torch.no_grad():
    print("evaluating on test set...")
    true_labels, true_predictions = [], []
    for X, y in tqdm(test_dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        predictions = pred.argmax(dim=-1).cpu().numpy().tolist()
        labels = y.cpu().numpy().tolist()
        true_labels += [
            [id2label[int(l)] for l in label if l != -100] for label in labels
        ]
        true_predictions += [
            [id2label[int(p)] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
    print(
        classification_report(true_labels, true_predictions, mode="strict", scheme=IOB2)
    )
    results = []
    print("predicting labels...")
    for s_idx in tqdm(range(len(test_data))):
        example = test_data[s_idx]
        inputs = tokenizer(example["sentence"], truncation=True, return_tensors="pt")
        inputs = inputs.to(device)
        pred = model(inputs)
        probabilities = (
            torch.nn.functional.softmax(pred, dim=-1)[0].cpu().numpy().tolist()
        )
        predictions = pred.argmax(dim=-1)[0].cpu().numpy().tolist()

        pred_label = []
        inputs_with_offsets = tokenizer(
            example["sentence"], return_offsets_mapping=True
        )
        tokens = inputs_with_offsets.tokens()
        offsets = inputs_with_offsets["offset_mapping"]

        idx = 0
        while idx < len(predictions):
            pred = predictions[idx]
            label = id2label[pred]
            if label != "O":
                label = label[2:]  # Remove the B- or I-
                start, end = offsets[idx]
                all_scores = [probabilities[idx][pred]]
                # Grab all the tokens labeled with I-label
                while (
                    idx + 1 < len(predictions)
                    and id2label[predictions[idx + 1]] == f"I-{label}"
                ):
                    all_scores.append(probabilities[idx + 1][predictions[idx + 1]])
                    _, end = offsets[idx + 1]
                    idx += 1

                score = np.mean(all_scores).item()
                word = example["sentence"][start:end]
                pred_label.append(
                    {
                        "entity_group": label,
                        "score": score,
                        "word": word,
                        "start": start,
                        "end": end,
                    }
                )
            idx += 1
        results.append(
            {
                "sentence": example["sentence"],
                "pred_label": pred_label,
                "true_label": example["labels"],
            }
        )
    with open("test_data_pred.json", "wt", encoding="utf-8") as f:
        for exapmle_result in results:
            f.write(json.dumps(exapmle_result, ensure_ascii=False) + "\n")

### 保存模型预测结果
