In [89]:
import json
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertModel, BertTokenizer, AdamW
from sklearn.metrics import classification_report

In [90]:
# Hyperparameters and model configuration
BATCH_SIZE = 16  # number of examples per DataLoader batch
EPOCHS = 3  # default number of passes over the training data used by train()
LEARNING_RATE = 2e-5  # learning rate handed to the AdamW optimizer
PRETRAINED_MODEL_NAME = "bert-base-chinese"  # checkpoint name used for both the tokenizer and the encoder
MAX_LEN = 128  # every encoded input is padded/truncated to this many tokens

In [91]:
# Read data
def load_data(file_path):
    """Read a JSON Lines file and return its records as a list.

    Every non-blank line of ``file_path`` is parsed with ``json.loads``.
    Blank or whitespace-only lines (for example an extra newline at the end
    of the file) are skipped instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path of a UTF-8 encoded file holding one JSON document
            per line.

    Returns:
        list: The parsed JSON values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    records = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Nothing to parse on an empty line; json.loads('') would raise.
                continue
            records.append(json.loads(line))
    return records
# Load the train and test splits, then print each split's size and first record.
# The printed labels '训练集' / '测试集' mean "training set" / "test set".
train_data = load_data('train.json')
test_data = load_data('test.json')
print('训练集：', len(train_data), train_data[0])
print('测试集：', len(test_data), test_data[0])

训练集： 100 {'rel': '主演', 'ent1': '周星驰', 'ent2': '喜剧之王', 'text': '如何演好自己的角色，请读《演员自我修养》《喜剧之王》周星驰崛起于穷困潦倒之中的独门秘笈'}
测试集： 10 {'rel': '出生地', 'ent1': '圣地亚哥', 'ent2': '查尔斯·阿兰基斯', 'text': '查尔斯·阿兰基斯（Charles Aránguiz），1989年4月17日出生于智利圣地亚哥，智利职业足球运动员，司职中场，效力于德国足球甲级联赛勒沃库森足球俱乐部'}


In [92]:
# Relation types: a name's position in the tuple below is its class id (0 = 'UNK').
id2rel = dict(enumerate((
    'UNK', '主演', '歌手', '简称', '总部地点', '导演',
    '出生地', '目', '出生日期', '占地面积', '上映时间',
    '出版社', '作者', '号', '父亲', '毕业院校',
    '成立日期', '改编自', '主持人', '所属专辑',
    '连载网站', '作词', '作曲', '创始人', '丈夫',
    '妻子', '朝代', '民族', '国籍', '身高', '出品公司',
    '母亲', '编剧', '首都', '面积', '祖籍', '嘉宾',
    '字', '海拔', '注册资本', '制片人', '董事长', '所在城市',
    '气候', '人口数量', '邮政编码', '主角', '官方语言', '修业年限',
)))
# Inverse lookup: relation name -> class id.
rel2id = {name: idx for idx, name in id2rel.items()}
# Size of the classifier's output layer.
num_labels = len(rel2id)
print('关系类型：', num_labels)

关系类型： 49


In [93]:
# Custom dataset
class RelationExtractionDataset(Dataset):
    """Torch dataset that turns relation-extraction records into BERT inputs.

    Each record is a dict with the keys ``'text'``, ``'ent1'``, ``'ent2'`` and
    ``'rel'``. The two entity mentions are wrapped in ``<E1>…</E1>`` and
    ``<E2>…</E2>`` markers before the sentence is tokenized.

    Args:
        data: Sequence of record dicts as described above.
        tokenizer: Object exposing ``encode_plus`` (called with
            ``return_tensors='pt'``).
        rel2id: Mapping from relation name to integer class id; used to
            encode each record's ``'rel'`` value.
        max_len: Length every encoded example is padded/truncated to.
    """

    def __init__(self, data, tokenizer, rel2id, max_len=128):
        self.data = data
        self.tokenizer = tokenizer
        # Keep the mapping on the instance so that labels come from the
        # argument the caller passed, not from a same-named notebook global.
        self.rel2id = rel2id
        self.max_len = max_len

    def __len__(self):
        """Return the number of records."""
        return len(self.data)

    def __getitem__(self, idx):
        """Encode record ``idx``.

        Returns:
            dict: ``'input_ids'`` and ``'attention_mask'`` (1-D tensors with
            ``max_len`` entries) and ``'label'`` (the class id of the
            record's relation).

        Raises:
            KeyError: If the record's relation is missing from ``rel2id``.
        """
        item = self.data[idx]
        text = item['text']
        ent1 = item['ent1']
        ent2 = item['ent2']
        rel = item['rel']

        # Build the sentence with entity markers.
        # Marking scheme follows: https://github.com/buppt/ChineseNRE/blob/master/data/SemEval2010_task8_all_data/TRAIN_FILE.TXT
        # NOTE(review): the markers are plain text here. In this notebook's
        # recorded output both "E1" and "E2" appear to be encoded as id 100,
        # so the model may not be able to tell the two entities apart -
        # consider registering the markers as special tokens. TODO confirm.
        marked_text = text.replace(ent1, f"<E1>{ent1}</E1>").replace(ent2, f"<E2>{ent2}</E2>")

        inputs = self.tokenizer.encode_plus(
            marked_text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Drop only the leading batch axis; a bare squeeze() would also drop
        # the sequence axis when max_len == 1.
        input_ids = inputs['input_ids'].squeeze(0)
        attention_mask = inputs['attention_mask'].squeeze(0)

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'label': self.rel2id[rel]
        }

In [94]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Wrap both splits with the same tokenizer, label mapping and sequence length.
train_dataset, test_dataset = (
    RelationExtractionDataset(split, tokenizer, rel2id, MAX_LEN)
    for split in (train_data, test_data)
)

# Only the training loader shuffles; the test loader keeps file order.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [95]:
# Sanity check: fetch the first training example to inspect its encoded tensors and label
train_dataset[0]

{'input_ids': tensor([ 101, 1963,  862, 4028, 1962, 5632, 2346, 4638, 6235, 5682, 8024, 6435,
         6438,  517, 4028, 1447, 5632, 2769,  934, 1075,  518,  517,  133,  100,
          135, 1599, 1196,  722, 4374,  133,  120,  100,  135,  518,  133,  100,
          135, 1453, 3215, 7720,  133,  120,  100,  135, 2307, 6629,  754, 4956,
         1737, 4057,  948,  722,  704, 4638, 4324, 7305, 4908, 5007,  102,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1

In [96]:

# Model definition
class BertForRelationExtraction(nn.Module):
    """BERT encoder followed by dropout and a linear layer over relation classes.

    Args:
        num_labels: Number of relation classes, i.e. the width of the
            classifier output.
    """

    def __init__(self, num_labels):
        super().__init__()
        encoder = BertModel.from_pretrained(PRETRAINED_MODEL_NAME)
        self.bert = encoder
        self.dropout = nn.Dropout(0.3)
        # Relation extraction is modelled as a plain multi-class classification task.
        self.classifier = nn.Linear(encoder.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        """Score each input sequence against every relation class.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Optional mask forwarded to the encoder.
            labels: Optional gold class ids; when given, a cross-entropy
                loss is computed against the logits.

        Returns:
            tuple: ``(loss, logits)`` where ``loss`` is ``None`` if
            ``labels`` is ``None``.
        """
        encoded = self.bert(input_ids, attention_mask=attention_mask)
        # The pooler output is the [CLS] token passed through a dense layer
        # and a Tanh activation; it serves as the sentence feature vector.
        # https://blog.csdn.net/zhaohongfei_358/article/details/127960742
        sentence_vec = self.dropout(encoded.pooler_output)
        logits = self.classifier(sentence_vec)

        if labels is None:
            return None, logits

        criterion = nn.CrossEntropyLoss()
        return criterion(logits, labels), logits

In [97]:
# Create the model on the GPU when one is available (CPU otherwise) and its optimizer
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = BertForRelationExtraction(num_labels)
model = model.to(device)
optimizer = AdamW(params=model.parameters(), lr=LEARNING_RATE)

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [98]:
# Training function
def train(model, dataloader, optimizer, device, epochs=EPOCHS):
    """Fine-tune ``model`` on ``dataloader`` and print running losses.

    The running mean loss is printed after every second batch, and the
    epoch's mean loss after each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` and returning ``(loss, logits)``.
        dataloader: Sized iterable of batches; each batch maps
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` to tensors.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.
        device: Device the batch tensors are moved to.
        epochs: Number of passes over ``dataloader``.
    """
    model.train()
    for epoch_no in range(1, epochs + 1):
        running_loss = 0
        for step, batch in enumerate(dataloader, start=1):
            # Move the three tensors of the batch onto the target device.
            tensors = {
                key: batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'label')
            }

            optimizer.zero_grad()
            loss, _ = model(
                tensors['input_ids'],
                attention_mask=tensors['attention_mask'],
                labels=tensors['label'],
            )
            loss.backward()
            optimizer.step()
            running_loss += loss.item()

            if step % 2 == 0:
                print(f"Epoch {epoch_no}/{epochs}, Batch {step}/{len(dataloader)}, Loss: {running_loss / step:.4f}")

        print(f"Epoch {epoch_no} completed. Average Loss: {running_loss / len(dataloader):.4f}")

In [99]:
# Evaluation function
def evaluate(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and return the mean loss and a text report.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` and returning ``(loss, logits)``.
        dataloader: Sized iterable of batches; each batch maps
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` to tensors.
        device: Device the batch tensors are moved to.

    Returns:
        tuple: ``(avg_loss, report)`` where ``avg_loss`` is the loss averaged
        over batches and ``report`` is the string produced by
        ``classification_report`` on relation names.
    """
    model.eval()
    total_loss = 0
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            loss, logits = model(input_ids, attention_mask=attention_mask, labels=labels)
            total_loss += loss.item()
            # tolist() yields plain Python ints usable as id2rel keys.
            preds.extend(torch.argmax(logits, dim=1).tolist())
            true_labels.extend(labels.tolist())

    avg_loss = total_loss / len(dataloader)
    pred_tags = [id2rel[pred] for pred in preds]
    true_tags = [id2rel[label] for label in true_labels]

    # sklearn expects flat label sequences; wrapping them in another list
    # (the seqeval calling convention) makes it reject the input.
    # zero_division=0 reports 0.0 for classes without predictions or support
    # instead of emitting a warning for each of them.
    report = classification_report(true_tags, pred_tags, zero_division=0)
    return avg_loss, report

In [105]:
# Evaluation function
def evaluate(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` and print the results.

    Prints the predicted and expected relation names, the loss averaged over
    batches, and the per-class report from ``classification_report``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...,
            labels=...)`` and returning ``(loss, logits)``.
        dataloader: Sized iterable of batches; each batch maps
            ``'input_ids'``, ``'attention_mask'`` and ``'label'`` to tensors.
        device: Device the batch tensors are moved to.

    Returns:
        None. All results are printed.
    """
    model.eval()
    total_loss = 0
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            loss, logits = model(input_ids, attention_mask=attention_mask, labels=labels)

            total_loss += loss.item()
            # tolist() yields plain Python ints usable as id2rel keys.
            preds.extend(torch.argmax(logits, dim=1).tolist())
            true_labels.extend(labels.tolist())

    avg_loss = total_loss / len(dataloader)
    # Map class ids back to relation names (avoid shadowing the builtin `id`).
    pred_tags = [id2rel[label_id] for label_id in preds]
    true_tags = [id2rel[label_id] for label_id in true_labels]
    # zero_division=0 reports 0.0 for classes without predictions or support
    # instead of emitting a warning for each of them.
    report = classification_report(true_tags, pred_tags, zero_division=0)
    print(f"Predicted: {pred_tags}")
    print(f"Expected: {true_tags}")
    print(f"Test Loss: {avg_loss:.4f}")
    print(f"Test Report:\n{report}")

In [104]:
# Prediction function
def predict(model, tokenizer, text, ent1, ent2, device):
    """Predict the relation between two entity mentions in one sentence.

    The mentions are wrapped in ``<E1>…</E1>`` / ``<E2>…</E2>`` markers, the
    marked sentence is encoded to ``MAX_LEN`` tokens, and the class with the
    highest logit is mapped back to its name through ``id2rel``.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` and
            returning ``(loss, logits)``.
        tokenizer: Object exposing ``encode_plus``.
        text: Sentence containing both mentions.
        ent1: Surface form of the first entity.
        ent2: Surface form of the second entity.
        device: Device the encoded tensors are moved to.

    Returns:
        The relation name looked up in ``id2rel``.
    """
    with_first_marked = text.replace(ent1, f"<E1>{ent1}</E1>")
    marked_text = with_first_marked.replace(ent2, f"<E2>{ent2}</E2>")

    encode_options = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    encoded = tokenizer.encode_plus(marked_text, **encode_options)

    input_ids = encoded['input_ids'].to(device)
    attention_mask = encoded['attention_mask'].to(device)

    model.eval()
    with torch.no_grad():
        # No labels are passed, so the returned loss is ignored.
        _, logits = model(input_ids, attention_mask=attention_mask)
        best_class = torch.argmax(logits, dim=1).cpu().numpy()[0]

    return id2rel[best_class]

In [102]:
# Train the model on the training loader (epochs falls back to its default)
train(model=model, dataloader=train_loader, optimizer=optimizer, device=device)

Epoch 1/3, Batch 2/7, Loss: 4.0118
Epoch 1/3, Batch 4/7, Loss: 4.0015
Epoch 1/3, Batch 6/7, Loss: 3.9663
Epoch 1 completed. Average Loss: 3.9788
Epoch 2/3, Batch 2/7, Loss: 3.8110
Epoch 2/3, Batch 4/7, Loss: 3.8130
Epoch 2/3, Batch 6/7, Loss: 3.7637
Epoch 2 completed. Average Loss: 3.6701
Epoch 3/3, Batch 2/7, Loss: 3.6708
Epoch 3/3, Batch 4/7, Loss: 3.5528
Epoch 3/3, Batch 6/7, Loss: 3.4983
Epoch 3 completed. Average Loss: 3.4159


In [106]:
# Evaluate the model on the test loader
evaluate(model=model, dataloader=test_loader, device=device)

555 ['出生日期', '出生日期', '作者', '作者', '主演', '主演', '出生日期', '出生日期', '作者', '作者']
666 ['出生地', '出生日期', '歌手', '作曲', '出品公司', '导演', '妻子', '丈夫', '作者', '作者']
Test Loss: 3.2158
Test Report:
              precision    recall  f1-score   support

          丈夫       0.00      0.00      0.00         1
          主演       0.00      0.00      0.00         0
          作曲       0.00      0.00      0.00         1
          作者       0.50      1.00      0.67         2
        出品公司       0.00      0.00      0.00         1
         出生地       0.00      0.00      0.00         1
        出生日期       0.25      1.00      0.40         1
          妻子       0.00      0.00      0.00         1
          导演       0.00      0.00      0.00         1
          歌手       0.00      0.00      0.00         1

    accuracy                           0.30        10
   macro avg       0.07      0.20      0.11        10
weighted avg       0.12      0.30      0.17        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [107]:
# Predict the relation for one hand-written example
sample_text = "韦棣华女士1861年8月22日出生于美国纽约州"
sample_ent1, sample_ent2 = "韦棣华", "美国纽约州"
predicted_relation = predict(
    model=model,
    tokenizer=tokenizer,
    text=sample_text,
    ent1=sample_ent1,
    ent2=sample_ent2,
    device=device,
)
print(f"Predicted Relation: {predicted_relation}")

Predicted Relation: 出生日期
