In [None]:
import torch

# Example data: citation trails may differ in length
citation_trail = [
    [12345, 67890, 11223],        # length 3
    [44556, 77889],               # length 2
    [11223, 44556, 12345, 67890]  # length 4
]

# Special tokens; they take the lowest ids of the vocabulary
special_tokens = {
    '<pad>': 0,
    '<mask>': 1,  # mask token used by the masked-language-model objective
}

# Gather every distinct PID that occurs in any trail
all_pids = {pid for trail in citation_trail for pid in trail}

# Build the PID -> model input id mapping. Special tokens keep their ids and
# the PIDs (in ascending order) are numbered consecutively right after them,
# so with two special tokens the first PID receives id 2.
print("Start creating PID to model input IDs...")
sorted_pids = sorted(all_pids)
first_pid_idx = len(special_tokens)
pid_to_idx = dict(special_tokens)
pid_to_idx.update(zip(sorted_pids, range(first_pid_idx, first_pid_idx + len(sorted_pids))))
from itertools import islice
num_items = 5
first_items = {key: value for key, value in islice(pid_to_idx.items(), num_items)}
print("Finished. PID to model input IDs (first 5 items):", first_items)
# Size of the whole vocabulary: unique PIDs plus both special tokens
num_pids = len(pid_to_idx)
print(f"Total number of unique PIDs (plus padding value): {num_pids}")

# Sequence length after truncation/padding, and the id used for padding
max_length = 10
padding_value = 0

from torch.utils.data import Dataset, DataLoader
# Custom dataset class
class CitationTrailDataset(Dataset):
    """Dataset that serves citation trails as fixed-length tensors of ids.

    Each item is one trail whose PIDs are translated through ``pid_to_idx``
    and then truncated or right-padded to ``max_length`` entries.
    """

    def __init__(self, citation_trails, pid_to_idx, max_length=5, pad_value=padding_value):
        """Store the raw trails and the conversion settings.

        Args:
            citation_trails: indexable collection of trails, each an iterable of PIDs.
            pid_to_idx: mapping from PID to model input id.
            max_length: number of ids in every returned tensor.
            pad_value: id used for padding and for PIDs missing from ``pid_to_idx``.
        """
        self.citation_trails = citation_trails
        self.pid_to_idx = pid_to_idx
        self.max_length = max_length
        self.pad_value = pad_value

    def __len__(self):
        """Return the number of stored trails."""
        return len(self.citation_trails)

    def __getitem__(self, idx):
        """Return trail ``idx`` as a 1-D ``torch.long`` tensor of ids."""
        lookup = self.pid_to_idx.get
        # Translate PIDs to ids; PIDs absent from the mapping become pad_value
        ids = [lookup(pid, self.pad_value) for pid in self.citation_trails[idx]]
        # Cut over-long trails, then fill the remainder with padding
        ids = ids[:self.max_length]
        ids.extend([self.pad_value] * (self.max_length - len(ids)))

        return torch.tensor(ids, dtype=torch.long)

# Build the dataset, handing it the PID -> index mapping
dataset = CitationTrailDataset(
    citation_trails=citation_trail,
    pid_to_idx=pid_to_idx,
    max_length=max_length,
)
# Wrap it in a DataLoader that yields shuffled batches of two trails
dataloader = DataLoader(dataset=dataset, batch_size=2, shuffle=True)
# Show the batches; the loop walks the whole loader, so every batch is printed
print("Print a batch:")
for batch in dataloader:
    print(batch)

from transformers import RobertaForMaskedLM, RobertaConfig
# Settings for a custom RobertaConfig
roberta_settings = dict(
    vocab_size=num_pids,               # vocabulary size (PIDs plus special tokens)
    pad_token_id=pid_to_idx['<pad>'],  # id of the padding token
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    max_position_embeddings=514,       # usually 512 + 2
)
config = RobertaConfig(**roberta_settings)
# Initialise the model from the custom configuration
model = RobertaForMaskedLM(config=config)
# Inspect the token-embedding layer
embedding_shape = model.roberta.embeddings.word_embeddings.weight.shape
print("Shape of embedding layer:", embedding_shape)

from torch.optim import AdamW
# Use the GPU when CUDA is available and fall back to the CPU otherwise, so
# the notebook also runs on machines without a CUDA device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Create the optimizer once the model sits on its target device
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training loop
epochs = 3
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in dataloader:
        inputs = batch.to(device)

        # Attention mask: 1 on real tokens, 0 on padding
        attention_mask = inputs.ne(padding_value).long()

        # Labels start out as a copy of the still-unmasked inputs
        labels = inputs.clone()
        is_token = labels.ne(padding_value)
        rand = torch.rand(inputs.shape).to(device)
        # Each non-padding position is selected with probability 0.15
        mask_arr = (rand < 0.15) & is_token

        # Nothing was selected: mask one randomly chosen non-padding position
        if not mask_arr.any():
            # (row, column) coordinates of every non-padding token
            non_pad_indices = is_token.nonzero(as_tuple=False)
            if non_pad_indices.size(0) > 0:
                choice = torch.randint(0, non_pad_indices.size(0), (1,)).item()
                row, col = non_pad_indices[choice]
                mask_arr[row, col] = True

        # Keep the original id only on masked positions; everything else is
        # set to -100 (presumably the value the loss ignores — confirm against
        # the transformers documentation)
        labels.masked_fill_(~mask_arr, -100)

        # Put the <mask> token on the selected positions of the input
        inputs.masked_fill_(mask_arr, pid_to_idx['<mask>'])

        # Sanity checks on the prepared batch
        assert inputs.max() < num_pids, f"Input ID {inputs.max()} 超出词汇表范围 (vocab_size={num_pids})"
        assert inputs.min() >= 0, f"Input ID {inputs.min()} 为负数"
        assert (labels != -100).sum() > 0, "当前批次没有被掩盖的 token，无法计算损失"

        # Forward pass
        outputs = model(input_ids=inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        # Backward pass and parameter update
        optimizer.zero_grad()
        loss.backward()
        # Optional: clip the gradient norm
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

    # Mean loss over the batches of this epoch
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch + 1}, Loss: {avg_loss:.4f}")


Start creating PID to model input IDs...
Finished. PID to model input IDs (first 5 items): {'<pad>': 0, '<mask>': 1, 11223: 2, 12345: 3, 44556: 4}
Total number of unique PIDs (plus padding value): 7
Print a batch:
tensor([[2, 4, 3, 5, 0, 0, 0, 0, 0, 0],
        [3, 5, 2, 0, 0, 0, 0, 0, 0, 0]])
tensor([[4, 6, 0, 0, 0, 0, 0, 0, 0, 0]])
Shape of embedding layer: torch.Size([7, 768])
Epoch 1, Loss: 3.3882
Epoch 2, Loss: 2.5744
Epoch 3, Loss: 2.8493


In [15]:
# Grab the token-embedding layer of the model
embedding_layer = model.roberta.embeddings.word_embeddings

# PID to inspect
pid = 12345

# Row of the embedding matrix assigned to that PID
idx = pid_to_idx[pid]

# Pull the vector out as a NumPy array, detached from the autograd graph
weights = embedding_layer.weight.detach()
embedding = weights[idx].cpu().numpy()

# Show the embedding vector
print(f"PID: {pid}, \nEmbedding: {embedding}")


PID: 12345, 
Embedding: [-1.51849678e-02 -1.69298649e-02  1.37473112e-02  1.45901022e-02
 -1.46607589e-02  5.85053535e-03 -2.43586022e-02  1.64092856e-03
  2.37097917e-03 -1.72557272e-02 -1.76676475e-02 -1.88329685e-02
 -5.31993173e-02  2.04114011e-03  1.12546491e-03  1.91159244e-03
  5.82950423e-03  1.28991753e-02 -1.06629888e-02  1.12393694e-02
 -5.55666350e-03 -1.47349834e-02  1.85030848e-02 -1.12575823e-02
  8.79686978e-03 -1.96818006e-03 -2.48249825e-02  1.17827738e-02
 -9.48400889e-03 -5.76315587e-03  2.57337326e-03 -5.57135837e-03
  1.88882370e-02 -1.47756711e-02  6.67076977e-03 -1.71374585e-02
  1.88149344e-02 -1.30892433e-02  1.30065866e-02 -3.13645937e-02
 -6.34490373e-03 -1.21062268e-02  9.30532999e-03  2.26146858e-02
 -1.09655214e-02  2.60666981e-02 -1.16989398e-02 -2.86462829e-02
  5.46006858e-03  6.94196858e-03 -2.11483594e-02 -7.33413640e-03
  2.25870814e-02 -9.00265109e-03 -1.61297980e-03  2.96900999e-02
  9.15159471e-04 -9.01567098e-03  1.55106010e-02 -1.18190683e-02
 