# 使用pipeline

In [None]:
from transformers.pipelines import pipeline
# Build a Hugging Face feature-extraction pipeline backed by the
# bert-base-chinese checkpoint.
embedding_model = pipeline(task="feature-extraction", model="bert-base-chinese")

# Embed a single sample sentence and keep the raw pipeline output in `embs`.
sample_text = '今天天气很好'
embs = embedding_model(sample_text)

# Show the first vector of the first result. Presumably the output is indexed
# [sentence][token][hidden], making this the [CLS] embedding — TODO confirm
# against the pipeline documentation.
embs[0][0]

# 使用自己写的代码

In [None]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np
# Input sentences to embed (hard-coded here rather than read from a file).
sentences = ['今天天气很好']

# Load the encoder weights and the matching tokenizer from one checkpoint.
model_name = "bert-base-chinese"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Use the GPU when one is available, otherwise the CPU, and put the model
# into evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# Split the sentences into batches and preview every batch before encoding.
batch_size = 16  # number of sentences per batch
data_loader = DataLoader(sentences, batch_size=batch_size)
for batch in data_loader:
    print(len(batch), batch)

# Encode batch by batch with gradient tracking disabled, keeping only the
# hidden state at position 0 (the [CLS] token) of every sentence.
cls_embeddings = []
with torch.no_grad():
    for batch_sentences in tqdm(data_loader):
        # Tokenize the batch and move the resulting tensors to the model's device.
        inputs = tokenizer(
            batch_sentences,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=512,
        ).to(device)
        outputs = model(**inputs)
        first_token_states = outputs.last_hidden_state[:, 0, :]
        cls_embeddings.append(first_token_states.cpu().numpy())

# Stack the per-batch arrays into one array (one row per sentence) and show
# the vector of the first sentence.
cls_embeddings = np.vstack(cls_embeddings)
cls_embeddings[0]

In [None]:
# Check that the hand-written path reproduces the pipeline's first-token vector.
# The two vectors come from separate forward passes (the manual path may run
# on CUDA while the pipeline was built without an explicit device), so tiny
# floating-point differences are expected. Compare with a tolerance instead of
# demanding bit-for-bit equality, and keep returning False on a shape mismatch.
pipeline_vector = np.asarray(embs[0][0])
manual_vector = cls_embeddings[0]
pipeline_vector.shape == manual_vector.shape and np.allclose(
    pipeline_vector, manual_vector, rtol=1e-4, atol=1e-5
)