# 使用pipeline

In [1]:
from transformers.pipelines import pipeline
# Build a Hugging Face feature-extraction pipeline backed by bert-base-chinese.
embedding_model = pipeline(task="feature-extraction", model="bert-base-chinese")

# Run the pipeline on a single sentence.
embs = embedding_model('今天天气很好')

# Display the vector stored at position 0 of the first result.
embs[0][0]

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[-0.3553845286369324,
 0.48077651858329773,
 0.20319104194641113,
 0.5691454410552979,
 1.0333210229873657,
 -0.8313525319099426,
 0.3445739150047302,
 -0.9092084765434265,
 -0.9920079708099365,
 0.2351675182580948,
 0.20946961641311646,
 0.568382740020752,
 0.8029783964157104,
 0.017373034730553627,
 1.8900421857833862,
 -0.6497050523757935,
 1.0451767444610596,
 -1.3993682861328125,
 -1.0495959520339966,
 0.8101091980934143,
 -0.5843707323074341,
 0.6960948705673218,
 -1.121341347694397,
 -0.7917256951332092,
 -0.32384055852890015,
 0.5696032643318176,
 -0.5509737730026245,
 -0.8218489289283752,
 1.2479422092437744,
 1.5368176698684692,
 0.6405949592590332,
 -0.09939694404602051,
 -0.4520101547241211,
 0.2920151352882385,
 0.7910453081130981,
 -0.8409210443496704,
 0.9998199939727783,
 0.2853350043296814,
 -0.051960766315460205,
 -0.8492034077644348,
 0.3049454092979431,
 -0.1277051866054535,
 -0.6475430727005005,
 1.5440731048583984,
 1.2596133947372437,
 0.1145726665854454,
 -0.232

# 使用自己写的代码

In [2]:
import torch
from torch.utils.data import DataLoader
from transformers import BertTokenizer, BertModel
from tqdm import tqdm
import numpy as np
# Sentences to embed
sentences = ['今天天气很好']

# Load the pretrained model and its matching tokenizer
model_name = "bert-base-chinese"
model = BertModel.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)

# Use the GPU when one is available, otherwise the CPU, and switch to eval mode
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device).eval()

# Split the sentences into batches
batch_size = 16  # number of sentences per batch
data_loader = DataLoader(sentences, batch_size=batch_size)

# Print the size and contents of every batch
for batch in data_loader:
    print(len(batch), batch)

# Convert each batch into embeddings with gradient tracking disabled
cls_embeddings = []
with torch.no_grad():
    for batch_sentences in tqdm(data_loader):
        inputs = tokenizer(
            batch_sentences,
            padding=True,
            truncation=True,
            max_length=512,
            return_tensors="pt",
        ).to(device)
        outputs = model(**inputs)
        # Keep only the vector at sequence position 0 (the CLS token)
        cls_embeddings.append(outputs.last_hidden_state[:, 0].cpu().numpy())

# Stack the per-batch arrays along the first axis and show the first row
cls_embeddings = np.concatenate(cls_embeddings, axis=0)
cls_embeddings[0]

Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


1 ['今天天气很好']


100%|██████████| 1/1 [00:00<00:00, 25.99it/s]


array([-3.55384529e-01,  4.80776519e-01,  2.03191042e-01,  5.69145441e-01,
        1.03332102e+00, -8.31352532e-01,  3.44573915e-01, -9.09208477e-01,
       -9.92007971e-01,  2.35167518e-01,  2.09469616e-01,  5.68382740e-01,
        8.02978396e-01,  1.73730347e-02,  1.89004219e+00, -6.49705052e-01,
        1.04517674e+00, -1.39936829e+00, -1.04959595e+00,  8.10109198e-01,
       -5.84370732e-01,  6.96094871e-01, -1.12134135e+00, -7.91725695e-01,
       -3.23840559e-01,  5.69603264e-01, -5.50973773e-01, -8.21848929e-01,
        1.24794221e+00,  1.53681767e+00,  6.40594959e-01, -9.93969440e-02,
       -4.52010155e-01,  2.92015135e-01,  7.91045308e-01, -8.40921044e-01,
        9.99819994e-01,  2.85335004e-01, -5.19607663e-02, -8.49203408e-01,
        3.04945409e-01, -1.27705187e-01, -6.47543073e-01,  1.54407310e+00,
        1.25961339e+00,  1.14572667e-01, -2.32412770e-01,  9.55400243e-02,
       -6.40026867e-01,  3.66865098e-01,  4.16713923e-01,  9.32367420e+00,
        1.03354299e+00,  

In [3]:
# Exact element-wise comparison (no tolerance) between the pipeline's vector
# at position 0 and the first row of the manually computed CLS embeddings.
np.array_equal(np.asarray(embs[0][0]), cls_embeddings[0])

True