# Model


In [2]:
from transformers import BertModel

# Load the pretrained BERT checkpoint and save it to a local directory.
# NOTE(review): the directory is named "bert-base-cased" but the checkpoint
# loaded is "bert-base-uncased" — confirm which one is intended before anything
# is loaded back from this path.
save_dir = "./models/bert-base-cased/"
model = BertModel.from_pretrained("bert-base-uncased")
model.save_pretrained(save_dir)

# Tokenizer


In [4]:
from transformers import AutoTokenizer

# Load the tokenizer for the checkpoint and save its files to a local directory.
# NOTE(review): the directory is named "bert-base-cased" but the checkpoint
# loaded is "bert-base-uncased" — confirm which one is intended.
save_dir = "./models/bert-base-cased/"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Kept as the last bare expression so the notebook displays the tuple of
# written file paths that save_pretrained() returns.
tokenizer.save_pretrained(save_dir)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

('./models/bert-base-cased/tokenizer_config.json',
 './models/bert-base-cased/special_tokens_map.json',
 './models/bert-base-cased/vocab.txt',
 './models/bert-base-cased/added_tokens.json',
 './models/bert-base-cased/tokenizer.json')

In [11]:
# Calling the tokenizer directly returns a mapping with input_ids,
# token_type_ids and attention_mask for the text.
text = "Using a Transformer network is simple"
tokenized_text = tokenizer(text)
print(tokenized_text)

{'input_ids': [101, 2478, 1037, 10938, 2121, 2897, 2003, 3722, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [6]:
# tokenize() only splits the text into (sub-word) token strings; it does not
# produce ids.
text = "Using a transformer network is simple"
tokens = tokenizer.tokenize(text)
print(tokens)

['using', 'a', 'transform', '##er', 'network', 'is', 'simple']


In [8]:
# Map the token strings produced by the tokenize() cell to their vocabulary ids.
# Relies on `tokens` still holding that cell's result.
ids = tokenizer.convert_tokens_to_ids(tokens)
print(ids)

[2478, 1037, 10938, 2121, 2897, 2003, 3722]


In [10]:
# encode() goes from raw text straight to ids. Compared with the ids from
# convert_tokens_to_ids(), the printed list has an extra 101 at the front and
# 102 at the end.
text = "Using a transformer network is simple"
sequence = tokenizer.encode(text)
print(sequence)

[101, 2478, 1037, 10938, 2121, 2897, 2003, 3722, 102]


In [13]:
# decode() turns ids back into text; the leading 101 and trailing 102 are
# rendered as [CLS] and [SEP] in the printed string.
encoded_ids = [101, 2478, 1037, 10938, 2121, 2897, 2003, 3722, 102]
original_sequence = tokenizer.decode(encoded_ids)
print(original_sequence)

[CLS] using a transformer network is simple [SEP]


# 处理多段文本


In [17]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Manual pipeline: text -> token strings -> ids -> tensor -> model.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

text = "I've been waiting for a HuggingFace course my whole life."
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)
# NOTE: the ids printed here have no 101/102 markers at the ends, unlike the
# ids obtained by calling tokenizer(...) directly.
print(ids)

# Add a leading dimension so the tensor is 2-D (one row per sequence).
input_ids = torch.tensor(ids).unsqueeze(0)
print(input_ids)

output = model(input_ids)
print(output.logits)

[1045, 1005, 2310, 2042, 3403, 2005, 1037, 17662, 12172, 2607, 2026, 2878, 2166, 1012]
tensor([[ 1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,  2607,
          2026,  2878,  2166,  1012]])
tensor([[-2.7276,  2.8789]], grad_fn=<AddmmBackward0>)


In [19]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# One-step pipeline: let the tokenizer build the model inputs as PyTorch tensors.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

text = "I've been waiting for a HuggingFace course my whole life."
inputs = tokenizer(text, return_tensors="pt")
print(inputs)

# Unpack the mapping so each entry is passed to the model as a keyword argument.
outputs = model(**inputs)
print(outputs.logits)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}
tensor([[-1.5607,  1.6123]], grad_fn=<AddmmBackward0>)


# padding - 使得每个 batch 中的 sample 具有相同的长度


In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Batch two sentences of different length. With padding=True the shorter row
# is filled with 0 ids and its attention_mask is 0 at those positions.
checkpoint = "distilbert-base-uncased-finetuned-sst-2-english"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

sequences = [
    "I've been waiting for a HuggingFace course my whole life.",
    "So have I!",
]
batch_options = dict(padding=True, truncation=True, return_tensors="pt")
tokens = tokenizer(sequences, **batch_options)
print(tokens)

output = model(**tokens)
print(output.logits)

{'input_ids': tensor([[  101,  1045,  1005,  2310,  2042,  3403,  2005,  1037, 17662, 12172,
          2607,  2026,  2878,  2166,  1012,   102],
        [  101,  2061,  2031,  1045,   999,   102,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
tensor([[-1.5607,  1.6123],
        [-3.6183,  3.9137]], grad_fn=<AddmmBackward0>)


## 编码句子对


In [25]:
from transformers import AutoTokenizer

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

sentence1_list = [
    "First sentence.",
    "This is the second sentence.",
    "Third one.",
]
sentence2_list = [
    "First sentence is short.",
    "The second sentence is very very very long.",
    "ok.",
]

# Sentences at the same index in sentence1_list and sentence2_list are encoded
# together as one sentence pair. In the printed token_type_ids the positions of
# the first sentence are 0 and those of the second sentence are 1.
pair_options = dict(padding=True, truncation=True, return_tensors="pt")
tokens = tokenizer(sentence1_list, sentence2_list, **pair_options)
print(tokens)
print(tokens["input_ids"].shape)

{'input_ids': tensor([[ 101, 2034, 6251, 1012,  102, 2034, 6251, 2003, 2460, 1012,  102,    0,
            0,    0,    0,    0,    0,    0],
        [ 101, 2023, 2003, 1996, 2117, 6251, 1012,  102, 1996, 2117, 6251, 2003,
         2200, 2200, 2200, 2146, 1012,  102],
        [ 101, 2353, 2028, 1012,  102, 7929, 1012,  102,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}
torch.Size([3, 18])


### 添加新 token


In [32]:
from transformers import AutoTokenizer, AutoModel

checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Register two entity-marker tokens with the tokenizer. In the printed result
# each marker stays a single token instead of being split into pieces.
entity_markers = ["[ENT_START]", "[ENT_END]"]
tokenizer.add_tokens(entity_markers, special_tokens=True)

sentence = "Two [ENT_START] cars [ENT_END] collided in a [ENT_START] tunnel [ENT_END] this morning."
sentence_tokens = tokenizer.tokenize(sentence)
print(sentence_tokens)

['two', '[ENT_START]', 'cars', '[ENT_END]', 'collided', 'in', 'a', '[ENT_START]', 'tunnel', '[ENT_END]', 'this', 'morning', '.']


In [34]:
model = AutoModel.from_pretrained(checkpoint)

# Register the entity-marker tokens BEFORE resizing, so the embedding matrix is
# guaranteed to get one row per marker. Repeating add_tokens() for tokens that
# are already registered is harmless.
new_tokens = ["[ENT_START]", "[ENT_END]"]
tokenizer.add_tokens(new_tokens, special_tokens=True)

# Grow the embedding matrix to match the enlarged vocabulary.
model.resize_token_embeddings(len(tokenizer))

# Do NOT use the return value of add_tokens() as the row count: it reports only
# the tokens that were newly added, so it is 0 when an earlier cell has already
# registered the markers, and the initialisation below would silently be skipped.
num_added_tokens = len(new_tokens)

# Embedding initialisation: start the new tokens from an existing token's vector.
token_id = tokenizer.convert_tokens_to_ids("entity")
token_embedding = model.embeddings.word_embeddings.weight[token_id]
print(token_embedding)

with torch.no_grad():
    # Assumes the newly added tokens occupy the last rows of the matrix (they
    # are appended to the vocabulary). One broadcast assignment fills all of
    # them. detach().clone() copies the source row without autograd history;
    # calling requires_grad_ on this temporary would have no effect, because
    # only its values are written into the weight matrix.
    model.embeddings.word_embeddings.weight[-num_added_tokens:, :] = (
        token_embedding.detach().clone()
    )

print(model.embeddings.word_embeddings.weight[-2:, :])

tensor([-3.8909e-03, -1.3111e-02, -9.4551e-02, -5.1809e-02,  3.2491e-03,
        -1.2508e-02, -9.8832e-03,  7.4605e-03, -1.5419e-02, -7.5015e-02,
        -1.1553e-02, -6.8970e-02, -5.3246e-02,  3.1283e-02, -1.7483e-02,
        -1.2914e-02, -3.1466e-02,  7.9941e-05,  1.4641e-02,  1.9323e-02,
        -2.6789e-02, -6.0619e-02, -8.2200e-03, -8.3062e-02, -8.5552e-02,
        -1.1586e-02,  6.1244e-03,  1.9840e-02, -1.4748e-02, -2.7729e-02,
        -2.0458e-02, -6.0993e-03,  6.6157e-03, -4.9288e-03, -7.0220e-02,
        -2.7434e-02, -8.8032e-03, -6.3211e-02, -7.4845e-02,  4.3453e-02,
         8.4693e-03, -9.0122e-03, -1.1829e-01, -8.4419e-02,  1.2732e-02,
        -1.2657e-02, -4.7312e-03, -1.8933e-02, -4.4400e-02, -5.8645e-02,
        -9.9826e-03, -1.3900e-02, -5.9908e-02,  3.9422e-03, -5.3567e-02,
        -8.7809e-03,  2.7742e-02, -6.1284e-03, -5.7271e-02, -3.9378e-02,
        -4.6032e-02, -4.0745e-02, -1.9952e-02,  4.0362e-03, -8.6448e-02,
        -7.3612e-02, -2.1270e-02,  9.6670e-03, -1.1

In [40]:
# Initialise each new token's embedding as the mean of the embeddings of the
# words in its textual description.
description = ["start of entity", "end of entity"]

with torch.no_grad():
    embedding_matrix = model.embeddings.word_embeddings.weight
    # Walk the descriptions back to front; start=1 makes i count from 1, so the
    # last description is written to row -1 and the one before it to row -2.
    for i, token in enumerate(description[::-1], start=1):
        tokenized = tokenizer.tokenize(token)
        print(tokenized)
        tokenized_ids = tokenizer.convert_tokens_to_ids(tokenized)
        new_embedding = embedding_matrix[tokenized_ids].mean(dim=0)
        embedding_matrix[-i, :] = new_embedding.clone().detach().requires_grad_(True)
print(model.embeddings.word_embeddings.weight[-2:, :])

['end', 'of', 'entity']
['start', 'of', 'entity']
tensor([[-0.0340, -0.0144, -0.0441,  ..., -0.0016,  0.0318, -0.0151],
        [-0.0060, -0.0202, -0.0312,  ..., -0.0084,  0.0193, -0.0296]],
       grad_fn=<SliceBackward0>)
