fix(bert): change LayerNorm to layer_norm
mmmwhy committed Jan 28, 2022
1 parent db5965f commit a99831e
Showing 5 changed files with 55 additions and 14 deletions.
36 changes: 28 additions & 8 deletions pure_attention/backbone_bert/README.md
@@ -40,9 +40,19 @@ test_query = "结果一致性验证"

tokenizer = Tokenizer(bert_model_path + "/vocab.txt")
bert = BertModel(bert_model_path)
tokens_ids, segments_ids = tokenizer.encode(test_query, max_len=64)

bert_pooler_output = bert(tokens_ids, token_type_ids=segments_ids).pooler_output
tokenizer_output = tokenizer.encode(test_query, max_len=64)

our_bert_pooler_output = bert(
    input_ids=tokenizer_output.input_ids,
    token_type_ids=tokenizer_output.token_type_ids,
    attention_mask=tokenizer_output.attention_mask).pooler_output

bert_last_hidden_state = bert(
    input_ids=tokenizer_output.input_ids,
    token_type_ids=tokenizer_output.token_type_ids,
    attention_mask=tokenizer_output.attention_mask).last_hidden_state


```

@@ -74,25 +84,35 @@ import torch
from transformers import BertModel
from transformers import BertTokenizer

from pure_attention.common.nlp.tokenization import Tokenizer as LocalTokenizer
from pure_attention.backbone_bert.bert_model import BertModel as OurBertModel


bert_model_path = "/data/pretrain_modal/chinese-roberta-wwm-ext-large"
test_query = "结果一致性验证"

text_tokenizer = BertTokenizer.from_pretrained(bert_model_path, do_lower_case=True)
bert_model = BertModel.from_pretrained(bert_model_path)

tensor_caption = text_tokenizer.encode(test_query, return_tensors="pt", padding='max_length', truncation=True,
tensor_caption = text_tokenizer(test_query, return_tensors="pt", padding='max_length', truncation=True,
                                max_length=64)

origin_bert_pooler_output = bert_model(tensor_caption).pooler_output

origin_bert_pooler_output = bert_model(
input_ids=tensor_caption.input_ids,
attention_mask=tensor_caption.attention_mask,
token_type_ids=tensor_caption.token_type_ids).pooler_output

# Our simplified, refactored implementation
from pure_attention.common.nlp.tokenization import Tokenizer as LocalTokenizer
from pure_attention.backbone_bert.bert_model import BertModel as OurBertModel
tokenizer = LocalTokenizer(bert_model_path + "/vocab.txt")
bert = OurBertModel(bert_model_path)
tokens_ids, segments_ids = tokenizer.encode(test_query, max_len=64)
tokenizer_output = tokenizer.encode(test_query, max_len=64)

our_bert_pooler_output = bert(
    input_ids=tokenizer_output.input_ids,
    token_type_ids=tokenizer_output.token_type_ids,
    attention_mask=tokenizer_output.attention_mask).pooler_output

our_bert_pooler_output = bert(tokens_ids, token_type_ids=segments_ids).pooler_output

print("check result:", torch.cosine_similarity(origin_bert_pooler_output, our_bert_pooler_output))
```
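Since the refactored model now also exposes `last_hidden_state` (see the first snippet above), the same consistency check can be extended beyond the pooled vector. A minimal sketch that reuses the variables from the snippet above, so it is not stand-alone:

```
# Sketch: also compare the full sequence output, token by token.
origin_last_hidden = bert_model(
    input_ids=tensor_caption.input_ids,
    attention_mask=tensor_caption.attention_mask,
    token_type_ids=tensor_caption.token_type_ids).last_hidden_state

our_last_hidden = bert(
    input_ids=tokenizer_output.input_ids,
    token_type_ids=tokenizer_output.token_type_ids,
    attention_mask=tokenizer_output.attention_mask).last_hidden_state

print("check last_hidden_state:",
      torch.cosine_similarity(origin_last_hidden, our_last_hidden, dim=-1).mean())
```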
8 changes: 4 additions & 4 deletions pure_attention/backbone_bert/bert_layer.py
@@ -42,7 +42,7 @@ def __init__(self, config):
self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)

self.LayerNorm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
self.dropout = nn.Dropout(config.hidden_dropout_prob)

def forward(self, input_ids, token_type_ids=None, position_ids=None):
@@ -59,7 +59,7 @@ def forward(self, input_ids, token_type_ids=None, position_ids=None):

# Note: the three embeddings are added element-wise
embeddings = words_embeddings + position_embeddings + token_type_embeddings
embeddings = self.LayerNorm(embeddings)
embeddings = self.layer_norm(embeddings)
embeddings = self.dropout(embeddings)
return embeddings

@@ -204,14 +204,14 @@ def __init__(self, intermediate_size, hidden_size, hidden_dropout_prob, layer_no
"""
super(BertAddNorm, self).__init__()
self.dense = nn.Linear(intermediate_size, hidden_size)
self.LayerNorm = BertLayerNorm(hidden_size, eps=layer_norm_eps)
self.layer_norm = BertLayerNorm(hidden_size, eps=layer_norm_eps)
self.dropout = nn.Dropout(hidden_dropout_prob)

def forward(self, hidden_states, input_tensor):
hidden_states = self.dense(hidden_states)
hidden_states = self.dropout(hidden_states)
# Residual connection, very important
hidden_states = self.LayerNorm(hidden_states + input_tensor)
hidden_states = self.layer_norm(hidden_states + input_tensor)
return hidden_states


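The rename above is not only cosmetic: in PyTorch, a module's attribute names become its `state_dict` keys, so checkpoints saved under the old name no longer match the new attribute. A small stand-alone sketch (not code from this repo) illustrating the effect:

```
import torch.nn as nn

class OldNaming(nn.Module):
    def __init__(self):
        super().__init__()
        self.LayerNorm = nn.LayerNorm(4)   # old attribute name

class NewNaming(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer_norm = nn.LayerNorm(4)  # renamed attribute

print(list(OldNaming().state_dict()))  # ['LayerNorm.weight', 'LayerNorm.bias']
print(list(NewNaming().state_dict()))  # ['layer_norm.weight', 'layer_norm.bias']
```

This is why the loader in bert_model.py (next file) needs the extra 'LayerNorm' to 'layer_norm' mapping.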
4 changes: 4 additions & 0 deletions pure_attention/backbone_bert/bert_model.py
@@ -113,6 +113,10 @@ def from_pretrained(self, pretrained_model_path):
new_key = new_key.replace('beta', 'bias')
if 'bert.' in key:
    new_key = new_key.replace('bert.', '')
# Stay compatible with some less elegant variable naming in the original checkpoints
if 'LayerNorm' in key:
    new_key = new_key.replace('LayerNorm', 'layer_norm')


if new_key:
    old_keys.append(key)
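Only the new mapping is visible in this hunk; the rest of the key-renaming loop is folded above. As a hedged sketch of the usual pattern such loaders follow (the helper name is invented here, and the 'gamma' to 'weight' rule is assumed, since only the mappings shown in this diff are confirmed):

```
# Sketch only: normalize HuggingFace-style checkpoint keys so they match
# the refactored attribute names before calling load_state_dict().
def remap_checkpoint_keys(state_dict):
    for key in list(state_dict.keys()):
        new_key = (key.replace('gamma', 'weight')      # assumed, folded above
                      .replace('beta', 'bias')
                      .replace('bert.', '')
                      .replace('LayerNorm', 'layer_norm'))
        if new_key != key:
            state_dict[new_key] = state_dict.pop(key)
    return state_dict
```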
11 changes: 11 additions & 0 deletions pure_attention/backbone_bert/package.py
@@ -36,3 +36,14 @@ def __init__(self, last_hidden_state, pooler_output, attentions):
self.last_hidden_state = last_hidden_state
self.pooler_output = pooler_output
self.attentions = attentions

class TokenizerOutput:
    input_ids: torch.LongTensor = None
    token_type_ids: torch.LongTensor = None
    attention_mask: torch.LongTensor = None

    def __init__(self, token_ids, segment_ids, attention_mask):
        # The following names are equivalent
        self.input_ids = token_ids
        self.token_type_ids = segment_ids
        self.attention_mask = attention_mask
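For reference, a tiny usage sketch of the new container. The tensors are dummies, and this assumes package.py imports torch at the top, which the diff does not show:

```
import torch
from pure_attention.backbone_bert.package import TokenizerOutput

# Dummy tensors standing in for real tokenizer output.
out = TokenizerOutput(
    token_ids=torch.tensor([[101, 2769, 102, 0]]),
    segment_ids=torch.zeros(1, 4, dtype=torch.long),
    attention_mask=torch.tensor([[1, 1, 1, 0]]))

print(out.input_ids.shape)            # torch.Size([1, 4])
print(int(out.attention_mask.sum()))  # 3 real tokens, 1 padded position
```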
10 changes: 8 additions & 2 deletions pure_attention/common/nlp/tokenization.py
@@ -13,6 +13,7 @@
import numpy as np
import torch

from pure_attention.backbone_bert.package import TokenizerOutput
from pure_attention.common.logger import init_logger

logger = init_logger(__name__)
@@ -189,21 +190,26 @@ def encode(
second_segment_ids = [1] * len(second_token_ids)
first_token_ids.extend(second_token_ids)
first_segment_ids.extend(second_segment_ids)


attention_mask = [1] * len(first_token_ids)

# Apply the padding operation
if is_padding:
    while len(first_token_ids) < max_len:
        first_token_ids.append(self.vocab[self.pad_token])
        first_segment_ids.append(self.vocab[self.pad_token])
        attention_mask.append(0)

if max_len and len(first_token_ids) > max_len:
    first_token_ids = first_token_ids[:max_len]
    first_segment_ids = first_segment_ids[:max_len]
    attention_mask = attention_mask[:max_len]

first_token_ids = torch.tensor([first_token_ids])
first_segment_ids = torch.tensor([first_segment_ids])
attention_mask = torch.tensor([attention_mask])

return first_token_ids, first_segment_ids
return TokenizerOutput(first_token_ids, first_segment_ids, attention_mask)


class BasicTokenizer(object):
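To make the new padding and attention-mask behaviour concrete, here is a stand-alone re-implementation of just that logic. It does not import the repo, and the pad id of 0 is an assumption that matches standard BERT vocabularies where [PAD] has id 0:

```
import torch

def pad_and_mask(token_ids, segment_ids, max_len, pad_id=0):
    # Real tokens get mask 1, padded positions get mask 0.
    attention_mask = [1] * len(token_ids)
    while len(token_ids) < max_len:
        token_ids.append(pad_id)
        segment_ids.append(pad_id)
        attention_mask.append(0)
    # Truncate if the input is longer than max_len.
    token_ids, segment_ids, attention_mask = (
        token_ids[:max_len], segment_ids[:max_len], attention_mask[:max_len])
    return (torch.tensor([token_ids]),
            torch.tensor([segment_ids]),
            torch.tensor([attention_mask]))

ids, segments, mask = pad_and_mask([101, 2769, 102], [0, 0, 0], max_len=8)
print(mask)  # tensor([[1, 1, 1, 0, 0, 0, 0, 0]])
```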
