In [1]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pre-trained tokenizer (and its vocabulary) for the chosen checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 8.51kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.61MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 3.38MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 305kB/s]


In [3]:
# Sample sentence used to see how WordPiece tokenization proceeds.
example_text = "These days word embeddings are important."

In [4]:
# Wrap the sentence in BERT's special markers; the pieces are joined with no
# separating whitespace, exactly as a plain string concatenation would do.
input_text = "".join(["[CLS]", example_text, "[SEP]"])

In [6]:
# Run tokenization on the marked-up sentence, then show the resulting tokens.
tokenized_text = tokenizer.tokenize(input_text)
print(tokenized_text)

['[CLS]', 'these', 'days', 'word', 'em', '##bed', '##ding', '##s', 'are', 'important', '.', '[SEP]']


In [7]:
# Look up the vocabulary ID of every token and list them side by side.
indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)

for token_and_id in zip(tokenized_text, indexed_tokens):
    # Each pair is (token, id); unpack it straight into the format slots.
    print('Token: {0},  Index: {1}'.format(*token_and_id))

Token: [CLS],  Index: 101
Token: these,  Index: 2122
Token: days,  Index: 2420
Token: word,  Index: 2773
Token: em,  Index: 7861
Token: ##bed,  Index: 8270
Token: ##ding,  Index: 4667
Token: ##s,  Index: 2015
Token: are,  Index: 2024
Token: important,  Index: 2590
Token: .,  Index: 1012
Token: [SEP],  Index: 102


In [8]:
# Peek into the tokenizer's vocabulary: iterating the vocab mapping yields its
# keys (the tokens), so materialise them and show entries 2000-2009.
vocab_tokens = list(tokenizer.vocab)
print(vocab_tokens[2000:2010])

['to', 'was', 'he', 'is', 'as', 'for', 'on', 'with', 'that', 'it']


In [9]:
# Number of entries in the tokenizer's vocabulary.
len(tokenizer.vocab)

30522

In [10]:
# The 1st and 2nd "bank" refer to a financial institution; the 3rd "bank"
# refers to a riverbank.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

In [13]:
# Calling the tokenizer directly does the encoding work automatically.
inputs = tokenizer(
    text,
    return_tensors="np",   # return the result as NumPy arrays
    max_length=50,         # set the maximum token length to 50
    padding='max_length',  # pad up to the maximum length of 50
    truncation=True,       # cut the sequence off at the maximum length of 50
)

In [12]:
inputs
# input_ids : the vocabulary ID of each token
# token_type_ids : segment ID information (sentence embedding <- whether a token belongs to the first or the second sentence)
# attention_mask : attention mask information (1 for original tokens, 0 for padding)

{'input_ids': array([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,
         1996,  2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,
         2314,  2924,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]]), 'token_type_ids': array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]]), 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]])}

In [14]:
# Show only the token-ID array of the encoded sentence.
inputs['input_ids']

array([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,
         1996,  2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,
         2314,  2924,  1012,   102,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0]])

In [15]:
# Load pre-trained model (weights).
# output_hidden_states=True asks the model to return all hidden states
# (every hidden state plus the embedding vector), not just the final one.
model = TFBertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)

Downloading model.safetensors: 100%|██████████| 440M/440M [02:38<00:00, 2.77MB/s] 
Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model 

In [16]:
# Forward pass: feed the encoded sentence (input_ids, token_type_ids,
# attention_mask) to the BERT model and keep the returned output object.
outputs = model(inputs)

In [17]:
# List the names of the fields available on the model output.
outputs.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

In [18]:
# Display the whole model output object.
# NOTE(review): this dumps very large tensors into the notebook; inspecting
# individual fields or their shapes is usually easier to read.
outputs

TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(1, 50, 768), dtype=float32, numpy=
array([[[-0.49644378, -0.18308322, -0.5231451 , ..., -0.19021133,
          0.37379816,  0.39644393],
        [-0.13227084, -0.27622476, -0.3495361 , ..., -0.45665827,
          0.37864694, -0.1096139 ],
        [-0.36261445, -0.4001648 ,  0.06757425, ..., -0.3207096 ,
         -0.27089828, -0.30042604],
        ...,
        [-0.39838782, -0.5654812 , -0.06805929, ...,  0.34025484,
          0.17032674, -0.2169472 ],
        [ 0.3990308 , -0.21066247,  0.14042513, ..., -0.08418038,
         -0.13027407, -0.12785563],
        [ 0.5375854 , -0.13355498,  0.20943257, ..., -0.1283941 ,
         -0.08842918, -0.0809501 ]]], dtype=float32)>, pooler_output=<tf.Tensor: shape=(1, 768), dtype=float32, numpy=
array([[-0.60308236, -0.3342156 , -0.717412  ,  0.3347205 ,  0.51445705,
        -0.17216602,  0.4502394 ,  0.27680653, -0.37692517, -0.99984133,
        -0.36572412,  0.753

In [19]:
# Shape of the final layer's output: one sentence, padded to 50 token
# positions (max_length=50), with one vector per token position.
outputs.last_hidden_state.shape

TensorShape([1, 50, 768])

In [20]:
# Keep a handle on the final layer's output for the batch.
last_hidden_state = outputs.last_hidden_state
# Vector of the first token of the first sentence -> the [CLS] token
# (the vector that carries the overall meaning of the document).
last_hidden_state[0, 0]

<tf.Tensor: shape=(768,), dtype=float32, numpy=
array([-4.96443778e-01, -1.83083221e-01, -5.23145080e-01,  5.25868118e-01,
        5.07566869e-01,  1.63770512e-01,  2.03401342e-01,  3.17121327e-01,
       -5.31791151e-02, -1.73957482e-01,  1.56624258e-01, -2.91166931e-01,
       -4.70112771e-01,  6.43651426e-01,  1.01685435e-01,  4.04034108e-02,
       -2.23800600e-01,  4.66818213e-01,  7.84238100e-01, -2.29058057e-01,
       -1.18187830e-01, -1.04612708e-01,  2.05792338e-01,  1.56229943e-01,
       -3.35296616e-02, -1.64334744e-01, -3.00018936e-01, -9.57757086e-02,
       -9.04607400e-02,  3.83675277e-01,  5.08687869e-02,  5.76399416e-02,
       -1.03359938e-01, -8.35442364e-01,  1.45373300e-01, -3.95008206e-01,
        4.80394438e-02, -1.43011436e-01,  4.68777120e-02,  3.04917872e-01,
       -3.79328519e-01,  8.92913416e-02, -2.46660128e-01,  4.72978391e-02,
        2.10815817e-01, -6.77736163e-01, -3.22645378e+00, -7.80532211e-02,
       -2.20757276e-01, -2.99442798e-01,  8.09335709

In [21]:
outputs.pooler_output.shape
# The result of a fully connected layer that takes, for each document, the
# hidden-state vector of the [CLS] token produced by the last encoder block
# and returns a final 768-dimensional vector.

TensorShape([1, 768])

In [22]:
# All per-layer hidden states, available because the model was loaded with
# output_hidden_states=True.
hidden_states = outputs.hidden_states
len(hidden_states) # number of encoder layers + 1 embedding layer

13

In [23]:
# Locate every occurrence of the word piece "bank" in the encoded sentence
# instead of hard-coding token positions, so the lookup stays correct if
# `text`, the tokenizer, or the encoding options change.
sentence_tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0].tolist())
bank_positions = [position for position, piece in enumerate(sentence_tokens) if piece == 'bank']
if len(bank_positions) != 3:
    # Fail loudly rather than silently comparing vectors of the wrong tokens.
    raise ValueError(
        f"expected 3 'bank' tokens, found {len(bank_positions)} at {bank_positions}"
    )

# Last-layer contextual vectors of each "bank" in the first (only) sentence.
bank1_vector = outputs.last_hidden_state[0][bank_positions[0]] # 1st bank (financial sense)
bank2_vector = outputs.last_hidden_state[0][bank_positions[1]] # 2nd bank (financial sense)
bank3_vector = outputs.last_hidden_state[0][bank_positions[2]] # 3rd bank (riverbank sense)

In [24]:
# Sanity check: a single token's vector is one-dimensional.
bank1_vector.shape

TensorShape([768])

In [25]:
import numpy as np

In [26]:
# Cosine similarity between the 1st and the 2nd "bank":
# dot product divided by the product of the two vector norms.
bank12_dot = np.dot(bank1_vector, bank2_vector)
bank12_norms = np.linalg.norm(bank1_vector) * np.linalg.norm(bank2_vector)
bank12_dot / bank12_norms

0.952733

In [27]:
# Cosine similarity between the 1st and the 3rd "bank":
# dot product divided by the product of the two vector norms.
bank13_dot = np.dot(bank1_vector, bank3_vector)
bank13_norms = np.linalg.norm(bank1_vector) * np.linalg.norm(bank3_vector)
bank13_dot / bank13_norms

0.6988578

In [28]:
# Cosine similarity between the 2nd and the 3rd "bank":
# dot product divided by the product of the two vector norms.
bank32_dot = np.dot(bank3_vector, bank2_vector)
bank32_norms = np.linalg.norm(bank3_vector) * np.linalg.norm(bank2_vector)
bank32_dot / bank32_norms

0.69788176