# 0. Initialize model and tokenizer

In [1]:
import torch 
from torch import nn
from transformers import BertTokenizer, BertModel

# A single checkpoint identifier is shared by the tokenizer and the model
# so that the vocabulary and the weights always come from the same source.
model_name = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(model_name)

# output_hidden_states=True asks the forward pass to also return the
# per-layer hidden states in addition to the final encoder output.
model = BertModel.from_pretrained(
    model_name,
    output_hidden_states=True,
)

# 1. Input

- tokenizer: segmentation -> split the sentence into tokens and return `input_ids`, `token_type_ids`, and `attention_mask` (each token is mapped to one integer id)
- embedding: each token id (a single integer) -> a 768-dimensional vector

In [2]:
# Example sentence in which the word "bank" appears three times with
# different meanings; adjacent string literals are joined by the parser.
text = (
    "After stealing money from the bank vault, the bank robber was seen "
    "fishing on the Mississippi river bank."
)

# return_tensors='pt' requests PyTorch tensors from the tokenizer.
token_input = tokenizer(text, return_tensors='pt')
token_input

{'input_ids': tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,
          2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,
          1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [3]:
# Show the token ids together with their shape; the tokenizer's result
# exposes its entries as attributes as well as by key.
token_input.input_ids, token_input.input_ids.shape

(tensor([[  101,  2044, 11065,  2769,  2013,  1996,  2924, 11632,  1010,  1996,
           2924, 27307,  2001,  2464,  5645,  2006,  1996,  5900,  2314,  2924,
           1012,   102]]),
 torch.Size([1, 22]))

-> batch_size = 1 (a single sentence); the sequence length is 22, including the special [CLS] (101) and [SEP] (102) tokens; no truncation or padding was applied

# 2. Model forward
- forward
    - embedding -> encoder -> pooler

In [22]:
# Put the model in evaluation mode before running the forward pass.
model.eval()

# No gradients are needed for a pure forward pass, so skip autograd tracking.
with torch.no_grad():
    # Pass the three tokenizer fields explicitly rather than unpacking the dict.
    output = model(
        input_ids=token_input['input_ids'],
        token_type_ids=token_input['token_type_ids'],
        attention_mask=token_input['attention_mask'],
    )

output.keys()

odict_keys(['last_hidden_state', 'pooler_output', 'hidden_states'])

# 3. Output

- len(outputs) == 3
- outputs[0] -> **encoder layer output**
    - last_hidden_state, shape:
        - batch_size \* seq_len \* hidden_size (1 \* 22 \* 768)
- outputs[1] -> **pooler layer output**
    - pooler_output, shape:
        - batch_size \* hidden_size (1 \* 768)
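    - Last-layer hidden state of the first token of the sequence (the classification token, [CLS]) after it has been passed through the pooler's linear layer and tanh activation, so it is not identical to outputs[0][:, 0]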
- outputs[2] (model.config.output_hidden_states = True)
    - type: tuple
    - one for the output of the embeddings (1) + one for the output of each of the 12 encoder layers (12)
    - (1+12) \* (batch_size \* seq_len \* hidden_size) = 13 \* 1 \* 22 \* 768
- Relationships between the outputs (with the model in eval mode):
- outputs[0] == outputs[2][-1]
- outputs[1] == model.pooler(outputs[2][-1])
- outputs[2][0] == model.embeddings(token_input['input_ids'], token_input['token_type_ids'])

In [5]:
# Number of returned fields, then the leading (batch) dimension of the
# encoder output and of the pooled output, addressed by field name.
len(output), len(output.last_hidden_state), len(output.pooler_output)

(3, 1, 1)