In [1]:
from transformers import BertModel, BertTokenizer

In [16]:
import torch

In [2]:
# Load the vanilla pretrained BERT base model together with its matching tokenizer.
# Both are created from the same checkpoint name so the model weights and the
# tokenizer vocabulary cannot drift apart when the checkpoint is changed.
MODEL_NAME = "bert-base-uncased"

model = BertModel.from_pretrained(MODEL_NAME)
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [4]:
# The embedding layer contains three lookup tables (see the module printout below):
#   word embedding       : context-free embedding of each token id
#   position embedding   : embedding of each token's position
#   token type embedding : id 0 or 1, used to look up the segment embedding
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [5]:
# Sample sentence reused by the cells below. The misspellings ("todays", "classs")
# are part of the input: the recorded outputs (14 token ids) come from this exact string.
text = "welcome to todays classs, hope you guys enjoyed it"

In [6]:
# Encode the sentence into token ids. return_tensors="pt" makes the tokenizer
# return a PyTorch tensor (2-D, one row per input sentence) instead of a list.
token_ids = tokenizer.encode(text, return_tensors="pt")
token_ids

tensor([[ 101, 6160, 2000, 2651, 2015, 2465, 2015, 1010, 3246, 2017, 4364, 5632,
         2009,  102]])

In [9]:
# Understanding the word embedding: each token id is mapped to its own vector
# by a plain table lookup.
token_ids = tokenizer.encode(text, return_tensors="pt")
model.embeddings.word_embeddings(token_ids)

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [ 0.0208, -0.0638,  0.0090,  ..., -0.0462, -0.0553, -0.0105],
         [ 0.0131,  0.0082, -0.0087,  ...,  0.0159, -0.0078,  0.0182],
         ...,
         [-0.0526, -0.0520,  0.0160,  ..., -0.0533, -0.0505,  0.0284],
         [-0.0449, -0.0279, -0.0088,  ...,  0.0133,  0.0185,  0.0093],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [10]:
# One 768-dim vector per token: 1 sentence x 14 tokens x 768 features.
token_ids = tokenizer.encode(text, return_tensors="pt")
model.embeddings.word_embeddings(token_ids).size()

torch.Size([1, 14, 768])

In [11]:
# The word embedding is context free, so the special tokens get the same vector
# for every input:
#   first row = [CLS]: [ 0.0136, -0.0265, -0.0235, ...,  0.0087,  0.0071,  0.0151]
#   last row  = [SEP]: [-0.0145, -0.0100,  0.0060, ..., -0.0250,  0.0046, -0.0015]
# Both are identical to the first and last rows printed for `text` above.
hello_ids = tokenizer.encode("hello welcome", return_tensors="pt")
model.embeddings.word_embeddings(hello_ids)

tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0043, -0.0330, -0.0217,  ..., -0.0425, -0.0127, -0.0389],
         [ 0.0208, -0.0638,  0.0090,  ..., -0.0462, -0.0553, -0.0105],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [12]:

# [CLS] + "hello" + "welcome" + [SEP] -> 4 token vectors of size 768.
hello_ids = tokenizer.encode("hello welcome", return_tensors="pt")
model.embeddings.word_embeddings(hello_ids).shape

torch.Size([1, 4, 768])

In [13]:
# POSITION EMBEDDING
# The default BERT base model accepts at most 512 tokens (not words) per input
# sequence: the position table has 512 rows, one per position.

In [14]:
# Inspect the position lookup table: 512 positions, each mapped to a 768-dim vector.
model.embeddings.position_embeddings

Embedding(512, 768)

In [35]:

# Position ids 0..13, one per token of `text` (which encodes to 14 token ids).
torch.arange(14)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13])

In [34]:
# Look up the position vector for each of the 14 positions (0..13).
position_ids = torch.arange(14)
model.embeddings.position_embeddings(position_ids)

tensor([[ 1.7505e-02, -2.5631e-02, -3.6642e-02,  ...,  3.3437e-05,
          6.8312e-04,  1.5441e-02],
        [ 7.7580e-03,  2.2613e-03, -1.9444e-02,  ...,  2.8910e-02,
          2.9753e-02, -5.3247e-03],
        [-1.1287e-02, -1.9644e-03, -1.1573e-02,  ...,  1.4908e-02,
          1.8741e-02, -7.3140e-03],
        ...,
        [-1.1239e-02,  4.1275e-03, -1.6536e-02,  ...,  1.8726e-02,
          1.4024e-02,  8.7044e-03],
        [-1.5178e-02,  5.2001e-03, -2.7250e-03,  ...,  1.6257e-02,
          1.9132e-04,  9.7079e-03],
        [-1.8272e-02,  4.4238e-03, -1.3151e-03,  ...,  1.4878e-02,
          7.0050e-03,  1.0528e-02]], grad_fn=<EmbeddingBackward0>)

In [19]:
# The number of rows returned equals the number of position ids requested (here 6).
first_positions = torch.arange(6)
model.embeddings.position_embeddings(first_positions).shape

torch.Size([6, 768])

In [20]:
#TOKEN TYPE EMBEDDING

In [21]:
# Inspect the token-type (segment) lookup table: 2 rows (ids 0 and 1), 768 features each.
model.embeddings.token_type_embeddings

Embedding(2, 768)

In [33]:

# Token-type ids for the 14 tokens of `text`: every token is assigned id 0.
torch.zeros(14, dtype=torch.long)

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [32]:
# All 14 ids are 0, so the same table row is returned 14 times
# (every row of the output is identical).
segment_ids = torch.zeros(14, dtype=torch.long)
model.embeddings.token_type_embeddings(segment_ids)

tensor([[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        ...,
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086]],
       grad_fn=<EmbeddingBackward0>)

In [31]:
# One 768-dim segment vector per token id: 14 x 768.
segment_ids = torch.zeros(14, dtype=torch.long)
model.embeddings.token_type_embeddings(segment_ids).size()

torch.Size([14, 768])

In [25]:
# Apply everything together: sum the three embeddings, then apply LayerNorm

In [30]:
# Rebuild the embedding output by hand: sum the word, position and token-type
# embeddings, then normalise the sum with LayerNorm.
# The sequence length is read from the encoded input instead of being hard-coded
# to 14, so this cell keeps working when `text` is changed.
# Dropout (the last sub-module of BertEmbeddings) is left out on purpose; it is a
# no-op in eval mode, which is why the result matches model.embeddings(...) below.
input_ids = tokenizer.encode(text, return_tensors="pt")  # shape: [1, seq_len]
seq_len = input_ids.shape[1]
position_ids = torch.arange(seq_len)  # 0 .. seq_len - 1
token_type_ids = torch.zeros(seq_len, dtype=torch.long)  # every token gets type id 0

# The [seq_len, 768] position / token-type tensors broadcast over the batch dimension.
model.embeddings.LayerNorm(
    model.embeddings.word_embeddings(input_ids)
    + model.embeddings.position_embeddings(position_ids)
    + model.embeddings.token_type_embeddings(token_type_ids)
)

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [ 0.7885, -0.4432,  0.2550,  ..., -0.0480, -0.0742, -0.0010],
         [ 0.2024,  0.4469, -0.0687,  ...,  0.4889,  0.3223,  0.2605],
         ...,
         [-0.7861, -0.2950,  0.4169,  ..., -0.3405, -0.2996,  0.9149],
         [-0.8188,  0.0307,  0.1364,  ...,  0.5349,  0.5190,  0.4778],
         [-0.6056,  0.0968,  0.1880,  ..., -0.2773,  0.1849,  0.0580]]],
       grad_fn=<NativeLayerNormBackward0>)

In [36]:
# All of the steps above are done by a single call to the embeddings module:
# model.embeddings computes word + position + token-type embeddings and applies
# LayerNorm. The output is identical to the manual computation above.
encoded = tokenizer.encode(text, return_tensors="pt")
model.embeddings(encoded)

tensor([[[ 0.1686, -0.2858, -0.3261,  ..., -0.0276,  0.0383,  0.1640],
         [ 0.7885, -0.4432,  0.2550,  ..., -0.0480, -0.0742, -0.0010],
         [ 0.2024,  0.4469, -0.0687,  ...,  0.4889,  0.3223,  0.2605],
         ...,
         [-0.7861, -0.2950,  0.4169,  ..., -0.3405, -0.2996,  0.9149],
         [-0.8188,  0.0307,  0.1364,  ...,  0.5349,  0.5190,  0.4778],
         [-0.6056,  0.0968,  0.1880,  ..., -0.2773,  0.1849,  0.0580]]],
       grad_fn=<NativeLayerNormBackward0>)