<a href="https://colab.research.google.com/github/nirmal-ank/Transformers/blob/main/NLU_with_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
model = BertModel.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [None]:
named_params = list(model.named_parameters())

In [None]:
print("The BERT model has {:} different named parameters.\n".format(len(named_params)))

The BERT model has 199 different named parameters.



In [None]:
for p in named_params[0:5]:
  print("{:<55} {:>12}".format(p[0],str(tuple(p[1].size()))))

embeddings.word_embeddings.weight                       (30522, 768)
embeddings.position_embeddings.weight                     (512, 768)
embeddings.token_type_embeddings.weight                     (2, 768)
embeddings.LayerNorm.weight                                   (768,)
embeddings.LayerNorm.bias                                     (768,)


In [None]:
for p in named_params[5:21]:
  print("{:<55} {:>12}".format(p[0],str(tuple(p[1].size()))))

encoder.layer.0.attention.self.query.weight               (768, 768)
encoder.layer.0.attention.self.query.bias                     (768,)
encoder.layer.0.attention.self.key.weight                 (768, 768)
encoder.layer.0.attention.self.key.bias                       (768,)
encoder.layer.0.attention.self.value.weight               (768, 768)
encoder.layer.0.attention.self.value.bias                     (768,)
encoder.layer.0.attention.output.dense.weight             (768, 768)
encoder.layer.0.attention.output.dense.bias                   (768,)
encoder.layer.0.attention.output.LayerNorm.weight             (768,)
encoder.layer.0.attention.output.LayerNorm.bias               (768,)
encoder.layer.0.intermediate.dense.weight                (3072, 768)
encoder.layer.0.intermediate.dense.bias                      (3072,)
encoder.layer.0.output.dense.weight                      (768, 3072)
encoder.layer.0.output.dense.bias                             (768,)
encoder.layer.0.output.LayerNorm.w

In [None]:
for p in named_params[-2:]:
  print("{:<55} {:>12}".format(p[0],str(tuple(p[1].size()))))

pooler.dense.weight                                       (768, 768)
pooler.dense.bias                                             (768,)


In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
tokens= tokenizer.encode("Rajesh loves to study about transformers.")

In [None]:
response = model(torch.tensor(tokenizer.encode(tokens)).unsqueeze(0))

In [None]:
response.last_hidden_state

tensor([[[-0.1479, -0.2184, -0.2849,  ..., -0.3093,  1.0494,  0.2841],
         [-0.0955, -0.1527, -0.2696,  ..., -0.3192,  1.0766,  0.2047],
         [ 1.2062, -0.8747, -0.8963,  ...,  0.2876,  0.5944, -0.1765],
         ...,
         [ 0.2162, -0.9268, -0.1171,  ...,  0.0396,  0.9059, -0.8442],
         [ 0.8353, -0.0064,  0.0195,  ...,  0.1884, -0.2348, -0.5020],
         [ 0.8353, -0.0063,  0.0201,  ...,  0.1887, -0.2346, -0.5024]]],
       grad_fn=<NativeLayerNormBackward0>)

In [None]:
response.pooler_output.shape

torch.Size([1, 768])

In [None]:
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [None]:
CLS_embedding = response.last_hidden_state[:,0,:].unsqueeze(0)

In [None]:
CLS_embedding.shape

torch.Size([1, 1, 768])

In [None]:
model.pooler(CLS_embedding).shape

torch.Size([1, 768])

In [None]:
(model.pooler(CLS_embedding) == response.pooler_output).all()

tensor(True)

In [None]:
total_params = 0
for p in model.parameters():
  if len(p.shape) ==2:
    total_params += p.shape[0] * p.shape[1]
print(f"Total parameters = {total_params}")

Total parameters = 109360128


## WordPiece tokenization

In [None]:
print(f"Length of BERT base vocabulary = {len(tokenizer.vocab)}")

Length of BERT base vocabulary = 30522


In [None]:
text1 = "I love my pet Python."
text2 = "I love coding in Python."

In [None]:
tokens1 = tokenizer.encode(text1)
tokens2 = tokenizer.encode(text2)

In [None]:
for t in tokens1:
  print(tokenizer.decode([t]))

[CLS]
i
love
my
pet
python
.
[SEP]


In [None]:
for t in tokens2:
  print(tokenizer.decode([t]))

[CLS]
i
love
coding
in
python
.
[SEP]


In [None]:
"Niel" in tokenizer.vocab

False

In [None]:
for_unknown = tokenizer.encode("Niel")

In [None]:
for t in for_unknown:
  print(tokenizer.decode([t]))

[CLS]
ni
##el
[SEP]


In [None]:
tokens_plus1 = tokenizer.encode_plus(text1)
tokens_plus2 = tokenizer.encode_plus(text2)

In [None]:
print(tokens_plus1)

{'input_ids': [101, 1045, 2293, 2026, 9004, 18750, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}


In [None]:
general = model(torch.tensor(tokens1).unsqueeze(0))[0][:,5,:].detach().numpy()
code = model(torch.tensor(tokens2).unsqueeze(0))[0][:,5,:].detach().numpy()


In [None]:
general_embedding = model(torch.tensor(tokenizer.encode("snake")).unsqueeze(0))[0][:,1,:].detach().numpy()
programming_embedding = model(torch.tensor(tokenizer.encode("programming")).unsqueeze(0))[0][:,1,:].detach().numpy()


In [None]:
code.shape

(1, 768)

In [None]:
programming_embedding.shape

(1, 768)

In [None]:
cosine_similarity(code, programming_embedding)

array([[0.28580743]], dtype=float32)

In [None]:
cosine_similarity(code, general_embedding)

array([[0.30203456]], dtype=float32)

## Many Embeddings of BERT

In [None]:
model.embeddings

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [None]:
ex_phrase = "I am Niel"

In [None]:
tokens_pt= tokenizer.encode(ex_phrase,return_tensors="pt")
tokens_pt

tensor([[ 101, 1045, 2572, 9152, 2884,  102]])

In [None]:
model.embeddings.word_embeddings(torch.tensor(tokenizer.encode(ex_phrase,return_tensors="pt").clone().detach().requires_grad_(False)))

  model.embeddings.word_embeddings(torch.tensor(tokenizer.encode(ex_phrase,return_tensors="pt").clone().detach().requires_grad_(False)))


tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0211,  0.0059, -0.0179,  ...,  0.0163,  0.0122,  0.0073],
         [-0.0437, -0.0150,  0.0029,  ..., -0.0282,  0.0474, -0.0448],
         [-0.0163, -0.0844, -0.0084,  ..., -0.0216,  0.0330, -0.0381],
         [-0.0439,  0.0315, -0.0015,  ..., -0.0086, -0.0030,  0.0044],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
model.embeddings.word_embeddings(torch.tensor(tokenizer.encode("I am Robert",return_tensors="pt").clone().detach().requires_grad_(False)))

  model.embeddings.word_embeddings(torch.tensor(tokenizer.encode("I am Robert",return_tensors="pt").clone().detach().requires_grad_(False)))


tensor([[[ 0.0136, -0.0265, -0.0235,  ...,  0.0087,  0.0071,  0.0151],
         [-0.0211,  0.0059, -0.0179,  ...,  0.0163,  0.0122,  0.0073],
         [-0.0437, -0.0150,  0.0029,  ..., -0.0282,  0.0474, -0.0448],
         [-0.0033, -0.0325, -0.0210,  ...,  0.0133, -0.0615, -0.0664],
         [-0.0145, -0.0100,  0.0060,  ..., -0.0250,  0.0046, -0.0015]]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
model.embeddings.position_embeddings

Embedding(512, 768)

In [None]:
model.embeddings.position_embeddings(torch.tensor(range(6))).shape

torch.Size([6, 768])

In [None]:
model.embeddings.token_type_embeddings

Embedding(2, 768)

In [None]:
model.embeddings.token_type_embeddings(torch.tensor([0]*6))

tensor([[ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086],
        [ 0.0004,  0.0110,  0.0037,  ..., -0.0066, -0.0034, -0.0086]],
       grad_fn=<EmbeddingBackward0>)

In [None]:
model.embeddings.LayerNorm(
    model.embeddings.word_embeddings(torch.tensor(tokenizer.encode(ex_phrase,return_tensors="pt").clone().detach().requires_grad_(False))) + \
    model.embeddings.position_embeddings(torch.tensor(range(6))) + \
    model.embeddings.token_type_embeddings(torch.tensor([0]*6))
)

  model.embeddings.word_embeddings(torch.tensor(tokenizer.encode(ex_phrase,return_tensors="pt").clone().detach().requires_grad_(False))) + \


tensor([[[ 1.6855e-01, -2.8577e-01, -3.2613e-01,  ..., -2.7571e-02,
           3.8253e-02,  1.6400e-01],
         [-3.4024e-04,  5.3974e-01, -2.8805e-01,  ...,  7.5731e-01,
           8.9008e-01,  1.6575e-01],
         [-6.3496e-01,  1.9748e-01,  2.5116e-01,  ..., -4.0819e-02,
           1.3468e+00, -6.9357e-01],
         [ 3.3420e-02, -1.0170e+00, -1.2740e-02,  ...,  2.2663e-01,
           9.4980e-01, -4.0098e-01],
         [-4.1618e-01,  8.9394e-01,  3.3912e-01,  ...,  4.1110e-01,
           3.7617e-01,  4.6739e-01],
         [-3.2507e-01, -3.1879e-01, -1.1632e-01,  ..., -3.9602e-01,
           4.1120e-01, -7.7552e-02]]], grad_fn=<NativeLayerNormBackward0>)

In [None]:
model.embeddings(tokenizer.encode(ex_phrase,return_tensors="pt")).shape

torch.Size([1, 6, 768])