#Generating contexualized word embeddings

In [9]:
from transformers import AutoModel, AutoTokenizer
#Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/deberta-base")

#Load Language model
model = AutoModel.from_pretrained("microsoft/deberta-v3-xsmall")

#Tokenize the sentence
tokens = tokenizer('Hello world', return_tensors='pt')

#Process the tokens
output = model(**tokens)[0]

In [10]:
output.shape

torch.Size([1, 4, 384])

This means that we have four tokens in embedded in a vector of 384 values. The first value is the number of batches used in the training phase.
If we want to speed up the process by running more batches / sentences at one time we can increase the batch.

In [7]:
print(output)

tensor([[[-3.4816,  0.0861, -0.1819,  ..., -0.0612, -0.3911,  0.3017],
         [ 0.1898,  0.3208, -0.2315,  ...,  0.3714,  0.2478,  0.8048],
         [ 0.2071,  0.5036, -0.0485,  ...,  1.2175, -0.2292,  0.8582],
         [-3.4278,  0.0645, -0.1427,  ...,  0.0658, -0.4367,  0.3834]]],
       grad_fn=<NativeLayerNormBackward0>)


In [11]:
print(tokens)

{'input_ids': tensor([[    1, 31414,   232,     2]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}


In [12]:
for _ in tokens['input_ids']:
  print(tokenizer.decode(_))

[CLS]Hello world[SEP]


#Text Embedding

In [16]:
from sentence_transformers import SentenceTransformer

#Load Model
model = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

#Convert text to text embeddings
vector = model.encode('Hello world')

print(type(vector))
print(vector.shape)
print(vector)

<class 'numpy.ndarray'>
(768,)
[ 2.62497496e-02  1.33955907e-02 -4.53314325e-03 -2.17914507e-02
  5.45518696e-02 -4.96646622e-03  6.65558968e-03  3.06263138e-02
 -5.76279080e-03 -4.56203381e-03 -3.31330928e-03 -4.84962612e-02
 -1.13640139e-02  3.50774154e-02  9.30946991e-02 -8.66874307e-02
  5.10865450e-02  9.88612417e-03 -6.35692701e-02 -8.55018292e-03
  7.05439178e-03 -3.86240124e-03  2.47443132e-02  4.28849496e-02
  3.50941643e-02 -2.98482180e-02  1.02525717e-02  2.23449674e-02
  2.08900124e-02  9.49222594e-03 -3.30444016e-02 -1.22841485e-02
  5.35289198e-02  2.54292116e-02  2.02217711e-06 -3.41910273e-02
  9.61000845e-03 -1.64845362e-02  5.60950860e-03 -4.25004447e-03
 -2.28012074e-02  4.03547138e-02  3.05203488e-03  3.13725881e-02
 -1.08123627e-02 -3.55708264e-02  2.22929195e-02  1.68711902e-03
  2.07723165e-03  2.31162757e-02  6.88586896e-03 -6.83093350e-03
 -4.87613119e-02 -2.70107836e-02  1.54911485e-02  3.73168960e-02
  2.72793397e-02  2.64989305e-02 -1.69231603e-03 -2.8822375

#Pretrained word Embeddings

In [18]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m39.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [20]:
!pip install --upgrade --force-reinstall numpy

Collecting numpy
  Downloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-2.2.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.4/16.4 MB[0m [31m63.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: numpy
  Attempting uninstall: numpy
    Found existing installation: numpy 1.26.4
    Uninstalling numpy-1.26.4:
      Successfully uninstalled numpy-1.26.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gensim 4.3.3 requires numpy<2.0,>=1.18.5, but you have numpy 2.2.4 which is incompatible.
numba 0.60.0 requires numpy<2.1,>=1.22, but you have numpy 2.2.4 which is incom

In [2]:
!pip install --upgrade --force-reinstall gensim

Collecting gensim
  Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
Collecting scipy<1.14.0,>=1.7.0 (from gensim)
  Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
Collecting smart-open>=1.8.1 (from gensim)
  Downloading smart_open-7.1.0-py3-none-any.whl.metadata (24 kB)
Collecting wrapt (from smart-open>=1.8.1->gensim)
  Downloading wrapt-1.17.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.4 kB)
Using cached gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
Using cached numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
Using cached scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (38

In [1]:
import gensim.downloader as api

In [2]:
model = api.load('glove-wiki-gigaword-50')



In [3]:
model.most_similar([model['cat']],topn=11)

[('cat', 1.0),
 ('dog', 0.9218004941940308),
 ('rabbit', 0.8487821221351624),
 ('monkey', 0.8041081428527832),
 ('rat', 0.7891963124275208),
 ('cats', 0.7865270972251892),
 ('snake', 0.7798910737037659),
 ('dogs', 0.7795814871788025),
 ('pet', 0.7792249917984009),
 ('mouse', 0.7731667757034302),
 ('bite', 0.7728800177574158)]

#Dynamic Embeddings

In [4]:
from transformers import AutoTokenizer, AutoModel
import torch

# 1. Load a pre-trained BERT model and tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [5]:
# 2. Example sentences with the word "bank"
sentences = [
    "She is swimming in the river bank",
    "She withdrew 100 euro off of her bank account"
]

In [6]:
# 3. Tokenize and convert sentences to model input tensors
encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')

print(encoded_input)

{'input_ids': tensor([[ 101, 2016, 2003, 5742, 1999, 1996, 2314, 2924,  102,    0,    0],
        [ 101, 2016, 6780, 2531, 9944, 2125, 1997, 2014, 2924, 4070,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [10]:
# 4. Run the sentences through BERT
# feeding our tokenized input into the BERT model to get its contextual embeddings
with torch.no_grad():
    outputs = model(**encoded_input)
# BERT goes through multiple transformer layers (each with self-attention mechanisms).
# The outputs include "last_hidden_state" of shape [batch_size, seq_length, hidden_dim]
print(outputs)
last_hidden_states = outputs.last_hidden_state
# This is a tensor of shape [batch_size, sequence_length, hidden_dim], where:
# batch_size = number of sentences (in our example, 2)
# sequence_length = number of tokens in each sentence after tokenization (padding makes them the same length)
# Each position [i, j] in the last_hidden_state is the contextual embedding for the j-th token in the i-th sentence.
# Because BERT reads the entire sentence, each token’s vector is influenced by its context, so the embedding for “bank” in “river bank” differs from “bank” in “bank account.”
# hidden_dim = the hidden size of the BERT model (for bert-base-uncased, it’s 768 by default)

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.0079, -0.1184, -0.1200,  ..., -0.3597,  0.1501,  0.3539],
         [ 0.2412, -0.8554,  0.1856,  ..., -0.9770,  0.2875, -0.1332],
         [ 0.2337, -0.3878,  0.1125,  ..., -0.5350, -0.1055,  0.1853],
         ...,
         [ 0.6876,  0.0099, -0.4805,  ..., -0.0780, -0.6332, -0.6509],
         [-0.1134, -0.2845,  0.1771,  ...,  0.0539,  0.3155, -0.0449],
         [-0.2174, -0.2968,  0.2114,  ...,  0.1056,  0.3928, -0.1537]],

        [[-0.2659, -0.0949, -0.0534,  ..., -0.2797,  0.0812,  0.5101],
         [ 0.0210, -0.9154, -0.2446,  ..., -0.2996,  0.3267, -0.0536],
         [-0.9583, -0.1491,  0.3209,  ..., -0.5905, -0.0805,  0.2630],
         ...,
         [ 0.7711, -0.5809,  0.6585,  ..., -0.2036,  0.2418,  0.0377],
         [-0.2100, -0.8931, -0.1297,  ..., -0.2923, -0.1967,  0.4327],
         [ 0.7873, -0.1549, -0.2069,  ...,  0.0623, -0.5462, -0.1654]]]), pooler_output=tensor([[-0.9109, -0.5117, -0.8502,  .

In [11]:
# 5. Locate the token "bank" in each sentence and extract its hidden state
for i, sentence in enumerate(sentences):
    # Get the tokens for the sentence
    tokens = tokenizer.tokenize(tokenizer.decode(encoded_input["input_ids"][i]))
    print(f"Sentence {i}: {sentence}")
    print(f"Tokens: {tokens}")

    # Find index of the word "bank" in the tokens
    # (Remember sometimes BERT splits words, e.g. "bank" might be "bank" or "ban", "##k" etc.
    #  but in this case, "bank" should remain whole. You may need a more robust approach
    #  if the word is sub-word tokenized.)
    token_index = tokens.index("bank")

    # Extract the embedding vector for "bank"
    bank_embedding = last_hidden_states[i, token_index, :]
    print(f"bank embedding (first 5 dims): {bank_embedding[:5]}\n")


Sentence 0: She is swimming in the river bank
Tokens: ['[CLS]', 'she', 'is', 'swimming', 'in', 'the', 'river', 'bank', '[SEP]', '[PAD]', '[PAD]']
bank embedding (first 5 dims): tensor([-0.3768, -0.7126, -0.3114, -0.1963, -0.0266])

Sentence 1: She withdrew 100 euro off of her bank account
Tokens: ['[CLS]', 'she', 'withdrew', '100', 'euro', 'off', 'of', 'her', 'bank', 'account', '[SEP]']
bank embedding (first 5 dims): tensor([ 0.7711, -0.5809,  0.6585, -0.1168,  0.5897])

