<a href="https://colab.research.google.com/github/pandiarajan-src/DataEngineering/blob/main/LLMOps%5CBERT_Embeddigns_Simple.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# prompt: I want to implement a simple embeddings example using BERT

!pip install transformers sentence-transformers

from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer, util
import torch

# ---------------------------------------------------------------------------
# Part 1: embeddings from a plain BERT checkpoint (Hugging Face transformers)
# ---------------------------------------------------------------------------

# The checkpoint name lives in one variable so the tokenizer and the model are
# always loaded from the same place; swap it to try a different BERT variant.
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Input texts; the same list is reused by the SentenceTransformer part below.
sentences = [
    "This is an example sentence.",
    "Each sentence is converted into an embedding.",
    "These embeddings can be used for various NLP tasks.",
    "Sentence embeddings are useful for semantic similarity.",
]

print(sentences)

# Tokenize the whole list as one batch. padding=True pads the shorter
# sentences so the batch can be returned as PyTorch tensors
# (return_tensors='pt'); truncation=True is requested as well.
encoded_input = tokenizer(
    sentences,
    padding=True,
    truncation=True,
    return_tensors='pt',
)
print(f"encoded_input: \n {encoded_input}")

# Run the forward pass with gradient tracking switched off, then keep only
# position 0 of every sequence in last_hidden_state -- the [CLS] token -- as
# that sentence's embedding.
with torch.no_grad():
    model_output = model(**encoded_input)
    embeddings = model_output.last_hidden_state[:, 0]

print(f"model_output: \n {model_output}")
print(f"embeddings: \n {embeddings}")

# All-pairs cosine similarity of the sentence embeddings against themselves.
cosine_scores = util.cos_sim(embeddings, embeddings)

print(f"cosine_scores: \n {cosine_scores}")


# ---------------------------------------------------------------------------
# Part 2: the same comparison through the sentence-transformers library
# ---------------------------------------------------------------------------

# encode() is handed the raw strings directly; no separate tokenizer call or
# manual selection of a token position is needed here.
model_sentence_transformers = SentenceTransformer('all-mpnet-base-v2')
embeddings_sentence_transformers = model_sentence_transformers.encode(sentences)
cosine_scores_sentence_transformers = util.cos_sim(
    embeddings_sentence_transformers,
    embeddings_sentence_transformers,
)

print(f"cosine_scores_sentence_transformers: \n {cosine_scores_sentence_transformers}")


['This is an example sentence.', 'Each sentence is converted into an embedding.', 'These embeddings can be used for various NLP tasks.', 'Sentence embeddings are useful for semantic similarity.']
encoded_input: 
 {'input_ids': tensor([[  101,  2023,  2003,  2019,  2742,  6251,  1012,   102,     0,     0,
             0,     0,     0,     0,     0,     0],
        [  101,  2169,  6251,  2003,  4991,  2046,  2019,  7861,  8270,  4667,
          1012,   102,     0,     0,     0,     0],
        [  101,  2122,  7861,  8270,  4667,  2015,  2064,  2022,  2109,  2005,
          2536, 17953,  2361,  8518,  1012,   102],
        [  101,  6251,  7861,  8270,  4667,  2015,  2024,  6179,  2005, 21641,
         14402,  1012,   102,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0