In [1]:
import torch
from transformers import AutoTokenizer, AutoModel, AutoModelForCausalLM
from transformers import BertTokenizer, BertForMaskedLM
from sentence_transformers import SentenceTransformer
import numpy as np
import random
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
from transformers import XLMRobertaModel, XLMRobertaTokenizer
import torch
tokenizer = XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")
model = XLMRobertaModel.from_pretrained("xlm-roberta-base")

In [2]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [12]:
sentences = [
    "ASAN xidmət mərkəzlərinə xoş gəlmisiniz!"
]

In [13]:
for sentence in sentences:
    # Tokenize and convert to model input format
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    
    # Get model output
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Extract embeddings (last hidden state)
    embeddings = outputs.last_hidden_state
    
    # Find token ID for the word "bank"
    tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])
    # bank_idx = tokens.index("bank")
    
    # # Extract the embedding for "bank" in this context
    # bank_embedding = embeddings[0, bank_idx]
    
    # print(f"\nSentence: {sentence}")
    # print(f"Embedding dimension: {bank_embedding.shape}")
    # print(f"First 5 values of 'bank' embedding: {bank_embedding[:5].numpy()}")

In [14]:
tokens

['<s>',
 '▁',
 'ASAN',
 '▁xidmət',
 '▁mərkəzləri',
 'nə',
 '▁xoş',
 '▁gəl',
 'mi',
 'siniz',
 '!',
 '</s>']

In [18]:
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example sentences in English and a low-resource language (e.g., Azerbaijani)
sentences = [
    "I am a Machine Learning Engineer.",  # English
    "Mən dizaynerliklə məşğul oluram."  # Azerbaijani
]

# Get embeddings for both sentences
embeddings = []

for sentence in sentences:
    # Tokenize and get model inputs
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True)
    
    # Forward pass, no gradient needed
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Use the [CLS] token embedding as sentence representation
    sentence_embedding = outputs.last_hidden_state[:, 0, :]
    embeddings.append(sentence_embedding)
    
    print(f"Sentence: {sentence}")
    print(f"Embedding shape: {sentence_embedding.shape}")
    print(f"First 5 values: {sentence_embedding[0, :5].numpy()}\n")

# Calculate similarity between the English and Azerbaijani sentences
similarity = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1])
print(f"Similarity between English and Azerbaijani sentences: {similarity.item():.4f}")

Sentence: I am a Machine Learning Engineer.
Embedding shape: torch.Size([1, 768])
First 5 values: [ 0.1772717  -0.26520622  0.5473463   0.25062045  0.23591822]

Sentence: Mən dizaynerliklə məşğul oluram.
Embedding shape: torch.Size([1, 768])
First 5 values: [-0.06480627 -0.25263363  0.12214869  0.38965446 -0.24019417]

Similarity between English and Azerbaijani sentences: 0.6739


In [2]:
# 1. Load embedding model (better to use sentence-transformers for this)
model = SentenceTransformer('all-MiniLM-L6-v2')

# 2. Example knowledge base (in production, this would be much larger)
documents = [
    "Python is a high-level programming language known for its readability.",
    "TensorFlow is a machine learning framework developed by Google.",
    "PyTorch is a machine learning framework developed by Facebook.",
    "Contextual embeddings capture word meaning based on surrounding context.",
    "BERT is a transformer-based model that generates contextual embeddings."
]

# 3. Encode documents (create embeddings)
document_embeddings = model.encode(documents)

# 4. RAG retrieval function
def retrieve_relevant_context(query, top_k=2):
    # Encode the query
    query_embedding = model.encode([query])[0]
    
    # Calculate similarity with all documents
    similarities = cosine_similarity([query_embedding], document_embeddings)[0]
    
    # Get top-k most similar documents
    top_indices = np.argsort(similarities)[-top_k:][::-1]
    
    # Return relevant documents and their similarity scores
    results = []
    for idx in top_indices:
        results.append({
            "document": documents[idx],
            "similarity": similarities[idx]
        })
    
    return results

# 5. Example query
query = "What are contextual embeddings in NLP?"
relevant_docs = retrieve_relevant_context(query)

print(f"Query: {query}\n")
print("Retrieved relevant documents:")
for i, doc in enumerate(relevant_docs):
    print(f"{i+1}. {doc['document']} (similarity: {doc['similarity']:.4f})")

# In a complete RAG system, these retrieved documents would be passed to 
# a generative model (like GPT) to produce the final response



.gitattributes:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

model.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O1.onnx:   0%|          | 0.00/90.4M [00:00<?, ?B/s]

model_O2.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O3.onnx:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

model_O4.onnx:   0%|          | 0.00/45.2M [00:00<?, ?B/s]

model_qint8_arm64.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_qint8_avx512_vnni.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

model_quint8_avx2.onnx:   0%|          | 0.00/23.0M [00:00<?, ?B/s]

openvino_model.bin:   0%|          | 0.00/90.3M [00:00<?, ?B/s]

openvino_model.xml:   0%|          | 0.00/211k [00:00<?, ?B/s]

openvino_model_qint8_quantized.bin:   0%|          | 0.00/22.9M [00:00<?, ?B/s]

openvino_model_qint8_quantized.xml:   0%|          | 0.00/368k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

  _torch_pytree._register_pytree_node(


Query: What are contextual embeddings in NLP?

Retrieved relevant documents:
1. Contextual embeddings capture word meaning based on surrounding context. (similarity: 0.7027)
2. BERT is a transformer-based model that generates contextual embeddings. (similarity: 0.6570)


In [5]:
# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForMaskedLM.from_pretrained('bert-base-uncased')

# Example sentence
text = "The cat sat on the mat."

# Tokenize
tokens = tokenizer.tokenize(text)
print(f"Original tokens: {tokens}")

# Create input IDs
input_ids = tokenizer.convert_tokens_to_ids(tokens)

# Create a copy of input IDs for the labels (ground truth)
labels = input_ids.copy()

# Constants for masking procedure
mask_token_id = tokenizer.convert_tokens_to_ids(['[MASK]'])[0]
vocab_size = tokenizer.vocab_size
masking_prob = 0.15

# Randomly mask tokens for MLM
for i in range(len(input_ids)):
    if random.random() < masking_prob:  # 15% chance to mask
        rand = random.random()
        
        if rand < 0.8:  # 80% of the time, replace with [MASK]
            input_ids[i] = mask_token_id
        elif rand < 0.9:  # 10% of the time, replace with random word
            input_ids[i] = random.randint(0, vocab_size - 1)
        # 10% of the time, keep the word unchanged
    
# Convert to tensor format for the model
input_tensor = torch.tensor([input_ids])
labels_tensor = torch.tensor([labels])

# In real training, tokens where no masking occurred would have label = -100
# to ignore them in the loss calculation

# Forward pass and calculate loss
outputs = model(input_tensor, labels=labels_tensor)
loss = outputs.loss

print(f"Masked tokens: {tokenizer.convert_ids_to_tokens(input_ids)}")
print(f"MLM Loss: {loss.item()}")

# During actual pre-training, this loss would be backpropagated to update the model weights

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Original tokens: ['the', 'cat', 'sat', 'on', 'the', 'mat', '.']
Masked tokens: ['the', '[unused794]', 'sat', 'on', 'the', 'mat', '.']
MLM Loss: 2.898212432861328


In [17]:
# Load model and tokenizer (using a smaller model for example purposes)
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Few-shot examples for sentiment classification in a low-resource language (Azerbaijani)
few_shot_prompt = """
Classify these Azerbaijani sentences as positive or negative.

Sentence: Bu film çox maraqlı idi. (This movie was very interesting.)
Sentiment: Positive

Sentence: Hava bu gün çox gözəldir. (The weather is very beautiful today.)
Sentiment: Positive

Sentence: Yeməkdən razı deyiləm. (I am not satisfied with the food.)
Sentiment: Negative

Sentence: Kitab məni məyus etdi. (The book disappointed me.)
Sentiment: Negative

Sentence: Bu məhsul keyfiyyətli deyil. (This product is not of good quality.)
Sentiment: 
"""

# Tokenize and generate
inputs = tokenizer(few_shot_prompt, return_tensors="pt")
outputs = model.generate(
    inputs.input_ids,
    max_length=len(inputs.input_ids[0]) + 10,
    temperature=0.7,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id
)

# Decode and print result
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)


Classify these Azerbaijani sentences as positive or negative.

Sentence: Bu film çox maraqlı idi. (This movie was very interesting.)
Sentiment: Positive

Sentence: Hava bu gün çox gözəldir. (The weather is very beautiful today.)
Sentiment: Positive

Sentence: Yeməkdən razı deyiləm. (I am not satisfied with the food.)
Sentiment: Negative

Sentence: Kitab məni məyus etdi. (The book disappointed me.)
Sentiment: Negative

Sentence: Bu məhsul keyfiyyətli deyil. (This product is not of good quality.)
Sentiment: 

Sentence: Yeməkd


In [18]:
# Load multilingual BERT
model_name = "bert-base-multilingual-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Example sentence in Azerbaijani
sentence = "Mən təbii dil emalını sevirəm."  # "I love natural language processing."

# Tokenize
inputs = tokenizer(sentence, return_tensors="pt")
tokens = tokenizer.convert_ids_to_tokens(inputs.input_ids[0])

print(f"Tokenized sentence: {tokens}")

# Get embeddings
with torch.no_grad():
    outputs = model(**inputs)

# Get sentence-level embedding (using [CLS] token)
sentence_embedding = outputs.last_hidden_state[:, 0, :]
print(f"Sentence embedding shape: {sentence_embedding.shape}")

# Get token-level embeddings
token_embeddings = outputs.last_hidden_state[0]
print(f"Token embeddings shape: {token_embeddings.shape}")

# Print embedding for each token
for i, token in enumerate(tokens):
    print(f"Token: {token}, Embedding shape: {token_embeddings[i].shape}")
    print(f"First 5 values: {token_embeddings[i][:5].numpy()}")

Tokenized sentence: ['[CLS]', 'M', '##ən', 't', '##əbii', 'dil', 'em', '##alı', '##nı', 'se', '##vir', '##əm', '.', '[SEP]']
Sentence embedding shape: torch.Size([1, 768])
Token embeddings shape: torch.Size([14, 768])
Token: [CLS], Embedding shape: torch.Size([768])
First 5 values: [-0.051672    0.07287467  0.20710555  0.45764723 -0.06344569]
Token: M, Embedding shape: torch.Size([768])
First 5 values: [ 0.42265168  0.06584752  0.8201619   0.48402944 -0.25524417]
Token: ##ən, Embedding shape: torch.Size([768])
First 5 values: [ 0.03612332  0.1700448   0.92322004  0.45827347 -0.39794305]
Token: t, Embedding shape: torch.Size([768])
First 5 values: [ 0.15537417 -0.37486845  1.3681626   0.67887956 -0.6675606 ]
Token: ##əbii, Embedding shape: torch.Size([768])
First 5 values: [ 0.1742309  -0.31118524  1.2627839   0.96189666 -0.39453092]
Token: dil, Embedding shape: torch.Size([768])
First 5 values: [ 0.22533163  0.16853754  1.3803712   0.49217775 -0.3043777 ]
Token: em, Embedding shape: to