In [1]:
import torch
from transformers import XLMRobertaForTokenClassification, XLMRobertaModel, XLMRobertaTokenizer

In [2]:
# Load the tokenizer and the encoder from the same checkpoint so their
# vocabularies match.
checkpoint_name = "xlm-roberta-base"
tokenizer = XLMRobertaTokenizer.from_pretrained(checkpoint_name)
model = XLMRobertaModel.from_pretrained(checkpoint_name)

model.safetensors:  98%|#########7| 1.09G/1.12G [00:00<?, ?B/s]

In [13]:
# Sentences whose first-position embeddings are compared below.
sentences = [
    "I am a Machine Learning Engineer.",
    "I have studied at Baku Higher Oil School.",
]

# Put the encoder in evaluation mode before running inference.
model.eval()
embeddings = []

# Gradients are not needed for inference; disabling them also lets `.numpy()`
# be called directly on the output tensors.
with torch.no_grad():
    for sent in sentences:
        inputs = tokenizer(sent, return_tensors="pt", padding=True)
        outputs = model(**inputs)

        # Use the hidden state at token position 0 as the sentence vector.
        sent_embedding = outputs.last_hidden_state[:, 0, :]
        embeddings.append(sent_embedding)

        print(f"Sentence: {sent}")
        print(f"Sentence embedding: {sent_embedding[0, :15].numpy()}\n")


# Cosine similarity between the two sentence vectors (one value, batch size 1).
first_embedding, second_embedding = embeddings[0], embeddings[1]
similarity = torch.nn.functional.cosine_similarity(first_embedding, second_embedding, dim=1)
print(f"\nSimilarity between sentences is: {similarity.item():.4f}")

Sentence: I am a Machine Learning Engineer.
Sentence embedding: [ 0.07968017  0.11534335  0.08195748 -0.02582322  0.08123931 -0.04061352
  0.02149741 -0.03822808  0.10016774 -0.20736234  0.00568451  0.19195543
 -0.05058713  0.03820678  0.03627932]

Sentence: I have studied at Baku Higher Oil School.
Sentence embedding: [ 0.10620356  0.09569398  0.0708335   0.00350476  0.0746572  -0.0956919
  0.01693573 -0.0437999   0.13914847 -0.16263598 -0.00573601  0.18464628
 -0.06672061  0.02892233  0.05468758]


Similarity between sentences is: 0.9982


In [None]:
# Token classification needs a model with a classification head: the bare
# `XLMRobertaModel` bound to `model` returns hidden states only and its output
# has no `.logits` attribute, so a dedicated model is loaded under its own name
# (leaving `model` untouched for the other cells).
# NOTE(review): "xlm-roberta-base" is not fine-tuned for token classification, so
# the head is presumably freshly initialised and the predicted ids are not
# meaningful until a fine-tuned checkpoint is substituted — confirm which
# checkpoint is intended here.
token_classifier = XLMRobertaForTokenClassification.from_pretrained("xlm-roberta-base")
token_classifier.eval()

inputs = tokenizer(
    "HuggingFace is a company based in Paris and New York", add_special_tokens=False, return_tensors="pt"
)

# Inference only: no gradients required.
with torch.no_grad():
    logits = token_classifier(**inputs).logits

# Highest-scoring class id for every token (argmax over the last dimension).
predicted_token_class_ids = logits.argmax(-1)

In [14]:
# Mean pooling function
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Object exposing a ``last_hidden_state`` tensor of token
            embeddings.
        attention_mask: Tensor that, after a trailing dimension is appended, is
            expanded to the shape of ``last_hidden_state``; positions with value
            0 do not contribute to the average.

    Returns:
        Tensor holding, per sequence, the mask-weighted sum of token embeddings
        divided by the mask total (clamped to at least 1e-9 to avoid division
        by zero).
    """
    hidden_states = model_output.last_hidden_state

    # Broadcast the mask over the embedding dimension and make it floating point
    # so it can weight the embeddings.
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    # Reduce over dimension 1: weighted sum of embeddings and total mask weight.
    masked_total = (hidden_states * mask).sum(dim=1)
    mask_total = mask.sum(dim=1).clamp(min=1e-9)

    return masked_total / mask_total

# Test sentences. Each `*_pairs` list holds [sentence_a, sentence_b] pairs; the
# matching `*_labels` list records, index by index, whether that pair is expected
# to be "similar" or "different" in meaning. The label is stored explicitly
# because the position of a pair in its list does not determine its meaning
# (the only English pair is a "different" pair at index 0).
azerbaijani_pairs = [
    # Similar meaning
    ["Mən maşın öyrənmə mühəndisiyəm.", "Mənim işimin adı maşın öyrənmə mühəndisliyidir."],
    
    # Different meaning
    ["Mən maşın öyrənmə mühəndisiyəm.", "Mən Bakı Ali Neft Məktəbində oxumuşam."]
]
azerbaijani_labels = ["similar", "different"]

english_pairs = [
    # Different meaning
    ["I am a Machine Learning Engineer.", "I have studied at Baku Higher Oil School."]
]
english_labels = ["different"]

# (language name, sentence pairs, expected relation of each pair)
labelled_groups = [
    ("Azerbaijani", azerbaijani_pairs, azerbaijani_labels),
    ("English", english_pairs, english_labels),
]

# Process all sentence pairs
model.eval()
with torch.no_grad():
    for language, pairs, labels in labelled_groups:
        # Guard against zip() silently dropping pairs if the lists drift apart.
        assert len(pairs) == len(labels), f"{language}: every pair needs a label"
        print(f"\n{language} Sentence Pairs:")
        
        for pair, similarity_type in zip(pairs, labels):
            # Process both sentences in the pair
            embeddings = []
            for sent in pair:
                # Tokenize the sentence
                encoded_input = tokenizer(sent, padding=True, truncation=True, return_tensors='pt')
                
                # Compute token embeddings
                outputs = model(**encoded_input)
                
                # Apply mean pooling over the tokens selected by the attention mask
                embedding = mean_pooling(outputs, encoded_input['attention_mask'])
                embeddings.append(embedding)
                
                # Print first 5 values for inspection
                print(f"Sentence: {sent}")
                print(f"Embedding (first 5 values): {embedding[0, :5].numpy()}")
            
            # Compute cosine similarity between the two pooled sentence embeddings
            similarity = torch.nn.functional.cosine_similarity(embeddings[0], embeddings[1])
            print(f"Similarity between {similarity_type} sentences: {similarity.item():.4f}\n")


Azerbaijani Sentence Pairs:
Sentence: Mən maşın öyrənmə mühəndisiyəm.
Embedding (first 5 values): [ 0.03006841  0.08905941  0.02749911 -0.05828931  0.13824546]
Sentence: Mənim işimin adı maşın öyrənmə mühəndisliyidir.
Embedding (first 5 values): [ 0.05652007  0.00162089  0.03152019 -0.08628289  0.15883191]
Similarity between similar sentences: 0.9963

Sentence: Mən maşın öyrənmə mühəndisiyəm.
Embedding (first 5 values): [ 0.03006841  0.08905941  0.02749911 -0.05828931  0.13824546]
Sentence: Mən Bakı Ali Neft Məktəbində oxumuşam.
Embedding (first 5 values): [ 0.10853648  0.05083771  0.02417384 -0.06507861  0.1261565 ]
Similarity between different sentences: 0.9960


English Sentence Pairs:
Sentence: I am a Machine Learning Engineer.
Embedding (first 5 values): [-0.02279879  0.0478951   0.04590836 -0.00515924  0.16044599]
Sentence: I have studied at Baku Higher Oil School.
Embedding (first 5 values): [0.00707844 0.03284997 0.00859101 0.01044608 0.1108994 ]
Similarity between similar sen

In [17]:
from sentence_transformers import SentenceTransformer

# Load a multilingual sentence transformer.
# NOTE(review): this rebinds `model`, the same name the earlier cells use for the
# XLM-RoBERTa encoder. Re-running those cells after this one will operate on this
# SentenceTransformer object instead — consider a distinct name.
model = SentenceTransformer('distiluse-base-multilingual-cased-v2')

In [19]:
from sklearn.metrics.pairwise import cosine_similarity

# Encode the two Azerbaijani sentences with the sentence-transformer model
first_sentence = "Çox acam və yemək yeməliyəm."
second_sentence = "İnsan çox acdıqda yemək yeməlidir."
embedding1 = model.encode(first_sentence)
embedding2 = model.encode(second_sentence)

# Each embedding is wrapped in a list so it is passed as a one-row 2-D input;
# the score for the pair is then the [0][0] entry of the result.
pairwise_scores = cosine_similarity([embedding1], [embedding2])
similarity = pairwise_scores[0][0]
print(f"Similarity: {similarity:.4f}")

Similarity: 0.3408
