In [None]:
# Setup: install the sentence-transformers package into the running kernel's
# environment. It is imported further down as `sentence_transformers`.
%pip install sentence-transformers


In [1]:
# Understand embeddings layer

import torch
import torch.nn as nn

class TransformerEmbeddings(nn.Module):
    """Input embedding layer that sums token, position and segment embeddings.

    The summed embeddings are layer-normalized and then passed through
    dropout (p=0.1).

    Args:
        vocab_size: Number of rows in the token embedding table.
        embedding_dim: Size of every embedding vector.
        max_length: Number of rows in the position embedding table. Input
            sequences must not be longer than this, otherwise the position
            lookup indexes past the end of the table.
    """

    def __init__(self, vocab_size, embedding_dim, max_length=512):
        # `super` has to be called to obtain the proxy object; the previous
        # `super.__init__()` raised TypeError before any layer was created.
        super().__init__()

        # Token embeddings: convert token ids to vectors
        self.token_embeddings = nn.Embedding(vocab_size, embedding_dim)

        # Positional embeddings: add sequence order information
        self.positional_embeddings = nn.Embedding(max_length, embedding_dim)

        # Segment embeddings: distinguish two parts of the input
        # (e.g., question vs context), hence a table with two rows
        self.segment_embeddings = nn.Embedding(2, embedding_dim)

        # Layer normalization and dropout applied to the summed embeddings
        self.layer_norm = nn.LayerNorm(embedding_dim)
        self.dropout = nn.Dropout(0.1)

    def forward(self, input_ids, segment_id=None):
        """Embed a batch of token ids.

        Args:
            input_ids: Integer tensor of token indices. Dimension 1 is read
                as the sequence length and position ids are expanded to this
                tensor's shape, so a (batch, seq_length) layout is expected.
            segment_id: Optional integer tensor with the same shape as
                ``input_ids`` holding 0 or 1 per token. When omitted, every
                token is assigned segment 0.

        Returns:
            Tensor with the shape of ``input_ids`` plus a trailing
            ``embedding_dim`` axis.
        """
        # Get sequence length from the input
        seq_length = input_ids.size(1)

        # Create position indices (0, 1, 2, ...) on the same device as the
        # input and repeat them for every row of the batch
        position_ids = torch.arange(seq_length, device=input_ids.device)
        position_ids = position_ids.unsqueeze(0).expand_as(input_ids)

        # Segment ids default to all zeros when none are given
        if segment_id is None:
            segment_id = torch.zeros_like(input_ids)

        # Combine all embeddings (attribute name fixed: was `sengment_embeddings`)
        embeddings = (
            self.token_embeddings(input_ids)
            + self.positional_embeddings(position_ids)
            + self.segment_embeddings(segment_id)
        )

        # Apply layer normalization and dropout
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings




KeyboardInterrupt: 

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Instantiate two SentenceTransformer models by name. They are kept under
# separate variables so the cells below can run the same comparison on each.
monolingual_model = SentenceTransformer('all-MiniLM-L6-v2')
multilingual_model = SentenceTransformer('paraphrase-multilingual-mpnet-base-v2')


In [None]:
# The same example sentence in four languages, keyed by language name.
# Insertion order (english, spanish, french, german) is the order in which
# the comparison below walks the entries.
sentences = dict(
    english="The weather is beautiful today.",
    spanish="El clima está hermoso hoy.",
    french="Le temps est magnifique aujourd'hui.",
    german="Das Wetter ist heute wunderschön.",
)

# Cosine similarity between two embedding vectors
def compute_similarity(emb1, emb2):
    """Return the dot product of the inputs divided by the product of their norms.

    Args:
        emb1: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        emb2: Second vector, same length as ``emb1``.

    Returns:
        The cosine of the angle between the two vectors. No special handling
        is done for zero-norm inputs; the division is performed as-is.
    """
    inner_product = np.dot(emb1, emb2)
    norm_product = np.linalg.norm(emb1) * np.linalg.norm(emb2)
    return inner_product / norm_product

# Print pairwise similarity scores for one model
def compare_sentences(model, sentences):
    """Encode every sentence with ``model`` and print each pair's similarity.

    Args:
        model: Object exposing ``encode(text, convert_to_numpy=True)``; its
            class name is used in the printed header.
        sentences: Mapping from a label (a language name in this notebook)
            to the text to encode.

    Returns:
        None. Results are written to stdout, one line per pair, formatted
        to four decimal places.
    """
    # Encode each sentence exactly once, keyed by its label
    vectors = {}
    for label, text in sentences.items():
        vectors[label] = model.encode(text, convert_to_numpy=True)

    # Keep a pair only when the first label sorts before the second; this
    # drops self-comparisons and the mirrored duplicate of every pair
    label_pairs = [
        (first, second)
        for first in sentences
        for second in sentences
        if first < second
    ]

    print(f"\nSimilarity scores for {model.__class__.__name__}:")
    for first, second in label_pairs:
        score = compute_similarity(vectors[first], vectors[second])
        print(f"{first} vs {second}: {score:.4f}")

In [None]:
# Test both models
# Runs the pairwise comparison once per model on the same `sentences` dict.
# Note: the loop variable `model` stays bound to `multilingual_model` (the
# last list element) after the loop finishes.
for model in [monolingual_model, multilingual_model]:
    compare_sentences(model, sentences)


Similarity scores for SentenceTransformer:
english vs spanish: 0.1565
english vs french: 0.1190
english vs german: 0.1472
french vs spanish: 0.3038
french vs german: 0.2329
german vs spanish: 0.0404

Similarity scores for SentenceTransformer:
english vs spanish: 0.9916
english vs french: 0.9766
english vs german: 0.9900
french vs spanish: 0.9801
french vs german: 0.9930
german vs spanish: 0.9877


In [None]:
# Example of batch processing for efficiency
# All sentence texts are handed to a single encode() call (batch_size=8)
# instead of being encoded one call at a time.
texts = list(sentences.values())
batch_embeddings = multilingual_model.encode(texts, batch_size=8)

In [None]:
# Good Practice
# Read the sequence limit from an explicitly named model. The bare name
# `model` only exists as the leftover loop variable of the comparison loop
# in an earlier cell, which makes this cell depend on hidden kernel state.
max_seq_length = multilingual_model.max_seq_length
def chunk_text(text, max_length=max_seq_length):
    """Cut ``text`` into consecutive slices of at most ``max_length`` items.

    Slicing is positional (``text[start:start + max_length]``), so for a
    string the pieces are fixed-width character windows; sentence or word
    boundaries are not taken into account.
    NOTE(review): the default comes from ``max_seq_length``, which for a
    SentenceTransformer presumably counts tokens rather than characters —
    confirm that a character-based window is what is wanted here.

    Args:
        text: Sequence to split (a string in this notebook).
        max_length: Maximum length of each piece.

    Returns:
        List of pieces in original order; empty when ``text`` is empty.
    """
    pieces = []
    for start in range(0, len(text), max_length):
        pieces.append(text[start:start + max_length])
    return pieces

# Process long document
# `long_document` was never defined in this notebook, so the cell failed with
# NameError on a fresh kernel. The stand-in below repeats the example
# sentences to get a text longer than one chunk; replace it with real data.
long_document = " ".join(list(sentences.values()) * 50)

# Encode with an explicitly named model instead of the leaked loop variable
# `model` from the comparison loop.
long_text_embeddings = multilingual_model.encode(chunk_text(long_document))