<a href="https://colab.research.google.com/github/namratabiswas/Attention_in_Transformers/blob/main/sentence_embedding_comparison.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install dependencies: PyTorch (model), NumPy and scikit-learn (cosine similarity).
# `-q` keeps pip's output quiet. The leading `!` runs the line as a shell command.
!pip install -q torch numpy scikit-learn

We import PyTorch for building and running the model, math for computing the positional encodings, and cosine_similarity to measure how close two sentence embeddings are.

In [2]:
# Imports
import torch
import torch.nn as nn
import math
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

Transformers have no inherent sense of word order. Positional encoding therefore adds a unique sine/cosine pattern to the embedding at each position, so the model can keep track of the order of the words.

In [3]:
# Positional Encoding
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    A table of shape ``[1, max_len, d_model]`` is pre-computed once and stored
    as a (non-trainable) buffer. Even feature columns hold ``sin`` values and
    odd feature columns hold ``cos`` values of ``position * div_term``.

    Args:
        d_model: Size of the feature (last) dimension. May be odd or even.
        max_len: Number of positions for which encodings are pre-computed.
    """

    def __init__(self, d_model, max_len=100):
        super().__init__()
        position = torch.arange(0, max_len).unsqueeze(1).float()  # [max_len, 1]
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        angles = position * div_term  # [max_len, ceil(d_model / 2)]

        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # With an odd d_model there is one fewer odd column than even columns,
        # so the cosine part must drop the last frequency to fit.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading axis of size 1 lets the table broadcast over the batch.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return ``x`` plus the encodings of its first ``x.size(1)`` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence axis and whose last
                dimension has size ``d_model`` (e.g. ``[batch, seq, d_model]``).

        Raises:
            ValueError: If the sequence is longer than the pre-computed table.
        """
        seq_len = x.size(1)
        table_len = self.pe.size(1)
        if seq_len > table_len:
            # Fail loudly instead of relying on a broadcasting error (or, for
            # a table of length 1, a silent broadcast of position 0).
            raise ValueError(
                f"sequence length {seq_len} exceeds max_len {table_len} of the positional encoding"
            )
        return x + self.pe[:, :seq_len]

In [4]:
# Transformer Encoder Model
class TransformerEncoderModel(nn.Module):
    """Encode a batch of token-id sequences into one vector per sequence.

    Pipeline: token embedding -> positional encoding -> stacked Transformer
    encoder layers -> mean pooling over the sequence -> linear projection.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding / hidden size.
        nhead: Number of attention heads per encoder layer.
        num_layers: Number of stacked encoder layers.
        dim_feedforward: Hidden size of each layer's feed-forward network.
        max_len: Maximum sequence length supported by the positional encoding.
    """

    def __init__(self, vocab_size, d_model=64, nhead=4, num_layers=2, dim_feedforward=256, max_len=100):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, max_len)
        encoder_layer = nn.TransformerEncoderLayer(d_model, nhead, dim_feedforward)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers)
        self.fc = nn.Linear(d_model, d_model)

    def forward(self, src, padding_mask=None):
        """Compute sentence embeddings.

        Args:
            src: Integer token ids of shape ``[batch, seq_len]``.
            padding_mask: Optional tensor of shape ``[batch, seq_len]`` that is
                true (non-zero) at padding positions. Those positions are
                ignored as attention keys and excluded from the pooled mean.
                When omitted, every position is treated as a real token,
                which is the original behaviour. Each row should keep at
                least one unmasked position; a fully masked row yields
                undefined (NaN) attention output.

        Returns:
            Tensor of shape ``[batch, d_model]``.
        """
        src = self.embedding(src)
        src = self.pos_encoder(src)
        # The encoder layers were built with the default batch_first=False.
        src = src.permute(1, 0, 2)  # [seq_len, batch, d_model]

        if padding_mask is None:
            output = self.transformer_encoder(src)
            output = output.mean(dim=0)  # Global average pooling
        else:
            padding_mask = padding_mask.to(device=src.device, dtype=torch.bool)
            output = self.transformer_encoder(src, src_key_padding_mask=padding_mask)
            # 1.0 for real tokens, 0.0 for padding, laid out as [seq_len, batch, 1].
            keep = (~padding_mask).transpose(0, 1).unsqueeze(-1).to(output.dtype)
            # Average only over real tokens; clamp guards against division by zero.
            output = (output * keep).sum(dim=0) / keep.sum(dim=0).clamp(min=1.0)

        return self.fc(output)


Parameters Explained:

vocab_size: Size of vocabulary

d_model: Dimensionality of embeddings

nhead: Number of attention heads

num_layers: Number of stacked encoder layers

dim_feedforward: Hidden size of the feed-forward network inside each encoder layer

max_len: Maximum sequence length supported by the positional encoding

In [5]:
# Toy Vocabulary and Tokenizer
# Word -> integer id lookup; ids follow the order of the word list.
vocab = {word: idx for idx, word in enumerate(['I', 'like', 'cats', 'dogs', 'hate', 'you', 'love'])}


def tokenize(sentence):
    """Map a whitespace-separated sentence to a list of vocabulary ids.

    Matching is exact (case-sensitive). Words that are not in ``vocab`` are
    skipped rather than reported, so the result may be shorter than the
    number of words in ``sentence``.
    """
    token_ids = []
    for word in sentence.split():
        token_id = vocab.get(word)
        if token_id is not None:
            token_ids.append(token_id)
    return token_ids


In [6]:
# Sample Sentences
# Each entry is a (first sentence, second sentence) pair to be compared.
sentences = [
    ("I like cats", "I like dogs"),    # same subject and verb, different object
    ("I love cats", "I hate cats"),    # same subject and object, different verb
    ("you like dogs", "I like cats"),  # same verb, different subject and object
]

In [7]:
#  Padding and Tensors

# Pads (or truncates) input to a fixed length
def pad_sequence(seq, max_len, pad_value=0):
    """Return a new list of exactly ``max_len`` items built from ``seq``.

    Shorter inputs are right-padded with ``pad_value``. Longer inputs are
    truncated to their first ``max_len`` items, so every result has the same
    length and can be stacked into a batch.

    Args:
        seq: Iterable of token ids.
        max_len: Target length; must be non-negative.
        pad_value: Value appended to reach ``max_len``.

    Raises:
        ValueError: If ``max_len`` is negative.
    """
    if max_len < 0:
        raise ValueError(f"max_len must be non-negative, got {max_len}")
    # Copying via list() leaves the caller's sequence untouched and also
    # accepts tuples / other iterables.
    clipped = list(seq)[:max_len]
    return clipped + [pad_value] * (max_len - len(clipped))

# Fixed token length every sentence is padded to
max_len = 5

# One (tensor, tensor) pair per sentence pair, each tensor holding max_len ids.
# Note: the default pad id (0) is also the id of 'I' in `vocab`.
X = [
    tuple(torch.tensor(pad_sequence(tokenize(text), max_len)) for text in pair)
    for pair in sentences
]

In [9]:
# Initialize and Run Model
# Seed before the model is built: its weights are randomly initialised, so
# without a fixed seed the printed similarities change on every run.
torch.manual_seed(42)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TransformerEncoderModel(vocab_size=len(vocab), max_len=max_len).to(device)
model.eval()  # inference mode: disables dropout inside the encoder layers

# NOTE: the model is used straight after construction, without any training,
# so the scores below reflect the random initialisation, not learned meaning.
print("Sentence Similarity Results:\n")
with torch.no_grad():
    for (t1, t2), (s1, s2) in zip(X, sentences):
        # Add a batch axis of size 1: [seq_len] -> [1, seq_len]
        t1 = t1.unsqueeze(0).to(device)
        t2 = t2.unsqueeze(0).to(device)
        # Move back to CPU / NumPy because scikit-learn works on arrays.
        emb1 = model(t1).cpu().numpy()
        emb2 = model(t2).cpu().numpy()
        # cosine_similarity returns a 1x1 matrix for two single-row inputs.
        sim = cosine_similarity(emb1, emb2)[0][0]
        print(f"Similarity between: \"{s1}\" and \"{s2}\" → {sim:.3f}")

Sentence Similarity Results:

Similarity between: "I like cats" and "I like dogs" → 0.962
Similarity between: "I love cats" and "I hate cats" → 0.947
Similarity between: "you like dogs" and "I like cats" → 0.908


