In [None]:
# 2. Generating Your First Embeddings

In [1]:
from sentence_transformers import SentenceTransformer
import numpy as np

print("Libraries imported successfully!")

Libraries imported successfully!


In [2]:
# Instantiate the compact MiniLM sentence encoder; later cells use it as `model`.
print("Loading embedding model...")
model = SentenceTransformer("all-MiniLM-L6-v2")
print("Model loaded!")

# Ask the model for its output width once, then report it.
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Model produces {embedding_dim} dimensional embeddings")

Loading embedding model...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded!
Model produces 384 dimensional embeddings


In [3]:
# Minimal demo: turn one sentence into a vector and inspect it.
text = "The cat sat on the mat"
embedding = model.encode(text)

# Collect the report first, then emit it in a single write.
report_lines = [
    f"Original text: {text}",
    f"Embedding shape: {embedding.shape}",
    f"Embedding type: {type(embedding)}",
    f"\nFirst 10 values: {embedding[:10]}",
]
print("\n".join(report_lines))

Original text: The cat sat on the mat
Embedding shape: (384,)
Embedding type: <class 'numpy.ndarray'>

First 10 values: [ 0.13040183 -0.01187012 -0.02811703  0.0512387  -0.05597447  0.03019154
  0.03016128  0.02469838 -0.01837059  0.05876673]


In [5]:
def cosine_similarity(vec1, vec2):
    """
    Calculate cosine similarity between two vectors.

    Args:
        vec1: First vector (array-like of numbers).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        A score between -1 and 1 (higher = more similar). When either
        vector has zero magnitude the angle is undefined, so 0.0 is
        returned instead of NaN.
    """
    dot_product = np.dot(vec1, vec2)
    norm_product = np.linalg.norm(vec1) * np.linalg.norm(vec2)

    # Guard the division: an all-zero vector would give 0/0 -> NaN (plus a
    # RuntimeWarning), and NaN scores cannot be ordered reliably when the
    # results are later sorted.
    if norm_product == 0:
        return 0.0
    return dot_product / norm_product

print("Similarity function ready!")

Similarity function ready!


In [6]:
# Probe sentences: the anchor itself, a paraphrase, a loosely related
# sentence, and an unrelated one.
sentences = [
    "The cat sat on the mat",
    "A feline rested on the rug",      # Similar meaning, different words
    "Dogs are loyal animals",          # Different topic
    "Python is a programming language" # Completely unrelated
]

# One batched encode call; row k of the result is paired with sentences[k] below
embeddings = model.encode(sentences)

# Score every sentence (including the anchor) against the first one
reference_embedding = embeddings[0]
print("Comparing to: 'The cat sat on the mat'\n")
for candidate, candidate_embedding in zip(sentences, embeddings):
    score = cosine_similarity(reference_embedding, candidate_embedding)
    print(f"Similarity to '{candidate}'\nScore: {score:.3f}\n")

Comparing to: 'The cat sat on the mat'

Similarity to 'The cat sat on the mat'
Score: 1.000

Similarity to 'A feline rested on the rug'
Score: 0.564

Similarity to 'Dogs are loyal animals'
Score: 0.165

Similarity to 'Python is a programming language'
Score: 0.031



In [7]:
# Toy corpus for the search demo. Order matters: later cells pair
# documents[i] with the i-th embedding.
documents = [
    "Python is a high-level programming language known for simplicity",  # programming
    "Machine learning enables computers to learn from data",             # ML / AI
    "Neural networks are inspired by biological brains",                 # ML / AI
    "Dogs are loyal and friendly pets that need exercise",               # pets
    "Cats are independent animals that make great companions",           # pets
    "JavaScript is used for web development and runs in browsers",       # programming
    "Deep learning uses multi-layered neural networks",                  # ML / AI
    "Puppies require training and socialization from an early age",      # pets
]

document_count = len(documents)
print(f"Knowledge base: {document_count} documents")

Knowledge base: 8 documents


In [8]:
# Embed the whole knowledge base in one batched call
print("Generating embeddings for all documents...")
doc_embeddings = model.encode(documents)

# Summarise what came back: how many vectors, and how wide each one is
embedding_count = len(doc_embeddings)
embedding_width = doc_embeddings[0].shape[0]
print(f"Created {embedding_count} embeddings\nEach embedding has {embedding_width} dimensions")

Generating embeddings for all documents...
Created 8 embeddings
Each embedding has 384 dimensions


In [9]:
def search(query, documents, doc_embeddings, top_k=3, embedder=None):
    """
    Search for documents similar to the query.

    Args:
        query: Search query (string)
        documents: List of document texts
        doc_embeddings: Pre-computed document embeddings, one per entry in
            ``documents`` and in the same order
        top_k: Number of results to return
        embedder: Object whose ``encode(query)`` returns the query vector.
            Defaults to the notebook-level ``model``. Pass the model that
            produced ``doc_embeddings`` if it is a different one, otherwise
            the query and documents live in different vector spaces.

    Returns:
        List of (document, similarity_score) tuples, highest score first.
        Documents with equal scores keep their original relative order.

    Raises:
        ValueError: If ``documents`` and ``doc_embeddings`` differ in length
            (typically stale embeddings after the document list was edited).
    """
    if len(documents) != len(doc_embeddings):
        raise ValueError(
            f"Got {len(documents)} documents but {len(doc_embeddings)} embeddings; "
            "re-generate doc_embeddings for the current documents"
        )
    if len(documents) == 0:
        return []

    # Embed the query
    if embedder is None:
        embedder = model
    query_vec = np.asarray(embedder.encode(query), dtype=float)
    doc_matrix = np.asarray(doc_embeddings, dtype=float)

    # Cosine similarity of the query against every document at once.
    # Rows whose denominator is zero (all-zero vectors) keep a score of 0.0
    # instead of becoming NaN, which would make the ranking unreliable.
    denominators = np.linalg.norm(doc_matrix, axis=1) * np.linalg.norm(query_vec)
    scores = np.zeros(len(doc_matrix))
    np.divide(doc_matrix @ query_vec, denominators, out=scores, where=denominators > 0)

    # Highest score first; the stable sort keeps ties in document order
    ranking = np.argsort(-scores, kind="stable")[:top_k]
    return [(documents[int(i)], float(scores[i])) for i in ranking]

print("Search function ready!")

Search function ready!


In [10]:
# Three probes, one per topic cluster in the knowledge base
queries = [
    "What is artificial intelligence?",
    "Tell me about pet dogs",
    "How do I code in Python?"
]

# Build the separator once instead of on every print
divider = '=' * 80

for query in queries:
    print(f"\n{divider}\nQUERY: {query}\n{divider}")

    results = search(query, documents, doc_embeddings, top_k=3)

    # Ranks are shown 1-based
    for rank, (doc, score) in enumerate(results, start=1):
        print(f"\n{rank}. (Score: {score:.3f})\n   {doc}")


QUERY: What is artificial intelligence?

1. (Score: 0.408)
   Machine learning enables computers to learn from data

2. (Score: 0.395)
   Neural networks are inspired by biological brains

3. (Score: 0.326)
   Python is a high-level programming language known for simplicity

QUERY: Tell me about pet dogs

1. (Score: 0.548)
   Dogs are loyal and friendly pets that need exercise

2. (Score: 0.437)
   Puppies require training and socialization from an early age

3. (Score: 0.413)
   Cats are independent animals that make great companions

QUERY: How do I code in Python?

1. (Score: 0.554)
   Python is a high-level programming language known for simplicity

2. (Score: 0.148)
   Puppies require training and socialization from an early age

3. (Score: 0.138)
   JavaScript is used for web development and runs in browsers


In [11]:
# Instantiate a small and a large encoder so the next cell can compare them
print("Loading models...\n")

model_small = SentenceTransformer("all-MiniLM-L6-v2")
model_large = SentenceTransformer("all-mpnet-base-v2")

print("Both models loaded!")
# Report each model's embedding width as returned by the model itself
for size_label, loaded_model in (("Small", model_small), ("Large", model_large)):
    print(f"{size_label} model: {loaded_model.get_sentence_embedding_dimension()} dimensions")

Loading models...



modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Both models loaded!
Small model: 384 dimensions
Large model: 768 dimensions


In [12]:
# Sentence pairs of decreasing relatedness, scored by both models
test_pairs = [
    ("The dog is running", "A canine is jogging"),           # Similar
    ("I love pizza", "Pizza is delicious"),                  # Related
    ("Python programming", "Cooking pasta")                  # Unrelated
]

# Same procedure for each model, so drive it from one labelled sequence
labelled_models = (("Small", model_small), ("Large", model_large))

print("Comparing model performance:\n")
for first_text, second_text in test_pairs:
    # Score the pair with every model before printing anything for it
    pair_scores = []
    for size_label, encoder in labelled_models:
        first_vec, second_vec = encoder.encode([first_text, second_text])
        pair_scores.append((size_label, cosine_similarity(first_vec, second_vec)))

    print(f"Pair: '{first_text}' vs '{second_text}'")
    for size_label, pair_score in pair_scores:
        print(f"  {size_label} model: {pair_score:.3f}")
    print()

Comparing model performance:

Pair: 'The dog is running' vs 'A canine is jogging'
  Small model: 0.818
  Large model: 0.827

Pair: 'I love pizza' vs 'Pizza is delicious'
  Small model: 0.801
  Large model: 0.785

Pair: 'Python programming' vs 'Cooking pasta'
  Small model: 0.142
  Large model: 0.120



In [14]:
class SimpleRetriever:
    """
    Minimal in-memory semantic retriever.

    Documents are split into fixed-size word chunks, each chunk is embedded,
    and queries are answered by ranking chunks by cosine similarity.
    """

    def __init__(self, model_name='all-MiniLM-L6-v2'):
        """
        Initialize retriever with embedding model.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)
        # Chunk texts, in the order they were added.
        self.chunks = []
        # Array whose row i is the embedding of self.chunks[i];
        # None until the first chunk has been embedded.
        self.embeddings = None

    def add_documents(self, documents, chunk_size=500):
        """
        Add documents to the retriever (chunks and embeds them).

        May be called repeatedly; only the chunks from this call are
        embedded, and they are appended to the existing index.

        Args:
            documents: Iterable of document strings.
            chunk_size: Maximum number of whitespace-separated words per chunk.

        Raises:
            ValueError: If ``chunk_size`` is not positive (a negative value
                would otherwise silently index nothing).
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be a positive integer, got {chunk_size!r}")

        # Simple chunking (from Module 2)
        new_chunks = []
        for doc in documents:
            words = doc.split()
            new_chunks.extend(
                ' '.join(words[start:start + chunk_size])
                for start in range(0, len(words), chunk_size)
            )

        # Generate embeddings for the new chunks only
        print(f"Embedding {len(new_chunks)} chunks...")
        if new_chunks:
            new_embeddings = np.asarray(self.model.encode(new_chunks))
            if self.embeddings is None:
                self.embeddings = new_embeddings
            else:
                self.embeddings = np.vstack([self.embeddings, new_embeddings])
            # Record the texts only after encoding succeeded, so chunks and
            # embeddings can never get out of step.
            self.chunks.extend(new_chunks)
        print(f"Ready! {len(self.chunks)} chunks indexed.")

    def search(self, query, top_k=3):
        """
        Search for relevant chunks.

        Args:
            query: Search query (string).
            top_k: Number of results to return.

        Returns:
            List of (chunk, similarity_score) tuples, highest score first.
            Empty if nothing has been indexed yet.
        """
        if self.embeddings is None or not self.chunks:
            return []

        # Embed query
        query_vec = np.asarray(self.model.encode(query), dtype=float)
        chunk_matrix = np.asarray(self.embeddings, dtype=float)

        # Cosine similarity against every chunk at once. Rows with a zero
        # denominator (all-zero vectors) keep a score of 0.0 rather than NaN.
        denominators = np.linalg.norm(chunk_matrix, axis=1) * np.linalg.norm(query_vec)
        scores = np.zeros(len(chunk_matrix))
        np.divide(chunk_matrix @ query_vec, denominators, out=scores, where=denominators > 0)

        # Sort and return top k; the stable sort keeps ties in insertion order
        ranking = np.argsort(-scores, kind="stable")[:top_k]
        return [(self.chunks[int(i)], float(scores[i])) for i in ranking]

print("SimpleRetriever class ready!")

SimpleRetriever class ready!


In [15]:
# Three short documents on unrelated topics to exercise the retriever
sample_docs = [
    """
    Python is a versatile programming language widely used in web development,
    data science, and automation. Its simple syntax makes it beginner-friendly
    while remaining powerful for advanced applications.
    """,
    """
    Machine learning is a subset of artificial intelligence that enables systems
    to learn and improve from experience. Popular frameworks include TensorFlow,
    PyTorch, and scikit-learn.
    """,
    """
    Dogs are loyal companions that require regular exercise, training, and
    veterinary care. Different breeds have varying needs and temperaments.
    """
]

# Build the index with chunks of at most 100 words
retriever = SimpleRetriever()
retriever.add_documents(sample_docs, chunk_size=100)

# One query aimed at each document
test_queries = [
    "How do I start learning to code?",
    "What is AI and machine learning?",
    "Tell me about caring for pets"
]

# Build the separator once instead of on every print
banner = '=' * 80

for query in test_queries:
    print(f"\n{banner}\nQuery: {query}\n{banner}")
    results = retriever.search(query, top_k=2)
    # Positions are shown 1-based; strip() drops the chunk's outer whitespace
    for position, (chunk, score) in enumerate(results, start=1):
        print(f"\nResult {position} (Score: {score:.3f}):\n{chunk.strip()}")

Embedding 3 chunks...
Ready! 3 chunks indexed.

Query: How do I start learning to code?

Result 1 (Score: 0.263):
Python is a versatile programming language widely used in web development, data science, and automation. Its simple syntax makes it beginner-friendly while remaining powerful for advanced applications.

Result 2 (Score: 0.228):
Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience. Popular frameworks include TensorFlow, PyTorch, and scikit-learn.

Query: What is AI and machine learning?

Result 1 (Score: 0.697):
Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience. Popular frameworks include TensorFlow, PyTorch, and scikit-learn.

Result 2 (Score: 0.234):
Python is a versatile programming language widely used in web development, data science, and automation. Its simple syntax makes it beginner-friendly while remaining powerful for advanced applications.

