# Notebook 2: Embedding into PostgreSQL

**Project:** Creating a local LLM  
**Author:** Muhammad Ali Tahir  
**Date:** 2025  

---

## 📋 Notebook Objectives

1. **Embed the database:** generate sentence embeddings for the reviews and store them in PostgreSQL (pgvector).


In [2]:
"""
Step 2: Generate Embeddings for Reviews
========================================
This script:
1. Loads review texts from PostgreSQL
2. Generates embeddings using sentence-transformers on GPU
3. Stores embeddings back in PostgreSQL (pgvector)

Model: all-MiniLM-L6-v2 (384 dimensions, fast, good quality)
"""

# ── Cell 1: Setup & Imports ───────────────────────────────────
import os
import time
from urllib.parse import quote_plus

import numpy as np
import torch
from sentence_transformers import SentenceTransformer
from sqlalchemy import create_engine, text

# ── Cell 2: Configuration ────────────────────────────────────
# Connection settings. Each value can be overridden with an LLMDB_* environment
# variable; the literals are the notebook's original values, kept as fallbacks
# so existing runs behave the same.
# NOTE(review): a real password is committed below — rotate it and supply it
# through LLMDB_PASS instead of keeping it in source control.
DB_USER = os.environ.get("LLMDB_USER", "llmuser")
# quote_plus percent-encodes characters such as '@' that would otherwise be
# parsed as delimiters inside the connection URL.
DB_PASS = quote_plus(os.environ.get("LLMDB_PASS", "U9797013u@"))
DB_HOST = os.environ.get("LLMDB_HOST", "127.0.0.1")
DB_PORT = os.environ.get("LLMDB_PORT", "5432")
DB_NAME = os.environ.get("LLMDB_NAME", "llmdb")

engine = create_engine(f"postgresql://{DB_USER}:{DB_PASS}@{DB_HOST}:{DB_PORT}/{DB_NAME}")

# Check GPU availability
cuda_available = torch.cuda.is_available()
print(f"CUDA available: {cuda_available}")
if cuda_available:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The properties object exposes the size in bytes as `total_memory`;
    # `total_mem` does not exist and raised AttributeError.
    print(f"VRAM: {gpu_props.total_memory / 1024**3:.1f} GB")

# ── Cell 3: Load Model ───────────────────────────────────────
# Load the sentence-transformers model, on the GPU when CUDA is available
# and on the CPU otherwise, then report where it ended up and its output size.
print("\nLoading embedding model...")
target_device = "cuda" if torch.cuda.is_available() else "cpu"
model = SentenceTransformer("all-MiniLM-L6-v2", device=target_device)
embedding_dim = model.get_sentence_embedding_dimension()
print(f"Model loaded on: {model.device}")
print(f"Embedding dimension: {embedding_dim}")

# ── Cell 4: Fetch Reviews ────────────────────────────────────
# Pull every review that has no embedding yet. Summary and body are joined
# into a single string in SQL (NULLs become empty strings), ordered by id.
print("\nFetching reviews from database...")
pending_reviews_query = text("""
        SELECT id, COALESCE(summary, '') || ' ' || COALESCE(review_text, '') as combined_text
        FROM reviews
        WHERE embedding IS NULL
        ORDER BY id
    """)
with engine.connect() as conn:
    result = conn.execute(pending_reviews_query)
    # Materialise the rows while the connection is still open
    rows = result.fetchall()

total = len(rows)
print(f"Reviews to embed: {total:,}")

# ── Cell 5: Generate & Store Embeddings in Batches ────────────
# Encode the pending reviews in batches and write each batch of vectors back
# to the `reviews.embedding` column, printing throughput as it goes.
BATCH_SIZE = 512  # Adjust if you run into VRAM issues (try 256 if OOM)

# Split the fetched (id, combined_text) rows into two parallel lists
ids = [row[0] for row in rows]
texts = [row[1] for row in rows]

print(f"\nGenerating embeddings (batch size: {BATCH_SIZE})...")
print(f"Estimated time: ~{total / 5000:.0f} minutes on GPU\n")

# Built once and reused; each batch executes it with a list of parameter
# dicts (executemany) instead of issuing one round-trip per row.
update_stmt = text("UPDATE reviews SET embedding = :emb WHERE id = :id")

start_time = time.time()
processed = 0

for batch_index, offset in enumerate(range(0, total, BATCH_SIZE)):
    batch_ids = ids[offset:offset + BATCH_SIZE]
    batch_texts = texts[offset:offset + BATCH_SIZE]

    # Generate embeddings
    embeddings = model.encode(
        batch_texts,
        batch_size=BATCH_SIZE,
        show_progress_bar=False,
        normalize_embeddings=True  # Normalize for cosine similarity
    )

    # Each vector is sent as its list literal text, e.g. "[0.1, 0.2, ...]"
    update_params = [
        {"emb": str(embedding.tolist()), "id": review_id}
        for review_id, embedding in zip(batch_ids, embeddings)
    ]

    # Store in PostgreSQL. engine.begin() commits when the block succeeds and
    # rolls back if it raises, so a failed batch is never half-written.
    with engine.begin() as conn:
        conn.execute(update_stmt, update_params)

    processed += len(batch_ids)
    elapsed = time.time() - start_time
    # Guard against a zero interval on clocks with coarse resolution
    rate = processed / elapsed if elapsed > 0 else 0.0
    remaining = (total - processed) / rate if rate > 0 else 0

    # Progress update every 10 batches, and always after the final batch
    if batch_index % 10 == 0 or processed == total:
        print(f"  Progress: {processed:,}/{total:,} ({processed/total*100:.1f}%) | "
              f"Rate: {rate:.0f} reviews/sec | "
              f"ETA: {remaining/60:.1f} min")

total_time = time.time() - start_time
# With nothing to embed the loop never runs and total_time can be 0.0
average_rate = total / total_time if total_time > 0 else 0.0
print("\n✅ Embedding generation complete!")
print(f"Total time: {total_time/60:.1f} minutes")
print(f"Average rate: {average_rate:.0f} reviews/sec")

# ── Cell 6: Create HNSW Index for Fast Similarity Search ──────
# Rebuild the HNSW index on reviews.embedding: drop any previous index of the
# same name, create a fresh one using cosine distance, then commit both.
print("\nCreating vector similarity search index (HNSW)...")
print("This may take a few minutes...")

index_statements = (
    # Drop existing index if any
    "DROP INDEX IF EXISTS idx_reviews_embedding;",
    # Create HNSW index for cosine similarity
    """
        CREATE INDEX idx_reviews_embedding
        ON reviews
        USING hnsw (embedding vector_cosine_ops)
        WITH (m = 16, ef_construction = 64);
    """,
)

start = time.time()
with engine.connect() as conn:
    for statement in index_statements:
        conn.execute(text(statement))
    conn.commit()

print(f"Index created in {time.time() - start:.1f}s")

# ── Cell 7: Test Similarity Search ───────────────────────────
# Smoke test: embed one query string with the same model and print the five
# stored reviews whose embeddings are closest to it by cosine distance.
print("\n" + "=" * 50)
print("TEST: Similarity Search")
print("=" * 50)

test_query = "best organic coffee beans"
print(f"\nQuery: '{test_query}'")

# Generate query embedding (normalized, matching how the stored vectors were built)
query_embedding = model.encode([test_query], normalize_embeddings=True)[0].tolist()

# CAST(... AS vector) is used instead of the PostgreSQL `::vector` shorthand:
# a bind parameter immediately followed by `::` is not parsed reliably by
# SQLAlchemy's text(), so the parameter would not be bound as intended.
# `<=>` is pgvector's cosine-distance operator, hence similarity = 1 - distance.
similarity_sql = text("""
        SELECT r.summary, r.score, LEFT(r.review_text, 150) as preview,
               1 - (r.embedding <=> CAST(:query_emb AS vector)) as similarity
        FROM reviews r
        WHERE r.embedding IS NOT NULL
        ORDER BY r.embedding <=> CAST(:query_emb AS vector)
        LIMIT 5
    """)

with engine.connect() as conn:
    result = conn.execute(similarity_sql, {"query_emb": str(query_embedding)})

    print("\nTop 5 most similar reviews:")
    print("-" * 50)
    # Rows are consumed while the connection is still open
    for rank, row in enumerate(result, 1):
        print(f"\n{rank}. [Score: {row[1]}/5 | Similarity: {row[3]:.4f}]")
        print(f"   Summary: {row[0]}")
        print(f"   Preview: {row[2]}...")

print("\n✅ Vector search is working!")
print("Next step: Set up Ollama + LLM (Step 3)")

ImportError: tokenizers>=0.19,<0.20 is required for a normal functioning of this module, but found tokenizers==0.22.2.
Try: `pip install transformers -U` or `pip install -e '.[dev]'` if you're working with git main