# Embedder Pipeline

Convert optimized chunks to vector embeddings using Ollama for semantic search.

**Steps:**
1. Ensure Ollama is running and the `nomic-embed-text` model is available
2. Load optimized chunks (with parent-child relationships and demo code)
3. Generate embeddings for chunk content (768 dimensions)
4. Save chunks with embeddings for database import (`chunks_embedded.json` for Option A, `chunks_primitive_embedded.json` for Option B)

**Output:** Chunks with vector embeddings ready for pgvector storage

In [None]:
import requests

def check_ollama_is_running(
    base_url: str = "http://localhost:11434",
    model: str = "nomic-embed-text",
    timeout: float = 2,
):
    """Check that Ollama is reachable and lists the required model.

    Args:
        base_url: Root URL of the Ollama HTTP API.
        model: Model name to look for; matched as a substring of each
            listed model name, so a tag suffix such as ``:latest`` is fine.
        timeout: Seconds to wait for the ``/api/tags`` request.

    Raises:
        RuntimeError: If the connection is refused, or if no listed model
            name contains ``model``.
        requests.exceptions.RequestException: For any other request failure
            (e.g. a read timeout or an HTTP error status); re-raised as-is.
    """
    try:
        response = requests.get(f"{base_url}/api/tags", timeout=timeout)
        # Surface HTTP error statuses here instead of failing later while
        # trying to parse an error page as the model list.
        response.raise_for_status()
    except requests.exceptions.ConnectionError as e:
        print("Ollama is not running!")
        print("Start the Ollama Docker container.")
        raise RuntimeError(f"Ollama service is not available at {base_url}") from e
    except Exception as e:
        print(f"Could not connect to Ollama: {e}")
        raise

    # Check if model is loaded
    models = [m.get('name', '') for m in response.json().get('models', [])]
    if not any(model in name for name in models):
        print(f"{model} model not loaded")
        raise RuntimeError(f"Required model '{model}' is not available")

    print(f"Ollama is running with {model} model")

# Run the health check now; it raises if Ollama or the model is unavailable.
check_ollama_is_running()

Ollama is running with nomic-embed-text model


## Section 1: Embedding Provider


In [1]:
import requests
import time

class OllamaEmbedder:
    """Generate text embeddings through Ollama's ``/api/embed`` endpoint.

    Texts are sent in batches of ``batch_size`` so that a large input does
    not turn into one very long-running HTTP request.
    """

    def __init__(
        self,
        model: str = "nomic-embed-text",
        base_url: str = "http://localhost:11434",
        batch_size: int = 32,
        timeout: float = 300.0,
    ):
        """Store the connection settings.

        Args:
            model: Name of the Ollama embedding model.
            base_url: Root URL of the Ollama HTTP API.
            batch_size: Maximum number of texts sent per request.
            timeout: Seconds to wait for each request before giving up.

        Raises:
            ValueError: If ``batch_size`` is less than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be at least 1")
        self.model = model
        self.base_url = base_url
        self.batch_size = batch_size
        self.timeout = timeout

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Return one embedding vector per input text, in input order.

        Args:
            texts: Texts to embed. An empty list returns ``[]`` without
                contacting the service.

        Raises:
            requests.exceptions.RequestException: If a request fails, times
                out, or returns an HTTP error status.
            RuntimeError: If the service returns a different number of
                embeddings than texts sent in a batch.
        """
        # Nothing to do; also avoids dividing by zero in the timing report.
        if not texts:
            return []

        print(f"Starting embedding {len(texts)} texts with model {self.model}")
        start = time.perf_counter()

        embeddings: list[list[float]] = []
        for offset in range(0, len(texts), self.batch_size):
            batch = texts[offset:offset + self.batch_size]
            response = requests.post(
                f"{self.base_url}/api/embed",
                json={"model": self.model, "input": batch},
                timeout=self.timeout,
            )
            # Fail with the HTTP error rather than a KeyError on "embeddings".
            response.raise_for_status()
            batch_embeddings = response.json()["embeddings"]
            if len(batch_embeddings) != len(batch):
                raise RuntimeError(
                    f"Expected {len(batch)} embeddings, got {len(batch_embeddings)}"
                )
            embeddings.extend(batch_embeddings)

        duration = time.perf_counter() - start
        print(f"Completed in {duration:.2f}s ({duration/len(texts):.3f}s per chunk)")

        return embeddings

def create_embedder(model: str = "nomic-embed-text") -> OllamaEmbedder:
    """Build an ``OllamaEmbedder`` for ``model``, leaving its other settings at their defaults."""
    embedder = OllamaEmbedder(model=model)
    return embedder

## Section 2: Load Chunks & Generate Embeddings

Select one of the options below to load and embed chunks:

- **OPTION A:** Load semantic chunks from `01_chunker.ipynb` and generate embeddings
- **OPTION B:** Load primitive chunks from `01_chunker_primitive.ipynb` and generate embeddings

Each block will load the chunks, generate 768-dimensional embeddings, and save to a JSON file ready for database import.

In [2]:
""""Option A: Semantic Chunks (01_chunker.ipynb)"""

import json
from pathlib import Path

# The project root is taken to be three levels above the working directory.
PROJECT_ROOT = Path.cwd().parent.parent.parent
CHUNKS_DIR = PROJECT_ROOT / 'etl/data/chunks'

chunks_file = CHUNKS_DIR / 'chunks.json'

if chunks_file.exists():
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(chunks_file, encoding='utf-8') as f:
        chunks = json.load(f)

    print(f"Loaded {len(chunks)} chunks\n")

    if chunks:
        embedder = create_embedder(model="nomic-embed-text")
        texts = [chunk['content'] for chunk in chunks]
        print(f"Avg chunk size: {sum(len(t) for t in texts) // len(texts)} characters\n")

        embeddings = embedder.embed(texts)

        # zip() would silently drop chunks if too few vectors came back.
        if len(embeddings) != len(chunks):
            raise RuntimeError(
                f"Expected {len(chunks)} embeddings, got {len(embeddings)}"
            )

        for chunk, embedding in zip(chunks, embeddings):
            chunk['embedding'] = embedding

        output_file = CHUNKS_DIR / 'chunks_embedded.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(chunks, f, indent=2)

        file_size_mb = output_file.stat().st_size / (1024 * 1024)
        print(f"Saved: {output_file.name} ({file_size_mb:.2f} MB)")
    else:
        # An empty file has nothing to average or embed.
        print(f"No chunks to embed in {chunks_file}")
else:
    print(f"File not found: {chunks_file}")


Loaded 606 chunks

Avg chunk size: 276 characters

Starting embedding 606 texts with model nomic-embed-text


KeyboardInterrupt: 

In [2]:
""""Option B: Primitive Chunks (01_primitive_chunker.ipynb)"""

import json
from pathlib import Path

# The project root is taken to be three levels above the working directory.
PROJECT_ROOT = Path.cwd().parent.parent.parent
CHUNKS_DIR = PROJECT_ROOT / 'etl/data/chunks'

primitive_chunks_file = CHUNKS_DIR / 'chunks_primitive.json'

if primitive_chunks_file.exists():
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    with open(primitive_chunks_file, encoding='utf-8') as f:
        primitive_chunks = json.load(f)

    print(f"Loaded {len(primitive_chunks)} chunks\n")

    if primitive_chunks:
        embedder = create_embedder(model="nomic-embed-text")
        texts = [chunk['content'] for chunk in primitive_chunks]

        print(f"Avg chunk size: {sum(len(t) for t in texts) // len(texts)} characters\n")

        embeddings = embedder.embed(texts)

        # zip() would silently drop chunks if too few vectors came back.
        if len(embeddings) != len(primitive_chunks):
            raise RuntimeError(
                f"Expected {len(primitive_chunks)} embeddings, got {len(embeddings)}"
            )

        for chunk, embedding in zip(primitive_chunks, embeddings):
            chunk['embedding'] = embedding

        output_file = CHUNKS_DIR / 'chunks_primitive_embedded.json'
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(primitive_chunks, f, indent=2)

        file_size_mb = output_file.stat().st_size / (1024 * 1024)
        print(f"Saved: {output_file.name} ({file_size_mb:.2f} MB)")
    else:
        # An empty file has nothing to average or embed.
        print(f"No chunks to embed in {primitive_chunks_file}")
else:
    print(f"File not found: {primitive_chunks_file}")


Loaded 186 chunks

Avg chunk size: 1619 characters

Starting embedding 186 texts with model nomic-embed-text


KeyboardInterrupt: 