# Embedder Pipeline

Convert chunks to vector embeddings using Ollama.

**Steps:**
1. Ensure Ollama is running
2. Load chunks from disk
3. Generate embeddings
4. Save the chunks together with their embeddings for database import


In [None]:
import subprocess
import time
import requests
import shutil

def ensure_ollama_is_running():
    """Ensure a local Ollama server is reachable and the embedding model is installed.

    Probes ``http://localhost:11434/api/tags``. If the probe fails, launches
    ``ollama serve`` as a background process and polls the endpoint up to 30
    times (one second apart) until it answers.

    Returns:
        bool: True when the server responds and a model whose name contains
        ``nomic-embed-text`` is listed. False when the server could not be
        started or reached, or when the model is not installed (a hint is
        printed in each case).
    """
    tags_url = "http://localhost:11434/api/tags"

    def _fetch_tags(timeout):
        # Return the /api/tags response, or None when the server is unreachable.
        # Only network-level errors are caught so Ctrl-C still interrupts the wait.
        try:
            return requests.get(tags_url, timeout=timeout)
        except requests.RequestException:
            return None

    # Step 1: check whether the server is already running
    response = _fetch_tags(timeout=2)
    if response is None:
        # Not reachable: start the server in the background, discarding its output.
        try:
            subprocess.Popen(
                [shutil.which("ollama") or "/usr/local/bin/ollama", "serve"],
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )
        except OSError as exc:
            # Raised e.g. when the binary is missing or not executable.
            print(f"Could not start Ollama: {exc}")
            return False

        for _ in range(30):
            response = _fetch_tags(timeout=1)
            if response is not None:
                break
            time.sleep(1)
        else:
            # Loop finished without a successful probe.
            print("Ollama server did not respond after 30 attempts.")
            return False

    # Step 2: check whether the embedding model is installed
    models = [m.get('name', '') for m in (response.json().get('models') or [])]
    if not any('nomic-embed-text' in name for name in models):
        print("Model not loaded. Run: ollama pull nomic-embed-text")
        return False

    # Model ready
    return True

# Run the readiness check now; as the cell's last expression, its boolean result is displayed.
ensure_ollama_is_running()

True

## Section 1: Embedding Provider


In [None]:
import requests
import time

class OllamaEmbedder:
    """Generate text embeddings through an Ollama server's ``/api/embed`` endpoint."""

    def __init__(self, model: str = "nomic-embed-text", base_url: str = "http://localhost:11434"):
        """Store the model name and server base URL used for every request.

        Args:
            model: Model name sent in the request payload.
            base_url: Root URL of the Ollama server (no trailing ``/api``).
        """
        self.model = model
        self.base_url = base_url

    def embed(self, texts: list[str]) -> list[list[float]]:
        """Embed all ``texts`` in a single request and return one vector per text.

        Args:
            texts: Strings to embed. An empty list returns ``[]`` without
                contacting the server.

        Returns:
            The ``"embeddings"`` list from the server's JSON response.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            KeyError: If the response body has no ``"embeddings"`` field.
        """
        # Nothing to do; also avoids dividing by zero in the timing report below.
        if not texts:
            return []

        print(f"Starting embedding {len(texts)} texts with model {self.model}")
        start = time.time()

        # No timeout is set on purpose: a large batch can take minutes to embed.
        response = requests.post(
            f"{self.base_url}/api/embed",
            json={"model": self.model, "input": texts}
        )
        # Fail here on HTTP errors (e.g. unknown model) rather than reporting
        # completion and then hitting a confusing KeyError on the body.
        response.raise_for_status()

        duration = time.time() - start
        print(f"Completed in {duration:.2f}s ({duration/len(texts):.3f}s per chunk)")

        data = response.json()
        return data["embeddings"]

def create_embedder(model: str = "nomic-embed-text", base_url: "str | None" = None) -> OllamaEmbedder:
    """Build an ``OllamaEmbedder`` for the given model.

    Args:
        model: Model name passed to ``OllamaEmbedder``.
        base_url: Optional server URL. When omitted, ``OllamaEmbedder``'s own
            default is used, so existing calls behave exactly as before.

    Returns:
        A configured ``OllamaEmbedder`` instance.
    """
    if base_url is None:
        return OllamaEmbedder(model=model)
    return OllamaEmbedder(model=model, base_url=base_url)

## Section 2: Load & Embed


In [44]:
import json
from pathlib import Path

# The project root is taken to be three directories above the current working directory.
PROJECT_ROOT = Path.cwd().parent.parent.parent
CHUNKS_DIR = PROJECT_ROOT / 'etl/data/chunks'

chunks_file = CHUNKS_DIR / 'chunks_primitive.json'
# Read as UTF-8 explicitly so non-ASCII chunk text does not depend on the platform's default encoding.
with open(chunks_file, encoding='utf-8') as f:
    chunks = json.load(f)

print(f"Loaded {len(chunks)} chunks")


Loaded 186 chunks


In [None]:
embedder = create_embedder(model="nomic-embed-text")

texts = [chunk['content'] for chunk in chunks]
if not texts:
    # Fail with a clear message instead of a ZeroDivisionError in the average below.
    raise ValueError("No chunks to embed; check the loaded chunks file")
print(f"Embedding {len(texts)} chunks...")
print(f"Avg chunk size: {sum(len(t) for t in texts) // len(texts)} characters")

embeddings = embedder.embed(texts)
if len(embeddings) != len(chunks):
    # zip() would silently drop the unmatched tail and leave some chunks without an embedding.
    raise ValueError(
        f"Expected {len(chunks)} embeddings but received {len(embeddings)}"
    )

# Attach each vector to its chunk (mutates the chunk dicts in place).
for chunk, embedding in zip(chunks, embeddings):
    chunk['embedding'] = embedding

output_file = CHUNKS_DIR / 'chunks_primitive_embedded.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(chunks, f, indent=2)

file_size_mb = output_file.stat().st_size / (1024 * 1024)
print(f"Saved: {output_file.name} ({file_size_mb:.2f} MB)")


Embedding 186 chunks...
Avg chunk size: 1613 characters
Starting embedding 186 texts with model nomic-embed-text...
✓ Completed in 507.45s (2.728s per chunk)
Saved: chunks_primitive_embedded.json (2.97 MB)
Dimensions: 768
