# Embedder Pipeline

Convert chunks to vector embeddings using Ollama.

**Steps:**
1. Ensure Ollama is running
2. Load chunks from disk
3. Generate embeddings
4. Save the chunks together with their embeddings for database import


In [30]:
import subprocess
import time
import requests
import shutil

def _ollama_is_up(timeout: float) -> bool:
    """Return True when the local Ollama API answers ``/api/tags`` with HTTP 200."""
    try:
        response = requests.get("http://localhost:11434/api/tags", timeout=timeout)
    except requests.RequestException:
        # Connection refused / timed out: the server is not reachable (yet).
        return False
    return response.status_code == 200


def ensure_ollama():
    """Make sure a local Ollama server is reachable, launching one if needed.

    First probes ``http://localhost:11434/api/tags``. If that does not answer
    with HTTP 200, spawns ``ollama serve`` in the background (binary located
    via ``shutil.which``, falling back to ``/usr/local/bin/ollama``) and polls
    the same endpoint up to 30 times, sleeping one second between attempts.

    Returns:
        bool: True when the server answers; False when the process could not
        be spawned or the server did not become ready within the 30 attempts.
    """
    if _ollama_is_up(timeout=2):
        print("✓ Ollama running")
        return True

    print("Starting Ollama...")
    ollama_path = shutil.which("ollama") or "/usr/local/bin/ollama"

    try:
        # Fire-and-forget: the server keeps running after this function returns.
        subprocess.Popen([ollama_path, "serve"],
                         stdout=subprocess.DEVNULL,
                         stderr=subprocess.DEVNULL)
    except Exception as e:
        print(f"✗ Failed: {e}")
        return False

    for _ in range(30):
        if _ollama_is_up(timeout=1):
            print("✓ Ollama started")
            return True
        # Sleep after *every* unsuccessful probe (error or non-200 reply) so the
        # retry budget is spread over time instead of being used up instantly.
        time.sleep(1)

    print("✗ Failed: Ollama did not become ready after 30 attempts")
    return False

# Probe the local Ollama server (and try to launch it if it is not answering)
# before running the embedding cells below.
ensure_ollama()


✓ Ollama running


## Section 1: Embedding Provider


In [31]:
from typing import List
from abc import ABC, abstractmethod
import requests
import subprocess

class EmbeddingProvider(ABC):
    """Abstract interface for objects that turn texts into embedding vectors."""

    @abstractmethod
    def embed(self, texts: List[str]) -> List[List[float]]:
        """Map a list of texts to a list of float vectors.

        Concrete providers must override this method; the base version does
        nothing and returns None.
        """

class OllamaEmbedder(EmbeddingProvider):
    """Embedding provider that calls an Ollama server's ``/api/embed`` endpoint.

    Args:
        model: Name of the Ollama model used for embedding.
        base_url: Root URL of the Ollama HTTP API.
        timeout: Seconds to wait for each embedding request before
            ``requests`` raises a timeout error.
    """

    def __init__(self, model: str = "nomic-embed-text",
                 base_url: str = "http://localhost:11434",
                 timeout: float = 120.0):
        self.model = model
        self.base_url = base_url
        self.timeout = timeout
        self._ensure_model()

    def _ensure_model(self):
        """Best-effort ``ollama pull`` of ``self.model`` (waits at most 300 s).

        Failures are reported with a printed warning instead of raising, so
        constructing the embedder never fails because of the pull step.
        """
        import shutil  # local import: this cell's import list does not include shutil

        # Locate the CLI on PATH first; fall back to the conventional install path.
        ollama_path = shutil.which("ollama") or "/usr/local/bin/ollama"
        try:
            result = subprocess.run([ollama_path, "pull", self.model],
                                    stdout=subprocess.DEVNULL,
                                    stderr=subprocess.DEVNULL,
                                    timeout=300)
        except (OSError, subprocess.SubprocessError) as exc:
            # Missing binary, permission problem, or the 300 s timeout expired.
            # KeyboardInterrupt is deliberately NOT caught, so a long pull can
            # still be interrupted from the notebook.
            print(f"⚠ Could not pull model '{self.model}': {exc}")
            return
        if result.returncode != 0:
            print(f"⚠ 'ollama pull {self.model}' exited with code {result.returncode}")

    def embed(self, texts: List[str]) -> List[List[float]]:
        """Embed each text with one HTTP request per text.

        Args:
            texts: Strings to embed.

        Returns:
            One embedding (list of floats) per input text, in input order.

        Raises:
            KeyError: If a response carries neither a non-empty ``"embeddings"``
                list nor an ``"embedding"`` entry.
            requests.RequestException: On connection errors or when a request
                exceeds ``self.timeout``.
        """
        embeddings = []
        for text in texts:
            response = requests.post(
                f"{self.base_url}/api/embed",
                json={"model": self.model, "input": text},
                timeout=self.timeout,
            )
            data = response.json()
            # Two response shapes are accepted: a list under "embeddings"
            # (first element used) or a single vector under "embedding".
            if "embeddings" in data and data["embeddings"]:
                embeddings.append(data["embeddings"][0])
            elif "embedding" in data:
                embeddings.append(data["embedding"])
            else:
                print(f"Unexpected response: {data}")
                raise KeyError("No embedding in response")
        return embeddings

def create_embedder(model: str = "nomic-embed-text") -> OllamaEmbedder:
    """Build an ``OllamaEmbedder`` for ``model``, leaving its other settings at their defaults."""
    provider = OllamaEmbedder(model=model)
    return provider


## Section 2: Load & Embed


In [None]:
import json
from pathlib import Path

# Paths are resolved from the kernel's working directory: PROJECT_ROOT is the
# directory three levels above it, so this cell depends on where the notebook
# is launched from.
PROJECT_ROOT = Path.cwd().parent.parent.parent
CHUNKS_DIR = PROJECT_ROOT / 'etl/data/chunks'

chunks_file = CHUNKS_DIR / 'chunks_primitive.json'
# Read the JSON as UTF-8 explicitly; the platform default encoding is not
# UTF-8 everywhere and would corrupt or reject non-ASCII chunk text.
with open(chunks_file, encoding='utf-8') as f:
    chunks = json.load(f)

print(f"Loaded {len(chunks)} chunks")


Loaded 186 chunks


In [None]:
embedder = create_embedder(model="nomic-embed-text")

texts = [chunk['content'] for chunk in chunks]
print(f"Embedding {len(texts)} chunks...")

embeddings = embedder.embed(texts)

# zip() stops at the shorter sequence, so a count mismatch would silently
# leave some chunks without an embedding; fail loudly instead.
if len(embeddings) != len(chunks):
    raise ValueError(
        f"Expected {len(chunks)} embeddings, got {len(embeddings)}"
    )

# Attach each vector to its chunk in place (the loaded `chunks` list is mutated).
for chunk, embedding in zip(chunks, embeddings):
    chunk['embedding'] = embedding

output_file = CHUNKS_DIR / 'chunks_primitive_embedded.json'
with open(output_file, 'w', encoding='utf-8') as f:
    json.dump(chunks, f, indent=2)

file_size_mb = output_file.stat().st_size / (1024 * 1024)
print(f"Saved: {output_file.name} ({file_size_mb:.2f} MB)")
if embeddings:
    print(f"Dimensions: {len(embeddings[0])}")
else:
    # Nothing was embedded, so there is no vector whose length could be reported.
    print("Dimensions: n/a (no chunks to embed)")


Embedding 186 chunks...
