# Contextualized Embeddings

`voyage-context-3`:
- Takes the **full list of chunks from one document** in a single call
- Internally captures document-level context for each chunk's vector
- Requires **no manual prefix building** — the model does it automatically

In [None]:
import { MongoClient } from 'mongodb';

// ← Set the VOYAGE_API_KEY environment variable, or paste your VoyageAI API key
//   over the placeholder below (get one at https://dash.voyageai.com)
const VOYAGE_API_KEY   = process.env.VOYAGE_API_KEY ?? 'pa-...';
const CONTEXT_MODEL    = 'voyage-context-3';   // model sent to the contextualized endpoint
const STANDARD_MODEL   = 'voyage-4';           // model sent to the standard embeddings endpoint
const QUERY_MODEL      = 'voyage-4-lite';      // NOTE(review): not referenced by the cells visible here
const DIMS             = 1024;                 // numDimensions declared on both vector indexes
const CTX_INDEX        = 'ctx_chunk_index';
const STD_INDEX        = 'std_chunk_index';

// Fail early with a readable message instead of letting MongoClient receive `undefined`.
const mongoUri = process.env.MONGODB_URI;
if (!mongoUri) {
  throw new Error('MONGODB_URI environment variable is not set.');
}

const client = new MongoClient(mongoUri);
await client.connect();
const db      = client.db('voyage_lab');
const srcCol  = db.collection<{ _id: string; [key: string]: unknown }>('listings');
const chunkCol= db.collection('chunks');

// Start from a clean `chunks` collection. Only the "collection does not exist"
// server error (code 26, NamespaceNotFound) is ignored; anything else is rethrown
// so real failures are not hidden.
await chunkCol.drop().catch((err: unknown) => {
  const code = (err as { code?: unknown } | null)?.code;
  if (code !== 26) throw err;
});

console.log('Connected.');

## Step 1 — Chunk listing descriptions

We split each listing's `description` into chunks of whole sentences, packing consecutive sentences together up to roughly 250 characters per chunk. For `voyage-context-3`, all chunks from one listing are sent together so the model can use the full listing as context.

In [None]:
/**
 * Splits text into chunks made of whole sentences.
 *
 * Consecutive sentences are packed into one chunk until adding the next sentence
 * would push the chunk past `maxChars`. A single sentence longer than `maxChars`
 * is kept intact as its own chunk rather than being cut mid-sentence.
 *
 * @param text     The text to split.
 * @param maxChars Soft upper bound on chunk length, in characters.
 * @returns Trimmed, non-empty chunks in original order; `[]` for blank input.
 */
function chunkText(text: string, maxChars = 250): string[] {
  // First alternative: a sentence together with its terminator(s).
  // Second alternative: a trailing fragment with no closing punctuation, which
  // would otherwise be silently dropped from the output.
  const sentences = text.match(/[^.!?]+[.!?]+|[^.!?]+$/g) ?? [text];
  const chunks: string[] = [];
  let current = '';
  for (const sentence of sentences) {
    // Only flush when there is something to flush, so an over-long first
    // sentence does not emit an empty chunk.
    if (current && (current + sentence).length > maxChars) {
      chunks.push(current.trim());
      current = sentence;
    } else {
      current += sentence;
    }
  }
  const tail = current.trim();
  if (tail) chunks.push(tail);
  return chunks;
}

// Load every listing, projecting only the fields used to build chunk documents.
const listings = await srcCol
  .find({}, { projection: { _id: 1, name: 1, description: 1, property_type: 1, price: 1, address: 1 } })
  .toArray();

// Returns the first candidate that is a non-blank string, or '' when none is.
// This avoids chunking the literal text "undefined" when a field is missing and
// lets a blank description fall back to the listing name.
const pickText = (...candidates: unknown[]): string =>
  candidates.find((c): c is string => typeof c === 'string' && c.trim() !== '') ?? '';

// Build per-document chunk lists
const docChunks = listings.map(l => ({
  listingId:     l._id,
  listingName:   String(l.name),
  propertyType:  String(l.property_type),
  price:         l.price as number,
  // `address` is untyped in the collection schema; read `market` through a narrow
  // structural type. The value may be undefined at runtime when it is missing.
  market:        (l.address as { market?: string } | null | undefined)?.market as string,
  chunks:        chunkText(pickText(l.description, l.name)),
}));

const totalChunks = docChunks.reduce((sum, d) => sum + d.chunks.length, 0);
console.log(`${listings.length} listings → ${totalChunks} chunks`);

// Preview the first listing only when there is one; an empty collection would
// otherwise throw on `docChunks[0].chunks`.
const firstDoc = docChunks[0];
if (firstDoc) {
  console.log('Example chunks from first listing:');
  firstDoc.chunks.forEach((c, i) => console.log(`  [${i}] ${c}`));
}

## Embed with `voyage-context-3` and, for comparison, `voyage-4`

In [None]:

/**
 * Standard embedding: sends one flat list of texts to the Voyage embeddings
 * endpoint using STANDARD_MODEL.
 *
 * @param texts     Texts to embed.
 * @param inputType Passed through as the request's `input_type`.
 * @returns The `embedding` array of each item in the response's `data` list, in response order.
 * @throws Error carrying the response body text when the HTTP status is not OK.
 */
async function embedStandard(texts: string[], inputType: 'document' | 'query'): Promise<number[][]> {
  const payload = { input: texts, model: STANDARD_MODEL, input_type: inputType };

  const response = await fetch('https://api.voyageai.com/v1/embeddings', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${VOYAGE_API_KEY}`,
    },
    body: JSON.stringify(payload),
  });

  if (!response.ok) {
    const message = await response.text();
    throw new Error(message);
  }

  const { data } = await response.json() as { data: { embedding: number[] }[] };
  const vectors: number[][] = [];
  for (const item of data) {
    vectors.push(item.embedding);
  }
  return vectors;
}

/**
 * Contextualized embedding: one document at a time, with all of its chunks sent
 * together as a single inner list (`inputs: [chunks]`) using CONTEXT_MODEL.
 *
 * @param chunks    The chunks of one document (or a single-element list for a query).
 * @param inputType Passed through as the request's `input_type`.
 * @returns One vector per chunk for the single document that was sent.
 * @throws Error carrying the response body text when the HTTP status is not OK,
 *         or a descriptive Error when the response has neither known layout.
 */
async function embedContextualized(chunks: string[], inputType: 'document' | 'query'): Promise<number[][]> {
  const res = await fetch('https://api.voyageai.com/v1/contextualized_embeddings', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json', 'Authorization': `Bearer ${VOYAGE_API_KEY}` },
    body: JSON.stringify({ inputs: [chunks], model: CONTEXT_MODEL, input_type: inputType }),
  });
  if (!res.ok) throw new Error(await res.text());

  const json = await res.json() as {
    data?: { data?: { embedding: number[] }[] }[];
    results?: { embeddings?: number[][] }[];
  };

  // Layout described by the REST API reference: an outer `data` entry per
  // document, each holding an inner `data` list of `{ embedding }` objects.
  const nested = json.data?.[0]?.data;
  if (Array.isArray(nested)) {
    return nested.map(item => item.embedding);
  }

  // Layout this helper originally read (`results[0].embeddings`); kept as a
  // fallback so a response in that shape keeps working.
  const flat = json.results?.[0]?.embeddings;
  if (Array.isArray(flat)) {
    return flat;
  }

  throw new Error(
    `Unexpected contextualized_embeddings response shape (top-level keys: ${Object.keys(json).join(', ')})`,
  );
}

console.log('Helpers defined.');

In [None]:
// ── Embed all chunks and store in MongoDB ──────────────────────────────────────
// One output document per chunk, carrying both vectors side by side so the two
// indexes can be built over the same collection.
const allDocs: any[] = [];

for (const doc of docChunks) {
  // Listings that produced no chunks have nothing to embed.
  if (doc.chunks.length === 0) continue;

  // The two requests are independent, so issue them together:
  //  - contextualized: all chunks of this document are passed as one group
  //  - standard: each chunk is embedded on its own
  const [ctxVecs, stdVecs] = await Promise.all([
    embedContextualized(doc.chunks, 'document'),
    embedStandard(doc.chunks, 'document'),
  ]);

  // Vectors are paired with chunks by position; a short response would
  // otherwise store `undefined` embeddings without any error.
  if (ctxVecs.length !== doc.chunks.length || stdVecs.length !== doc.chunks.length) {
    throw new Error(
      `Embedding count mismatch for listing ${doc.listingId}: ` +
      `${doc.chunks.length} chunks, ${ctxVecs.length} contextualized, ${stdVecs.length} standard vectors`,
    );
  }

  doc.chunks.forEach((chunk, i) => {
    allDocs.push({
      listingId:        doc.listingId,
      listingName:      doc.listingName,
      propertyType:     doc.propertyType,
      price:            doc.price,
      market:           doc.market,
      chunk,
      chunkIndex:       i,
      embedding_ctx:    ctxVecs[i],
      embedding_std:    stdVecs[i],
    });
  });
}

// insertMany rejects an empty batch, so skip the write when nothing was produced.
if (allDocs.length > 0) {
  await chunkCol.insertMany(allDocs);
}
console.log(`Stored ${allDocs.length} chunks in MongoDB.`);

## Create two vector search indexes and compare retrieval

In [None]:
// Request one vector search index per embedding field. Both definitions are
// identical apart from the index name and the vector path.
for (const { name, path } of [
  { name: CTX_INDEX, path: 'embedding_ctx' },
  { name: STD_INDEX, path: 'embedding_std' },
]) {
  // Best effort: remove an index of the same name and pause for two seconds.
  // Any failure here is ignored on purpose.
  try {
    await chunkCol.dropSearchIndex(name);
    await new Promise(resolve => setTimeout(resolve, 2000));
  } catch {}

  const vectorField = { type: 'vector', path, numDimensions: DIMS, similarity: 'cosine' };
  const filterFields = [
    { type: 'filter', path: 'propertyType' },
    { type: 'filter', path: 'market' },
  ];

  await chunkCol.createSearchIndex({
    name,
    type: 'vectorSearch',
    definition: { fields: [vectorField, ...filterFields] },
  });
  console.log(`Index '${name}' creation requested.`);
}

// Poll every 3 seconds (up to 40 attempts) until BOTH named indexes report READY.
console.log('\nWaiting for both indexes to be READY...');
let indexesReady = false;
for (let attempt = 0; attempt < 40 && !indexesReady; attempt++) {
  await new Promise(resolve => setTimeout(resolve, 3000));
  const idxs = await chunkCol.listSearchIndexes().toArray();
  const statuses = Object.fromEntries(idxs.map(x => [x.name, x.status]));
  console.log(statuses);
  // Check the two expected indexes by name. Testing "every listed index" is
  // vacuously true while the listing is still empty and would end the wait
  // before the indexes even appear.
  indexesReady = [CTX_INDEX, STD_INDEX].every(name => statuses[name] === 'READY');
}

// Stop here rather than continuing with indexes that cannot serve queries yet.
if (!indexesReady) {
  throw new Error(`Search indexes '${CTX_INDEX}' and '${STD_INDEX}' were not READY after 40 checks.`);
}

In [None]:
// ── Side-by-side comparison ───────────────────────────────────────────────────
// Runs each test query against both indexes and prints the top 3 hits from each.
const testQueries = [
  'garden and outdoor relaxation',
  'remote work with fast wifi and dedicated desk',
  'historic building with original architecture',
];

for (const q of testQueries) {
  // Each index is searched with a query vector from the model that produced its
  // stored vectors: the contextualized endpoint for `embedding_ctx` (the query
  // is sent as a single-element chunk list) and the standard endpoint for
  // `embedding_std`. One model's query vector is not reused across both indexes.
  const [ctxQueryVecs, stdQueryVecs] = await Promise.all([
    embedContextualized([q], 'query'),
    embedStandard([q], 'query'),
  ]);
  const ctxQueryVec = ctxQueryVecs[0];
  const stdQueryVec = stdQueryVecs[0];
  if (!ctxQueryVec || !stdQueryVec) {
    throw new Error(`No query embedding returned for "${q}"`);
  }

  // Top-3 vector search over one index/path pair, returning name, chunk text and score.
  const search = async (index: string, path: string, queryVector: number[]) =>
    chunkCol.aggregate([
      { $vectorSearch: { index, path, queryVector, numCandidates: 50, limit: 3 } },
      { $project: { listingName: 1, chunk: 1, score: { $meta: 'vectorSearchScore' } } },
    ]).toArray();

  const [ctxHits, stdHits] = await Promise.all([
    search(CTX_INDEX, 'embedding_ctx', ctxQueryVec),
    search(STD_INDEX, 'embedding_std', stdQueryVec),
  ]);

  console.log(`\nQuery: "${q}"`);
  console.log('  voyage-context-3:');
  ctxHits.forEach((h, i) => console.log(`    ${i+1}. [${(h.score as number).toFixed(4)}] ${h.listingName} — "${String(h.chunk).substring(0, 60)}..."`));
  console.log('  voyage-4 (standard):');
  stdHits.forEach((h, i) => console.log(`    ${i+1}. [${(h.score as number).toFixed(4)}] ${h.listingName} — "${String(h.chunk).substring(0, 60)}..."`));
}

In [None]:
// ── Cleanup ───────────────────────────────────────────────────────────────────
// Drop the working collection, and close the client even when the drop fails
// so the connection is not left open.
try {
  await chunkCol.drop();
} finally {
  await client.close();
}
console.log('Done.');