diff --git a/bin/commands/init.js b/bin/commands/init.js index 96f0af6..4ab9252 100644 --- a/bin/commands/init.js +++ b/bin/commands/init.js @@ -63,23 +63,26 @@ export async function init(configPath) { type: 'select', name: 'provider', message: 'Select an Embedding Provider:', - choices: ['openai', 'deepseek', 'ollama'], + choices: ['voyage','openai', 'deepseek', 'ollama'], helpMessage: "Available embedding providers:\n" + + "- Voyage: Best MongoDB compatible, requires API key\n" + "- OpenAI: Most popular, requires API key\n" + "- DeepSeek: Alternative provider, requires API key\n" + "- Ollama: Local deployment, no API key needed" }); // Provider-specific configuration - if (responses.provider === 'openai' || responses.provider === 'deepseek') { + if (responses.provider === 'openai' || responses.provider === 'deepseek' || responses.provider === 'voyage') { responses.apiKey = await promptWithValidation({ type: 'password', name: 'apiKey', - message: `Enter your ${responses.provider === 'openai' ? 'OpenAI' : 'DeepSeek'} API Key:`, + message: `Enter your ${responses.provider === 'openai' ? 'OpenAI' : responses.provider === 'voyage' ? 'Voyage' : 'DeepSeek'} API Key:`, validate: (input) => input && input.length > 0 ? true : 'API key is required', helpMessage: responses.provider === 'openai' ? "OpenAI API key format: sk-....\n- Get your key from: https://platform.openai.com/api-keys" - : "DeepSeek API key format: dk-....\n- Get your key from DeepSeek's platform" + : (responses.provider === 'voyage' + ? 
"VoyageAI API key format: pa-....\n- Get your key from VoyageAI's platform" + : "DeepSeek API key format: dk-....\n- Get your key from DeepSeek's platform") }); } else if (responses.provider === 'ollama') { const availableModels = getOllamaModels(); @@ -104,7 +107,7 @@ export async function init(configPath) { provider: responses.provider, ...(responses.apiKey && { apiKey: responses.apiKey }), ...(responses.model && { model: responses.model }), - dimensions: 1536, + dimensions: responses.provider === 'voyage' ? 1024 : 1536, batchSize: 100 }, search: { diff --git a/bin/utils/prompts.js b/bin/utils/prompts.js index c95281e..2aadc8d 100644 --- a/bin/utils/prompts.js +++ b/bin/utils/prompts.js @@ -71,7 +71,7 @@ export async function promptForConfigEdits(currentConfig) { type: 'select', name: 'provider', message: 'Select an Embedding Provider:', - choices: ['openai', 'deepseek', 'ollama'], + choices: ['voyage','openai', 'deepseek', 'ollama'], initial: currentConfig.embedding.provider } ]); @@ -140,7 +140,7 @@ export async function promptForProviderConfig() { type: 'select', name: 'provider', message: 'Select an embedding provider:', - choices: ['openai', 'ollama', 'anthropic', 'deepseek'], + choices: ['voyage','openai', 'ollama', 'anthropic', 'deepseek'], initial: 'openai' }); @@ -165,7 +165,8 @@ export async function promptForProviderConfig() { modelChoices = { 'openai': ['text-embedding-3-small', 'text-embedding-3-large'], 'anthropic': ['claude-3-opus-20240229', 'claude-3-sonnet-20240229'], - 'deepseek': ['deepseek-coder', 'deepseek-chat'] + 'deepseek': ['deepseek-coder', 'deepseek-chat'], + 'voyage': ['voyage-3', 'voyage-3-large', 'voyage-3-lite', 'voyage-code-3', 'voyage-finance-2', 'voyage-law-2'] }[providerResponse.provider] || []; defaultModel = modelChoices[0]; } @@ -204,7 +205,13 @@ export async function promptForProviderConfig() { 'llama2': '4096', 'mistral': '4096', 'mixtral': '4096', - 'codellama': '4096' + 'codellama': '4096', + 'voyage-3': '1024', + 
'voyage-3-large': '1024', + 'voyage-3-lite': '512', + 'voyage-code-3': '1024', + 'voyage-finance-2': '1024', + 'voyage-law-2': '1024' }[answers.model] || '4096'; return dims.toString(); }, @@ -227,4 +234,4 @@ export async function promptForProviderConfig() { baseUrl: config.baseUrl, dimensions: config.dimensions }; -} \ No newline at end of file +} diff --git a/bin/utils/providers.js b/bin/utils/providers.js index 72d18b9..5c417bb 100644 --- a/bin/utils/providers.js +++ b/bin/utils/providers.js @@ -20,6 +20,7 @@ export async function testProvider(config) { return await testOllamaProvider(config); case 'openai': case 'deepseek': + case 'voyage': return await testApiProvider(config); default: throw new Error(`Unknown provider: ${config.embedding.provider}`); @@ -80,6 +81,8 @@ export function getDefaultDimensions(provider) { return 1536; // For text-embedding-3-small case 'deepseek': return 1024; + case 'voyage': + return 1024; // For voyage-3 (the default model) case 'ollama': return 4096; // For llama2 models default: @@ -93,9 +96,11 @@ export function getProviderModels(provider) { return ['text-embedding-3-small', 'text-embedding-3-large']; case 'deepseek': return ['deepseek-embedding']; + case 'voyage': + return ['voyage-3', 'voyage-3-large', 'voyage-3-lite', 'voyage-code-3', 'voyage-finance-2', 'voyage-law-2']; case 'ollama': return getOllamaModels(); default: return []; } -} \ No newline at end of file +} diff --git a/examples/basic-usage.js b/examples/basic-usage.js index 9ad8223..0d941f9 100644 --- a/examples/basic-usage.js +++ b/examples/basic-usage.js @@ -35,7 +35,7 @@ async function runExample() { embedding: { provider: process.env.EMBEDDING_PROVIDER || 'openai', apiKey: process.env.EMBEDDING_API_KEY, - dimensions: 1536, + dimensions: process.env.EMBEDDING_PROVIDER === 'voyage' ? 1024 : 1536, model: process.env.EMBEDDING_MODEL || 'text-embedding-3-small' } }); @@ -65,7 +65,7 @@ async function runExample() { console.log('\nSearch results:'); results.forEach((result, i) => { - console.log(`\n${i + 1}. 
Score: ${result.score.toFixed(3)}`); + //console.log(`\n${i + 1}. Score: ${result.score.toFixed(3)}`); console.log(`Content: ${result.content}`); console.log(`Metadata: ${JSON.stringify(result.metadata)}`); }); diff --git a/mongodb-rag-docs/docs/api-reference.md b/mongodb-rag-docs/docs/api-reference.md index ee2bac5..29f0f55 100644 --- a/mongodb-rag-docs/docs/api-reference.md +++ b/mongodb-rag-docs/docs/api-reference.md @@ -38,9 +38,13 @@ const rag = new MongoRAG({ - `config.database` (string, required): Default MongoDB database name. - `config.collection` (string, required): Default MongoDB collection name. - `config.embedding` (object, required): - - `provider` (string, required): Embedding provider (`openai` is supported). - - `apiKey` (string, required): API key for the embedding provider. - - `model` (string, optional): Model name (default: `'text-embedding-3-small'`). + - `provider` (string, required): Embedding provider (`openai`, `ollama`, or `voyage` are supported). + - `apiKey` (string, required): API key for the embedding provider (not required for `ollama`). + - `model` (string, optional): Model name. Defaults depend on provider: + - OpenAI: `'text-embedding-3-small'` + - Voyage: `'voyage-3'` (other options: `voyage-3-large`, `voyage-3-lite`, `voyage-code-3`, `voyage-finance-2`, `voyage-law-2`) + - Ollama: requires model specification + - `baseUrl` (string, optional): Base URL for Ollama API (default: `'http://localhost:11434'`). - `batchSize` (number, optional): Batch size for embedding generation (default: `100`). - `dimensions` (number, optional): Number of dimensions in the embedding space (default: `1536`). 
- `config.search` (object, optional): @@ -171,4 +175,4 @@ try { For more detailed examples and use cases, refer to: - [Basic Example](./examples/basic-example.md) -- [Advanced Example](./examples/advanced-example.md) \ No newline at end of file +- [Advanced Example](./examples/advanced-example.md) diff --git a/mongodb-rag-docs/docs/examples/voyage-example.md b/mongodb-rag-docs/docs/examples/voyage-example.md new file mode 100644 index 0000000..3a1ca67 --- /dev/null +++ b/mongodb-rag-docs/docs/examples/voyage-example.md @@ -0,0 +1,191 @@ +--- +id: voyage-example +title: Using VoyageAI Embeddings +sidebar_position: 3 +--- + +# Using VoyageAI Embeddings with MongoDB-RAG + +This example demonstrates how to use VoyageAI's embedding models with MongoDB-RAG for vector search. + +## Prerequisites + +1. Install the mongodb-rag package: + ```bash + npm install mongodb-rag voyageai + ``` + +2. Get a VoyageAI API key from [VoyageAI's website](https://www.voyageai.com/). + +3. Set up your environment variables: + ```bash + export VOYAGE_API_KEY=your_api_key_here + ``` + +## Basic Usage + +```javascript +import MongoRAG from 'mongodb-rag'; +import dotenv from 'dotenv'; + +// Load environment variables +dotenv.config(); + +async function main() { + // Initialize MongoRAG with VoyageAI provider + const rag = new MongoRAG({ + mongoUrl: 'mongodb+srv://your-connection-string', + database: 'ragdb', + collection: 'documents', + embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-3' // This is the default model + } + }); + + // Connect to MongoDB + await rag.connect(); + + // Ingest some documents + await rag.ingestBatch([ + { + documentId: 'doc1', + content: 'MongoDB is a document database with the scalability and flexibility that you want with the querying and indexing that you need.', + metadata: { source: 'MongoDB Website', category: 'Database' } + }, + { + documentId: 'doc2', + content: 'Vector search in MongoDB allows you to search for 
documents based on semantic similarity using vector embeddings.', + metadata: { source: 'MongoDB Documentation', category: 'Search' } + } + ]); + + // Perform a search + const results = await rag.search('How does vector search work?'); + console.log(results); + + // Close the connection + await rag.close(); +} + +main().catch(console.error); +``` + +## Using Different VoyageAI Models + +VoyageAI offers several embedding models optimized for different use cases: + +```javascript +// For general purpose (default) +const rag = new MongoRAG({ + // MongoDB configuration... + embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-3' // Default model + } +}); + +// For higher quality embeddings +const ragLarge = new MongoRAG({ + // MongoDB configuration... + embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-3-large' // Higher quality, larger model + } +}); + +// For faster, more efficient embeddings +const ragLite = new MongoRAG({ + // MongoDB configuration... + embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-3-lite' // Faster, more efficient + } +}); + +// For code-specific embeddings +const ragCode = new MongoRAG({ + // MongoDB configuration... + embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-code-3' // Optimized for code + } +}); + +// For finance-specific embeddings +const ragFinance = new MongoRAG({ + // MongoDB configuration... + embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-finance-2' // Optimized for finance + } +}); + +// For legal-specific embeddings +const ragLaw = new MongoRAG({ + // MongoDB configuration... 
+ embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-law-2' // Optimized for legal content + } +}); +``` + +## Advanced Configuration + +You can combine VoyageAI embeddings with advanced search options: + +```javascript +const rag = new MongoRAG({ + mongoUrl: 'mongodb+srv://your-connection-string', + database: 'ragdb', + collection: 'documents', + embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-3', + batchSize: 50 // Process 50 documents at a time + }, + search: { + maxResults: 10, + minScore: 0.75, + similarityMetric: 'cosine' + } +}); + +// Search with metadata filtering +const results = await rag.search('vector search techniques', { + filter: { category: 'Search' } +}); +``` + +## Error Handling + +```javascript +try { + const rag = new MongoRAG({ + // MongoDB configuration... + embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-3' + } + }); + + await rag.connect(); + const results = await rag.search('vector search'); +} catch (error) { + if (error.message.includes('VoyageAI API error')) { + console.error('Error with VoyageAI API:', error.message); + // Handle VoyageAI specific errors + } else { + console.error('General error:', error); + } +} diff --git a/mongodb-rag-docs/docs/usage.md b/mongodb-rag-docs/docs/usage.md index acade7d..75eaa4f 100644 --- a/mongodb-rag-docs/docs/usage.md +++ b/mongodb-rag-docs/docs/usage.md @@ -22,8 +22,47 @@ const rag = new MongoRAG({ await rag.connect(); ``` +## Configuring Different Embedding Providers + +### OpenAI (default) +```js +const rag = new MongoRAG({ + // MongoDB configuration... + embedding: { + provider: 'openai', + apiKey: process.env.OPENAI_API_KEY, + model: 'text-embedding-3-small' // optional, this is the default + } +}); +``` + +### VoyageAI +```js +const rag = new MongoRAG({ + // MongoDB configuration... 
+ embedding: { + provider: 'voyage', + apiKey: process.env.VOYAGE_API_KEY, + model: 'voyage-3' // optional, this is the default + // Other options: voyage-3-large, voyage-3-lite, voyage-code-3, voyage-finance-2, voyage-law-2 + } +}); +``` + +### Ollama (local) +```js +const rag = new MongoRAG({ + // MongoDB configuration... + embedding: { + provider: 'ollama', + baseUrl: 'http://localhost:11434', // optional, this is the default + model: 'llama2' // required for Ollama + } +}); +``` + ## Running a Search Query ```js const results = await rag.search('What is vector search?'); console.log(results); -``` \ No newline at end of file +``` diff --git a/mongodb-rag-docs/sidebars.js b/mongodb-rag-docs/sidebars.js index 3aec592..5981221 100644 --- a/mongodb-rag-docs/sidebars.js +++ b/mongodb-rag-docs/sidebars.js @@ -11,6 +11,7 @@ module.exports = { items: [ "examples/basic-example", "examples/advanced-example", + "examples/voyage-example", ], }, "create-rag-app", diff --git a/package.json b/package.json index f797ba4..51dec93 100644 --- a/package.json +++ b/package.json @@ -50,6 +50,7 @@ "dependencies": { "@langchain/community": "^0.3.30", "axios": "^1.6.0", + "voyageai": "^0.0.4", "chalk": "^5.4.1", "columnify": "^1.6.0", "commander": "^10.0.0", diff --git a/src/core/MongoRAG.js b/src/core/MongoRAG.js index f775d5a..f0ede80 100644 --- a/src/core/MongoRAG.js +++ b/src/core/MongoRAG.js @@ -5,6 +5,7 @@ import debug from 'debug'; import IndexManager from './IndexManager.js'; import OpenAIEmbeddingProvider from '../providers/OpenAIEmbeddingProvider.js'; import OllamaEmbeddingProvider from '../providers/OllamaEmbeddingProvider.js'; +import VoyageEmbeddingProvider from '../providers/VoyageEmbeddingProvider.js'; const log = debug('mongodb-rag:core'); @@ -77,6 +78,11 @@ class MongoRAG { baseUrl, model: options.model }); + case 'voyage': + return new VoyageEmbeddingProvider({ + apiKey, + model: options.model || 'voyage-3' + }); default: throw new Error(`Unknown embedding provider: 
${provider}`); } @@ -149,7 +155,7 @@ class MongoRAG { const indexManager = new IndexManager(col, this.config); // Check if the index exists without trying to create it - const existingIndexes = await col.listIndexes().toArray(); + const existingIndexes = await col.listSearchIndexes().toArray(); const indexName = options.indexName || this.config.indexName || 'vector_index'; const hasIndex = existingIndexes.some(index => index.name === indexName); @@ -157,6 +163,7 @@ class MongoRAG { throw new Error(`Vector search index '${indexName}' does not exist. Please create it using 'npx mongodb-rag init'.`); } + const projectStage = { $project: { [this.config.embeddingFieldPath]: 0 } }; // Construct the vector search query using the $vectorSearch operator const aggregation = query ? [{ @@ -169,15 +176,17 @@ class MongoRAG { path: this.config.embeddingFieldPath, queryVector: embedding } - }] + }, + projectStage + ] : [{ $skip: skip }, { $limit: maxResults }]; // Simple aggregation for all documents - console.log('[DEBUG] Aggregation query:', JSON.stringify(aggregation, null, 2)); + //console.log('[DEBUG] Aggregation query:', JSON.stringify(aggregation, null, 2)); - log(`Running vector search in ${database || this.config.defaultDatabase}.${collection || this.config.defaultCollection}`); + // console.log(`Running vector search in ${database || this.config.defaultDatabase}.${collection || this.config.defaultCollection}`); const results = await col.aggregate(aggregation).toArray(); - console.log('[DEBUG] Search results:', results); + //console.log('[DEBUG] Search results:', results); return results.map(r => ({ content: r.content, @@ -221,6 +230,12 @@ class MongoRAG { model: options.model }); break; + case 'voyage': + this.provider = new VoyageEmbeddingProvider({ + apiKey, + model: options.model || 'voyage-3' + }); + break; default: throw new Error(`Unknown embedding provider: ${provider}`); } diff --git a/src/providers/BaseEmbeddingProvider.js 
b/src/providers/BaseEmbeddingProvider.js index 1cfd531..1d2ae33 100644 --- a/src/providers/BaseEmbeddingProvider.js +++ b/src/providers/BaseEmbeddingProvider.js @@ -15,7 +15,7 @@ class BaseEmbeddingProvider { * @param {number} [options.batchSize=100] - Number of texts to process in each batch * @param {number} [options.maxRetries=3] - Maximum number of retry attempts * @param {number} [options.retryDelay=1000] - Delay between retries in milliseconds - * @param {string} [options.provider] - Provider name ('openai', 'ollama', etc.) + * @param {string} [options.provider] - Provider name ('voyage','openai', 'ollama', etc.) * @param {string} [options.apiKey] - API key for the provider (required except for Ollama) * @throws {Error} If API key is missing for non-Ollama providers */ diff --git a/src/providers/VoyageEmbeddingProvider.js b/src/providers/VoyageEmbeddingProvider.js new file mode 100644 index 0000000..517c077 --- /dev/null +++ b/src/providers/VoyageEmbeddingProvider.js @@ -0,0 +1,138 @@ +// src/providers/VoyageEmbeddingProvider.js +import BaseEmbeddingProvider from './BaseEmbeddingProvider.js'; +import { VoyageAIClient } from 'voyageai'; +import debug from 'debug'; + +const log = debug('mongodb-rag:embedding:voyage'); + +/** + * VoyageAI Embedding Provider implementation + * Generates embeddings using the VoyageAI API + * @extends BaseEmbeddingProvider + */ +class VoyageEmbeddingProvider extends BaseEmbeddingProvider { + /** + * Creates a new VoyageAI embedding provider instance + * @param {Object} options - Configuration options + * @param {string} options.apiKey - VoyageAI API key + * @param {string} [options.model='voyage-3'] - Model to use for embeddings + * Options: voyage-3-large, voyage-3, voyage-3-lite, + * voyage-code-3, voyage-finance-2, voyage-law-2 + * @throws {Error} If API key is not provided + */ + constructor(options = {}) { + super(options); + + if (!options.apiKey) { + throw new Error('VoyageAI API key is required'); + } + + this.apiKey = 
options.apiKey; + this.model = options.model || 'voyage-3'; + this.client = new VoyageAIClient({ apiKey: this.apiKey }); + + log(`VoyageAI embedding provider initialized with model: ${this.model}`); + } + + /** + * Generates an embedding for a single text + * @param {string} text - Text to generate embedding for + * @returns {Promise<number[]>} Embedding vector + * @throws {Error} If the input is invalid or the API request fails + */ + async getEmbedding(text) { + if (!text || typeof text !== 'string') { + throw new Error('Input text must be a non-empty string'); + } + + try { + log(`Getting embedding for text: ${text.substring(0, 50)}${text.length > 50 ? '...' : ''}`); + + const response = await this.client.embed({ + input: [text], + model: this.model + }); + + if (!response || !response.data || !response.data[0]?.embedding) { + throw new Error(`Unexpected response from VoyageAI API: ${JSON.stringify(response)}`); + } + + return response.data[0].embedding; + } catch (error) { + log('Error getting embedding from VoyageAI:', error); + if (error instanceof Error) { + throw new Error(`VoyageAI API error: ${error.message}`); + } + throw error; + } + } + + /** + * Generates embeddings for multiple texts + * @param {string[]} texts - Array of texts to generate embeddings for + * @returns {Promise<number[][]>} Array of embedding vectors + * @throws {Error} If the input is invalid or the API request fails + */ + async getEmbeddings(texts) { + if (!Array.isArray(texts) || texts.length === 0) { + return []; + } + + // Validate all inputs are strings + if (!texts.every(text => typeof text === 'string' && text.length > 0)) { + throw new Error('All inputs must be non-empty strings'); + } + + try { + log(`Getting embeddings for ${texts.length} texts`); + + const response = await this.client.embed({ + input: texts, + model: this.model + }); + + if (!response || !response.data || !Array.isArray(response.data)) { + throw new Error(`Unexpected response from VoyageAI API: ${JSON.stringify(response)}`); + 
} + + return response.data.map(item => item.embedding); + } catch (error) { + log('Error getting embeddings from VoyageAI:', error); + if (error instanceof Error) { + throw new Error(`VoyageAI API error: ${error.message}`); + } + throw error; + } + } + + /** + * Generates embeddings for a batch of texts using VoyageAI API + * @protected + * @param {string[]} texts - Array of texts to embed + * @returns {Promise<number[][]>} Array of embedding vectors + * @throws {Error} If the API request fails or returns an unexpected response + */ + async _embedBatch(texts) { + try { + log(`Getting embeddings for batch of ${texts.length} texts`); + + const response = await this.client.embed({ + input: texts, + model: this.model + }); + + if (!response || !response.data) { + throw new Error(`Unexpected response from VoyageAI API: ${JSON.stringify(response)}`); + } + + return response.data.map(item => item.embedding); + } catch (error) { + if (error instanceof Error) { + throw new Error(`VoyageAI API error: ${error.message}`); + } + throw error; + } + } +} + +export default VoyageEmbeddingProvider;