# BCG Multi-Agent & Multimodal AI Platform - Embeddings and Vector Store

This notebook demonstrates the embeddings generation and vector store components of the BCG Multi-Agent & Multimodal AI Platform, including:
1. Generating embeddings for document chunks
2. Storing embeddings in a vector database
3. Performing semantic search queries
4. Analyzing search results

In [None]:
import os
import sys
import logging
from pathlib import Path

# Treat the parent of the current working directory as the project root and
# put it on the import path (once) so project-local packages can be imported.
project_root = Path().resolve().parent
root_entry = str(project_root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

# Root logger: INFO and above, formatted as "time - logger - level - message".
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

## 1. Setup and Configuration

In [None]:
from configs.config import (
    RAW_DATA_DIR,
    PROCESSED_DATA_DIR,
    EMBEDDINGS_DIR,
    VECTOR_STORE_PATH,
)

# Create the output locations up front. Missing parents are created and an
# already-existing directory is not an error.
for output_dir in (EMBEDDINGS_DIR, VECTOR_STORE_PATH):
    output_dir.mkdir(parents=True, exist_ok=True)

# Echo the configured locations so a misconfigured path is easy to spot.
print(
    f"Raw data directory: {RAW_DATA_DIR}",
    f"Processed data directory: {PROCESSED_DATA_DIR}",
    f"Embeddings directory: {EMBEDDINGS_DIR}",
    f"Vector store path: {VECTOR_STORE_PATH}",
    sep="\n",
)

## 2. Check for Processed Documents

Let's check if we have processed documents available from the previous data ingestion step.

In [None]:
import json

# Find processed document directories.
# A missing PROCESSED_DATA_DIR is treated as "nothing processed yet", so the
# hint at the bottom of this cell is shown instead of a FileNotFoundError.
# Sorting keeps the order of document_info stable across runs and platforms;
# later cells pick documents out of it by position.
processed_dirs = (
    sorted(d for d in PROCESSED_DATA_DIR.iterdir() if d.is_dir())
    if PROCESSED_DATA_DIR.is_dir()
    else []
)

# Collect one summary record per JSON file found in those directories
document_info = []

for doc_dir in processed_dirs:
    for json_file in sorted(doc_dir.glob("*.json")):
        try:
            with open(json_file, "r", encoding="utf-8") as f:
                doc_data = json.load(f)

            document_info.append({
                "document_id": doc_data.get("document_id", ""),
                "filename": doc_data.get("filename", ""),
                "chunks": len(doc_data.get("text_chunks", [])),
                "visuals": len(doc_data.get("visual_elements", [])),
                "json_path": str(json_file),
            })
        except (OSError, ValueError, TypeError, AttributeError) as e:
            # Best effort: report the file and keep scanning the others.
            #  - OSError: the file could not be opened or read
            #  - ValueError: invalid JSON or invalid UTF-8 (json.JSONDecodeError
            #    and UnicodeDecodeError are both ValueError subclasses)
            #  - TypeError / AttributeError: valid JSON, but not the expected
            #    object-with-lists shape (e.g. a top-level list)
            print(f"Error reading {json_file}: {str(e)}")

if document_info:
    print(f"Found {len(document_info)} processed documents:")
    for doc in document_info:
        print(f" - {doc['filename']}: {doc['chunks']} chunks, {doc['visuals']} visuals")
else:
    print("No processed documents found. Please run the data ingestion notebook first.")

## 3. Test Embedding Generator

Let's test the embedding generator with some sample text.

In [None]:
from src.embeddings.embedding_generator import EmbeddingGenerator

# Sample sentences to embed: two about sustainability, two about technology.
sample_texts = [
    "BCG is committed to reducing carbon emissions and achieving net zero.",
    "Sustainability is a core value of Boston Consulting Group's strategy.",
    "Digital transformation is revolutionizing business operations.",
    "Artificial intelligence and machine learning drive innovation."
]

# Build the generator around the all-MiniLM-L6-v2 model, keeping model files
# in a "models" sub-folder of EMBEDDINGS_DIR.
embedding_generator = EmbeddingGenerator(
    cache_folder=str(EMBEDDINGS_DIR / "models"),
    model_type="local",
    model_name="all-MiniLM-L6-v2",
)

# One embedding per sample sentence
embeddings = embedding_generator.generate_embeddings(sample_texts)

# Report the dimension the generator advertises next to the measured length
# of the first vector, so a mismatch would be visible in the output.
embedding_dimension = embedding_generator.get_embedding_dimension()
print(
    f"Embedding dimension: {embedding_dimension}",
    f"Generated {len(embeddings)} embeddings with dimension {len(embeddings[0])}",
    sep="\n",
)

# Peek at the first ten components of the first vector
print("\nSample of first embedding vector:")
print(embeddings[0][:10], "...")

## 4. Test Vector Store

Let's test the vector store by adding some sample documents and performing searches.

In [None]:
from src.embeddings.vector_store import VectorStore
from langchain_core.documents import Document

# Throw-away store for this test, kept in its own "test" sub-directory and
# collection so it does not mix with the real document collection.
vector_store = VectorStore(
    collection_name="test_collection",
    persist_directory=str(VECTOR_STORE_PATH / "test"),
    embedding_model=embedding_generator.get_embedding_model(),
)

# (document_id, topic, year) for each entry of sample_texts, in the same order
sample_metadata = [
    ("doc_1", "sustainability", 2023),
    ("doc_1", "sustainability", 2023),
    ("doc_2", "digital", 2022),
    ("doc_2", "digital", 2022),
]

# Pair every metadata row with the sample sentence at the same position
sample_documents = [
    Document(
        page_content=sample_texts[position],
        metadata={
            "document_id": document_id,
            "topic": topic,
            "year": year,
        },
    )
    for position, (document_id, topic, year) in enumerate(sample_metadata)
]

# Store the documents, then persist the store
ids = vector_store.add_documents(sample_documents)
vector_store.persist()

print(
    f"Added {len(ids)} documents to vector store",
    f"Document IDs: {ids}",
    f"Vector store now contains {vector_store.count()} documents",
    sep="\n",
)

## 5. Test Vector Store Search

Let's test searching the vector store with different queries.

In [None]:
# Plain semantic search: ask for the two best matches for a
# sustainability-flavoured query.
query = "Environmental sustainability and carbon footprint reduction"
print(f"Searching for: '{query}'\n")

results = vector_store.search(query, k=2)

# Each hit is unpacked as a (document, score) pair; print one block per hit,
# followed by a blank separator line.
print("Search Results:")
for i, (doc, score) in enumerate(results):
    print(
        f"Result {i+1} (Score: {score:.4f})",
        f"Content: {doc.page_content}",
        f"Metadata: {doc.metadata}",
        "",
        sep="\n",
    )

In [None]:
# Same kind of search, but restricted to documents whose "topic" metadata
# equals "digital".
query = "business strategy"
filter_condition = dict(topic="digital")
print(f"Searching for: '{query}' with filter: {filter_condition}\n")

filtered_results = vector_store.search(query, k=2, filter=filter_condition)

# One block per (document, score) hit, followed by a blank separator line
print("Filtered Search Results:")
for i, (doc, score) in enumerate(filtered_results):
    print(
        f"Result {i+1} (Score: {score:.4f})",
        f"Content: {doc.page_content}",
        f"Metadata: {doc.metadata}",
        "",
        sep="\n",
    )

In [None]:
# MMR search: request three results with lambda_mult=0.7.
# Unlike search(), the results are iterated as bare documents (no score).
query = "BCG business"
print(f"Performing MMR search for: '{query}'\n")

mmr_results = vector_store.search_mmr(query, k=3, lambda_mult=0.7)

# One block per document, followed by a blank separator line
print("MMR Search Results:")
for i, doc in enumerate(mmr_results):
    print(
        f"Result {i+1}",
        f"Content: {doc.page_content}",
        f"Metadata: {doc.metadata}",
        "",
        sep="\n",
    )

## 6. Initialize Embeddings Manager

Now let's test the embeddings manager that integrates embedding generation and vector storage.

In [None]:
from src.embeddings.embeddings_manager import EmbeddingsManager

# Settings for the manager: same model as the generator test above, with a
# dedicated "bcg_docs" store directory and collection for the real reports.
manager_settings = {
    "model_name": "all-MiniLM-L6-v2",
    "model_type": "local",
    "vector_store_dir": str(VECTOR_STORE_PATH / "bcg_docs"),
    "collection_name": "bcg_sustainability_reports",
    "cache_folder": str(EMBEDDINGS_DIR / "models"),
}
embeddings_manager = EmbeddingsManager(**manager_settings)

# Show whatever statistics the manager reports, one "key: value" per line
stats = embeddings_manager.get_statistics()
print("Embeddings Manager Statistics:")
for key in stats:
    value = stats[key]
    print(f" - {key}: {value}")

## 7. Add Processed Documents to the Vector Store

Now let's take the documents produced by the data ingestion step and add their chunks to the vector store.

In [None]:
# Feed every JSON file found earlier into the embeddings manager.
if not document_info:
    print("No processed documents available to add to vector store.")
else:
    print(f"Processing {len(document_info)} documents...\n")

    for i, doc in enumerate(document_info):
        print(f"Processing document {i+1}/{len(document_info)}: {doc['filename']}")

        # A failure on one document is reported and does not stop the rest.
        try:
            ids = embeddings_manager.process_json_document(doc['json_path'])
            print(f" - Added {len(ids)} chunks to vector store")
        except Exception as e:
            print(f" - Error processing document: {str(e)}")

    # Re-read the statistics so the effect of the run is visible
    stats = embeddings_manager.get_statistics()
    print("\nUpdated Embeddings Manager Statistics:")
    for key in stats:
        value = stats[key]
        print(f" - {key}: {value}")

## 8. Test Search Functionality with Real Documents

Now let's test the search functionality with our real BCG sustainability report documents.

In [None]:
# Questions a reader might ask of the BCG sustainability reports
test_queries = [
    "What are BCG's carbon emission reduction targets?",
    "How does BCG approach diversity and inclusion?",
    "What is BCG's strategy for sustainable operations?",
    "What community engagement initiatives does BCG participate in?",
    "How is BCG supporting climate action with its clients?"
]

# Run each question through the manager and print up to three MMR results
for query in test_queries:
    print(f"\nQuery: '{query}'")

    results = embeddings_manager.search(query, k=3, use_mmr=True)

    print(f"Top {len(results)} results:")
    for i, doc in enumerate(results):
        # Show at most the first 300 characters, marking truncation with "..."
        if len(doc.page_content) > 300:
            content_line = f"Content: {doc.page_content[:300]}..."
        else:
            content_line = f"Content: {doc.page_content}"

        print(
            f"\nResult {i+1}",
            f"Document: {doc.metadata.get('filename', 'Unknown')}",
            content_line,
            sep="\n",
        )

    # Horizontal rule between queries
    print("\n" + "-"*80)

## 9. Filter Search Results by Document

Let's test filtering search results by a metadata attribute. The cell below restricts each search to a single document by filtering on its `document_id`.

In [None]:
# Run one query against individual documents by filtering on "document_id".
query = "BCG's climate commitments"

# The test needs at least two processed documents to be meaningful
if document_info and len(document_info) >= 2:
    # Same filtered search for each of the first two documents. A single loop
    # replaces the two copy-pasted branches the cell used to have.
    for doc_info in document_info[:2]:
        doc_id = doc_info['document_id']
        if not doc_id:
            # No usable ID to filter on (the ingestion JSON had none); skip it
            continue

        print(f"\nQuery: '{query}' filtered to document: {doc_id}")

        # Create filter for specific document
        doc_filter = {"document_id": doc_id}

        # Search with filter
        filtered_results = embeddings_manager.search(query, k=2, filter=doc_filter, use_mmr=True)

        print(f"Top {len(filtered_results)} results from document {doc_id}:")
        for i, doc in enumerate(filtered_results):
            print(f"\nResult {i+1}")
            print(f"Document: {doc.metadata.get('filename', 'Unknown')}")
            # Show at most the first 300 characters, marking truncation with "..."
            print(f"Content: {doc.page_content[:300]}..." if len(doc.page_content) > 300 else f"Content: {doc.page_content}")
else:
    print("Not enough processed documents available for document filtering test.")

## 10. Compare Results Across Reports

Let's compare results across different BCG sustainability reports to see how BCG's approach has evolved over time.

In [None]:
import pandas as pd

# Define a query on a topic that might have evolved over time
evolution_query = "BCG's approach to diversity, equity, and inclusion"

# A comparison needs at least two processed documents
if document_info and len(document_info) >= 2:
    # One row per document: its filename and the best-matching passage
    comparison_data = []

    print(f"Comparing results for query: '{evolution_query}' across documents")

    # Search each document separately
    for doc_info in document_info:
        doc_id = doc_info['document_id']
        filename = doc_info['filename']

        if not doc_id:
            # Without an ID there is nothing to filter on; skip the document
            # (same guard as the per-document filtering test) rather than
            # attribute some other document's text to this filename.
            continue

        # Create filter for specific document
        doc_filter = {"document_id": doc_id}

        # Single best match, plain similarity search (no MMR)
        results = embeddings_manager.search(evolution_query, k=1, filter=doc_filter, use_mmr=False)

        if results:
            result_doc = results[0]
            comparison_data.append({
                "Document": filename,
                # Keep at most 500 characters, marking truncation with "..."
                "Content": result_doc.page_content[:500] + ("..." if len(result_doc.page_content) > 500 else ""),
            })

    # Create and display DataFrame
    if comparison_data:
        comparison_df = pd.DataFrame(comparison_data)
        # pandas truncates cell text to display.max_colwidth (50 by default),
        # which would hide most of each 500-character preview. Lift the limit
        # for this display only (None means "no limit"; needs pandas >= 1.0).
        with pd.option_context("display.max_colwidth", None):
            display(comparison_df)
    else:
        print("No results found for comparison.")
else:
    print("Not enough processed documents available for comparison.")

## 11. Conclusion

In this notebook, we've successfully demonstrated the embeddings and vector store components of the BCG Multi-Agent & Multimodal AI Platform. We've generated embeddings for the processed BCG Sustainability Reports and stored them in a vector database that enables efficient semantic search.

Key accomplishments include:
1. Generating embeddings using a local sentence transformer model
2. Creating a vector store for efficient similarity search
3. Adding processed document chunks to the vector store
4. Performing semantic searches with various queries
5. Filtering results by metadata such as topic and document ID
6. Comparing content across different documents

These components form the foundation of our RAG system, which will be integrated into the multi-agent architecture in the next steps.