In [1]:
# Environment Setup and Imports
import os
import sys
import logging
from pathlib import Path

from dotenv import load_dotenv

# Add the project root to the Python path.
# NOTE: despite its name, `current_dir` is the *parent* of the working
# directory; this assumes the notebook is started from a folder that sits
# directly under the project root (so that `src` becomes importable).
current_dir = os.path.dirname(os.getcwd())
if current_dir not in sys.path:
    sys.path.insert(0, current_dir)

# Configure logging
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Load environment variables
load_dotenv()

# Verify API key is loaded
api_key = os.getenv("OPENAI_API_KEY")
print(f"API key loaded: {'Yes' if api_key else 'No'}")
if api_key:
    # Cell output is saved with the notebook, so never echo raw key characters.
    # Only the dash-terminated scheme prefix found within the first 8
    # characters is shown (e.g. "sk-proj-" or "sk-"); anything after the last
    # dash in that window is presumably secret material and is dropped.
    key_prefix = "".join(api_key[:8].rpartition("-")[:2])
    print(f"API key starts with: {key_prefix}...")
else:
    print("No API key found")

API key loaded: Yes
API key starts with: sk-proj-...


In [2]:
# Initialize Document Processor
from src.document_processor.loader import DocumentLoader
from src.document_processor.text_extractor import TextExtractor
from src.document_processor.chunker import TextChunker

# Instantiate the loader, extractor and chunker with no arguments
loader = DocumentLoader()
extractor = TextExtractor()
chunker = TextChunker()

# Ask the loader for its file list and print one "- name (type)" line per entry
files = loader.get_file_list()
print("Found {} documents:".format(len(files)))
for doc in files:
    print("- {} ({})".format(doc['name'], doc['type']))

Found 3 documents:
- RAG Chatbot Project Plan.docx (word)
- Amazon-com-Inc-2023-Annual-Report.pdf (pdf)
- yearly_product_sales_comparison_with_three_column_bar_graph_slide01-3614560801.jpg (image)


In [3]:
# Process a specific document
if not files:
    print("No documents found for processing")
else:
    # The first entry of the file list serves as the test document
    test_file = files[0]
    print("Processing {}...".format(test_file['name']))

    # Run text extraction on that file
    text_blocks = extractor.extract_text(test_file)
    print("Extracted {} text blocks".format(len(text_blocks)))

    # Preview the first extracted block (text cut to 200 characters)
    if text_blocks:
        print("\nSample text block:")
        print("Text: {}...".format(text_blocks[0]['text'][:200]))
        print("Metadata: {}".format(text_blocks[0]['metadata']))

    # Split the extracted blocks into chunks
    chunks = chunker.chunk_documents(text_blocks)
    print("\nCreated {} chunks".format(len(chunks)))

    # Preview the first chunk (text cut to 200 characters)
    if chunks:
        print("\nSample chunk:")
        print("Text: {}...".format(chunks[0]['text'][:200]))
        print("Metadata: {}".format(chunks[0]['metadata']))

2025-04-25 12:10:56,795 - src.document_processor.text_extractor - INFO - Extracting text from /Users/mdwaquarahmad/Documents/fortune500-rag/uploads/RAG Chatbot Project Plan.docx of type word
2025-04-25 12:10:56,812 - src.document_processor.text_extractor - INFO - Extracted 136 text blocks from /Users/mdwaquarahmad/Documents/fortune500-rag/uploads/RAG Chatbot Project Plan.docx
2025-04-25 12:10:56,813 - src.document_processor.chunker - INFO - Created 136 chunks from 136 text blocks


Processing RAG Chatbot Project Plan.docx...
Extracted 136 text blocks

Sample text block:
Text: RAG Chatbot Project Plan...
Metadata: {'source': 'RAG Chatbot Project Plan.docx', 'company': 'RAG Chatbot Project Plan', 'paragraph': 1, 'section': 'body'}

Created 136 chunks

Sample chunk:
Text: RAG Chatbot Project Plan...
Metadata: {'source': 'RAG Chatbot Project Plan.docx', 'company': 'RAG Chatbot Project Plan', 'paragraph': 1, 'section': 'body', 'chunk': 1, 'total_chunks': 1}


In [4]:
# Initialize Vector Store
from src.vector_store.embeddings import EmbeddingGenerator
from src.vector_store.store import VectorStore

# The vector store is constructed around an embedding generator instance
embedding_generator = EmbeddingGenerator()
vector_store = VectorStore(embedding_generator)

# Print whatever statistics the store reports right after initialization
stats = vector_store.get_stats()
print("Vector DB stats:", stats)

2025-04-25 12:11:20,236 - src.vector_store.embeddings - INFO - Initialized embedding generator with model: text-embedding-3-small
2025-04-25 12:11:20,363 - src.vector_store.store - INFO - Initialized Chroma collection: fortune500_docs
  self.langchain_db = Chroma(


Vector DB stats: {'collection_name': 'fortune500_docs', 'document_count': 37693, 'path': '/Users/mdwaquarahmad/Documents/fortune500-rag/chroma_db'}


In [5]:
# Add Documents to Vector Store
# Guard: `chunks` only exists if the processing cell found a document.
if 'chunks' not in locals() or not chunks:
    print("No chunks available to add to vector store")
else:
    # NOTE(review): in the recorded run document_count grows by exactly
    # len(chunks) (37693 -> 37829), so re-running this cell presumably stores
    # the same chunks again under new IDs — confirm whether add_documents
    # de-duplicates before executing it repeatedly.
    print("Adding {} chunks to vector store...".format(len(chunks)))
    doc_ids = vector_store.add_documents(chunks)
    print("Added {} chunks with IDs: {}...".format(len(doc_ids), doc_ids[:3]))

    # Re-read the statistics so the new document count is visible
    stats = vector_store.get_stats()
    print("Updated Vector DB stats:", stats)

Adding 136 chunks to vector store...


2025-04-25 12:11:50,495 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-04-25 12:11:51,711 - src.vector_store.embeddings - INFO - Generated 136 embeddings
2025-04-25 12:11:51,994 - src.vector_store.store - INFO - Added 136 documents to vector store


Added 136 chunks with IDs: ['aebf693d-02d2-4049-98be-7a74ca7f23ef', '4c81b0a7-a83e-4ba7-a61c-e7118c350b6f', '5bf60538-0773-4026-b23b-d39ff20957d3']...
Updated Vector DB stats: {'collection_name': 'fortune500_docs', 'document_count': 37829, 'path': '/Users/mdwaquarahmad/Documents/fortune500-rag/chroma_db'}


In [6]:
# Test search functionality
test_query = "What was Amazon's revenue in 2023?"
print("Searching for: '{}'".format(test_query))

search_results = vector_store.search(test_query)
print("Found {} results".format(len(search_results)))

# Show at most the first three hits: score, a 150-character text preview, metadata.
# NOTE(review): the recorded output lists the same chunk with the same score for
# every displayed hit — the collection may hold duplicate entries; verify.
for rank, hit in enumerate(search_results[:3], start=1):
    print("\nResult {} (score: {:.4f}):".format(rank, hit['score']))
    print("Text: {}...".format(hit['text'][:150]))
    print("Metadata: {}".format(hit['metadata']))

Searching for: 'What was Amazon's revenue in 2023?'


2025-04-25 12:12:20,547 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-04-25 12:12:20,599 - src.vector_store.store - INFO - Found 5 results for query


Found 5 results

Result 1 (score: 0.6782):
Text: By segment, North
America revenue increased 12% Y oY from $316B to $353B, International revenue grew 11% Y oY from
$118B to $131B, and AWS revenue inc...
Metadata: {'total_chunks': 10, 'company': 'Amazon com Inc 2023 Annual Report', 'source': 'Amazon-com-Inc-2023-Annual-Report.pdf', 'chunk': 2, 'page': 2, 'total_pages': 92}

Result 2 (score: 0.6782):
Text: By segment, North
America revenue increased 12% Y oY from $316B to $353B, International revenue grew 11% Y oY from
$118B to $131B, and AWS revenue inc...
Metadata: {'total_pages': 92, 'page': 2, 'company': 'Amazon com Inc 2023 Annual Report', 'total_chunks': 10, 'source': 'Amazon-com-Inc-2023-Annual-Report.pdf', 'chunk': 2}

Result 3 (score: 0.6782):
Text: By segment, North
America revenue increased 12% Y oY from $316B to $353B, International revenue grew 11% Y oY from
$118B to $131B, and AWS revenue inc...
Metadata: {'page': 2, 'total_pages': 92, 'total_chunks': 10, 'company': 'Amazon

In [7]:
# Test Response Generation
from src.llm.response_generator import ResponseGenerator

# Create the response generator with no arguments
response_generator = ResponseGenerator()

# A response is only generated when the earlier search returned something
if not search_results:
    print("No search results available for response generation")
else:
    print("Generating response...")
    result = response_generator.generate_response(test_query, search_results)

    print("\nGenerated Response:")
    print(result["response"])

    # "sources" is optional in the result; nothing is listed when it is absent
    print("\nSources:")
    for position, source in enumerate(result.get("sources", []), start=1):
        print("Source {}: {}".format(position, source))

2025-04-25 12:14:54,947 - src.llm.response_generator - INFO - Initialized response generator with model: gpt-4o
2025-04-25 12:14:54,947 - src.llm.response_generator - INFO - Generating response for query: What was Amazon's revenue in 2023?


Generating response...


2025-04-25 12:14:56,687 - httpx - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-04-25 12:14:56,708 - src.llm.response_generator - INFO - Raw LLM response received: content="I don't have enough information to answer this question. The provided context includes revenue figures for Amazon's North America, International, and AWS segments, but it does not provide the total revenue for Amazon in 2023. Additional information on Amazon's total revenue would be needed to answer this question. The source of the provided information is Amazon.com Inc's 2023 Annual Report." additional_kwargs={'refusal': None} response_metadata={'token_usage': {'completion_tokens': 76, 'prompt_tokens': 1037, 'total_tokens': 1113, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_name': 'gpt-4o-2024-08-06', 'sy


Generated Response:
I don't have enough information to answer this question. The provided context includes revenue figures for Amazon's North America, International, and AWS segments, but it does not provide the total revenue for Amazon in 2023. Additional information on Amazon's total revenue would be needed to answer this question. The source of the provided information is Amazon.com Inc's 2023 Annual Report.

Sources:
Source 1: {'total_chunks': 10, 'company': 'Amazon com Inc 2023 Annual Report', 'source': 'Amazon-com-Inc-2023-Annual-Report.pdf', 'chunk': 2, 'page': 2, 'total_pages': 92}
Source 2: {'total_pages': 92, 'page': 2, 'company': 'Amazon com Inc 2023 Annual Report', 'total_chunks': 10, 'source': 'Amazon-com-Inc-2023-Annual-Report.pdf', 'chunk': 2}
Source 3: {'page': 2, 'total_pages': 92, 'total_chunks': 10, 'company': 'Amazon com Inc 2023 Annual Report', 'chunk': 2, 'source': 'Amazon-com-Inc-2023-Annual-Report.pdf'}
Source 4: {'page': 2, 'chunk': 2, 'total_chunks': 10, 'com