### Import Statements

In [None]:
import os
from langchain_community.document_loaders import PyPDFLoader, TextLoader, UnstructuredMarkdownLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_community.vectorstores import Chroma
from dotenv import load_dotenv

load_dotenv()

### Configuration

In [None]:
# Folder that is scanned for source documents.
DATA_DIR = "./data"
# On-disk location handed to ChromaDB as its persistence directory.
CHROMA_DB_DIR = "./chroma_db"
# Name of the ChromaDB collection that receives the embedded chunks.
COLLECTION_NAME = "customer_support_knowledge"
# API key for the Google embeddings client. Note that it is read from the
# GEMINI_API_KEY environment variable (None when the variable is unset).
GOOGLE_API_KEY = os.environ.get("GEMINI_API_KEY")

### Load Documents

In [None]:
def load_documents(data_dir):
    """Load every supported document found directly inside ``data_dir``.

    Files are dispatched on their extension: ``.pdf`` files go through
    ``PyPDFLoader``, ``.txt`` files through ``TextLoader`` and ``.md`` files
    through ``UnstructuredMarkdownLoader``. Any other directory entry is
    reported and skipped. Sub-directories are not descended into.

    Args:
        data_dir: Path of the directory to scan.

    Returns:
        A flat list with the documents produced by all loaders, in the order
        the directory entries were listed.
    """
    documents = []
    for filename in os.listdir(data_dir):
        file_path = os.path.join(data_dir, filename)
        if filename.endswith(".pdf"):
            # Load once and reuse the result: calling load() a second time
            # just for the log line would parse the whole PDF again.
            pages = PyPDFLoader(file_path).load()
            documents.extend(pages)
            print(f"Loaded {len(pages)} pages from {filename}")
        elif filename.endswith(".txt"):
            documents.extend(TextLoader(file_path).load())
            print(f"Loaded text from {filename}")
        elif filename.endswith(".md"):
            # UnstructuredMarkdownLoader is good for parsing Markdown structure.
            # The loaded documents must be added to the result; previously the
            # loader was created but never run, so Markdown files were dropped.
            documents.extend(UnstructuredMarkdownLoader(file_path).load())
            print(f"Loaded markdown from {filename}")
        else:
            print(f"Skipping unsupported file type: {filename}")
    return documents

### Chunking Documents

In [None]:
def chunk_documents(documents):
    """Split the loaded documents into overlapping chunks.

    Args:
        documents: Documents to split, as returned by the loaders.

    Returns:
        The list of chunks produced by ``RecursiveCharacterTextSplitter``.
    """
    splitter_options = {
        # Upper bound on chunk length, measured with ``len`` (characters).
        "chunk_size": 1000,
        # Amount shared between neighbouring chunks to keep context.
        "chunk_overlap": 200,
        "length_function": len,
        # Tried in order: paragraphs, lines, words, then single characters.
        "separators": ["\n\n", "\n", " ", ""],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    chunks = splitter.split_documents(documents)
    print(f"Split {len(documents)} documents into {len(chunks)} chunks.")
    return chunks

### Create Embeddings

In [None]:
def create_and_store_embeddings(chunks, chroma_db_dir, collection_name, google_api_key):
    """Embed ``chunks`` and store them in a persistent Chroma collection.

    Args:
        chunks: Document chunks to embed and store.
        chroma_db_dir: Directory passed to Chroma as ``persist_directory``.
        collection_name: Name of the Chroma collection to write to.
        google_api_key: API key forwarded to the Google embeddings client.

    Returns:
        The ``Chroma`` vector store built from the chunks.
    """
    # Embedding client backed by the "models/embedding-001" model.
    embedder = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=google_api_key,
    )

    print(f"Creating/loading ChromaDB at {chroma_db_dir} with collection '{collection_name}'...")
    # from_documents embeds every chunk and writes the result under
    # chroma_db_dir in the named collection.
    store = Chroma.from_documents(
        documents=chunks,
        embedding=embedder,
        persist_directory=chroma_db_dir,
        collection_name=collection_name,
    )
    print("Vector database created/updated successfully.")
    return store

### Running the Pipeline

In [None]:
# Driver cell: runs the pipeline end to end (load -> chunk -> embed/store).
# The globals assigned here, in particular `vector_db`, stay available to
# later cells.
print("--- Starting Phase 1: Building Knowledge Base ---")

# Ensure the data directory exists; without it there is nothing to load,
# so stop before any further work is attempted.
if not os.path.exists(DATA_DIR):
    print(f"Error: Data directory '{DATA_DIR}' not found. Please create it and add your documents.")
    exit()

# Load documents. An empty result means the directory held no supported
# files, so stop rather than build an empty knowledge base.
raw_documents = load_documents(DATA_DIR)
if not raw_documents:
    print("No documents loaded. Please check your data directory and file types.")
    exit()

# Chunk documents
document_chunks = chunk_documents(raw_documents)

# Create and store embeddings in ChromaDB
# The 'vector_db' object is returned, which you can use for testing
vector_db = create_and_store_embeddings(
    document_chunks,
    CHROMA_DB_DIR,
    COLLECTION_NAME,
    GOOGLE_API_KEY
)

print(f"\n--- Phase 1: Knowledge Base Built Successfully! ---")
print(f"Vector database stored at: {CHROMA_DB_DIR}")

### Testing

In [None]:
# Smoke test: run a fixed set of sample questions through the vector store
# and print what comes back for each one.
print("\n--- Performing Basic Retrieval Test ---")
# k=1 asks the retriever for a single result per query.
retriever = vector_db.as_retriever(search_kwargs={"k": 1})

# Sample customer-support questions used as retrieval queries.
test_queries = [
    "My QuantumFlow purifier is showing a red light on its filter status indicator. What does this mean, and what should I do",
    "What kind of purification stages does the QuantumFlow QF-2025 use, and what is its water storage capacity?",
    "I just installed my new QuantumFlow purifier, but the water flow is very slow. What could be the potential reasons for this issue, and what's the first step I should take?",
    "How do I connect my QuantumFlow purifier to the mobile app, and what are some features I can access through the app?",
    "My purified water has a strange taste. Could this be normal for a new filter, or is it a sign of a bigger problem?",
    "What is the warranty period for the QuantumFlow QF-2025, and does it cover the cost of filter replacements?",
    "If I need to buy a replacement sediment filter, what is its specific model number, and where should I purchase genuine filters?"
]

for i, query in enumerate(test_queries):
    print(f"\nTest Query {i+1}: {query}")
    # Errors are caught per query so one failing lookup does not abort the
    # remaining test queries.
    try:
        # Retrieve relevant chunks
        results = retriever.invoke(query)
        print(f"Found {len(results)} relevant chunks.")
        for j, doc in enumerate(results):
            print(f"  Chunk {j+1} (Source: {doc.metadata.get('source', 'N/A')}):")
            print(f"    Content: {doc.page_content}") # Prints the full chunk text (no truncation is applied)
    except Exception as e:
        print(f"  Error during retrieval: {e}")

print("\n--- Phase 1 Complete! ---")