# Managing Vector Store from PDF Documents



In [1]:
# Import required libraries
import os
import warnings
import tiktoken
import faiss
from dotenv import load_dotenv

# Document Loading Libraries
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore

def load_pdf_documents(directory):
    """
    Load PDF documents from a specified directory.

    The directory is walked recursively. Files are matched on a
    case-insensitive ``.pdf`` extension and visited in sorted order, so the
    order of the returned list is reproducible between runs.

    Args:
        directory (str): Path to the directory containing PDF files

    Returns:
        list: Everything returned by ``PyMuPDFLoader(...).load()`` for each
            PDF found, concatenated in traversal order. Empty when no PDF
            file is found.
    """
    pdfs = []
    docs = []

    # Find all PDF files in the specified directory
    for root, dirs, files in os.walk(directory):
        # os.walk descends into ``dirs`` in list order; sorting it in place
        # makes the traversal order deterministic.
        dirs.sort()
        pdfs.extend(
            os.path.join(root, file)
            for file in sorted(files)
            # Lower-case the name so "REPORT.PDF" is picked up as well
            if file.lower().endswith(".pdf")
        )

    # Load each PDF document
    for pdf in pdfs:
        loader = PyMuPDFLoader(pdf)
        docs.extend(loader.load())

    return docs

def chunk_documents(docs, chunk_size=1000, chunk_overlap=100):
    """
    Break documents into smaller chunks.

    Args:
        docs (list): Documents to split
        chunk_size (int): ``chunk_size`` handed to the splitter
        chunk_overlap (int): ``chunk_overlap`` handed to the splitter

    Returns:
        list: Result of ``RecursiveCharacterTextSplitter.split_documents``
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents(docs)
    return pieces

def create_vector_store(chunks, embedding_model='nomic-embed-text', base_url='http://localhost:11434'):
    """
    Create a vector store from document chunks.

    Args:
        chunks (list): List of document chunks
        embedding_model (str): Name of the embedding model
        base_url (str): Base URL for Ollama embeddings

    Returns:
        FAISS: Vector store with the chunks added. When ``chunks`` is empty
            the store is returned without any documents added.
    """
    # Initialize embeddings
    embeddings = OllamaEmbeddings(model=embedding_model, base_url=base_url)

    # Embed a throwaway string only to learn the vector length the index needs
    probe_vector = embeddings.embed_query("Hello World")

    # Create FAISS index sized to the embedding length
    index = faiss.IndexFlatL2(len(probe_vector))
    vector_store = FAISS(
        embedding_function=embeddings,
        index=index,
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    # Skip the add step when there is nothing to embed, so an empty input
    # yields an empty store instead of an add call with no documents.
    if chunks:
        vector_store.add_documents(documents=chunks)

    return vector_store
def print_retrieved_docs(retrieved_docs, max_length=500):
    """
    Write a readable summary of retrieved documents to standard output.

    For each document this prints its 1-based position, the ``score`` and
    ``source`` metadata entries (``N/A`` / ``Unknown`` when absent) and its
    content, cut to ``max_length`` characters with a truncation marker.

    Args:
        retrieved_docs (list): Documents exposing ``metadata`` and ``page_content``
        max_length (int): Maximum length of content to display
    """
    separator = "-" * 50

    print("\n--- Retrieved Documents ---")
    print(f"Total documents retrieved: {len(retrieved_docs)}")
    print(separator)

    for position, document in enumerate(retrieved_docs, start=1):
        print(f"\nDocument {position}:")
        print(f"Score: {document.metadata.get('score', 'N/A')}")
        print(f"Source: {document.metadata.get('source', 'Unknown')}")

        # Only over-long content gets cut and marked
        body = document.page_content
        shown = body if len(body) <= max_length else body[:max_length] + "... [truncated]"

        print("\nContent:")
        print(shown)
        print(separator)

if __name__ == "__main__":
    # Script entry point: load the PDFs, chunk them, build the vector store,
    # run one sample query, then save the store to disk.

    # Silence every warning for this run (optional)
    warnings.filterwarnings('ignore')

    # Step 1: read the PDFs and report how many documents came back
    docs = load_pdf_documents("../dataset/health_docs")
    print(f"Total documents loaded: {len(docs)}")

    # Step 2: split into chunks using the default size and overlap
    chunks = chunk_documents(docs)
    print(f"Total document chunks: {len(chunks)}")

    # Optional token-length sanity check on the first few chunks:
    # encoding = tiktoken.encoding_for_model("gpt-4o-mini")
    # token_lengths = [len(encoding.encode(chunk.page_content)) for chunk in chunks[:3]]
    # print(f"Token lengths of first 3 chunks: {token_lengths}")

    # Step 3: embed the chunks and index them
    vector_store = create_vector_store(chunks)

    # Step 4: sample similarity query asking for five results
    question = "What nutritional supplements support muscle protein synthesis?"
    retrieved_docs = vector_store.search(
        query=question,
        search_type="similarity",
        k=5,
    )
    print_retrieved_docs(retrieved_docs)

    # Step 5 (optional): save the vector store
    db_name = "../health_docs"
    vector_store.save_local(db_name)

Total documents loaded: 38
Total document chunks: 201

--- Retrieved Documents ---
Total documents retrieved: 5
--------------------------------------------------

Document 1:
Score: N/A
Source: ../dataset/health_docs/dietary supplements.pdf

Content:
supplements mean products that are concentrated sources of vitamins, minerals, or other
substances with a nutritional or physiological effect (e.g., amino acids, essential fatty acids,
probiotics, plants, and herbal extracts) intended to supplement the regular diet. Dietary
supplements are produced in the form of capsules, tablets, pills, and other similar forms,
designed to be taken in measured small unit quantities [1,2]. Dietary supplements, despite
their route of administration and drug-like... [truncated]
--------------------------------------------------

Document 2:
Score: N/A
Source: ../dataset/health_docs/health supplements.pdf

Content:
women consuming isoflavone supplements (59) and, given the clear evidence of 
estrogenicity, 