# Managing Vector Store from HTML Documents

In [None]:
# Import required libraries
import os
import warnings
import requests
from bs4 import BeautifulSoup
from langchain.docstore.document import Document  # Import Document class
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import faiss

def load_html_documents(urls, timeout=10):
    """
    Load HTML documents from a list of URLs.

    Each URL is fetched with ``requests``, parsed with BeautifulSoup's
    ``html.parser`` and reduced to its text content. Loading is best-effort:
    a URL that cannot be fetched or parsed is reported on stdout and skipped,
    so the result may contain fewer documents than there were URLs.

    Args:
        urls (list): List of URLs to fetch HTML content from
        timeout (float or tuple): Timeout passed to ``requests.get``. Without
            one, a single unresponsive host could block the loader forever.

    Returns:
        list: List of Document objects, one per successfully loaded URL, each
            carrying the originating URL under the ``"source"`` metadata key
    """
    docs = []

    for url in urls:
        try:
            # Bound the request so a stalled server cannot hang the whole run
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract text content from the HTML, one text node per line
            text = soup.get_text(separator="\n")
            # Convert to Document object
            docs.append(Document(page_content=text, metadata={"source": url}))
        except Exception as e:
            # Deliberately broad: one bad URL must not abort the remaining ones
            print(f"Failed to load {url}: {e}")

    return docs

def chunk_documents(docs, chunk_size=1000, chunk_overlap=100):
    """
    Break documents into smaller pieces with a RecursiveCharacterTextSplitter.

    Args:
        docs (list): Document objects to split
        chunk_size (int): ``chunk_size`` setting handed to the splitter
        chunk_overlap (int): ``chunk_overlap`` setting handed to the splitter

    Returns:
        list: The chunks produced by the splitter's ``split_documents``
    """
    splitter_settings = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_settings)
    pieces = splitter.split_documents(docs)
    return pieces

def create_vector_store(chunks, embedding_model='nomic-embed-text', base_url='http://localhost:11434'):
    """
    Build a FAISS vector store and populate it with the given chunks.

    Args:
        chunks (list): Document chunks to add to the store
        embedding_model (str): Model name passed to OllamaEmbeddings
        base_url (str): Base URL passed to OllamaEmbeddings

    Returns:
        FAISS: Vector store backed by an in-memory docstore, with the chunks added
    """
    embedder = OllamaEmbeddings(model=embedding_model, base_url=base_url)

    # Embed a throwaway string only to learn the vector length that the
    # FAISS index has to be created with.
    dimension = len(embedder.embed_query("Sample Text"))

    store = FAISS(
        embedding_function=embedder,
        index=faiss.IndexFlatL2(dimension),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
    )

    # Embed the chunks and insert them into the store
    store.add_documents(documents=chunks)
    return store

def print_retrieved_docs(retrieved_docs, max_length=500):
    """
    Write a human-readable report of retrieved documents to stdout.

    For every document the report shows its 1-based position, the ``score``
    and ``source`` metadata entries (``N/A`` / ``Unknown`` when absent) and
    its content, clipped to ``max_length`` characters.

    Args:
        retrieved_docs (list): Documents exposing ``metadata`` and ``page_content``
        max_length (int): Maximum number of content characters shown per document
    """
    divider = "-" * 50

    print("\n--- Retrieved Documents ---")
    print(f"Total documents retrieved: {len(retrieved_docs)}")
    print(divider)

    for position, document in enumerate(retrieved_docs, start=1):
        print(f"\nDocument {position}:")
        print(f"Score: {document.metadata.get('score', 'N/A')}")
        print(f"Source: {document.metadata.get('source', 'Unknown')}")

        # Long content is clipped and flagged so the report stays readable
        body = document.page_content
        shown = body if len(body) <= max_length else body[:max_length] + "... [truncated]"

        print("\nContent:")
        print(shown)
        print(divider)

if __name__ == "__main__":
    # Script entry point: fetch the pages, split them into chunks and report
    # the counts. Building, querying and saving the vector store are kept
    # below as opt-in steps (commented out).

    # Silence all warnings for cleaner output (optional)
    warnings.filterwarnings("ignore")

    # Pages to ingest
    urls = [
        "https://python.langchain.com/docs/introduction/",
        "https://python.langchain.com/docs/concepts/chat_models/",
        "https://python.langchain.com/docs/concepts/vectorstores/",
    ]

    # Step 1: download each page and turn it into a Document
    docs = load_html_documents(urls)
    print(f"Total documents loaded: {len(docs)}")

    # Step 2: split the Documents into chunks
    chunks = chunk_documents(docs)
    print(f"Total document chunks: {len(chunks)}")

    # Step 3 (opt-in): embed the chunks into a vector store
    # vector_store = create_vector_store(chunks)

    # Step 4 (opt-in): example retrieval against the store
    # question = "Which usecases  LangChain is used for?"
    # try:
    #     retrieved_docs = vector_store.similarity_search(query=question, k=5)
    #     if not retrieved_docs:
    #         print("No documents retrieved.")
    #     else:
    #         print_retrieved_docs(retrieved_docs)
    # except Exception as e:
    #     print(f"Error during similarity search: {e}")

    # Step 5 (opt-in): persist the vector store to disk
    # db_name = "../langchain_docs"
    # try:
    #     vector_store.save_local(db_name)
    #     print(f"Vector store saved to {db_name}.")
    # except Exception as e:
    #     print(f"Failed to save vector store: {e}")

Total documents loaded: 3
Total document chunks: 64

--- Retrieved Documents ---
Total documents retrieved: 5
--------------------------------------------------

Document 1:
Score: N/A
Source: https://python.langchain.com/docs/introduction/

Content:
LangChain
 is a framework for developing applications powered by large language models (LLMs).


LangChain simplifies every stage of the LLM application lifecycle:




Development
: Build your applications using LangChain's open-source 
components
 and 
third-party integrations
.
Use 
LangGraph
 to build stateful agents with first-class streaming and human-in-the-loop support.


Productionization
: Use 
LangSmith
 to inspect, monitor and evaluate your applications, so that you can continuously o... [truncated]
--------------------------------------------------

Document 2:
Score: N/A
Source: https://python.langchain.com/docs/introduction/

Content:
Architecture
 page.




langchain-core
: Base abstractions for chat models and other compone