In [2]:
import re, os
import tiktoken
import requests
from urllib.parse import urljoin

# Add dotenv for environment variable loading
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get Anthropic API key from environment variables
anthropic_api_key = os.getenv("ANTHROPIC_API_KEY")
if not anthropic_api_key:
    print("Warning: ANTHROPIC_API_KEY not found in environment variables. Make sure to set it in the .env file.")

from bs4 import BeautifulSoup, Tag
from langchain_community.document_loaders import RecursiveUrlLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain_openai import OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_community.vectorstores import SKLearnVectorStore

def count_tokens(text, model="cl100k_base"):
    """
    Count the number of tokens in the text using tiktoken.
    
    Args:
        text (str): The text to count tokens for
        model (str): The tokenizer model to use (default: cl100k_base for GPT-4)
        
    Returns:
        int: Number of tokens in the text
    """
    encoder = tiktoken.get_encoding(model)
    return len(encoder.encode(text))

def bs4_extractor(html: str, base_url=None) -> str:
    """
    Extract and clean up the main content from HTML using BeautifulSoup.
    
    This function also follows meta-refresh redirects to ensure the latest content is fetched.
    
    Args:
        html (str): The HTML content to extract text from
        base_url (str): The base URL for resolving relative redirects (default: None)
        
    Returns:
        str: The extracted and cleaned content
    """
    soup = BeautifulSoup(html, "lxml")
    
    # Check for meta-refresh redirect
    meta = soup.find('meta', attrs={'http-equiv': re.compile('refresh', re.I)})
    if meta and isinstance(meta, Tag) and meta.attrs.get('content'):
        content_value = meta.attrs.get('content')
        # Ensure content_value is a string
        if isinstance(content_value, str):
            match = re.search(r'url=([^;]+)', content_value, re.IGNORECASE)
            if match and base_url:
                redirect_url = urljoin(base_url, match.group(1).strip())
                print(f"Following meta-refresh redirect to: {redirect_url}")
                resp = requests.get(redirect_url, timeout=15)
                soup = BeautifulSoup(resp.text, "lxml")
    
    # Target the main article content for LangGraph documentation 
    main_content = soup.find("article", class_="md-content__inner")
    
    # If found, use that, otherwise fall back to the whole document
    content = main_content.get_text() if main_content else soup.text
    
    # Clean up whitespace
    content = re.sub(r"\n\n+", "\n\n", content).strip()
    
    return content

def load_langgraph_docs():
    """
    Load LangGraph documentation from the official website.
    
    This function:
    1. Uses RecursiveUrlLoader to fetch pages from the LangGraph website
    2. Counts the total documents and tokens loaded
    
    Returns:
        list: A list of Document objects containing the loaded content
        list: A list of tokens per document
    """
    print("Loading LangGraph documentation...")

    # Load the documentation 
    # Appended index.html to URLs ending with / to directly access the content page
    urls = [
        "https://langchain-ai.github.io/langgraph/concepts/",
        "https://langchain-ai.github.io/langgraph/how-tos/",
        "https://langchain-ai.github.io/langgraph/tutorials/workflows/",
        "https://langchain-ai.github.io/langgraph/tutorials/introduction/",
        "https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/",
    ]

    docs = []
    for url in urls:
        print(f"Attempting to load URL: {url}")
        try:
            resp = requests.get(url, timeout=15)
            text = bs4_extractor(resp.text, base_url=url)
            from langchain_core.documents import Document
            doc = Document(page_content=text, metadata={"source": url})
            docs.append(doc)
            print(f"Successfully loaded content from {url}")
        except Exception as e:
            print(f"Error loading {url}: {e}")

    print(f"Loaded {len(docs)} documents from LangGraph documentation.")
    print("\nLoaded URLs:")
    for i, doc in enumerate(docs):
        print(f"{i+1}. {doc.metadata.get('source', 'Unknown URL')}")
    
    # Count total tokens in documents
    total_tokens = 0
    tokens_per_doc = []
    for doc in docs:
        total_tokens += count_tokens(doc.page_content)
        tokens_per_doc.append(count_tokens(doc.page_content))
    print(f"Total tokens in loaded documents: {total_tokens}")
    
    return docs, tokens_per_doc

def save_llms_full(documents):
    """ Save the documents to a file """

    # Open the output file
    output_filename = "llms_full.txt"

    with open(output_filename, "w") as f:
        # Write each document
        for i, doc in enumerate(documents):
            # Get the source (URL) from metadata
            source = doc.metadata.get('source', 'Unknown URL')
            
            # Write the document with proper formatting
            f.write(f"DOCUMENT {i+1}\n")
            f.write(f"SOURCE: {source}\n")
            f.write("CONTENT:\n")
            f.write(doc.page_content)
            f.write("\n\n" + "="*80 + "\n\n")

    print(f"Documents concatenated into {output_filename}")

def split_documents(documents):
    """
    Split documents into smaller chunks for improved retrieval.
    
    This function:
    1. Uses RecursiveCharacterTextSplitter with tiktoken to create semantically meaningful chunks
    2. Ensures chunks are appropriately sized for embedding and retrieval
    3. Counts the resulting chunks and their total tokens
    
    Args:
        documents (list): List of Document objects to split
        
    Returns:
        list: A list of split Document objects
    """
    print("Splitting documents...")
    
    # Initialize text splitter using tiktoken for accurate token counting
    # chunk_size adjusted to better match the bge-large-en-v1.5 model's context window
    # chunk_overlap adjusted proportionally
    text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=1500,  # ~512 tokens, matching model's context window
        chunk_overlap=150  # Adjusted overlap (10% of chunk size)
    )
    
    # Split documents into chunks
    split_docs = text_splitter.split_documents(documents)
    
    print(f"Created {len(split_docs)} chunks from documents.")
    
    # Count total tokens in split documents
    total_tokens = 0
    for doc in split_docs:
        total_tokens += count_tokens(doc.page_content)
    
    print(f"Total tokens in split documents: {total_tokens}")
    
    return split_docs

def create_vectorstore(splits):
    """
    Create a vector store from document chunks using SKLearnVectorStore.
    
    This function:
    1. Initializes an embedding model to convert text into vector representations
    2. Creates a vector store from the document chunks
    
    Args:
        splits (list): List of split Document objects to embed
        
    Returns:
        SKLearnVectorStore: A vector store containing the embedded documents
    """
    print("Creating SKLearnVectorStore...")
    
    # Initialize free embeddings from SentenceTransformers
    from langchain_community.embeddings import HuggingFaceEmbeddings
    # You'll need to install the following packages:
    # pip install sentence-transformers
    # pip install langchain-community
    
    # Using BAAI/bge-large-en-v1.5 which can handle up to 512 tokens and provides better performance for retrieval tasks compared to all-MiniLM-L6-v2

    embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
    
    # Create vector store from documents using SKLearn
    persist_path = os.getcwd()+"/sklearn_vectorstore.parquet"
    vectorstore = SKLearnVectorStore.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_path=persist_path   ,
        serializer="parquet",
    )
    print("SKLearnVectorStore created successfully.")
    
    vectorstore.persist()
    print("SKLearnVectorStore was persisted to", persist_path)

    return vectorstore

In [3]:
# Load the documents
documents, tokens_per_doc = load_langgraph_docs()

# Save the documents to a file
save_llms_full(documents)

# Split the documents
split_docs = split_documents(documents)

# Create the vector store
vectorstore = create_vectorstore(split_docs)

Loading LangGraph documentation...
Attempting to load URL: https://langchain-ai.github.io/langgraph/concepts/
Following meta-refresh redirect to: https://langchain-ai.github.io/langgraph/
Successfully loaded content from https://langchain-ai.github.io/langgraph/concepts/
Attempting to load URL: https://langchain-ai.github.io/langgraph/how-tos/
Following meta-refresh redirect to: https://langchain-ai.github.io/langgraph/
Successfully loaded content from https://langchain-ai.github.io/langgraph/how-tos/
Attempting to load URL: https://langchain-ai.github.io/langgraph/tutorials/workflows/
Successfully loaded content from https://langchain-ai.github.io/langgraph/tutorials/workflows/
Attempting to load URL: https://langchain-ai.github.io/langgraph/tutorials/introduction/
Following meta-refresh redirect to: https://langchain-ai.github.io/langgraph/concepts/why-langgraph
Successfully loaded content from https://langchain-ai.github.io/langgraph/tutorials/introduction/
Attempting to load URL: h

  embeddings = HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5")
  from .autonotebook import tqdm as notebook_tqdm


SKLearnVectorStore created successfully.
SKLearnVectorStore was persisted to /home/polivei/Desktop/ccode/langgraph-rag-mcp/sklearn_vectorstore.parquet


In [5]:
# Create retriever to get relevant documents (k=3 means return top 3 matches)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
    
# Get relevant documents for the query
query = "explain the difference between LangChain and LangGraph"    
relevant_docs = retriever.invoke(query)
print(f"Retrieved {len(relevant_docs)} relevant documents")

for d in relevant_docs:
    print(d.metadata['source'])
    print(d.page_content[0:500])
    print("\n--------------------------------\n")

Retrieved 3 relevant documents
https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/
LangGraph Platform quickstart¶
This guide shows you how to run a LangGraph application locally.
Prerequisites¶
Before you begin, ensure you have the following:

An API key for LangSmith - free to sign up

1. Install the LangGraph CLI¶
Python serverNode server

# Python >= 3.11 is required.

pip install --upgrade "langgraph-cli[inmem]"

npx @langchain/langgraph-cli

2. Create a LangGraph app 🌱¶
Create a new app from the new-langgraph-project-python template or new-langgraph-project-js template. T

--------------------------------

https://langchain-ai.github.io/langgraph/tutorials/langgraph-platform/local-server/
LangGraph Platform quickstart¶
This guide shows you how to run a LangGraph application locally.
Prerequisites¶
Before you begin, ensure you have the following:

An API key for LangSmith - free to sign up

1. Install the LangGraph CLI¶
Python serverNode server

# Py

In [6]:
from langchain_core.tools import tool
from langchain_community.embeddings import HuggingFaceEmbeddings

@tool
def langgraph_query_tool(query: str):
    """
    Query the LangGraph documentation using a retriever.
    
    Args:
        query (str): The query to search the documentation with

    Returns:
        str: A str of the retrieved documents
    """
    retriever = SKLearnVectorStore(
    embedding=HuggingFaceEmbeddings(model_name="BAAI/bge-large-en-v1.5"), 
    persist_path=os.getcwd()+"/sklearn_vectorstore.parquet", 
    serializer="parquet").as_retriever(search_kwargs={"k": 3})

    relevant_docs = retriever.invoke(query)
    print(f"Retrieved {len(relevant_docs)} relevant documents")
    formatted_context = "\n\n".join([f"==DOCUMENT {i+1}==\n{doc.page_content}" for i, doc in enumerate(relevant_docs)])
    return formatted_context

['class langchain_anthropic.chat_models.ChatAnthropic'](https://python.langchain.com/api_reference/anthropic/chat_models/langchain_anthropic.chat_models.ChatAnthropic.html)

['langchain-anthropic'](https://python.langchain.com/docs/integrations/providers/anthropic/)

In [7]:
from langchain_anthropic import ChatAnthropic
llm = ChatAnthropic(model_name="claude-3-7-sonnet-latest", timeout=None, temperature=0.0, stop=None)
augmented_llm = llm.bind_tools([langgraph_query_tool])

instructions = """You are a helpful assistant that can answer questions about the LangGraph documentation. 
Use the langgraph_query_tool for any questions about the documentation.
If you don't know the answer, say "I don't know."""

messages = [
    {"role": "system", "content": instructions},
    {"role": "user", "content": "explain the difference between LangChain and LangGraph"},
]

message = augmented_llm.invoke(messages)
message.pretty_print()


[{'text': "I'll help explain the difference between LangChain and LangGraph. Let me search the documentation for this information.", 'type': 'text'}, {'id': 'toolu_01GzQBW2Guw6xXqiA2UMBFC2', 'input': {'query': 'difference between LangChain and LangGraph'}, 'name': 'langgraph_query_tool', 'type': 'tool_use'}]
Tool Calls:
  langgraph_query_tool (toolu_01GzQBW2Guw6xXqiA2UMBFC2)
 Call ID: toolu_01GzQBW2Guw6xXqiA2UMBFC2
  Args:
    query: difference between LangChain and LangGraph
