In [None]:
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python gpt4all firecrawl-py==3.4.0

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# SETUP INSTRUCTIONS:
# 1. Create a .env file in the project root directory (or copy .env.example to .env)
# 2. Add your API keys to the .env file (see .env.example for format)
# 3. Get your API keys from:
#    - LangChain: https://smith.langchain.com/
#    - FireCrawl: https://firecrawl.dev/
#    - Tavily: https://tavily.com/

In [None]:
# Model name handed to every ChatOllama instance created in the cells below.
local_llm = 'llama3'
# Alternative model tag (uncomment to switch):
# local_llm = 'gemma3:270m'

In [None]:
### Index

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
# from langchain_community.document_loaders import FireCrawlLoader
from firecrawl import FirecrawlApp
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.docstore.document import Document

# Drupal blog listing pages 1 through 8, in ascending page order.
urls = [f"https://www.drupal.org/blog?page={page}" for page in range(1, 9)]


app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])

# Attribute / key names that may carry the page text, in order of preference.
_CONTENT_FIELDS = ('page_content', 'content', 'markdown', 'text')


def _extract_content(scraped_data):
    """Return the first non-empty text field of a scrape result, or None.

    Every candidate field is tried in order, so a field that exists but is
    None/blank no longer hides a later field that actually holds the text.
    Both attribute-style objects and plain dicts are accepted.

    Args:
        scraped_data: Whatever ``app.scrape`` returned for one URL.

    Returns:
        str | None: The page text, or None when no candidate field holds
        a non-blank string.
    """
    for field in _CONTENT_FIELDS:
        if isinstance(scraped_data, dict):
            value = scraped_data.get(field)
        else:
            value = getattr(scraped_data, field, None)
        if isinstance(value, str) and value.strip():
            return value
    return None


# Each entry is a plain dict with 'page_content' and 'metadata' keys.
docs = []
for url in urls:
    try:
        scraped_data = app.scrape(url=url, formats=["markdown"])
    except Exception as e:
        # Best effort: one failing page must not abort the whole crawl.
        print(f"Failed to scrape {url}: {e}")
        continue

    if not scraped_data:
        print(f"No data returned for {url}")
        continue

    content = _extract_content(scraped_data)
    if content is None:
        print(f"No valid content found for {url}")
        continue

    docs.append({
        'page_content': content,
        'metadata': {'source': url}
    })

print(f"Number of documents scraped: {len(docs)}")
# print(docs[:2])  # Print first two documents for verification
# split documents
# docs_list = [item for sublist in docs for item in sublist]

# Turn each scraped record into a Document so the splitter can consume it.
doc_objects = [
    Document(page_content=record['page_content'], metadata=record['metadata'])
    for record in docs
]

# Splitter built via the tiktoken encoder factory: chunk_size=250, no overlap.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,
    chunk_overlap=0,
)
doc_splits = text_splitter.split_documents(doc_objects)

print(f"Number of documents before filtering: {len(doc_splits)}")

# Rebuild every chunk keeping only metadata values of simple scalar types
# (str, int, float, bool); anything that is not a Document is dropped.
filtered_docs = [
    Document(
        page_content=chunk.page_content,
        metadata={
            key: val
            for key, val in chunk.metadata.items()
            if isinstance(val, (str, int, float, bool))
        },
    )
    for chunk in doc_splits
    if isinstance(chunk, Document) and hasattr(chunk, 'metadata')
]

print(f"Number of documents after filtering: {len(filtered_docs)}")

# Add to vectorDB
# Fail fast with an actionable message when nothing was scraped: later cells
# index into the retrieval results and would otherwise fail far from the cause.
if not filtered_docs:
    raise ValueError(
        "No documents to index: every scrape failed or returned empty content. "
        "Check FIRECRAWL_API_KEY and the URL list, then re-run this cell."
    )

vectorstore = Chroma.from_documents(
    documents=filtered_docs,
    collection_name="rag-chroma",
    embedding=GPT4AllEmbeddings(),
)

# Retriever used by the demo cells and by the `retrieve` graph node.
retriever = vectorstore.as_retriever()

In [None]:
### Inspect the underlying Chroma collection

# `_collection` is a private attribute of the vector store wrapper.
collection = vectorstore._collection
print("Collection details:")
print(f"- Name: {collection.name}")
print(f"- Count: {collection.count()}")

# Fetch two stored records and show their ids, truncated text and metadata
peek_data = collection.peek(limit=2)
print("\nPeek at stored data:")

stored_ids = peek_data.get('ids', [])
print(f"- IDs: {stored_ids}")

# First 100 characters of each stored text, always followed by an ellipsis
previews = [text[:100] + '...' for text in peek_data.get('documents', [])]
print(f"- Documents preview: {previews}")

stored_metadatas = peek_data.get('metadatas', [])
print(f"- Metadatas: {stored_metadatas}")



In [None]:
### Retrieval Grader with LLM

from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser

#LLM
# format="json" is passed to ChatOllama; the chain below parses the reply with
# JsonOutputParser. temperature=0 presumably keeps grading as repeatable as possible.
llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Relevance-grading prompt. It asks for a JSON object with the single key 'score'
# whose value is 'yes' or 'no', and takes two variables: {document} and {question}.
# NOTE(review): the <|...|> markers look like the Llama 3 chat format (matching
# local_llm = 'llama3'); confirm before switching to a different model.
prompt = PromptTemplate(
  template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing relevance of a retrieved document to a user question. 
  If the document contains keywords related to the user question, grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
  Give a binary score 'yes' or 'no' based on the relevance of the document to the question. \n
  Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
   <|eot_id|><|start_header_id|>user<|end_header_id|>
   Here is the retrieved document: \n\n {document} \n\n
   Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
   input_variables=["document", "question"]
)

# Pipeline: fill the prompt -> call the LLM -> parse the JSON reply.
# The grade_documents graph node reads the 'score' key of the parsed result.
retrieval_grader = prompt | llm | JsonOutputParser()
# Smoke-test the grader on one retrieved chunk.
question = "what is going on in Drupal community?"
docs = retriever.invoke(question)
if docs:
    # Grade the second hit when there is one; fall back to the only hit otherwise
    # so a short result list no longer raises IndexError.
    docs_txt = docs[min(1, len(docs) - 1)].page_content
    print(retrieval_grader.invoke({"question": question, "document": docs_txt}))
else:
    print("Retriever returned no documents; nothing to grade.")


In [None]:
### Generate

from langchain.prompts import PromptTemplate
from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# LLM
# Plain-text generation model (no format="json"): its reply is read as a string.
llm = ChatOllama(model=local_llm, temperature=0.1)

# Prompt
# Answering prompt with two placeholders, {question} and {context}.
# input_variables now names exactly those placeholders (it previously listed
# "document", which does not appear in the template).
prompt = PromptTemplate(
  template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a domain specific chat assistant for question-answering tasks as per the context provided to you.
  You are chatting with regular non technical end user who doesn't need to know about which particular context is provided to you. Use the following pieces for retrived context to 
  answer the question. Please note, only the Question is asked by the end User, the Context however is provided to you automatically, and not by the End user. \n
  If you know the answer, reply in modest tone and layman terms, Don't say something like "according to the given context or given content, as the end user is not aware" \n
  If you don't know the answer, just say that you don't know, don't give any detailed reply in that case.
  Use five sentences maximum and keep the answer concise without complicating with technical jargon.<|eot_id|><|start_header_id|>user<|end_header_id|>
  Question: {question}
  Context: {context}
  Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
  input_variables=["question", "context"]
)

# post-processing
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line."""
    contents = [doc.page_content for doc in docs]
    return "\n\n".join(contents)

# Chain
# Pipeline: fill the prompt -> call the LLM -> return the reply as a plain string.
rag_chain = prompt | llm | StrOutputParser()

# Run
question = "What's the latest version of Drupal? When did it release"
docs = retriever.invoke(question)
# Render the retrieved chunks to plain text first; passing the Document list
# itself would put its Python repr (metadata included) into the prompt.
generation = rag_chain.invoke({"context": format_docs(docs), "question": question})
print(generation)


In [None]:
### Web search

from langchain_community.tools.tavily_search import TavilySearchResults

# `max_results` is the TavilySearchResults field that caps the number of hits;
# the previous `k=3` is not a field of this tool, so the intended limit of 3 was
# not applied. NOTE(review): confirm against the installed langchain_community version.
web_search_tool = TavilySearchResults(max_results=3)

In [None]:
### Hallucination Grader

# LLM
# format="json" is passed to ChatOllama; the chain below parses the reply with JsonOutputParser.
llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Prompt
# Groundedness-grading prompt: asks for a JSON object with the single key 'score'
# ('yes' or 'no'). Placeholders: {documents} (the facts) and {generation} (the answer).
prompt = PromptTemplate(
  template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing whether an answer is grounded 
  in / supported by a set of facts. Give a binary score 'yes' or 'no' score to indicate whether the answer is grounded in / supported 
  by a set of facts. Provide the binary score as a JSON with a single key 'score' and no preamble or explanation. 
  <|eot_id|><|start_header_id|>user<|end_header_id|>
  Here are the facts:
  \n ------- \n
  {documents}
  \n ------- \n
  Here is the answer: {generation} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
  input_variables=["generation", "documents"],
)

# Pipeline: fill the prompt -> call the LLM -> parse the JSON reply.
hallucination_grader = prompt | llm | JsonOutputParser()
# Quick check using `docs` and `generation` left over from the Generate cell;
# as the last expression of the cell, the parsed result is displayed.
hallucination_grader.invoke({"documents": docs, "generation": generation})


In [None]:
### Answer Grader

# LLM
# format="json" is passed to ChatOllama; the chain below parses the reply with JsonOutputParser.
llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Prompt
# Usefulness-grading prompt: asks for a JSON object with the single key 'score'
# ('yes' or 'no'). Placeholders: {generation} (the answer) and {question}.
prompt = PromptTemplate(
  template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing whether an answer
  is useful to resolve a question. Give binary score 'yes' or 'no' score to indicate whether the answer is useful
  to resolve a question. Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
  <|eot_id|><|start_header_id|>user<|end_header_id|>
  Here is the answer:
  \n ------- \n
  {generation}
  \n ------- \n
  Here is the question: {question} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
  input_variables=["generation", "question"],
)

# Pipeline: fill the prompt -> call the LLM -> parse the JSON reply.
answer_grader = prompt | llm | JsonOutputParser()
# Quick check using `question` and `generation` left over from the Generate cell;
# as the last expression of the cell, the parsed result is displayed.
answer_grader.invoke({"question": question, "generation": generation})


# Lang Graph

In [None]:
from typing_extensions import TypedDict
from typing import List

# Imported before the class so the `documents` annotation below can refer to it.
from langchain.schema import Document

### State

class GraphState(TypedDict):
    """
    Represents the state of the graph.

    Attributes:
        question: The user's question.
        generation: LLM generation
        web_search: "Yes"/"No" flag written by grade_documents and read by
            decide_to_generate; "Yes" routes the run to the web search node
        documents: Document objects from retrieval, later filtered by grading
            and optionally extended with a web search result
    """
    question: str
    generation: str
    # The nodes store the strings "Yes"/"No" here, not a boolean.
    web_search: str
    # The nodes store Document objects here (they read `.page_content`), not strings.
    documents: List[Document]

### Nodes

def retrieve(state):
    """
    Look up documents for the current question via the retriever.

    Args:
        state (dict): The current graph state; only "question" is read

    Returns:
        dict: State update holding the retrieved "documents" and the unchanged "question"
    """
    print("---RETRIEVE---")
    user_question = state["question"]
    hits = retriever.invoke(user_question)
    return {"documents": hits, "question": user_question}


def grade_documents(state):
    """
    Determines whether the retrieved documents are relevant to the question.

    Every document is scored by retrieval_grader. Irrelevant documents are
    dropped. The web search flag is raised when at least one document was
    dropped, or when no relevant document is left at all (this includes the
    case where retrieval returned nothing), so generation never runs on an
    empty context without first trying the web.

    Args:
        state (dict): The current graph state; reads "question" and "documents"

    Returns:
        dict: State update with the relevant "documents", the unchanged
        "question", and "web_search" set to "Yes" or "No"
    """

    print("---CHECK DOCUMENT RELEVANCE TO QUESTION---")
    question = state["question"]
    documents = state["documents"]

    relevant_docs = []
    dropped_any = False
    for d in documents:
        score = retrieval_grader.invoke({"question": question, "document": d.page_content})
        # The grader is an LLM: tolerate a missing 'score' key, a non-string
        # value, or stray whitespace/casing, and treat anything unclear as "no".
        raw_grade = score.get("score", "") if isinstance(score, dict) else ""
        grade = str(raw_grade).strip().lower()
        if grade == "yes":
            print("---GRADE: DOCUMENT RELEVANT---")
            relevant_docs.append(d)
        else:
            print("---GRADE: DOCUMENT NOT RELEVANT---")
            dropped_any = True

    web_search = "Yes" if dropped_any or not relevant_docs else "No"
    return {"documents": relevant_docs, "question": question, "web_search": web_search}


def generate(state):
    """
    Generate answer using RAG on retrieved documents

    Args:
        state (dict): The current graph state; reads "question" and "documents"

    Returns:
        dict: State update with the unchanged "documents" and "question" plus
        the new "generation" produced by rag_chain
    """
    print("---GENERATE---")
    question = state["question"]
    documents = state["documents"]

    # RAG Generation
    # Feed the prompt plain text: format_docs joins each document's page_content,
    # whereas the Document list itself would be rendered as its Python repr.
    generation = rag_chain.invoke({"context": format_docs(documents), "question": question})
    return {"documents": documents, "question": question, "generation": generation}


def web_search(state):
    """
    Perform web search to get additional context

    The "content" fields of the search hits are joined with newlines into a
    single Document that is appended to a copy of the current documents.

    Args:
        state (dict): The current graph state; reads "question" and "documents"

    Returns:
        dict: State update with "documents" (existing ones plus the web result,
        when the search produced any text) and the unchanged "question"
    """

    print("---WEB SEARCH---")
    question = state["question"]
    # Work on a copy so the list held by the incoming state is not mutated in
    # place; a missing or None entry is treated as "no documents yet".
    documents = list(state.get("documents") or [])

    # Web Search
    results = web_search_tool.invoke({"query": question})
    if isinstance(results, str):
        # NOTE(review): the tool appears to return a plain string (e.g. an error
        # message) instead of a list of hits in some cases; indexing it like a
        # list of dicts would raise, so report it and add nothing.
        print(f"---WEB SEARCH RETURNED NO STRUCTURED RESULTS: {results}---")
        snippets = []
    else:
        snippets = [hit["content"] for hit in results if isinstance(hit, dict) and hit.get("content")]

    if snippets:
        documents.append(Document(page_content="\n".join(snippets)))
    return {"documents": documents, "question": question}


### Conditional edge

def decide_to_generate(state):
    """
    Choose the node that follows document grading.

    Args:
        state (dict): The current graph state

    Returns:
        str: "websearch" when the web_search flag equals "Yes", otherwise "generate"
    """

    print("---ASSESS GRADED DOCUMENTS---")
    # All three fields are read up front, although only the flag drives the decision.
    question, search_flag, graded_docs = state["question"], state["web_search"], state["documents"]

    if search_flag != "Yes":
        # Flag not raised: answer from the graded documents
        print("---DECISION: GENERATE---")
        return "generate"

    # Flag raised during grading: bring in web search results first
    print("---DECISION: ALL DOCUMENTS ARE NOT RELEVANT TO QUESTION, INCLUDE WEB SEARCH---")
    return "websearch"


### Conditional edge

def grade_generation_v_documents_and_question(state):
    """
    Check the generation for hallucination, then for usefulness.

    The hallucination grader runs first; only a grounded answer is passed on
    to the answer grader. Scores are compared case-insensitively (matching
    grade_documents), and a missing or malformed score counts as "no".

    Args:
        state (dict): The current graph state; reads "question", "documents"
            and "generation"

    Returns:
        str: "useful" (grounded and answers the question), "not useful"
        (grounded but does not answer it) or "not supported" (not grounded)
    """

    def _is_yes(score):
        # LLM output: tolerate a missing key, non-string values and casing.
        raw_grade = score.get("score", "") if isinstance(score, dict) else ""
        return str(raw_grade).strip().lower() == "yes"

    print("---CHECK HALLUCINATIONS---")
    question = state["question"]
    documents = state["documents"]
    generation = state["generation"]

    score = hallucination_grader.invoke({"documents": documents, "generation": generation})

    # Check hallucination
    if not _is_yes(score):
        print("---DECISION: GENERATION IS NOT GROUNDED IN DOCUMENTS, RE-TRY---")
        return "not supported"

    print("---DECISION: GENERATION IS GROUNDED IN DOCUMENTS---")
    # Check question-answering
    print("---GRADE GENERATION vs QUESTION---")
    score = answer_grader.invoke({"question": question, "generation": generation})
    if _is_yes(score):
        print("---DECISION: GENERATION ADDRESSES THE QUESTION---")
        return "useful"

    print("---DECISION: GENERATION DOES NOT ADDRESS THE QUESTION---")
    return "not useful"
    
from langgraph.graph import END, StateGraph
workflow = StateGraph(GraphState)

# Register the four nodes (name -> function), in the same order as before
for node_name, node_fn in (
    ("websearch", web_search),
    ("retrieve", retrieve),
    ("grade_documents", grade_documents),
    ("generate", generate),
):
    workflow.add_node(node_name, node_fn)

In [None]:
# Build Graph

# Every run starts at "retrieve", which always hands over to "grade_documents".
workflow.set_entry_point("retrieve")
workflow.add_edge("retrieve", "grade_documents")
# After grading, decide_to_generate returns "websearch" or "generate"; the
# mapping routes each label to the node of the same name.
workflow.add_conditional_edges(
  "grade_documents", 
  decide_to_generate,
  {
    "websearch": "websearch",
    "generate": "generate",
  },
)
# Web search always flows into generation.
workflow.add_edge("websearch", "generate")
# After generation, grade_generation_v_documents_and_question picks the next hop:
#   "not supported" -> run "generate" again
#   "useful"        -> END
#   "not useful"    -> "websearch"
# NOTE(review): no retry cap is visible in this wiring, so the first and last
# routes can repeat; confirm whether a limit is enforced elsewhere.
workflow.add_conditional_edges(
  "generate",
  grade_generation_v_documents_and_question,
  {
    "not supported": "generate",
    "useful": END,
    "not useful": "websearch"
  },
)

In [None]:
# Compile
# NOTE: this rebinds `app`, the name the indexing cell uses for its Firecrawl client.
app = workflow.compile()

# Test
from pprint import pprint
inputs = {"question": "What's the latest version of Drupal?"}

# Track the most recent answer explicitly instead of reading the loop variable
# after the loop, which is undefined when nothing is streamed and raises
# KeyError when the last streamed update has no "generation" entry.
final_generation = None
for output in app.stream(inputs):
    for key, value in output.items():
        pprint(f"Finished running: {key}")
        if isinstance(value, dict) and "generation" in value:
            final_generation = value["generation"]

if final_generation is None:
    print("The graph finished without producing a generation.")
else:
    print(final_generation)

### Comparison: Direct Llama3 Query vs Agentic RAG Workflow

#### 1. Question
```
has Drupal 9 reached EOL? what's the latest version of Drupal?
```

#### 2. Raw (Vanilla) Local Llama3 Answer (No Context / No RAG)
RAW output:
```
As of March 2023, Drupal 8 has reached its End-of-Life (EOL), which means it is no longer receiving security
updates or support from the Drupal community.

However, Drupal 9 is still actively maintained and supported. The latest version of Drupal is currently **Drupal
9.4.2**, released on January 18, 2023. This version includes various bug fixes, security patches, and new features
to improve the overall performance and stability of your Drupal website.

Here's a brief overview of the current status:

* Drupal 8: EOL (End-of-Life) - no longer receiving security updates or support
* Drupal 9: Actively maintained and supported - latest version is **9.4.2**

If you're currently using Drupal 8, it's recommended to upgrade to Drupal 9 as soon as possible to ensure you
receive the latest security patches and features.

total duration:       3.891085186s
load duration:        44.131536ms
prompt eval count:    26 token(s)
prompt eval duration: 50.424352ms
prompt eval rate:     515.62 tokens/s
eval count:           185 token(s)
eval duration:        3.796080071s
eval rate:            48.73 tokens/s
```

##### TLDR:
```
As of March 2023, Drupal 8 has reached its End-of-Life (EOL)...
...latest version of Drupal is currently Drupal 9.4.2, released on January 18, 2023.
```
Issues:
- Drupal 8 actually reached EOL in Nov 2021 (hallucination on date).
- States latest major is still Drupal 9.x (outdated; Drupal 10 was released Dec 2022 and Drupal 11 followed in Aug 2024).
- Fabricated specific patch version/date pairing without evidence.
- Fast (≈3.9s total) and cheap, but ungrounded.

#### 3. Agentic RAG Llama3 Answer (From graph output)
```
Drupal 9 has reached its End Of Life (EOL) in November 2023, and the latest major release is Drupal 11 (released mid 2024).
```
(Paraphrased from the `value["generation"]` stream output; another earlier run produced: “The latest version of Drupal is Drupal 11, released on August 1, 2024.”)

Strengths:

- Correctly identifies Drupal 9 EOL (Nov 2023).
- Advances to current major (Drupal 11) vs stale Drupal 9.x.
- Grounded in retrieved Drupal.org blog content + optional web search path.

Weaknesses:

- Preface like “According to the provided context” slipped through despite prompt guardrails (style refinement needed).
- Exact release date may still be hallucinated if not in retrieved chunks (should cite or soften).

#### 4. Why RAG Helps Here
| Aspect | Vanilla Llama3 | Agentic RAG Graph |
|--------|----------------|-------------------|
| Freshness | Relies on model pretraining cutoff; stale. | Pulls from scraped Drupal blog pages; can refresh indexes. |
| Hallucination Control | None. | Retrieval grading + hallucination + usefulness graders loop. |
| Observability | Only raw text & timings. | Structured stages: retrieve → grade → (optional) web search → generate → evaluate. |
| Upgrade Path | Needs model retrain for new facts. | Just re-scrape + re-embed. |
| Style Control | Minimal (single prompt). | Modular prompts per function (retrieval grading, generation, hallucination check). |
| Failure Modes | Confident wrong specifics. | Can detect ungrounded generations and retry / augment with web search. |

#### 5. Latency / Cost Trade-off
- Vanilla: Single model call (fast).
- RAG Graph: Multiple sequential model invocations (retrieval grading loop + generation + evaluators + possible web search). Higher latency and more tokens, but higher factual reliability.

#### 6. Key Observations
- The biggest factual delta: Vanilla answer outdated & partially incorrect; RAG answer aligns with real lifecycle (Drupal 9 EOL Nov 2023, progression to Drupal 10/11).
- The RAG pipeline’s evaluators act as guardrails but still depend on the retrieval containing the needed fact; adding explicit date-focused re-query could further boost precision.
- Style prompt needs reinforcement to remove meta phrases.

#### 7. Suggested Improvements to Current RAG Notebook
- Add a citation builder: attach source URLs for each supporting snippet.
- Normalize the answer if the hallucination grader fails twice: fall back to “I don’t have sufficient grounded data.”
- Add a targeted follow-up retrieval if generation mentions a date not present verbatim in supporting documents (date grounding check).
- Cache embeddings & reuse vectorstore across runs to cut startup time.
- Log per-stage timings for empirical latency comparison.

#### 8. When to Use Which
- Use Vanilla: exploratory brainstorming, low-stakes, speed prioritized.
- Use Agentic RAG: lifecycle/status/versioning queries (like EOL/version), compliance, user-facing factual assistance.

#### 9. Takeaway
RAG meaningfully upgrades factual accuracy and resilience vs a bare local LLM, at the cost of complexity and latency. For version/EOL queries, the agentic approach is clearly superior and justifies the overhead.