In [1]:
! pip install -U langchain-nomic langchain_community tiktoken langchainhub chromadb langchain langgraph tavily-python gpt4all firecrawl-py



In [2]:
import os
from dotenv import load_dotenv

# Load environment variables (the API keys used below) from a .env file
load_dotenv()

# SETUP INSTRUCTIONS:
# 1. Create a .env file in the project root directory (or copy the example .env file there).
# 2. Add your API keys to the .env file (see .env.example for the format).
#    The indexing cell reads the FireCrawl key from the FIRECRAWL_API_KEY variable.
# 3. Get your API keys from:
#    - LangChain: https://smith.langchain.com/
#    - FireCrawl: https://firecrawl.dev/

True

In [3]:
# Name of the local model; passed as `model` to ChatOllama in the retrieval-grader cell.
local_llm = 'llama3'

In [4]:
### Index

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
# from langchain_community.document_loaders import FireCrawlLoader
from firecrawl import FirecrawlApp
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.docstore.document import Document

# Paginated Drupal blog listing pages to scrape. The pages differ only in the
# `page` query parameter, so the list is generated from a template; adjust the
# page range here to index more or fewer listing pages.
BLOG_PAGE_URL = "https://www.drupal.org/blog?page={page}"
FIRST_BLOG_PAGE = 1
LAST_BLOG_PAGE = 8  # inclusive

urls = [
    BLOG_PAGE_URL.format(page=page)
    for page in range(FIRST_BLOG_PAGE, LAST_BLOG_PAGE + 1)
]


# Candidate fields that may carry the scraped text, in order of preference.
_CONTENT_FIELDS = ("page_content", "content", "markdown", "text")


def _extract_content(scraped_data):
    """Return the first non-empty text payload found on a scrape result.

    The result may expose its fields as attributes or, for dict-shaped
    responses, as keys. Each candidate field is tried in order and the first
    one holding a non-blank string wins, so a field that exists but is
    None/empty no longer hides usable text stored in a later field.

    Args:
        scraped_data: Object or dict returned by ``app.scrape``.

    Returns:
        The text content, or None when no candidate field holds usable text.
    """
    for field in _CONTENT_FIELDS:
        if isinstance(scraped_data, dict):
            value = scraped_data.get(field)
        else:
            value = getattr(scraped_data, field, None)
        # Skip missing, non-string and whitespace-only values.
        if isinstance(value, str) and value.strip():
            return value
    return None


app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])

# Each entry is a plain dict: {'page_content': <text>, 'metadata': {'source': <url>}}.
docs = []
for url in urls:
    # Best effort: a failure on one URL is reported and the loop moves on.
    try:
        scraped_data = app.scrape(url=url, formats=["markdown"])
        if not scraped_data:
            print(f"No data returned for {url}")
            continue

        content = _extract_content(scraped_data)
        if content is None:
            print(f"No valid content found for {url}")
            continue

        docs.append({
            'page_content': content,
            'metadata': {'source': url}
        })
    except Exception as e:
        print(f"Failed to scrape {url}: {e}")

print(f"Number of documents scraped: {len(docs)}")

# Wrap every scraped record in a Document so the splitter can consume it.
doc_objects = []
for record in docs:
    doc_objects.append(
        Document(page_content=record['page_content'], metadata=record['metadata'])
    )

# Token-based splitter (tiktoken): chunk_size=250, chunk_overlap=0.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=250,
    chunk_overlap=0,
)
doc_splits = text_splitter.split_documents(doc_objects)

print(f"Number of documents before filtering: {len(doc_splits)}")


def _simple_metadata(metadata):
    """Keep only metadata entries whose value is a str, int, float or bool."""
    return {
        key: value
        for key, value in metadata.items()
        if isinstance(value, (str, int, float, bool))
    }


# Rebuild each chunk with simple-typed metadata only; anything that is not a
# Document carrying a `metadata` attribute is dropped.
filtered_docs = [
    Document(page_content=chunk.page_content, metadata=_simple_metadata(chunk.metadata))
    for chunk in doc_splits
    if isinstance(chunk, Document) and hasattr(chunk, 'metadata')
]

print(f"Number of documents after filtering: {len(filtered_docs)}")

# Embed the filtered chunks with GPT4All and index them in the "rag-chroma" collection.
# NOTE(review): re-running this cell in the same kernel may append the same chunks
# to the existing collection again -- confirm before relying on collection counts.
embedding_model = GPT4AllEmbeddings()
vectorstore = Chroma.from_documents(
    collection_name="rag-chroma",
    embedding=embedding_model,
    documents=filtered_docs,
)

# Retriever view over the vector store; queried in the retrieval-grader cell.
retriever = vectorstore.as_retriever()

Number of documents scraped: 8
Number of documents before filtering: 127
Number of documents after filtering: 127
Number of documents before filtering: 127
Number of documents after filtering: 127


In [5]:
### Inspect the underlying Chroma collection

# `_collection` is a private attribute of the vector store; used here for inspection only.
collection = vectorstore._collection
print("Collection details:")
print(f"- Name: {collection.name}")
print(f"- Count: {collection.count()}")

# Fetch two stored entries and show their ids, a 100-character preview of each
# document, and the attached metadata.
peek_data = collection.peek(limit=2)
print("\nPeek at stored data:")

stored_ids = peek_data.get('ids', [])
print(f"- IDs: {stored_ids}")

previews = []
for text in peek_data.get('documents', []):
    previews.append(text[:100] + '...')
print(f"- Documents preview: {previews}")

stored_metadatas = peek_data.get('metadatas', [])
print(f"- Metadatas: {stored_metadatas}")



Collection details:
- Name: rag-chroma
- Count: 127

Peek at stored data:
- IDs: ['a629ee59-8c5e-4886-9e52-3a5c8fc44ac1', 'c25d0ace-4d33-4a05-a1d8-35592759a83b']
- Documents preview: ['[Skip to main content](https://www.drupal.org/blog?page=1#content) [Skip to search](https://www.drup...', '[Learn how to support DDEV](https://ddev.com/support-ddev/#sponsor-development)\n\n# Drupal blog\n\n## [...']
- Metadatas: [{'source': 'https://www.drupal.org/blog?page=1'}, {'source': 'https://www.drupal.org/blog?page=1'}]


In [12]:
### Retrieval Grader with LLM

from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser

# LLM used as the grader; `local_llm` is the model name set near the top of the notebook.
llm = ChatOllama(model=local_llm, format="json", temperature=0)

# Llama-3 style chat prompt asking for a JSON object with a single 'score' key ('yes' / 'no').
prompt = PromptTemplate(
  template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing relevance of a retrieved document to a user question. If the document contains keywords related to the user question, grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
  Give a binary score 'yes' or 'no' based on the relevance of the document to the question. \n
  Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
   <|eot_id|><|start_header_id|>user<|end_header_id|>
   Here is the retrieved document: \n\n {document} \n\n
   Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
   input_variables=["document", "question"]
)

# Chain: fill the prompt -> call the model -> parse the reply as JSON.
retrieval_grader = prompt | llm | JsonOutputParser()
question = "what is going on in Drupal community?"
docs = retriever.invoke(question)

# Smoke-test the grader on one retrieved document. The second hit is used when
# available (as before); with a single hit fall back to it instead of failing,
# and raise a descriptive error when nothing was retrieved at all.
if not docs:
    raise IndexError(f"Retriever returned no documents for question: {question!r}")
sample_doc = docs[1] if len(docs) > 1 else docs[0]
docs_txt = sample_doc.page_content
print(retrieval_grader.invoke({"question": question, "document": docs_txt}))


{'score': 'yes'}
