In [2]:
import os
from dotenv import load_dotenv

# Load key=value pairs from a local .env file into the process environment
# (python-dotenv) so that the os.getenv(...) lookups later in the notebook
# can see them.
load_dotenv()

True

In [3]:
# Model name passed as `model=` to every ChatOllama instance in this notebook.
local_llm = 'llama3'

In [4]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.document_loaders import FireCrawlLoader
from langchain_community.vectorstores.utils import filter_complex_metadata
from langchain.docstore.document import Document




In [5]:
# Omeka S developer-API documentation pages that make up the knowledge base.
urls = [
    'https://omeka.org/s/docs/developer/api/',
    'https://omeka.org/s/docs/developer/api/php_api/',
    'https://omeka.org/s/docs/developer/api/rest_api/',
    'https://omeka.org/s/docs/developer/api/representations/',
    'https://omeka.org/s/docs/developer/api/api_reference/',
    'https://omeka.org/s/docs/developer/api/rest_api_reference/',
]


def _scrape(page_url):
    """Scrape a single page with FireCrawl and return whatever its loader yields."""
    loader = FireCrawlLoader(
        api_key=os.getenv('FIRECRAWL_API_KEY'),
        url=page_url,
        mode='scrape',
    )
    return loader.load()


# One entry per URL; each entry is itself the list returned by `.load()`,
# so `docs_list` is a list of lists and is flattened before splitting.
docs_list = list(map(_scrape, urls))
print(docs_list)

[[Document(metadata={'title': 'Introduction to the API - Omeka S Developer Documentation', 'language': 'en', 'ogLocaleAlternate': [], 'sourceURL': 'https://omeka.org/s/docs/developer/api/', 'statusCode': 200}, page_content='\n\n# Introduction to the API [\uf0c1](\\#introduction-to-the-api "Permanent link")\n\nOmeka S provides an application programming interface (API) that enables [CRUD](https://en.wikipedia.org/wiki/Create,_read,_update_and_delete)\noperations on its resources. API resources are objects with a type, associated data,\nrelationships to other resources, and sets of methods that operate on them. There\nare two ways to access Omeka\'s API:\n\n- Programmatically, from within Omeka\'s PHP environment (see the [PHP API documentation](php_api/))\n- Using Omeka\'s [REST](https://en.wikipedia.org/wiki/Representational_state_transfer) web service (see the [REST API documentation](rest_api/))\n\n## Operations [\uf0c1](\\#operations "Permanent link")\n\nYou can perform these API op

In [6]:

# Chunk sizes are measured with a tiktoken encoder (tokens, not characters):
# chunks of up to 300 with an overlap of 30 between neighbours.
text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=300,
    chunk_overlap=30
)

# `docs_list` holds one list of Documents per scraped URL; flatten it first.
docs = [doc for sublist in docs_list for doc in sublist]
docs_splits = text_splitter.split_documents(docs)

# Keep only metadata values of type str/bool/int/float (the helper's default
# allowed types); list-valued fields such as 'ogLocaleAlternate' are dropped.
# This uses the already-imported LangChain helper instead of a hand-rolled
# loop. Note: the helper rewrites `metadata` on the split documents in place
# and returns those same objects.
filtered_docs = filter_complex_metadata(docs_splits)

# Add to vectorDB
vectorestore = Chroma.from_documents(
    documents=filtered_docs,
    collection_name="rag-chroma",
    embedding=GPT4AllEmbeddings()
)

# k = number of relevant chunks returned per query
retriever = vectorestore.as_retriever(search_kwargs={"k": 7})

In [7]:
### Retrieval Grader

from langchain.prompts import PromptTemplate
from langchain_community.chat_models import ChatOllama
from langchain_core.output_parsers import JsonOutputParser

# Grader LLM: format='json' requests JSON-formatted output from the model so
# JsonOutputParser can parse it; temperature=0 keeps the grading stable.
llm = ChatOllama(temperature=0, format='json', model=local_llm)

# Prompt text wrapped in the Llama 3 chat special tokens
# (system / user / assistant headers).
retrieval_grader_template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing relevance
of a retrieved document to a user question. If the document contains keywords related to the user question,
grade it as relevant. It does not need to be a stringent test. The goal is to filter out erroneous retrievals. \n
Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question. \n
Provide the binary score as a JSON with a single key 'score' and no premable or explaination.

<|eot_id|><|start_header_id|>user<|end_header_id|>
Here is the retrieved document: \n\n {document} \n\n
Here is the user question: {question} \n <|eot_id|><|start_header_id|>assistant<|end_header_id|>
"""

prompt = PromptTemplate(
    input_variables=["question", "document"],
    template=retrieval_grader_template,
)

# Pipeline: fill the prompt -> call the LLM -> parse the JSON reply.
retrieval_grader = prompt | llm | JsonOutputParser()


In [8]:
### Test the retrieval grader

# Test 1: a question the indexed Omeka S docs should cover.
relevant_question = "Give me sample PHP Api in Omeka-s?"
docs = retriever.invoke(relevant_question)
relevant_doc_txt = docs[0].page_content  # check against the first doc
print("Relevant question and doc result: ", retrieval_grader.invoke({"question": relevant_question, "document": relevant_doc_txt}))

# Test 2: an off-topic question. Retrieve with the off-topic question itself,
# mirroring the real retrieve-then-grade flow: the retriever returns its
# nearest chunks, and the grader should score the top one as not relevant.
irrelevant_question = "How to use nodejs?"
docs = retriever.invoke(irrelevant_question)
irrelevant_doc_txt = docs[0].page_content  # check against the first doc
print("Irrelevant question and doc result: ", retrieval_grader.invoke({"question": irrelevant_question, "document": irrelevant_doc_txt}))

Relevant question and doc result:  {'score': 'yes'}
Irrelevant question and doc result:  {'score': 'no'}


In [9]:
### Generate

from langchain import hub
from langchain_core.output_parsers import StrOutputParser

# RAG answer prompt in Llama 3 chat format. Every special token must be
# spelled exactly `<|...|>`; a missing pipe makes it ordinary text.
# `input_variables` must name the placeholders actually used in the template
# ({question} and {context}).
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are an assistant for question-answering tasks.
    Use the following pieces of retrieved context for more information to answer the question. If you don't know the answer, just say that you don't know.
    Use three sentences maximum and keep the answer concise <|eot_id|><|start_header_id|>user<|end_header_id|>
    Question: {question}
    Context: {context}
    Answer: <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["question", "context"],
)

# Free-text generation model: no format='json' here, because the answer is
# prose rather than a JSON grade.
llm = ChatOllama(model=local_llm, temperature=0)

def format_docs(docs):
    """Concatenate the `page_content` of each document, separated by a blank line.

    Args:
        docs: iterable of objects exposing a `page_content` string attribute.

    Returns:
        A single string; empty when `docs` is empty.
    """
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

# Chain: fill the prompt with {question}/{context}, call the LLM, then let
# StrOutputParser turn the model output into a plain string.
rag_chain = prompt | llm | StrOutputParser()

In [10]:
# Retrieve context for the question, then generate an answer from it.
question = "Give me example usage of resource_class_label api"
docs = retriever.invoke(question)

rag_inputs = {"context": format_docs(docs), "question": question}
generation = rag_chain.invoke(rag_inputs)

# NOTE: the answer is not good because FireCrawl's crawl output is not good.
print(generation)

Here is an example usage of the `resource_class_label` API:

You can use this method to get a list of resources with a specific class label. For instance, you can retrieve all items with a class label "Person" by calling `displayResourceClassLabel("Person")`. This method takes an optional default fallback value, like `displayTitle` and `displayDescription`, which allows you to specify what to display if no resource is found with the given label.


In [11]:
# Show the context string built from the retrieved docs, to inspect what the
# model was given when it produced the answer.
print(format_docs(docs))

Beyond the values, there is other common resource data you might need to access, and each
piece generally has its own method.

- `resourceClass` and `resourceTemplate` get you the Representations for the class and template
for this resource, if any, respectively. To more simply print the resource class's label,
there is a convenience method `displayResourceClassLabel`. It takes an optional default
fallback value, like `displayTitle` and `displayDescription`.
- `owner` gets the Representation for the user that owns the resource, if any.
- `isPublic` returns a boolean marking whether the resource is public.
- `created` and `modified` return PHP DateTime objects for the dates the resource was created
and last modified, respectively. The `$this->i18n()->dateFormat()` helper is useful for
printing and localizing dates.
- `values` simply returns an array of all the values for the resource, grouped by property,
with the properties' Representations and any alternate labels or comments imposed 

In [12]:
from langchain_community.tools.tavily_search import TavilySearchResults
# Tavily web-search tool limited to the top 3 hits. The limit is controlled by
# `max_results` (default 5); the tool has no `k` field, so passing `k=3` does
# not restrict the number of results.
web_search_tool = TavilySearchResults(max_results=3)

In [13]:
### Hallucination Grader

# JSON-mode grader model. The keyword must be spelled `temperature`; a
# misspelled keyword does not set the sampling temperature, which leaves the
# grader running at the model's default.
llm = ChatOllama(model=local_llm, format='json', temperature=0)

# Llama 3 chat-formatted prompt asking for a {'score': 'yes'|'no'} verdict on
# whether {generation} is supported by {documents}.
prompt = PromptTemplate(
    template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|> You are a grader assessing whether
an answer is grounded in / supported by a set of facts. Give a binary score 'yes' or 'no' score to indicate
whether the answer is grounded in / supported by a set of facts. Provide the binary score as a JSON with a
single key 'score' and no preamble or explanation. <|eot_id|><|start_header_id|>user<|end_header_id|>

Here are the facts:
\n --------- \n
{documents}
\n --------- \n
Here is the answer: {generation} <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
    input_variables=["generation", "documents"],
)

# Pipeline: fill the prompt -> call the LLM -> parse the JSON reply into a dict.
hallucination_grader = prompt | llm | JsonOutputParser()

In [15]:
# Grade the answer against the same plain-text context that was given to the
# RAG chain (format_docs joins each page_content) rather than the Python repr
# of the Document list, which would also put metadata into the prompt.
hallucination_grader.invoke({"documents": format_docs(docs), "generation": generation})

{'score': 'yes'}