In [None]:
%load_ext autoreload
%autoreload 2


import bs4

from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings,ChatOllama
from langchain_chroma import Chroma
#from langchain_openai import OpenAIEmbeddings


### Query Transformations

In [2]:
# Load documents
# Blog-post URLs kept as an alternative corpus. This list is not the one handed
# to the loader built further down in this cell; that loader reads `daft_paths`.
web_paths = [
    "https://lilianweng.github.io/posts/2024-07-07-hallucination/",
    "https://lilianweng.github.io/posts/2023-10-25-adv-attack-llm/",
    "https://lilianweng.github.io/posts/2023-06-23-agent/",
    "https://lilianweng.github.io/posts/2023-03-15-prompt-engineering/",
    "https://lilianweng.github.io/posts/2023-01-10-inference-optimization/",
    "https://lilianweng.github.io/posts/2022-06-09-vlm/",
]

# Daft documentation pages to index. The chained assignment binds both
# `daft_paths` and `urls` to the same list object. Entries that are commented
# out below are further documentation pages currently excluded from the load.
daft_paths = urls = [
 
     "https://www.getdaft.io/projects/docs/en/stable/10-min.html",
   "https://www.getdaft.io/projects/docs/en/stable/user_guide/basic_concepts.html",
    "https://www.getdaft.io/projects/docs/en/stable/user_guide/read-and-write.html",
    "https://www.getdaft.io/projects/docs/en/stable/user_guide/expressions.html",
    "https://www.getdaft.io/projects/docs/en/stable/user_guide/datatypes.html",
    "https://www.getdaft.io/projects/docs/en/stable/user_guide/dataframe-operations.html",
    "https://www.getdaft.io/projects/docs/en/stable/user_guide/sql.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/aggregations.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/udf.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/poweruser.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/poweruser/memory.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/poweruser/partitioning.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/poweruser/distributed-computing.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/ray.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/unity-catalog.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/iceberg.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/delta_lake.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/hudi.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/microsoft-azure.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/aws.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/sql.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/integrations/huggingface.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/fotw/index.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/fotw/fotw-000-data-access.html",
#     "https://www.getdaft.io/projects/docs/en/stable/user_guide/fotw/fotw-001-images.html",
]


# Alternative strainer (class names "post-title", "post-header", "post-content"),
# kept for reference; presumably it pairs with the blog URLs in `web_paths`.
# bs4_strainer = bs4.SoupStrainer(class_=("post-title", "post-header", "post-content"))


# Restrict HTML parsing to elements whose class is "bd-article" or "bd-main".
bs4_strainer = bs4.SoupStrainer(class_=("bd-article","bd-main"))
# The loader gets a shallow copy of `daft_paths` (the `[:]` slice) and forwards
# the strainer to BeautifulSoup through `bs_kwargs`.
loader = WebBaseLoader(
    web_paths=daft_paths[:],
    bs_kwargs={"parse_only": bs4_strainer},
)

In [None]:
# bs4_strainer = bs4.SoupStrainer(class_=("bd-article","bd-main"))
# loader = WebBaseLoader(
#     web_paths=["https://www.getdaft.io/projects/docs/en/stable/10-min.html",
#                "https://www.getdaft.io/projects/docs/en/stable/",
#                "https://www.getdaft.io/projects/docs/en/stable/user_guide/basic_concepts.html",
#                ],
#     bs_kwargs={"parse_only": bs4_strainer},
# )

# Run the configured loader and keep its result in `docs`.
docs = loader.load()
# Bare expression: the notebook displays the full result.
# NOTE(review): this prints everything that was loaded; a count or a slice may be easier to read.
docs

In [None]:
# Display `docs` again (same expression as the last line of the previous cell).
docs

In [None]:
# Chunking configuration: chunk_size=500 with chunk_overlap=100.
# add_start_index=True presumably records each chunk's start offset in its
# metadata — confirm against the splitter documentation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, chunk_overlap=100, add_start_index=True
)
# Split every loaded document into chunks.
all_splits = text_splitter.split_documents(docs)

# Show how many chunks were produced.
len(all_splits)

In [26]:
# Embed every chunk with the Ollama model "llama3.2:1b" and store the result in
# a Chroma collection persisted under ./chroma_db.
# NOTE(review): re-running this cell against an existing ./chroma_db presumably
# adds the same chunks again — confirm, and clear the directory or pass stable ids if so.
vectorstore = Chroma.from_documents(documents=all_splits, embedding=OllamaEmbeddings(model="llama3.2:1b"),persist_directory="./chroma_db")

In [27]:
# Base retriever over the vector store, created with as_retriever()'s defaults
# (no search arguments are passed here).
retriever = vectorstore.as_retriever()

In [None]:
# Chat model served through Ollama; temperature=0 requests the least random sampling.
llm = ChatOllama(model="llama3.2:1b",temperature=0)

from langchain import hub
from langchain.prompts import PromptTemplate

# A generic prompt used to be pulled from the hub here ("rlm/rag-prompt") and was
# overwritten a few lines later without ever being used. That dead network fetch
# has been removed; `prompt` is defined once, below.

# 1. Define the prompt template
rag_prompt_template = """You are a helpful assistant specializing in the Daft data processing library. You have access to official Daft documentation. Use the following pieces of documentation to answer the user's question. If you don't know the answer, just say you don't know.

Context: {context}

Given this context, please:
1. Provide accurate information based on the Daft documentation
2. Include relevant code examples when appropriate
3. Cite specific sections of documentation you're referencing
4. If multiple approaches exist, explain the trade-offs

User Question: {question}"""


# 2. Wrap the template; it expects the two placeholders used in the text above.
prompt = PromptTemplate(
    template=rag_prompt_template,
    input_variables=["context","question"]
)

In [116]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

    
def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        A single string with the contents joined by two newlines; an empty
        iterable gives an empty string.
    """
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)


# Baseline RAG pipeline. The invoked value (the question string) feeds two branches:
#   "context"  -> retriever, whose hits are joined into one string by format_docs
#   "question" -> passed through unchanged
# The resulting dict fills `prompt`, the prompt goes to `llm`, and
# StrOutputParser turns the model reply into a string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [117]:
# Sample question; later cells reuse `question` for the other retrieval strategies.
question = "What is daft library"

# Run the baseline RAG chain and keep the answer in `res`.
res =rag_chain.invoke(question)

In [None]:
# Print whatever `res` currently holds (the baseline chain's answer if cells ran in order).
print(res)

In [None]:
# Inspect what the base retriever returns for the same question, without the LLM.
retriever.invoke(question)

#### Multi-Query Retriever

In [119]:
# Enable INFO-level records for the logger named after the multi-query retriever
# module — presumably so the generated query variants show up in the output.
import logging

# basicConfig() with no arguments installs the default root handler.
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [120]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [121]:
# Build a MultiQueryRetriever on top of the base retriever, with `llm` as its model.
# No prompt is passed, so from_llm's own default is used.
retriever_multi = MultiQueryRetriever.from_llm(
    retriever = retriever,
    llm = llm
)

In [None]:
# Run the multi-query retriever on the sample question and show how many
# documents came back.
unique_docs = retriever_multi.invoke(question)
len(unique_docs)

In [123]:
# Same shape as the baseline chain but with the multi-query retriever supplying
# the context. The model and parser steps are commented out, so this chain
# stops at `prompt`: invoking it yields the filled-in prompt, not a model answer.
rag_chain_ret_multi = (
    {"context": retriever_multi | format_docs, "question": RunnablePassthrough()}
    | prompt
    # | llm
    # | StrOutputParser()
)


In [None]:
# Run the multi-query chain on the sample question; `res` is rebound to its output.
res = rag_chain_ret_multi.invoke(question)

In [None]:
# Print the `text` attribute of the chain output. This relies on `res` being the
# prompt value produced by the prompt step (presumably its rendered text), not a plain string.
print(res.text)

In [None]:
# Print the rendered prompt as a string. The earlier `res["text"]` subscript
# fails because the chain output is a prompt value object, not a mapping.
print(res.to_string())

In [None]:
# Print the chain output using its default string form.
print(res)

In [None]:
# Try the multi-query chain with a second, ad-hoc question; `res` is rebound again.
res = rag_chain_ret_multi.invoke("what is daft index")

In [None]:
# Print the output for the ad-hoc question from the previous cell.
print(res)

### RAG Fusion

In [None]:
from langchain import hub

# Pull the "langchain-ai/rag-fusion-query-generation" prompt from the LangChain hub.
# NOTE(review): this rebinds `prompt`, which earlier cells bind to the Daft RAG
# PromptTemplate. Chains already built keep the object they were built with, but
# re-running an earlier chain cell after this one would use this prompt instead.
# In the visible notebook nothing reads this value before `prompt` is rebound again.
prompt = hub.pull("langchain-ai/rag-fusion-query-generation")

In [73]:
# 1. Multi-query generation prompt
# Asks for four rephrasings of the user question, one per numbered line.
# NOTE(review): no chain in the visible notebook references `multi_query_prompt`;
# the RAG-Fusion cell below builds its own prompt from near-identical text.
multi_query_template = """You are an expert in the Daft data processing library. Generate different versions of the given question to retrieve relevant documentation.

Original Question: {question}

Generate 4 different search queries that will help find relevant information from the Daft documentation. The queries should:
- Rephrase the original question in different ways
- Include technical terms related to Daft
- Consider different aspects of the question
- Be specific to data processing and Daft's features

Output format - just the queries, one per line:
1. [Query 1]
2. [Query 2]
3. [Query 3]
4. [Query 4]

Generated Queries:"""

# The only placeholder in the template is {question}.
multi_query_prompt = PromptTemplate(
    template=multi_query_template,
    input_variables=["question"]
)


In [138]:
from langchain.prompts import ChatPromptTemplate

# RAG-Fusion: query-generation prompt.
# A generic (non-Daft) template used to be assigned to `template` first and was
# overwritten immediately, so only the Daft-specific text below ever reached the
# prompt. That dead assignment was dropped, and the missing leading "Y" in
# "You are an expert" was restored.
template = """
You are an expert in the Daft data processing library. Generate different versions of the given question to retrieve relevant documentation.

Original Question: {question}

Generate 4 different search queries that will help find relevant information from the Daft documentation. The queries should:
- Rephrase the original question in different ways
- Include technical terms related to Daft
- Consider different aspects of the question
- Be specific to data processing and Daft's features

Output format - just the queries, one per line:
1. [Query 1]
2. [Query 2]
3. [Query 3]
4. [Query 4]
"""
# The template's single placeholder is {question}.
prompt_rag_fusion = ChatPromptTemplate.from_template(template)

In [150]:
from langchain_core.output_parsers import StrOutputParser
from langchain_openai import ChatOpenAI

# Query-generation chain: prompt -> chat model -> plain string -> list of queries.
# NOTE(review): this step uses ChatOpenAI while the rest of the notebook uses the
# local `llm`. The original trailing comment here read "lmm", presumably marking
# where `llm` could be swapped in — confirm which model is intended.
generate_queries = (
    prompt_rag_fusion
    | ChatOpenAI(temperature=0)
    | StrOutputParser()
    # One query per non-empty line. Blank lines are dropped so that empty strings
    # are never sent to the retriever as queries.
    | (lambda text: [line.strip() for line in text.split("\n") if line.strip()])
)

In [None]:
# Smoke-test the query generator on a sample question and display the resulting list.
generate_queries.invoke("is daft written in rust ?")

In [None]:
from langchain.load import dumps, loads

def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one using Reciprocal Rank Fusion.

    Every occurrence of a document adds ``1 / (rank + k)`` to its score, where
    ``rank`` is the document's zero-based position in the list it appears in,
    so earlier positions contribute more. Documents are identified by their
    ``dumps`` serialization: documents that serialize identically are treated
    as one and their contributions are summed.

    Args:
        results: One ranked list of documents per query.
        k: Constant added to the rank in the denominator of the RRF formula.

    Returns:
        A list of ``(document, fused_score)`` tuples ordered by score, highest
        first; each document is rebuilt from its serialized form with ``loads``.
        Documents with equal scores keep the order in which they were first
        seen. An empty ``results`` gives an empty list.
    """
    # Serialized document -> accumulated RRF score.
    fused_scores = {}

    for ranked_docs in results:
        for rank, doc in enumerate(ranked_docs):
            # The serialized form doubles as a hashable identity for the document.
            doc_key = dumps(doc)
            fused_scores[doc_key] = fused_scores.get(doc_key, 0) + 1 / (rank + k)

    # sorted() is stable, so ties preserve first-seen (dict insertion) order.
    ranked_items = sorted(fused_scores.items(), key=lambda item: item[1], reverse=True)
    return [(loads(doc_key), score) for doc_key, score in ranked_items]

# Fusion retrieval pipeline: generate query variants, run the retriever once per
# variant (retriever.map()), then fuse the per-query result lists with
# reciprocal_rank_fusion.
retrieval_chain_rag_fusion = generate_queries | retriever.map() | reciprocal_rank_fusion
# NOTE(review): this rebinds `docs`, which an earlier cell uses for the output of
# loader.load(); re-running the splitting cell afterwards would see this value instead.
docs = retrieval_chain_rag_fusion.invoke({"question": question})
# Number of distinct documents after fusion.
len(docs)

In [None]:
# Display `docs` — the fused (document, score) pairs if the cells ran in order.
docs

In [155]:
from operator import itemgetter

In [176]:
from langchain_core.runnables import RunnablePassthrough


# RAG: answer the question from the fused retrieval results.
template = """Answer the following question based on this context:

{context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)

# "context" runs the whole RAG-Fusion retrieval chain on the input dict, and
# "question" forwards the original question text from that same dict.
# The chat model step is restored here: with it commented out, StrOutputParser
# was handed the prompt value itself instead of a model message or a string.
final_rag_chain = (
    {"context": retrieval_chain_rag_fusion,
     "question": itemgetter("question")}
    | prompt
    | llm
    | StrOutputParser()
)



In [None]:
# Invoke the final chain with the question under the "question" key, which is the
# key both branches of the chain read from the input dict.
res = final_rag_chain.invoke({"question":question})

In [None]:
# Bare attribute reference: displays the bound method without calling it
# (looks like leftover exploration).
llm.invoke

In [None]:
# Display the prompt currently bound to `prompt`. The cell previously held the
# incomplete name `prom`, which is not defined anywhere in the visible notebook
# and raises NameError.
prompt

In [None]:
# Display the chat model object's repr.
llm

In [None]:
# from typing import List

# from langchain_core.output_parsers import BaseOutputParser
# from langchain_core.prompts import PromptTemplate
# from pydantic import BaseModel, Field


# # Output parser will split the LLM result into a list of queries
# class LineListOutputParser(BaseOutputParser[List[str]]):
#     """Output parser for a list of lines."""

#     def parse(self, text: str) -> List[str]:
#         lines = text.strip().split("\n")
#         return list(filter(None, lines))  # Remove empty lines


# output_parser = LineListOutputParser()

# QUERY_PROMPT = PromptTemplate(
#     input_variables=["question"],
#     template="""You are an AI language model assistant. Your task is to generate five 
#     different versions of the given user question to retrieve relevant documents from a vector 
#     database. By generating multiple perspectives on the user question, your goal is to help
#     the user overcome some of the limitations of the distance-based similarity search. 
#     Provide these alternative questions separated by newlines.
#     Original question: {question}""",
# )


# # Chain
# llm_chain = QUERY_PROMPT | llm | output_parser

# # Other inputs
# question = "What are the approaches to Task Decomposition?"