In [1]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_community.llms import GPT4All 
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough

In [2]:
# Build the splitter first (chunk_size=500, no overlap between chunks), then
# point a PDF loader at the paper and let it load and split in one call.
text_splitter = RecursiveCharacterTextSplitter(chunk_overlap=0, chunk_size=500)
loader = PyPDFLoader("./data/2402.03367.pdf")
segments = loader.load_and_split(text_splitter)

In [3]:
# Sanity check: how many chunks did the load-and-split step produce?
len(segments)

73

In [4]:
# Inspect the first chunk to see what a split segment looks like.
segments[0]

Document(page_content='RAG-F USION :ANEWTAKE ON RETRIEVAL -AUGMENTED\nGENERATION∗\nZackary Rackauckas\nInfineon Technologies\nSan Jose, CA\nzackary.rackauckas@infineon.com\nABSTRACT\nInfineon has identified a need for engineers, account managers, and customers to rapidly obtain\nproduct information. This problem is traditionally addressed with retrieval-augmented generation\n(RAG) chatbots, but in this study, I evaluated the use of the newly popularized RAG-Fusion method.', metadata={'source': './data/2402.03367.pdf', 'page': 0})

In [5]:
# Index every segment in a Chroma vector store, embedding the text with
# GPT4AllEmbeddings.
vectorstore = Chroma.from_documents(
    embedding=GPT4AllEmbeddings(),
    documents=segments,
)

bert_load_from_file: gguf version     = 2
bert_load_from_file: gguf alignment   = 32
bert_load_from_file: gguf data offset = 695552
bert_load_from_file: model name           = BERT
bert_load_from_file: model architecture   = bert
bert_load_from_file: model file type      = 1
bert_load_from_file: bert tokenizer vocab = 30522


In [6]:
# The question reused by the cells that follow.
question = "Explain RAG-fusion and how it works."

# Manual retrieval: ask the vector store for the 5 most similar segments.
docs = vectorstore.similarity_search(query=question, k=5)
len(docs)

5

In [7]:
# Dump every field of each retrieved document, one blank line between documents.
for doc in docs:
    for field, value in dict(doc).items():
        print(f'>--{field}--<')
        print(value)
    print()

>--page_content--<
The largest challenge of RAG-Fusion is the slow answer time from receiving the query to outputting the answer. I
compared the runtime to our traditional RAG chatbot by performing ten back-to-back runs with the same query. I then
subtracted the time when the query was received from the time the output was given to determine the time it took for
that run. Back-to-back runs should control for APIs having different response times at different times of the day.
>--metadata--<
{'page': 5, 'source': './data/2402.03367.pdf'}
>--type--<
Document

>--page_content--<
RAG-Fusion: a New Take on Retrieval-Augmented Generation
Figure 1: Diagram illustrating the high level process of RAG-Fusion starting with the original query ”IM72D128 IP
Rating”
Infineon Chatbot for Engineers
There are three potential use cases of the Infineon RAG-Fusion chatbot: providing technical information to engineers,
providing sales-oriented information to account managers, and providing customer-facing in

In [8]:
# Local LLM used for both the baseline and the RAG chain.
#
# Sampling order, for reference:
#   1) temperature T rescales the logits: p_i = exp(x_i/T) / sum_j exp(x_j/T)
#   2) top_k: after temperature, sort, keep the k most likely tokens, renormalize
#   3) top_p: after top_k, keep tokens until the cumulative probability is
#      reached, renormalize
gpt4all = GPT4All(
    model="./mistral-7b-openorca.Q4_0.gguf",
    # --- length / batching ---
    max_tokens=2048,
    n_predict=None,
    n_batch=8,
    # --- sampling ---
    temp=1,
    # top_k=100,  # left at the library default
    # top_p=.5,   # left at the library default
    # --- repetition control ---
    repeat_last_n=64,
    repeat_penalty=1.18,
    # --- output mode ---
    streaming=False,
)

In [9]:
# Baseline without RAG: send the question to the model with no retrieved
# context, to compare against the retrieval chain's answer.
gpt4all.generate(
    prompts=[
        f"You are an assistant for question-answering tasks.\nQuestion: {question}\nAnswer:"
    ]
)

LLMResult(generations=[[Generation(text=' RAG-fusion, also known as Recombination Activating Gene (RAG) fusion, is a technique used in gene therapy to introduce specific genetic modifications into the DNA of cells. This process involves the use of two types of enzymes called recombinases and integrases that are derived from bacteriophages or other sources.\n\nThe RAG-fusion process works through several steps:\n\n1. Isolation of target cells: The first step is to isolate the specific type of cell that needs genetic modification, such as immune system cells like T-cells and B-cells. This can be done using various methods, including magnetic activated cell sorting (MACS) or fluorescence-activated cell sorting (FACS).\n\n2. Introduction of RAG enzymes: The next step is to introduce the recombinase and integrase enzymes into the cells. This can be done using viral vectors, such as adenoviruses or lentiviruses, which are engineered to carry the desired genetic material safely and efficientl

In [10]:
def format_docs(docs):
    """Join the text of retrieved documents into a single context string.

    Without this step the retriever's output (a list of ``Document`` objects)
    is interpolated into the prompt as its Python repr, so the model sees
    ``Document(page_content=..., metadata=...)`` wrappers and escaped newlines
    instead of plain text.

    Args:
        docs: Iterable of documents, each exposing a ``page_content`` string.

    Returns:
        The documents' ``page_content`` values separated by blank lines
        (an empty string when nothing was retrieved).
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Prompt for the RAG chain. The trailing backslashes join the instruction
# sentences (and the "Question:" field) onto one line of the rendered prompt.
# Optional extra instruction, currently left out of the template:
#   "Use three sentences maximum and keep the answer concise."
prompt = PromptTemplate.from_template(
"""You are an assistant for question-answering tasks. \
Use the following pieces of retrieved context to answer the question. \
If you don't know the answer, just say that you don't know. \
Question: {question}
Context: {context}
Answer:
"""
)

# No arguments are passed, so the retriever uses its default search settings.
retriever = vectorstore.as_retriever()

# The input question fans out to both prompt variables: it is forwarded
# unchanged as "question", and it drives retrieval (followed by plain-text
# formatting) to produce "context". The filled prompt goes to the LLM and the
# result is parsed to a string.
retrieval_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | gpt4all
    | StrOutputParser()
)

In [11]:
# Ask the same question through the retrieval chain (with retrieved context).
retrieval_chain.invoke(question)

" RAG-Fusion is a type of chatbot model that combines Retrieval-Augmented Generation (RAG) with Large Language Models (LLMs). It works by first generating multiple queries based on the original user query, and then calling an LLM to retrieve relevant documents. The retrieved information is used to generate an answer tailored to the user's question. This approach increases answer quality but may result in longer runtimes due to its more complex process compared to traditional RAG chatbots."