## 1. Code to use if creating a new database

In [30]:
import os
import glob
import json
import extract, index, prepare
from langchain_community.llms import GPT4All
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
import pickle

In [5]:
def save_docs(docs, filename='docs.pkl'):
    """Pickle ``docs`` into ``filename``; the file is opened in 'wb' mode, so existing content is replaced."""
    with open(filename, mode='wb') as sink:
        pickle.dump(docs, sink)
def load_docs(filename='docs.pkl'):
    """Return the object unpickled from ``filename``.

    Unpickling can execute arbitrary code, so only load files that were
    produced locally (for example by ``save_docs``).
    """
    with open(filename, mode='rb') as source:
        return pickle.load(source)

In [13]:
# Creating txt files and metadata.
# The calls below are disabled by being wrapped in a string literal; remove the
# triple quotes to run them again.
# NOTE(review): presumably `extract` fetches the paper sources for the query and
# `prepare` converts them to plain text -- confirm in those modules.
"""
extract.extract_tex_files(query="Retrieval-Augmented Generation", target_dir="papers", max_results=150) 

filenames, errors = prepare.files_to_plain_text("papers", recursive=True)"""

In [4]:
# Saving the arXiv IDs of the papers used to create the database.
# The code is disabled by the surrounding string literal; remove the triple
# quotes to run it again.
"""directory = "papers"
output_file = "papers_ids.txt"

# List to store the arXiv IDs
arxiv_ids = []

# Iterate over the items in the directory
for item in os.listdir(directory):
    # The arXiv ID is taken to be the first two dot-separated parts of the name
    arxiv_id = ".".join(item.split('.')[0:2])
    arxiv_ids.append(arxiv_id)

# Write the arXiv IDs to a text file
with open(output_file, "w") as file:
    for arxiv_id in arxiv_ids:
        file.write(arxiv_id + "\n")

# Report the file that was actually written
print(f"Extracted {len(arxiv_ids)} arXiv IDs and saved to {output_file}")"""

Extracted 258 arXiv IDs and saved to arxiv_ids.txt


In [6]:
# Use if the documents have not been prepared as a pkl file yet.
# The code is disabled by the surrounding string literal; remove the triple
# quotes to run it again.
"""documents = []
for article in os.listdir("./papers"):
    path = "./papers/" + article
    # Stray files next to the paper folders would make os.listdir(path) fail
    if not os.path.isdir(path):
        continue

    # Reset per article so a folder with a missing file cannot silently reuse
    # the text or metadata of the previously processed paper
    text = None
    metadata = None
    for document in os.listdir(path):
        if document.endswith("txt"):
            with open(f"{path}/{document}") as f:
                text = f.read()
        if document.endswith("metadata.json"):
            with open(f"{path}/{document}") as f:
                metadata = json.load(f)

    if text is None or metadata is None:
        print(f"Skipping {article}: missing txt or metadata.json file")
        continue

    documents.append({"text": text, "metadata": metadata})

save_docs(documents)
documents = load_docs()"""

In [3]:
# Load the prepared documents from the default pickle file ('docs.pkl').
documents = load_docs()

In [7]:
# Split the loaded documents into two parallel lists: raw texts and their metadata.
texts = [document['text'] for document in documents]
metadatas = [document['metadata'] for document in documents]

In [9]:
# Name under which the database is created; the query cells use the same name.
db_name = "rag_database"
database = index.populate_and_prepare_db(
    name=db_name,
    texts=texts,
    metadatas=metadatas,
    chunk_size=500,
    chunk_overlap=50,
)



Indexing 258 documents


  0%|                                                                                          | 0/258 [00:00<?, ?it/s]Created a chunk of size 1592, which is longer than the specified 500
Created a chunk of size 1725, which is longer than the specified 500
Created a chunk of size 746, which is longer than the specified 500
Created a chunk of size 507, which is longer than the specified 500
Created a chunk of size 579, which is longer than the specified 500
Created a chunk of size 619, which is longer than the specified 500
Created a chunk of size 1240, which is longer than the specified 500
Created a chunk of size 630, which is longer than the specified 500
Created a chunk of size 688, which is longer than the specified 500
Created a chunk of size 735, which is longer than the specified 500
Created a chunk of size 503, which is longer than the specified 500
Created a chunk of size 821, which is longer than the specified 500
Created a chunk of size 592, which is longer than the specifie

Documents and metadata have been added to the Chroma collection.





## 2. RAG chatbot

In [19]:
def update_history(chat_history, query, response):
    """Append one user/assistant exchange to ``chat_history`` in place."""
    exchange = f"\n[user] {query}\n[assistant] {response}"
    chat_history.append(exchange)

In [20]:
# Local GPT4All model that the cells below query through `llm.invoke`.
llm = GPT4All(
    model="../models/mistral-7b-instruct-v0.1.Q4_0.gguf",
    backend="gptj",
    verbose=True,
    temp=0.3,
    top_p=0.6,
    repeat_penalty=1.18,
)

In [21]:
def get_response(llm, query, chat_history=None, db_name="rag_database", top_k=4, first=True):
    """Answer ``query`` with ``llm`` using excerpts retrieved from the database.

    Args:
        llm: Object exposing ``invoke(prompt)``; whatever it returns is handed
            back unchanged as the response.
        query: The user's question; it is also used as the search query.
        chat_history: Iterable of strings describing earlier turns. They are
            concatenated and inserted into the prompt only when ``first`` is
            False. ``None`` (the default) means no history.
        db_name: Database name forwarded to ``index.search_for_documents``.
        top_k: Number of results requested from the search.
        first: True for the opening question, which uses the prompt without a
            chat-history section.

    Returns:
        A tuple ``(response, sources_string, context_string)`` where
        ``sources_string`` lists one "title (entry_id)" line per distinct
        source and ``context_string`` is the excerpt text placed in the prompt.
    """
    # Avoid a shared mutable default argument.
    if chat_history is None:
        chat_history = []

    if first:
        prompt_template = """You are a helpful AI assistant. Use the provided excerpts to answer the user's question.
        
Excerpts:
{context}
        
NEW QUERY: {query}
RESPONSE: """
    else:
        prompt_template = """You are a helpful AI assistant. Use the chat history and provided excerpts to answer the user's question.

Chat History:
{chat_history}
    
Excerpts:
{context}
    
NEW QUERY: {query}
RESPONSE: """

    results = index.search_for_documents(query, db_name=db_name, top_k=top_k)

    # Group excerpts by source, keeping the order in which sources first appear.
    context_dict = {}
    for doc in results:
        entry_id = doc.metadata['entry_id']
        title = doc.metadata['title']
        source = context_dict.setdefault(entry_id, {"title": title, "excerpts": []})
        source["excerpts"].append(doc.page_content)

    # Collect the pieces and join once instead of repeated string concatenation.
    context_parts = []
    for entry_id, data in context_dict.items():
        context_parts.append(f"From: **{data['title']} ({entry_id})**\n")
        for number, excerpt in enumerate(data['excerpts'], start=1):
            context_parts.append(f"Excerpt {number}:\n{excerpt}\n\n")
    context_string = "".join(context_parts)

    chat_history_string = "".join(chat_history)

    # The first-turn template has no {chat_history} field; str.format ignores
    # the unused keyword argument.
    prompt = prompt_template.format(
        query=query,
        context=context_string,
        chat_history=chat_history_string,
    )

    response = llm.invoke(prompt)

    sources_string = "\nSOURCES:\n" + "\n".join(
        f"{data['title']} ({entry_id})" for entry_id, data in context_dict.items()
    )

    return response, sources_string, context_string

In [24]:
# Interactive chat session: load the model, then answer queries until the user
# types 'quit'.
print("Initializing LLM...")
llm = GPT4All(
        model="../models/mistral-7b-instruct-v0.1.Q4_0.gguf",
        backend="gptj",
        verbose=True,
        temp=0.3,
        top_p=0.6,
        repeat_penalty=1.18
    )
print("LLM initialized. Ready for queries!")

chat_history = []
# The first question uses the prompt without a chat-history section.
first = True
while True:
    query = input("\nEnter your query (or 'quit' to exit): ")
    # Ignore surrounding whitespace so that e.g. " quit " still ends the session.
    if query.strip().lower() == 'quit':
        break

    print("\nProcessing your query...")
    response, sources, context = get_response(llm, query, chat_history, top_k=3, first=first)

    # Record the exchange so that follow-up questions can refer back to it.
    update_history(chat_history, query, response)

    print("\nResponse:")
    print(response)
    print("\nSources used:")
    print(sources)
    print("\nExcerpts used:")
    print(context)

    first = False

Initializing LLM...
LLM initialized. Ready for queries!



Enter your query (or 'quit' to exit):  Describe prompt engineering techniques to enhance the performance of Retrieval-Augmented Generation systems step by step.



Processing your query...





Response:

Retrieval-Augmented Generation involves several steps, and each step requires specific techniques for prompt engineering. Here is a step-by-step guide on how to enhance the performance of Retrieval-Augmented Generation systems using prompt engineering techniques:

Step 1: Decomposing the original question into multi-step sub-questions on what information should be retrieved. This can be done by breaking down the question into smaller, more manageable parts that are easier for the LLM to process and understand. For example, if the question is "What are some popular books about artificial intelligence?", you could break it down into sub-questions like "Who wrote the book?" or "When was the book published?"

Step 2: Retrieving passages from augmented questions using LLMs. Once the original question has been decomposed, you can use LLMs to retrieve relevant passages based on the sub-questions generated in step 1. This involves training an LLM on a large corpus of text and provi


Enter your query (or 'quit' to exit):  Provide details about how the questions are augmented in the described RAG enhancement technique and its effectiveness and its effectiveness.



Processing your query...





Response:

In the described Retrieval-Augmented Generation (RAG) enhancement technique, the original question is decomposed into multi-step sub-questions on what information should be retrieved. This can be done by breaking down the question into smaller, more manageable parts that are easier for the LLM to process and understand. For example, if the question is "What are some popular books about artificial intelligence?", you could break it down into sub-questions like "Who wrote the book?" or "When was the book published?"

Once the original question has been decomposed, retrieval passages can be generated using LLMs. This involves training an LLM on a large corpus of text and providing it with specific prompts for each sub-question. The effectiveness of this technique is dependent on several factors such as the quality of the LLM used, the size and relevance of the corpus, and the accuracy of the retrieval passages generated.

It's also important to note that self-generation from a


Enter your query (or 'quit' to exit):  quit


## RAG - single questions

In [31]:
# Question for the single-shot example and the search results retrieved for it.
query = "What is ODQA?"
results = index.search_for_documents(
    query,
    db_name="rag_database",
    top_k=5,
)

In [32]:
# Group the retrieved excerpts by source, keyed on entry_id; the title stored
# for a source is the one seen on its first excerpt.
context = {}
for doc in results:
    source = context.setdefault(
        doc.metadata['entry_id'],
        {"title": doc.metadata['title'], "excerpts": []},
    )
    source["excerpts"].append(doc.page_content)

# Render one "From:" header per source followed by its numbered excerpts.
context_string = "".join(
    f"From: **{source['title']} ({source_id})**\n"
    + "".join(
        f"Excerpt {number}:\n{excerpt}\n\n"
        for number, excerpt in enumerate(source['excerpts'], start=1)
    )
    for source_id, source in context.items()
)

In [34]:
# Single-question prompt: the retrieved excerpts come first, then the query.
prompt_template = """You are a helpful AI assistant. Use the provided excerpts to answer the user's question.
        
Excerpts:
{context}
        
NEW QUERY: {query}
RESPONSE: """
prompt = prompt_template.format(
    context=context_string,
    query=query,
)

In [35]:
import time

In [36]:
# Time the RAG answer with the monotonic performance counter, which is not
# affected by system clock adjustments during the (long) model call.
start_time = time.perf_counter()
response = llm.invoke(prompt)
end_time = time.perf_counter()

print("Time: ", end_time - start_time)
print(response)

# One (title, entry_id) pair per distinct source that contributed excerpts.
sources = [(source['title'], source_id) for source_id, source in context.items()]
print("\nSOURCES:")
for source_title, source_id in sources:
    print(f'{source_title} ({source_id})')

Time:  360.79866123199463

ODQA stands for Online Distributed Query Answering. It refers to the process of answering queries in real-time using distributed systems and data sources. The concept of online query answering (OQA) has been around for a while, but with the rise of big data and distributed computing, it has become increasingly important to be able to answer queries quickly and efficiently across multiple machines or nodes. ODQA is often used in applications such as e-commerce, social media, and real-time analytics where there are large amounts of data that need to be processed and analyzed in real-time.

SOURCES:
SeCTIS: A Framework to Secure CTI Sharing (http://arxiv.org/abs/2406.14102v1)
Runtime Verification on Abstract Finite State Models (http://arxiv.org/abs/2406.12715v1)
Automatic generation of insights from workers' actions in industrial workflows with explainable Machine Learning (http://arxiv.org/abs/2406.12732v1)


In [37]:
# Baseline: send the bare question to the model, without retrieved excerpts.
print("Comparison -- response without using sources:")

# Use the monotonic performance counter so that the measured interval is not
# affected by system clock adjustments.
start_time = time.perf_counter()
response_no_rag = llm.invoke(query)
end_time = time.perf_counter()

print("Time: ", end_time - start_time)
print(response_no_rag)

Comparison -- response without using sources:
Time:  32.05125021934509

Answer: ODQA stands for Open Data Quality Assessment. It's a tool developed by the World Wide Web Consortium (W3C) to evaluate the quality of open data sets. The goal of ODQA is to provide a standardized way to assess the quality of open data, making it easier for users to determine which datasets are reliable and trustworthy.


## 3. Some ideas for improvement
1. Data preparation:  
    - Prepare the text chunks in the database in a way that separates different sections
    - Delete references [#] from excerpts
    - Improve arxiv search to make it more selective 
2. Effective search  
    - Separate the chatbot query (a question) and the similarity search query
    - Use multiple steps to iteratively search for information
3. Chat  
    - Implement chat threads
4. Other
    - Use better LLMs