# **Advanced RAG Technique and Evaluation**

In this notebook we compare standard RAG with compressor-based RAG.
We use GPT-3.5 Turbo as the LLM and a contextual compression retriever, which keeps only the most relevant information from the documents returned by the similarity search.

Requirements: Please run LangChainRAG/Embedding-OpenAI-Chroma.ipynb first to embed our medical documents. This notebook only applies GPT-3.5 Turbo as the LLM on top of those embeddings.

In [1]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.llms import OpenAI
from langchain.chains import RetrievalQA

## **Load Chroma and GPT3.5 Turbo LLM**
We first load the Chroma vector database.

In [2]:
from langchain.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings,GPT4AllEmbeddings,HuggingFaceBgeEmbeddings

In [3]:
import os

# Location of the persisted Chroma vector store written by the embedding notebook.
persist_directory = "E:\\NLPT\\_Q-A-INLPT-WS2023\\chroma_openai-003\\chroma_openai"

# Nothing is created here: the cell only reports whether the store is present.
if os.path.exists(persist_directory):
    print(f"Directory '{persist_directory}' exists, perfect!")
else:
    print(f"Please execute first LangChainRAG/Embedding-OpenAI-Chroma.ipynb, we didn't find any Chroma vector storage.")

Directory 'E:\NLPT\_Q-A-INLPT-WS2023\chroma_openai-003\chroma_openai' exists, perfect!


In [5]:
import json
from langchain.schema import Document
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from tqdm.auto import tqdm


class HybridSearch:
    """Hybrid retriever combining BM25 keyword search with a Chroma vector store.

    Intended call order: ``load_data`` -> ``transform_data_to_documents`` ->
    ``initialize_bm25_retriever`` and ``process_documents_with_chroma`` ->
    ``create_ensemble_retriever`` -> ``get_relevant_documents``.
    """

    def __init__(self, data_path, api_key=None):
        """Remember the data file and create the OpenAI embedding function.

        Args:
            data_path: Path of the JSON file read by ``load_data``.
            api_key: Optional OpenAI API key. When given, it is exported as the
                ``OPENAI_API_KEY`` environment variable. When omitted, the
                variable is expected to be set already by the caller.
        """
        self.data_path = data_path
        # Never hard-code credentials in the notebook: a key that has been
        # committed must be treated as leaked and revoked.
        if api_key:
            os.environ['OPENAI_API_KEY'] = api_key
        self.embedding = OpenAIEmbeddings()
        self.ensemble_retriever = None

    @staticmethod
    def _require_documents(docs):
        """Raise ``ValueError`` unless every item of ``docs`` is a ``Document``."""
        if not all(isinstance(doc, Document) for doc in docs):
            raise ValueError("All items in docs must be Document instances.")

    def load_data(self):
        """Return the parsed content of the JSON file at ``self.data_path``."""
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding.
        with open(self.data_path, 'r', encoding='utf-8') as file:
            return json.load(file)

    def initialize_bm25_retriever(self, docs):
        """Build a BM25 retriever (top 3 hits) over the documents' page content.

        Args:
            docs: Iterable of ``Document`` objects.

        Returns:
            A ``BM25Retriever`` with ``k`` set to 3.

        Raises:
            ValueError: If any item is not a ``Document``.
        """
        # Materialise once: the input is traversed several times below, which
        # would silently yield nothing for a one-shot iterator.
        docs = list(docs)
        self._require_documents(docs)
        abstracts = [doc.page_content for doc in docs]
        metadatas = [doc.metadata for doc in docs]
        bm25_retriever = BM25Retriever.from_texts(abstracts, metadatas=metadatas)
        bm25_retriever.k = 3
        return bm25_retriever

    def transform_data_to_documents(self, data):
        """Convert raw paper records into ``Document`` objects.

        Each record contributes its abstract text as page content and its
        title and keywords as metadata. Missing or empty fields fall back to
        ``''`` (title, abstract) or ``[]`` (keywords).

        Args:
            data: Iterable of dicts as returned by ``load_data``.

        Returns:
            A list with one ``Document`` per record.
        """
        docs = []
        for record in data:
            # "or {}" also covers fields that are present but null.
            title = (record.get('title') or {}).get('full_text', '')
            abstract = (record.get('abstract') or {}).get('full_text', '')
            raw_keywords = record.get('keywords')
            # Only a list whose first element is itself a list is accepted;
            # that first inner list becomes the keyword metadata.
            if (isinstance(raw_keywords, list) and raw_keywords
                    and isinstance(raw_keywords[0], list)):
                keywords = raw_keywords[0]
            else:
                keywords = []
            docs.append(
                Document(page_content=abstract,
                         metadata={'title': title, 'keywords': keywords})
            )
        return docs

    def process_documents_with_chroma(self, docs, persist_directory='./Chroma/chroma_openai'):
        """Open the persisted Chroma store and return a top-3 retriever on it.

        ``docs`` is only validated; nothing is added to the store here, so the
        retriever searches whatever the directory already contains.

        Args:
            docs: Iterable of ``Document`` objects (validated only).
            persist_directory: Directory of the persisted Chroma store.

        Returns:
            The Chroma retriever configured with ``k=3``.

        Raises:
            ValueError: If any item of ``docs`` is not a ``Document``.
        """
        # Validate before touching the on-disk store.
        self._require_documents(list(docs))
        store = Chroma(persist_directory=persist_directory, embedding_function=self.embedding)
        return store.as_retriever(search_kwargs={'k': 3})

    def create_ensemble_retriever(self, bm25_retriever, chroma_retriever):
        """Combine both retrievers and store the result on ``self.ensemble_retriever``.

        The weights follow the retriever order: 0.7 for BM25, 0.3 for Chroma.
        """
        self.ensemble_retriever = EnsembleRetriever(
            retrievers=[bm25_retriever, chroma_retriever],
            weights=[0.7, 0.3],
        )

    def get_relevant_documents(self, query):
        """Query the ensemble retriever and return plain dictionaries.

        Args:
            query: Search string passed to the ensemble retriever.

        Returns:
            A list of dicts with the keys ``'title'``, ``'keywords'`` and
            ``'abstract'``, one per retrieved document.

        Raises:
            RuntimeError: If ``create_ensemble_retriever`` has not been called.
        """
        if self.ensemble_retriever is None:
            raise RuntimeError(
                "Ensemble retriever is not initialised; "
                "call create_ensemble_retriever() first."
            )
        results = self.ensemble_retriever.get_relevant_documents(query)
        # Echo the raw hits, as the original notebook cell did.
        print(results)
        return [
            {
                # Fall back to the 'source/title' metadata key when 'title' is absent.
                'title': document.metadata.get(
                    'title', document.metadata.get('source/title', 'No Title')),
                'keywords': document.metadata.get('keywords', []),
                'abstract': document.page_content,
            }
            for document in results
        ]

# Assemble the hybrid retriever: parse the paper dump, wrap the records in
# Documents, then combine the BM25 and Chroma retrievers.
hs = HybridSearch(
    data_path='E:\\NLPT\\_Q-A-INLPT-WS2023\\Transfromer_project-20240228T221604Z-002\\Transfromer_project\\data\\papers_latest.json'
)
data = hs.load_data()
docs = hs.transform_data_to_documents(data=data)
bm25_retriever = hs.initialize_bm25_retriever(docs=docs)

# NOTE: despite its name, this holds the retriever returned by
# process_documents_with_chroma, not the vector store itself.
chroma_vectorstore = hs.process_documents_with_chroma(docs=docs)
hs.create_ensemble_retriever(
    bm25_retriever=bm25_retriever,
    chroma_retriever=chroma_vectorstore,
)

In [11]:
from langchain import hub
from langchain_openai import ChatOpenAI

from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


# Base retriever: the BM25 + Chroma ensemble held by `hs`.
retriever = hs.ensemble_retriever
# RAG prompt template pulled from the LangChain hub.
prompt = hub.pull("rlm/rag-prompt")
# temperature=0 is set on the chat model.
llm = ChatOpenAI(
    temperature=0,
    model_name="gpt-3.5-turbo",
)

def format_docs(docs):
    """Join the page content of all documents, separated by one blank line."""
    separator = "\n\n"
    contents = [item.page_content for item in docs]
    return separator.join(contents)

from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor,LLMChainFilter
from langchain.llms import OpenAI

# The extractor asks the LLM to pull out the relevant parts of each retrieved document.
compressor = LLMChainExtractor.from_llm(llm=llm)

# Wrap the base retriever so its hits pass through the compressor.
compression_retriever = ContextualCompressionRetriever(
    base_retriever=retriever,
    base_compressor=compressor,
)

# Chain: compressed context + untouched question -> prompt -> LLM -> plain string.
rag_chain_compressor = (
    {
        "context": compression_retriever | format_docs,
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)



## **Generate follow up Questions for compressed context**
Here we use the OpenAI API to generate follow-up questions from the compressed context, i.e. only the relevant parts of the retrieved documents.

In [49]:
import random, openai
def generate_questions(title, context, amount):
    """Ask the OpenAI chat model to write ``amount`` questions about ``context``.

    One of the following question styles is picked at random on every call:
       1. Confirmation questions [yes or no]
       2. Factoid-type questions [what, which, when, who, how]
       3. List-type questions
       4. Causal questions [why or how]
       5. Hypothetical questions [what if]
       6. Generic questions (no particular style is requested)

    Args:
        title: Accepted for interface compatibility but deliberately left out
            of the prompt, because including it tends to make the generated
            questions title-specific.
        context: Text the questions should be about.
        amount: Number of questions requested in the prompt.

    Returns:
        The raw message content of the completion. The prompt asks for a JSON
        object of the form {"question_1": ..., ...}, but the text is neither
        parsed nor validated here.
    """
    question_type = [
        "confirmation questions [yes or no]",
        "factoid-type questions [what, which, when, who, how]",
        "list-type questions",
        # "causal", not "casual": the style asks for why/how questions.
        "causal questions [why or how]",
        "hypothethical questions [e.g. what would happen if...]",
        "questions",
    ]
    question_style = random.choice(question_type)

    prompt = f"""Context: {context}\n\nI want to evaluate my document embeddings. Form {question_style} in your own words. Answer in this json format {{"question_1": '', ...}} (Exact Amount: {amount}), don't say anything else."""

    # Scale the completion budget with the number of questions; a fixed
    # 100-token cap can cut the JSON off when more questions are requested.
    token_budget = max(100, 60 * int(amount))

    # Only the first completion is returned, so a single one is requested
    # (the API default) instead of paying for an unused second choice.
    response = openai.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "assistant"},
            {"role": "user", "content": prompt},
        ],
        temperature=0.7,
        max_tokens=token_budget,
    )
    return response.choices[0].message.content

## **Generate answers using contextual compression**
Here we use LLMChainExtractor to keep only the relevant information from each document. We prepare the compressor-based retriever and generate answers in the same way as above.

In [50]:
query = 'What is ease-mm ?'
# Alternative query: 'How does the new prediction method, EASE-MM, select the final prediction model?'

# Retrieve and compress the context for the query.
compressed_docs = compression_retriever.get_relevant_documents(query)
# To see the RAG answer itself, run: rag_chain_compressor.invoke(query)

# Follow-up questions are generated from the first compressed document only;
# the query is passed in the `title` slot.
generated_questions = generate_questions(
    title=query,
    context=compressed_docs[0].page_content,
    amount=3,
)




In [51]:
# Display the raw string returned by generate_questions (the notebook echoes
# the value of the cell's last expression).
generated_questions

'{\n  "question_1": "How are document embeddings calculated for Ease-mm?",\n  "question_2": "Why are document embeddings used in evaluating protein stability changes?",\n  "question_3": "How do document embeddings help in predicting accurate protein stability changes induced by single amino acid substitutions?"\n}'