In [1]:
# import models
import openai
from llama_index.llms.openai import OpenAI

from llama_index.llms.anthropic import Anthropic

from llama_index.llms.cohere import Cohere
from llama_index.embeddings.cohere import CohereEmbedding

from llama_index.llms.gemini import Gemini
from llama_index.embeddings.gemini import GeminiEmbedding

from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.llms.openai import OpenAI
from llama_index.embeddings.openai import OpenAIEmbedding

from llama_index.core.indices.query.query_transform import HyDEQueryTransform
from llama_index.core.query_engine import TransformQueryEngine, RetrieverQueryEngine

from llama_index.core import VectorStoreIndex, PromptTemplate, ServiceContext

from llama_index.postprocessor.cohere_rerank import CohereRerank
from llama_index.core.postprocessor import LLMRerank
from llama_index.core.response.pprint_utils import pprint_response

from llama_index.core import (
    VectorStoreIndex,
    SimpleDirectoryReader,
)

import pandas as pd


In [2]:
import os
from getpass import getpass

# Credentials are read from the environment so that real keys never have to be
# written into (and committed with) the notebook. The string fallbacks are the
# original placeholder values: they keep the cell runnable on a machine without
# the variables set, but the providers will reject them.

# OpenAI
# setdefault() keeps an OPENAI_API_KEY that is already exported instead of
# overwriting it with the placeholder.
os.environ.setdefault("OPENAI_API_KEY", 'OPENAI_API_KEY')

# Cohere
API_KEY_COHERE = os.environ.get("COHERE_API_KEY", 'API_KEY_COHERE')
# Anthropic (the variable keeps its original spelling because other cells use it)
API_KEY_ANTROPHIC = os.environ.get("ANTHROPIC_API_KEY", "API_KEY_ANTROPHIC")
# Google
API_KEY_GOOGLE = os.environ.get("GOOGLE_API_KEY", 'API_KEY_GOOGLE')


In [None]:
# Folder that holds the context files; it is searched recursively and only
# files with a .pdf extension are picked up.
input_dir_path = '../context_files'

loader = SimpleDirectoryReader(
    input_dir=input_dir_path,
    recursive=True,
    required_exts=[".pdf"],
)

# Result of the reader's load_data() call, consumed by the index cell.
documents = loader.load_data()

In [None]:
from llama_index.core import Settings

# Provider evaluated by this run of the notebook.
# Previously all four llm/embed_model pairs were assigned one after the other,
# so only the last pair (Gemini) was ever registered and the other three
# clients were built and thrown away. Selecting by name keeps Gemini as the
# default but makes the choice explicit; change PROVIDER and re-run from here
# (including the index cell) to evaluate another model.
PROVIDER = "gemini"  # one of: "openai", "anthropic", "cohere", "gemini"


def _build_models(provider):
    """Return the ``(llm, embed_model)`` pair for ``provider``.

    Args:
        provider: "openai", "anthropic", "cohere" or "gemini" (case-insensitive).

    Returns:
        Tuple ``(llm, embed_model)`` built with the same model names and
        arguments the notebook used for that provider.

    Raises:
        ValueError: if ``provider`` is not one of the supported names.
    """
    name = provider.lower()

    if name == "openai":
        # OpenAI - GPT-4o
        return (
            OpenAI(model="gpt-4o"),
            OpenAIEmbedding(model="text-embedding-3-large"),
        )

    if name == "anthropic":
        # Anthropic - Claude 3 Opus, paired with the OpenAI embedding model
        return (
            Anthropic(model="claude-3-opus-20240229", api_key=API_KEY_ANTROPHIC),
            OpenAIEmbedding(model="text-embedding-3-large"),
        )

    if name == "cohere":
        # Cohere - Command R+
        return (
            Cohere(api_key=API_KEY_COHERE, model="command-r-plus"),
            CohereEmbedding(
                cohere_api_key=API_KEY_COHERE,
                model_name="embed-english-v3.0",
                input_type="search_query",
            ),
        )

    if name == "gemini":
        # Google - Gemini 1.5
        # The embedding client now receives the Google key explicitly; before,
        # only the LLM was given it.
        return (
            Gemini(api_key=API_KEY_GOOGLE, model="models/gemini-1.5-pro-latest"),
            GeminiEmbedding(model_name="models/text-embedding-004", api_key=API_KEY_GOOGLE),
        )

    raise ValueError(
        f"Unknown provider {provider!r}; expected 'openai', 'anthropic', 'cohere' or 'gemini'"
    )


llm, embed_model = _build_models(PROVIDER)

# Register the selected models as the global llama-index defaults.
Settings.llm = llm
Settings.embed_model = embed_model

# NOTE(review): ServiceContext is deprecated in favour of Settings in recent
# llama-index releases, and nothing in the visible part of this notebook reads
# service_context. Kept for backward compatibility - confirm and remove.
service_context = ServiceContext.from_defaults(llm = llm)


In [None]:
# Build the vector index from the loaded documents, with a progress display.
# No models are passed here - presumably the ones registered on Settings are
# used; confirm against the llama-index version in use.
index = VectorStoreIndex.from_documents(
    documents,
    show_progress=True,
)

# Prompt Template

In [6]:
# Question-answering prompt handed to the query engines. {context_str} and
# {query_str} are the two placeholders in the template.
# Fixed: the citation style was misspelled "Havard".
# NOTE(review): the template both allows falling back on pretrained knowledge
# (second sentence) and ends with "and not prior knowledge"; the two
# instructions contradict each other. Left unchanged because resolving it
# changes the experiment - decide which behaviour is intended.
text_qa_template_str = (
    """  
    You are an expert in Nuclear Medicine that is trusted around the world for your factual accuracy.
    If the context isn't helpful, you can also answer the question on your pretrained knowledge

    Some rules to follow:
    1. Directly reference the given context according to Harvard citation style in your answer. Do not include the path where the document is saved.
    2. Avoid statements like 'Based on the context, ...' or 'The context information ...' or anything along those lines.
    3. Focus on succinct answers that provide only the facts necessary, do not be verbose.
    4. Answer the question in the language of the question (if the question is in German, answer in German. If the question is in English answer in English)
    ---------------------
    {context_str}
    ---------------------
    Given the context information and not prior knowledge, answer the query.
    Query: {query_str}
    Answer: """)
text_qa_template = PromptTemplate(text_qa_template_str)

# Query Engines

In [7]:
# Naive RAG: query engine built straight from the index with the custom QA
# prompt, similarity_top_k=3 and no post-processing.
query_engine_naive = index.as_query_engine(
    text_qa_template=text_qa_template,
    similarity_top_k=3,
    llm=llm,
)

In [8]:
# LLM Rerank: post-processor configured with top_n=3 and a choice batch size of 10.
llm_rerank = LLMRerank(top_n=3, choice_batch_size=10)

# Query engine with similarity_top_k=10 whose retrieved nodes pass through the
# reranker before the custom QA prompt is applied.
query_engine_llm_rerank = index.as_query_engine(
    llm=llm,
    similarity_top_k=10,
    node_postprocessors=[llm_rerank],
    text_qa_template=text_qa_template,
)

# HyDE query transform, created with include_original=True.
hyde = HyDEQueryTransform(include_original=True)

# HyDE + LLM Rerank: the rerank engine wrapped in the HyDE transform. This is
# the engine used by get_answers().
query_engine_hyde_llm_rerank = TransformQueryEngine(query_engine_llm_rerank, hyde)

# get_answers function

In [11]:
def get_answers(dataframe, answer_list, context_list, llm, query_engine=None):
    """Run every question in ``dataframe`` through a query engine and collect the results.

    For each row, the text in the ``Question`` column is sent to the engine.
    The response object is appended to ``answer_list`` and the list of its
    source-node texts to ``context_list``, so both lists grow by exactly one
    entry per row. The frame is modified in place: ``LLM`` is set to ``llm``
    and ``' advanced RAG'`` is appended to ``QuestionId`` (only once, so
    re-running on the same frame does not stack the suffix).

    A failing query does not abort the run: the error is printed and ``None``
    / ``[]`` are recorded for that row. Previously the exception was swallowed
    and the code went on to use either an undefined ``response`` or the one
    left over from the previous row.

    Args:
        dataframe: pandas DataFrame with ``Question`` and ``QuestionId``
            columns. Modified in place.
        answer_list: list receiving one response object (or ``None``) per row.
        context_list: list receiving one list of source texts per row.
        llm: label written to the ``LLM`` column; it does not select a model.
        query_engine: object exposing ``query(str)``. Defaults to the
            notebook-level ``query_engine_hyde_llm_rerank``.

    Returns:
        None.
    """
    if query_engine is None:
        query_engine = query_engine_hyde_llm_rerank

    suffix = ' advanced RAG'

    # Both columns receive strings below. Casting to object first avoids
    # writing text into a numeric column (e.g. integer ids read from CSV).
    if 'LLM' not in dataframe.columns:
        dataframe['LLM'] = None
    for column in ('LLM', 'QuestionId'):
        dataframe[column] = dataframe[column].astype(object)

    for row in dataframe.index:
        user_prompt = dataframe.at[row, 'Question']
        print(user_prompt)

        try:
            response = query_engine.query(user_prompt)
        except Exception as exc:
            # Best effort: keep going, but leave an explicit gap for this row.
            print(f"Query failed for row {row}: {exc!r}")
            response = None
            context = []
        else:
            # Texts of the nodes the response was generated from.
            context = [node.text for node in response.source_nodes]

        answer_list.append(response)
        context_list.append(context)

        dataframe.at[row, 'LLM'] = llm

        # Tag the question id as belonging to the advanced-RAG run.
        question_id = str(dataframe.at[row, 'QuestionId'])
        if not question_id.endswith(suffix):
            question_id += suffix
        dataframe.at[row, 'QuestionId'] = question_id

        print(response)
        print(row)

# Anthropic Advanced RAG

In [28]:
# Claude 3 Opus - advanced RAG evaluation run.
# The whole run lives in one cell: get_answers() appends to the lists it is
# given, so the question frame and the result lists are recreated here every
# time to keep a re-run from stacking duplicate entries.
claude_opus_context_advancedRAG_question_df = pd.read_csv("../Evaluation-Questions.csv")
claude_answers = []
claude_context = []

# Create the column before get_answers() adds its own, so the column order of
# the written CSV stays the same; the real values are filled in below.
claude_opus_context_advancedRAG_question_df['context'] = pd.NA

get_answers(claude_opus_context_advancedRAG_question_df, claude_answers, claude_context, 'claude-3-opus-20240229 advanced RAG')

# One entry per question row. Binding the results to the frame's index makes a
# length mismatch raise instead of being silently aligned away.
claude_opus_context_advancedRAG_question_df['answer'] = pd.Series(
    claude_answers,
    index=claude_opus_context_advancedRAG_question_df.index,
    dtype=object,
)
claude_opus_context_advancedRAG_question_df['context'] = pd.Series(
    claude_context,
    index=claude_opus_context_advancedRAG_question_df.index,
    dtype=object,
)

filename = './csv-advancedRAG/Claude-3-opus_context_advancedRAG_EvaluationQuestions.csv'
# to_csv does not create missing folders.
os.makedirs(os.path.dirname(filename), exist_ok=True)
claude_opus_context_advancedRAG_question_df.to_csv(filename, sep=',', index=False, encoding='utf-8')

# Cohere Advanced RAG

In [38]:
# Cohere Command R+ - advanced RAG evaluation run.
# The whole run lives in one cell: get_answers() appends to the lists it is
# given, so the question frame and the result lists are recreated here every
# time to keep a re-run from stacking duplicate entries.
command_r_plus_context_advancedRAG_question_df = pd.read_csv("../Evaluation-Questions.csv")
command_answers = []
command_context = []

# Create the column before get_answers() adds its own, so the column order of
# the written CSV stays the same; the real values are filled in below.
command_r_plus_context_advancedRAG_question_df['context'] = pd.NA

get_answers(command_r_plus_context_advancedRAG_question_df, command_answers, command_context, 'command-r-plus advanced RAG')

# One entry per question row. Binding the results to the frame's index makes a
# length mismatch raise instead of being silently aligned away.
command_r_plus_context_advancedRAG_question_df['answer'] = pd.Series(
    command_answers,
    index=command_r_plus_context_advancedRAG_question_df.index,
    dtype=object,
)
command_r_plus_context_advancedRAG_question_df['context'] = pd.Series(
    command_context,
    index=command_r_plus_context_advancedRAG_question_df.index,
    dtype=object,
)

filename = './csv-advancedRAG/Command_r_plus_context_advancedRAG_EvaluationQuestions.csv'
# to_csv does not create missing folders.
os.makedirs(os.path.dirname(filename), exist_ok=True)
command_r_plus_context_advancedRAG_question_df.to_csv(filename, sep=',', index=False, encoding='utf-8')

# OpenAI Advanced RAG

GPT-4

In [15]:
# GPT-4 - advanced RAG evaluation run.
# The whole run lives in one cell: get_answers() appends to the lists it is
# given, so the question frame and the result lists are recreated here every
# time to keep a re-run from stacking duplicate entries.
gpt_4_context_advancedRAG_question_df = pd.read_csv("../Evaluation-Questions.csv")
gpt_answers = []
gpt_context = []

# Create the column before get_answers() adds its own, so the column order of
# the written CSV stays the same; the real values are filled in below.
gpt_4_context_advancedRAG_question_df['context'] = pd.NA

# NOTE(review): the label below says gpt-4-turbo. It is only a label - make
# sure the LLM configured in the settings cell really is gpt-4-turbo before
# running this section.
get_answers(gpt_4_context_advancedRAG_question_df, gpt_answers, gpt_context, 'gpt-4-turbo advanced RAG')

# One entry per question row. Binding the results to the frame's index makes a
# length mismatch raise instead of being silently aligned away.
gpt_4_context_advancedRAG_question_df['answer'] = pd.Series(
    gpt_answers,
    index=gpt_4_context_advancedRAG_question_df.index,
    dtype=object,
)
gpt_4_context_advancedRAG_question_df['context'] = pd.Series(
    gpt_context,
    index=gpt_4_context_advancedRAG_question_df.index,
    dtype=object,
)

filename = './csv-advancedRAG/GPT_4_context_advancedRAG_EvaluationQuestions.csv'
# to_csv does not create missing folders.
os.makedirs(os.path.dirname(filename), exist_ok=True)
gpt_4_context_advancedRAG_question_df.to_csv(filename, sep=',', index=False, encoding='utf-8')

GPT-4o

In [12]:
# GPT-4o - advanced RAG evaluation run.
# The whole run lives in one cell: get_answers() appends to the lists it is
# given, so the question frame and the result lists are recreated here every
# time to keep a re-run from stacking duplicate entries.
gpt_4o_context_advancedRAG_question_df = pd.read_csv("../Evaluation-Questions.csv")
gpto_answers = []
gpto_context = []

# Create the column before get_answers() adds its own, so the column order of
# the written CSV stays the same; the real values are filled in below.
gpt_4o_context_advancedRAG_question_df['context'] = pd.NA

get_answers(gpt_4o_context_advancedRAG_question_df, gpto_answers, gpto_context, 'gpt-4o advanced RAG')

# One entry per question row. Binding the results to the frame's index makes a
# length mismatch raise instead of being silently aligned away.
gpt_4o_context_advancedRAG_question_df['answer'] = pd.Series(
    gpto_answers,
    index=gpt_4o_context_advancedRAG_question_df.index,
    dtype=object,
)
gpt_4o_context_advancedRAG_question_df['context'] = pd.Series(
    gpto_context,
    index=gpt_4o_context_advancedRAG_question_df.index,
    dtype=object,
)

filename = './csv-advancedRAG/GPT_4o_context_advancedRAG_EvaluationQuestions.csv'
# to_csv does not create missing folders.
os.makedirs(os.path.dirname(filename), exist_ok=True)
gpt_4o_context_advancedRAG_question_df.to_csv(filename, sep=',', index=False, encoding='utf-8')

# Gemini Advanced RAG

In [89]:
# Gemini 1.5 Pro - advanced RAG evaluation run.
# The whole run lives in one cell: get_answers() appends to the lists it is
# given, so the question frame and the result lists are recreated here every
# time to keep a re-run from stacking duplicate entries.
gemini_context_advancedRAG_question_df = pd.read_csv("../Evaluation-Questions.csv")
gemini_answers = []
gemini_context = []

# Create the column before get_answers() adds its own, so the column order of
# the written CSV stays the same; the real values are filled in below.
gemini_context_advancedRAG_question_df['context'] = pd.NA

get_answers(gemini_context_advancedRAG_question_df, gemini_answers, gemini_context, 'gemini-1.5-pro-latest advanced RAG')

# One entry per question row. Binding the results to the frame's index makes a
# length mismatch raise instead of being silently aligned away.
gemini_context_advancedRAG_question_df['answer'] = pd.Series(
    gemini_answers,
    index=gemini_context_advancedRAG_question_df.index,
    dtype=object,
)
gemini_context_advancedRAG_question_df['context'] = pd.Series(
    gemini_context,
    index=gemini_context_advancedRAG_question_df.index,
    dtype=object,
)

filename = './csv-advancedRAG/Gemini-1-5_context_advancedRAG_EvaluationQuestions.csv'
# to_csv does not create missing folders.
os.makedirs(os.path.dirname(filename), exist_ok=True)
gemini_context_advancedRAG_question_df.to_csv(filename, sep=',', index=False, encoding='utf-8')