S009 Retriever Evaluator

- Evaluate the output of a RAG retriever against various LLMs
- Compare the generated answer to the expected answer and generate a correctness score


In [None]:
import os
from config import set_environment 
# Runs before any other cell reads os.environ; presumably this is what populates
# the API keys and run parameters used below — verify against config.py.
set_environment()

import logging
import sys
# Uncomment the two lines below to send INFO-level logging to stdout while debugging.
#logging.basicConfig(stream=sys.stdout, level=logging.INFO)
#logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

# Only for notebook: nest_asyncio patches asyncio so code can re-enter the
# event loop that the notebook kernel is already running.
import nest_asyncio
nest_asyncio.apply()

In [None]:
from llama_index.core import Settings

from llama_index.core.base.response.schema import Response
from llama_index.core import Settings
from llama_index.core.evaluation import (
    BatchEvalRunner,
    CorrectnessEvaluator,
)

from llama_index.llms.openai import OpenAI
from llama_index.llms.cohere import Cohere

from langchain_community.vectorstores import Chroma
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_anthropic import ChatAnthropic
from langchain_google_genai import ChatGoogleGenerativeAI, GoogleGenerativeAIEmbeddings
from langchain_cohere import ChatCohere, CohereEmbeddings
from langchain_together import ChatTogether
from langchain_fireworks import ChatFireworks
from langchain_core.documents import Document
from langchain_text_splitters import MarkdownHeaderTextSplitter
import tiktoken
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import DirectoryLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate

from langchain.callbacks.tracers import LangChainTracer
from langchain_core.tracers.context import tracing_v2_enabled

from langsmith import Client

from datetime import datetime
import pandas as pd
import json
import random

from evaluation_utils import threadpool_map



Choose the LLM for generation

In [None]:
# Provider family and model name of the LLM that writes the answers.
generation_llm_family = os.environ["GENERATION_LLM_FAMILY"]
generation_llm_model = os.environ["GENERATION_LLM_MODEL"]

# Every family is instantiated with temperature=0.
if generation_llm_family == "OPENAI":
    llm = ChatOpenAI(model_name=generation_llm_model, temperature=0)
elif generation_llm_family == "ANTHROPIC":
    llm = ChatAnthropic(model_name=generation_llm_model, temperature=0)
elif generation_llm_family == "GOOGLE":
    llm = ChatGoogleGenerativeAI(model=generation_llm_model, temperature=0)
elif generation_llm_family == "COHERE":
    llm = ChatCohere(model=generation_llm_model, temperature=0)
elif generation_llm_family == "META":
    # META models go through Fireworks; swap in
    # ChatTogether(model=generation_llm_model, temperature=0) to use Together instead.
    llm = ChatFireworks(model=generation_llm_model, temperature=0)
elif generation_llm_family in ("QWEN", "MISTRALAI"):
    llm = ChatTogether(model=generation_llm_model, temperature=0)
else:
    # Without this branch `llm` stays undefined and a later cell fails with an
    # unrelated-looking NameError.
    raise ValueError(
        f"Unsupported GENERATION_LLM_FAMILY {generation_llm_family!r}; expected one of "
        "OPENAI, ANTHROPIC, GOOGLE, COHERE, META, QWEN, MISTRALAI"
    )

Set the Embedding Model
- We won't actually be calculating any embeddings
- This is just for reporting purposes - record which model was used by the retriever

In [None]:
# Embedding model details, read from the environment (a missing variable raises KeyError).
embedding_llm_family, embedding_llm_model = (
    os.environ["EMBEDDING_LLM_FAMILY"],
    os.environ["EMBEDDING_LLM_MODEL"],
)
# NOTE(review): the variable is spelled EMBEDDING_DIMESIONS (no "N"); it has to match
# whatever defines it outside this notebook — confirm before renaming.
embedding_dimensions = int(os.environ["EMBEDDING_DIMESIONS"])


In [None]:
# Run configuration, all taken from the environment.
eval_name = os.environ["EVAL_NAME"]
eval_questions = os.environ["EVAL_QUESTIONS"]
eval_results_dir = os.environ["EVAL_RESULTS_DIR"]

rtr_output_file = os.environ["RTR_OUTPUT_FILE"]

rag_strategy = os.environ["RAG_STRATEGY"]
similarity_top_k = int(os.environ["SIMILARITY_TOP_K"])

# Short model labels for the batch id / file name: provider prefixes are removed.
# str.replace is a no-op when the prefix is absent, so no membership test is needed.
embed_string = embedding_llm_model.replace("models/", "")
generation_string = generation_llm_model
for provider_prefix in (
    "meta-llama/",
    "accounts/fireworks/models/",
    "Qwen/",
    "models/",
    "mistralai/",
):
    generation_string = generation_string.replace(provider_prefix, "")

# Human-readable description for each strategy id this notebook supports.
RAG_STRATEGY_DESCRIPTIONS = {
    "S009_00": "Fusion_AI_RTR_PDF",
}
if rag_strategy not in RAG_STRATEGY_DESCRIPTIONS:
    # Fail here rather than with a NameError on rag_strategy_desc in a later cell.
    raise ValueError(
        f"Unsupported RAG_STRATEGY {rag_strategy!r}; expected one of "
        f"{sorted(RAG_STRATEGY_DESCRIPTIONS)}"
    )
rag_strategy_desc = RAG_STRATEGY_DESCRIPTIONS[rag_strategy]

# The random three-digit suffix distinguishes repeated runs of the same configuration.
batch_id = f"{eval_name}_{rag_strategy}_GM_{generation_string}_EM_{embed_string}_{random.randint(0, 999):03}"

output_file = f"{eval_results_dir}/{batch_id}.xlsx"


In [None]:
# Provider family and model name of the LLM that judges answer correctness.
evaluation_llm_family = os.environ["EVALUATION_LLM_FAMILY"]
evaluation_llm_model = os.environ["EVALUATION_LLM_MODEL"]

# The judge is stored on Settings.eval_llm, where the evaluator cells read it.
if evaluation_llm_family == "OPENAI":
    Settings.eval_llm = OpenAI(temperature=0, model=evaluation_llm_model)
elif evaluation_llm_family == "COHERE":
    Settings.eval_llm = Cohere(api_key=os.environ["COHERE_API_KEY"], model=evaluation_llm_model, temperature=0)
else:
    # Without this branch Settings.eval_llm is never set and the evaluator cell fails instead.
    raise ValueError(
        f"Unsupported EVALUATION_LLM_FAMILY {evaluation_llm_family!r}; expected OPENAI or COHERE"
    )

In [None]:
# Load the retriever output file: one JSON object per line (JSON Lines).
data_list = []

# Explicit UTF-8 so decoding does not depend on the platform default encoding.
with open(rtr_output_file, 'r', encoding='utf-8') as file:
    for line_number, line in enumerate(file, start=1):
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not records, so they are not errors.
            continue
        try:
            data_list.append(json.loads(line))
        except json.JSONDecodeError as e:
            # A malformed line is reported and skipped; loading continues.
            print(f"Error decoding JSON on line {line_number}: {e}")

# Turn each retriever record into one row holding the question, the answer stored
# in the record, the concatenated chunk text, and the prompt for the generation LLM.
RETRIEVER_COLUMNS = ['query', 'generated_answer', 'retrieved_chunks', 'generated_prompt']
PROMPT_INSTRUCTIONS = (
    "Use the retrieved context, consisting of these documents, to answer the question. If you don't know the answer, just say that you don't know. Provide a detailed response, but do not invent stuff.\n"
)

data_processed = []

for entry in data_list:
    query = entry.get('question', '')
    generated_answer = entry.get('answer', '')
    # `or []` also covers records where "chunks" is present but null.
    chunks = entry.get('chunks') or []
    retrieved_chunks = "\n".join(chunk['chunkText'] for chunk in chunks)

    generated_prompt = (
        f"{PROMPT_INSTRUCTIONS}"
        f"Context:\n{retrieved_chunks}\n"
        f"Question:\n{query}"
    )

    data_processed.append({
        'query': query,
        'generated_answer': generated_answer,
        'retrieved_chunks': retrieved_chunks,
        'generated_prompt': generated_prompt
    })

# Explicit columns keep the frame usable (empty, but with the expected columns)
# when the input file contained no valid records.
retriever_df = pd.DataFrame(data_processed, columns=RETRIEVER_COLUMNS)

# Attach query_num and expected_answer from the evaluation workbook to each retriever row.
eval_questions_df = pd.read_excel(eval_questions, usecols=['query_num', 'query', 'expected_answer'])

# Normalise the join key on both sides. Cast to str first: .str.strip() on a
# non-string value yields NaN, which would then be stringified to 'nan'.
retriever_df['query'] = retriever_df['query'].astype(str).str.strip()
eval_questions_df['query'] = eval_questions_df['query'].astype(str).str.strip()

# Check for unique keys
print(f"Retriever unique queries: {retriever_df['query'].nunique()} out of {len(retriever_df)}")
print(f"Eval Questions unique queries: {eval_questions_df['query'].nunique()} out of {len(eval_questions_df)}")

# Drop duplicates if any (the first occurrence of each query is kept)
eval_questions_df = eval_questions_df.drop_duplicates(subset='query')
retriever_df = retriever_df.drop_duplicates(subset='query')

# Drop columns left by a previous run of this cell so re-running it does not
# produce query_num_x / query_num_y instead of query_num.
retriever_df = retriever_df.drop(columns=['query_num', 'expected_answer'], errors='ignore')

# Perform the merge; the left join keeps retriever rows without a matching question.
retriever_df = pd.merge(retriever_df, eval_questions_df, on='query', how='left')

unmatched_count = int(retriever_df['query_num'].isna().sum())
if unmatched_count:
    print(f"Warning: {unmatched_count} retriever queries have no matching eval question (query_num is NaN)")


# Write the merged retriever frame as JSON Lines (one record per line) in the working directory.
output_file_path = 'df_retriever.jsonl'
retriever_df.to_json(
    path_or_buf=output_file_path,
    lines=True,
    orient='records',
)
print(f"DataFrame saved to {output_file_path}")



In [None]:
# Generation chain: the selected chat model piped into StrOutputParser, so the
# result of invoke() is whatever the parser produces from the model output.
generation_chain = llm | StrOutputParser()

Quick Test

In [None]:
# Smoke-test the generation chain on a single prompt before running the whole batch.
eval_quick_test = None

if retriever_df.empty:
    # Nothing to send; invoking the chain with None would only raise.
    print("Quick test skipped: retriever_df is empty")
else:
    # Row 10 is used when it exists; shorter frames fall back to their last row.
    quick_test_row = retriever_df.iloc[min(10, len(retriever_df) - 1)]
    eval_quick_test = quick_test_row['generated_prompt']

    response = generation_chain.invoke(eval_quick_test)
    # "Question" shows the bare query; the full prompt is printed last.
    print(f"Question:{quick_test_row['query']}{chr(10)}")
    print(f"Response:{chr(10)}{response}")
    print(f"{chr(10)}{chr(10)}Prompt:{chr(10)}{eval_quick_test}{chr(10)}")

In [None]:
def run_rag_pipeline(row):
    """Generate an answer for one evaluation row.

    Args:
        row: Mapping-like object providing "query_num" and "generated_prompt".

    Returns:
        dict with "query_num" (copied from the row) and "generated_answer"
        (the value returned by generation_chain.invoke).
    """
    query_num = row["query_num"]

    # parameter_2 .. parameter_5 are sent as empty strings.
    blank_parameters = {f"parameter_{slot}": "" for slot in range(2, 6)}

    # Metadata passed in the invoke config; batch_id and query_num are what
    # the usage cells later look up on the recorded runs.
    run_metadata = {
        "eval_name": eval_name,
        "batch_id": batch_id,
        "query_num": query_num,
        "rag_strategy": rag_strategy,
        "rag_strategy_desc": rag_strategy_desc,
        "parameter_1": similarity_top_k,
        **blank_parameters,
        "model": generation_llm_model,
        "embed_model": embedding_llm_model,
        "embed_dimensions": embedding_dimensions,
    }
    run_config = {"metadata": run_metadata}

    # The run is traced under a project named after the evaluation.
    with tracing_v2_enabled(project_name=eval_name):
        answer = generation_chain.invoke(row["generated_prompt"], run_config)

    return {"query_num": query_num, "generated_answer": answer}


In [None]:
# One work item per retriever row, each a dict keyed by "row" — presumably
# threadpool_map unpacks it into run_rag_pipeline(row=...); verify in evaluation_utils.
pipeline_inputs = [{"row": row} for _, row in retriever_df.iterrows()]
results = threadpool_map(run_rag_pipeline, pipeline_inputs, num_workers=1)

In [None]:
# Join the generated answers back onto the evaluation questions by query_num.
df = eval_questions_df.merge(pd.DataFrame(results), on="query_num", how="inner")

# Ensure that all queries have been processed; on failure, name the questions
# that have no generated answer so the gap can be traced to the retriever output.
missing_query_nums = eval_questions_df.loc[
    ~eval_questions_df["query_num"].isin(df["query_num"]), "query_num"
].tolist()
assert len(df) == len(eval_questions_df), (
    f"Merged {len(df)} rows for {len(eval_questions_df)} eval questions; "
    f"query_num without a generated answer: {missing_query_nums}"
)

In [None]:
# NOTE: CorrectnessEvaluator is already imported in the main import cell, so
# this re-import is redundant.
from llama_index.core.evaluation import (
    CorrectnessEvaluator,
)
# Correctness evaluator backed by the LLM stored on Settings.eval_llm.
# The following cell constructs the same evaluator again and rebinds this name.
eval_lidx_c = CorrectnessEvaluator(llm=Settings.eval_llm)

In [None]:
eval_lidx_c = CorrectnessEvaluator(llm=Settings.eval_llm)

runner = BatchEvalRunner(
    {"correctness": eval_lidx_c},
    workers=16,
)

LI_eval_results = await runner.aevaluate_responses(
    queries=df["query"].tolist(),
    responses=[Response(response=x) for x in df["generated_answer"].tolist()],
    reference=[{"reference": x} for x in df["expected_answer"].tolist()],
)

In [None]:
# Keep the evaluation result objects on the frame, then pull their score and
# feedback attributes out into plain columns.
correctness_results = LI_eval_results["correctness"]
df["correctness_result"] = correctness_results
df["correctness_llm"] = df["correctness_result"].map(lambda result: result.score)
df["feedback_llm"] = df["correctness_result"].map(lambda result: result.feedback)

average_score = df["correctness_llm"].mean()
print(f"""Average score: {average_score}""")

In [None]:
# "Responses" sheet: one row per question plus the run configuration repeated on every row.
# .assign() returns a new frame, so `df` is not modified and no
# SettingWithCopyWarning is raised by writing to a slice of it.
responses_df = df[
    ['query_num', 'query', 'expected_answer', 'generated_answer', 'correctness_llm']
].assign(
    # Human score starts as a copy of the LLM score.
    correctness_human=lambda frame: frame['correctness_llm'],
    # Faithfulness columns are created empty.
    faithfulness_llm="",
    faithfulness_human="",
    rag_strategy=rag_strategy,
    rag_strategy_desc=rag_strategy_desc,
    parameter_1=similarity_top_k,
    parameter_2="",
    parameter_3="",
    parameter_4="",
    parameter_5="",
    model=generation_string,
    embed_model=embedding_llm_model,
    eval_model=evaluation_llm_model,
    embed_dimensions=embedding_dimensions,
    reranker="",
    run_date=datetime.today().strftime('%Y-%m-%d'),
    eval_name=eval_name,
    batch_id=batch_id,
)

In [None]:
# Fetch the root runs recorded in the eval_name project whose metadata carries this batch_id.
client = Client()
# list_runs returns a one-shot iterator; materialising it lets the cell that
# consumes `runs` be re-run without silently producing an empty frame.
runs = list(
    client.list_runs(
        project_name=eval_name,
        filter=f"and(eq(metadata_key, 'batch_id'), eq(metadata_value, '{batch_id}'))",
        is_root=True,
    )
)

In [None]:
# Collect token usage, cost and timing for each run into one row per query_num.
USAGE_COLUMNS = [
    "query_num",
    "total_tokens",
    "prompt_tokens",
    "completion_tokens",
    "total_cost",
    "prompt_cost",
    "completion_cost",
    "latency",
    "first_token_ms",
]


def _format_cost(cost):
    """Return the cost as a "$0.0000"-style string, or None when no cost was recorded.

    The check is `is None` rather than truthiness so a genuine zero cost is
    reported as "$0.0000" instead of being dropped.
    """
    return None if cost is None else f"${cost:.4f}"


def _seconds_between(start, end):
    """Return the seconds from start to end, or None when end is missing."""
    return None if end is None else (end - start).total_seconds()


usage_data = []

for run in runs:
    latency_seconds = _seconds_between(run.start_time, run.end_time)
    first_token_seconds = _seconds_between(run.start_time, run.first_token_time)

    usage_data.append(
        {
            "query_num": run.extra["metadata"]["query_num"],
            "total_tokens": run.total_tokens,
            "prompt_tokens": run.prompt_tokens,
            "completion_tokens": run.completion_tokens,
            "total_cost": _format_cost(run.total_cost),
            "prompt_cost": _format_cost(run.prompt_cost),
            "completion_cost": _format_cost(run.completion_cost),
            # Pending runs have no end time
            "latency": latency_seconds,
            # None when the run has no first-token timestamp
            "first_token_ms": None if first_token_seconds is None else first_token_seconds * 1000,
        }
    )

# Explicit columns keep "query_num" present even when no runs were returned,
# so the merge on query_num in the next cell still works.
usage_df = pd.DataFrame(usage_data, columns=USAGE_COLUMNS)

In [None]:
# Add the usage columns to each response row; rows without a matching run keep NaN.
responses_df = pd.merge(responses_df, usage_df, how='left', on='query_num')

In [None]:
# Aggregate the LLM correctness scores into a two-row summary table.
llm_scores = df['correctness_llm']
correctness_sum = llm_scores.sum()
correctness_mean = llm_scores.mean()

# One (metric, value) pair per summary row.
summary_df = pd.DataFrame(
    [('Sum', correctness_sum), ('Mean', correctness_mean)],
    columns=['Metric', 'Value'],
)

In [None]:
# "Correctness" sheet: scores and feedback per question, with empty/duplicated
# columns for a human reviewer to fill in.
# .assign() returns a new frame, so `df` is not modified and no
# SettingWithCopyWarning is raised by writing to a slice of it.
correctness_df = df[
    ['query_num', 'query', 'expected_answer', 'generated_answer', 'correctness_llm', 'feedback_llm']
].assign(
    # Human score starts as a copy of the LLM score.
    correctness_human=lambda frame: frame['correctness_llm'],
    feedback_human="",
    batch_id=batch_id,
)

In [None]:
# Write the three result frames into one workbook, one sheet each, without the index column.
sheets = (
    ("Responses", responses_df),
    ("Summary", summary_df),
    ("Correctness", correctness_df),
)
with pd.ExcelWriter(output_file) as writer:
    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

