In [11]:
# Prompt template for the LLM judge that grades one RAG response.
# Placeholders: {question}, {context}, {ground_truth}, {response}; they are
# mapped to CSV columns through `prompt_var_to_column_mapping` when the prompt
# is handed to CustomPromptEval.
# The five verdict labels listed at the end of the prompt are the same strings
# as the entries of `choices`.
hallucination_prompt = """
You are an expert evaluator analyzing responses generated by a retrieval-augmented generation (RAG) system for accuracy and hallucination.
The RAG system's task is to answer based solely on the information provided in the ground truth and context. 
You will determine if the RAG system's response includes any information that was not present in the ground truth or context.

Question: {question}

Context:
{context}

Ground Truth:
{ground_truth}

RAG System's Response:
{response}

Instructions:
1. If the response accurately answers the question based on the provided ground truth and context, mark it as "Accurate."
2. If the response is correct but includes minor details that are reasonable but not explicitly in the ground truth or context, mark it as "Minor Additional Detail."
3. If the response says that information is missing, such as "This information is not in the context" or "I cannot provide this information," mark it as "Information Not Found" rather than hallucination.
4. If the response is grounded in the context or ground truth but does not directly answer the question, mark it as "Contextually Relevant but Off-Topic."
5. If the response introduces fabricated or unverified information that is not present in the ground truth or context, mark it as "Hallucination."

Respond with one of the following:
- Accurate
- Minor Additional Detail
- Information Not Found
- Contextually Relevant but Off-Topic
- Hallucination
"""


In [12]:
# Verdict labels for the refined prompt, each written next to the score it earns.
# Transposing the (label, score) pairs yields the two parallel lists, so a label
# can never be left without a matching score.
choices, choice_scores = map(list, zip(
    ("Accurate", 1.0),
    ("Minor Additional Detail", 0.8),
    ("Information Not Found", 0.6),
    ("Contextually Relevant but Off-Topic", 0.4),
    ("Hallucination", 0.0),
))


In [13]:
import os

import pandas as pd

# Location of the CSV holding the RAG responses to grade.
# Set the LLM_RESPONSES_CSV environment variable to point at your own copy;
# when it is unset the notebook falls back to the original author's path.
DEFAULT_RESPONSES_CSV = 'C:/Users/Emumba/Documents/genie research/llm-testing-main/llm-testing-main/hallucination/llm_responses.csv'
responses_csv_path = os.environ.get("LLM_RESPONSES_CSV", DEFAULT_RESPONSES_CSV)

# Load data from CSV
data = pd.read_csv(responses_csv_path)

# Ensure the data contains the necessary columns, and name the ones that are
# absent so a malformed file can be fixed without guessing.
required_columns = ["query", "ground truth", "response", "context"]
missing_columns = [column for column in required_columns if column not in data.columns]
if missing_columns:
    raise ValueError(
        f"CSV file must contain the following columns: {required_columns}; "
        f"missing: {missing_columns}"
    )

# Mapping data columns to prompt variables
# (key = placeholder name used in the prompt, value = CSV column name)
prompt_var_to_column_mapping = {
    "question": "query",
    "ground_truth": "ground truth",
    "response": "response",
    "context": "context"
}

In [None]:
import os

import pandas as pd
from uptrain import CustomPromptEval, EvalLLM, Settings

# Set up the evaluation using Uptrain.
# The OpenAI key is read from the OPENAI_API_KEY environment variable so the
# secret never has to be typed into (or committed with) the notebook. When the
# variable is unset this falls back to the empty string the cell used before.
eval_llm = EvalLLM(
    settings=Settings(
        openai_api_key=os.environ.get("OPENAI_API_KEY", ""),
        response_format={"type": "json_object"},
    )
)

# Perform evaluation: every CSV row becomes one record, graded by the custom
# hallucination prompt with the label/score table defined earlier.
results = eval_llm.evaluate(
    data=data.to_dict(orient="records"),
    checks=[CustomPromptEval(
        prompt=hallucination_prompt,
        choices=choices,
        choice_scores=choice_scores,
        prompt_var_to_column_mapping=prompt_var_to_column_mapping
    )]
)

# Process and save results: keep only the columns needed for review.
output_data = []
for result in results:
    output_data.append({
        "query": result["query"],
        "response": result["response"],
        "ground truth": result["ground truth"],
        "score": result.get("score_custom_prompt", "No score")  # Retrieve score or assign "No score" if missing
    })

# Convert to DataFrame
output_df = pd.DataFrame(output_data)

# Save to CSV (written to the current working directory)
output_df.to_csv('evaluation_results.csv', index=False)

print("Results saved to 'evaluation_results.csv'")


100%|██████████| 10/10 [00:06<00:00,  1.66it/s]
  with ThreadPoolExecutor(max_workers=1) as executor:
[32m2024-10-30 16:45:37.908[0m | [1mINFO    [0m | [36muptrain.framework.evalllm[0m:[36mevaluate[0m:[36m376[0m - [1mLocal server not running, start the server to log data and visualize in the dashboard![0m


Results saved to 'evaluation_results.csv'


In [8]:
# Debug aid: show the raw evaluation output so the available keys can be read off.
# NOTE: this prints every record in full, so the output can be very long.
print(f"Results structure: {results}")


Results structure: [{'query': 'Does GTE-Large support up to 8192 tokens in a single input like BGE-M3?', 'response': 'No, GTE-Large does not support up to 8192 tokens in a single input. It is limited to 512 tokens. This information can be found in the context under the description for GTE-Large.', 'context': "Open Source Embedding Models GTE-Base General model for similarity search or downstream enrichments. Used for general text blobs. Limited to 512 tokens Embedding Dimensions: 768 Model size: 219 MB GTE-Large High quality general model for similarity search or downstream Used for general text blobs Limited to 512 tokens Embedding Dimensions: 1024 Model Size: 670 MB GTE-Small Good quality general model for faster similarity search or downstream Used for general text blobs Limited to 512 tokens Embedding Dimensions: 384 Model Size: 67 MB E5-Small A good small and fast general model for similarity search or downstream enrichments Used for general text blobs Limited to 512 tokens Embedd