In [2]:
import os
from dotenv import load_dotenv
from openai import OpenAI
from litellm import completion


load_dotenv()

SEED = 42
TEMPERATURE = 0.0

def evaluate_context_relevance(prompt: str, context: str) -> dict:
    """
    Ask the model to rate how relevant a single context is to the prompt.

    Args:
    prompt (str): The query or prompt.
    context (str): The context to evaluate for relevance.

    Returns:
    dict: {"score": int, "explanation": str}. "explanation" is an empty
    string when the model response contains no "Explanation:" line.

    Raises:
    ValueError: If no "Score:" line is present in the model response, or
    its value cannot be converted to an integer.
    """
    system_message = """
    You are an AI assistant tasked with evaluating the relevance of a given context to a specific prompt.
    Your goal is to determine how useful the context would be in answering the prompt.
    
    Please evaluate the relevance on a scale from 0 to 10, where:
    0: Completely irrelevant
    1-3: Marginally relevant (mentions related concepts but doesn't directly address the prompt)
    4-6: Moderately relevant (provides some useful information but doesn't fully answer the prompt)
    7-9: Highly relevant (provides most of the information needed to answer the prompt)
    10: Perfectly relevant (directly and completely answers the prompt)
    
    Provide your response in the following format:
    Score: [Your score from 0 to 10]
    Explanation: [A brief explanation of your scoring, no more than 20 words]
    """
    
    user_message = f"""
    Prompt: {prompt}
    
    Context: {context}
    
    Please evaluate the relevance of this context to the given prompt.
    """
    
    response = completion(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_message},
            {"role": "user", "content": user_message}
        ],
        max_tokens=50,
        temperature=TEMPERATURE,
        seed=SEED
    )

    content = response.choices[0].message.content or ""

    score = None
    explanation = ""
    # Locate the fields by label instead of by line position, so blank
    # lines or extra text around them do not break parsing.
    for raw_line in content.splitlines():
        # partition() splits on the FIRST colon only, so colons inside the
        # explanation text are kept rather than truncating it.
        label, separator, value = raw_line.partition(":")
        if not separator:
            continue
        label = label.strip(" *").lower()
        value = value.strip(" *")
        if label == "score" and score is None:
            # Accept forms such as "8", "[8]" or "8/10".
            score = int(value.split("/")[0].strip(" []"))
        elif label == "explanation" and not explanation:
            explanation = value

    if score is None:
        raise ValueError(
            f"Could not find a 'Score:' line in the model response: {content!r}"
        )

    return {"score": score, "explanation": explanation}



def generate_overall_reason(average_score, details):
    """
    Request a short overall explanation of how relevant the contexts are.

    Args:
    average_score (float): Mean relevance score across all contexts.
    details (list): Per-context dicts with the keys 'context_index',
    'score' and 'explanation'.

    Returns:
    str: The model's reply with surrounding whitespace stripped.
    """
    # Pre-render the dynamic pieces so the prompt template below stays readable.
    formatted_average = f"{average_score:.2f}"
    evaluations_line = ', '.join(
        f"Context {d['context_index']}: Score {d['score']}/10 - {d['explanation']}"
        for d in details
    )

    system_message = """
    You are an AI assistant tasked with summarizing the relevance of multiple contexts to a specific prompt.
    Based on the average relevance score and individual context evaluations, provide an overall explanation
    of how relevant the contexts are collectively to answering the prompt.
    
    Your explanation should be concise (no more than 50 words) and address:
    1. The overall relevance of the contexts
    2. Any particularly relevant or irrelevant contexts
    3. How well the contexts collectively answer the prompt
    """

    user_message = f"""
    Average Relevance Score: {formatted_average}
    
    Individual Context Evaluations:
    {evaluations_line}
    
    Please provide an overall explanation of the context relevance based on this information.
    """

    chat_messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": user_message},
    ]
    reply = completion(
        model="gpt-4o-mini",
        messages=chat_messages,
        max_tokens=100,
        temperature=TEMPERATURE,
        seed=SEED,
    )

    first_choice = reply.choices[0]
    return first_choice.message.content.strip()


def calculate_context_relevance_score(prompt, contexts):
    """
    Calculate the overall context relevance score for a given prompt and list of contexts.

    Args:
    prompt (str): The query or prompt.
    contexts (list): A list of context strings.

    Returns:
    dict: A dictionary with three keys:
        'score' (float): The average relevance score across all contexts.
        'details' (list): One dict per context with 'context_index'
            (1-based), 'score' and 'explanation'.
        'reason' (str): An overall explanation of the collective relevance.
    When `contexts` is empty, no model calls are made and the result is
    a score of 0.0, an empty details list and a fixed reason string.
    """
    if not contexts:
        # Return the same dict shape as the normal path so callers can
        # index by key without special-casing empty input.
        return {
            'score': 0.0,
            'details': [],
            'reason': "No contexts were provided, so relevance could not be evaluated.",
        }

    relevance_details = []
    for index, context in enumerate(contexts, start=1):
        evaluation = evaluate_context_relevance(prompt, context)
        relevance_details.append({
            "context_index": index,
            "score": evaluation["score"],
            "explanation": evaluation["explanation"],
        })

    total_score = sum(detail["score"] for detail in relevance_details)
    average_score = total_score / len(relevance_details)

    # Summarize the individual evaluations into one overall explanation.
    overall_reason = generate_overall_reason(
        average_score=average_score,
        details=relevance_details,
    )

    return {
        'score': average_score,
        'details': relevance_details,
        'reason': overall_reason,
    }


# Demo: score three sample contexts against one question and print a report.
prompt = "What is the capital of France?"
contexts = [
    "Paris is the capital and most populous city of France.",
    "The Eiffel Tower is a wrought-iron lattice tower on the Champ de Mars in Paris.",
    "London is the capital and largest city of England and the United Kingdom.",
]

result = calculate_context_relevance_score(prompt, contexts)
score, details, reason = result['score'], result['details'], result['reason']

# One text section per context; each section ends with its own newline, so
# joining with "\n" leaves a blank line between sections.
context_sections = [
    f"Context {detail['context_index']}:\n"
    f"Score: {detail['score']}/10\n"
    f"Explanation: {detail['explanation']}\n"
    for detail in details
]
report_header = (
    f"Average Context Relevance Score: {score:.2f}\n"
    "Individual Context Relevance:\n"
)
output_string = report_header + "\n".join(context_sections)

print(output_string)

# Separator, then the unrounded score and the model-written summary.
print("#"*40)
print(f"Score: {score}")
print("Overall Reason:")
print(reason)

Average Context Relevance Score: 4.67
Individual Context Relevance:
Context 1:
Score: 10/10
Explanation: The context directly and completely answers the prompt by stating that Paris is the capital of France.

Context 2:
Score: 3/10
Explanation: The context mentions Paris, which is relevant, but does not directly state the capital.

Context 3:
Score: 1/10
Explanation: The context mentions a capital city but is irrelevant to the capital of France.

########################################
Score: 4.666666666666667
Overall Reason:
The contexts collectively demonstrate high relevance to the prompt, with an average score of 4.67. Context 1 is particularly relevant as it directly answers the question, while Context 2 offers partial relevance. Context 3 is largely irrelevant. Overall, the contexts provide a strong foundation for answering the prompt.
