In [1]:
import logging
import time

import nest_asyncio
from llama_index.core.evaluation import FaithfulnessEvaluator
from llama_index.core.evaluation import PairwiseComparisonEvaluator
from llama_index.core.evaluation.answer_relevancy import AnswerRelevancyEvaluator
from llama_index.core.evaluation.context_relevancy import ContextRelevancyEvaluator
from llama_index.core.evaluation.correctness import CorrectnessEvaluator
from llama_index.core.evaluation.relevancy import RelevancyEvaluator
from llama_index.llms.openai import OpenAI

from utils.config import OPENAI_API_KEY
from utils.evaluation import (
    get_all_test_data,
    update_database,
    update_evaluation,
    get_current_evaluation
)

Configuration loaded in: 0.0 seconds
Debugging is enabled: True
Device: cuda is available
VectorStoreIndex: wiki_movie_plots
----------------------------------------------------------------------------------------------------


In [3]:
# Patch asyncio so that an event loop can be entered while another one is
# already running (the notebook kernel runs its own loop).
# NOTE(review): presumably required because the evaluators' synchronous
# `evaluate()` calls drive async code internally - confirm.
nest_asyncio.apply()

In [4]:
# Configure the root logger once for the whole notebook. DEBUG is deliberately
# verbose: it also surfaces the debug messages of third-party libraries.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.DEBUG,
)
# Named logger for this notebook.
logger = logging.getLogger(__name__)

In [6]:
def get_record_data(record):
    """
    Pull the four evaluation inputs out of a test-data record.

    The record is read positionally:
        record[5] -> retrieved context from the index
        record[6] -> question asked by the user
        record[7] -> reference answer (from the reference model)
        record[8] -> response of the LLM that is being evaluated

    Returns the tuple (context, question, reference_answer, response).
    """
    return record[5], record[6], record[7], record[8]

In [5]:
def get_evaluation_model():
    """
    Build the OpenAI LLM that acts as the judge for every evaluator.

    Uses model "gpt-3.5-turbo" with temperature 0 and the API key from the
    project configuration.
    """
    judge_settings = {
        "model": "gpt-3.5-turbo",
        "api_key": OPENAI_API_KEY,
        "temperature": 0,
    }
    return OpenAI(**judge_settings)

In [6]:
def evaluate_answer_relenvancy(evaluation_model, record):
    """
    01 Answer Relevancy Evaluator
    Judges how relevant the response is to the query. Only the query string
    and the response string are handed to the evaluator.

    Metric written to the database:
    answer_feedback (str)

    Returns (fields, values): the database fields to update and their values.
    """
    # context and reference answer are not needed by this evaluator
    _, question, _, response = get_record_data(record)

    outcome = AnswerRelevancyEvaluator(llm=evaluation_model).evaluate(
        response=response,  # response_string
        query=question,  # query_string
    )

    return ["answer_feedback"], [f"{outcome.feedback}"]

In [7]:
def evaluate_context_relevancy(evaluation_model, record):
    """
    02 Context Relevancy Evaluator
    Judges how relevant the retrieved context is to the query. The evaluator
    receives the query string and the context wrapped in a one-element list.

    Metrics written to the database:
    context_relevancy_score (float)
    context_relevancy_feedback (str)

    Returns (fields, values): the database fields to update and their values.
    """
    # response and reference answer are not needed by this evaluator
    context, question, _, _ = get_record_data(record)

    judge = ContextRelevancyEvaluator(llm=evaluation_model)
    outcome = judge.evaluate(contexts=[context], query=question)

    # field name -> new value, in the order they are written
    updates = {
        "context_relevancy_score": outcome.score,
        "context_relevancy_feedback": f"{outcome.feedback}",
    }
    return list(updates), list(updates.values())

In [8]:
def evaluate_correctness(evaluation_model, record):
    """
    03 Correctness Evaluator
    Judges the correctness of the response against a `reference` answer.
    The evaluator is given the query, the context (as a one-element list),
    the response and the reference answer.

    Metrics written to the database:
    correctness_score (float)
    correctness_feedback (str)

    Returns (fields, values): the database fields to update and their values.
    """
    context, question, reference_answer, response = get_record_data(record)

    evaluator_inputs = {
        "query": question,  # query_string
        "contexts": [context],  # retrieved_contexts
        "response": response,  # response_string
        "reference": reference_answer,  # reference_string
    }
    outcome = CorrectnessEvaluator(llm=evaluation_model).evaluate(**evaluator_inputs)

    return (
        ["correctness_score", "correctness_feedback"],
        [outcome.score, f"{outcome.feedback}"],
    )

In [10]:
def evaluate_faithfulness(evaluation_model, record):
    """
    04 Faithfulness Evaluator
    Judges whether the response is supported by the context or hallucinated.
    The evaluator only receives the response string and the context (as a
    one-element list).

    Metric written to the database:
    faithfulness_score (float)
    (the evaluator's feedback text is not written by this function)

    Returns (fields, values): the database fields to update and their values.
    """
    # question and reference answer are not needed by this evaluator
    context, _, _, response = get_record_data(record)

    judge = FaithfulnessEvaluator(llm=evaluation_model)
    outcome = judge.evaluate(response=response, contexts=[context])

    return ["faithfulness_score"], [outcome.score]

In [11]:
def evaluate_relevancy(evaluation_model, record):
    """
    05 Relevancy Evaluator
    Judges the relevancy of the retrieved context and the response to the
    query. The evaluator receives the query string, the context (as a
    one-element list) and the response string.

    Metrics written to the database:
    relevancy_score (float)
    relevancy_feedback (str)

    Returns (fields, values): the database fields to update and their values.
    """
    # the reference answer is not needed by this evaluator
    context, question, _, response = get_record_data(record)

    outcome = RelevancyEvaluator(llm=evaluation_model).evaluate(
        contexts=[context],  # retrieved_contexts
        response=response,  # response_string
        query=question,  # query_string
    )

    # field name -> new value, in the order they are written
    updates = {
        "relevancy_score": outcome.score,
        "relevancy_feedback": f"{outcome.feedback}",
    }
    return list(updates), list(updates.values())

In [12]:
def evaluate_pairwise(evaluation_model, record):
    """
    06 Pairwise Comparison Evaluator
    Compares two answers to the same query: the response of the model under
    evaluation (first response) against the reference answer (second
    response). The retrieved context is supplied as the reference text.

    Metrics written to the database:
    model_score (float)
    model_feedback (str)

    Returns (fields, values): the database fields to update and their values.
    """
    context, question, reference_answer, response = get_record_data(record)

    # evaluation
    evaluator = PairwiseComparisonEvaluator(llm=evaluation_model)
    result = evaluator.evaluate(
        query=question,  # query_string
        # `reference` is a single string, not a list of contexts: wrapping it
        # in a list would put the list's repr into the judge prompt.
        reference=context,  # reference_string
        response=response,  # response_string
        second_response=reference_answer  # second_response_string
    )

    # fields to be updated in the database
    fields = ["model_score", "model_feedback"]
    # values to be updated in the database
    values = [result.score, f"{result.feedback}"]

    return fields, values

In [None]:
def _run_evaluation_step(
        evaluation_model,
        record,
        evaluator_name,
        evaluate_fn,
        metrics,
        llm_model,
        collection,
        chat_mode
):
    """
    Run one evaluator on one record and persist the outcome.

    The step only runs when the value returned by `get_current_evaluation`
    for the evaluator's first metric is 0 or smaller than the record's
    question id (record[4]); otherwise nothing happens.

    :param evaluation_model: judge LLM passed through to `evaluate_fn`
    :param record: test-data record; record[4] is used as the question id
    :param evaluator_name: name under which the evaluation progress is tracked
    :param evaluate_fn: callable(evaluation_model, record) -> (fields, values)
    :param metrics: metric names to mark as evaluated; the first one is also
        the one used for the progress lookup
    :param llm_model: model name without the "oll_" prefix
    :param collection: name of the collection the record belongs to
    :param chat_mode: chat mode the record was produced with
    """
    question_id = record[4]

    # NOTE(review): `exists` looks like the id of the last question already
    # evaluated for this evaluator/metric - confirm against utils.evaluation.
    exists = get_current_evaluation(
        evaluation=evaluator_name,
        llm_model=llm_model,
        metric=metrics[0],
        collection_name=collection,
        chat_mode=chat_mode
    )
    if exists == 0 or exists < question_id:
        fields, values = evaluate_fn(evaluation_model, record)

        # write the evaluation result to the test-data row
        update_database(
            fields=fields,
            values=values,
            llm="oll_" + llm_model,
            collection=collection,
            row=question_id,
            chat_mode=chat_mode
        )
        # record the progress for every metric of this evaluator
        for metric in metrics:
            update_evaluation(
                evaluation_model=evaluator_name,
                metric=metric,
                llm_model=llm_model,
                collection_name=collection,
                question_id=question_id,
                chat_mode=chat_mode
            )

        # log the updated record
        logging.info(f"TEST INFO |:> {evaluator_name} - {question_id} completed")


def run_test(
        llm_model: str = "llama3_instruct",
        collection: str = "wiki_movie_plots_1024_100_mxbai",
        chat_mode: str = "CONDENSE_PLUS_CONTEXT"
):
    """
    Evaluate all stored test records of one LLM / collection / chat mode.

    Every record is passed through six evaluators in a fixed order (answer
    relevancy, context relevancy, correctness, faithfulness, relevancy,
    pairwise comparison). Each evaluator is skipped for records it has
    already processed, so an interrupted run can be resumed.

    :param llm_model: model name without the "oll_" prefix; the prefix is
        added for the test-data lookup and for the database update
    :param collection: name of the collection to evaluate
    :param chat_mode: chat mode of the records to evaluate
    """
    logging.info("LLM Model: oll_{} Collection: {} ChatMode: {}".format(llm_model, collection, chat_mode))

    # get all the records from the test data db
    records_for_evaluation = get_all_test_data(
        llm=True,
        llm_model="oll_" + llm_model,
        collection_name=collection,
        chat_mode=chat_mode
    )

    # get evaluation model: gpt-3.5-turbo
    evaluation_model = get_evaluation_model()

    # (evaluator name, evaluation function, tracked metrics) in execution order
    evaluation_steps = (
        ("AnswerRelevancyEvaluator", evaluate_answer_relenvancy,
         ("answer_feedback",)),
        ("ContextRelevancyEvaluator", evaluate_context_relevancy,
         ("context_relevancy_score", "context_relevancy_feedback")),
        ("CorrectnessEvaluator", evaluate_correctness,
         ("correctness_score", "correctness_feedback")),
        ("FaithfulnessEvaluator", evaluate_faithfulness,
         ("faithfulness_score",)),
        ("RelevancyEvaluator", evaluate_relevancy,
         ("relevancy_score", "relevancy_feedback")),
        ("PairwiseComparisonEvaluator", evaluate_pairwise,
         ("model_score", "model_feedback")),
    )

    # loop through the records and evaluate them
    for record in records_for_evaluation:
        for evaluator_name, evaluate_fn, metrics in evaluation_steps:
            _run_evaluation_step(
                evaluation_model,
                record,
                evaluator_name,
                evaluate_fn,
                metrics,
                llm_model,
                collection,
                chat_mode
            )

    time.sleep(1)
    logging.info("TEST INFO |:> Evaluation completed")


# Evaluate every LLM on every collection; the chat mode is fixed for this run.
llms = ["llama3_instruct", "gemma_instruct", "mistral_instruct"]
collections = [
    "wiki_movie_plots_512_50_mxbai",
    "wiki_movie_plots_1024_100_mxbai",
    "wiki_movie_plots_2048_200_mxbai",
]
for llm_name in llms:
    for collection_name in collections:
        run_test(llm_model=llm_name, collection=collection_name, chat_mode="CONDENSE_PLUS_CONTEXT")

2024-06-13 16:10:30,317 - INFO - LLM Model: oll_llama3_instruct Collection: wiki_movie_plots_512_50_mxbai ChatMode: CONDENSE_PLUS_CONTEXT
2024-06-13 16:10:30,403 - DEBUG - load_ssl_context verify=True cert=None trust_env=True http2=False
2024-06-13 16:10:30,404 - DEBUG - load_verify_locations cafile='C:\\Users\\Alex\\anaconda3\\envs\\thesis_llm\\lib\\site-packages\\certifi\\cacert.pem'
2024-06-13 16:10:30,422 - DEBUG - Request options: {'method': 'post', 'url': '/chat/completions', 'files': None, 'json_data': {'messages': [{'role': 'system', 'content': '\nYou are an expert evaluation system for a question answering chatbot.\n\nYou are given the following information:\n- a user query, and\n- a generated answer\n\nYou may also be given a reference answer to use for reference in your evaluation.\n\nYour job is to judge the relevance and correctness of the generated answer.\nOutput a single score that represents a holistic evaluation.\nYou must return your response in a line with only the 

# Improve Quality of Evaluation
Some of the values went into the wrong field and therefore must be corrected.

**Observation:**
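For some records the correctness score was written into the `correctness_feedback` field: the feedback text starts with the score, while `correctness_score` itself is 0.0. The following code moves the score back into `correctness_score` and removes it from the feedback text in the database.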

In [6]:
def fix_issues(llm_model: str, collection: str, chat_mode: str):
    """
    Repair records whose correctness score ended up inside the feedback text.

    A record is a candidate when its stored correctness_score (record[12]) is
    0.0. For a candidate, the first three characters of correctness_feedback
    (record[13]) are read as the score, the fourth character (the separator,
    e.g. a newline) is dropped and the rest is kept as the feedback.

    The record is only rewritten when that prefix parses to a finite, non-zero
    number. Records with missing feedback or a non-numeric prefix are left
    untouched instead of raising. One log line is written per record: either
    the correction or "Nothing to do".

    :param llm_model: model name without the "oll_" prefix
    :param collection: name of the collection whose records are checked
    :param chat_mode: chat mode of the records to check
    """
    # get the potential records to be updated
    records = get_all_test_data(
        llm=True,
        llm_model="oll_" + llm_model,
        collection_name=collection,
        chat_mode=chat_mode
    )
    for record in records:
        # the row to update is identified by values taken from the record itself
        record_llm = record[1]
        collection_name = record[3]
        question_id = record[4]

        correctness_score = record[12]
        correctness_feedback = record[13]  # affected rows carry the score at the start of this text

        fixed_score = None
        if correctness_score == 0.0 and correctness_feedback:
            try:
                candidate = float(correctness_feedback[:3])
            except (TypeError, ValueError):
                # the feedback does not start with a number
                candidate = None
            # float() also accepts "inf"/"nan" (e.g. a feedback starting with
            # "Information"); this comparison rejects those as well as 0.
            if candidate is not None and 0.0 < abs(candidate) < float("inf"):
                fixed_score = candidate

        if fixed_score is None:
            logging.info(f"CORRECTION |:> Nothing to do for - {question_id}")
            continue

        # update the record in the database; index 3 is the separator between
        # score and feedback (a "\n" counts as one character) and is skipped
        update_database(
            fields=["correctness_score", "correctness_feedback"],
            values=[fixed_score, correctness_feedback[4:]],
            llm=record_llm,
            collection=collection_name,
            row=question_id,
            chat_mode=chat_mode
        )
        logging.info(f"CORRECTION |:> correct values for - {question_id}")

# Apply the correction to every combination of LLM, collection and chat mode.
llms = ["llama3_instruct", "gemma_instruct", "mistral_instruct"]
collections = [
    "wiki_movie_plots_512_50_mxbai",
    "wiki_movie_plots_1024_100_mxbai",
    "wiki_movie_plots_2048_200_mxbai",
]
chat_modes = ["CONDENSE_PLUS_CONTEXT", "CONTEXT"]
for llm_name in llms:
    for collection_name in collections:
        for mode in chat_modes:
            fix_issues(llm_model=llm_name, collection=collection_name, chat_mode=mode)