In [None]:
import os
from getpass import getpass

# Credentials must never be written into the notebook. Keep a key that is
# already exported in the environment; only when none is present, prompt for
# it interactively (the input is not echoed and is not saved with the cell).
# The previous unconditional assignment replaced any existing key with "".
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [28]:
import mlflow
from mlflow.metrics.genai import make_genai_metric, EvaluationExample
import pandas as pd
from mlflow.metrics.genai import make_genai_metric_from_prompt, EvaluationExample


In [29]:
import os
from pathlib import Path

# Location of the evaluation set. Set HALLUCINATION_EVAL_CSV to point at the
# file on another machine; the default is the original author's local copy.
EVAL_CSV_PATH = Path(
    os.environ.get(
        "HALLUCINATION_EVAL_CSV",
        "C:/Users/Emumba/Documents/genie research/llm-testing-main/llm-testing-main/hallucination/llm_responses.csv",
    )
)
eval_df = pd.read_csv(EVAL_CSV_PATH)

# Fail fast with a readable message if the file lacks a column the rest of
# the notebook reads, instead of failing later inside the (slow, paid) LLM
# evaluation.
REQUIRED_COLUMNS = ("query", "response", "context", "ground truth")
missing_columns = [name for name in REQUIRED_COLUMNS if name not in eval_df.columns]
if missing_columns:
    raise KeyError(f"{EVAL_CSV_PATH} is missing required columns: {missing_columns}")

In [39]:
from mlflow.metrics.genai import EvaluationExample

# Few-shot examples handed to the LLM judge: one row per rubric score (1-4).
# Each row is (score, question, model answer, reason the answer earns that score).
_few_shot_rows = [
    # Score 1 - "Accurate"
    (
        1,
        "What is MLflow?",
        "MLflow is an open-source platform for managing the end-to-end machine learning lifecycle.",
        "The response is fully accurate and answers the question based on the provided context and ground truth.",
    ),
    # Score 2 - "Minor Additional Detail"
    (
        2,
        "Who developed MLflow?",
        "MLflow was developed by Databricks, with significant contributions from open-source collaborators.",
        "The response is accurate but includes additional context about 'open-source collaborators' that is reasonable but not explicitly mentioned in the ground truth.",
    ),
    # Score 3 - "Contextually Relevant but Off-Topic"
    (
        3,
        "What is MLflow?",
        "MLflow is designed for tracking and managing machine learning experiments.",
        "The response is contextually relevant and grounded in the provided information, but it does not directly answer the question about MLflow's general purpose.",
    ),
    # Score 4 - "Hallucination"
    (
        4,
        "Explain MLflow's tracking feature.",
        "MLflow's tracking feature lets users track their fitness activities and health data.",
        "The response introduces unrelated information about tracking fitness and health data, which is not present in the ground truth or context, making it a hallucination.",
    ),
]

# Unpack in rubric order so each example keeps its own descriptive name.
(
    example_accurate,
    example_minor_additional,
    example_contextually_relevant,
    example_hallucinated,
) = (
    EvaluationExample(input=question, output=answer, score=score, justification=reason)
    for score, question, answer, reason in _few_shot_rows
)


In [44]:
# What the judge is asked to measure, in plain language.
metric_definition = (
    "This metric evaluates the hallucination level of the response. Responses are scored based on whether "
    "they are grounded in context, include minor additional info, contain irrelevant details, or are hallucinated."
)

# Scoring rubric, one entry per line of the prompt (a newline is appended to each).
rubric_lines = (
    "Hallucination Score: Evaluate the response and label it with one of the following descriptions:",
    "- Score 1: If the response accurately answers the question based on the provided ground truth and context, mark it as 'Accurate'.",
    "- Score 2: If the response is correct but includes minor details that are reasonable but not explicitly in the ground truth or context, mark it as 'Minor Additional Detail.'",
    "- Score 3: If the response is grounded in the context or ground truth but does not directly answer the question, mark it as 'Contextually Relevant but Off-Topic.'",
    "- Score 4: If the response introduces fabricated or unverified information that is not present in the ground truth or context, mark it as 'Hallucination.'",
)
score_rubric = "".join(f"{line}\n" for line in rubric_lines)

# Few-shot examples, ordered from score 1 to score 4.
few_shot_examples = [
    example_accurate,
    example_minor_additional,
    example_contextually_relevant,
    example_hallucinated,
]

# Custom LLM-judged metric. Lower scores are better (1 = accurate,
# 4 = hallucination), hence greater_is_better=False.
hallucination_metric = make_genai_metric(
    name="hallucination_evaluation",
    definition=metric_definition,
    grading_prompt=score_rubric,
    examples=few_shot_examples,
    grading_context_columns=["context"],
    model="openai:/gpt-4",  # swap in the judge LLM endpoint you actually use
    parameters={"temperature": 0.0},
    aggregations=["mean"],
    greater_is_better=False,
)

In [45]:
def evaluate_hallucination_with_labels(eval_data):
    """Score every row of ``eval_data`` with ``hallucination_metric``.

    The previous implementation called ``hallucination_metric.evaluate(...)``
    and read ``.score`` / ``.justification``. MLflow's ``EvaluationMetric``
    has no ``evaluate`` method, and the ``MetricValue`` it produces exposes
    the per-row lists ``scores`` and ``justifications``. The metric is
    therefore invoked through its ``eval_fn`` once for the whole frame.

    Args:
        eval_data: DataFrame with the columns ``query``, ``response``,
            ``context`` and ``ground truth``.

    Returns:
        A DataFrame with one row per input row and the columns ``query``,
        ``response``, ``ground_truth``, ``hallucination_score`` and
        ``hallucination_label`` (the judge's justification text).
    """
    output_columns = [
        "query",
        "response",
        "ground_truth",
        "hallucination_score",
        "hallucination_label",
    ]

    # Nothing to grade: skip the judge call entirely and return an empty,
    # correctly shaped frame.
    if len(eval_data) == 0:
        return pd.DataFrame(columns=output_columns)

    # eval_fn takes (predictions, metrics, inputs, *grading_context_columns);
    # the only grading-context column configured on the metric is "context".
    # Passing the arguments positionally is an assumption based on the MLflow
    # docs - TODO confirm against the installed MLflow version.
    metric_value = hallucination_metric.eval_fn(
        eval_data["response"],
        {},  # no previously computed metrics are needed by this metric
        eval_data["query"],
        eval_data["context"],
    )

    # Plain lists avoid any index alignment between the input frame and the
    # positional score/justification lists.
    return pd.DataFrame(
        {
            "query": eval_data["query"].to_list(),
            "response": eval_data["response"].to_list(),
            "ground_truth": eval_data["ground truth"].to_list(),
            "hallucination_score": metric_value.scores,
            "hallucination_label": metric_value.justifications,
        },
        columns=output_columns,
    )

In [46]:
# Tell the evaluator which dataset columns feed the judge's "inputs" and
# "context" slots.
judge_column_mapping = {
    "inputs": "query",
    "context": "context",
}

with mlflow.start_run():
    # Static-dataset evaluation: the responses are already in the frame, so
    # no model is passed; the custom metric is added on top of the defaults.
    results = mlflow.evaluate(
        data=eval_df,
        predictions="response",  # column holding the model responses
        targets="ground truth",  # column holding the reference answers
        evaluators="default",
        extra_metrics=[hallucination_metric],
        evaluator_config={"col_mapping": judge_column_mapping},
    )

    # Save the per-row results table and attach it to the active run.
    output_file = "hallucination_evaluation_with_labels.csv"
    results_df = results.tables["eval_results_table"]
    results_df.to_csv(output_file, index=False)
    mlflow.log_artifact(output_file)

# Display results for verification
print(results_df)

2024/11/06 16:04:15 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:03<00:00,  3.25s/it]
100%|██████████| 10/10 [01:38<00:00,  9.80s/it]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 333.54it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 499.80it/s]

                                               query  \
0  Does GTE-Large support up to 8192 tokens in a ...   
1  Is BGE-M3 primarily used for summarization tasks?   
2  Does GTE-Qwen2-7B-instruct provide specialized...   
3  Which model between GTE-Base and GTE-Large per...   
4  Does Recursive Retrieval allow token inputs ov...   
5  Is there any embedding model specifically desi...   
6  Does the GTE-Qwen2-7B-instruct model have real...   
7  Can Recursive Retrieval merge results from mul...   
8  Does BGE-M3 outperform GTE-Qwen2-7B-instruct o...   
9  Are the Longformer Base 4096 and GTE-Large mod...   

                                             context  \
0  Open Source Embedding Models GTE-Base General ...   
1  Open Source Embedding Models GTE-Base General ...   
2  It can simultaneously perform the three common...   
3  Open Source Embedding Models GTE-Base General ...   
4  both gave us quite good results! Sentence Wind...   
5  for general text blobs Limited to 512 tokens


