In [1]:
import pandas as pd

# Sample evaluation set, written one record per question.
# Each record is (query, ground_truth, context, response); the trailing comment
# on the response says whether it was written to be accurate or fabricated.
sample_columns = ("query", "ground_truth", "context", "response")
sample_records = [
    (
        "What is MLflow?",
        "MLflow is an open-source platform for managing the ML lifecycle.",
        "MLflow is used for tracking experiments and managing machine learning lifecycles.",
        "MLflow is a cloud service for tracking your daily habits and personal tasks.",  # Hallucinated
    ),
    (
        "Who developed MLflow?",
        "MLflow was developed by Databricks.",
        "MLflow was created by the team at Databricks to simplify ML workflows.",
        "MLflow was developed by Databricks.",  # Accurate
    ),
    (
        "Explain MLflow's tracking feature.",
        "MLflow's tracking feature allows logging and tracking of experiments.",
        "The tracking feature in MLflow allows data scientists to record parameters, metrics, and outputs of experiments.",
        "MLflow's tracking feature allows users to log and track their machine learning experiments.",  # Accurate
    ),
    (
        "What is Databricks?",
        "Databricks is a cloud-based platform for big data and AI.",
        "Databricks provides a collaborative environment for data scientists and engineers to work on data and machine learning.",
        "Databricks is a fitness app that helps users track workouts and diet.",  # Hallucinated
    ),
]

# Pivot the records into the column -> list-of-values mapping used for the DataFrame
data = {
    column: list(values)
    for column, values in zip(sample_columns, zip(*sample_records))
}

# Build the frame and persist it so the evaluation cell can reload it from disk
df = pd.DataFrame(data)
df.to_csv("hallucination_test_data.csv", index=False)
print("Dataset created!")


Dataset created!


In [2]:
from mlflow.metrics.genai import make_genai_metric, EvaluationExample

# Few-shot examples, one per score category, to calibrate the LLM judge.
# The metric grades against the reference answer and the retrieved context, so
# every example carries a `grading_context` whose keys match the
# `grading_context_columns` declared on the metric below.
example_accurate = EvaluationExample(
    input="What is MLflow?",
    output="MLflow is an open-source platform for managing the ML lifecycle.",
    score=1,
    justification="The response is fully accurate based on the ground truth.",
    grading_context={
        "ground_truth": "MLflow is an open-source platform for managing the ML lifecycle.",
        "context": "MLflow is used for tracking experiments and managing machine learning lifecycles.",
    },
)

example_minor_additional = EvaluationExample(
    input="Who developed MLflow?",
    output="MLflow was developed by Databricks, with support from a community of contributors.",
    score=2,
    justification="The response is accurate but includes additional minor information not explicitly in the ground truth.",
    grading_context={
        "ground_truth": "MLflow was developed by Databricks.",
        "context": "MLflow was created by the team at Databricks to simplify ML workflows.",
    },
)

example_contextual = EvaluationExample(
    input="What is MLflow?",
    output="MLflow helps with tracking machine learning experiments.",
    score=3,
    justification="The response is related to MLflow's functionality but does not directly answer the question.",
    grading_context={
        "ground_truth": "MLflow is an open-source platform for managing the ML lifecycle.",
        "context": "MLflow is used for tracking experiments and managing machine learning lifecycles.",
    },
)

example_hallucinated = EvaluationExample(
    input="What is MLflow?",
    output="MLflow is a platform for tracking fitness activities and personal health data.",
    score=4,
    justification="The response introduces unrelated information that is not in the ground truth or context, making it hallucinated.",
    grading_context={
        "ground_truth": "MLflow is an open-source platform for managing the ML lifecycle.",
        "context": "MLflow is used for tracking experiments and managing machine learning lifecycles.",
    },
)

# Create the custom hallucination metric.
# Scores run from 1 (accurate) to 4 (hallucinated), hence greater_is_better=False.
hallucination_metric = make_genai_metric(
    name="hallucination_evaluation",
    definition="This metric evaluates if a response is accurate, slightly off-topic, or hallucinated based on the provided context and ground truth.",
    grading_prompt=(
        "Hallucination Score:\n"
        "- Score 1: Accurate response based on ground truth.\n"
        "- Score 2: Mostly accurate with minor additional details.\n"
        "- Score 3: Contextually relevant but off-topic.\n"
        "- Score 4: Contains fabricated or unrelated information."
    ),
    # Without these columns the judge is only shown the input and the output,
    # so it cannot compare the response against the ground truth or context
    # that the definition and grading prompt refer to.
    grading_context_columns=["ground_truth", "context"],
    examples=[example_accurate, example_minor_additional, example_contextual, example_hallucinated],
    model="openai:/gpt-4",  # Replace with the OpenAI model endpoint
    parameters={"temperature": 0.0},  # temperature 0 keeps the judge's scoring as repeatable as possible
    aggregations=["mean"],
    greater_is_better=False
)


In [None]:
import os
from getpass import getpass

import mlflow

# The judge model used by the metric ("openai:/gpt-4") authenticates through
# the OPENAI_API_KEY environment variable. Keep a key that is already exported
# and only prompt when it is missing: assigning "" here would silently wipe a
# valid key, and pasting a real key would store the secret in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Load the test dataset
eval_df = pd.read_csv("hallucination_test_data.csv")

# Run evaluation on the static dataset: the "response" column already holds the
# model outputs, so no model is passed to mlflow.evaluate.
with mlflow.start_run():
    results = mlflow.evaluate(
        data=eval_df,
        evaluators="default",
        predictions="response",  # Column with the model's responses
        extra_metrics=[hallucination_metric],  # Our custom metric
        evaluator_config={
            # Map the names the metric asks for onto the dataset's columns
            "col_mapping": {
                "inputs": "query",
                "context": "context",
                "ground_truth": "ground_truth"
            }
        }
    )

    # Log the hallucination results table as an artifact
    results_df = results.tables["eval_results_table"]
    results_df.to_csv("hallucination_evaluation_results.csv", index=False)
    mlflow.log_artifact("hallucination_evaluation_results.csv")

# Display results for verification
print(results_df)


2024/11/07 10:16:28 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:02<00:00,  2.48s/it]
100%|██████████| 4/4 [00:04<00:00,  1.07s/it]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 222.50it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 225.57it/s]

                                query  \
0                     What is MLflow?   
1               Who developed MLflow?   
2  Explain MLflow's tracking feature.   
3                 What is Databricks?   

                                        ground_truth  \
0  MLflow is an open-source platform for managing...   
1                MLflow was developed by Databricks.   
2  MLflow's tracking feature allows logging and t...   
3  Databricks is a cloud-based platform for big d...   

                                             context  \
0  MLflow is used for tracking experiments and ma...   
1  MLflow was created by the team at Databricks t...   
2  The tracking feature in MLflow allows data sci...   
3  Databricks provides a collaborative environmen...   

                                             outputs  \
0  MLflow is a cloud service for tracking your da...   
1                MLflow was developed by Databricks.   
2  MLflow's tracking feature allows users to log ...   
3  Datab




In [5]:
# Importing necessary libraries
import pandas as pd

# Harder evaluation set: eight questions, with responses ranging from correct
# to partially correct to outright contradictory. Each column is kept as its own
# list; position i in every list belongs to the same example.
complex_queries = [
    "What is the main purpose of MLflow?",
    "Who primarily developed MLflow?",
    "Describe the tracking feature in MLflow.",
    "What services does Databricks offer?",
    "Explain how MLflow integrates with Databricks.",
    "What is the significance of MLflow's model registry?",
    "How does MLflow handle data versioning?",
    "List the components of MLflow.",
]

# Reference answers the responses are judged against
complex_ground_truths = [
    "MLflow is primarily designed to manage and streamline the ML lifecycle, including tracking, packaging, and deploying models.",
    "MLflow was primarily developed by Databricks.",
    "MLflow's tracking feature enables users to log parameters, metrics, and artifacts from ML experiments.",
    "Databricks provides a collaborative platform for big data, AI, and ML, with a focus on cloud-based analytics.",
    "MLflow integrates with Databricks by allowing users to track and log their experiments within the Databricks environment.",
    "The MLflow model registry enables model versioning, lifecycle management, and deployment.",
    "MLflow itself does not handle data versioning directly but can be integrated with tools that do.",
    "MLflow comprises four main components: Tracking, Projects, Models, and Registry.",
]

# Supporting passages supplied alongside each question
complex_contexts = [
    "MLflow is an open-source platform that assists with the ML lifecycle by enabling experiment tracking, model packaging, and deployment.",
    "Databricks developed MLflow as part of its mission to simplify the ML workflow.",
    "Tracking experiments is a core feature of MLflow, allowing data scientists to log experiment parameters, metrics, and outputs.",
    "Databricks is known for its cloud-based data and AI platform that enables data scientists to work collaboratively on ML projects.",
    "MLflow is integrated into Databricks, offering users tools to track and manage their machine learning experiments.",
    "The MLflow model registry is designed to store and manage different versions of ML models, providing a centralized hub for model management.",
    "MLflow allows tracking of experiments but relies on other tools for data versioning solutions.",
    "MLflow includes components for experiment tracking, project management, model packaging, and a model registry for model management.",
]

# Candidate answers under evaluation; the comment records the intended verdict
complex_responses = [
    "MLflow is a tool that can be used to track machine learning experiments, log model artifacts, and monitor training runs.",  # Simplistic but correct
    "MLflow was created by Amazon to assist with data management and experiment tracking.",  # Hallucinated (Amazon)
    "The tracking feature of MLflow helps in tracking fitness activities, personal projects, and daily habits.",  # Hallucinated
    "Databricks offers cloud-based analytics, data science collaboration, and data storage for healthcare.",  # Partially correct, adds unrelated detail (healthcare)
    "MLflow integrates with Databricks to allow users to track AI and ML experiments, providing easy deployment options.",  # Accurate
    "The MLflow model registry is used to store ML models and track metrics across various health sectors.",  # Partially correct, unrelated domain (health)
    "MLflow directly manages data versioning for all experiment datasets.",  # Contradictory to ground truth
    "MLflow includes a suite of tools, including Tracking, Projects, Models, and a central Repository for metrics.",  # Minor detail change, mostly accurate
]

complex_data = {
    "query": complex_queries,
    "ground_truth": complex_ground_truths,
    "context": complex_contexts,
    "response": complex_responses,
}

# Build the frame and persist it so the evaluation cell can reload it from disk
complex_df = pd.DataFrame(complex_data)
complex_df.to_csv("complex_hallucination_test_data.csv", index=False)
print("Complex dataset created!")


Complex dataset created!


In [6]:
from mlflow.metrics.genai import make_genai_metric, EvaluationExample

# Few-shot examples for the complex scoring scenarios, one per score category.
# The metric grades against the reference answer and the supporting context, so
# every example carries a `grading_context` whose keys match the
# `grading_context_columns` declared on the metric below.
example_accurate = EvaluationExample(
    input="What is the main purpose of MLflow?",
    output="MLflow is primarily designed to manage and streamline the ML lifecycle, including tracking, packaging, and deploying models.",
    score=1,
    justification="The response is fully accurate, directly answering the question based on the ground truth.",
    grading_context={
        "ground_truth": "MLflow is primarily designed to manage and streamline the ML lifecycle, including tracking, packaging, and deploying models.",
        "context": "MLflow is an open-source platform that assists with the ML lifecycle by enabling experiment tracking, model packaging, and deployment.",
    },
)

example_minor_additional = EvaluationExample(
    input="What is the main purpose of MLflow?",
    output="MLflow is a tool for managing the ML lifecycle, including tracking and deploying models, and is widely used by data scientists.",
    score=2,
    justification="The response is accurate but adds minor additional details ('widely used by data scientists') not in the ground truth.",
    grading_context={
        "ground_truth": "MLflow is primarily designed to manage and streamline the ML lifecycle, including tracking, packaging, and deploying models.",
        "context": "MLflow is an open-source platform that assists with the ML lifecycle by enabling experiment tracking, model packaging, and deployment.",
    },
)

example_contextual = EvaluationExample(
    input="Describe the tracking feature in MLflow.",
    output="MLflow supports machine learning workflows and experiment tracking, especially for data scientists.",
    score=3,
    justification="The response is contextually relevant but doesn't directly answer the question about MLflow's tracking feature.",
    grading_context={
        "ground_truth": "MLflow's tracking feature enables users to log parameters, metrics, and artifacts from ML experiments.",
        "context": "Tracking experiments is a core feature of MLflow, allowing data scientists to log experiment parameters, metrics, and outputs.",
    },
)

example_hallucinated = EvaluationExample(
    input="Who primarily developed MLflow?",
    output="MLflow was developed by Amazon as a data management tool.",
    score=4,
    justification="The response introduces incorrect information about Amazon and data management, making it a hallucination.",
    grading_context={
        "ground_truth": "MLflow was primarily developed by Databricks.",
        "context": "Databricks developed MLflow as part of its mission to simplify the ML workflow.",
    },
)

# Define the hallucination metric with refined examples and descriptions.
# Scores run from 1 (accurate) to 4 (hallucinated), hence greater_is_better=False.
hallucination_metric = make_genai_metric(
    name="complex_hallucination_evaluation",
    definition="This metric evaluates responses based on their accuracy or hallucination. Responses are assessed for correctness, minor additional info, relevance, or hallucination.",
    grading_prompt=(
        "Evaluate the response based on the question, ground truth, and context:\n"
        "- Score 1: Accurate response based on ground truth.\n"
        "- Score 2: Mostly accurate with minor additional details.\n"
        "- Score 3: Contextually relevant but does not directly answer the question.\n"
        "- Score 4: Hallucinated or introduces unrelated/fabricated information."
    ),
    # The grading prompt asks the judge to use the ground truth and context;
    # declaring the columns here is what actually puts them in front of the
    # judge. Without it only the question and the response are shown.
    grading_context_columns=["ground_truth", "context"],
    examples=[example_accurate, example_minor_additional, example_contextual, example_hallucinated],
    model="openai:/gpt-4",  # Specify the model endpoint for LLM evaluation
    parameters={"temperature": 0.0},  # temperature 0 keeps the judge's scoring as repeatable as possible
    aggregations=["mean"],
    greater_is_better=False
)


In [7]:
import os
from getpass import getpass

import mlflow

# SECURITY: never paste an API key into a notebook cell; the notebook file
# (and its history) is shared and committed as-is. A live key was previously
# hardcoded on this line, so treat that key as compromised and revoke it in
# the OpenAI dashboard.
# The judge model used by the metric ("openai:/gpt-4") reads OPENAI_API_KEY:
# reuse a key that is already exported, otherwise prompt for it without echo.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Load the complex test dataset
eval_df = pd.read_csv("complex_hallucination_test_data.csv")

# Run evaluation with the custom hallucination metric. The "response" column
# already holds the model outputs, so no model is passed to mlflow.evaluate.
with mlflow.start_run():
    results = mlflow.evaluate(
        data=eval_df,
        evaluators="default",
        predictions="response",  # Column with the model's responses
        extra_metrics=[hallucination_metric],  # Our custom metric
        evaluator_config={
            # Map the names the metric asks for onto the dataset's columns
            "col_mapping": {
                "inputs": "query",
                "context": "context",
                "ground_truth": "ground_truth"
            }
        }
    )

    # Log the hallucination results table as an artifact
    results_df = results.tables["eval_results_table"]
    results_df.to_csv("complex_hallucination_evaluation_results.csv", index=False)
    mlflow.log_artifact("complex_hallucination_evaluation_results.csv")

# Display results for verification
print(results_df)


2024/11/07 10:20:20 INFO mlflow.models.evaluation.default_evaluator: Testing metrics on first row...
100%|██████████| 1/1 [00:02<00:00,  2.42s/it]
100%|██████████| 8/8 [00:03<00:00,  2.58it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 331.70it/s]
Downloading artifacts: 100%|██████████| 1/1 [00:00<00:00, 174.17it/s]

                                               query  \
0                What is the main purpose of MLflow?   
1                    Who primarily developed MLflow?   
2           Describe the tracking feature in MLflow.   
3               What services does Databricks offer?   
4     Explain how MLflow integrates with Databricks.   
5  What is the significance of MLflow's model reg...   
6            How does MLflow handle data versioning?   
7                     List the components of MLflow.   

                                        ground_truth  \
0  MLflow is primarily designed to manage and str...   
1      MLflow was primarily developed by Databricks.   
2  MLflow's tracking feature enables users to log...   
3  Databricks provides a collaborative platform f...   
4  MLflow integrates with Databricks by allowing ...   
5  The MLflow model registry enables model versio...   
6  MLflow itself does not handle data versioning ...   
7  MLflow comprises four main components: Track


