Added metrics for genai text #2514

Merged 19 commits on Jan 29, 2024
@@ -0,0 +1,16 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Constants for genai_metrics."""

_CITATION = """
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation \
metric for assessing the quality of an answer in a question-answering task. \
Your job is to compute an accurate evaluation score using the provided \
evaluation metric.
Your response will be used in automated evaluation of question-answering \
systems, and must be an integer between 1 and 5, and nothing else.
""".strip()
@@ -0,0 +1,32 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Compute AI-assisted metrics for generative text models."""

import logging
from pathlib import Path

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')


def get_genai_metric(metric_name, **metric_kwargs):
    """Get the metric from the genai library.

    :param metric_name: The name of the metric.
    :type metric_name: str
    :param metric_kwargs: The keyword arguments to pass to the metric.
    :type metric_kwargs: dict
    :return: The computed metric scores.
    :rtype: dict
    """
    curr_file_dir = Path(__file__).resolve().parent
    metric = evaluate.load(
        str(curr_file_dir.joinpath(f'scripts/{metric_name}.py')))
    return metric.compute(**metric_kwargs)
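For context, a minimal usage sketch (not part of the diff). It assumes this module is importable as responsibleai_text.utils.genai_metrics.metrics and that wrapper_model is any object whose predict method takes a DataFrame and returns one response per row, which is what the metric scripts below expect.

# Hypothetical usage sketch; module path and wrapper_model are assumptions.
from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric

result = get_genai_metric(
    'coherence',                      # loads scripts/coherence.py
    predictions=['Paris is the capital of France.'],
    references=['What is the capital of France?'],
    wrapper_model=wrapper_model)      # placeholder model wrapper
print(result)                         # e.g. {'scores': [5]}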
@@ -0,0 +1,38 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Helper function to compute metrics."""

import pandas as pd

from responsibleai_text.utils.genai_metrics.constants import _SYS_PROMPT


def format_str(s, **kwargs):
    """Zip all the kwargs together and format the string in a loop."""
    keys = list(kwargs.keys())
    lists = [kwargs[k] for k in keys]
    formatted = []
    for vals in zip(*lists):
        fmt_kwargs = {k: v for k, v in zip(keys, vals)}
        formatted.append(s.format(**fmt_kwargs))
    return formatted


def _compute_metric(template, logger, wrapper_model, **kwargs):
    m = []
    templated_ques = format_str(template, **kwargs)

    inp = pd.DataFrame({
        'questions': templated_ques,
        'sys_prompt': _SYS_PROMPT})

    responses = wrapper_model.predict(inp)

    for r in responses:
        try:
            m.append(int(r))
        except ValueError as e:
            logger.warning('Failed to parse metric `%s`: %s', r, e)
            m.append(0)
    return {'scores': m}
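To make the helper's behaviour concrete, a small sketch with invented values of what format_str returns, and of what _compute_metric then hands to the wrapper model.

# Illustrative only; the template and strings below are made up.
format_str('Q: {question} A: {prediction}',
           question=['Why is the sky blue?', 'What is 2+2?'],
           prediction=['Rayleigh scattering.', '4'])
# -> ['Q: Why is the sky blue? A: Rayleigh scattering.',
#     'Q: What is 2+2? A: 4']

# _compute_metric builds a DataFrame with one templated prompt per row in
# 'questions', broadcasts _SYS_PROMPT into 'sys_prompt', and expects
# wrapper_model.predict to return one response per row, each parseable as
# an integer from 1 to 5 (unparseable responses are scored 0).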
@@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Coherence metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
_compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The coherence metric.

Rates, from 1 to 5, how well the sentences of an answer fit together.
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (list of str): The answers to evaluate.
    references (list of str): The questions the answers respond to.
Returns:
    scores (list of int): Ratings from 1 to 5 (0 if the model response
        cannot be parsed as an integer).
"""

_TEMPLATE = """
Coherence of an answer is measured by how well all the sentences fit together \
and sound natural as a whole. Consider the overall quality of the answer \
when evaluating coherence. Given the question and answer, score the coherence \
of the answer from one to five stars using the following rating scale:
One star: the answer completely lacks coherence
Two stars: the answer mostly lacks coherence
Three stars: the answer is partially coherent
Four stars: the answer is mostly coherent
Five stars: the answer has perfect coherency

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars

QUESTION:
{question}

ANSWER:
{prediction}

RATING:
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Coherence(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references)
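For intuition, a sketch (with an invented question/answer pair) of the single prompt that format_str renders from _TEMPLATE; the wrapper model is then expected to reply with a bare integer such as 5.

# Illustrative only; the QA pair is invented.
prompt = _TEMPLATE.format(
    question='What is the capital of France?',
    prediction='Paris is the capital of France.')
# 'prompt' now holds the rating instructions followed by the QUESTION,
# ANSWER and RATING sections; _compute_metric sends it alongside _SYS_PROMPT.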
@@ -0,0 +1,83 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Equivalence metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
_compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The equivalence metric.

Rates, from 1 to 5, how similar the predicted answer is to the correct answer.
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (list of str): The predicted answers.
    references (list of str): The questions.
    answers (list of str): The correct answers.
Returns:
    scores (list of int): Ratings from 1 to 5 (0 if the model response
        cannot be parsed as an integer).
"""

_TEMPLATE = """
Equivalence, as a metric, measures the similarity between the predicted \
answer and the correct answer. If the information and content in the \
predicted answer is similar or equivalent to the correct answer, then the \
value of the Equivalence metric should be high, else it should be low. Given \
the question, correct answer, and predicted answer, determine the value of \
the Equivalence metric using the following rating scale:
One star: the predicted answer is not at all similar to the correct answer
Two stars: the predicted answer is mostly not similar to the correct answer
Three stars: the predicted answer is somewhat similar to the correct answer
Four stars: the predicted answer is mostly similar to the correct answer
Five stars: the predicted answer is completely similar to the correct answer

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.

QUESTION:
{question}

CORRECT ANSWER:
{answer}

PREDICTED ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Equivalence(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
                "answers": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references,
            answer=kwargs['answers'])
@@ -0,0 +1,77 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Fluency metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
_compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The fluency metric.

Rates, from 1 to 5, whether the individual sentences of an answer are
well-written and grammatically correct.
"""

_KWARGS_DESCRIPTION = """
Args:
    predictions (list of str): The answers to evaluate.
    references (list of str): The questions the answers respond to.
Returns:
    scores (list of int): Ratings from 1 to 5 (0 if the model response
        cannot be parsed as an integer).
"""

_TEMPLATE = """
Fluency measures the quality of individual sentences in the answer, and \
whether they are well-written and grammatically correct. Consider the quality \
of individual sentences when evaluating fluency. Given the question and \
answer, score the fluency of the answer from one to five stars using the \
following rating scale:
One star: the answer completely lacks fluency
Two stars: the answer mostly lacks fluency
Three stars: the answer is partially fluent
Four stars: the answer is mostly fluent
Five stars: the answer has perfect fluency

This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.

QUESTION:
{question}

ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Fluency(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references)