Added metrics for genai text (#2514)
* Added info about required packages

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* show example prediction

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* add genai metrics

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* fix linting for metrics

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* sort import order

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* fix docstrings and copyright notices

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* add test for metrics

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* genai metrics refactor

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* remove unnecessary newlines

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* fix import requirement

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

---------

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>
kartik727 committed Jan 29, 2024
1 parent 66e33cb commit 13e1782
Showing 9 changed files with 569 additions and 0 deletions.
@@ -0,0 +1,16 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Constants for genai_metrics."""

_CITATION = """
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation \
metric for assessing the quality of an answer in a question-answering task. \
Your job is to compute an accurate evaluation score using the provided \
evaluation metric.
Your response will be used in automated evaluation of question-answering \
systems, and must be an integer between 1 and 5, and nothing else.
""".strip()
@@ -0,0 +1,32 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Compute AI-assisted metrics for generative text models."""

import logging
from pathlib import Path

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')


def get_genai_metric(metric_name, **metric_kwargs):
    """Get the metric from the genai library.

    :param metric_name: The name of the metric.
    :type metric_name: str
    :param metric_kwargs: The keyword arguments to pass to the metric.
    :type metric_kwargs: dict
    :return: The computed metric values.
    :rtype: dict
    """
    curr_file_dir = Path(__file__).resolve().parent
    metric = evaluate.load(
        str(curr_file_dir.joinpath(f'scripts/{metric_name}.py')))
    return metric.compute(**metric_kwargs)
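
For context, a minimal usage sketch (not part of this commit): the module path, the stub wrapper class, and the sample data below are illustrative assumptions. Any object whose predict method accepts the pandas DataFrame built by the metric scripts and returns one string score per row should work.

# Hypothetical usage sketch; the module path is assumed from the package layout.
from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric


class ConstantWrapper:
    """Stand-in for a real LLM wrapper; always answers '5'."""

    def predict(self, inp):
        return ['5'] * len(inp)


result = get_genai_metric(
    'coherence',
    predictions=['The Eiffel Tower is in Paris.'],
    references=['Where is the Eiffel Tower?'],
    wrapper_model=ConstantWrapper())
print(result)  # {'scores': [5]}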
@@ -0,0 +1,38 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Helper function to compute metrics."""

import pandas as pd

from responsibleai_text.utils.genai_metrics.constants import _SYS_PROMPT


def format_str(s, **kwargs):
    """Zip all the kwargs together and format the string in a loop."""
    keys = list(kwargs.keys())
    lists = [kwargs[k] for k in keys]
    formatted = []
    for vals in zip(*lists):
        fmt_kwargs = {k: v for k, v in zip(keys, vals)}
        formatted.append(s.format(**fmt_kwargs))
    return formatted


def _compute_metric(template, logger, wrapper_model, **kwargs):
    m = []
    templated_ques = format_str(template, **kwargs)

    inp = pd.DataFrame({
        'questions': templated_ques,
        'sys_prompt': _SYS_PROMPT})

    responses = wrapper_model.predict(inp)

    for r in responses:
        try:
            m.append(int(r))
        except ValueError as e:
            logger.warning('Failed to parse metric `%s`: %s', r, e)
            m.append(0)
    return {'scores': m}
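
A quick illustration of format_str with made-up data: it zips the keyword lists row-wise and fills the template once per row, and _compute_metric then sends each filled prompt to the wrapper model, mapping any reply that does not parse as an integer to a score of 0.

template = 'QUESTION: {question} ANSWER: {prediction}'
prompts = format_str(template,
                     question=['Who wrote Hamlet?', 'What is 2 + 2?'],
                     prediction=['Shakespeare', '4'])
# prompts == ['QUESTION: Who wrote Hamlet? ANSWER: Shakespeare',
#             'QUESTION: What is 2 + 2? ANSWER: 4']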
@@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Coherence metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The coherence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Coherence of an answer is measured by how well all the sentences fit together \
and sound naturally as a whole. Consider the overall quality of the answer \
when evaluating coherence. Given the question and answer, score the coherence \
of answer between one to five stars using the following rating scale:
One star: the answer completely lacks coherence
Two stars: the answer mostly lacks coherence
Three stars: the answer is partially coherent
Four stars: the answer is mostly coherent
Five stars: the answer has perfect coherency
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}
ANSWER:
{prediction}
RATING:
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Coherence(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references)
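
What the wrapper model actually receives is a two-column DataFrame built by _compute_metric: 'questions' holds the template above filled per example, and 'sys_prompt' repeats the shared system prompt from constants. A sketch of one filled prompt, with a made-up question and answer:

filled = _TEMPLATE.format(
    question='What is the capital of France?',
    prediction='Paris is the capital of France.')
# The tail of `filled` reads:
# QUESTION:
# What is the capital of France?
# ANSWER:
# Paris is the capital of France.
# RATING: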
@@ -0,0 +1,83 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Equivalence metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The equivalence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Equivalence, as a metric, measures the similarity between the predicted \
answer and the correct answer. If the information and content in the \
predicted answer is similar or equivalent to the correct answer, then the \
value of the Equivalence metric should be high, else it should be low. Given \
the question, correct answer, and predicted answer, determine the value of \
Equivalence metric using the following rating scale:
One star: the predicted answer is not at all similar to the correct answer
Two stars: the predicted answer is mostly not similar to the correct answer
Three stars: the predicted answer is somewhat similar to the correct answer
Four stars: the predicted answer is mostly similar to the correct answer
Five stars: the predicted answer is completely similar to the correct answer
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
CORRECT ANSWER:
{answer}
PREDICTED ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Equivalence(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
                "answers": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references,
            answer=kwargs['answers'])
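
Unlike the coherence metric above, this one also needs the ground-truth answer, passed through the extra 'answers' feature. A sketch with made-up data, reusing the hypothetical ConstantWrapper stub from the earlier example:

result = get_genai_metric(
    'equivalence',
    predictions=['It opened in 1889.'],
    references=['When did the Eiffel Tower open?'],
    answers=['The Eiffel Tower opened in 1889.'],
    wrapper_model=ConstantWrapper())
# result == {'scores': [5]}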
@@ -0,0 +1,77 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Fluency metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The fluency metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Fluency measures the quality of individual sentences in the answer, and \
whether they are well-written and grammatically correct. Consider the quality \
of individual sentences when evaluating fluency. Given the question and \
answer, score the fluency of the answer between one to five stars using the \
following rating scale:
One star: the answer completely lacks fluency
Two stars: the answer mostly lacks fluency
Three stars: the answer is partially fluent
Four stars: the answer is mostly fluent
Five stars: the answer has perfect fluency
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Fluency(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references)
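
Fluency follows the same structure as coherence. For completeness, a sketch of loading this script directly with evaluate, which is what get_genai_metric('fluency', ...) does with an absolute path; the relative path, stub wrapper, and data below are illustrative assumptions.

import evaluate


class StubWrapper:
    """Hypothetical wrapper; always answers '4'."""

    def predict(self, inp):
        return ['4'] * len(inp)


fluency = evaluate.load('scripts/fluency.py')  # assumed path, relative to the genai_metrics package directory
result = fluency.compute(
    predictions=['Two plus two equals four.'],
    references=['What is 2 + 2?'],
    wrapper_model=StubWrapper())
# result == {'scores': [4]}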