Added metrics for genai text (#2514)
* Added info about required packages

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* show example prediction

* Update responsibleaidashboard-question-answering-model-debugging.ipynb

* add genai metrics

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* fix linting for metrics

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* sort import order

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* fix docstrings and copyright notices

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* add test for metrics

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* genai metrics refactor

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* remove unnecessary newlines

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

* fix import requirement

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>

---------

Signed-off-by: Kartik Choudhary <kartikchoudh@umass.edu>
kartik727 committed Jan 29, 2024
1 parent 66e33cb commit 13e1782
Showing 9 changed files with 569 additions and 0 deletions.
@@ -0,0 +1,16 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Constants for genai_metrics."""

_CITATION = """
"""

_SYS_PROMPT = """
You are an AI assistant. You will be given the definition of an evaluation \
metric for assessing the quality of an answer in a question-answering task. \
Your job is to compute an accurate evaluation score using the provided \
evaluation metric.
Your response will be used in automated evaluation of question-answering \
systems, and must be an integer between 1 and 5, and nothing else.
""".strip()
@@ -0,0 +1,32 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Compute AI-assisted metrics for generative text models."""

import logging
from pathlib import Path

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')


def get_genai_metric(metric_name, **metric_kwargs):
    """Get the metric from the genai library.

    :param metric_name: The name of the metric.
    :type metric_name: str
    :param metric_kwargs: The keyword arguments to pass to the metric.
    :type metric_kwargs: dict
    :return: The computed metric values.
    :rtype: dict
    """
    curr_file_dir = Path(__file__).resolve().parent
    metric = evaluate.load(
        str(curr_file_dir.joinpath(f'scripts/{metric_name}.py')))
    return metric.compute(**metric_kwargs)
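
For context, a minimal usage sketch (not part of this commit): the module path, the stub wrapper class, and the sample data below are illustrative assumptions. Any object whose predict method accepts the pandas DataFrame built by the metric scripts and returns one string score per row should work.

# Hypothetical usage sketch; the module path is assumed from the package layout.
from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric


class ConstantWrapper:
    """Stand-in for a real LLM wrapper; always answers '5'."""

    def predict(self, inp):
        return ['5'] * len(inp)


result = get_genai_metric(
    'coherence',
    predictions=['The Eiffel Tower is in Paris.'],
    references=['Where is the Eiffel Tower?'],
    wrapper_model=ConstantWrapper())
print(result)  # {'scores': [5]}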
@@ -0,0 +1,38 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Helper function to compute metrics."""

import pandas as pd

from responsibleai_text.utils.genai_metrics.constants import _SYS_PROMPT


def format_str(s, **kwargs):
    """Zip all the kwargs together and format the string in a loop."""
    keys = list(kwargs.keys())
    lists = [kwargs[k] for k in keys]
    formatted = []
    for vals in zip(*lists):
        fmt_kwargs = {k: v for k, v in zip(keys, vals)}
        formatted.append(s.format(**fmt_kwargs))
    return formatted


def _compute_metric(template, logger, wrapper_model, **kwargs):
    m = []
    templated_ques = format_str(template, **kwargs)

    inp = pd.DataFrame({
        'questions': templated_ques,
        'sys_prompt': _SYS_PROMPT})

    responses = wrapper_model.predict(inp)

    for r in responses:
        try:
            m.append(int(r))
        except ValueError as e:
            logger.warning('Failed to parse metric `%s`: %s', r, e)
            m.append(0)
    return {'scores': m}
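
A quick illustration of format_str with made-up data: it zips the keyword lists row-wise and fills the template once per row, and _compute_metric then sends each filled prompt to the wrapper model, mapping any reply that does not parse as an integer to a score of 0.

template = 'QUESTION: {question} ANSWER: {prediction}'
prompts = format_str(template,
                     question=['Who wrote Hamlet?', 'What is 2 + 2?'],
                     prediction=['Shakespeare', '4'])
# prompts == ['QUESTION: Who wrote Hamlet? ANSWER: Shakespeare',
#             'QUESTION: What is 2 + 2? ANSWER: 4']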
@@ -0,0 +1,87 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Coherence metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The coherence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Coherence of an answer is measured by how well all the sentences fit together \
and sound naturally as a whole. Consider the overall quality of the answer \
when evaluating coherence. Given the question and answer, score the coherence \
of answer between one to five stars using the following rating scale:
One star: the answer completely lacks coherence
Two stars: the answer mostly lacks coherence
Three stars: the answer is partially coherent
Four stars: the answer is mostly coherent
Five stars: the answer has perfect coherency
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
Some examples of valid responses are:
1
2
5
Some examples of invalid responses are:
1/5
1.5
3.0
5 stars
QUESTION:
{question}
ANSWER:
{prediction}
RATING:
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Coherence(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references)
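
What the wrapper model actually receives is a two-column DataFrame built by _compute_metric: 'questions' holds the template above filled per example, and 'sys_prompt' repeats the shared system prompt from constants. A sketch of one filled prompt, with a made-up question and answer:

filled = _TEMPLATE.format(
    question='What is the capital of France?',
    prediction='Paris is the capital of France.')
# The tail of `filled` reads:
# QUESTION:
# What is the capital of France?
# ANSWER:
# Paris is the capital of France.
# RATING: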
@@ -0,0 +1,83 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Equivalence metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The equivalence metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Equivalence, as a metric, measures the similarity between the predicted \
answer and the correct answer. If the information and content in the \
predicted answer is similar or equivalent to the correct answer, then the \
value of the Equivalence metric should be high, else it should be low. Given \
the question, correct answer, and predicted answer, determine the value of \
Equivalence metric using the following rating scale:
One star: the predicted answer is not at all similar to the correct answer
Two stars: the predicted answer is mostly not similar to the correct answer
Three stars: the predicted answer is somewhat similar to the correct answer
Four stars: the predicted answer is mostly similar to the correct answer
Five stars: the predicted answer is completely similar to the correct answer
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
CORRECT ANSWER:
{answer}
PREDICTED ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Equivalence(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence"),
                "answers": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references,
            answer=kwargs['answers'])
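
Unlike the coherence metric above, this one also needs the ground-truth answer, passed through the extra 'answers' feature. A sketch with made-up data, reusing the hypothetical ConstantWrapper stub from the earlier example:

result = get_genai_metric(
    'equivalence',
    predictions=['It opened in 1889.'],
    references=['When did the Eiffel Tower open?'],
    answers=['The Eiffel Tower opened in 1889.'],
    wrapper_model=ConstantWrapper())
# result == {'scores': [5]}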
@@ -0,0 +1,77 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

"""Fluency metric."""

import logging

from responsibleai_text.utils.genai_metrics.constants import _CITATION
from responsibleai_text.utils.genai_metrics.scripts._compute import \
    _compute_metric

module_logger = logging.getLogger(__name__)
module_logger.setLevel(logging.INFO)

try:
    import evaluate
except ImportError:
    module_logger.debug(
        'Could not import evaluate, required if using a genai model')

try:
    import datasets
except ImportError:
    module_logger.debug(
        'Could not import datasets, required if using a genai model')

logger = evaluate.logging.get_logger(__name__)

_DESCRIPTION = """The fluency metric.
"""

_KWARGS_DESCRIPTION = """
**SOME DESCRIPTION**
"""

_TEMPLATE = """
Fluency measures the quality of individual sentences in the answer, and \
whether they are well-written and grammatically correct. Consider the quality \
of individual sentences when evaluating fluency. Given the question and \
answer, score the fluency of the answer between one to five stars using the \
following rating scale:
One star: the answer completely lacks fluency
Two stars: the answer mostly lacks fluency
Three stars: the answer is partially fluent
Four stars: the answer is mostly fluent
Five stars: the answer has perfect fluency
This rating value should always be an integer between 1 and 5. So the rating \
produced should be 1 or 2 or 3 or 4 or 5.
QUESTION:
{question}
ANSWER:
{prediction}
""".strip()


@evaluate.utils.file_utils.add_start_docstrings(
    _DESCRIPTION, _KWARGS_DESCRIPTION)
class Fluency(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features({
                "predictions": datasets.Value("string", id="sequence"),
                "references": datasets.Value("string", id="sequence")}))

    def _compute(self, *, predictions=None, references=None, **kwargs):
        return _compute_metric(
            _TEMPLATE,
            logger,
            kwargs['wrapper_model'],
            prediction=predictions,
            question=references)
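
Fluency follows the same structure as coherence. For completeness, a sketch of loading this script directly with evaluate, which is what get_genai_metric('fluency', ...) does with an absolute path; the relative path, stub wrapper, and data below are illustrative assumptions.

import evaluate


class StubWrapper:
    """Hypothetical wrapper; always answers '4'."""

    def predict(self, inp):
        return ['4'] * len(inp)


fluency = evaluate.load('scripts/fluency.py')  # assumed path, relative to the genai_metrics package directory
result = fluency.compute(
    predictions=['Two plus two equals four.'],
    references=['What is 2 + 2?'],
    wrapper_model=StubWrapper())
# result == {'scores': [4]}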