From 2899deafeffb4a15098a1daa695124e7b6e4eb90 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Sun, 15 Oct 2023 18:53:17 -0400 Subject: [PATCH 01/12] Added info about required packages --- ...d-question-answering-model-debugging.ipynb | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb index 3b663cfc61..4af484b9f1 100644 --- a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb +++ b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb @@ -42,6 +42,31 @@ "The following section examines the code necessary to create datasets and a model. It then generates insights using the `responsibleai` API that can be visually analyzed." ] }, +{ + "cell_type": "markdown", + "id": "6174bcad", + "metadata": {}, + "source": [ + "### Prepare\n", + "\n", + "To run this notebook, we need to install the following packages:\n", + "\n", + "```requirements.txt\n", + "raiutils\n", + "raiwidgets\n", + "datasets\n", + "transformers\n", + "responsibleai_text\n", + "torch\n", + "```\n", + "\n", + "Run the following command to load the spacy pipeline:\n", + "\n", + "```bash\n", + "python -m spacy download en_core_web_sm\n", + "```" + ] + }, { "cell_type": "markdown", "id": "40739025", From 80b0c3454bb8b34998da9e0bdb71af3c079bf829 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Sun, 15 Oct 2023 18:53:55 -0400 Subject: [PATCH 02/12] Update responsibleaidashboard-question-answering-model-debugging.ipynb --- ...ponsibleaidashboard-question-answering-model-debugging.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb index 4af484b9f1..d804c9bed8 100644 --- a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb +++ b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb @@ -51,7 +51,7 @@ "\n", "To run this notebook, we need to install the following packages:\n", "\n", - "```requirements.txt\n", + "```\n", "raiutils\n", "raiwidgets\n", "datasets\n", From af0099398be0a75b2588a219a79db69057c63bf2 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Sun, 15 Oct 2023 19:00:01 -0400 Subject: [PATCH 03/12] show example prediction --- ...d-question-answering-model-debugging.ipynb | 57 +++++++++++-------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb index d804c9bed8..2d8de2ffcd 100644 --- a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb +++ b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb @@ -111,16 +111,7 @@ "metadata": {}, "outputs": [], "source": [ - "dataset = datasets.load_dataset(\"squad\", split=\"train\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a0eef443", - "metadata": {}, - "outputs": [], - "source": [ + "dataset = 
datasets.load_dataset(\"squad\", split=\"train\")\n", "dataset" ] }, @@ -155,17 +146,9 @@ "metadata": {}, "outputs": [], "source": [ - "data = pd.DataFrame({'context': context, 'questions': questions, 'answers': answers})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e6f87e9c", - "metadata": {}, - "outputs": [], - "source": [ - "data" + "data = pd.DataFrame({'context': context, 'questions': questions, 'answers': answers})\n", + "data = data.sample(frac=1.0, random_state=42).reset_index(drop=True)\n", + "data.head()" ] }, { @@ -184,18 +167,42 @@ "outputs": [], "source": [ "# load the question-answering model\n", - "pmodel = pipeline('question-answering')" + "pipeline_model = pipeline('question-answering')\n", + "test_size = 5\n", + "\n", + "train_data = data\n", + "test_data = data[:test_size]" + ] + }, + { + "cell_type": "markdown", + "id": "7cf8327b", + "metadata": {}, + "source": [ + "See an example of the model's predictions" ] }, { "cell_type": "code", "execution_count": null, - "id": "04801887", + "id": "ce087699", "metadata": {}, "outputs": [], "source": [ - "train_data = data\n", - "test_data = data[:5]" + "def get_answer(dataset, idx):\n", + " model_output = pipeline_model(question=dataset['questions'][idx], \n", + " context=dataset['context'][idx])\n", + " pred = model_output['answer']\n", + " return pred\n", + "\n", + "def check_answer(dataset, idx):\n", + " pred = get_answer(dataset, idx)\n", + " print('Question : ', dataset['questions'][idx])\n", + " print('Answer : ', dataset['answers'][idx])\n", + " print('Predicted : ', pred)\n", + " print('Correct : ', pred == dataset['answers'][idx])\n", + "\n", + "check_answer(test_data, 0)\n" ] }, { From 6c19f0fd3187d18631324a699d6ef9ecb24e9916 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Sun, 15 Oct 2023 19:01:09 -0400 Subject: [PATCH 04/12] Update responsibleaidashboard-question-answering-model-debugging.ipynb --- ...onsibleaidashboard-question-answering-model-debugging.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb index 2d8de2ffcd..dbcb6b8dc9 100644 --- a/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb +++ b/notebooks/responsibleaidashboard/text/responsibleaidashboard-question-answering-model-debugging.ipynb @@ -241,8 +241,7 @@ "metadata": {}, "outputs": [], "source": [ - "rai_insights = RAITextInsights(pmodel, test_data,\n", - " \"answers\",\n", + "rai_insights = RAITextInsights(pipeline_model, test_data, \"answers\",\n", " task_type=ModelTask.QUESTION_ANSWERING)" ] }, From 4ce78938860225a43bf4176efd7a7b73a2a34ef5 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Fri, 26 Jan 2024 16:01:49 -0500 Subject: [PATCH 05/12] add genai metrics Signed-off-by: Kartik Choudhary --- .../utils/genai_metrics/metrics.py | 22 +++++ .../utils/genai_metrics/scripts/coherence.py | 92 +++++++++++++++++++ .../genai_metrics/scripts/equivalence.py | 86 +++++++++++++++++ .../utils/genai_metrics/scripts/fluency.py | 81 ++++++++++++++++ .../genai_metrics/scripts/groundedness.py | 78 ++++++++++++++++ .../utils/genai_metrics/scripts/relevance.py | 81 ++++++++++++++++ 6 files changed, 440 insertions(+) create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py create mode 100644 
responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py new file mode 100644 index 0000000000..7a5c240e9e --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py @@ -0,0 +1,22 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Compute AI-assisted metrics for generative text models.""" + +from pathlib import Path +import evaluate + +def get_genai_metric(metric_name, **metric_kwargs): + """Get the metric from the genai library. + + :param metric_name: The name of the metric. + :type metric_name: str + :param metric_kwargs: The keyword arguments to pass to the metric. + :type metric_kwargs: dict + :return: The metric. + :rtype: float + """ + curr_file_dir = Path(__file__).resolve().parent + metric = evaluate.load( + str(curr_file_dir.joinpath(f'scripts/{metric_name}.py'))) + return metric.compute(**metric_kwargs) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py new file mode 100644 index 0000000000..4342ee978e --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py @@ -0,0 +1,92 @@ +"""Groundedness metric.""" + +import datasets +import evaluate +import pandas as pd + +logger = evaluate.logging.get_logger(__name__) + + +_CITATION = """ +""" + +_DESCRIPTION = """The coherence metric. +""" + +_KWARGS_DESCRIPTION = """ +**SOME DESCRIPTION** +""" + +_SYS_PROMPT = """ +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +""".strip() + +_TEMPLATE = """ +Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: +One star: the answer completely lacks coherence +Two stars: the answer mostly lacks coherence +Three stars: the answer is partially coherent +Four stars: the answer is mostly coherent +Five stars: the answer has perfect coherency + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 
+Some examples of valid responses are: +1 +2 +5 +Some examples of invalid responses are: +1/5 +1.5 +3.0 +5 stars + +QUESTION: +{question} + +ANSWER: +{prediction} + +RATING: +""".strip() + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Coherence(evaluate.Metric): + def _info(self): + + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence") + } + ), + ) + + def _compute(self, *, predictions=None, references=None, **kwargs): + m = [] + templated_ques = [] + + for p, r in zip(predictions, references): + templated_ques.append(_TEMPLATE.format(question=r, prediction=p)) + + model = kwargs['wrapper_model'] + + inp = pd.DataFrame({ + 'questions' : templated_ques, + 'sys_prompt' : _SYS_PROMPT}) + + responses = model.predict(inp) + + for r in responses: + try: + m.append(int(r)) + except ValueError as e: + logger.warning('Failed to parse metric `%s`: %s', r, e) + m.append(0) + return {'scores' : m} + \ No newline at end of file diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py new file mode 100644 index 0000000000..e2acba7eb6 --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py @@ -0,0 +1,86 @@ +"""Groundedness metric.""" + +import datasets +import evaluate +import pandas as pd + +logger = evaluate.logging.get_logger(__name__) + + +_CITATION = """ +""" + +_DESCRIPTION = """The equivalence metric. +""" + +_KWARGS_DESCRIPTION = """ +**SOME DESCRIPTION** +""" + +_SYS_PROMPT = """ +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +""".strip() + +_TEMPLATE = """ +Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: +One star: the predicted answer is not at all similar to the correct answer +Two stars: the predicted answer is mostly not similar to the correct answer +Three stars: the predicted answer is somewhat similar to the correct answer +Four stars: the predicted answer is mostly similar to the correct answer +Five stars: the predicted answer is completely similar to the correct answer + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 
+ +QUESTION: +{question} + +CORRECT ANSWER: +{answer} + +PREDICTED ANSWER: +{prediction} +""".strip() + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Equivalence(evaluate.Metric): + def _info(self): + + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + "answers": datasets.Value("string", id="sequence") + } + ), + ) + + def _compute(self, *, predictions=None, references=None, **kwargs): + m = [] + templated_ques = [] + + answers = kwargs['answers'] + for p, r, a in zip(predictions, references, answers): + templated_ques.append(_TEMPLATE.format(question=r, prediction=p, answer=a)) + + model = kwargs['wrapper_model'] + + inp = pd.DataFrame({ + 'questions' : templated_ques, + 'sys_prompt' : _SYS_PROMPT}) + + responses = model.predict(inp) + + for r in responses: + try: + m.append(int(r)) + except ValueError as e: + logger.warning('Failed to parse metric `%s`: %s', r, e) + m.append(0) + return {'scores' : m} + \ No newline at end of file diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py new file mode 100644 index 0000000000..169a88bec9 --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py @@ -0,0 +1,81 @@ +"""Groundedness metric.""" + +import datasets +import evaluate +import pandas as pd + +logger = evaluate.logging.get_logger(__name__) + + +_CITATION = """ +""" + +_DESCRIPTION = """The fluency metric. +""" + +_KWARGS_DESCRIPTION = """ +**SOME DESCRIPTION** +""" + +_SYS_PROMPT = """ +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +""".strip() + +_TEMPLATE = """ +Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale: +One star: the answer completely lacks fluency +Two stars: the answer mostly lacks fluency +Three stars: the answer is partially fluent +Four stars: the answer is mostly fluent +Five stars: the answer has perfect fluency + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 
+ +QUESTION: +{question} + +ANSWER: +{prediction} +""".strip() + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Fluency(evaluate.Metric): + def _info(self): + + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence") + } + ), + ) + + def _compute(self, *, predictions=None, references=None, **kwargs): + m = [] + templated_ques = [] + + for p, r in zip(predictions, references): + templated_ques.append(_TEMPLATE.format(question=r, prediction=p)) + + model = kwargs['wrapper_model'] + + inp = pd.DataFrame({ + 'questions' : templated_ques, + 'sys_prompt' : _SYS_PROMPT}) + + responses = model.predict(inp) + + for r in responses: + try: + m.append(int(r)) + except ValueError as e: + logger.warning('Failed to parse metric `%s`: %s', r, e) + m.append(0) + return {'scores' : m} + \ No newline at end of file diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py new file mode 100644 index 0000000000..8d4f42bc16 --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py @@ -0,0 +1,78 @@ +"""Groundedness metric.""" + +import datasets +import evaluate +import pandas as pd + +logger = evaluate.logging.get_logger(__name__) + + +_CITATION = """ +""" + +_DESCRIPTION = """The groundedness metric. +""" + +_KWARGS_DESCRIPTION = """ +**SOME DESCRIPTION** +""" + +_SYS_PROMPT = """ +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +""".strip() + +_TEMPLATE = """ +1. 5: The ANSWER follows logically from the information contained in the CONTEXT. +2. 1: The ANSWER is logically false from the information contained in the CONTEXT. +3. an integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information. +Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. +Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. 
+ +CONTEXT: +{context} + +ANSWER: +{prediction} +""".strip() + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Groundedness(evaluate.Metric): + def _info(self): + + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence") + } + ), + ) + + def _compute(self, *, predictions=None, references=None, **kwargs): + m = [] + templated_ques = [] + + for p, r in zip(predictions, references): + templated_ques.append(_TEMPLATE.format(context=r, prediction=p)) + + model = kwargs['wrapper_model'] + + inp = pd.DataFrame({ + 'questions' : templated_ques, + 'sys_prompt' : _SYS_PROMPT}) + + responses = model.predict(inp) + + for r in responses: + try: + m.append(int(r)) + except ValueError as e: + logger.warning('Failed to parse metric `%s`: %s', r, e) + m.append(0) + return {'scores' : m} + \ No newline at end of file diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py new file mode 100644 index 0000000000..7947556b52 --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py @@ -0,0 +1,81 @@ +"""Groundedness metric.""" + +import datasets +import evaluate +import pandas as pd + +logger = evaluate.logging.get_logger(__name__) + + +_CITATION = """ +""" + +_DESCRIPTION = """The relevance metric. +""" + +_KWARGS_DESCRIPTION = """ +**SOME DESCRIPTION** +""" + +_SYS_PROMPT = """ +You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. +Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +""".strip() + +_TEMPLATE = """ +Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: +One star: the answer completely lacks relevance +Two stars: the answer mostly lacks relevance +Three stars: the answer is partially relevant +Four stars: the answer is mostly relevant +Five stars: the answer has perfect relevance + +This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. 
+ +QUESTION AND CONTEXT: +{question} + +ANSWER: +{prediction} +""".strip() + + +@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +class Relevance(evaluate.Metric): + def _info(self): + + return evaluate.MetricInfo( + description=_DESCRIPTION, + citation=_CITATION, + inputs_description=_KWARGS_DESCRIPTION, + features=datasets.Features( + { + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence") + } + ), + ) + + def _compute(self, *, predictions=None, references=None, **kwargs): + m = [] + templated_ques = [] + + for p, r in zip(predictions, references): + templated_ques.append(_TEMPLATE.format(question=r, prediction=p)) + + model = kwargs['wrapper_model'] + + inp = pd.DataFrame({ + 'questions' : templated_ques, + 'sys_prompt' : _SYS_PROMPT}) + + responses = model.predict(inp) + + for r in responses: + try: + m.append(int(r)) + except ValueError as e: + logger.warning('Failed to parse metric `%s`: %s', r, e) + m.append(0) + return {'scores' : m} + \ No newline at end of file From 1b7bc326dd823c08da6243af8ec356cb8511bb28 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Fri, 26 Jan 2024 16:19:12 -0500 Subject: [PATCH 06/12] fix linting for metrics Signed-off-by: Kartik Choudhary --- .../utils/genai_metrics/metrics.py | 1 + .../utils/genai_metrics/scripts/coherence.py | 26 +++++++++----- .../genai_metrics/scripts/equivalence.py | 31 ++++++++++------ .../utils/genai_metrics/scripts/fluency.py | 27 +++++++++----- .../genai_metrics/scripts/groundedness.py | 35 ++++++++++++------- .../utils/genai_metrics/scripts/relevance.py | 27 +++++++++----- 6 files changed, 98 insertions(+), 49 deletions(-) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py index 7a5c240e9e..bb3efa78b1 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py @@ -6,6 +6,7 @@ from pathlib import Path import evaluate + def get_genai_metric(metric_name, **metric_kwargs): """Get the metric from the genai library. diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py index 4342ee978e..596b1c2f66 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py @@ -18,19 +18,27 @@ """ _SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +You are an AI assistant. You will be given the definition of an evaluation \ +metric for assessing the quality of an answer in a question-answering task. \ +Your job is to compute an accurate evaluation score using the provided \ +evaluation metric. +Your response will be used in automated evaluation of question-answering \ +systems, and must be an integer between 1 and 5, and nothing else. """.strip() _TEMPLATE = """ -Coherence of an answer is measured by how well all the sentences fit together and sound naturally as a whole. 
Consider the overall quality of the answer when evaluating coherence. Given the question and answer, score the coherence of answer between one to five stars using the following rating scale: +Coherence of an answer is measured by how well all the sentences fit together \ +and sound naturally as a whole. Consider the overall quality of the answer \ +when evaluating coherence. Given the question and answer, score the coherence \ +of answer between one to five stars using the following rating scale: One star: the answer completely lacks coherence Two stars: the answer mostly lacks coherence Three stars: the answer is partially coherent Four stars: the answer is mostly coherent Five stars: the answer has perfect coherency -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. +This rating value should always be an integer between 1 and 5. So the rating \ +produced should be 1 or 2 or 3 or 4 or 5. Some examples of valid responses are: 1 2 @@ -51,7 +59,8 @@ """.strip() -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +@evaluate.utils.file_utils.add_start_docstrings( + _DESCRIPTION, _KWARGS_DESCRIPTION) class Coherence(evaluate.Metric): def _info(self): @@ -77,8 +86,8 @@ def _compute(self, *, predictions=None, references=None, **kwargs): model = kwargs['wrapper_model'] inp = pd.DataFrame({ - 'questions' : templated_ques, - 'sys_prompt' : _SYS_PROMPT}) + 'questions': templated_ques, + 'sys_prompt': _SYS_PROMPT}) responses = model.predict(inp) @@ -88,5 +97,4 @@ def _compute(self, *, predictions=None, references=None, **kwargs): except ValueError as e: logger.warning('Failed to parse metric `%s`: %s', r, e) m.append(0) - return {'scores' : m} - \ No newline at end of file + return {'scores': m} diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py index e2acba7eb6..33e01e659e 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py @@ -18,19 +18,29 @@ """ _SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +You are an AI assistant. You will be given the definition of an evaluation \ +metric for assessing the quality of an answer in a question-answering task. \ +Your job is to compute an accurate evaluation score using the provided \ +evaluation metric. +Your response will be used in automated evaluation of question-answering \ +systems, and must be an integer between 1 and 5, and nothing else. """.strip() _TEMPLATE = """ -Equivalence, as a metric, measures the similarity between the predicted answer and the correct answer. If the information and content in the predicted answer is similar or equivalent to the correct answer, then the value of the Equivalence metric should be high, else it should be low. 
Given the question, correct answer, and predicted answer, determine the value of Equivalence metric using the following rating scale: +Equivalence, as a metric, measures the similarity between the predicted \ +answer and the correct answer. If the information and content in the \ +predicted answer is similar or equivalent to the correct answer, then the \ +value of the Equivalence metric should be high, else it should be low. Given \ +the question, correct answer, and predicted answer, determine the value of \ +Equivalence metric using the following rating scale: One star: the predicted answer is not at all similar to the correct answer Two stars: the predicted answer is mostly not similar to the correct answer Three stars: the predicted answer is somewhat similar to the correct answer Four stars: the predicted answer is mostly similar to the correct answer Five stars: the predicted answer is completely similar to the correct answer -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. +This rating value should always be an integer between 1 and 5. So the rating \ +produced should be 1 or 2 or 3 or 4 or 5. QUESTION: {question} @@ -43,7 +53,8 @@ """.strip() -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +@evaluate.utils.file_utils.add_start_docstrings( + _DESCRIPTION, _KWARGS_DESCRIPTION) class Equivalence(evaluate.Metric): def _info(self): @@ -66,13 +77,14 @@ def _compute(self, *, predictions=None, references=None, **kwargs): answers = kwargs['answers'] for p, r, a in zip(predictions, references, answers): - templated_ques.append(_TEMPLATE.format(question=r, prediction=p, answer=a)) + templated_ques.append(_TEMPLATE.format( + question=r, prediction=p, answer=a)) model = kwargs['wrapper_model'] inp = pd.DataFrame({ - 'questions' : templated_ques, - 'sys_prompt' : _SYS_PROMPT}) + 'questions': templated_ques, + 'sys_prompt': _SYS_PROMPT}) responses = model.predict(inp) @@ -82,5 +94,4 @@ def _compute(self, *, predictions=None, references=None, **kwargs): except ValueError as e: logger.warning('Failed to parse metric `%s`: %s', r, e) m.append(0) - return {'scores' : m} - \ No newline at end of file + return {'scores': m} diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py index 169a88bec9..f24f1b9c29 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py @@ -18,19 +18,28 @@ """ _SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +You are an AI assistant. You will be given the definition of an evaluation \ +metric for assessing the quality of an answer in a question-answering task. \ +Your job is to compute an accurate evaluation score using the provided \ +evaluation metric. +Your response will be used in automated evaluation of question-answering \ +systems, and must be an integer between 1 and 5, and nothing else. 
""".strip() _TEMPLATE = """ -Fluency measures the quality of individual sentences in the answer, and whether they are well-written and grammatically correct. Consider the quality of individual sentences when evaluating fluency. Given the question and answer, score the fluency of the answer between one to five stars using the following rating scale: +Fluency measures the quality of individual sentences in the answer, and \ +whether they are well-written and grammatically correct. Consider the quality \ +of individual sentences when evaluating fluency. Given the question and \ +answer, score the fluency of the answer between one to five stars using the \ +following rating scale: One star: the answer completely lacks fluency Two stars: the answer mostly lacks fluency Three stars: the answer is partially fluent Four stars: the answer is mostly fluent Five stars: the answer has perfect fluency -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. +This rating value should always be an integer between 1 and 5. So the rating \ +produced should be 1 or 2 or 3 or 4 or 5. QUESTION: {question} @@ -40,7 +49,8 @@ """.strip() -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +@evaluate.utils.file_utils.add_start_docstrings( + _DESCRIPTION, _KWARGS_DESCRIPTION) class Fluency(evaluate.Metric): def _info(self): @@ -66,8 +76,8 @@ def _compute(self, *, predictions=None, references=None, **kwargs): model = kwargs['wrapper_model'] inp = pd.DataFrame({ - 'questions' : templated_ques, - 'sys_prompt' : _SYS_PROMPT}) + 'questions': templated_ques, + 'sys_prompt': _SYS_PROMPT}) responses = model.predict(inp) @@ -77,5 +87,4 @@ def _compute(self, *, predictions=None, references=None, **kwargs): except ValueError as e: logger.warning('Failed to parse metric `%s`: %s', r, e) m.append(0) - return {'scores' : m} - \ No newline at end of file + return {'scores': m} diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py index 8d4f42bc16..12db56a913 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py @@ -18,16 +18,27 @@ """ _SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +You are an AI assistant. You will be given the definition of an evaluation \ +metric for assessing the quality of an answer in a question-answering task. \ +Your job is to compute an accurate evaluation score using the provided \ +evaluation metric. +Your response will be used in automated evaluation of question-answering \ +systems, and must be an integer between 1 and 5, and nothing else. """.strip() _TEMPLATE = """ -1. 5: The ANSWER follows logically from the information contained in the CONTEXT. -2. 1: The ANSWER is logically false from the information contained in the CONTEXT. -3. an integer score between 1 and 5 and if such integer score does not exists, use 1: It is not possible to determine whether the ANSWER is true or false without further information. 
-Read the passage of information thoroughly and select the correct answer from the three answer labels. Read the CONTEXT thoroughly to ensure you know what the CONTEXT entails. -Note the ANSWER is generated by a computer system, it can contain certain symbols, which should not be a negative factor in the evaluation. +1. 5: The ANSWER follows logically from the information contained in the \ +CONTEXT. +2. 1: The ANSWER is logically false from the information contained in the \ +CONTEXT. +3. an integer score between 1 and 5 and if such integer score does not \ +exists, use 1: It is not possible to determine whether the ANSWER is true or \ +false without further information. +Read the passage of information thoroughly and select the correct answer from \ +the three answer labels. Read the CONTEXT thoroughly to ensure you know what \ +the CONTEXT entails. +Note the ANSWER is generated by a computer system, it can contain certain \ +symbols, which should not be a negative factor in the evaluation. CONTEXT: {context} @@ -37,7 +48,8 @@ """.strip() -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +@evaluate.utils.file_utils.add_start_docstrings( + _DESCRIPTION, _KWARGS_DESCRIPTION) class Groundedness(evaluate.Metric): def _info(self): @@ -63,8 +75,8 @@ def _compute(self, *, predictions=None, references=None, **kwargs): model = kwargs['wrapper_model'] inp = pd.DataFrame({ - 'questions' : templated_ques, - 'sys_prompt' : _SYS_PROMPT}) + 'questions': templated_ques, + 'sys_prompt': _SYS_PROMPT}) responses = model.predict(inp) @@ -74,5 +86,4 @@ def _compute(self, *, predictions=None, references=None, **kwargs): except ValueError as e: logger.warning('Failed to parse metric `%s`: %s', r, e) m.append(0) - return {'scores' : m} - \ No newline at end of file + return {'scores': m} diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py index 7947556b52..c10b33a3f6 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py @@ -18,19 +18,28 @@ """ _SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation metric for assessing the quality of an answer in a question-answering task. Your job is to compute an accurate evaluation score using the provided evaluation metric. -Your response will be used in automated evaluation of question-answering systems, and must be an integer between 1 and 5, and nothing else. +You are an AI assistant. You will be given the definition of an evaluation \ +metric for assessing the quality of an answer in a question-answering task. \ +Your job is to compute an accurate evaluation score using the provided \ +evaluation metric. +Your response will be used in automated evaluation of question-answering \ +systems, and must be an integer between 1 and 5, and nothing else. """.strip() _TEMPLATE = """ -Relevance measures how well the answer addresses the main aspects of the question, based on the context. Consider whether all and only the important aspects are contained in the answer when evaluating relevance. Given the context and question, score the relevance of the answer between one to five stars using the following rating scale: +Relevance measures how well the answer addresses the main aspects of the \ +question, based on the context. 
Consider whether all and only the important \ +aspects are contained in the answer when evaluating relevance. Given the \ +context and question, score the relevance of the answer between one to five \ +stars using the following rating scale: One star: the answer completely lacks relevance Two stars: the answer mostly lacks relevance Three stars: the answer is partially relevant Four stars: the answer is mostly relevant Five stars: the answer has perfect relevance -This rating value should always be an integer between 1 and 5. So the rating produced should be 1 or 2 or 3 or 4 or 5. +This rating value should always be an integer between 1 and 5. So the rating \ +produced should be 1 or 2 or 3 or 4 or 5. QUESTION AND CONTEXT: {question} @@ -40,7 +49,8 @@ """.strip() -@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION) +@evaluate.utils.file_utils.add_start_docstrings( + _DESCRIPTION, _KWARGS_DESCRIPTION) class Relevance(evaluate.Metric): def _info(self): @@ -66,8 +76,8 @@ def _compute(self, *, predictions=None, references=None, **kwargs): model = kwargs['wrapper_model'] inp = pd.DataFrame({ - 'questions' : templated_ques, - 'sys_prompt' : _SYS_PROMPT}) + 'questions': templated_ques, + 'sys_prompt': _SYS_PROMPT}) responses = model.predict(inp) @@ -77,5 +87,4 @@ def _compute(self, *, predictions=None, references=None, **kwargs): except ValueError as e: logger.warning('Failed to parse metric `%s`: %s', r, e) m.append(0) - return {'scores' : m} - \ No newline at end of file + return {'scores': m} From 28c94b818db821bce26a3cae955f7480b7bbeace Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Fri, 26 Jan 2024 16:20:13 -0500 Subject: [PATCH 07/12] sort import order Signed-off-by: Kartik Choudhary --- .../responsibleai_text/utils/genai_metrics/metrics.py | 1 + 1 file changed, 1 insertion(+) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py index bb3efa78b1..0b35175d0d 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py @@ -4,6 +4,7 @@ """Compute AI-assisted metrics for generative text models.""" from pathlib import Path + import evaluate From 57d8fa2800693ad0eb00bf247408098e7ddd19a5 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Fri, 26 Jan 2024 16:33:08 -0500 Subject: [PATCH 08/12] fix docstrings and copyright notices Signed-off-by: Kartik Choudhary --- .../utils/genai_metrics/scripts/coherence.py | 5 ++++- .../utils/genai_metrics/scripts/equivalence.py | 5 ++++- .../utils/genai_metrics/scripts/fluency.py | 5 ++++- .../utils/genai_metrics/scripts/groundedness.py | 3 +++ .../utils/genai_metrics/scripts/relevance.py | 5 ++++- 5 files changed, 19 insertions(+), 4 deletions(-) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py index 596b1c2f66..7bf8d4a2fe 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py @@ -1,4 +1,7 @@ -"""Groundedness metric.""" +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
+ +"""Coherence metric.""" import datasets import evaluate diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py index 33e01e659e..e3a894870f 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py @@ -1,4 +1,7 @@ -"""Groundedness metric.""" +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Equivalence metric.""" import datasets import evaluate diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py index f24f1b9c29..3a11233226 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py @@ -1,4 +1,7 @@ -"""Groundedness metric.""" +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Fluency metric.""" import datasets import evaluate diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py index 12db56a913..4a431f7474 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py @@ -1,3 +1,6 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + """Groundedness metric.""" import datasets diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py index c10b33a3f6..ada75f7c06 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py @@ -1,4 +1,7 @@ -"""Groundedness metric.""" +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Relevance metric.""" import datasets import evaluate From 3f05ead5254ceb11e169e6364bad4cbcb1f9f89a Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Fri, 26 Jan 2024 17:03:39 -0500 Subject: [PATCH 09/12] add test for metrics Signed-off-by: Kartik Choudhary --- .../tests/test_genai_metrics.py | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 responsibleai_text/tests/test_genai_metrics.py diff --git a/responsibleai_text/tests/test_genai_metrics.py b/responsibleai_text/tests/test_genai_metrics.py new file mode 100644 index 0000000000..5285d6c623 --- /dev/null +++ b/responsibleai_text/tests/test_genai_metrics.py @@ -0,0 +1,83 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
+ +from responsibleai_text.utils.genai_metrics.metrics import get_genai_metric + +PREDICTIONS = ['This is a prediction'] +REFERENCES = ['This is a reference'] +ANSWERS = ['This is an answer'] + + +class DummyModelWrapper: + def predict(self, inp): + return [1] * len(inp) + + +class TestGenAIMetrics: + + def test_coherence(self): + metric = get_genai_metric('coherence', + predictions=PREDICTIONS, + references=REFERENCES, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] + + metric = get_genai_metric('coherence', + predictions=PREDICTIONS * 5, + references=REFERENCES * 5, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] * 5 + + def test_equivalence(self): + metric = get_genai_metric('equivalence', + predictions=PREDICTIONS, + references=REFERENCES, + answers=ANSWERS, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] + + metric = get_genai_metric('equivalence', + predictions=PREDICTIONS * 5, + references=REFERENCES * 5, + answers=ANSWERS * 5, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] * 5 + + def test_fluency(self): + metric = get_genai_metric('fluency', + predictions=PREDICTIONS, + references=REFERENCES, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] + + metric = get_genai_metric('fluency', + predictions=PREDICTIONS * 5, + references=REFERENCES * 5, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] * 5 + + def test_groundedness(self): + metric = get_genai_metric('groundedness', + predictions=PREDICTIONS, + references=REFERENCES, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] + + metric = get_genai_metric('groundedness', + predictions=PREDICTIONS * 5, + references=REFERENCES * 5, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] * 5 + + def test_relevance(self): + metric = get_genai_metric('relevance', + predictions=PREDICTIONS, + references=REFERENCES, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] + + metric = get_genai_metric('relevance', + predictions=PREDICTIONS * 5, + references=REFERENCES * 5, + wrapper_model=DummyModelWrapper()) + assert metric['scores'] == [1] * 5 From ccea941725aacfbd150dc07b30e09709cac68b76 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Fri, 26 Jan 2024 21:43:12 -0500 Subject: [PATCH 10/12] genai metrics refactor Signed-off-by: Kartik Choudhary --- .../utils/genai_metrics/constants.py | 16 ++++ .../utils/genai_metrics/scripts/_compute.py | 38 ++++++++++ .../utils/genai_metrics/scripts/coherence.py | 72 +++++++----------- .../genai_metrics/scripts/equivalence.py | 76 ++++++++----------- .../utils/genai_metrics/scripts/fluency.py | 71 +++++++---------- .../genai_metrics/scripts/groundedness.py | 71 +++++++---------- .../utils/genai_metrics/scripts/relevance.py | 71 +++++++---------- 7 files changed, 196 insertions(+), 219 deletions(-) create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py create mode 100644 responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py new file mode 100644 index 0000000000..2a157e7d3b --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/constants.py @@ -0,0 +1,16 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. 
+ +"""Constants for genai_metrics.""" + +_CITATION = """ +""" + +_SYS_PROMPT = """ +You are an AI assistant. You will be given the definition of an evaluation \ +metric for assessing the quality of an answer in a question-answering task. \ +Your job is to compute an accurate evaluation score using the provided \ +evaluation metric. +Your response will be used in automated evaluation of question-answering \ +systems, and must be an integer between 1 and 5, and nothing else. +""".strip() diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py new file mode 100644 index 0000000000..43ab0fc23a --- /dev/null +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/_compute.py @@ -0,0 +1,38 @@ +# Copyright (c) Microsoft Corporation +# Licensed under the MIT License. + +"""Helper function to compute metrics.""" + +import pandas as pd + +from responsibleai_text.utils.genai_metrics.constants import _SYS_PROMPT + + +def format_str(s, **kwargs): + """Zip all the kwargs together and format the string in a loop""" + keys = list(kwargs.keys()) + lists = [kwargs[k] for k in keys] + formatted = [] + for vals in zip(*lists): + fmt_kwargs = {k: v for k, v in zip(keys, vals)} + formatted.append(s.format(**fmt_kwargs)) + return formatted + + +def _compute_metric(template, logger, wrapper_model, **kwargs): + m = [] + templated_ques = format_str(template, **kwargs) + + inp = pd.DataFrame({ + 'questions': templated_ques, + 'sys_prompt': _SYS_PROMPT}) + + responses = wrapper_model.predict(inp) + + for r in responses: + try: + m.append(int(r)) + except ValueError as e: + logger.warning('Failed to parse metric `%s`: %s', r, e) + m.append(0) + return {'scores': m} diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py index 7bf8d4a2fe..5623b70bec 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/coherence.py @@ -3,15 +3,28 @@ """Coherence metric.""" -import datasets -import evaluate -import pandas as pd +import logging -logger = evaluate.logging.get_logger(__name__) +from responsibleai_text.utils.genai_metrics.constants import _CITATION +from responsibleai_text.utils.genai_metrics.scripts._compute import \ + _compute_metric +module_logger = logging.getLogger(__name__) +module_logger.setLevel(logging.INFO) -_CITATION = """ -""" +try: + import evaluate +except ImportError: + module_logger.debug( + 'Could not import evaluate, required if using a genai model') + +try: + import datasets +except ImportError: + module_logger.debug( + 'Could not import datasets, required if using a genai model') + +logger = evaluate.logging.get_logger(__name__) _DESCRIPTION = """The coherence metric. """ @@ -20,15 +33,6 @@ **SOME DESCRIPTION** """ -_SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation \ -metric for assessing the quality of an answer in a question-answering task. \ -Your job is to compute an accurate evaluation score using the provided \ -evaluation metric. -Your response will be used in automated evaluation of question-answering \ -systems, and must be an integer between 1 and 5, and nothing else. 
-""".strip() - _TEMPLATE = """ Coherence of an answer is measured by how well all the sentences fit together \ and sound naturally as a whole. Consider the overall quality of the answer \ @@ -66,38 +70,18 @@ _DESCRIPTION, _KWARGS_DESCRIPTION) class Coherence(evaluate.Metric): def _info(self): - return evaluate.MetricInfo( description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence") - } - ), - ) + features=datasets.Features({ + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence")})) def _compute(self, *, predictions=None, references=None, **kwargs): - m = [] - templated_ques = [] - - for p, r in zip(predictions, references): - templated_ques.append(_TEMPLATE.format(question=r, prediction=p)) - - model = kwargs['wrapper_model'] - - inp = pd.DataFrame({ - 'questions': templated_ques, - 'sys_prompt': _SYS_PROMPT}) - - responses = model.predict(inp) - - for r in responses: - try: - m.append(int(r)) - except ValueError as e: - logger.warning('Failed to parse metric `%s`: %s', r, e) - m.append(0) - return {'scores': m} + return _compute_metric( + _TEMPLATE, + logger, + kwargs['wrapper_model'], + prediction=predictions, + question=references) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py index e3a894870f..b2532e4bad 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py @@ -3,15 +3,28 @@ """Equivalence metric.""" -import datasets -import evaluate -import pandas as pd +import logging -logger = evaluate.logging.get_logger(__name__) +from responsibleai_text.utils.genai_metrics.constants import _CITATION +from responsibleai_text.utils.genai_metrics.scripts._compute import \ + _compute_metric +module_logger = logging.getLogger(__name__) +module_logger.setLevel(logging.INFO) -_CITATION = """ -""" +try: + import evaluate +except ImportError: + module_logger.debug( + 'Could not import evaluate, required if using a genai model') + +try: + import datasets +except ImportError: + module_logger.debug( + 'Could not import datasets, required if using a genai model') + +logger = evaluate.logging.get_logger(__name__) _DESCRIPTION = """The equivalence metric. """ @@ -20,15 +33,6 @@ **SOME DESCRIPTION** """ -_SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation \ -metric for assessing the quality of an answer in a question-answering task. \ -Your job is to compute an accurate evaluation score using the provided \ -evaluation metric. -Your response will be used in automated evaluation of question-answering \ -systems, and must be an integer between 1 and 5, and nothing else. -""".strip() - _TEMPLATE = """ Equivalence, as a metric, measures the similarity between the predicted \ answer and the correct answer. 
If the information and content in the \ @@ -65,36 +69,16 @@ def _info(self): description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence"), - "answers": datasets.Value("string", id="sequence") - } - ), - ) + features=datasets.Features({ + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence"), + "answers": datasets.Value("string", id="sequence")})) def _compute(self, *, predictions=None, references=None, **kwargs): - m = [] - templated_ques = [] - - answers = kwargs['answers'] - for p, r, a in zip(predictions, references, answers): - templated_ques.append(_TEMPLATE.format( - question=r, prediction=p, answer=a)) - - model = kwargs['wrapper_model'] - - inp = pd.DataFrame({ - 'questions': templated_ques, - 'sys_prompt': _SYS_PROMPT}) - - responses = model.predict(inp) - - for r in responses: - try: - m.append(int(r)) - except ValueError as e: - logger.warning('Failed to parse metric `%s`: %s', r, e) - m.append(0) - return {'scores': m} + return _compute_metric( + _TEMPLATE, + logger, + kwargs['wrapper_model'], + prediction=predictions, + question=references, + answer=kwargs['answers']) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py index 3a11233226..af38e297cb 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py @@ -3,15 +3,28 @@ """Fluency metric.""" -import datasets -import evaluate -import pandas as pd +import logging -logger = evaluate.logging.get_logger(__name__) +from responsibleai_text.utils.genai_metrics.constants import _CITATION +from responsibleai_text.utils.genai_metrics.scripts._compute import \ + _compute_metric +module_logger = logging.getLogger(__name__) +module_logger.setLevel(logging.INFO) -_CITATION = """ -""" +try: + import evaluate +except ImportError: + module_logger.debug( + 'Could not import evaluate, required if using a genai model') + +try: + import datasets +except ImportError: + module_logger.debug( + 'Could not import datasets, required if using a genai model') + +logger = evaluate.logging.get_logger(__name__) _DESCRIPTION = """The fluency metric. """ @@ -20,15 +33,6 @@ **SOME DESCRIPTION** """ -_SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation \ -metric for assessing the quality of an answer in a question-answering task. \ -Your job is to compute an accurate evaluation score using the provided \ -evaluation metric. -Your response will be used in automated evaluation of question-answering \ -systems, and must be an integer between 1 and 5, and nothing else. -""".strip() - _TEMPLATE = """ Fluency measures the quality of individual sentences in the answer, and \ whether they are well-written and grammatically correct. 
Consider the quality \ @@ -61,33 +65,14 @@ def _info(self): description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence") - } - ), - ) + features=datasets.Features({ + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence")})) def _compute(self, *, predictions=None, references=None, **kwargs): - m = [] - templated_ques = [] - - for p, r in zip(predictions, references): - templated_ques.append(_TEMPLATE.format(question=r, prediction=p)) - - model = kwargs['wrapper_model'] - - inp = pd.DataFrame({ - 'questions': templated_ques, - 'sys_prompt': _SYS_PROMPT}) - - responses = model.predict(inp) - - for r in responses: - try: - m.append(int(r)) - except ValueError as e: - logger.warning('Failed to parse metric `%s`: %s', r, e) - m.append(0) - return {'scores': m} + return _compute_metric( + _TEMPLATE, + logger, + kwargs['wrapper_model'], + prediction=predictions, + question=references) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py index 4a431f7474..01469a6bbc 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py @@ -3,15 +3,28 @@ """Groundedness metric.""" -import datasets -import evaluate -import pandas as pd +import logging -logger = evaluate.logging.get_logger(__name__) +from responsibleai_text.utils.genai_metrics.constants import _CITATION +from responsibleai_text.utils.genai_metrics.scripts._compute import \ + _compute_metric +module_logger = logging.getLogger(__name__) +module_logger.setLevel(logging.INFO) -_CITATION = """ -""" +try: + import evaluate +except ImportError: + module_logger.debug( + 'Could not import evaluate, required if using a genai model') + +try: + import datasets +except ImportError: + module_logger.debug( + 'Could not import datasets, required if using a genai model') + +logger = evaluate.logging.get_logger(__name__) _DESCRIPTION = """The groundedness metric. """ @@ -20,15 +33,6 @@ **SOME DESCRIPTION** """ -_SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation \ -metric for assessing the quality of an answer in a question-answering task. \ -Your job is to compute an accurate evaluation score using the provided \ -evaluation metric. -Your response will be used in automated evaluation of question-answering \ -systems, and must be an integer between 1 and 5, and nothing else. -""".strip() - _TEMPLATE = """ 1. 5: The ANSWER follows logically from the information contained in the \ CONTEXT. 
@@ -60,33 +64,14 @@ def _info(self): description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence") - } - ), - ) + features=datasets.Features({ + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence")})) def _compute(self, *, predictions=None, references=None, **kwargs): - m = [] - templated_ques = [] - - for p, r in zip(predictions, references): - templated_ques.append(_TEMPLATE.format(context=r, prediction=p)) - - model = kwargs['wrapper_model'] - - inp = pd.DataFrame({ - 'questions': templated_ques, - 'sys_prompt': _SYS_PROMPT}) - - responses = model.predict(inp) - - for r in responses: - try: - m.append(int(r)) - except ValueError as e: - logger.warning('Failed to parse metric `%s`: %s', r, e) - m.append(0) - return {'scores': m} + return _compute_metric( + _TEMPLATE, + logger, + kwargs['wrapper_model'], + prediction=predictions, + context=references) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py index ada75f7c06..938a4be202 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py @@ -3,15 +3,28 @@ """Relevance metric.""" -import datasets -import evaluate -import pandas as pd +import logging -logger = evaluate.logging.get_logger(__name__) +from responsibleai_text.utils.genai_metrics.constants import _CITATION +from responsibleai_text.utils.genai_metrics.scripts._compute import \ + _compute_metric +module_logger = logging.getLogger(__name__) +module_logger.setLevel(logging.INFO) -_CITATION = """ -""" +try: + import evaluate +except ImportError: + module_logger.debug( + 'Could not import evaluate, required if using a genai model') + +try: + import datasets +except ImportError: + module_logger.debug( + 'Could not import datasets, required if using a genai model') + +logger = evaluate.logging.get_logger(__name__) _DESCRIPTION = """The relevance metric. """ @@ -20,15 +33,6 @@ **SOME DESCRIPTION** """ -_SYS_PROMPT = """ -You are an AI assistant. You will be given the definition of an evaluation \ -metric for assessing the quality of an answer in a question-answering task. \ -Your job is to compute an accurate evaluation score using the provided \ -evaluation metric. -Your response will be used in automated evaluation of question-answering \ -systems, and must be an integer between 1 and 5, and nothing else. -""".strip() - _TEMPLATE = """ Relevance measures how well the answer addresses the main aspects of the \ question, based on the context. 
Consider whether all and only the important \ @@ -61,33 +65,14 @@ def _info(self): description=_DESCRIPTION, citation=_CITATION, inputs_description=_KWARGS_DESCRIPTION, - features=datasets.Features( - { - "predictions": datasets.Value("string", id="sequence"), - "references": datasets.Value("string", id="sequence") - } - ), - ) + features=datasets.Features({ + "predictions": datasets.Value("string", id="sequence"), + "references": datasets.Value("string", id="sequence")})) def _compute(self, *, predictions=None, references=None, **kwargs): - m = [] - templated_ques = [] - - for p, r in zip(predictions, references): - templated_ques.append(_TEMPLATE.format(question=r, prediction=p)) - - model = kwargs['wrapper_model'] - - inp = pd.DataFrame({ - 'questions': templated_ques, - 'sys_prompt': _SYS_PROMPT}) - - responses = model.predict(inp) - - for r in responses: - try: - m.append(int(r)) - except ValueError as e: - logger.warning('Failed to parse metric `%s`: %s', r, e) - m.append(0) - return {'scores': m} + return _compute_metric( + _TEMPLATE, + logger, + kwargs['wrapper_model'], + prediction=predictions, + question=references) From f40058fe64dc954095b239bf978c9ff28b83b580 Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Fri, 26 Jan 2024 21:47:01 -0500 Subject: [PATCH 11/12] remove unnecessary newlines Signed-off-by: Kartik Choudhary --- .../utils/genai_metrics/scripts/equivalence.py | 1 - .../responsibleai_text/utils/genai_metrics/scripts/fluency.py | 1 - .../utils/genai_metrics/scripts/groundedness.py | 1 - .../responsibleai_text/utils/genai_metrics/scripts/relevance.py | 1 - 4 files changed, 4 deletions(-) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py index b2532e4bad..9e32985407 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/equivalence.py @@ -64,7 +64,6 @@ _DESCRIPTION, _KWARGS_DESCRIPTION) class Equivalence(evaluate.Metric): def _info(self): - return evaluate.MetricInfo( description=_DESCRIPTION, citation=_CITATION, diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py index af38e297cb..5fadb1e256 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/fluency.py @@ -60,7 +60,6 @@ _DESCRIPTION, _KWARGS_DESCRIPTION) class Fluency(evaluate.Metric): def _info(self): - return evaluate.MetricInfo( description=_DESCRIPTION, citation=_CITATION, diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py index 01469a6bbc..4135ee8102 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/groundedness.py @@ -59,7 +59,6 @@ _DESCRIPTION, _KWARGS_DESCRIPTION) class Groundedness(evaluate.Metric): def _info(self): - return evaluate.MetricInfo( description=_DESCRIPTION, citation=_CITATION, diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py index 938a4be202..ca43ed0f55 100644 --- 
a/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/scripts/relevance.py @@ -60,7 +60,6 @@ _DESCRIPTION, _KWARGS_DESCRIPTION) class Relevance(evaluate.Metric): def _info(self): - return evaluate.MetricInfo( description=_DESCRIPTION, citation=_CITATION, From 9173474eaabc6d7bce45149f1290a78a9476c76d Mon Sep 17 00:00:00 2001 From: Kartik Choudhary Date: Fri, 26 Jan 2024 21:51:02 -0500 Subject: [PATCH 12/12] fix import requirement Signed-off-by: Kartik Choudhary --- .../responsibleai_text/utils/genai_metrics/metrics.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py index 0b35175d0d..e0e4934d76 100644 --- a/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py +++ b/responsibleai_text/responsibleai_text/utils/genai_metrics/metrics.py @@ -3,9 +3,17 @@ """Compute AI-assisted metrics for generative text models.""" +import logging from pathlib import Path -import evaluate +module_logger = logging.getLogger(__name__) +module_logger.setLevel(logging.INFO) + +try: + import evaluate +except ImportError: + module_logger.debug( + 'Could not import evaluate, required if using a genai model') def get_genai_metric(metric_name, **metric_kwargs):
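
[Editor's note] The shared helper these patches delegate to, responsibleai_text/utils/genai_metrics/scripts/_compute.py, is not included in this excerpt. The sketch below reconstructs what _compute_metric plausibly does, based only on the per-metric logic the patches remove above (fill the prompt template for each example, send it to the wrapper model together with the system prompt, and parse the integer scores). The exact signature and the location of _SYS_PROMPT are assumptions, not the repository's actual implementation.

# Sketch only -- reconstructed from the removed per-metric code above; not the
# actual contents of scripts/_compute.py, which this patch excerpt does not show.
import pandas as pd

# Assumed to have moved here (or to the constants module) from the individual
# metric scripts; the text is taken verbatim from the removed _SYS_PROMPT.
_SYS_PROMPT = (
    'You are an AI assistant. You will be given the definition of an '
    'evaluation metric for assessing the quality of an answer in a '
    'question-answering task. Your job is to compute an accurate evaluation '
    'score using the provided evaluation metric. Your response will be used '
    'in automated evaluation of question-answering systems, and must be an '
    'integer between 1 and 5, and nothing else.')


def _compute_metric(template, logger, wrapper_model, **kwargs):
    """Fill ``template`` per example, query the model, and parse integer scores."""
    # kwargs holds parallel sequences, e.g. prediction=[...], question=[...]
    keys = list(kwargs)
    templated_questions = [
        template.format(**dict(zip(keys, values)))
        for values in zip(*(kwargs[k] for k in keys))]
    inp = pd.DataFrame({
        'questions': templated_questions,
        'sys_prompt': _SYS_PROMPT})
    responses = wrapper_model.predict(inp)
    scores = []
    for response in responses:
        try:
            scores.append(int(response))
        except ValueError as e:
            logger.warning('Failed to parse metric `%s`: %s', response, e)
            scores.append(0)
    return {'scores': scores}

# Usage, as in the refactored Coherence._compute above:
#     _compute_metric(_TEMPLATE, logger, kwargs['wrapper_model'],
#                     prediction=predictions, question=references)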