From e91405e63f3ad9f36effa205b66bf7bb1b4b74f3 Mon Sep 17 00:00:00 2001 From: caomaosong Date: Thu, 26 Oct 2023 13:52:44 +0800 Subject: [PATCH 1/8] TabMWP --- configs/datasets/TabMWP/TabMWP_gen.py | 54 ++++++++++++++++++ opencompass/datasets/TabMWP.py | 80 +++++++++++++++++++++++++++ opencompass/datasets/__init__.py | 1 + 3 files changed, 135 insertions(+) create mode 100644 configs/datasets/TabMWP/TabMWP_gen.py create mode 100644 opencompass/datasets/TabMWP.py diff --git a/configs/datasets/TabMWP/TabMWP_gen.py b/configs/datasets/TabMWP/TabMWP_gen.py new file mode 100644 index 000000000..d19461f63 --- /dev/null +++ b/configs/datasets/TabMWP/TabMWP_gen.py @@ -0,0 +1,54 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import TabMWPDataset + +# None of the TabMWP dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://github.com/lupantech/PromptPG/tree/main + +input_format='TQ' +output_format='A' +elements = {"Q": f"Question: {{question}}", + "T": f"Table: {{table}}", + "S": f"Solution: {{solution}}", + "A": f"Answer: The answer is {{answer}}.", + "AS": f"Answer: The answer is {{answer}}. BECAUSE: {{solution}}", + "SA": f"Answer: {{solution}} The answer is {{answer}}."} + + +TabMWP_reader_cfg = dict( + input_columns=["question", "table"], + output_column="answer", #choose from ["answer","solution","answer_and_solution","solution_and_answer"] + train_split='dev', + ) + +TabMWP_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role="HUMAN", + prompt= "\n".join(elements[label] for label in input_format) + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +TabMWP_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), +) + +TabMWP_datasets = [ + dict( + type=TabMWPDataset, + path="./data/tabmwp/", + reader_cfg=TabMWP_reader_cfg, + infer_cfg=TabMWP_infer_cfg, + eval_cfg=TabMWP_eval_cfg,) +] + diff --git a/opencompass/datasets/TabMWP.py b/opencompass/datasets/TabMWP.py new file mode 100644 index 000000000..3bfb05b08 --- /dev/null +++ b/opencompass/datasets/TabMWP.py @@ -0,0 +1,80 @@ +import json +import os.path as osp + +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +def get_table_text(problem): + table = problem['table'] + title = problem['table_title'] + if title and len(title) > 0: + table = f'[TITLE]: {title}\n{table}' + return table + + +def get_question_text(problem, + option_inds=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']): + question = problem['question'] + + unit = problem['unit'] + if unit and len(unit) > 0: + question = f'{question} (Unit: {unit})' + + choices = problem['choices'] + if choices and len(choices) > 0: + choice_list = [] + for i, c in enumerate(choices): + choice_list.append('({}) {}'.format(option_inds[i], c)) + options = ' '.join(choice_list) + question = f'{question}\nOptions: {options}' + + return question + + +def get_answer(problem): + return problem['answer'] + + +def get_solution_text(problem): + # \\n: GPT-3 can generate the solution with more tokens + solution = problem['solution'].replace('\n', '\\n') + return solution + + +@LOAD_DATASET.register_module() +class TabMWPDataset(BaseDataset): + + 
@staticmethod + def load(path: str): + dataset = DatasetDict() + for split in ['dev', 'test', 'train']: + raw_data = [] + filename = osp.join(path, f'problems_{split}.json') + with open(filename, 'r', encoding='utf-8') as f: + json_data = json.load(f) + for idx in json_data: + problem = json_data[idx] + question = get_question_text(problem) + table = get_table_text(problem) + answer = get_answer(problem) + solution = get_solution_text(problem) + raw_data.append({ + 'question': + question, + 'table': + table, + 'answer': + f'Answer: The answer is {answer}.', + 'solution': + f'Solution: {solution}', + 'answer_and_solution': + f'Answer: The answer is {answer}. BECAUSE: {solution}', + 'solution_and_answer': + f'Answer: {solution} The answer is {answer}.' + }) + dataset[split] = Dataset.from_list(raw_data) + return dataset diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 0c7583c4e..8a07b3219 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -66,6 +66,7 @@ from .strategyqa import * # noqa: F401, F403 from .summedits import * # noqa: F401, F403 from .summscreen import * # noqa: F401, F403 +from .TabMWP import * # noqa: F401, F403 from .TheoremQA import * # noqa: F401, F403 from .tnews import * # noqa: F401, F403 from .triviaqa import * # noqa: F401, F403 From 5d5b2f8c00ad02d5ca9cec5861f1db6a7f02f6f6 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <1487910649@qq.com> Date: Thu, 26 Oct 2023 14:13:42 +0800 Subject: [PATCH 2/8] TabMWP --- configs/datasets/TabMWP/TabMWP_gen.py | 56 ++------------------ configs/datasets/TabMWP/TabMWP_gen_2aef96.py | 54 +++++++++++++++++++ 2 files changed, 57 insertions(+), 53 deletions(-) create mode 100644 configs/datasets/TabMWP/TabMWP_gen_2aef96.py diff --git a/configs/datasets/TabMWP/TabMWP_gen.py b/configs/datasets/TabMWP/TabMWP_gen.py index d19461f63..b0863bdba 100644 --- a/configs/datasets/TabMWP/TabMWP_gen.py +++ b/configs/datasets/TabMWP/TabMWP_gen.py @@ -1,54 +1,4 @@ -from opencompass.openicl.icl_prompt_template import PromptTemplate -from opencompass.openicl.icl_retriever import ZeroRetriever -from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator -from opencompass.datasets import TabMWPDataset - -# None of the TabMWP dataset in huggingface is correctly parsed, so we use our own dataset reader -# Please download the dataset from https://github.com/lupantech/PromptPG/tree/main - -input_format='TQ' -output_format='A' -elements = {"Q": f"Question: {{question}}", - "T": f"Table: {{table}}", - "S": f"Solution: {{solution}}", - "A": f"Answer: The answer is {{answer}}.", - "AS": f"Answer: The answer is {{answer}}. 
BECAUSE: {{solution}}", - "SA": f"Answer: {{solution}} The answer is {{answer}}."} - - -TabMWP_reader_cfg = dict( - input_columns=["question", "table"], - output_column="answer", #choose from ["answer","solution","answer_and_solution","solution_and_answer"] - train_split='dev', - ) - -TabMWP_infer_cfg = dict( - prompt_template=dict( - type=PromptTemplate, - template=dict( - round=[ - dict( - role="HUMAN", - prompt= "\n".join(elements[label] for label in input_format) - ), - ], - ), - ), - retriever=dict(type=ZeroRetriever), - inferencer=dict(type=GenInferencer), -) - -TabMWP_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), -) - -TabMWP_datasets = [ - dict( - type=TabMWPDataset, - path="./data/tabmwp/", - reader_cfg=TabMWP_reader_cfg, - infer_cfg=TabMWP_infer_cfg, - eval_cfg=TabMWP_eval_cfg,) -] +from mmengine.config import read_base +with read_base(): + from .TabMWP_gen_2aef96 import TabMWP_datasets # noqa: F401, F403 \ No newline at end of file diff --git a/configs/datasets/TabMWP/TabMWP_gen_2aef96.py b/configs/datasets/TabMWP/TabMWP_gen_2aef96.py new file mode 100644 index 000000000..d19461f63 --- /dev/null +++ b/configs/datasets/TabMWP/TabMWP_gen_2aef96.py @@ -0,0 +1,54 @@ +from opencompass.openicl.icl_prompt_template import PromptTemplate +from opencompass.openicl.icl_retriever import ZeroRetriever +from opencompass.openicl.icl_inferencer import GenInferencer +from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.datasets import TabMWPDataset + +# None of the TabMWP dataset in huggingface is correctly parsed, so we use our own dataset reader +# Please download the dataset from https://github.com/lupantech/PromptPG/tree/main + +input_format='TQ' +output_format='A' +elements = {"Q": f"Question: {{question}}", + "T": f"Table: {{table}}", + "S": f"Solution: {{solution}}", + "A": f"Answer: The answer is {{answer}}.", + "AS": f"Answer: The answer is {{answer}}. 
BECAUSE: {{solution}}", + "SA": f"Answer: {{solution}} The answer is {{answer}}."} + + +TabMWP_reader_cfg = dict( + input_columns=["question", "table"], + output_column="answer", #choose from ["answer","solution","answer_and_solution","solution_and_answer"] + train_split='dev', + ) + +TabMWP_infer_cfg = dict( + prompt_template=dict( + type=PromptTemplate, + template=dict( + round=[ + dict( + role="HUMAN", + prompt= "\n".join(elements[label] for label in input_format) + ), + ], + ), + ), + retriever=dict(type=ZeroRetriever), + inferencer=dict(type=GenInferencer), +) + +TabMWP_eval_cfg = dict( + evaluator=dict(type=AccEvaluator), +) + +TabMWP_datasets = [ + dict( + type=TabMWPDataset, + path="./data/tabmwp/", + reader_cfg=TabMWP_reader_cfg, + infer_cfg=TabMWP_infer_cfg, + eval_cfg=TabMWP_eval_cfg,) +] + From e46f74f3294f79ad4d9fdfabeb25756b2454e558 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <1487910649@qq.com> Date: Mon, 30 Oct 2023 17:26:55 +0800 Subject: [PATCH 3/8] fixed --- configs/datasets/TabMWP/TabMWP_gen_2aef96.py | 18 +-- opencompass/datasets/__init__.py | 2 +- opencompass/datasets/tabmwp.py | 94 ++++++++++++++ .../openicl/icl_evaluator/icl_hf_evaluator.py | 59 +++++++++ opencompass/utils/text_postprocessors.py | 116 ++++++++++++++++++ 5 files changed, 279 insertions(+), 10 deletions(-) create mode 100644 opencompass/datasets/tabmwp.py diff --git a/configs/datasets/TabMWP/TabMWP_gen_2aef96.py b/configs/datasets/TabMWP/TabMWP_gen_2aef96.py index d19461f63..e1a51f7f5 100644 --- a/configs/datasets/TabMWP/TabMWP_gen_2aef96.py +++ b/configs/datasets/TabMWP/TabMWP_gen_2aef96.py @@ -1,7 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import AccEvaluator +from opencompass.openicl.icl_evaluator import TabMWPEvaluator from opencompass.datasets import TabMWPDataset # None of the TabMWP dataset in huggingface is correctly parsed, so we use our own dataset reader @@ -9,17 +9,17 @@ input_format='TQ' output_format='A' -elements = {"Q": f"Question: {{question}}", - "T": f"Table: {{table}}", - "S": f"Solution: {{solution}}", - "A": f"Answer: The answer is {{answer}}.", - "AS": f"Answer: The answer is {{answer}}. BECAUSE: {{solution}}", - "SA": f"Answer: {{solution}} The answer is {{answer}}."} +elements = {"Q": "Question: {question}", + "T": "Table: {table}", + "S": "Solution: {solution}", + "A": "Answer: The answer is {answer}.", + "AS": "Answer: The answer is {answer}. 
BECAUSE: {solution}", + "SA": "Answer: {solution} The answer is {answer}."} TabMWP_reader_cfg = dict( input_columns=["question", "table"], - output_column="answer", #choose from ["answer","solution","answer_and_solution","solution_and_answer"] + output_column="test_elements", train_split='dev', ) @@ -40,7 +40,7 @@ ) TabMWP_eval_cfg = dict( - evaluator=dict(type=AccEvaluator), + evaluator=dict(type=TabMWPEvaluator) ) TabMWP_datasets = [ diff --git a/opencompass/datasets/__init__.py b/opencompass/datasets/__init__.py index 8a07b3219..867960e0c 100644 --- a/opencompass/datasets/__init__.py +++ b/opencompass/datasets/__init__.py @@ -66,7 +66,7 @@ from .strategyqa import * # noqa: F401, F403 from .summedits import * # noqa: F401, F403 from .summscreen import * # noqa: F401, F403 -from .TabMWP import * # noqa: F401, F403 +from .tabmwp import * # noqa: F401, F403 from .TheoremQA import * # noqa: F401, F403 from .tnews import * # noqa: F401, F403 from .triviaqa import * # noqa: F401, F403 diff --git a/opencompass/datasets/tabmwp.py b/opencompass/datasets/tabmwp.py new file mode 100644 index 000000000..3063b6555 --- /dev/null +++ b/opencompass/datasets/tabmwp.py @@ -0,0 +1,94 @@ +import json +import os.path as osp + +from datasets import Dataset, DatasetDict + +from opencompass.registry import LOAD_DATASET + +from .base import BaseDataset + + +def get_table_text(problem): + table = problem['table'] + title = problem['table_title'] + if title and len(title) > 0: + table = f'[TITLE]: {title}\n{table}' + return table + + +def get_question_text(problem, option_inds='ABCDEFGH'): + question = problem['question'] + + unit = problem['unit'] + if unit and len(unit) > 0: + question = f'{question} (Unit: {unit})' + + choices = problem['choices'] + if choices and len(choices) > 0: + choice_list = [] + for i, c in enumerate(choices): + choice_list.append('({}) {}'.format(option_inds[i], c)) + options = ' '.join(choice_list) + question = f'{question}\nOptions: {options}' + + return question + + +def get_answer(problem): + return problem['answer'] + + +def get_choices(problem): + return problem['choices'] + + +def get_unit(problem): + return problem['unit'] + + +def get_solution_text(problem): + # \\n: GPT-3 can generate the solution with more tokens + solution = problem['solution'].replace('\n', '\\n') + return solution + + +@LOAD_DATASET.register_module() +class TabMWPDataset(BaseDataset): + + @staticmethod + def load(path: str): + dataset = DatasetDict() + for split in ['dev', 'test', 'train']: + raw_data = [] + filename = osp.join(path, f'problems_{split}.json') + with open(filename, 'r', encoding='utf-8') as f: + json_data = json.load(f) + for idx in json_data: + problem = json_data[idx] + question = get_question_text(problem) + table = get_table_text(problem) + unit = get_unit(problem) + answer = get_answer(problem) + choices = get_choices(problem) + solution = get_solution_text(problem) + raw_data.append({ + 'question': + question, + 'table': + table, + 'test_elements': { + 'answer': answer, + 'unit': unit, + 'choices': choices + }, + 'answer': + f'Answer: The answer is {answer}.', + 'solution': + f'Solution: {solution}', + 'answer_and_solution': + f'Answer: The answer is {answer}. BECAUSE: {solution}', + 'solution_and_answer': + f'Answer: {solution} The answer is {answer}.' 
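+                    # the four fields above pre-render the gold answer and
+                    # solution in the A/S/AS/SA output formats that the
+                    # TabMWP_gen config's elements dict expects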
+ }) + dataset[split] = Dataset.from_list(raw_data) + return dataset diff --git a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py index 2a355592d..a1c626161 100644 --- a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py @@ -6,6 +6,8 @@ import numpy as np from opencompass.registry import ICL_EVALUATORS +from opencompass.utils.text_postprocessors import (extract_prediction, + normalize_answer) from .icl_base_evaluator import BaseEvaluator @@ -87,6 +89,63 @@ def score(self, predictions: List, references: List) -> dict: return result +@ICL_EVALUATORS.register_module() +class TabMWPEvaluator(HuggingfaceEvaluator): + """Accuracy evaluator.""" + + def __init__(self) -> None: + super().__init__(metric='accuracy') + + def _preprocess(self, predictions: List, references: List) -> dict: + preds, golds = [], [] + for idx in range(len(references)): + pred = predictions[idx] + unit = references[idx]['unit'] + answer = references[idx]['answer'] + choices = references[idx]['choices'] + preds.append( + normalize_answer(extract_prediction(pred, choices), + unit).lower()) + golds.append(normalize_answer(answer, unit).lower()) + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions of each sample. + references (List): List of targets for each sample. + + Returns: + dict: preprocessed results. + """ + predictions = preds + references = golds + mapping_to_int_dict = { + label: idx + for idx, label in enumerate(set(map(str, references))) + } + pred_set = set(predictions) + for pred in pred_set: + if str(pred) not in mapping_to_int_dict.keys(): + mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict) + golds = [mapping_to_int_dict[str(gold)] for gold in references] + preds = [mapping_to_int_dict[str(pred)] for pred in predictions] + return { + 'predictions': preds, + 'references': golds, + } + + def _postprocess(self, scores: dict) -> dict: + """Postprocess for final scores. + + Args: + scores (dict): Dict of calculated scores of metrics. + + Returns: + dict: postprocessed scores. + """ + scores['accuracy'] *= 100 + return scores + + @ICL_EVALUATORS.register_module() class AccEvaluator(HuggingfaceEvaluator): """Accuracy evaluator.""" diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index ec668f4d6..1eb752d3d 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -1,7 +1,123 @@ +import random import re +import numpy as np + from opencompass.registry import TEXT_POSTPROCESSORS +random.seed(123) + + +def normalize_answer(text, unit): + # ["1,000", "123", "3/4", "56.456", "$56.4", "-3", "-10.02", "-3/2"] + + text = re.sub(r'^[\$]', '', text) + text = re.sub(r'[\,\.\,\/]$', '', text) + + result = re.match(r'^[-+]?[\d,./]+$', text) + + if result is not None: + # is number? 
+ text = text.replace(',', '') + result = re.match(r'[-+]?\d+$', text) + + if result is not None: + number = int(text) + elif '/' in text: + nums = text.split('/') + number = round(float(nums[0]) / float(nums[1]), 3) + else: + number = round(float(text), 3) + number = str(number) + number = re.sub(r'\.[0]+$', '', number) + return number + else: + # is text + if unit: + text = text.replace(unit, '').strip() + return text + + +def score_string_similarity(str1, str2): + if str1 == str2: + return 2.0 + if ' ' in str1 or ' ' in str2: + str1_split = str1.split(' ') + str2_split = str2.split(' ') + overlap = list(set(str1_split) & set(str2_split)) + return len(overlap) / max(len(str1_split), len(str2_split)) + else: + if str1 == str2: + return 1.0 + else: + return 0.0 + + +def extract_prediction(output, options=None, option_inds='ABCDEFGH'): + + # $\\frac{16}{95}$ -> 16/95 + output = re.sub(r'\$?\\frac\{([\d\.\,\-]+)\}\{([\d\.\,]+)\}\$?', r'\1/\2', + output) + + output = re.sub(r'(? 0: + pred = res[0].upper() # e.g., "B" + if pred in option_inds: + ind = option_inds.index(pred) # 1 + if ind >= len(options): + ind = random.choice(range(len(options))) + prediction = options[ind] + return prediction + + # find the most similar options + scores = [score_string_similarity(x, output) for x in options] + max_idx = int( + np.argmax(scores)) # json does not recognize NumPy data types + prediction = options[max_idx] + return prediction + + else: + # free_text QA problems, numeric answer + patterns = [ + r'[Th]he answer is ([\s\S]+)$', # "The answer is XXXXX.", + r'[Th]he table shows that ([\d\$\.\,\/\:]+) ', + r' = ([\d\$\.\,\/\:]+)', # "= $1.40" + r'(?<= be| is) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "will be $1.40" + r'(?<= are| was) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "are $1.40" + r'(?<= were) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "are $1.40" + r' ([\d\$\.\,\/\:]+ [AP]\.M\.)', # 7:25 P.M. + r'([\-\d\$\.\,\/\:]{0,}[\d]+)', # 14.5 + ] + + for p in patterns: + pattern = re.compile(p) + res = pattern.findall(output) + if len(res) > 0: + prediction = res[-1].strip() + if prediction.endswith('.') and '.M.' not in prediction: + prediction = prediction[:-1] + return prediction + + return output + @TEXT_POSTPROCESSORS.register_module('general') def general_postprocess(text: str) -> str: From 3266ec313a10e6d4648615b52566cade53f30861 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <1487910649@qq.com> Date: Mon, 30 Oct 2023 17:34:31 +0800 Subject: [PATCH 4/8] fixed --- opencompass/datasets/tabmwp.py | 9 ++++++++- opencompass/utils/text_postprocessors.py | 3 +-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/opencompass/datasets/tabmwp.py b/opencompass/datasets/tabmwp.py index 3063b6555..3133a1cb3 100644 --- a/opencompass/datasets/tabmwp.py +++ b/opencompass/datasets/tabmwp.py @@ -54,7 +54,14 @@ def get_solution_text(problem): @LOAD_DATASET.register_module() class TabMWPDataset(BaseDataset): - + # The TabMWP dataset contains 38,431 tabular math word problems. + # Each question in TabMWP is aligned with a tabular context, + # which is presented as an image, semi-structured text, and a- + # structured table. There are two types of questions: free-text- + # and multi-choice, and each problem is annotated with gold- + # solutions to reveal the multi-step reasoning process. 
+ # To learn more about it, please follow: + # https://github.com/lupantech/PromptPG/tree/main @staticmethod def load(path: str): dataset = DatasetDict() diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index 1eb752d3d..125546636 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -5,8 +5,6 @@ from opencompass.registry import TEXT_POSTPROCESSORS -random.seed(123) - def normalize_answer(text, unit): # ["1,000", "123", "3/4", "56.456", "$56.4", "-3", "-10.02", "-3/2"] @@ -83,6 +81,7 @@ def extract_prediction(output, options=None, option_inds='ABCDEFGH'): if pred in option_inds: ind = option_inds.index(pred) # 1 if ind >= len(options): + random.seed(123) ind = random.choice(range(len(options))) prediction = options[ind] return prediction From 3f91af839ce352f70519f5dea13f0168c10d1427 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <1487910649@qq.com> Date: Mon, 30 Oct 2023 20:59:52 +0800 Subject: [PATCH 5/8] fixed --- configs/datasets/TabMWP/TabMWP_gen_2aef96.py | 3 +- configs/eval_demo.py | 2 +- opencompass/datasets/tabmwp.py | 177 +++++++++++++++++- .../openicl/icl_evaluator/icl_hf_evaluator.py | 59 ------ opencompass/utils/text_postprocessors.py | 115 ------------ 5 files changed, 178 insertions(+), 178 deletions(-) diff --git a/configs/datasets/TabMWP/TabMWP_gen_2aef96.py b/configs/datasets/TabMWP/TabMWP_gen_2aef96.py index e1a51f7f5..137546608 100644 --- a/configs/datasets/TabMWP/TabMWP_gen_2aef96.py +++ b/configs/datasets/TabMWP/TabMWP_gen_2aef96.py @@ -1,8 +1,7 @@ from opencompass.openicl.icl_prompt_template import PromptTemplate from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer -from opencompass.openicl.icl_evaluator import TabMWPEvaluator -from opencompass.datasets import TabMWPDataset +from opencompass.datasets import TabMWPDataset, TabMWPEvaluator # None of the TabMWP dataset in huggingface is correctly parsed, so we use our own dataset reader # Please download the dataset from https://github.com/lupantech/PromptPG/tree/main diff --git a/configs/eval_demo.py b/configs/eval_demo.py index ea5def4c9..aba44ef2b 100644 --- a/configs/eval_demo.py +++ b/configs/eval_demo.py @@ -7,4 +7,4 @@ from .models.opt.hf_opt_350m import opt350m datasets = [*siqa_datasets, *winograd_datasets] -models = [opt125m, opt350m] +models = [opt125m, opt350m] \ No newline at end of file diff --git a/opencompass/datasets/tabmwp.py b/opencompass/datasets/tabmwp.py index 3133a1cb3..e9c2f09f4 100644 --- a/opencompass/datasets/tabmwp.py +++ b/opencompass/datasets/tabmwp.py @@ -1,9 +1,15 @@ import json import os.path as osp +import random +import re +from typing import List +import numpy as np from datasets import Dataset, DatasetDict -from opencompass.registry import LOAD_DATASET +from opencompass.openicl.icl_evaluator.icl_hf_evaluator import \ + HuggingfaceEvaluator +from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET from .base import BaseDataset @@ -52,6 +58,175 @@ def get_solution_text(problem): return solution +def normalize_answer(text, unit): + # ["1,000", "123", "3/4", "56.456", "$56.4", "-3", "-10.02", "-3/2"] + + text = re.sub(r'^[\$]', '', text) + text = re.sub(r'[\,\.\,\/]$', '', text) + + result = re.match(r'^[-+]?[\d,./]+$', text) + + if result is not None: + # is number? 
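+        # Strip thousands separators, then parse as an int, a fraction
+        # ("3/4" -> 0.75), or a float rounded to 3 decimal places.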
+ text = text.replace(',', '') + result = re.match(r'[-+]?\d+$', text) + + if result is not None: + number = int(text) + elif '/' in text: + nums = text.split('/') + number = round(float(nums[0]) / float(nums[1]), 3) + else: + number = round(float(text), 3) + number = str(number) + number = re.sub(r'\.[0]+$', '', number) + return number + else: + # is text + if unit: + text = text.replace(unit, '').strip() + return text + + +def score_string_similarity(str1, str2): + if str1 == str2: + return 2.0 + if ' ' in str1 or ' ' in str2: + str1_split = str1.split(' ') + str2_split = str2.split(' ') + overlap = list(set(str1_split) & set(str2_split)) + return len(overlap) / max(len(str1_split), len(str2_split)) + else: + if str1 == str2: + return 1.0 + else: + return 0.0 + + +def extract_prediction(output, options=None, option_inds='ABCDEFGH'): + + # $\\frac{16}{95}$ -> 16/95 + output = re.sub(r'\$?\\frac\{([\d\.\,\-]+)\}\{([\d\.\,]+)\}\$?', r'\1/\2', + output) + + output = re.sub(r'(? 0: + pred = res[0].upper() # e.g., "B" + if pred in option_inds: + ind = option_inds.index(pred) # 1 + if ind >= len(options): + random.seed(123) + ind = random.choice(range(len(options))) + prediction = options[ind] + return prediction + + # find the most similar options + scores = [score_string_similarity(x, output) for x in options] + max_idx = int( + np.argmax(scores)) # json does not recognize NumPy data types + prediction = options[max_idx] + return prediction + + else: + # free_text QA problems, numeric answer + patterns = [ + r'[Th]he answer is ([\s\S]+)$', # "The answer is XXXXX.", + r'[Th]he table shows that ([\d\$\.\,\/\:]+) ', + r' = ([\d\$\.\,\/\:]+)', # "= $1.40" + r'(?<= be| is) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "will be $1.40" + r'(?<= are| was) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "are $1.40" + r'(?<= were) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "are $1.40" + r' ([\d\$\.\,\/\:]+ [AP]\.M\.)', # 7:25 P.M. + r'([\-\d\$\.\,\/\:]{0,}[\d]+)', # 14.5 + ] + + for p in patterns: + pattern = re.compile(p) + res = pattern.findall(output) + if len(res) > 0: + prediction = res[-1].strip() + if prediction.endswith('.') and '.M.' not in prediction: + prediction = prediction[:-1] + return prediction + + return output + + +@ICL_EVALUATORS.register_module() +class TabMWPEvaluator(HuggingfaceEvaluator): + """Accuracy evaluator.""" + + def __init__(self) -> None: + super().__init__(metric='accuracy') + + def _preprocess(self, predictions: List, references: List) -> dict: + preds, golds = [], [] + for idx in range(len(references)): + pred = predictions[idx] + unit = references[idx]['unit'] + answer = references[idx]['answer'] + choices = references[idx]['choices'] + preds.append( + normalize_answer(extract_prediction(pred, choices), + unit).lower()) + golds.append(normalize_answer(answer, unit).lower()) + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions of each sample. + references (List): List of targets for each sample. + + Returns: + dict: preprocessed results. 
+ """ + predictions = preds + references = golds + mapping_to_int_dict = { + label: idx + for idx, label in enumerate(set(map(str, references))) + } + pred_set = set(predictions) + for pred in pred_set: + if str(pred) not in mapping_to_int_dict.keys(): + mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict) + golds = [mapping_to_int_dict[str(gold)] for gold in references] + preds = [mapping_to_int_dict[str(pred)] for pred in predictions] + return { + 'predictions': preds, + 'references': golds, + } + + def _postprocess(self, scores: dict) -> dict: + """Postprocess for final scores. + + Args: + scores (dict): Dict of calculated scores of metrics. + + Returns: + dict: postprocessed scores. + """ + scores['accuracy'] *= 100 + return scores + + @LOAD_DATASET.register_module() class TabMWPDataset(BaseDataset): # The TabMWP dataset contains 38,431 tabular math word problems. diff --git a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py index a1c626161..2a355592d 100644 --- a/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py +++ b/opencompass/openicl/icl_evaluator/icl_hf_evaluator.py @@ -6,8 +6,6 @@ import numpy as np from opencompass.registry import ICL_EVALUATORS -from opencompass.utils.text_postprocessors import (extract_prediction, - normalize_answer) from .icl_base_evaluator import BaseEvaluator @@ -89,63 +87,6 @@ def score(self, predictions: List, references: List) -> dict: return result -@ICL_EVALUATORS.register_module() -class TabMWPEvaluator(HuggingfaceEvaluator): - """Accuracy evaluator.""" - - def __init__(self) -> None: - super().__init__(metric='accuracy') - - def _preprocess(self, predictions: List, references: List) -> dict: - preds, golds = [], [] - for idx in range(len(references)): - pred = predictions[idx] - unit = references[idx]['unit'] - answer = references[idx]['answer'] - choices = references[idx]['choices'] - preds.append( - normalize_answer(extract_prediction(pred, choices), - unit).lower()) - golds.append(normalize_answer(answer, unit).lower()) - """Preprocess the final predictions and references to needed format. - - Args: - predictions (List): List of predictions of each sample. - references (List): List of targets for each sample. - - Returns: - dict: preprocessed results. - """ - predictions = preds - references = golds - mapping_to_int_dict = { - label: idx - for idx, label in enumerate(set(map(str, references))) - } - pred_set = set(predictions) - for pred in pred_set: - if str(pred) not in mapping_to_int_dict.keys(): - mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict) - golds = [mapping_to_int_dict[str(gold)] for gold in references] - preds = [mapping_to_int_dict[str(pred)] for pred in predictions] - return { - 'predictions': preds, - 'references': golds, - } - - def _postprocess(self, scores: dict) -> dict: - """Postprocess for final scores. - - Args: - scores (dict): Dict of calculated scores of metrics. - - Returns: - dict: postprocessed scores. 
- """ - scores['accuracy'] *= 100 - return scores - - @ICL_EVALUATORS.register_module() class AccEvaluator(HuggingfaceEvaluator): """Accuracy evaluator.""" diff --git a/opencompass/utils/text_postprocessors.py b/opencompass/utils/text_postprocessors.py index 125546636..ec668f4d6 100644 --- a/opencompass/utils/text_postprocessors.py +++ b/opencompass/utils/text_postprocessors.py @@ -1,123 +1,8 @@ -import random import re -import numpy as np - from opencompass.registry import TEXT_POSTPROCESSORS -def normalize_answer(text, unit): - # ["1,000", "123", "3/4", "56.456", "$56.4", "-3", "-10.02", "-3/2"] - - text = re.sub(r'^[\$]', '', text) - text = re.sub(r'[\,\.\,\/]$', '', text) - - result = re.match(r'^[-+]?[\d,./]+$', text) - - if result is not None: - # is number? - text = text.replace(',', '') - result = re.match(r'[-+]?\d+$', text) - - if result is not None: - number = int(text) - elif '/' in text: - nums = text.split('/') - number = round(float(nums[0]) / float(nums[1]), 3) - else: - number = round(float(text), 3) - number = str(number) - number = re.sub(r'\.[0]+$', '', number) - return number - else: - # is text - if unit: - text = text.replace(unit, '').strip() - return text - - -def score_string_similarity(str1, str2): - if str1 == str2: - return 2.0 - if ' ' in str1 or ' ' in str2: - str1_split = str1.split(' ') - str2_split = str2.split(' ') - overlap = list(set(str1_split) & set(str2_split)) - return len(overlap) / max(len(str1_split), len(str2_split)) - else: - if str1 == str2: - return 1.0 - else: - return 0.0 - - -def extract_prediction(output, options=None, option_inds='ABCDEFGH'): - - # $\\frac{16}{95}$ -> 16/95 - output = re.sub(r'\$?\\frac\{([\d\.\,\-]+)\}\{([\d\.\,]+)\}\$?', r'\1/\2', - output) - - output = re.sub(r'(? 0: - pred = res[0].upper() # e.g., "B" - if pred in option_inds: - ind = option_inds.index(pred) # 1 - if ind >= len(options): - random.seed(123) - ind = random.choice(range(len(options))) - prediction = options[ind] - return prediction - - # find the most similar options - scores = [score_string_similarity(x, output) for x in options] - max_idx = int( - np.argmax(scores)) # json does not recognize NumPy data types - prediction = options[max_idx] - return prediction - - else: - # free_text QA problems, numeric answer - patterns = [ - r'[Th]he answer is ([\s\S]+)$', # "The answer is XXXXX.", - r'[Th]he table shows that ([\d\$\.\,\/\:]+) ', - r' = ([\d\$\.\,\/\:]+)', # "= $1.40" - r'(?<= be| is) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "will be $1.40" - r'(?<= are| was) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "are $1.40" - r'(?<= were) ([\-\d\$\.\,\/\:]{0,}[\d]+)', # "are $1.40" - r' ([\d\$\.\,\/\:]+ [AP]\.M\.)', # 7:25 P.M. - r'([\-\d\$\.\,\/\:]{0,}[\d]+)', # 14.5 - ] - - for p in patterns: - pattern = re.compile(p) - res = pattern.findall(output) - if len(res) > 0: - prediction = res[-1].strip() - if prediction.endswith('.') and '.M.' 
not in prediction: - prediction = prediction[:-1] - return prediction - - return output - - @TEXT_POSTPROCESSORS.register_module('general') def general_postprocess(text: str) -> str: # Cut off the first newline, period, or comma From 161fb8ec28e4c1c3d647a7e7e0a19c8f0e6c8056 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <1487910649@qq.com> Date: Tue, 31 Oct 2023 10:50:51 +0800 Subject: [PATCH 6/8] done --- configs/eval_demo.py | 2 +- opencompass/datasets/TabMWP.py | 80 ---------------------------------- 2 files changed, 1 insertion(+), 81 deletions(-) delete mode 100644 opencompass/datasets/TabMWP.py diff --git a/configs/eval_demo.py b/configs/eval_demo.py index aba44ef2b..ea5def4c9 100644 --- a/configs/eval_demo.py +++ b/configs/eval_demo.py @@ -7,4 +7,4 @@ from .models.opt.hf_opt_350m import opt350m datasets = [*siqa_datasets, *winograd_datasets] -models = [opt125m, opt350m] \ No newline at end of file +models = [opt125m, opt350m] diff --git a/opencompass/datasets/TabMWP.py b/opencompass/datasets/TabMWP.py deleted file mode 100644 index 3bfb05b08..000000000 --- a/opencompass/datasets/TabMWP.py +++ /dev/null @@ -1,80 +0,0 @@ -import json -import os.path as osp - -from datasets import Dataset, DatasetDict - -from opencompass.registry import LOAD_DATASET - -from .base import BaseDataset - - -def get_table_text(problem): - table = problem['table'] - title = problem['table_title'] - if title and len(title) > 0: - table = f'[TITLE]: {title}\n{table}' - return table - - -def get_question_text(problem, - option_inds=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H']): - question = problem['question'] - - unit = problem['unit'] - if unit and len(unit) > 0: - question = f'{question} (Unit: {unit})' - - choices = problem['choices'] - if choices and len(choices) > 0: - choice_list = [] - for i, c in enumerate(choices): - choice_list.append('({}) {}'.format(option_inds[i], c)) - options = ' '.join(choice_list) - question = f'{question}\nOptions: {options}' - - return question - - -def get_answer(problem): - return problem['answer'] - - -def get_solution_text(problem): - # \\n: GPT-3 can generate the solution with more tokens - solution = problem['solution'].replace('\n', '\\n') - return solution - - -@LOAD_DATASET.register_module() -class TabMWPDataset(BaseDataset): - - @staticmethod - def load(path: str): - dataset = DatasetDict() - for split in ['dev', 'test', 'train']: - raw_data = [] - filename = osp.join(path, f'problems_{split}.json') - with open(filename, 'r', encoding='utf-8') as f: - json_data = json.load(f) - for idx in json_data: - problem = json_data[idx] - question = get_question_text(problem) - table = get_table_text(problem) - answer = get_answer(problem) - solution = get_solution_text(problem) - raw_data.append({ - 'question': - question, - 'table': - table, - 'answer': - f'Answer: The answer is {answer}.', - 'solution': - f'Solution: {solution}', - 'answer_and_solution': - f'Answer: The answer is {answer}. BECAUSE: {solution}', - 'solution_and_answer': - f'Answer: {solution} The answer is {answer}.' 
- }) - dataset[split] = Dataset.from_list(raw_data) - return dataset From 17c326134ef9d4d0d22ffac374d56fa3ce3001b9 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <1487910649@qq.com> Date: Tue, 31 Oct 2023 15:14:46 +0800 Subject: [PATCH 7/8] done --- opencompass/datasets/tabmwp.py | 55 ++++++++-------------------------- 1 file changed, 13 insertions(+), 42 deletions(-) diff --git a/opencompass/datasets/tabmwp.py b/opencompass/datasets/tabmwp.py index e9c2f09f4..55cfc6001 100644 --- a/opencompass/datasets/tabmwp.py +++ b/opencompass/datasets/tabmwp.py @@ -7,8 +7,7 @@ import numpy as np from datasets import Dataset, DatasetDict -from opencompass.openicl.icl_evaluator.icl_hf_evaluator import \ - HuggingfaceEvaluator +from opencompass.openicl.icl_evaluator.icl_hf_evaluator import AccEvaluator from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET from .base import BaseDataset @@ -171,13 +170,19 @@ def extract_prediction(output, options=None, option_inds='ABCDEFGH'): @ICL_EVALUATORS.register_module() -class TabMWPEvaluator(HuggingfaceEvaluator): - """Accuracy evaluator.""" - - def __init__(self) -> None: - super().__init__(metric='accuracy') +class TabMWPEvaluator(AccEvaluator): + """Accuracy evaluator for TabMWP Dataset.""" def _preprocess(self, predictions: List, references: List) -> dict: + """Preprocess the final predictions and references to needed format. + + Args: + predictions (List): List of predictions of each sample. + references (List): List of targets for each sample. + + Returns: + dict: preprocessed results. + """ preds, golds = [], [] for idx in range(len(references)): pred = predictions[idx] @@ -188,43 +193,9 @@ def _preprocess(self, predictions: List, references: List) -> dict: normalize_answer(extract_prediction(pred, choices), unit).lower()) golds.append(normalize_answer(answer, unit).lower()) - """Preprocess the final predictions and references to needed format. - - Args: - predictions (List): List of predictions of each sample. - references (List): List of targets for each sample. - - Returns: - dict: preprocessed results. - """ predictions = preds references = golds - mapping_to_int_dict = { - label: idx - for idx, label in enumerate(set(map(str, references))) - } - pred_set = set(predictions) - for pred in pred_set: - if str(pred) not in mapping_to_int_dict.keys(): - mapping_to_int_dict[str(pred)] = len(mapping_to_int_dict) - golds = [mapping_to_int_dict[str(gold)] for gold in references] - preds = [mapping_to_int_dict[str(pred)] for pred in predictions] - return { - 'predictions': preds, - 'references': golds, - } - - def _postprocess(self, scores: dict) -> dict: - """Postprocess for final scores. - - Args: - scores (dict): Dict of calculated scores of metrics. - - Returns: - dict: postprocessed scores. 
- """ - scores['accuracy'] *= 100 - return scores + return super()._preprocess(preds, golds) @LOAD_DATASET.register_module() From 8a0225b95e030d3ff07c25acb9282db8842f3d23 Mon Sep 17 00:00:00 2001 From: bittersweet1999 <1487910649@qq.com> Date: Tue, 31 Oct 2023 15:33:12 +0800 Subject: [PATCH 8/8] done --- opencompass/datasets/tabmwp.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/opencompass/datasets/tabmwp.py b/opencompass/datasets/tabmwp.py index 55cfc6001..ac5952ea6 100644 --- a/opencompass/datasets/tabmwp.py +++ b/opencompass/datasets/tabmwp.py @@ -193,8 +193,6 @@ def _preprocess(self, predictions: List, references: List) -> dict: normalize_answer(extract_prediction(pred, choices), unit).lower()) golds.append(normalize_answer(answer, unit).lower()) - predictions = preds - references = golds return super()._preprocess(preds, golds)
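
As a quick end-to-end check of the merged result, the sketch below runs the
final TabMWPEvaluator over targets rebuilt from the dataset's own gold
answers. It is illustrative only and not part of the patch series: the
./data/tabmwp/ path and the dev split come from the config above, while the
eight-sample slice and the gold-answer stand-in predictions are assumptions;
the evaluate package must also be installed for the accuracy metric to load.

    from opencompass.datasets.tabmwp import TabMWPDataset, TabMWPEvaluator

    # Build the DatasetDict from a local copy of the PromptPG data release
    # (https://github.com/lupantech/PromptPG/tree/main).
    data = TabMWPDataset.load('./data/tabmwp/')
    dev = data['dev']

    # The loader packs the gold answer, unit, and choices of each problem
    # into the structured 'test_elements' column that the evaluator reads.
    references = [dev[i]['test_elements'] for i in range(8)]

    # Stand-in outputs in the "Answer: The answer is X." format that the
    # TQ->A prompt elicits, cloned here from the gold 'answer' column so
    # that free-text answers round-trip through extraction/normalization.
    predictions = [dev[i]['answer'] for i in range(8)]

    # TabMWPEvaluator pulls a prediction out of each output string,
    # normalizes it against the unit and choices, then defers to
    # AccEvaluator for the accuracy computation.
    print(TabMWPEvaluator().score(predictions, references))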