[Feature] Add circular eval #610

Merged · 6 commits · Nov 23, 2023
55 changes: 55 additions & 0 deletions configs/datasets/commonsenseqa/commonsenseqa_gen_1da2d0.py
@@ -0,0 +1,55 @@
# Use FixKRetriever to avoid the hang caused by HuggingFace
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import commonsenseqaDataset
from opencompass.utils.text_postprocessors import first_capital_postprocess

commonsenseqa_reader_cfg = dict(
    input_columns=["question", "A", "B", "C", "D", "E"],
    output_column="answerKey",
    test_split="validation")

_ice_template = dict(
    type=PromptTemplate,
    template=dict(
        begin="</E>",
        round=[
            dict(
                role="HUMAN",
                prompt=
                "{question}\nA. {A}\nB. {B}\nC. {C}\nD. {D}\nE. {E}\nAnswer:",
            ),
            dict(
                role="BOT",
                prompt="{answerKey}",
            ),
        ],
    ),
    ice_token="</E>",
)

commonsenseqa_infer_cfg = dict(
    ice_template=_ice_template,
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 1, 2, 3, 4, 5, 6, 7]),
    inferencer=dict(type=GenInferencer),
)

commonsenseqa_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_postprocessor=dict(type=first_capital_postprocess),
)

commonsenseqa_datasets = [
    dict(
        abbr='commonsense_qa',
        type=commonsenseqaDataset,
        path='./data/commonsenseqa',
        reader_cfg=commonsenseqa_reader_cfg,
        infer_cfg=commonsenseqa_infer_cfg,
        eval_cfg=commonsenseqa_eval_cfg,
    )
]

del _ice_template
91 changes: 91 additions & 0 deletions configs/eval_circular.py
@@ -0,0 +1,91 @@
from mmengine.config import read_base
from opencompass.datasets.circular import (
    CircularCEvalDataset, CircularMMLUDataset, CircularCMMLUDataset,
    CircularCSQADataset, CircularARCDataset, CircularHSWAGDataset,
    CircularOBQADataset, CircularRaceDataset, CircularEvaluator)
from opencompass.summarizers import CircularSummarizer

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from .datasets.mmlu.mmlu_gen_a484b3 import mmlu_datasets
    from .datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from .datasets.hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
    from .datasets.ARC_e.ARC_e_gen_1e0de5 import ARC_e_datasets
    from .datasets.ARC_c.ARC_c_gen_1e0de5 import ARC_c_datasets
    from .datasets.commonsenseqa.commonsenseqa_gen_1da2d0 import commonsenseqa_datasets
    from .datasets.obqa.obqa_gen_9069e4 import obqa_datasets
    from .datasets.race.race_gen_69ee4f import race_datasets

    from .models.hf_internlm.hf_internlm_chat_7b import models as hf_internlm_chat_7b_model
    from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b_model
    from .models.qwen.hf_qwen_7b_chat import models as hf_qwen_7b_chat_model
    from .models.qwen.hf_qwen_14b_chat import models as hf_qwen_14b_chat_model

    from .summarizers.groups.mmlu import mmlu_summary_groups
    from .summarizers.groups.cmmlu import cmmlu_summary_groups
    from .summarizers.groups.ceval import ceval_summary_groups

for ds, t in [
    (ceval_datasets, CircularCEvalDataset),
    (mmlu_datasets, CircularMMLUDataset),
    (cmmlu_datasets, CircularCMMLUDataset),
    (hellaswag_datasets, CircularHSWAGDataset),
    (ARC_e_datasets, CircularARCDataset),
    (ARC_c_datasets, CircularARCDataset),
    (commonsenseqa_datasets, CircularCSQADataset),
    (obqa_datasets, CircularOBQADataset),
    (race_datasets, CircularRaceDataset),
]:
    for d in ds:
        d['type'] = t
        d['abbr'] = d['abbr'] + '-circular-4'
        d['eval_cfg']['evaluator'] = {'type': CircularEvaluator, 'circular_pattern': 'circular'}
        d['circular_patterns'] = 'circular'


datasets = sum([v for k, v in locals().items() if k.endswith("_datasets") or k == 'datasets'], [])
models = sum([v for k, v in locals().items() if k.endswith("_model")], [])

# config summarizer
other_summary_groups = [
    {'name': 'average',
     'subsets': ['ceval', 'mmlu', 'cmmlu', 'hellaswag', 'ARC-e', 'ARC-c', 'commonsense_qa', 'openbookqa_fact', 'race-middle', 'race-high']},
]
origin_summary_groups = sum([v for k, v in locals().items() if k.endswith("_summary_groups")], [])
new_summary_groups = []
for item in origin_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )
summarizer = dict(
    type=CircularSummarizer,
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs=[
        'average-circular-4',
        'ceval-circular-4',
        'mmlu-circular-4',
        'cmmlu-circular-4',
        'hellaswag-circular-4',
        'ARC-e-circular-4',
        'ARC-c-circular-4',
        'commonsense_qa-circular-4',
        'openbookqa_fact-circular-4',
        'race-middle-circular-4',
        'race-high-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
        'mmlu-humanities-circular-4',
        'mmlu-stem-circular-4',
        'mmlu-social-science-circular-4',
        'mmlu-other-circular-4',
        'cmmlu-humanities-circular-4',
        'cmmlu-stem-circular-4',
        'cmmlu-social-science-circular-4',
        'cmmlu-other-circular-4',
        'cmmlu-china-specific-circular-4',
    ],
    summary_groups=new_summary_groups,
)
113 changes: 113 additions & 0 deletions docs/en/advanced_guides/circular_eval.md
@@ -0,0 +1,113 @@
# CircularEval

## Background

For multiple-choice questions, when a large language model (LLM) picks the correct option, it does not necessarily understand the question or reason its way to the answer; it may simply have guessed. To distinguish these two cases, and to reduce the LLM's bias toward particular option positions, CircularEval can be used: a multiple-choice question is augmented by shuffling its options, and the question counts as correct under CircularEval only if the LLM answers every variant correctly.
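
For instance, with the default single-rotation pattern, a four-option question yields four variants, and the answer key moves along with its option. The following is a minimal illustrative sketch of the augmentation (not OpenCompass internals; the toy question is hypothetical):

```python
# A toy 4-option item; the correct option is 'A' (hypothetical example).
question = {'Q': '1 + 1 = ?', 'A': '2', 'B': '3', 'C': '4', 'D': '5', 'answer': 'A'}

keys = ['A', 'B', 'C', 'D']
variants = []
for shift in range(len(keys)):
    rotated = keys[shift:] + keys[:shift]      # e.g. ['B', 'C', 'D', 'A']
    item = {'Q': question['Q']}
    for new_key, old_key in zip(keys, rotated):
        item[new_key] = question[old_key]      # each option's text moves position
    # the answer key follows its option to its new position
    item['answer'] = keys[rotated.index(question['answer'])]
    variants.append(item)
# Under CircularEval the question counts as correct only if the
# model answers all 4 variants correctly.
```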

## Adding Your Own CircularEval Dataset

Generally, to evaluate a dataset with CircularEval, both its loading and its evaluation logic need to be overridden, which requires changes to both the OpenCompass main library and the configuration files. We use C-Eval as an example below.

OpenCompass main library:

```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The dataset class being overridden
    dataset_class = CEvalDataset

    # Splits of the DatasetDict that need CircularEval. CEvalDataset loads
    # [dev, val, test]; we only need CircularEval on 'val' and 'test', not 'dev'
    default_circular_splits = ['val', 'test']

    # List of keys to be shuffled
    default_option_keys = ['A', 'B', 'C', 'D']

    # Used when the content of answer_key is one of ['A', 'B', 'C', 'D'] and denotes
    # the correct answer. This field specifies how to update the correct answer after
    # the options are shuffled. Use either this or default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the content of answer_key is not one of ['A', 'B', 'C', 'D'], a function can
    # be used to specify the correct answer after the options are shuffled. Use
    # either this or default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # 'item' is the original data item
    #     # 'circular_pattern' is a tuple giving the option order after shuffling,
    #     # e.g. ('D', 'A', 'B', 'C') means the original option A is now D, and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```

`CircularCEvalDataset` accepts a `circular_pattern` parameter with two possible values:

- `circular`: a single rotation; this is the default. ABCD is expanded to ABCD, BCDA, CDAB, DABC, 4 variants in total.
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total.
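
To make the two expansion modes concrete, here is a small sketch (illustrative only, not the library's internal code) that generates both sets of option orderings:

```python
import itertools

option_keys = ['A', 'B', 'C', 'D']

# 'circular': the 4 rotations of the original order
circular = [tuple(option_keys[i:] + option_keys[:i]) for i in range(len(option_keys))]
# -> [('A','B','C','D'), ('B','C','D','A'), ('C','D','A','B'), ('D','A','B','C')]

# 'all_possible': all 4! = 24 permutations
all_possible = list(itertools.permutations(option_keys))
```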

Additionally, we provide a `CircularEvaluator` to replace `AccEvaluator`. This evaluator also accepts `circular_pattern`, which should be consistent with the value above. It produces the following metrics:

- `acc_{origin|circular|all_possible}`: each shuffled variant is treated as a separate question, and accuracy is computed over all variants.
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if all of its shuffled variants are answered correctly; accuracy is computed over questions.
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over questions.
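
As a concrete illustration of how these metrics relate, here is a small sketch (not the actual `CircularEvaluator` code) computing them from per-variant correctness under the `circular` pattern:

```python
# Hypothetical per-question results: one bool per shuffled variant (4 each).
results = {
    'q1': [True, True, True, False],  # 3 of 4 variants correct
    'q2': [True, True, True, True],   # all 4 variants correct
}

n_variants = sum(len(v) for v in results.values())
acc_circular = sum(map(sum, results.values())) / n_variants             # 7/8 = 0.875
perf_circular = sum(all(v) for v in results.values()) / len(results)    # 1/2 = 0.5
more_3_circular = sum(sum(v) >= 3 for v in results.values()) / len(results)  # 2/2 = 1.0
```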

OpenCompass configuration file:

```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset, CircularEvaluator

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Override the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish it from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Override the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# After the operations above, the dataset looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```

Additionally, for a better presentation of CircularEval results, consider using the following summarizer:

```python
from mmengine.config import read_base
from opencompass.summarizers import CircularSummarizer

with read_base():
    from ...summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select the metrics to display
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs=[
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```

For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py
1 change: 1 addition & 0 deletions docs/en/index.rst
@@ -65,6 +65,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md

.. _Tools:
.. toctree::
111 changes: 111 additions & 0 deletions docs/zh_cn/advanced_guides/circular_eval.md
@@ -0,0 +1,111 @@
# CircularEval

## Background

For a multiple-choice question, when an LLM gives the correct option, it does not necessarily mean it truly understood the question and reasoned its way to the answer; it may simply have guessed. To separate these two cases, and to reduce the LLM's bias toward particular option positions, we can use CircularEval. A multiple-choice question is augmented by shuffling its options; only if the LLM answers every augmented variant correctly do we consider the question answered correctly under CircularEval.
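
For example, with the default single-rotation pattern a four-option question expands to four variants, and the answer key moves with its option. A minimal illustrative sketch of the augmentation (not OpenCompass internals; the toy question is hypothetical):

```python
# A toy 4-option item; the correct option is 'A' (hypothetical example).
question = {'Q': '1 + 1 = ?', 'A': '2', 'B': '3', 'C': '4', 'D': '5', 'answer': 'A'}

keys = ['A', 'B', 'C', 'D']
variants = []
for shift in range(len(keys)):
    rotated = keys[shift:] + keys[:shift]      # e.g. ['B', 'C', 'D', 'A']
    item = {'Q': question['Q']}
    for new_key, old_key in zip(keys, rotated):
        item[new_key] = question[old_key]      # each option's text moves position
    # the answer key follows its option to its new position
    item['answer'] = keys[rotated.index(question['answer'])]
    variants.append(item)
# Under CircularEval the question counts as correct only if the
# model answers all 4 variants correctly.
```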

## Adding Your Own CircularEval Dataset

In general, to evaluate a dataset with CircularEval, both its loading and its evaluation logic need to be overridden, which requires changes to both the OpenCompass main library and the configuration files. We use C-Eval as an example below.

OpenCompass main library:

```python
from opencompass.datasets.ceval import CEvalDataset
from opencompass.datasets.circular import CircularDatasetMeta

class CircularCEvalDataset(CEvalDataset, metaclass=CircularDatasetMeta):
    # The dataset class being overridden
    dataset_class = CEvalDataset

    # If the original load method returns a DatasetDict, these are the splits that
    # need CircularEval. CEvalDataset loads [dev, val, test]; only 'val' and 'test'
    # need CircularEval, 'dev' does not
    default_circular_splits = ['val', 'test']

    # The list of keys to shuffle
    default_option_keys = ['A', 'B', 'C', 'D']

    # Used when the content of answer_key is one of ['A', 'B', 'C', 'D'] and denotes
    # the correct answer. This field specifies how to update the correct answer after
    # the options are shuffled. Use either this or default_answer_key_switch_method
    default_answer_key = 'answer'

    # If the content of answer_key is not one of ['A', 'B', 'C', 'D'], a function can
    # be used to specify the correct answer after the options are shuffled. Use
    # either this or default_answer_key
    # def default_answer_key_switch_method(item, circular_pattern):
    #     # item is the original data item
    #     # circular_pattern is a tuple giving the option order after shuffling,
    #     # e.g. ('D', 'A', 'B', 'C') means the original option A becomes D,
    #     # the original option B becomes A, and so on
    #     item['answer'] = circular_pattern['ABCD'.index(item['answer'])]
    #     return item
```

`CircularCEvalDataset` accepts a `circular_pattern` parameter with two possible values:

- `circular`: a single rotation; this is the default. ABCD is expanded to ABCD, BCDA, CDAB, DABC, 4 variants in total.
- `all_possible`: all permutations. ABCD is expanded to ABCD, ABDC, ACBD, ACDB, ADBC, ADCB, BACD, ..., 24 variants in total.
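
To make the two expansion modes concrete, here is a small sketch (illustrative only, not the library's internal code) that generates both sets of option orderings:

```python
import itertools

option_keys = ['A', 'B', 'C', 'D']

# 'circular': the 4 rotations of the original order
circular = [tuple(option_keys[i:] + option_keys[:i]) for i in range(len(option_keys))]
# -> [('A','B','C','D'), ('B','C','D','A'), ('C','D','A','B'), ('D','A','B','C')]

# 'all_possible': all 4! = 24 permutations
all_possible = list(itertools.permutations(option_keys))
```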

We also provide a `CircularEvaluator` as a replacement for `AccEvaluator`. This evaluator likewise accepts `circular_pattern`, which should be consistent with the value above. It produces the following metrics:

- `acc_{origin|circular|all_possible}`: each shuffled variant is treated as a separate question, and accuracy is computed over all variants.
- `perf_{origin|circular|all_possible}`: following the circular logic, a question counts as correct only if all of its shuffled variants are answered correctly; accuracy is computed over questions.
- `more_{num}_{origin|circular|all_possible}`: following the circular logic, a question counts as correct if at least `num` of its shuffled variants are answered correctly; accuracy is computed over questions.
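
As a concrete illustration of how these metrics relate, here is a small sketch (not the actual `CircularEvaluator` code) computing them from per-variant correctness under the `circular` pattern:

```python
# Hypothetical per-question results: one bool per shuffled variant (4 each).
results = {
    'q1': [True, True, True, False],  # 3 of 4 variants correct
    'q2': [True, True, True, True],   # all 4 variants correct
}

n_variants = sum(len(v) for v in results.values())
acc_circular = sum(map(sum, results.values())) / n_variants             # 7/8 = 0.875
perf_circular = sum(all(v) for v in results.values()) / len(results)    # 1/2 = 0.5
more_3_circular = sum(sum(v) >= 3 for v in results.values()) / len(results)  # 2/2 = 1.0
```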

OpenCompass configuration file:

```python
from mmengine.config import read_base
from opencompass.datasets.circular import CircularCEvalDataset, CircularEvaluator

with read_base():
    from .datasets.ceval.ceval_gen_5f30c7 import ceval_datasets

for d in ceval_datasets:
    # Override the load method
    d['type'] = CircularCEvalDataset
    # Rename to distinguish it from the non-circular version
    d['abbr'] = d['abbr'] + '-circular-4'
    # Override the evaluation method
    d['eval_cfg']['evaluator'] = {'type': CircularEvaluator}

# After the operations above, the dataset looks like this:
# dict(
#     type=CircularCEvalDataset,
#     path='./data/ceval/formal_ceval',  # unchanged
#     name='computer_network',  # unchanged
#     abbr='ceval-computer_network-circular-4',
#     reader_cfg=dict(...),  # unchanged
#     infer_cfg=dict(...),  # unchanged
#     eval_cfg=dict(evaluator=dict(type=CircularEvaluator), ...),
# )
```

Additionally, for a better presentation of CircularEval results, consider using the following summarizer:

```python
from mmengine.config import read_base
from opencompass.summarizers import CircularSummarizer

with read_base():
    from ...summarizers.groups.ceval import ceval_summary_groups

new_summary_groups = []
for item in ceval_summary_groups:
    new_summary_groups.append(
        {
            'name': item['name'] + '-circular-4',
            'subsets': [i + '-circular-4' for i in item['subsets']],
        }
    )

summarizer = dict(
    type=CircularSummarizer,
    # Select the metrics to display
    metric_types=['acc_origin', 'perf_circular'],
    dataset_abbrs=[
        'ceval-circular-4',
        'ceval-humanities-circular-4',
        'ceval-stem-circular-4',
        'ceval-social-science-circular-4',
        'ceval-other-circular-4',
    ],
    summary_groups=new_summary_groups,
)
```

For more complex evaluation examples, refer to this sample code: https://github.com/open-compass/opencompass/tree/main/configs/eval_circular.py
1 change: 1 addition & 0 deletions docs/zh_cn/index.rst
@@ -65,6 +65,7 @@ OpenCompass 上手路线
advanced_guides/prompt_attack.md
advanced_guides/longeval.md
advanced_guides/subjective_evaluation.md
advanced_guides/circular_eval.md

.. _工具:
.. toctree::
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -13,6 +13,7 @@
from .ceval import * # noqa: F401, F403
from .chid import * # noqa: F401, F403
from .cibench import * # noqa: F401, F403
from .circular import * # noqa: F401, F403
from .civilcomments import * # noqa: F401, F403
from .clozeTest_maxmin import * # noqa: F401, F403
from .cluewsc import * # noqa: F401, F403