From 30432bc01a73bf326399a732b53ce0a072a25709 Mon Sep 17 00:00:00 2001 From: frankweijue Date: Fri, 27 Oct 2023 14:04:59 +0800 Subject: [PATCH 1/7] rename --- .../subjective_cmp.py} | 26 ++-- .../{subjective_infer.py => subjective.py} | 20 ++- .../advanced_guides/subjective evaluation.md | 135 ++++++++++++++++++ ...{subjectivity_cmp.py => subjective_cmp.py} | 6 +- 4 files changed, 159 insertions(+), 28 deletions(-) rename configs/datasets/{subjectivity_cmp/subjectivity_cmp.py => subjective_cmp/subjective_cmp.py} (75%) rename configs/{subjective_infer.py => subjective.py} (82%) create mode 100644 docs/zh_cn/advanced_guides/subjective evaluation.md rename opencompass/datasets/{subjectivity_cmp.py => subjective_cmp.py} (98%) diff --git a/configs/datasets/subjectivity_cmp/subjectivity_cmp.py b/configs/datasets/subjective_cmp/subjective_cmp.py similarity index 75% rename from configs/datasets/subjectivity_cmp/subjectivity_cmp.py rename to configs/datasets/subjective_cmp/subjective_cmp.py index 8ec8f219f..c509be931 100644 --- a/configs/datasets/subjectivity_cmp/subjectivity_cmp.py +++ b/configs/datasets/subjective_cmp/subjective_cmp.py @@ -2,21 +2,21 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import LMEvaluator -from opencompass.datasets.subjectivity_cmp import SubjectivityCmpDataset +from opencompass.datasets.subjective_cmp import SubjectiveCmpDataset -subjectivity_reader_cfg = dict( +subjective_reader_cfg = dict( input_columns=['question', 'index', 'reference_answer', 'evaluating_guidance', 'capability', 'prompt'], output_column=None, train_split='test') -subjectivity_all_sets = [ +subjective_all_sets = [ "sub_test", ] -subjectivity_datasets = [] +subjective_datasets = [] -for _name in subjectivity_all_sets: - subjectivity_infer_cfg = dict( +for _name in subjective_all_sets: + subjective_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict(round=[ @@ -30,7 +30,7 @@ inferencer=dict(type=GenInferencer), ) - subjectivity_eval_cfg = dict( + subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, cmp_order='both', @@ -49,13 +49,13 @@ pred_role="BOT", ) - subjectivity_datasets.append( + subjective_datasets.append( dict( abbr=f"{_name}", - type=SubjectivityCmpDataset, - path="./data/subjectivity/", + type=SubjectiveCmpDataset, + path="./data/subjective/", name=_name, - reader_cfg=subjectivity_reader_cfg, - infer_cfg=subjectivity_infer_cfg, - eval_cfg=subjectivity_eval_cfg + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg )) diff --git a/configs/subjective_infer.py b/configs/subjective.py similarity index 82% rename from configs/subjective_infer.py rename to configs/subjective.py index 88d2a8282..64fab987b 100644 --- a/configs/subjective_infer.py +++ b/configs/subjective.py @@ -1,9 +1,9 @@ from mmengine.config import read_base with read_base(): - from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .datasets.subjective_cmp.subjective_cmp import subjective_datasets from .summarizers.subjective import summarizer -datasets = [*subjectivity_datasets] +datasets = [*subjective_datasets] from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner @@ -35,22 +35,20 @@ tokenizer_kwargs=dict( padding_side='left', truncation_side='left', - trust_remote_code=True, - 
revision='b1502f4f75c71499a3d566b14463edd62620ce9f'), + trust_remote_code=True), max_out_len=100, max_seq_len=2048, batch_size=8, model_kwargs=dict( trust_remote_code=True, - device_map='auto', - revision='b1502f4f75c71499a3d566b14463edd62620ce9f'), + device_map='auto'), run_cfg=dict(num_gpus=1, num_procs=1), ), dict( type=HuggingFaceCausalLM, abbr='qwen-7b-chat-hf', - path="/mnt/petrelfs/share_data/duanhaodong/Qwen-7B-Chat", - tokenizer_path='/mnt/petrelfs/share_data/duanhaodong/Qwen-7B-Chat', + path="Qwen/Qwen-7B-Chat", + tokenizer_path='Qwen/Qwen-7B-Chat', tokenizer_kwargs=dict( padding_side='left', truncation_side='left', @@ -74,16 +72,14 @@ padding_side='left', truncation_side='left', use_fast=False, - trust_remote_code=True, - revision="ed5e35564ac836710817c51e8e8d0a5d4ff03102"), + trust_remote_code=True), max_out_len=100, max_seq_len=2048, batch_size=8, meta_template=_meta_template2, model_kwargs=dict( trust_remote_code=True, - device_map='auto', - revision="ed5e35564ac836710817c51e8e8d0a5d4ff03102"), + device_map='auto'), run_cfg=dict(num_gpus=1, num_procs=1), ) ] diff --git a/docs/zh_cn/advanced_guides/subjective evaluation.md b/docs/zh_cn/advanced_guides/subjective evaluation.md new file mode 100644 index 000000000..eb4399ea0 --- /dev/null +++ b/docs/zh_cn/advanced_guides/subjective evaluation.md @@ -0,0 +1,135 @@ +# 主观评测指引 + +## 介绍 + +- 主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。 +- 为了探究模型的主观能力,我们采用了最先进的LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 +- 流行的评估方法是将模型的回答进行两两比较,以计算其胜率([Chatbot Arena](https://chat.lmsys.org/))。 +- 我们基于这一方法支持了GPT4用于模型的主观能力评估。 + +## 数据准备 + +- 将主观问题集以.xlsx格式存放在data/subjective/中。 +- 我们提供了一个基于[z-bench](https://github.com/zhenbench/z-bench)的demo测试集[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx +)。 +- 表格包括以下字段: + - 'question':问题描述 + - 'index':题目序号 + - 'reference_answer':参考答案 + - 'evaluating_guidance':评估引导 + - 'capability':题目所属的能力维度。 + +## 评测配置 +具体流程包括: +1. 模型回答的推理 +2. GPT4评估比较对 +3. 生成评测报告 + +对于 config/subjective.py,我们提供了部分注释,方便用户理解配置文件的含义。 +```python +# 导入数据集与主观评测summarizer +from mmengine.config import read_base +with read_base(): + from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .summarizers.subjective import summarizer + +datasets = [*subjectivity_datasets] + +from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI + +#导入主观评测所需partitioner与task +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + + +# 定义推理和评测所需模型配置 +# 包括chatglm2-6b,qwen-7b-chat,internlm-chat-7b,gpt4 +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict( + role="BOT", + begin="\n<|im_start|>assistant\n", + end='<|im_end|>', + generate=True), + ], ) + +... 
+ +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True) + ], + reserved_roles=[ + dict(role='SYSTEM', api_role='SYSTEM'), + ], +) + +# 定义主观评测配置 +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + mode='all', # 新参数,构建比较对时会交替构建两个 + ), + runner=dict( + type=LocalRunner, + max_num_workers=2, # 支持并行比较 + task=dict( + type=SubjectiveEvalTask, # 新 task,用来读入一对 model 的输入 + judge_cfg=dict( + abbr='GPT4', + type=OpenAI, + path='gpt-4-0613', + key='ENV', + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=2048, + batch_size=2), + )), +) +``` + +## 启动评测 +```shell +python run.py config/subjective.py -r +``` +```-r``` 参数支持复用模型推理和GPT4评估结果。 + +## 评测报告 + +评测报告会输出到output/.../summary/timestamp/report.md,包含胜率统计,对战分数与ELO。具体格式如下: +```markdown +# Subjective Analysis +A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) +A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00% +### Basic statistics (4 stats: win / tie / lose / not bad) +| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +|-------------------------------------|-------------------------------|------------------------------|-------------------------------| +| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | + + +![Capabilities Dimension Classification Result](by_capa.png) + +![Language Classification Result](by_lang.png) + + +### Model scores (base score is 0, win +3, both +1, neither -1, lose -3) +| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +|---------------------|------------------|-------------------|-----------------------| +| LANG: Overall | -8 | 0 | -8 | +| LANG: CN | -8 | 0 | -8 | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | -8 | 0 | -8 | +### Bootstrap ELO, Median of n=1000 times +| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | +|------------------|------------------|-----------------------|-------------------| +| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | +| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | + +``` \ No newline at end of file diff --git a/opencompass/datasets/subjectivity_cmp.py b/opencompass/datasets/subjective_cmp.py similarity index 98% rename from opencompass/datasets/subjectivity_cmp.py rename to opencompass/datasets/subjective_cmp.py index bc676194f..38cf7363c 100644 --- a/opencompass/datasets/subjectivity_cmp.py +++ b/opencompass/datasets/subjective_cmp.py @@ -187,20 +187,20 @@ def build_prompt(question, examples = [example1, example2, example3, example4] -subjectivity_reader_cfg = dict(input_columns=[ +subjective_reader_cfg = dict(input_columns=[ 'question', 'index', 'reference_answer', 'evaluating_guidance', 'capability' ], output_column=None, train_split='test') -subjectivity_all_sets = [ +subjective_all_sets = [ 'sub_test', ] @LOAD_DATASET.register_module() -class SubjectivityCmpDataset(BaseDataset): +class SubjectiveCmpDataset(BaseDataset): @staticmethod def load(path: str, name: str): From 3f9121a420c06c11a7ae5bea7ee6068daa94040a Mon Sep 17 00:00:00 2001 From: 
frankweijue Date: Fri, 27 Oct 2023 14:17:27 +0800 Subject: [PATCH 2/7] add en subdoc --- .../advanced_guides/subjective evaluation.md | 139 ++++++++++++++++++ .../advanced_guides/subjective evaluation.md | 29 ++-- 2 files changed, 156 insertions(+), 12 deletions(-) create mode 100644 docs/en/advanced_guides/subjective evaluation.md diff --git a/docs/en/advanced_guides/subjective evaluation.md b/docs/en/advanced_guides/subjective evaluation.md new file mode 100644 index 000000000..b09fd08ed --- /dev/null +++ b/docs/en/advanced_guides/subjective evaluation.md @@ -0,0 +1,139 @@ +# Subjective Evaluation Guide + +## Introduction + +Subjective evaluation aims to assess the model's performance in tasks that align with human preferences. The key criterion for this evaluation is human preference, but it comes with a high cost of annotation. + +To explore the model's subjective capabilities, we employ state-of-the-art LLM (GPT-4) as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)). + +A popular evaluation method involves comparing model responses pairwise to calculate their win rate ([Chatbot Arena](https://chat.lmsys.org/)). + +We support the use of GPT-4 for the subjective evaluation of models based on this method. + +## Data Preparation + +We provide a demo test set [subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx) based on [z-bench](https://github.com/zhenbench/z-bench). + +Store the set of subjective questions in .xlsx format in the data/subjective/directory. + +The table includes the following fields: +- 'question': Question description +- 'index': Question number +- 'reference_answer': Reference answer +- 'evaluating_guidance': Evaluation guidance +- 'capability': The capability dimension of the question. + +## Evaluation Configuration +The specific process includes: +1. Model response reasoning +2. GPT-4 evaluation comparisons +3. Generating evaluation reports + +For config/subjective.py, we provide some annotations to help users understand the configuration file's meaning. +```python +# Import datasets and subjective evaluation summarizer +from mmengine.config import read_base +with read_base(): + from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .summarizers.subjective import summarizer + +datasets = [*subjectivity_datasets] + +from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI + +# Import partitioner and task required for subjective evaluation +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + + +# Define model configurations for inference and evaluation +# Including chatglm2-6b, qwen-7b-chat, internlm-chat-7b and gpt4 +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict( + role="BOT", + begin="\n<|im_start|>assistant\n", + end='<|im_end|>', + generate=True), + ], ) + +... 
+ +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True) + ], + reserved_roles=[ + dict(role='SYSTEM', api_role='SYSTEM'), + ], +) + +# Define the configuration for subjective evaluation +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + mode='all', # alternately constructs two for comparisons + ), + runner=dict( + type=LocalRunner, + max_num_workers=2, # Supports parallel comparisons + task=dict( + type=SubjectiveEvalTask, # Used to read inputs for a pair of models + judge_cfg=dict( + abbr='GPT4', + type=OpenAI, + path='gpt-4-0613', + key='ENV', + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=2048, + batch_size=2), + )), +) +``` + +## Launching the Evaluation +```shell +python run.py config/subjective.py -r +``` +The ```-r``` parameter allows the reuse of model inference and GPT-4 evaluation results. + +## Evaluation Report + +The evaluation report will be output to output/.../summary/timestamp/report.md, which includes win rate statistics, battle scores, and ELO ratings. The specific format is as follows: +```markdown +# Subjective Analysis +A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) +A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00% +### Basic statistics (4 stats: win / tie / lose / not bad) +| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +|-------------------------------------|-------------------------------|------------------------------|-------------------------------| +| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | + + +![Capabilities Dimension Classification Result](by_capa.png) + +![Language Classification Result](by_lang.png) + + +### Model scores (base score is 0, win +3, both +1, neither -1, lose -3) +| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +|---------------------|------------------|-------------------|-----------------------| +| LANG: Overall | -8 | 0 | -8 | +| LANG: CN | -8 | 0 | -8 | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | -8 | 0 | -8 | +### Bootstrap ELO, Median of n=1000 times +| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | +|------------------|------------------|-----------------------|-------------------| +| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | +| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | + +``` \ No newline at end of file diff --git a/docs/zh_cn/advanced_guides/subjective evaluation.md b/docs/zh_cn/advanced_guides/subjective evaluation.md index eb4399ea0..8e3f82a00 100644 --- a/docs/zh_cn/advanced_guides/subjective evaluation.md +++ b/docs/zh_cn/advanced_guides/subjective evaluation.md @@ -2,22 +2,27 @@ ## 介绍 -- 主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。 -- 为了探究模型的主观能力,我们采用了最先进的LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 -- 流行的评估方法是将模型的回答进行两两比较,以计算其胜率([Chatbot Arena](https://chat.lmsys.org/))。 -- 我们基于这一方法支持了GPT4用于模型的主观能力评估。 +主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。 + 
+为了探究模型的主观能力,我们采用了最先进的LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 + +流行的评估方法是将模型的回答进行两两比较,以计算其胜率([Chatbot Arena](https://chat.lmsys.org/))。 + +我们基于这一方法支持了GPT4用于模型的主观能力评估。 ## 数据准备 -- 将主观问题集以.xlsx格式存放在data/subjective/中。 -- 我们提供了一个基于[z-bench](https://github.com/zhenbench/z-bench)的demo测试集[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx +我们提供了一个基于[z-bench](https://github.com/zhenbench/z-bench)的demo测试集:[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx )。 -- 表格包括以下字段: - - 'question':问题描述 - - 'index':题目序号 - - 'reference_answer':参考答案 - - 'evaluating_guidance':评估引导 - - 'capability':题目所属的能力维度。 + +将主观问题集以.xlsx格式存放在data/subjective/中。 + +表格包括以下字段: +- 'question':问题描述 +- 'index':题目序号 +- 'reference_answer':参考答案 +- 'evaluating_guidance':评估引导 +- 'capability':题目所属的能力维度。 ## 评测配置 具体流程包括: From 633be69d399f3e00b53aed9c86941c6fa420bc4c Mon Sep 17 00:00:00 2001 From: frankweijue Date: Fri, 27 Oct 2023 14:30:15 +0800 Subject: [PATCH 3/7] fix name --- configs/datasets/subjective_cmp/subjective_cmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/datasets/subjective_cmp/subjective_cmp.py b/configs/datasets/subjective_cmp/subjective_cmp.py index c509be931..3c81f41cb 100644 --- a/configs/datasets/subjective_cmp/subjective_cmp.py +++ b/configs/datasets/subjective_cmp/subjective_cmp.py @@ -10,7 +10,7 @@ train_split='test') subjective_all_sets = [ - "sub_test", + "subjective_demo", ] subjective_datasets = [] From c0278964ce37ee163e38eaff424911d40113656e Mon Sep 17 00:00:00 2001 From: frankweijue Date: Fri, 27 Oct 2023 15:47:01 +0800 Subject: [PATCH 4/7] fix writing --- .../advanced_guides/subjective evaluation.md | 37 +++++++------- .../advanced_guides/subjective evaluation.md | 49 +++++++++---------- 2 files changed, 41 insertions(+), 45 deletions(-) diff --git a/docs/en/advanced_guides/subjective evaluation.md b/docs/en/advanced_guides/subjective evaluation.md index b09fd08ed..d9ca515af 100644 --- a/docs/en/advanced_guides/subjective evaluation.md +++ b/docs/en/advanced_guides/subjective evaluation.md @@ -14,7 +14,7 @@ We support the use of GPT-4 for the subjective evaluation of models based on thi We provide a demo test set [subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx) based on [z-bench](https://github.com/zhenbench/z-bench). -Store the set of subjective questions in .xlsx format in the data/subjective/directory. +Store the set of subjective questions in .xlsx format in the `data/subjective/directory`. The table includes the following fields: - 'question': Question description @@ -29,15 +29,15 @@ The specific process includes: 2. GPT-4 evaluation comparisons 3. Generating evaluation reports -For config/subjective.py, we provide some annotations to help users understand the configuration file's meaning. +For `config/subjective.py`, we provide some annotations to help users understand the configuration file's meaning. 
```python # Import datasets and subjective evaluation summarizer from mmengine.config import read_base with read_base(): - from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .datasets.subjective_cmp.subjective_cmp import subjective_datasets from .summarizers.subjective import summarizer -datasets = [*subjectivity_datasets] +datasets = [*subjective_datasets] from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI @@ -48,18 +48,8 @@ from opencompass.tasks.subjective_eval import SubjectiveEvalTask # Define model configurations for inference and evaluation -# Including chatglm2-6b, qwen-7b-chat, internlm-chat-7b and gpt4 -_meta_template = dict( - round=[ - dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict( - role="BOT", - begin="\n<|im_start|>assistant\n", - end='<|im_end|>', - generate=True), - ], ) - -... +# Including the inference models chatglm2-6b, qwen-7b-chat, internlm-chat-7b, and the evaluation model gpt4 +models = [...] api_meta_template = dict( round=[ @@ -100,11 +90,11 @@ eval = dict( ```shell python run.py config/subjective.py -r ``` -The ```-r``` parameter allows the reuse of model inference and GPT-4 evaluation results. +The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results. ## Evaluation Report -The evaluation report will be output to output/.../summary/timestamp/report.md, which includes win rate statistics, battle scores, and ELO ratings. The specific format is as follows: +The evaluation report will be output to `output/.../summary/timestamp/report.md`, which includes win rate statistics, battle scores, and ELO ratings. The specific format is as follows: ```markdown # Subjective Analysis A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) @@ -136,4 +126,13 @@ A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 r | elo_score [Mean] | 999.504 | 999.912 | 1000.26 | | elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | -``` \ No newline at end of file +``` +For comparing the evaluation of models A and B, there are four choices: +1. A is better than B. +2. A and B are equally good. +3. A is worse than B. +4. Neither A nor B is good. + +So, `win` / `tie` / `lose` / `not bad` represent the proportions of the model winning / tying / losing / winning or being equally good, respectively. + +`Bootstrap ELO` is calculated as the median ELO score by comparing match results through 1000 random permutations. 
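As an illustration of the statistics described above (the win / tie / lose / not-bad counts, the +3 / +1 / −1 / −3 model scores, and the Bootstrap ELO reported as the median over 1000 shuffles), a minimal, self-contained Python sketch is given below. It is not the code added by this patch: the toy judgment records, the choice to treat "neither is good" as an Elo tie, and the K-factor of 32 are assumptions made only for the example.

```python
import random
import statistics
from collections import Counter

# Toy judgment records: one entry per pairwise comparison extracted from the
# judge model's reply. "outcome" is for model_a vs. model_b and is one of
# "win", "tie" (both good), "both_bad" (neither good), "lose".
judgements = [
    {"model_a": "chatglm2-6b-hf", "model_b": "qwen-7b-chat-hf", "outcome": "lose"},
    {"model_a": "chatglm2-6b-hf", "model_b": "internlm-chat-7b-hf", "outcome": "tie"},
    {"model_a": "qwen-7b-chat-hf", "model_b": "internlm-chat-7b-hf", "outcome": "win"},
]

# Model scores as in the report: base 0, win +3, both good +1, neither good -1, lose -3.
SCORE = {"win": (3, -3), "tie": (1, 1), "both_bad": (-1, -1), "lose": (-3, 3)}

scores = Counter()
for j in judgements:
    sa, sb = SCORE[j["outcome"]]
    scores[j["model_a"]] += sa
    scores[j["model_b"]] += sb

def run_elo(matches, k=32, base=1000.0):
    """Sequential Elo over (model_a, model_b, result) triples, result in {1, 0.5, 0}."""
    rating = {}
    for a, b, result in matches:
        ra, rb = rating.get(a, base), rating.get(b, base)
        ea = 1.0 / (1.0 + 10 ** ((rb - ra) / 400.0))          # expected score of a
        rating[a] = ra + k * (result - ea)
        rating[b] = rb + k * ((1.0 - result) - (1.0 - ea))     # zero-sum update for b
    return rating

# Elo only distinguishes win / tie / lose; "neither is good" is mapped to a tie here
# (an assumption for this sketch).
RESULT = {"win": 1.0, "tie": 0.5, "both_bad": 0.5, "lose": 0.0}
matches = [(j["model_a"], j["model_b"], RESULT[j["outcome"]]) for j in judgements]

# Bootstrap: sequential Elo depends on match order, so shuffle the matches n times
# and report the median (and spread) of the resulting ratings per model.
n = 1000
samples = {m: [] for m in scores}
for _ in range(n):
    ratings = run_elo(random.sample(matches, len(matches)))
    for m in samples:
        samples[m].append(ratings.get(m, 1000.0))

for m in sorted(samples):
    print(m, scores[m], round(statistics.median(samples[m]), 3))
```

With only a few dozen comparisons, as in the demo report above, the bootstrap medians stay close to the base rating of 1000, which is consistent with the numbers shown in the ELO table.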
diff --git a/docs/zh_cn/advanced_guides/subjective evaluation.md b/docs/zh_cn/advanced_guides/subjective evaluation.md index 8e3f82a00..7f3ea5953 100644 --- a/docs/zh_cn/advanced_guides/subjective evaluation.md +++ b/docs/zh_cn/advanced_guides/subjective evaluation.md @@ -4,18 +4,17 @@ 主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。 -为了探究模型的主观能力,我们采用了最先进的LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 +为了探究模型的主观能力,我们采用了最先进的 LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 流行的评估方法是将模型的回答进行两两比较,以计算其胜率([Chatbot Arena](https://chat.lmsys.org/))。 -我们基于这一方法支持了GPT4用于模型的主观能力评估。 +我们基于这一方法支持了 GPT4 用于模型的主观能力评估。 ## 数据准备 -我们提供了一个基于[z-bench](https://github.com/zhenbench/z-bench)的demo测试集:[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx -)。 +我们提供了一个基于 [z-bench](https://github.com/zhenbench/z-bench) 的 demo 测试集:[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx)。 -将主观问题集以.xlsx格式存放在data/subjective/中。 +将主观问题集以.xlsx格式存放在 `data/subjective/` 中。 表格包括以下字段: - 'question':问题描述 @@ -27,40 +26,30 @@ ## 评测配置 具体流程包括: 1. 模型回答的推理 -2. GPT4评估比较对 +2. GPT4 评估比较对 3. 生成评测报告 -对于 config/subjective.py,我们提供了部分注释,方便用户理解配置文件的含义。 +对于 `config/subjective.py`,我们提供了部分注释,方便用户理解配置文件的含义。 ```python -# 导入数据集与主观评测summarizer +# 导入数据集与主观评测 summarizer from mmengine.config import read_base with read_base(): - from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .datasets.subjective_cmp.subjective_cmp import subjective_datasets from .summarizers.subjective import summarizer -datasets = [*subjectivity_datasets] +datasets = [*subjective_datasets] from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI -#导入主观评测所需partitioner与task +#导入主观评测所需 partitioner 与 task from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.runners import LocalRunner from opencompass.tasks.subjective_eval import SubjectiveEvalTask # 定义推理和评测所需模型配置 -# 包括chatglm2-6b,qwen-7b-chat,internlm-chat-7b,gpt4 -_meta_template = dict( - round=[ - dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict( - role="BOT", - begin="\n<|im_start|>assistant\n", - end='<|im_end|>', - generate=True), - ], ) - -... +# 包括推理模型 chatglm2-6b,qwen-7b-chat,internlm-chat-7b 和 评测模型 gpt4 +models = [...] api_meta_template = dict( round=[ @@ -101,11 +90,11 @@ eval = dict( ```shell python run.py config/subjective.py -r ``` -```-r``` 参数支持复用模型推理和GPT4评估结果。 +`-r` 参数支持复用模型推理和 GPT4 评估结果。 ## 评测报告 -评测报告会输出到output/.../summary/timestamp/report.md,包含胜率统计,对战分数与ELO。具体格式如下: +评测报告会输出到 `output/.../summary/timestamp/report.md` ,包含胜率统计,对战分数与ELO。具体格式如下: ```markdown # Subjective Analysis A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) @@ -137,4 +126,12 @@ A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 r | elo_score [Mean] | 999.504 | 999.912 | 1000.26 | | elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | -``` \ No newline at end of file +``` +对于评估模型 A 和 B的比较对,有四种选择: +1. A 比 B 好 +2. A 和 B 一样好 +3. A 比 B 差 +4. 
A 和 B 都不好 + +故 `win` / `tie` / `lose` / `not bad` 分别指模型 胜 / 平局 / 负 / 胜或一样好 的比例 。 +`Bootstrap ELO` 是通过对比赛结果进行1000次随机顺序,计算出 ELO 分数的中位数。 \ No newline at end of file From a35a75f75af3866ca17219d93999bfca6ac30a07 Mon Sep 17 00:00:00 2001 From: Leymore Date: Fri, 27 Oct 2023 16:12:45 +0800 Subject: [PATCH 5/7] update --- ...evaluation.md => subjective_evaluation.md} | 48 ++++++++++------ docs/en/index.rst | 1 + ...evaluation.md => subjective_evaluation.md} | 56 +++++++++++-------- docs/zh_cn/index.rst | 1 + 4 files changed, 66 insertions(+), 40 deletions(-) rename docs/en/advanced_guides/{subjective evaluation.md => subjective_evaluation.md} (72%) rename docs/zh_cn/advanced_guides/{subjective evaluation.md => subjective_evaluation.md} (65%) diff --git a/docs/en/advanced_guides/subjective evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md similarity index 72% rename from docs/en/advanced_guides/subjective evaluation.md rename to docs/en/advanced_guides/subjective_evaluation.md index d9ca515af..1ac2da643 100644 --- a/docs/en/advanced_guides/subjective evaluation.md +++ b/docs/en/advanced_guides/subjective_evaluation.md @@ -17,6 +17,7 @@ We provide a demo test set [subjective_demo.xlsx](https://opencompass.openxlab.s Store the set of subjective questions in .xlsx format in the `data/subjective/directory`. The table includes the following fields: + - 'question': Question description - 'index': Question number - 'reference_answer': Reference answer @@ -24,12 +25,15 @@ The table includes the following fields: - 'capability': The capability dimension of the question. ## Evaluation Configuration + The specific process includes: + 1. Model response reasoning 2. GPT-4 evaluation comparisons 3. Generating evaluation reports For `config/subjective.py`, we provide some annotations to help users understand the configuration file's meaning. + ```python # Import datasets and subjective evaluation summarizer from mmengine.config import read_base @@ -87,47 +91,55 @@ eval = dict( ``` ## Launching the Evaluation + ```shell python run.py config/subjective.py -r ``` + The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results. ## Evaluation Report The evaluation report will be output to `output/.../summary/timestamp/report.md`, which includes win rate statistics, battle scores, and ELO ratings. 
The specific format is as follows: + ```markdown # Subjective Analysis + A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00% + ### Basic statistics (4 stats: win / tie / lose / not bad) -| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | -|-------------------------------------|-------------------------------|------------------------------|-------------------------------| -| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | -| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | -| LANG: EN | N/A | N/A | N/A | -| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +| --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- | +| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | ![Capabilities Dimension Classification Result](by_capa.png) ![Language Classification Result](by_lang.png) - ### Model scores (base score is 0, win +3, both +1, neither -1, lose -3) -| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | -|---------------------|------------------|-------------------|-----------------------| -| LANG: Overall | -8 | 0 | -8 | -| LANG: CN | -8 | 0 | -8 | -| LANG: EN | N/A | N/A | N/A | -| CAPA: common | -8 | 0 | -8 | -### Bootstrap ELO, Median of n=1000 times -| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | -|------------------|------------------|-----------------------|-------------------| -| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | -| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | +| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +| ----------------- | -------------- | --------------- | ------------------- | +| LANG: Overall | -8 | 0 | -8 | +| LANG: CN | -8 | 0 | -8 | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | -8 | 0 | -8 | + +### Bootstrap ELO, Median of n=1000 times + +| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | +| ---------------- | -------------- | ------------------- | --------------- | +| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | +| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | ``` + For comparing the evaluation of models A and B, there are four choices: + 1. A is better than B. 2. A and B are equally good. 3. A is worse than B. diff --git a/docs/en/index.rst b/docs/en/index.rst index d14c8308a..97e09e63f 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -64,6 +64,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. advanced_guides/multimodal_eval.md advanced_guides/prompt_attack.md advanced_guides/longeval.md + advanced_guides/subjective_evaluation.md .. _Tools: .. 
toctree:: diff --git a/docs/zh_cn/advanced_guides/subjective evaluation.md b/docs/zh_cn/advanced_guides/subjective_evaluation.md similarity index 65% rename from docs/zh_cn/advanced_guides/subjective evaluation.md rename to docs/zh_cn/advanced_guides/subjective_evaluation.md index 7f3ea5953..dcb32055f 100644 --- a/docs/zh_cn/advanced_guides/subjective evaluation.md +++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md @@ -14,9 +14,10 @@ 我们提供了一个基于 [z-bench](https://github.com/zhenbench/z-bench) 的 demo 测试集:[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx)。 -将主观问题集以.xlsx格式存放在 `data/subjective/` 中。 +将主观问题集以.xlsx 格式存放在 `data/subjective/` 中。 表格包括以下字段: + - 'question':问题描述 - 'index':题目序号 - 'reference_answer':参考答案 @@ -24,12 +25,15 @@ - 'capability':题目所属的能力维度。 ## 评测配置 + 具体流程包括: + 1. 模型回答的推理 2. GPT4 评估比较对 3. 生成评测报告 对于 `config/subjective.py`,我们提供了部分注释,方便用户理解配置文件的含义。 + ```python # 导入数据集与主观评测 summarizer from mmengine.config import read_base @@ -87,51 +91,59 @@ eval = dict( ``` ## 启动评测 + ```shell python run.py config/subjective.py -r ``` + `-r` 参数支持复用模型推理和 GPT4 评估结果。 ## 评测报告 -评测报告会输出到 `output/.../summary/timestamp/report.md` ,包含胜率统计,对战分数与ELO。具体格式如下: +评测报告会输出到 `output/.../summary/timestamp/report.md` ,包含胜率统计,对战分数与 ELO。具体格式如下: + ```markdown # Subjective Analysis + A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00% + ### Basic statistics (4 stats: win / tie / lose / not bad) -| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | -|-------------------------------------|-------------------------------|------------------------------|-------------------------------| -| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | -| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | -| LANG: EN | N/A | N/A | N/A | -| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +| --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- | +| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | ![Capabilities Dimension Classification Result](by_capa.png) ![Language Classification Result](by_lang.png) - ### Model scores (base score is 0, win +3, both +1, neither -1, lose -3) -| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | -|---------------------|------------------|-------------------|-----------------------| -| LANG: Overall | -8 | 0 | -8 | -| LANG: CN | -8 | 0 | -8 | -| LANG: EN | N/A | N/A | N/A | -| CAPA: common | -8 | 0 | -8 | -### Bootstrap ELO, Median of n=1000 times -| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | -|------------------|------------------|-----------------------|-------------------| -| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | -| elo_score [Std] | 0.621362 | 0.400226 | 
0.694434 | +| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +| ----------------- | -------------- | --------------- | ------------------- | +| LANG: Overall | -8 | 0 | -8 | +| LANG: CN | -8 | 0 | -8 | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | -8 | 0 | -8 | + +### Bootstrap ELO, Median of n=1000 times + +| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | +| ---------------- | -------------- | ------------------- | --------------- | +| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | +| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | ``` -对于评估模型 A 和 B的比较对,有四种选择: + +对于评估模型 A 和 B 的比较对,有四种选择: + 1. A 比 B 好 2. A 和 B 一样好 3. A 比 B 差 4. A 和 B 都不好 故 `win` / `tie` / `lose` / `not bad` 分别指模型 胜 / 平局 / 负 / 胜或一样好 的比例 。 -`Bootstrap ELO` 是通过对比赛结果进行1000次随机顺序,计算出 ELO 分数的中位数。 \ No newline at end of file +`Bootstrap ELO` 是通过对比赛结果进行 1000 次随机顺序,计算出 ELO 分数的中位数。 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index b08583db8..b099eebb7 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -64,6 +64,7 @@ OpenCompass 上手路线 advanced_guides/multimodal_eval.md advanced_guides/prompt_attack.md advanced_guides/longeval.md + advanced_guides/subjective_evaluation.md .. _工具: .. toctree:: From aac71632d61937b5da230c148429677ab0c2eea5 Mon Sep 17 00:00:00 2001 From: Leymore Date: Fri, 27 Oct 2023 16:15:30 +0800 Subject: [PATCH 6/7] update --- docs/en/advanced_guides/subjective_evaluation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/advanced_guides/subjective_evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md index 1ac2da643..444099cc7 100644 --- a/docs/en/advanced_guides/subjective_evaluation.md +++ b/docs/en/advanced_guides/subjective_evaluation.md @@ -1,4 +1,4 @@ -# Subjective Evaluation Guide +# Subjective Evaluation Guidance ## Introduction From 0c879ef0223788b926b725bee128fe483697760f Mon Sep 17 00:00:00 2001 From: Leymore Date: Fri, 27 Oct 2023 16:18:21 +0800 Subject: [PATCH 7/7] update --- opencompass/datasets/subjective_cmp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencompass/datasets/subjective_cmp.py b/opencompass/datasets/subjective_cmp.py index 38cf7363c..cde91858e 100644 --- a/opencompass/datasets/subjective_cmp.py +++ b/opencompass/datasets/subjective_cmp.py @@ -191,8 +191,8 @@ def build_prompt(question, 'question', 'index', 'reference_answer', 'evaluating_guidance', 'capability' ], - output_column=None, - train_split='test') + output_column=None, + train_split='test') subjective_all_sets = [ 'sub_test',