[Sync] format (#1214)
Leymore committed May 29, 2024
1 parent d59189b commit a77b8a5
Showing 9 changed files with 561 additions and 9 deletions.
58 changes: 58 additions & 0 deletions configs/datasets/subjective/compassbench/compassbench_compare.py
@@ -0,0 +1,58 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import CompassBenchDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'judge_prompt'],
    output_column='judge',
)

data_path = 'data/subjective/compassbench'

subjective_datasets = []

versions = ['CompassbenchV1']

for version_abbr in versions:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt='{question}'
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt='{judge_prompt}'
                    ),
                ]),
            ),
        ),
        pred_role='BOT',
    )

    subjective_datasets.append(
        dict(
            abbr=version_abbr,
            type=CompassBenchDataset,
            path=data_path,
            name=version_abbr,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
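The `data_path` above combines with each version abbreviation to locate the benchmark file; a minimal sketch of the resolved location, inferred from this config and the `CompassBenchDataset.load` method added later in this commit (the data file itself is not part of the repository):

import os.path as osp

data_path = 'data/subjective/compassbench'
version_abbr = 'CompassbenchV1'
# CompassBenchDataset.load joins the path with f'{name}.json'
print(osp.join(data_path, f'{version_abbr}.json'))
# -> data/subjective/compassbench/CompassbenchV1.json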
137 changes: 137 additions & 0 deletions configs/eval_subjective_compassbench.py
@@ -0,0 +1,137 @@
from os import getenv as gv
from opencompass.models import HuggingFaceCausalLM
from mmengine.config import read_base

with read_base():
    from .datasets.subjective.compassbench.compassbench_compare import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import CompassBenchSummarizer

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ],
    reserved_roles=[dict(role='SYSTEM', api_role='SYSTEM')],
)

# -------------Inference Stage ----------------------------------------

from opencompass.models import HuggingFacewithChatTemplate

models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm2-chat-7b-hf',
        path='internlm/internlm2-chat-7b',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=1),
        stop_words=['</s>', '<|im_end|>'],
        generation_kwargs=dict(
            do_sample=True,
        ),
    )
]

datasets = [*subjective_datasets]

infer = dict(
    partitioner=dict(type=NaivePartitioner),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llmeval',
        quotatype='reserved',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask),
    ),
)

gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key is read from $OPENAI_API_KEY, but you can also hard-code it here
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)  # Re-run inference for gpt4's predictions, or use the pre-committed gpt4 predictions instead

# ------------- Evaluation Stage ----------------------------------------

## ------------- JudgeLLM Configuration
judge_models = [dict(
    abbr='GPT4-Turbo',
    type=OpenAI,
    path='gpt-4-1106-preview',
    key='',  # The key is read from $OPENAI_API_KEY, but you can also hard-code it here
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)]

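# NOTE: the judge_models list below re-binds the name and therefore overrides the
# GPT-4 judge defined above; only one of the two blocks takes effect at a time.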
judge_models = [
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b2',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    ),
    dict(
        type=HuggingFacewithChatTemplate,
        abbr='internlm102b3',
        path='/mnt/petrelfs/caomaosong/backup_hwfile/100bjudge_6w_epoch1/hf',
        max_out_len=1024,
        batch_size=8,
        run_cfg=dict(num_gpus=4),
        stop_words=['</s>', '<|im_end|>'],
    )
]

## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000000,
        mode='m2n',
        infer_order='double',
        base_models=[gpt4],
        compare_models=models,
        judge_models=judge_models,
    ),
    runner=dict(type=LocalRunner, max_num_workers=32, task=dict(type=SubjectiveEvalTask)),
    # given_pred = [{'abbr': 'gpt4-turbo', 'path': ''}]
)

work_dir = 'outputs/compassbench/'

summarizer = dict(type=CompassBenchSummarizer, summary_type='half_add')
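As a sanity check, the assembled config can be loaded with mmengine before launching a run; a minimal sketch, assuming an OpenCompass development environment with its dependencies installed and the working directory at the repository root (the run itself is typically started with `python run.py configs/eval_subjective_compassbench.py`):

from mmengine.config import Config

# Load the config with mmengine (as OpenCompass itself does) and inspect the
# pieces defined above; purely illustrative, no jobs are launched here.
cfg = Config.fromfile('configs/eval_subjective_compassbench.py')
print([m['abbr'] for m in cfg['models']])        # models under comparison
print([m['abbr'] for m in cfg['judge_models']])  # active judge models (the second list above)
print(cfg['eval']['partitioner']['mode'])        # 'm2n'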
8 changes: 4 additions & 4 deletions configs/summarizers/groups/charm_reason.py
@@ -20,16 +20,16 @@
]


-charm_reaso_summary_groups = []
+charm_reason_summary_groups = []
for prompt in prompts:
    for region in regions:
        subsets = ['charm-reason-' + region + '_' + task + '_' + prompt for task in charm_tasks]
-        charm_reaso_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})
+        charm_reason_summary_groups.append({'name': 'charm-reason-' + region + '_' + prompt, 'subsets': subsets})

for prompt in prompts:
    subsets = ['charm-reason-' + region + '_' + prompt for region in regions]
-    charm_reaso_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})
+    charm_reason_summary_groups.append({'name': 'charm-reason-' + prompt, 'subsets': subsets})

-charm_reaso_summary_groups.append(
+charm_reason_summary_groups.append(
    {'name': 'charm-reason-CoT', 'subsets': ['charm-reason-ZH-CoT', 'charm-reason-EN-CoT']}
)
1 change: 1 addition & 0 deletions opencompass/datasets/subjective/__init__.py
@@ -1,6 +1,7 @@
from .alignbench import AlignmentBenchDataset # noqa: F401, F403
from .arena_hard import ArenaHardDataset # noqa: F401, F403
from .compass_arena import CompassArenaDataset # noqa: F401, F403
+from .compassbench import CompassBenchDataset # noqa: F401, F403
from .corev2 import Corev2Dataset # noqa: F401, F403
from .creationbench import CreationBenchDataset # noqa: F401, F403
from .information_retrival import IRDataset # noqa: F401, F403
101 changes: 101 additions & 0 deletions opencompass/datasets/subjective/compassbench.py
@@ -0,0 +1,101 @@
# flake8: noqa
import json
import os.path as osp

from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from ..base import BaseDataset

base_prompt_zh = """请根据 用户问题 以及 相应的两个回答,判断哪一个回答更好。
[用户问题]
{question}
[回答1开始]
{prediction}
[回答1结束]
[回答2开始]
{prediction2}
[回答2结束]
根据评分要求,请先对两个回答进行评价,最后在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
如果你认为回答1更好,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[A]]
如果你认为回答2更好,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[B]]
如果你认为回答1、2打成平手,你的输出应形如:
评价1:回答1 xxx
评价2:回答2 xxx
选择:[[C]]
"""

base_prompt_en = """Please evaluate the two responses based on the user's question and then choose from the following three options:
A. Response 1 is better
B. Response 2 is better
C. Both responses are equal
[user's question]
{question}
[Response 1 Start]
{prediction}
[Response 1 End]
[Response 2 Start]
{prediction2}
[Response 2 End]
If you believe that Response 1 is better, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[A]]
If you believe that Response 2 is better, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[B]]
If you believe that both responses are equally good, your output should be formatted as follows:
Evaluation 1: Response 1 xxx
Evaluation 2: Response 2 xxx
Choice: [[C]]
"""


@LOAD_DATASET.register_module()
class CompassBenchDataset(BaseDataset):

    def load(self, path: str, name: str):
        filename = osp.join(path, f'{name}.json')
        raw_data = []
        with open(filename, 'r', encoding='utf-8') as f:
            json_data = json.load(f)
            for problem in json_data:
                question = problem['question']
                lan = problem['language']
                others = problem['others']
                judge_prompt = base_prompt_zh if lan == 'zh' else base_prompt_en
                raw_data.append({
                    'question': question,
                    'judge_prompt': judge_prompt,
                    'judge': {
                        'lan': lan,
                        'level': others['level'],
                        'category': problem['category'],
                        'question': question
                    }
                })
        dataset = Dataset.from_list(raw_data)
        return dataset
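For reference, a sketch of the record layout `load` expects, inferred purely from the field accesses above; the values are made up and only the keys reflect the code (the `{prediction}`/`{prediction2}` slots in the judge prompt are presumably filled in later by the LMEvaluator with the two compared models' outputs):

# Illustrative only: one record shaped the way CompassBenchDataset.load reads it.
sample_record = {
    'question': 'Which city hosted the 2008 Summer Olympics?',
    'language': 'en',            # 'zh' selects base_prompt_zh, anything else base_prompt_en
    'category': 'knowledge',
    'others': {'level': 1},
}
# A data file such as CompassbenchV1.json would be a JSON list of such records.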
1 change: 1 addition & 0 deletions opencompass/summarizers/subjective/__init__.py
@@ -4,6 +4,7 @@
from .alpacaeval import AlpacaSummarizer
from .arenahard import ArenaHardSummarizer
from .compass_arena import CompassArenaSummarizer
+from .compassbench import CompassBenchSummarizer
from .corev2 import Corev2Summarizer
from .creationbench import CreationBenchSummarizer
from .flames import FlamesSummarizer
