[Feature] Add other judgelm prompts for Alignbench #731

Merged
merged 13 commits on Dec 27, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -4,6 +4,7 @@ outputs/
icl_inference_output/
.vscode/
tmp/
+configs/eval_subjective_alignbench_test.py
configs/openai_key.py
configs/secrets.py
configs/datasets/log.json
71 changes: 71 additions & 0 deletions configs/datasets/subjective_alignbench/alignbench_judgeby_autoj.py
@@ -0,0 +1,71 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "alignment_bench",
]
data_path = "data/subjective/alignment_bench"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = """为上传的针对给定用户问题的回应撰写评论, 并为该回复打分:

[BEGIN DATA]
***
[用户问询]: {question}
***
[回应]: {prediction}
***
[参考答案]: {ref}
***
[END DATA]

请根据参考答案为这个回应撰写评论. 在这之后, 你应该按照如下格式给这个回应一个最终的1-10范围的评分: "[[评分]]", 例如: "评分: [[5]]"."""
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=AlignmentBenchDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
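The Chinese judge prompt above asks the model to write a critique of the uploaded response against the reference answer and to finish with a 1-10 rating in the form "[[rating]]" (e.g. "评分: [[5]]"). A minimal sketch of how such a verdict could be parsed, illustrative only and not the `AutojSummarizer` this PR wires up in `opencompass/summarizers`:

```python
import re
from typing import Optional


def extract_autoj_rating(judgement: str) -> Optional[int]:
    """Pull the final 1-10 rating wrapped in [[...]] out of an Auto-J verdict.

    Illustrative helper based only on the output format the prompt above
    requests; the shipped AutojSummarizer may parse differently.
    """
    match = re.search(r'\[\[(\d{1,2})\]\]', judgement)
    if match is None:
        return None
    rating = int(match.group(1))
    return rating if 1 <= rating <= 10 else None


assert extract_autoj_rating('回应与参考答案一致. 评分: [[8]]') == 8
assert extract_autoj_rating('no rating emitted') is None
```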
5 changes: 2 additions & 3 deletions configs/datasets/subjective_alignbench/alignbench_judgeby_critiquellm.py
@@ -3,10 +3,9 @@
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset
-from mmengine.config import read_base

subjective_reader_cfg = dict(
-    input_columns=['question', 'capability', 'prefix', 'suffix'],
+    input_columns=['question', 'capability', 'critiquellm_prefix'],
    output_column='judge',
)

@@ -43,7 +42,7 @@
        template=dict(round=[
            dict(
                role='HUMAN',
-                prompt = "{prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n"
+                prompt = "{critiquellm_prefix}[助手的答案开始]\n{prediction}\n[助手的答案结束]\n"
            ),
        ]),
    ),
59 changes: 59 additions & 0 deletions configs/datasets/subjective_alignbench/alignbench_judgeby_judgelm.py
@@ -0,0 +1,59 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import AlignmentBenchDataset

subjective_reader_cfg = dict(
    input_columns=['question', 'capability', 'ref'],
    output_column='judge',
)

subjective_all_sets = [
    "alignment_bench",
]
data_path = "data/subjective/alignment_bench"

subjective_datasets = []

for _name in subjective_all_sets:
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=1024),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt = """You are a helpful and precise assistant for checking the quality of the answer.\n[Question]\n{question}\n\n[The Start of Assistant 1's Answer]\n{ref}\n\n[The End of Assistant 1's Answer]\n\n[The Start of Assistant 2's Answer]\n{prediction}\n\n[The End of Assistant 2's Answer]\n\n[System]\nWe would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space. In the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment.\n\n### Response:10"""
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=AlignmentBenchDataset,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
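This prompt follows JudgeLM's pairwise grading format: the first output line must contain two space-separated 1-10 scores (Assistant 1 is the reference, Assistant 2 the prediction), followed by an explanation. A sketch of the matching parser, again illustrative rather than the `JudgeLMSummarizer` added by this PR:

```python
from typing import Optional, Tuple


def extract_judgelm_scores(judgement: str) -> Optional[Tuple[float, float]]:
    """Read (reference_score, prediction_score) off the verdict's first line.

    Assumes only that the first line carries the two scores, as the prompt
    above requests; the shipped JudgeLMSummarizer may differ in details.
    """
    lines = judgement.strip().splitlines()
    if not lines:
        return None
    tokens = lines[0].split()
    try:
        return float(tokens[0]), float(tokens[1])
    except (IndexError, ValueError):
        return None  # malformed verdict


assert extract_judgelm_scores('7 9\nAssistant 2 is more detailed.') == (7.0, 9.0)
assert extract_judgelm_scores('not a score line') is None
```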
5 changes: 4 additions & 1 deletion configs/eval_subjective_alignbench.py
@@ -7,7 +7,10 @@
from .models.chatglm.hf_chatglm3_6b import models as hf_chatglm3_6b
from .models.baichuan.hf_baichuan2_7b_chat import models as hf_baichuan2_7b
from .models.hf_internlm.hf_internlm_chat_20b import models as hf_internlm_chat_20b
-from .datasets.subjective_cmp.alignment_bench import subjective_datasets
+from .models.judge_llm.auto_j.hf_autoj_eng_13b import models as hf_autoj
+from .models.judge_llm.judgelm.hf_judgelm_33b_v1 import models as hf_judgelm
+from .models.judge_llm.pandalm.hf_pandalm_7b_v1 import models as hf_pandalm
+from .datasets.subjective_alignbench.alignbench_judgeby_critiquellm import subjective_datasets

datasets = [*subjective_datasets]
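Evaluating with one of the new judge prompts then only requires swapping the dataset import; a sketch, assuming the import sits in the same `with read_base():` block as the ones above (the judge-model wiring further down the file is collapsed out of this diff):

```python
# Sketch: point the eval script at the new Auto-J prompt config instead.
# alignbench_judgeby_autoj is the dataset file added earlier in this PR.
from .datasets.subjective_alignbench.alignbench_judgeby_autoj import subjective_datasets

datasets = [*subjective_datasets]
```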
6 changes: 2 additions & 4 deletions configs/models/judge_llm/auto_j/hf_autoj_bilingual_6b.py
@@ -7,8 +7,7 @@
https://huggingface.co/GAIR/autoj-bilingual-6b
'''

-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='autoj-bilingual-6b',
    path="GAIR/autoj-bilingual-6b",
@@ -22,5 +21,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
6 changes: 2 additions & 4 deletions configs/models/judge_llm/auto_j/hf_autoj_eng_13b.py
@@ -1,8 +1,7 @@
from opencompass.models import HuggingFaceCausalLM


-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='autoj-13b-GPTQ-4bits',
    path="GAIR/autoj-13b-GPTQ-4bits",
@@ -16,5 +15,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
6 changes: 2 additions & 4 deletions configs/models/judge_llm/auto_j/hf_autoj_eng_13b_4bit.py
@@ -6,8 +6,7 @@
https://huggingface.co/GAIR/autoj-13b-GPTQ-4bits
'''

-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='autoj-13b',
    path="GAIR/autoj-13b",
@@ -21,5 +20,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
6 changes: 2 additions & 4 deletions configs/models/judge_llm/auto_j/hf_autoj_scen_classifier.py
@@ -1,8 +1,7 @@
from opencompass.models import HuggingFaceCausalLM


-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='autoj-scenario-classifier',
    path="GAIR/autoj-scenario-classifier",
@@ -16,5 +15,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
10 changes: 4 additions & 6 deletions configs/models/judge_llm/judgelm/hf_judgelm_13b_v1.py
@@ -1,12 +1,11 @@
from opencompass.models import HuggingFaceCausalLM


-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='judgelm-13b-v1-hf',
-    path="BAAI/JudgeLM-13b-v1.0",
-    tokenizer_path='BAAI/JudgeLM-13b-v1.0',
+    path="BAAI/JudgeLM-13B-v1.0",
+    tokenizer_path='BAAI/JudgeLM-13B-v1.0',
    tokenizer_kwargs=dict(padding_side='left',
                          truncation_side='left',
                          trust_remote_code=True,
@@ -16,5 +15,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
10 changes: 4 additions & 6 deletions configs/models/judge_llm/judgelm/hf_judgelm_33b_v1.py
@@ -1,12 +1,11 @@
from opencompass.models import HuggingFaceCausalLM


-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='judgelm-33b-v1-hf',
-    path="BAAI/JudgeLM-33b-v1.0",
-    tokenizer_path='BAAI/JudgeLM-33b-v1.0',
+    path="BAAI/JudgeLM-33B-v1.0",
+    tokenizer_path='BAAI/JudgeLM-33B-v1.0',
    tokenizer_kwargs=dict(padding_side='left',
                          truncation_side='left',
                          trust_remote_code=True,
@@ -16,5 +15,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
6 changes: 2 additions & 4 deletions configs/models/judge_llm/judgelm/hf_judgelm_7b_v1.py
@@ -1,8 +1,7 @@
from opencompass.models import HuggingFaceCausalLM


-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='judgelm-7b-v1-hf',
    path="BAAI/JudgeLM-7B-v1.0",
@@ -16,5 +15,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
6 changes: 2 additions & 4 deletions configs/models/judge_llm/pandalm/hf_alpaca_pandalm_7b_v1.py
@@ -1,8 +1,7 @@
from opencompass.models import HuggingFaceCausalLM


-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='alpaca-pandalm-7b-v1-hf',
    path="WeOpenML/PandaLM-Alpaca-7B-v1",
@@ -16,5 +15,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
6 changes: 2 additions & 4 deletions configs/models/judge_llm/pandalm/hf_pandalm_7b_v1.py
@@ -1,8 +1,7 @@
from opencompass.models import HuggingFaceCausalLM


-models = [
-    dict(
+models = [dict(
    type=HuggingFaceCausalLM,
    abbr='pandalm-7b-v1-hf',
    path="WeOpenML/PandaLM-7B-v1",
@@ -16,5 +15,4 @@
    batch_size=8,
    model_kwargs=dict(device_map='auto', trust_remote_code=True),
    run_cfg=dict(num_gpus=1, num_procs=1),
-    )
-]
+)]
22 changes: 16 additions & 6 deletions opencompass/datasets/subject_alignmentbench.py
@@ -2,6 +2,7 @@
import json
import os.path as osp
import re
+from typing import Optional

from datasets import Dataset, DatasetDict

@@ -83,16 +84,25 @@ def prompt_construct(sample, config: Config):
@LOAD_DATASET.register_module()
class AlignmentBenchDataset(SubjectiveCmpDataset):

-    def load(self, path: str, name: str, alignment_bench_config_path: str,
-             alignment_bench_config_name: str):
-        alignmentbenchconfig = Config(alignment_bench_config_path,
-                                      alignment_bench_config_name)
+    def load(self,
+             path: str,
+             name: str,
+             alignment_bench_config_path: Optional[str] = '',
+             alignment_bench_config_name: Optional[str] = ''):
+        if alignment_bench_config_path != '':
+            alignmentbenchconfig = Config(alignment_bench_config_path,
+                                          alignment_bench_config_name)
+        else:
+            alignmentbenchconfig = None

        dataset = list(super().load(path, name))
        corev2_dataset = []
        for data in dataset:
-            dimensions, prefix = prompt_construct(data, alignmentbenchconfig)
-            data['prefix'], data['suffix'] = prefix, ''
+            if alignmentbenchconfig:
+                dimensions, prefix = prompt_construct(data,
+                                                      alignmentbenchconfig)
+                data['critiquellm_prefix'] = prefix
            data['judge']['others'] = data['others']
            data['ref'] = data['others']['reference']
            corev2_dataset.append(data)
        dataset = Dataset.from_list(corev2_dataset)
        return dataset
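With both config arguments now optional, judge prompts that need only `{question}`, `{prediction}` and `{ref}` (the Auto-J and JudgeLM configs above) can skip the CritiqueLLM judge config, while the CritiqueLLM config keeps passing it to obtain the per-sample grading prefix. Schematic dataset cfgs illustrating the two shapes; the `alignment_bench_config_*` values below are placeholders, not files pinned by this diff:

```python
from opencompass.datasets import AlignmentBenchDataset

# Without a judge config: load() sets data['ref'] but attaches no
# 'critiquellm_prefix', matching the Auto-J/JudgeLM reader_cfg columns.
autoj_style = dict(
    type=AlignmentBenchDataset,
    path='data/subjective/alignment_bench',
    name='alignment_bench',
)

# With a judge config (placeholder path/name): prompt_construct() runs and
# every sample additionally carries data['critiquellm_prefix'].
critiquellm_style = dict(
    type=AlignmentBenchDataset,
    path='data/subjective/alignment_bench',
    name='alignment_bench',
    alignment_bench_config_path='data/subjective/alignment_bench/config',
    alignment_bench_config_name='multi-dimension',
)
```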
4 changes: 3 additions & 1 deletion opencompass/summarizers/__init__.py
@@ -1,4 +1,6 @@
-from .alignmentbench import AlignmentBenchSummarizer  # noqa: F401
+# flake8: noqa: F401, E501
+from .alignmentbench import (AlignmentBenchSummarizer, AutojSummarizer,
+                             JudgeLMSummarizer)
from .circular import CircularSummarizer  # noqa: F401
from .corev2 import Corev2Summarizer  # noqa: F401
from .creationv01 import Creationv01Summarizer  # noqa: F401
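An eval config would then pick the summarizer matching its judge; sketch only, since the summarizers' constructor arguments are not shown in this hunk:

```python
# Assumption: summarizers are selected as elsewhere in OpenCompass, via a
# `summarizer` dict in the eval config; AutojSummarizer is exported above.
from opencompass.summarizers import AutojSummarizer

summarizer = dict(type=AutojSummarizer)
```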