Subdocs #510

Merged (7 commits, Oct 27, 2023)
configs/datasets/subjectivity_cmp/subjectivity_cmp.py → configs/datasets/subjective_cmp/subjective_cmp.py
@@ -2,21 +2,21 @@
 from opencompass.openicl.icl_retriever import ZeroRetriever
 from opencompass.openicl.icl_inferencer import GenInferencer
 from opencompass.openicl.icl_evaluator import LMEvaluator
-from opencompass.datasets.subjectivity_cmp import SubjectivityCmpDataset
+from opencompass.datasets.subjective_cmp import SubjectiveCmpDataset

-subjectivity_reader_cfg = dict(
+subjective_reader_cfg = dict(
     input_columns=['question', 'index', 'reference_answer', 'evaluating_guidance', 'capability', 'prompt'],
     output_column=None,
     train_split='test')

-subjectivity_all_sets = [
-    "sub_test",
+subjective_all_sets = [
+    "subjective_demo",
 ]

-subjectivity_datasets = []
+subjective_datasets = []

-for _name in subjectivity_all_sets:
-    subjectivity_infer_cfg = dict(
+for _name in subjective_all_sets:
+    subjective_infer_cfg = dict(
         prompt_template=dict(
             type=PromptTemplate,
             template=dict(round=[
@@ -30,7 +30,7 @@
         inferencer=dict(type=GenInferencer),
     )

-    subjectivity_eval_cfg = dict(
+    subjective_eval_cfg = dict(
         evaluator=dict(
             type=LMEvaluator,
             cmp_order='both',
@@ -49,13 +49,13 @@
         pred_role="BOT",
     )

-    subjectivity_datasets.append(
+    subjective_datasets.append(
         dict(
             abbr=f"{_name}",
-            type=SubjectivityCmpDataset,
-            path="./data/subjectivity/",
+            type=SubjectiveCmpDataset,
+            path="./data/subjective/",
             name=_name,
-            reader_cfg=subjectivity_reader_cfg,
-            infer_cfg=subjectivity_infer_cfg,
-            eval_cfg=subjectivity_eval_cfg
+            reader_cfg=subjective_reader_cfg,
+            infer_cfg=subjective_infer_cfg,
+            eval_cfg=subjective_eval_cfg
         ))
20 changes: 8 additions & 12 deletions configs/subjective_infer.py → configs/subjective.py
@@ -1,9 +1,9 @@
 from mmengine.config import read_base
 with read_base():
-    from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets
+    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
     from .summarizers.subjective import summarizer

-datasets = [*subjectivity_datasets]
+datasets = [*subjective_datasets]

 from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI
 from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
@@ -35,22 +35,20 @@
         tokenizer_kwargs=dict(
             padding_side='left',
             truncation_side='left',
-            trust_remote_code=True,
-            revision='b1502f4f75c71499a3d566b14463edd62620ce9f'),
+            trust_remote_code=True),
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         model_kwargs=dict(
             trust_remote_code=True,
-            device_map='auto',
-            revision='b1502f4f75c71499a3d566b14463edd62620ce9f'),
+            device_map='auto'),
         run_cfg=dict(num_gpus=1, num_procs=1),
     ),
     dict(
         type=HuggingFaceCausalLM,
         abbr='qwen-7b-chat-hf',
-        path="/mnt/petrelfs/share_data/duanhaodong/Qwen-7B-Chat",
-        tokenizer_path='/mnt/petrelfs/share_data/duanhaodong/Qwen-7B-Chat',
+        path="Qwen/Qwen-7B-Chat",
+        tokenizer_path='Qwen/Qwen-7B-Chat',
         tokenizer_kwargs=dict(
             padding_side='left',
             truncation_side='left',
@@ -74,16 +72,14 @@
             padding_side='left',
             truncation_side='left',
             use_fast=False,
-            trust_remote_code=True,
-            revision="ed5e35564ac836710817c51e8e8d0a5d4ff03102"),
+            trust_remote_code=True),
         max_out_len=100,
         max_seq_len=2048,
         batch_size=8,
         meta_template=_meta_template2,
         model_kwargs=dict(
             trust_remote_code=True,
-            device_map='auto',
-            revision="ed5e35564ac836710817c51e8e8d0a5d4ff03102"),
+            device_map='auto'),
         run_cfg=dict(num_gpus=1, num_procs=1),
     )
 ]
150 changes: 150 additions & 0 deletions docs/en/advanced_guides/subjective_evaluation.md
@@ -0,0 +1,150 @@
# Subjective Evaluation Guidance

## Introduction

Subjective evaluation aims to assess a model's performance on tasks that align with human preferences. The gold standard for such evaluation is human preference, but collecting human annotations is costly.

To explore the model's subjective capabilities, we employ a state-of-the-art LLM (GPT-4) as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)).

A popular evaluation method involves comparing model responses pairwise to calculate their win rate ([Chatbot Arena](https://chat.lmsys.org/)).

We support the use of GPT-4 for the subjective evaluation of models based on this method.

## Data Preparation

We provide a demo test set [subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx) based on [z-bench](https://github.com/zhenbench/z-bench).

Store the set of subjective questions as an `.xlsx` file in the `data/subjective/` directory; a sketch for generating such a file follows the field list below.

The table includes the following fields:

- 'question': Question description
- 'index': Question number
- 'reference_answer': Reference answer
- 'evaluating_guidance': Evaluation guidance
- 'capability': Capability dimension of the question
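
The following is a minimal sketch for preparing a custom question set with these fields, assuming `pandas` (with an xlsx writer such as `openpyxl`) is installed; the file name and example row are purely illustrative:

```python
# Minimal sketch: build a custom subjective question set (illustrative values only).
import pandas as pd

rows = [
    dict(
        question='Briefly introduce the city of Shanghai.',
        index=1,
        reference_answer='Shanghai is a major financial and cultural center of China...',
        evaluating_guidance='Prefer answers that are factually accurate and well organized.',
        capability='common',
    ),
]

# Requires an xlsx engine such as openpyxl to be installed.
pd.DataFrame(rows).to_excel('data/subjective/my_subjective_set.xlsx', index=False)
```

To evaluate such a file, add its base name (here `my_subjective_set`) to `subjective_all_sets` in the dataset configuration.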

## Evaluation Configuration

The specific process includes:

1. Model response inference
2. Pairwise comparison by GPT-4
3. Generating the evaluation report

In `configs/subjective.py`, we provide annotations to help users understand the meaning of each part of the configuration file.

```python
# Import datasets and subjective evaluation summarizer
from mmengine.config import read_base
with read_base():
    from .datasets.subjective_cmp.subjective_cmp import subjective_datasets
    from .summarizers.subjective import summarizer

datasets = [*subjective_datasets]

from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI

# Import partitioner and task required for subjective evaluation
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.runners import LocalRunner
from opencompass.tasks.subjective_eval import SubjectiveEvalTask


# Define model configurations for inference and evaluation
# Including the inference models chatglm2-6b, qwen-7b-chat, internlm-chat-7b, and the evaluation model gpt4
models = [...]

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True)
    ],
    reserved_roles=[
        dict(role='SYSTEM', api_role='SYSTEM'),
    ],
)

# Define the configuration for subjective evaluation
eval = dict(
    partitioner=dict(
        type=SubjectiveNaivePartitioner,
        mode='all',  # build pairwise comparisons between the models
    ),
    runner=dict(
        type=LocalRunner,
        max_num_workers=2,  # supports parallel comparisons
        task=dict(
            type=SubjectiveEvalTask,  # used to read inputs for a pair of models
            judge_cfg=dict(
                abbr='GPT4',
                type=OpenAI,
                path='gpt-4-0613',
                key='ENV',
                meta_template=api_meta_template,
                query_per_second=1,
                max_out_len=2048,
                max_seq_len=2048,
                batch_size=2),
        )),
)
```
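
The `models` list is elided above. For reference, a single inference-model entry might look like the following sketch, adapted from the `qwen-7b-chat-hf` configuration in `configs/subjective.py` (the meta template is omitted here for brevity):

```python
models = [
    dict(
        type=HuggingFaceCausalLM,
        abbr='qwen-7b-chat-hf',
        path="Qwen/Qwen-7B-Chat",
        tokenizer_path='Qwen/Qwen-7B-Chat',
        tokenizer_kwargs=dict(
            padding_side='left',
            truncation_side='left',
            trust_remote_code=True),
        max_out_len=100,   # maximum number of generated tokens
        max_seq_len=2048,  # maximum input sequence length
        batch_size=8,
        model_kwargs=dict(
            trust_remote_code=True,
            device_map='auto'),
        run_cfg=dict(num_gpus=1, num_procs=1),
    ),
    # ... the other inference models follow the same pattern
]
```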

## Launching the Evaluation

```shell
python run.py configs/subjective.py -r
```

The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results.

## Evaluation Report

The evaluation report will be output to `output/.../summary/timestamp/report.md`, which includes win rate statistics, battle scores, and ELO ratings. The specific format is as follows:

```markdown
# Subjective Analysis

A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent)
A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00%

### Basic statistics (4 stats: win / tie / lose / not bad)

| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf |
| --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- |
| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |
| LANG: EN | N/A | N/A | N/A |
| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% |

![Capabilities Dimension Classification Result](by_capa.png)

![Language Classification Result](by_lang.png)

### Model scores (base score is 0, win +3, both +1, neither -1, lose -3)

| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf |
| ----------------- | -------------- | --------------- | ------------------- |
| LANG: Overall | -8 | 0 | -8 |
| LANG: CN | -8 | 0 | -8 |
| LANG: EN | N/A | N/A | N/A |
| CAPA: common | -8 | 0 | -8 |

### Bootstrap ELO, Median of n=1000 times

| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf |
| ---------------- | -------------- | ------------------- | --------------- |
| elo_score [Mean] | 999.504 | 999.912 | 1000.26 |
| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 |
```

When comparing the responses of models A and B, there are four possible judgments:

1. A is better than B.
2. A and B are equally good.
3. A is worse than B.
4. Neither A nor B is good.

So, `win` / `tie` / `lose` / `not bad` represent the proportions of comparisons in which the model wins, ties, loses, or is either the winner or equally good, respectively.
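
As an illustration of the scoring rule quoted above (win +3, both good +1, neither good -1, lose -3), the hypothetical sketch below tallies one model's score from a list of per-comparison outcomes; the outcome labels are assumptions chosen for readability, not the summarizer's internal names:

```python
# Hypothetical tally of the battle score: win +3, both good +1, neither good -1, lose -3.
SCORE_MAP = {'win': 3, 'both_good': 1, 'neither_good': -1, 'lose': -3}

def battle_score(outcomes):
    """Sum the score contributions of a model's comparison outcomes."""
    return sum(SCORE_MAP[o] for o in outcomes)

# 3 wins, 4 "both good", 2 "neither good", 1 loss -> 9 + 4 - 2 - 3 = 8
print(battle_score(['win'] * 3 + ['both_good'] * 4 + ['neither_good'] * 2 + ['lose']))
```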

`Bootstrap ELO` is the median Elo score obtained by replaying the match results over 1000 random permutations of their order.
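
This procedure can be illustrated with the hedged sketch below, which assumes a standard Elo update (base rating 1000, K = 32) purely for demonstration; the actual computation inside OpenCompass may differ in its constants and bookkeeping:

```python
# Hedged sketch: median Elo over n random permutations of the match order.
# Assumes a standard Elo update (base 1000, K=32); illustrative only.
import random
import statistics

def elo_ratings(matches, k=32, base=1000.0):
    """matches: list of (model_a, model_b, score_a), with score_a in {1.0, 0.5, 0.0}."""
    ratings = {}
    for a, b, score_a in matches:
        ra = ratings.setdefault(a, base)
        rb = ratings.setdefault(b, base)
        expected_a = 1 / (1 + 10 ** ((rb - ra) / 400))
        ratings[a] = ra + k * (score_a - expected_a)
        ratings[b] = rb + k * ((1 - score_a) - (1 - expected_a))
    return ratings

def bootstrap_elo_median(matches, n=1000, seed=0):
    """Median Elo per model over n random permutations of the match order."""
    rng = random.Random(seed)
    samples = {}
    for _ in range(n):
        shuffled = list(matches)
        rng.shuffle(shuffled)
        for model, rating in elo_ratings(shuffled).items():
            samples.setdefault(model, []).append(rating)
    return {model: statistics.median(vals) for model, vals in samples.items()}
```
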
1 change: 1 addition & 0 deletions docs/en/index.rst
@@ -64,6 +64,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass.
    advanced_guides/multimodal_eval.md
    advanced_guides/prompt_attack.md
    advanced_guides/longeval.md
+   advanced_guides/subjective_evaluation.md

 .. _Tools:
 .. toctree::