From 30432bc01a73bf326399a732b53ce0a072a25709 Mon Sep 17 00:00:00 2001 From: frankweijue Date: Fri, 27 Oct 2023 14:04:59 +0800 Subject: [PATCH 1/7] rename --- .../subjective_cmp.py} | 26 ++-- .../{subjective_infer.py => subjective.py} | 20 ++- .../advanced_guides/subjective evaluation.md | 135 ++++++++++++++++++ ...{subjectivity_cmp.py => subjective_cmp.py} | 6 +- 4 files changed, 159 insertions(+), 28 deletions(-) rename configs/datasets/{subjectivity_cmp/subjectivity_cmp.py => subjective_cmp/subjective_cmp.py} (75%) rename configs/{subjective_infer.py => subjective.py} (82%) create mode 100644 docs/zh_cn/advanced_guides/subjective evaluation.md rename opencompass/datasets/{subjectivity_cmp.py => subjective_cmp.py} (98%) diff --git a/configs/datasets/subjectivity_cmp/subjectivity_cmp.py b/configs/datasets/subjective_cmp/subjective_cmp.py similarity index 75% rename from configs/datasets/subjectivity_cmp/subjectivity_cmp.py rename to configs/datasets/subjective_cmp/subjective_cmp.py index 8ec8f219f..c509be931 100644 --- a/configs/datasets/subjectivity_cmp/subjectivity_cmp.py +++ b/configs/datasets/subjective_cmp/subjective_cmp.py @@ -2,21 +2,21 @@ from opencompass.openicl.icl_retriever import ZeroRetriever from opencompass.openicl.icl_inferencer import GenInferencer from opencompass.openicl.icl_evaluator import LMEvaluator -from opencompass.datasets.subjectivity_cmp import SubjectivityCmpDataset +from opencompass.datasets.subjective_cmp import SubjectiveCmpDataset -subjectivity_reader_cfg = dict( +subjective_reader_cfg = dict( input_columns=['question', 'index', 'reference_answer', 'evaluating_guidance', 'capability', 'prompt'], output_column=None, train_split='test') -subjectivity_all_sets = [ +subjective_all_sets = [ "sub_test", ] -subjectivity_datasets = [] +subjective_datasets = [] -for _name in subjectivity_all_sets: - subjectivity_infer_cfg = dict( +for _name in subjective_all_sets: + subjective_infer_cfg = dict( prompt_template=dict( type=PromptTemplate, template=dict(round=[ @@ -30,7 +30,7 @@ inferencer=dict(type=GenInferencer), ) - subjectivity_eval_cfg = dict( + subjective_eval_cfg = dict( evaluator=dict( type=LMEvaluator, cmp_order='both', @@ -49,13 +49,13 @@ pred_role="BOT", ) - subjectivity_datasets.append( + subjective_datasets.append( dict( abbr=f"{_name}", - type=SubjectivityCmpDataset, - path="./data/subjectivity/", + type=SubjectiveCmpDataset, + path="./data/subjective/", name=_name, - reader_cfg=subjectivity_reader_cfg, - infer_cfg=subjectivity_infer_cfg, - eval_cfg=subjectivity_eval_cfg + reader_cfg=subjective_reader_cfg, + infer_cfg=subjective_infer_cfg, + eval_cfg=subjective_eval_cfg )) diff --git a/configs/subjective_infer.py b/configs/subjective.py similarity index 82% rename from configs/subjective_infer.py rename to configs/subjective.py index 88d2a8282..64fab987b 100644 --- a/configs/subjective_infer.py +++ b/configs/subjective.py @@ -1,9 +1,9 @@ from mmengine.config import read_base with read_base(): - from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .datasets.subjective_cmp.subjective_cmp import subjective_datasets from .summarizers.subjective import summarizer -datasets = [*subjectivity_datasets] +datasets = [*subjective_datasets] from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner @@ -35,22 +35,20 @@ tokenizer_kwargs=dict( padding_side='left', truncation_side='left', - trust_remote_code=True, - 
revision='b1502f4f75c71499a3d566b14463edd62620ce9f'), + trust_remote_code=True), max_out_len=100, max_seq_len=2048, batch_size=8, model_kwargs=dict( trust_remote_code=True, - device_map='auto', - revision='b1502f4f75c71499a3d566b14463edd62620ce9f'), + device_map='auto'), run_cfg=dict(num_gpus=1, num_procs=1), ), dict( type=HuggingFaceCausalLM, abbr='qwen-7b-chat-hf', - path="/mnt/petrelfs/share_data/duanhaodong/Qwen-7B-Chat", - tokenizer_path='/mnt/petrelfs/share_data/duanhaodong/Qwen-7B-Chat', + path="Qwen/Qwen-7B-Chat", + tokenizer_path='Qwen/Qwen-7B-Chat', tokenizer_kwargs=dict( padding_side='left', truncation_side='left', @@ -74,16 +72,14 @@ padding_side='left', truncation_side='left', use_fast=False, - trust_remote_code=True, - revision="ed5e35564ac836710817c51e8e8d0a5d4ff03102"), + trust_remote_code=True), max_out_len=100, max_seq_len=2048, batch_size=8, meta_template=_meta_template2, model_kwargs=dict( trust_remote_code=True, - device_map='auto', - revision="ed5e35564ac836710817c51e8e8d0a5d4ff03102"), + device_map='auto'), run_cfg=dict(num_gpus=1, num_procs=1), ) ] diff --git a/docs/zh_cn/advanced_guides/subjective evaluation.md b/docs/zh_cn/advanced_guides/subjective evaluation.md new file mode 100644 index 000000000..eb4399ea0 --- /dev/null +++ b/docs/zh_cn/advanced_guides/subjective evaluation.md @@ -0,0 +1,135 @@ +# 主观评测指引 + +## 介绍 + +- 主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。 +- 为了探究模型的主观能力,我们采用了最先进的LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 +- 流行的评估方法是将模型的回答进行两两比较,以计算其胜率([Chatbot Arena](https://chat.lmsys.org/))。 +- 我们基于这一方法支持了GPT4用于模型的主观能力评估。 + +## 数据准备 + +- 将主观问题集以.xlsx格式存放在data/subjective/中。 +- 我们提供了一个基于[z-bench](https://github.com/zhenbench/z-bench)的demo测试集[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx +)。 +- 表格包括以下字段: + - 'question':问题描述 + - 'index':题目序号 + - 'reference_answer':参考答案 + - 'evaluating_guidance':评估引导 + - 'capability':题目所属的能力维度。 + +## 评测配置 +具体流程包括: +1. 模型回答的推理 +2. GPT4评估比较对 +3. 生成评测报告 + +对于 config/subjective.py,我们提供了部分注释,方便用户理解配置文件的含义。 +```python +# 导入数据集与主观评测summarizer +from mmengine.config import read_base +with read_base(): + from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .summarizers.subjective import summarizer + +datasets = [*subjectivity_datasets] + +from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI + +#导入主观评测所需partitioner与task +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + + +# 定义推理和评测所需模型配置 +# 包括chatglm2-6b,qwen-7b-chat,internlm-chat-7b,gpt4 +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict( + role="BOT", + begin="\n<|im_start|>assistant\n", + end='<|im_end|>', + generate=True), + ], ) + +... 
+ +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True) + ], + reserved_roles=[ + dict(role='SYSTEM', api_role='SYSTEM'), + ], +) + +# 定义主观评测配置 +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + mode='all', # 新参数,构建比较对时会交替构建两个 + ), + runner=dict( + type=LocalRunner, + max_num_workers=2, # 支持并行比较 + task=dict( + type=SubjectiveEvalTask, # 新 task,用来读入一对 model 的输入 + judge_cfg=dict( + abbr='GPT4', + type=OpenAI, + path='gpt-4-0613', + key='ENV', + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=2048, + batch_size=2), + )), +) +``` + +## 启动评测 +```shell +python run.py config/subjective.py -r +``` +```-r``` 参数支持复用模型推理和GPT4评估结果。 + +## 评测报告 + +评测报告会输出到output/.../summary/timestamp/report.md,包含胜率统计,对战分数与ELO。具体格式如下: +```markdown +# Subjective Analysis +A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) +A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00% +### Basic statistics (4 stats: win / tie / lose / not bad) +| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +|-------------------------------------|-------------------------------|------------------------------|-------------------------------| +| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | + + +![Capabilities Dimension Classification Result](by_capa.png) + +![Language Classification Result](by_lang.png) + + +### Model scores (base score is 0, win +3, both +1, neither -1, lose -3) +| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +|---------------------|------------------|-------------------|-----------------------| +| LANG: Overall | -8 | 0 | -8 | +| LANG: CN | -8 | 0 | -8 | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | -8 | 0 | -8 | +### Bootstrap ELO, Median of n=1000 times +| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | +|------------------|------------------|-----------------------|-------------------| +| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | +| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | + +``` \ No newline at end of file diff --git a/opencompass/datasets/subjectivity_cmp.py b/opencompass/datasets/subjective_cmp.py similarity index 98% rename from opencompass/datasets/subjectivity_cmp.py rename to opencompass/datasets/subjective_cmp.py index bc676194f..38cf7363c 100644 --- a/opencompass/datasets/subjectivity_cmp.py +++ b/opencompass/datasets/subjective_cmp.py @@ -187,20 +187,20 @@ def build_prompt(question, examples = [example1, example2, example3, example4] -subjectivity_reader_cfg = dict(input_columns=[ +subjective_reader_cfg = dict(input_columns=[ 'question', 'index', 'reference_answer', 'evaluating_guidance', 'capability' ], output_column=None, train_split='test') -subjectivity_all_sets = [ +subjective_all_sets = [ 'sub_test', ] @LOAD_DATASET.register_module() -class SubjectivityCmpDataset(BaseDataset): +class SubjectiveCmpDataset(BaseDataset): @staticmethod def load(path: str, name: str): From 3f9121a420c06c11a7ae5bea7ee6068daa94040a Mon Sep 17 00:00:00 2001 From: 
frankweijue Date: Fri, 27 Oct 2023 14:17:27 +0800 Subject: [PATCH 2/7] add en subdoc --- .../advanced_guides/subjective evaluation.md | 139 ++++++++++++++++++ .../advanced_guides/subjective evaluation.md | 29 ++-- 2 files changed, 156 insertions(+), 12 deletions(-) create mode 100644 docs/en/advanced_guides/subjective evaluation.md diff --git a/docs/en/advanced_guides/subjective evaluation.md b/docs/en/advanced_guides/subjective evaluation.md new file mode 100644 index 000000000..b09fd08ed --- /dev/null +++ b/docs/en/advanced_guides/subjective evaluation.md @@ -0,0 +1,139 @@ +# Subjective Evaluation Guide + +## Introduction + +Subjective evaluation aims to assess the model's performance in tasks that align with human preferences. The key criterion for this evaluation is human preference, but it comes with a high cost of annotation. + +To explore the model's subjective capabilities, we employ state-of-the-art LLM (GPT-4) as a substitute for human assessors ([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685)). + +A popular evaluation method involves comparing model responses pairwise to calculate their win rate ([Chatbot Arena](https://chat.lmsys.org/)). + +We support the use of GPT-4 for the subjective evaluation of models based on this method. + +## Data Preparation + +We provide a demo test set [subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx) based on [z-bench](https://github.com/zhenbench/z-bench). + +Store the set of subjective questions in .xlsx format in the data/subjective/directory. + +The table includes the following fields: +- 'question': Question description +- 'index': Question number +- 'reference_answer': Reference answer +- 'evaluating_guidance': Evaluation guidance +- 'capability': The capability dimension of the question. + +## Evaluation Configuration +The specific process includes: +1. Model response reasoning +2. GPT-4 evaluation comparisons +3. Generating evaluation reports + +For config/subjective.py, we provide some annotations to help users understand the configuration file's meaning. +```python +# Import datasets and subjective evaluation summarizer +from mmengine.config import read_base +with read_base(): + from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .summarizers.subjective import summarizer + +datasets = [*subjectivity_datasets] + +from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI + +# Import partitioner and task required for subjective evaluation +from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner +from opencompass.runners import LocalRunner +from opencompass.tasks.subjective_eval import SubjectiveEvalTask + + +# Define model configurations for inference and evaluation +# Including chatglm2-6b, qwen-7b-chat, internlm-chat-7b and gpt4 +_meta_template = dict( + round=[ + dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), + dict( + role="BOT", + begin="\n<|im_start|>assistant\n", + end='<|im_end|>', + generate=True), + ], ) + +... 
+ +api_meta_template = dict( + round=[ + dict(role='HUMAN', api_role='HUMAN'), + dict(role='BOT', api_role='BOT', generate=True) + ], + reserved_roles=[ + dict(role='SYSTEM', api_role='SYSTEM'), + ], +) + +# Define the configuration for subjective evaluation +eval = dict( + partitioner=dict( + type=SubjectiveNaivePartitioner, + mode='all', # alternately constructs two for comparisons + ), + runner=dict( + type=LocalRunner, + max_num_workers=2, # Supports parallel comparisons + task=dict( + type=SubjectiveEvalTask, # Used to read inputs for a pair of models + judge_cfg=dict( + abbr='GPT4', + type=OpenAI, + path='gpt-4-0613', + key='ENV', + meta_template=api_meta_template, + query_per_second=1, + max_out_len=2048, + max_seq_len=2048, + batch_size=2), + )), +) +``` + +## Launching the Evaluation +```shell +python run.py config/subjective.py -r +``` +The ```-r``` parameter allows the reuse of model inference and GPT-4 evaluation results. + +## Evaluation Report + +The evaluation report will be output to output/.../summary/timestamp/report.md, which includes win rate statistics, battle scores, and ELO ratings. The specific format is as follows: +```markdown +# Subjective Analysis +A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) +A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00% +### Basic statistics (4 stats: win / tie / lose / not bad) +| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +|-------------------------------------|-------------------------------|------------------------------|-------------------------------| +| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | + + +![Capabilities Dimension Classification Result](by_capa.png) + +![Language Classification Result](by_lang.png) + + +### Model scores (base score is 0, win +3, both +1, neither -1, lose -3) +| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +|---------------------|------------------|-------------------|-----------------------| +| LANG: Overall | -8 | 0 | -8 | +| LANG: CN | -8 | 0 | -8 | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | -8 | 0 | -8 | +### Bootstrap ELO, Median of n=1000 times +| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | +|------------------|------------------|-----------------------|-------------------| +| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | +| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | + +``` \ No newline at end of file diff --git a/docs/zh_cn/advanced_guides/subjective evaluation.md b/docs/zh_cn/advanced_guides/subjective evaluation.md index eb4399ea0..8e3f82a00 100644 --- a/docs/zh_cn/advanced_guides/subjective evaluation.md +++ b/docs/zh_cn/advanced_guides/subjective evaluation.md @@ -2,22 +2,27 @@ ## 介绍 -- 主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。 -- 为了探究模型的主观能力,我们采用了最先进的LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 -- 流行的评估方法是将模型的回答进行两两比较,以计算其胜率([Chatbot Arena](https://chat.lmsys.org/))。 -- 我们基于这一方法支持了GPT4用于模型的主观能力评估。 +主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。 + 
+为了探究模型的主观能力,我们采用了最先进的LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 + +流行的评估方法是将模型的回答进行两两比较,以计算其胜率([Chatbot Arena](https://chat.lmsys.org/))。 + +我们基于这一方法支持了GPT4用于模型的主观能力评估。 ## 数据准备 -- 将主观问题集以.xlsx格式存放在data/subjective/中。 -- 我们提供了一个基于[z-bench](https://github.com/zhenbench/z-bench)的demo测试集[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx +我们提供了一个基于[z-bench](https://github.com/zhenbench/z-bench)的demo测试集:[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx )。 -- 表格包括以下字段: - - 'question':问题描述 - - 'index':题目序号 - - 'reference_answer':参考答案 - - 'evaluating_guidance':评估引导 - - 'capability':题目所属的能力维度。 + +将主观问题集以.xlsx格式存放在data/subjective/中。 + +表格包括以下字段: +- 'question':问题描述 +- 'index':题目序号 +- 'reference_answer':参考答案 +- 'evaluating_guidance':评估引导 +- 'capability':题目所属的能力维度。 ## 评测配置 具体流程包括: From 633be69d399f3e00b53aed9c86941c6fa420bc4c Mon Sep 17 00:00:00 2001 From: frankweijue Date: Fri, 27 Oct 2023 14:30:15 +0800 Subject: [PATCH 3/7] fix name --- configs/datasets/subjective_cmp/subjective_cmp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/datasets/subjective_cmp/subjective_cmp.py b/configs/datasets/subjective_cmp/subjective_cmp.py index c509be931..3c81f41cb 100644 --- a/configs/datasets/subjective_cmp/subjective_cmp.py +++ b/configs/datasets/subjective_cmp/subjective_cmp.py @@ -10,7 +10,7 @@ train_split='test') subjective_all_sets = [ - "sub_test", + "subjective_demo", ] subjective_datasets = [] From c0278964ce37ee163e38eaff424911d40113656e Mon Sep 17 00:00:00 2001 From: frankweijue Date: Fri, 27 Oct 2023 15:47:01 +0800 Subject: [PATCH 4/7] fix writing --- .../advanced_guides/subjective evaluation.md | 37 +++++++------- .../advanced_guides/subjective evaluation.md | 49 +++++++++---------- 2 files changed, 41 insertions(+), 45 deletions(-) diff --git a/docs/en/advanced_guides/subjective evaluation.md b/docs/en/advanced_guides/subjective evaluation.md index b09fd08ed..d9ca515af 100644 --- a/docs/en/advanced_guides/subjective evaluation.md +++ b/docs/en/advanced_guides/subjective evaluation.md @@ -14,7 +14,7 @@ We support the use of GPT-4 for the subjective evaluation of models based on thi We provide a demo test set [subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx) based on [z-bench](https://github.com/zhenbench/z-bench). -Store the set of subjective questions in .xlsx format in the data/subjective/directory. +Store the set of subjective questions in .xlsx format in the `data/subjective/directory`. The table includes the following fields: - 'question': Question description @@ -29,15 +29,15 @@ The specific process includes: 2. GPT-4 evaluation comparisons 3. Generating evaluation reports -For config/subjective.py, we provide some annotations to help users understand the configuration file's meaning. +For `config/subjective.py`, we provide some annotations to help users understand the configuration file's meaning. 
```python # Import datasets and subjective evaluation summarizer from mmengine.config import read_base with read_base(): - from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .datasets.subjective_cmp.subjective_cmp import subjective_datasets from .summarizers.subjective import summarizer -datasets = [*subjectivity_datasets] +datasets = [*subjective_datasets] from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI @@ -48,18 +48,8 @@ from opencompass.tasks.subjective_eval import SubjectiveEvalTask # Define model configurations for inference and evaluation -# Including chatglm2-6b, qwen-7b-chat, internlm-chat-7b and gpt4 -_meta_template = dict( - round=[ - dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict( - role="BOT", - begin="\n<|im_start|>assistant\n", - end='<|im_end|>', - generate=True), - ], ) - -... +# Including the inference models chatglm2-6b, qwen-7b-chat, internlm-chat-7b, and the evaluation model gpt4 +models = [...] api_meta_template = dict( round=[ @@ -100,11 +90,11 @@ eval = dict( ```shell python run.py config/subjective.py -r ``` -The ```-r``` parameter allows the reuse of model inference and GPT-4 evaluation results. +The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results. ## Evaluation Report -The evaluation report will be output to output/.../summary/timestamp/report.md, which includes win rate statistics, battle scores, and ELO ratings. The specific format is as follows: +The evaluation report will be output to `output/.../summary/timestamp/report.md`, which includes win rate statistics, battle scores, and ELO ratings. The specific format is as follows: ```markdown # Subjective Analysis A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) @@ -136,4 +126,13 @@ A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 r | elo_score [Mean] | 999.504 | 999.912 | 1000.26 | | elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | -``` \ No newline at end of file +``` +For comparing the evaluation of models A and B, there are four choices: +1. A is better than B. +2. A and B are equally good. +3. A is worse than B. +4. Neither A nor B is good. + +So, `win` / `tie` / `lose` / `not bad` represent the proportions of the model winning / tying / losing / winning or being equally good, respectively. + +`Bootstrap ELO` is calculated as the median ELO score by comparing match results through 1000 random permutations. 
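As an illustration of the statistics described above (the win / tie / lose / not-bad counts, the +3 / +1 / −1 / −3 model scores, and the Bootstrap ELO reported as the median over 1000 shuffles), a minimal, self-contained Python sketch is given below. It is not the code added by this patch: the toy judgment records, the choice to treat "neither is good" as an Elo tie, and the K-factor of 32 are assumptions made only for the example.

```python
import random
import statistics
from collections import Counter

# Toy judgment records: one entry per pairwise comparison extracted from the
# judge model's reply. "outcome" is for model_a vs. model_b and is one of
# "win", "tie" (both good), "both_bad" (neither good), "lose".
judgements = [
    {"model_a": "chatglm2-6b-hf", "model_b": "qwen-7b-chat-hf", "outcome": "lose"},
    {"model_a": "chatglm2-6b-hf", "model_b": "internlm-chat-7b-hf", "outcome": "tie"},
    {"model_a": "qwen-7b-chat-hf", "model_b": "internlm-chat-7b-hf", "outcome": "win"},
]

# Model scores as in the report: base 0, win +3, both good +1, neither good -1, lose -3.
SCORE = {"win": (3, -3), "tie": (1, 1), "both_bad": (-1, -1), "lose": (-3, 3)}

scores = Counter()
for j in judgements:
    sa, sb = SCORE[j["outcome"]]
    scores[j["model_a"]] += sa
    scores[j["model_b"]] += sb

def run_elo(matches, k=32, base=1000.0):
    """Sequential Elo over (model_a, model_b, result) triples, result in {1, 0.5, 0}."""
    rating = {}
    for a, b, result in matches:
        ra, rb = rating.get(a, base), rating.get(b, base)
        ea = 1.0 / (1.0 + 10 ** ((rb - ra) / 400.0))          # expected score of a
        rating[a] = ra + k * (result - ea)
        rating[b] = rb + k * ((1.0 - result) - (1.0 - ea))     # zero-sum update for b
    return rating

# Elo only distinguishes win / tie / lose; "neither is good" is mapped to a tie here
# (an assumption for this sketch).
RESULT = {"win": 1.0, "tie": 0.5, "both_bad": 0.5, "lose": 0.0}
matches = [(j["model_a"], j["model_b"], RESULT[j["outcome"]]) for j in judgements]

# Bootstrap: sequential Elo depends on match order, so shuffle the matches n times
# and report the median (and spread) of the resulting ratings per model.
n = 1000
samples = {m: [] for m in scores}
for _ in range(n):
    ratings = run_elo(random.sample(matches, len(matches)))
    for m in samples:
        samples[m].append(ratings.get(m, 1000.0))

for m in sorted(samples):
    print(m, scores[m], round(statistics.median(samples[m]), 3))
```

With only a few dozen comparisons, as in the demo report above, the bootstrap medians stay close to the base rating of 1000, which is consistent with the numbers shown in the ELO table.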
diff --git a/docs/zh_cn/advanced_guides/subjective evaluation.md b/docs/zh_cn/advanced_guides/subjective evaluation.md index 8e3f82a00..7f3ea5953 100644 --- a/docs/zh_cn/advanced_guides/subjective evaluation.md +++ b/docs/zh_cn/advanced_guides/subjective evaluation.md @@ -4,18 +4,17 @@ 主观评测旨在评估模型在符合人类偏好的能力上的表现。这种评估的黄金准则是人类喜好,但标注成本很高。 -为了探究模型的主观能力,我们采用了最先进的LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 +为了探究模型的主观能力,我们采用了最先进的 LLM(GPT-4)作为人类评估者的替代品([LLM-as-a-Judge](https://arxiv.org/abs/2306.05685))。 流行的评估方法是将模型的回答进行两两比较,以计算其胜率([Chatbot Arena](https://chat.lmsys.org/))。 -我们基于这一方法支持了GPT4用于模型的主观能力评估。 +我们基于这一方法支持了 GPT4 用于模型的主观能力评估。 ## 数据准备 -我们提供了一个基于[z-bench](https://github.com/zhenbench/z-bench)的demo测试集:[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx -)。 +我们提供了一个基于 [z-bench](https://github.com/zhenbench/z-bench) 的 demo 测试集:[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx)。 -将主观问题集以.xlsx格式存放在data/subjective/中。 +将主观问题集以.xlsx格式存放在 `data/subjective/` 中。 表格包括以下字段: - 'question':问题描述 @@ -27,40 +26,30 @@ ## 评测配置 具体流程包括: 1. 模型回答的推理 -2. GPT4评估比较对 +2. GPT4 评估比较对 3. 生成评测报告 -对于 config/subjective.py,我们提供了部分注释,方便用户理解配置文件的含义。 +对于 `config/subjective.py`,我们提供了部分注释,方便用户理解配置文件的含义。 ```python -# 导入数据集与主观评测summarizer +# 导入数据集与主观评测 summarizer from mmengine.config import read_base with read_base(): - from .datasets.subjectivity_cmp.subjectivity_cmp import subjectivity_datasets + from .datasets.subjective_cmp.subjective_cmp import subjective_datasets from .summarizers.subjective import summarizer -datasets = [*subjectivity_datasets] +datasets = [*subjective_datasets] from opencompass.models import HuggingFaceCausalLM, HuggingFace, OpenAI -#导入主观评测所需partitioner与task +#导入主观评测所需 partitioner 与 task from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner from opencompass.runners import LocalRunner from opencompass.tasks.subjective_eval import SubjectiveEvalTask # 定义推理和评测所需模型配置 -# 包括chatglm2-6b,qwen-7b-chat,internlm-chat-7b,gpt4 -_meta_template = dict( - round=[ - dict(role="HUMAN", begin='\n<|im_start|>user\n', end='<|im_end|>'), - dict( - role="BOT", - begin="\n<|im_start|>assistant\n", - end='<|im_end|>', - generate=True), - ], ) - -... +# 包括推理模型 chatglm2-6b,qwen-7b-chat,internlm-chat-7b 和 评测模型 gpt4 +models = [...] api_meta_template = dict( round=[ @@ -101,11 +90,11 @@ eval = dict( ```shell python run.py config/subjective.py -r ``` -```-r``` 参数支持复用模型推理和GPT4评估结果。 +`-r` 参数支持复用模型推理和 GPT4 评估结果。 ## 评测报告 -评测报告会输出到output/.../summary/timestamp/report.md,包含胜率统计,对战分数与ELO。具体格式如下: +评测报告会输出到 `output/.../summary/timestamp/report.md` ,包含胜率统计,对战分数与ELO。具体格式如下: ```markdown # Subjective Analysis A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) @@ -137,4 +126,12 @@ A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 r | elo_score [Mean] | 999.504 | 999.912 | 1000.26 | | elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | -``` \ No newline at end of file +``` +对于评估模型 A 和 B的比较对,有四种选择: +1. A 比 B 好 +2. A 和 B 一样好 +3. A 比 B 差 +4. 
A 和 B 都不好 + +故 `win` / `tie` / `lose` / `not bad` 分别指模型 胜 / 平局 / 负 / 胜或一样好 的比例 。 +`Bootstrap ELO` 是通过对比赛结果进行1000次随机顺序,计算出 ELO 分数的中位数。 \ No newline at end of file From a35a75f75af3866ca17219d93999bfca6ac30a07 Mon Sep 17 00:00:00 2001 From: Leymore Date: Fri, 27 Oct 2023 16:12:45 +0800 Subject: [PATCH 5/7] update --- ...evaluation.md => subjective_evaluation.md} | 48 ++++++++++------ docs/en/index.rst | 1 + ...evaluation.md => subjective_evaluation.md} | 56 +++++++++++-------- docs/zh_cn/index.rst | 1 + 4 files changed, 66 insertions(+), 40 deletions(-) rename docs/en/advanced_guides/{subjective evaluation.md => subjective_evaluation.md} (72%) rename docs/zh_cn/advanced_guides/{subjective evaluation.md => subjective_evaluation.md} (65%) diff --git a/docs/en/advanced_guides/subjective evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md similarity index 72% rename from docs/en/advanced_guides/subjective evaluation.md rename to docs/en/advanced_guides/subjective_evaluation.md index d9ca515af..1ac2da643 100644 --- a/docs/en/advanced_guides/subjective evaluation.md +++ b/docs/en/advanced_guides/subjective_evaluation.md @@ -17,6 +17,7 @@ We provide a demo test set [subjective_demo.xlsx](https://opencompass.openxlab.s Store the set of subjective questions in .xlsx format in the `data/subjective/directory`. The table includes the following fields: + - 'question': Question description - 'index': Question number - 'reference_answer': Reference answer @@ -24,12 +25,15 @@ The table includes the following fields: - 'capability': The capability dimension of the question. ## Evaluation Configuration + The specific process includes: + 1. Model response reasoning 2. GPT-4 evaluation comparisons 3. Generating evaluation reports For `config/subjective.py`, we provide some annotations to help users understand the configuration file's meaning. + ```python # Import datasets and subjective evaluation summarizer from mmengine.config import read_base @@ -87,47 +91,55 @@ eval = dict( ``` ## Launching the Evaluation + ```shell python run.py config/subjective.py -r ``` + The `-r` parameter allows the reuse of model inference and GPT-4 evaluation results. ## Evaluation Report The evaluation report will be output to `output/.../summary/timestamp/report.md`, which includes win rate statistics, battle scores, and ELO ratings. 
The specific format is as follows: + ```markdown # Subjective Analysis + A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00% + ### Basic statistics (4 stats: win / tie / lose / not bad) -| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | -|-------------------------------------|-------------------------------|------------------------------|-------------------------------| -| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | -| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | -| LANG: EN | N/A | N/A | N/A | -| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +| --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- | +| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | ![Capabilities Dimension Classification Result](by_capa.png) ![Language Classification Result](by_lang.png) - ### Model scores (base score is 0, win +3, both +1, neither -1, lose -3) -| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | -|---------------------|------------------|-------------------|-----------------------| -| LANG: Overall | -8 | 0 | -8 | -| LANG: CN | -8 | 0 | -8 | -| LANG: EN | N/A | N/A | N/A | -| CAPA: common | -8 | 0 | -8 | -### Bootstrap ELO, Median of n=1000 times -| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | -|------------------|------------------|-----------------------|-------------------| -| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | -| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | +| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +| ----------------- | -------------- | --------------- | ------------------- | +| LANG: Overall | -8 | 0 | -8 | +| LANG: CN | -8 | 0 | -8 | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | -8 | 0 | -8 | + +### Bootstrap ELO, Median of n=1000 times + +| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | +| ---------------- | -------------- | ------------------- | --------------- | +| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | +| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | ``` + For comparing the evaluation of models A and B, there are four choices: + 1. A is better than B. 2. A and B are equally good. 3. A is worse than B. diff --git a/docs/en/index.rst b/docs/en/index.rst index d14c8308a..97e09e63f 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -64,6 +64,7 @@ We always welcome *PRs* and *Issues* for the betterment of OpenCompass. advanced_guides/multimodal_eval.md advanced_guides/prompt_attack.md advanced_guides/longeval.md + advanced_guides/subjective_evaluation.md .. _Tools: .. 
toctree:: diff --git a/docs/zh_cn/advanced_guides/subjective evaluation.md b/docs/zh_cn/advanced_guides/subjective_evaluation.md similarity index 65% rename from docs/zh_cn/advanced_guides/subjective evaluation.md rename to docs/zh_cn/advanced_guides/subjective_evaluation.md index 7f3ea5953..dcb32055f 100644 --- a/docs/zh_cn/advanced_guides/subjective evaluation.md +++ b/docs/zh_cn/advanced_guides/subjective_evaluation.md @@ -14,9 +14,10 @@ 我们提供了一个基于 [z-bench](https://github.com/zhenbench/z-bench) 的 demo 测试集:[subjective_demo.xlsx](https://opencompass.openxlab.space/utils/subjective_demo.xlsx)。 -将主观问题集以.xlsx格式存放在 `data/subjective/` 中。 +将主观问题集以.xlsx 格式存放在 `data/subjective/` 中。 表格包括以下字段: + - 'question':问题描述 - 'index':题目序号 - 'reference_answer':参考答案 @@ -24,12 +25,15 @@ - 'capability':题目所属的能力维度。 ## 评测配置 + 具体流程包括: + 1. 模型回答的推理 2. GPT4 评估比较对 3. 生成评测报告 对于 `config/subjective.py`,我们提供了部分注释,方便用户理解配置文件的含义。 + ```python # 导入数据集与主观评测 summarizer from mmengine.config import read_base @@ -87,51 +91,59 @@ eval = dict( ``` ## 启动评测 + ```shell python run.py config/subjective.py -r ``` + `-r` 参数支持复用模型推理和 GPT4 评估结果。 ## 评测报告 -评测报告会输出到 `output/.../summary/timestamp/report.md` ,包含胜率统计,对战分数与ELO。具体格式如下: +评测报告会输出到 `output/.../summary/timestamp/report.md` ,包含胜率统计,对战分数与 ELO。具体格式如下: + ```markdown # Subjective Analysis + A total of 30 comparisons, of which 30 comparisons are meaningful (A / B answers inconsistent) A total of 30 answer comparisons, successfully extracted 30 answers from GPT-4 replies, with an extraction success rate of 100.00% + ### Basic statistics (4 stats: win / tie / lose / not bad) -| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | -|-------------------------------------|-------------------------------|------------------------------|-------------------------------| -| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | -| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | -| LANG: EN | N/A | N/A | N/A | -| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| Dimension \ Stat [W / T / L / NB] | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +| --------------------------------- | ----------------------------- | ---------------------------- | ----------------------------- | +| LANG: Overall | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: CN | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | 30.0% / 40.0% / 30.0% / 30.0% | 50.0% / 0.0% / 50.0% / 50.0% | 30.0% / 40.0% / 30.0% / 30.0% | ![Capabilities Dimension Classification Result](by_capa.png) ![Language Classification Result](by_lang.png) - ### Model scores (base score is 0, win +3, both +1, neither -1, lose -3) -| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | -|---------------------|------------------|-------------------|-----------------------| -| LANG: Overall | -8 | 0 | -8 | -| LANG: CN | -8 | 0 | -8 | -| LANG: EN | N/A | N/A | N/A | -| CAPA: common | -8 | 0 | -8 | -### Bootstrap ELO, Median of n=1000 times -| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | -|------------------|------------------|-----------------------|-------------------| -| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | -| elo_score [Std] | 0.621362 | 0.400226 | 
0.694434 | +| Dimension \ Score | chatglm2-6b-hf | qwen-7b-chat-hf | internlm-chat-7b-hf | +| ----------------- | -------------- | --------------- | ------------------- | +| LANG: Overall | -8 | 0 | -8 | +| LANG: CN | -8 | 0 | -8 | +| LANG: EN | N/A | N/A | N/A | +| CAPA: common | -8 | 0 | -8 | + +### Bootstrap ELO, Median of n=1000 times + +| | chatglm2-6b-hf | internlm-chat-7b-hf | qwen-7b-chat-hf | +| ---------------- | -------------- | ------------------- | --------------- | +| elo_score [Mean] | 999.504 | 999.912 | 1000.26 | +| elo_score [Std] | 0.621362 | 0.400226 | 0.694434 | ``` -对于评估模型 A 和 B的比较对,有四种选择: + +对于评估模型 A 和 B 的比较对,有四种选择: + 1. A 比 B 好 2. A 和 B 一样好 3. A 比 B 差 4. A 和 B 都不好 故 `win` / `tie` / `lose` / `not bad` 分别指模型 胜 / 平局 / 负 / 胜或一样好 的比例 。 -`Bootstrap ELO` 是通过对比赛结果进行1000次随机顺序,计算出 ELO 分数的中位数。 \ No newline at end of file +`Bootstrap ELO` 是通过对比赛结果进行 1000 次随机顺序,计算出 ELO 分数的中位数。 diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index b08583db8..b099eebb7 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -64,6 +64,7 @@ OpenCompass 上手路线 advanced_guides/multimodal_eval.md advanced_guides/prompt_attack.md advanced_guides/longeval.md + advanced_guides/subjective_evaluation.md .. _工具: .. toctree:: From aac71632d61937b5da230c148429677ab0c2eea5 Mon Sep 17 00:00:00 2001 From: Leymore Date: Fri, 27 Oct 2023 16:15:30 +0800 Subject: [PATCH 6/7] update --- docs/en/advanced_guides/subjective_evaluation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/advanced_guides/subjective_evaluation.md b/docs/en/advanced_guides/subjective_evaluation.md index 1ac2da643..444099cc7 100644 --- a/docs/en/advanced_guides/subjective_evaluation.md +++ b/docs/en/advanced_guides/subjective_evaluation.md @@ -1,4 +1,4 @@ -# Subjective Evaluation Guide +# Subjective Evaluation Guidance ## Introduction From 0c879ef0223788b926b725bee128fe483697760f Mon Sep 17 00:00:00 2001 From: Leymore Date: Fri, 27 Oct 2023 16:18:21 +0800 Subject: [PATCH 7/7] update --- opencompass/datasets/subjective_cmp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/opencompass/datasets/subjective_cmp.py b/opencompass/datasets/subjective_cmp.py index 38cf7363c..cde91858e 100644 --- a/opencompass/datasets/subjective_cmp.py +++ b/opencompass/datasets/subjective_cmp.py @@ -191,8 +191,8 @@ def build_prompt(question, 'question', 'index', 'reference_answer', 'evaluating_guidance', 'capability' ], - output_column=None, - train_split='test') + output_column=None, + train_split='test') subjective_all_sets = [ 'sub_test',