
[Feature] add Compass arena #828

Merged: 12 commits, Jan 23, 2024
160 changes: 160 additions & 0 deletions configs/datasets/subjective/compassarena/compassarena_compare.py
@@ -0,0 +1,160 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import LMEvaluator
from opencompass.datasets import Compass_Arena

subjective_reader_cfg = dict(
    input_columns=['question', 'ref'],
    output_column='judge',
)

data_path = "data/subjective/"

subjective_datasets = []

# Shared tail of every judge prompt: it presents the two answers and asks the
# judge to choose A (answer 1 is better), B (answer 2 is better) or C (tie),
# followed by a reason, in a fixed "选择:X / 原因:..." output format.
base_prompt = """

[回答1开始]
{prediction}
[回答1结束]

[回答2开始]
{prediction2}
[回答2结束]

根据评分要求,在以下 3 个选项中做出选择:
A. 回答1更好
B. 回答2更好
C. 回答1、2平局
并提供你的解释原因。

如果你认为回答1更好,你的输出应形如:
选择:A
原因:blahblah blahblah\n

如果你认为回答2更好,你的输出应形如:
选择:B
原因:blahblah blahblah\n

如果你认为回答1、2打成平手,你的输出应形如:
选择:C
原因:blahblah blahblah\n
"""

# Knowledge subset: prefer the answer that matches the reference; then accurate
# extra detail; then a natural, conversational tone.
knowledge_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答能与参考答案吻合或表明参考答案的意思。
2. 在都准确答对问题的前提下,更好的回答能对知识点进行额外补充,且补充的知识准确无误。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。

[用户问题]
{question}

[参考答案]
{ref}
""" + base_prompt


# Language subset: closeness to the reference answer first, then fluency and
# conversational naturalness, then accurate additional content.
language_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 在有明确的参考答案的情况下,越贴近参考答案或表明了参考答案的意思的回答越好。
2. 更好的回答在语言表达上更流畅,更加符合与人类对话的习惯,包括语气、情调等
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误。

[用户问题]
{question}

[参考答案]
{ref}
""" + base_prompt


# Math subset: agreement with the reference answer first; if neither answer
# matches it, the sounder reasoning wins; then conversational tone.
math_prompt = """
请根据提供的 评分要求,用户问题,参考答案 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 更好的回答的答案能和参考答案一致。
2. 若两个回答的答案都与参考答案不一致,则更好的回答的推理过程应更加合理。
3. 更好的回答更加符合与人类对话的习惯,包括语气、情调等。

[用户问题]
{question}

[参考答案]
{ref}
""" + base_prompt

# The reasoning subset reuses the math judging criteria.
reason_prompt = math_prompt

# QA subset: factual correctness first, then logical coherence, then accurate
# additional content (this template does not use a reference answer).
qa_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先具有事实正确性,即除了想象的内容外,所引用或阐述的各种信息都是真实正确的
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答,且前后连贯,逻辑没有问题
3. 在都准确答对问题的前提下,更好的回答能进行额外补充,且补充的内容准确无误

[用户问题]
{question}
""" + base_prompt



# Creative-writing subset: meet the user's requirements without drifting
# off-topic, stay coherent, and show expressive, creative wording.
creation_prompt = """
请根据提供的 评分要求,用户问题 以及 相应的两个回答(回答1,回答2),判断两个回答中哪一个更好。
评分要求(重要性依次递减):
1. 好的回答必须首先符合用户问题里的各种需求,不能跑题
2. 好的回答必须具有逻辑连贯性,围绕一个中心进行回答
3. 好的回答必须具有创造性的词语和表达丰富度

[用户问题]
{question}
""" + base_prompt


subjective_all_sets = ["knowledge", "language", "math", "reason", "qa", "creationv2_zh"]
prompt_all_sets = [knowledge_prompt, language_prompt, math_prompt, reason_prompt, qa_prompt, creation_prompt]

for _name, _prompt in zip(subjective_all_sets, prompt_all_sets):
    subjective_infer_cfg = dict(
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(round=[
                dict(
                    role='HUMAN',
                    prompt="{question}"
                ),
            ]),
        ),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_seq_len=4096, max_out_len=2048),
    )

    subjective_eval_cfg = dict(
        evaluator=dict(
            type=LMEvaluator,
            infer_order='double',
            prompt_template=dict(
                type=PromptTemplate,
                template=dict(round=[
                    dict(
                        role='HUMAN',
                        prompt=_prompt
                    ),
                ]),
            ),
        ),
        pred_role="BOT",
    )

    subjective_datasets.append(
        dict(
            abbr=f"{_name}",
            type=Compass_Arena,
            path=data_path,
            name=_name,
            reader_cfg=subjective_reader_cfg,
            infer_cfg=subjective_infer_cfg,
            eval_cfg=subjective_eval_cfg
        ))
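
Each of the six judge prompts above ends with base_prompt, which forces a fixed reply format: a line starting with "选择:" ("Choice:") giving A, B or C, followed by "原因:" ("Reason:"). As an illustration only — the helper below is hypothetical and not part of this PR or of Compassarena_Summarizer — such a verdict could be extracted like this:

import re

def parse_verdict(judge_output: str) -> str:
    """Return 'A', 'B' or 'C' from a reply of the form '选择:A 原因:...'."""
    match = re.search(r'选择\s*[::]\s*([ABC])', judge_output)
    return match.group(1) if match else 'unknown'

# parse_verdict('选择:B\n原因:回答2更贴近参考答案')  ->  'B'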
95 changes: 95 additions & 0 deletions configs/eval_subjective_compassarena.py
@@ -0,0 +1,95 @@
from mmengine.config import read_base
with read_base():
from .models.chatglm.hf_chatglm3_6b_32k import models as chatglm3_6b_32k_model
from .models.yi.hf_yi_6b_chat import models as yi_6b_chat_model
from .datasets.subjective.compassarena.compassarena_compare import subjective_datasets

from opencompass.models import HuggingFaceCausalLM, HuggingFace, HuggingFaceChatGLM3, OpenAI
from opencompass.models.openai_api import OpenAIAllesAPIN
from opencompass.partitioners import NaivePartitioner, SizePartitioner
from opencompass.partitioners.sub_naive import SubjectiveNaivePartitioner
from opencompass.partitioners.sub_size import SubjectiveSizePartitioner
from opencompass.runners import LocalRunner
from opencompass.runners import SlurmSequentialRunner
from opencompass.tasks import OpenICLInferTask
from opencompass.tasks.subjective_eval import SubjectiveEvalTask
from opencompass.summarizers import Compassarena_Summarizer

infer = dict(
    # partitioner=dict(type=NaivePartitioner),
    partitioner=dict(type=SizePartitioner, max_task_size=10000),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llm_dev2',
        quotatype='auto',
        max_num_workers=256,
        task=dict(type=OpenICLInferTask)),
)

api_meta_template = dict(
    round=[
        dict(role='HUMAN', api_role='HUMAN'),
        dict(role='BOT', api_role='BOT', generate=True),
    ]
)

gpt4 = dict(
    abbr='gpt4-turbo',
    type=OpenAI, path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=2048,
    max_seq_len=4096,
    batch_size=4,
    retry=20,
    temperature=1,
)
models = [*chatglm3_6b_32k_model, *yi_6b_chat_model]
datasets = [*subjective_datasets]



work_dir = 'outputs/compass_arena/'

# ------------- Inference Stage ----------------------------------------

judge_model = dict(
    abbr='GPT4-Turbo',
    type=OpenAI, path='gpt-4-1106-preview',
    key='',  # The key will be obtained from $OPENAI_API_KEY, but you can write down your key here as well
    meta_template=api_meta_template,
    query_per_second=1,
    max_out_len=1024,
    max_seq_len=4096,
    batch_size=2,
    retry=20,
    temperature=0,
)
## ------------- Evaluation Configuration
eval = dict(
    partitioner=dict(
        type=SubjectiveSizePartitioner,
        strategy='split',
        max_task_size=10000,
        mode='m2n',
        base_models=[gpt4],
        compare_models=[*chatglm3_6b_32k_model, *yi_6b_chat_model],
    ),
    runner=dict(
        type=SlurmSequentialRunner,
        partition='llm_dev2',
        quotatype='auto',
        max_num_workers=32,
        task=dict(
            type=SubjectiveEvalTask,
            judge_cfg=judge_model
        )),
)


summarizer = dict(
    type=Compassarena_Summarizer
)
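
With mode='m2n', every model in compare_models is judged against every model in base_models, and infer_order='double' (set in the dataset config above) is meant to have each comparison judged in both answer orders to reduce position bias. A rough illustration of the pairing only — not the partitioner's actual code, and the model abbreviations below are assumed:

from itertools import product

base = ['gpt4-turbo']                        # abbr of the base model above
compare = ['chatglm3-6b-32k', 'yi-6b-chat']  # assumed abbrs of the compare models
pairs = list(product(base, compare))
# -> [('gpt4-turbo', 'chatglm3-6b-32k'), ('gpt4-turbo', 'yi-6b-chat')]

The run would then typically be launched through OpenCompass's entry script, e.g. python run.py configs/eval_subjective_compassarena.py, with OPENAI_API_KEY exported for the GPT-4 judge.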
1 change: 1 addition & 0 deletions opencompass/datasets/subjective/__init__.py
@@ -1,4 +1,5 @@
from .alignbench import AlignmentBenchDataset # noqa: F401, F403
from .compass_arena import Compass_Arena # noqa: F401, F403
from .corev2 import Corev2Dataset # noqa: F401, F403
from .creationbench import CreationBenchDataset # noqa: F401, F403
from .information_retrival import IRDataset # noqa: F401, F403
28 changes: 28 additions & 0 deletions opencompass/datasets/subjective/compass_arena.py
@@ -0,0 +1,28 @@
from datasets import Dataset

from opencompass.registry import LOAD_DATASET

from .subjective_cmp import SubjectiveCmpDataset


@LOAD_DATASET.register_module()
class Compass_Arena(SubjectiveCmpDataset):

    def load(
        self,
        path: str,
        name: str,
    ):
        dataset = list(super().load(path, name))
        creation_dataset = []
        for data in dataset:
            # Use the annotated reference answer when one is provided; otherwise
            # fall back to a generic criterion ("any answer that satisfies the
            # user's need and is well-reasoned is acceptable").
            if data['others'].get('reference') is not None:
                data['ref'] = data['others']['reference']
            else:
                data['ref'] = '满足用户需求,言之有理即可'
            creation_dataset.append(data)
        dataset = Dataset.from_list(creation_dataset)
        return dataset
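
To make the fallback above concrete, a minimal illustration with hypothetical records shaped like what SubjectiveCmpDataset.load() yields (not part of the PR):

with_ref = {'question': '...', 'others': {'reference': '参考答案'}}
without_ref = {'question': '...', 'others': {}}

for item in (with_ref, without_ref):
    ref = item['others'].get('reference')
    item['ref'] = ref if ref is not None else '满足用户需求,言之有理即可'

# with_ref['ref']    -> '参考答案'
# without_ref['ref'] -> '满足用户需求,言之有理即可'
#   ("any answer that satisfies the user's need and is well-reasoned is acceptable")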
3 changes: 2 additions & 1 deletion opencompass/datasets/subjective/subjective_cmp.py
@@ -26,7 +26,8 @@ def load(self, path: str, name: str):
                 'capability': capability,
                 'others': others,
                 'judge': {
-                    'capability': capability
+                    'capability': capability,
+                    'question': question
                 }
             })
         dataset = Dataset.from_list(raw_data)
1 change: 1 addition & 0 deletions opencompass/summarizers/subjective/__init__.py
@@ -1,5 +1,6 @@
# flake8: noqa: F401, E501
from .alignmentbench import AlignmentBenchSummarizer
from .compass_arena import Compassarena_Summarizer
from .corev2 import Corev2Summarizer
from .creationbench import CreationBenchSummarizer
from .information_retrival import IRSummarizer