[Feature] Add huggingface apply_chat_template (#1098)
* add TheoremQA with 5-shot

* add huggingface_above_v4_33 classes

* use num_worker partitioner in cli

* update theoremqa

* update TheoremQA

* add TheoremQA

* rename theoremqa -> TheoremQA

* update TheoremQA output path

* rewrite many model configs

* update huggingface

* further update

* refine configs

* update configs

* update configs

* add configs/eval_llama3_instruct.py

* add summarizer multi faceted

* update bbh datasets

* update configs/models/hf_llama/lmdeploy_llama3_8b_instruct.py

* rename class

* update readme

* update hf above v4.33
Leymore committed May 14, 2024
1 parent 6c711cb commit 7505b3c
Showing 186 changed files with 1,949 additions and 2,912 deletions.
11 changes: 1 addition & 10 deletions README.md
@@ -162,20 +162,11 @@ python tools/list_configs.py llama mmlu
You can also evaluate other HuggingFace models via the command line. Taking LLaMA-7b as an example:

```bash
python run.py --datasets ceval_ppl mmlu_ppl \
--hf-path huggyllama/llama-7b \ # HuggingFace model path
--model-kwargs device_map='auto' \ # Arguments for model construction
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
--max-out-len 100 \ # Maximum number of tokens generated
--max-seq-len 2048 \ # Maximum sequence length the model can accept
--batch-size 8 \ # Batch size
--no-batch-padding \  # Disable batch padding; infer in a for loop to avoid performance loss
--num-gpus 1  # Minimum number of GPUs required
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
```

> \[!TIP\]
>
> To run the command above, you will need to remove the comments starting with `# ` first.
> Configurations with `_ppl` are typically designed for base models.
> Configurations with `_gen` can be used for both base models and chat models.
13 changes: 1 addition & 12 deletions README_zh-CN.md
@@ -163,20 +163,9 @@ python tools/list_configs.py llama mmlu
You can also evaluate other HuggingFace models from the command line. Again taking LLaMA-7b as an example:

```bash
python run.py --datasets ceval_ppl mmlu_ppl \
--hf-path huggyllama/llama-7b \ # HuggingFace model path
--model-kwargs device_map='auto' \ # Arguments for model construction
--tokenizer-kwargs padding_side='left' truncation='left' use_fast=False \ # Arguments for tokenizer construction
--max-out-len 100 \ # Maximum number of generated tokens
--max-seq-len 2048 \ # Maximum sequence length the model can accept
--batch-size 8 \ # Batch size
--no-batch-padding \ # Disable batch padding; infer in a for loop to avoid accuracy loss
--num-gpus 1 # Minimum number of GPUs required to run the model
python run.py --datasets ceval_ppl mmlu_ppl --hf-type base --hf-path huggyllama/llama-7b
```

> **Note**<br />
> To run the command above, you need to remove all comments starting with `# ` first.
Through the command line or configuration files, OpenCompass also supports evaluating APIs and custom models, as well as more diverse evaluation strategies. Please read the [Quick Start](https://opencompass.readthedocs.io/zh_CN/latest/get_started/quick_start.html) to learn how to run an evaluation task.

For more tutorials, please see our [documentation](https://opencompass.readthedocs.io/zh_CN/latest/index.html).
22 changes: 22 additions & 0 deletions configs/dataset_collections/chat_OC15.py
@@ -0,0 +1,22 @@
from mmengine.config import read_base

with read_base():
    from ..datasets.mmlu.mmlu_gen_4d595a import mmlu_datasets
    from ..datasets.cmmlu.cmmlu_gen_c13365 import cmmlu_datasets
    from ..datasets.ceval.ceval_gen_5f30c7 import ceval_datasets
    from ..datasets.GaokaoBench.GaokaoBench_no_subjective_gen_4c31db import GaokaoBench_datasets
    from ..datasets.triviaqa.triviaqa_wiki_1shot_gen_eaf81e import triviaqa_datasets
    from ..datasets.nq.nq_open_1shot_gen_01cf41 import nq_datasets
    from ..datasets.race.race_gen_69ee4f import race_datasets
    from ..datasets.winogrande.winogrande_5shot_gen_b36770 import winogrande_datasets
    from ..datasets.hellaswag.hellaswag_10shot_gen_e42710 import hellaswag_datasets
    from ..datasets.bbh.bbh_gen_2879b0 import bbh_datasets
    from ..datasets.gsm8k.gsm8k_gen_1d7fe4 import gsm8k_datasets
    from ..datasets.math.math_0shot_gen_393424 import math_datasets
    from ..datasets.TheoremQA.TheoremQA_5shot_gen_6f0af8 import TheoremQA_datasets
    from ..datasets.humaneval.humaneval_gen_8e312c import humaneval_datasets
    from ..datasets.mbpp.sanitized_mbpp_gen_830460 import sanitized_mbpp_datasets
    from ..datasets.gpqa.gpqa_gen_4baadb import gpqa_datasets
    from ..datasets.IFEval.IFEval_gen_3321a3 import ifeval_datasets

datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
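
The final line relies on a Python idiom worth spelling out: every import under `with read_base()` drops a `*_datasets` list into the module namespace, and `sum(..., [])` concatenates those lists into one flat `datasets` list. A minimal, self-contained sketch with stand-in entries (not the real configs):

```python
# Stand-ins for the lists that the read_base() imports would provide.
mmlu_datasets = [dict(abbr="mmlu")]
gsm8k_datasets = [dict(abbr="gsm8k")]

# sum() with an empty-list start value concatenates every module-level
# list whose name ends in "_datasets" into one flat list.
datasets = sum((v for k, v in locals().items() if k.endswith("_datasets")), [])
assert [d["abbr"] for d in datasets] == ["mmlu", "gsm8k"]
```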
46 changes: 0 additions & 46 deletions configs/datasets/TheoremQA/TheoremQA_5shot_gen_a4f581.py

This file was deleted.

56 changes: 56 additions & 0 deletions configs/datasets/bbh/bbh_gen_2879b0.py
@@ -0,0 +1,56 @@
import os
from mmengine.config import read_base
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.datasets import BBHDataset, bbh_mcq_postprocess, BBHEvaluator, BBHEvaluator_mcq

with read_base():
    from .bbh_subset_settings import settings

bbh_datasets = []
for name, test_type in settings:
    with open(os.path.join(os.path.dirname(__file__), 'lib_prompt', f'{name}.txt'), 'r') as f:
        hint = f.read()

    task_prompt, body = hint.split('\n\nQ:', 1)
    sections = ('Q:' + body).split('\n\n')
    prompt_rounds = []
    for index, section in enumerate(sections):
        question, answer = section.split('\nA:')
        answer = 'A:' + answer
        if index == 0:
            desc = task_prompt.strip() + '\n'
        else:
            desc = ''
        prompt_rounds.append(dict(role="HUMAN", prompt=f"{desc}{question.strip()}"))
        prompt_rounds.append(dict(role="BOT", prompt=answer.strip()))
    prompt_rounds.append(dict(role="HUMAN", prompt="Q: {input}"))

    bbh_reader_cfg = dict(input_columns=["input"], output_column="target")

    bbh_infer_cfg = dict(
        prompt_template=dict(type=PromptTemplate, template=dict(round=prompt_rounds)),
        retriever=dict(type=ZeroRetriever),
        inferencer=dict(type=GenInferencer, max_out_len=512))

    if test_type == 'mcq':
        bbh_eval_cfg = dict(
            evaluator=dict(type=BBHEvaluator_mcq),
            pred_role="BOT",
            pred_postprocessor=dict(type=bbh_mcq_postprocess),
            dataset_postprocessor=dict(type=bbh_mcq_postprocess))
    else:
        bbh_eval_cfg = dict(
            evaluator=dict(type=BBHEvaluator),
            pred_role="BOT")

    bbh_datasets.append(
        dict(
            type=BBHDataset,
            path="./data/BBH/data",
            name=name,
            abbr='bbh-' + name,
            reader_cfg=bbh_reader_cfg.copy(),
            infer_cfg=bbh_infer_cfg.copy(),
            eval_cfg=bbh_eval_cfg.copy()))
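
To make the parsing above concrete, here is a toy run of the same split logic on a miniature hint file (hypothetical contents; the real prompts live in `lib_prompt/{name}.txt`): the task description is attached to the first question, each `Q:`/`A:` pair becomes a HUMAN/BOT round, and a final HUMAN round carries the `{input}` placeholder.

```python
hint = (
    "Sort the given words alphabetically.\n\n"
    "Q: banana apple\nA: apple banana\n\n"
    "Q: dog cat\nA: cat dog"
)

task_prompt, body = hint.split('\n\nQ:', 1)
sections = ('Q:' + body).split('\n\n')

rounds = []
for index, section in enumerate(sections):
    question, answer = section.split('\nA:')
    desc = task_prompt.strip() + '\n' if index == 0 else ''
    rounds.append(dict(role="HUMAN", prompt=f"{desc}{question.strip()}"))
    rounds.append(dict(role="BOT", prompt=('A:' + answer).strip()))
rounds.append(dict(role="HUMAN", prompt="Q: {input}"))

# The first HUMAN round carries the task description; later rounds do not.
assert rounds[0]["prompt"] == "Sort the given words alphabetically.\nQ: banana apple"
assert rounds[1]["prompt"] == "A: apple banana"
```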
29 changes: 29 additions & 0 deletions configs/datasets/bbh/bbh_subset_settings.py
@@ -0,0 +1,29 @@
settings = [
    ('temporal_sequences', 'mcq'),
    ('disambiguation_qa', 'mcq'),
    ('date_understanding', 'mcq'),
    ('tracking_shuffled_objects_three_objects', 'mcq'),
    ('penguins_in_a_table', 'mcq'),
    ('geometric_shapes', 'mcq'),
    ('snarks', 'mcq'),
    ('ruin_names', 'mcq'),
    ('tracking_shuffled_objects_seven_objects', 'mcq'),
    ('tracking_shuffled_objects_five_objects', 'mcq'),
    ('logical_deduction_three_objects', 'mcq'),
    ('hyperbaton', 'mcq'),
    ('logical_deduction_five_objects', 'mcq'),
    ('logical_deduction_seven_objects', 'mcq'),
    ('movie_recommendation', 'mcq'),
    ('salient_translation_error_detection', 'mcq'),
    ('reasoning_about_colored_objects', 'mcq'),
    ('multistep_arithmetic_two', 'free_form'),
    ('navigate', 'free_form'),
    ('dyck_languages', 'free_form'),
    ('word_sorting', 'free_form'),
    ('sports_understanding', 'free_form'),
    ('boolean_expressions', 'free_form'),
    ('object_counting', 'free_form'),
    ('formal_fallacies', 'free_form'),
    ('causal_judgement', 'free_form'),
    ('web_of_lies', 'free_form'),
]
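
This pairing is what drives the evaluator choice in `bbh_gen_2879b0.py` above. As a quick sanity check (a sketch, runnable next to the list), the 27 BBH subsets split as follows:

```python
mcq = [name for name, kind in settings if kind == 'mcq']
free_form = [name for name, kind in settings if kind == 'free_form']

# 17 multiple-choice subsets and 10 free-form subsets, 27 in total.
assert len(mcq) == 17 and len(free_form) == 10
```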
2 changes: 1 addition & 1 deletion configs/datasets/collections/chat_medium.py
@@ -47,7 +47,7 @@
from ..piqa.piqa_gen_1194eb import piqa_datasets
from ..siqa.siqa_gen_e78df3 import siqa_datasets
from ..strategyqa.strategyqa_gen_1180a7 import strategyqa_datasets
from ..winogrande.winogrande_gen_a9ede5 import winogrande_datasets
from ..winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
from ..obqa.obqa_gen_9069e4 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
2 changes: 1 addition & 1 deletion configs/datasets/collections/chat_small.py
@@ -31,7 +31,7 @@
from ..summedits.summedits_gen_315438 import summedits_datasets
from ..hellaswag.hellaswag_gen_6faab5 import hellaswag_datasets
from ..piqa.piqa_gen_1194eb import piqa_datasets
from ..winogrande.winogrande_gen_a9ede5 import winogrande_datasets
from ..winogrande.deprecated_winogrande_gen_a9ede5 import winogrande_datasets
from ..obqa.obqa_gen_9069e4 import obqa_datasets
from ..nq.nq_gen_c788f6 import nq_datasets
from ..triviaqa.triviaqa_gen_2121ce import triviaqa_datasets
46 changes: 46 additions & 0 deletions configs/datasets/winogrande/winogrande_5shot_gen_b36770.py
@@ -0,0 +1,46 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import FixKRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V3
from opencompass.utils.text_postprocessors import first_option_postprocess

winogrande_reader_cfg = dict(
    input_columns=["prompt", "only_option1", "only_option2"],
    output_column="answer",
    train_split="train_xs",
    test_split="dev",
)

winogrande_infer_cfg = dict(
    ice_template=dict(
        type=PromptTemplate,
        template=dict(
            begin="</E>",
            round=[
                dict(role="HUMAN", prompt="Question: {prompt}\nA. {only_option1}\nB. {only_option2}\nAnswer:"),
                dict(role="BOT", prompt="{answer}"),
            ],
        ),
        ice_token="</E>",
    ),
    retriever=dict(type=FixKRetriever, fix_id_list=[0, 2, 4, 6, 8]),
    inferencer=dict(type=GenInferencer),
)

winogrande_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options="AB"),
)

winogrande_datasets = [
    dict(
        abbr="winogrande",
        type=winograndeDataset_V3,
        path="./data/winogrande",
        reader_cfg=winogrande_reader_cfg,
        infer_cfg=winogrande_infer_cfg,
        eval_cfg=winogrande_eval_cfg,
    )
]
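
For intuition, here is a rough sketch (not OpenCompass internals) of what this config renders: `FixKRetriever` takes the `train_xs` rows at the fixed indices `[0, 2, 4, 6, 8]`, formats each with the HUMAN/BOT rounds, and the concatenation replaces the `</E>` ice_token ahead of the live question.

```python
fix_id_list = [0, 2, 4, 6, 8]

# Stand-in rows; the real ones come from the train_xs split of ./data/winogrande.
train_xs = [
    dict(prompt="The trophy doesn't fit in the suitcase because _ is too big.",
         only_option1="trophy", only_option2="suitcase", answer="A"),
] * 9

# Each fixed example is rendered in the same Question/A/B/Answer shape.
ice = "".join(
    f"Question: {r['prompt']}\nA. {r['only_option1']}\nB. {r['only_option2']}\nAnswer: {r['answer']}\n"
    for r in (train_xs[i] for i in fix_id_list)
)

# The five formatted examples precede the live (unanswered) question.
prompt = ice + "Question: {prompt}\nA. {only_option1}\nB. {only_option2}\nAnswer:"
```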
2 changes: 1 addition & 1 deletion configs/datasets/winogrande/winogrande_gen.py
@@ -1,4 +1,4 @@
from mmengine.config import read_base

with read_base():
    from .winogrande_gen_a9ede5 import winogrande_datasets  # noqa: F401, F403
    from .winogrande_gen_458220 import winogrande_datasets  # noqa: F401, F403
41 changes: 41 additions & 0 deletions configs/datasets/winogrande/winogrande_gen_458220.py
@@ -0,0 +1,41 @@
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_evaluator import AccEvaluator
from opencompass.datasets import winograndeDataset_V2
from opencompass.utils.text_postprocessors import first_option_postprocess

winogrande_reader_cfg = dict(
    input_columns=["prompt", "only_option1", "only_option2"],
    output_column="answer",
)

winogrande_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role="HUMAN", prompt="Question: {prompt}\nA. {only_option1}\nB. {only_option2}\nAnswer:"),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

winogrande_eval_cfg = dict(
    evaluator=dict(type=AccEvaluator),
    pred_role="BOT",
    pred_postprocessor=dict(type=first_option_postprocess, options='AB'),
)

winogrande_datasets = [
    dict(
        abbr="winogrande",
        type=winograndeDataset_V2,
        path='./data/winogrande',
        reader_cfg=winogrande_reader_cfg,
        infer_cfg=winogrande_infer_cfg,
        eval_cfg=winogrande_eval_cfg,
    )
]
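
Both winogrande configs score with `first_option_postprocess` over options `AB`. As a rough approximation of what such a postprocessor does (the real implementation in `opencompass.utils.text_postprocessors` handles many more phrasings), it pulls the first standalone option letter out of the model's free-form reply:

```python
import re

def first_option(text: str, options: str = "AB") -> str:
    """Approximation only: return the first standalone option letter, or ''."""
    match = re.search(rf"\b([{options}])\b", text)
    return match.group(1) if match else ""

assert first_option("The answer is B, since the trophy is too big.") == "B"
assert first_option("no option mentioned") == ""
```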
