diff --git "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" index ffc4f2afac..9b80691ce0 100644 --- "a/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" +++ "b/docs/source/Instruction/\345\221\275\344\273\244\350\241\214\345\217\202\346\225\260.md" @@ -393,6 +393,7 @@ App参数继承于[部署参数](#部署参数), [Web-UI参数](#Web-UI参数) 评测参数继承于[部署参数](#部署参数) +- 🔥eval_backend: 评测后端,默认为Native,也可以指定为OpenCompass或VLMEvalKit - 🔥eval_dataset: 评测数据集,请查看[评测文档](./评测.md) - eval_limit: 每个评测集的采样数,默认为None - eval_output_dir: 评测存储结果的文件夹,默认为'eval_output' diff --git "a/docs/source/Instruction/\350\257\204\346\265\213.md" "b/docs/source/Instruction/\350\257\204\346\265\213.md" index a193e26f69..1431e46be2 100644 --- "a/docs/source/Instruction/\350\257\204\346\265\213.md" +++ "b/docs/source/Instruction/\350\257\204\346\265\213.md" @@ -6,40 +6,54 @@ SWIFT支持了eval(评测)能力,用于对原始模型和训练后的模 SWIFT的eval能力使用了魔搭社区[评测框架EvalScope](https://github.com/modelscope/eval-scope),并进行了高级封装以支持各类模型的评测需求。 -目前我们支持了**标准评测集**的评测流程,以及**用户自定义**评测集的评测流程。其中**标准评测集**包含: - -> 注意:EvalScope支持许多其他的复杂能力,例如模型的性能评测,请直接使用EvalScope框架。 - -纯文本评测: -```text -'obqa', 'cmb', 'AX_b', 'siqa', 'nq', 'mbpp', 'winogrande', 'mmlu', 'BoolQ', 'cluewsc', 'ocnli', 'lambada', -'CMRC', 'ceval', 'csl', 'cmnli', 'bbh', 'ReCoRD', 'math', 'humaneval', 'eprstmt', 'WSC', 'storycloze', -'MultiRC', 'RTE', 'chid', 'gsm8k', 'AX_g', 'bustm', 'afqmc', 'piqa', 'lcsts', 'strategyqa', 'Xsum', 'agieval', -'ocnli_fc', 'C3', 'tnews', 'race', 'triviaqa', 'CB', 'WiC', 'hellaswag', 'summedits', 'GaokaoBench', -'ARC_e', 'COPA', 'ARC_c', 'DRCD' -``` -数据集的具体介绍可以查看:https://hub.opencompass.org.cn/home - -多模态评测: -```text -'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', -'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', -'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', -'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', -'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', -'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'DocVQA_VAL', -'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', 'MathVision_MINI', -'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', 'MTVQA_TEST', -'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', 'VCR_EN_HARD_500', -'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', 'VCR_ZH_EASY_ALL', -'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', 'Video-MME', -'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench', 'MMBench_CN', -'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_CN_V11', 'MMBench_V11', -'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', 'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', -'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', -'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA', 'MLLMGuard_DS', 'BLINK' -``` -数据集的具体介绍可以查看:https://github.com/open-compass/VLMEvalKit +> 
注意:EvalScope支持许多其他的复杂能力,例如[模型的性能评测](https://evalscope.readthedocs.io/zh-cn/latest/user_guides/stress_test/quick_start.html),请直接使用EvalScope框架。 + +目前我们支持了**标准评测集**的评测流程,以及**用户自定义**评测集的评测流程。其中**标准评测集**由三个评测后端提供支持: + +下面展示所支持的数据集名称,若需了解数据集的详细信息,请参考[所有支持的数据集](https://evalscope.readthedocs.io/zh-cn/latest/get_started/supported_dataset.html) + +1. Native(默认): + + 主要支持纯文本评测,同时**支持**评测结果可视化 + ```text + 'arc', 'bbh', 'ceval', 'cmmlu', 'competition_math', + 'general_qa', 'gpqa', 'gsm8k', 'hellaswag', 'humaneval', + 'ifeval', 'iquiz', 'mmlu', 'mmlu_pro', + 'race', 'trivia_qa', 'truthful_qa' + ``` + +2. OpenCompass: + + 主要支持纯文本评测,暂**不支持**评测结果可视化 + ```text + 'obqa', 'cmb', 'AX_b', 'siqa', 'nq', 'mbpp', 'winogrande', 'mmlu', 'BoolQ', 'cluewsc', 'ocnli', 'lambada', + 'CMRC', 'ceval', 'csl', 'cmnli', 'bbh', 'ReCoRD', 'math', 'humaneval', 'eprstmt', 'WSC', 'storycloze', + 'MultiRC', 'RTE', 'chid', 'gsm8k', 'AX_g', 'bustm', 'afqmc', 'piqa', 'lcsts', 'strategyqa', 'Xsum', 'agieval', + 'ocnli_fc', 'C3', 'tnews', 'race', 'triviaqa', 'CB', 'WiC', 'hellaswag', 'summedits', 'GaokaoBench', + 'ARC_e', 'COPA', 'ARC_c', 'DRCD' + ``` + +3. VLMEvalKit: + + 主要支持多模态评测,暂**不支持**评测结果可视化 + ```text + 'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', + 'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', + 'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', + 'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', + 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', + 'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'DocVQA_VAL', + 'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', 'MathVision_MINI', + 'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', 'MTVQA_TEST', + 'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', 'VCR_EN_HARD_500', + 'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', 'VCR_ZH_EASY_ALL', + 'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', 'Video-MME', + 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench', 'MMBench_CN', + 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_CN_V11', 'MMBench_V11', + 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', 'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', + 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', + 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA', 'MLLMGuard_DS', 'BLINK' + ``` ## 环境准备 @@ -59,9 +73,23 @@ pip install -e '.[eval]' 支持纯文本评测、多模态评测、url评测、自定义数据集评测四种方式 -评测的样例可以参考[examples](https://github.com/modelscope/ms-swift/tree/main/examples/eval) +**基本示例** + +```shell +swift eval \ + --model Qwen/Qwen2.5-0.5B-Instruct \ + --eval_backend Native \ + --infer_backend pt \ + --eval_limit 10 \ + --eval_dataset gsm8k +``` +其中: +- eval_backend: 可选 Native, OpenCompass, VLMEvalKit +- infer_backend: 可选 pt, vllm, lmdeploy + +具体评测的参数列表可以参考[这里](命令行参数.md#评测参数)。 -评测的参数列表可以参考[这里](命令行参数.md#评测参数)。 +更多评测的样例可以参考[examples](https://github.com/modelscope/ms-swift/tree/main/examples/eval) ## 自定义评测集 diff --git a/docs/source_en/Instruction/Command-line-parameters.md b/docs/source_en/Instruction/Command-line-parameters.md index fa5339175c..6bc1205dcb 100644 --- 
a/docs/source_en/Instruction/Command-line-parameters.md +++ b/docs/source_en/Instruction/Command-line-parameters.md @@ -396,6 +396,7 @@ App parameters inherit from [deployment arguments](#deployment-arguments) and [W Evaluation Arguments inherit from the [deployment arguments](#deployment-arguments). +- 🔥eval_backend: Evaluation backend, default is Native, but can also be specified as OpenCompass or VLMEvalKit - 🔥eval_dataset: Evaluation dataset, refer to [Evaluation documentation](./Evaluation.md). - eval_limit: Number of samples for each evaluation set, default is None. - eval_output_dir: Folder for storing evaluation results, default is 'eval_output'. diff --git a/docs/source_en/Instruction/Evaluation.md b/docs/source_en/Instruction/Evaluation.md index 9b7109bc70..31fc998ba3 100644 --- a/docs/source_en/Instruction/Evaluation.md +++ b/docs/source_en/Instruction/Evaluation.md @@ -1,45 +1,59 @@ # Evaluation -SWIFT supports eval(evaluation) capabilities to provide standardized assessment metrics for both the raw model and the trained model. +SWIFT supports eval (evaluation) capabilities to provide standardized evaluation metrics for both raw models and trained models. ## Capability Introduction -SWIFT's eval capability uses the [evalution framework EvalScope](https://github.com/modelscope/eval-scope) from the ModelScope, with high-level encapsulation to meet various model evaluation needs. - -Currently, we support evaluation processes for **standard evaluation sets** as well as **user-defined** evaluation sets. The **standard evaluation sets** include: - -> Note: EvalScope supports many other complex capabilities, such as model performance evaluation. Please use the EvalScope framework directly for those features. - -Pure Text Evaluation: -```text -'obqa', 'cmb', 'AX_b', 'siqa', 'nq', 'mbpp', 'winogrande', 'mmlu', 'BoolQ', 'cluewsc', 'ocnli', 'lambada', -'CMRC', 'ceval', 'csl', 'cmnli', 'bbh', 'ReCoRD', 'math', 'humaneval', 'eprstmt', 'WSC', 'storycloze', -'MultiRC', 'RTE', 'chid', 'gsm8k', 'AX_g', 'bustm', 'afqmc', 'piqa', 'lcsts', 'strategyqa', 'Xsum', 'agieval', -'ocnli_fc', 'C3', 'tnews', 'race', 'triviaqa', 'CB', 'WiC', 'hellaswag', 'summedits', 'GaokaoBench', -'ARC_e', 'COPA', 'ARC_c', 'DRCD' -``` -For detailed information on the datasets, please visit: https://hub.opencompass.org.cn/home - -Multimodal Evaluation: -```text -'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', -'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', -'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', -'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', -'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', -'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'DocVQA_VAL', -'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', 'MathVision_MINI', -'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', 'MTVQA_TEST', -'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', 'VCR_EN_HARD_500', -'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', 'VCR_ZH_EASY_ALL', -'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', 'Video-MME', -'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench', 'MMBench_CN', 
-'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_CN_V11', 'MMBench_V11',
-'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', 'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST',
-'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL',
-'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA', 'MLLMGuard_DS', 'BLINK'
-```
-For detailed information on the datasets, please visit: https://github.com/open-compass/VLMEvalKit
+SWIFT's eval capability uses the [EvalScope evaluation framework](https://github.com/modelscope/eval-scope) from the ModelScope community, with high-level encapsulation to support the evaluation needs of various models.
+
+> Note: EvalScope supports many other complex capabilities, such as [model performance evaluation](https://evalscope.readthedocs.io/en/latest/user_guides/stress_test/quick_start.html); please use the EvalScope framework directly for those features.
+
+Currently, we support evaluation on both **standard evaluation datasets** and **user-defined** evaluation datasets. The **standard evaluation datasets** are supported by three evaluation backends:
+
+Below are the names of the supported datasets. For detailed information on the datasets, please refer to [all supported datasets](https://evalscope.readthedocs.io/en/latest/get_started/supported_dataset.html).
+
+1. Native (default):
+
+   Primarily supports pure text evaluation, and also **supports** visualization of evaluation results.
+   ```text
+   'arc', 'bbh', 'ceval', 'cmmlu', 'competition_math',
+   'general_qa', 'gpqa', 'gsm8k', 'hellaswag', 'humaneval',
+   'ifeval', 'iquiz', 'mmlu', 'mmlu_pro',
+   'race', 'trivia_qa', 'truthful_qa'
+   ```
+
+2. OpenCompass:
+
+   Primarily supports pure text evaluation, and currently **does not support** visualization of evaluation results.
+   ```text
+   'obqa', 'cmb', 'AX_b', 'siqa', 'nq', 'mbpp', 'winogrande', 'mmlu', 'BoolQ', 'cluewsc', 'ocnli', 'lambada',
+   'CMRC', 'ceval', 'csl', 'cmnli', 'bbh', 'ReCoRD', 'math', 'humaneval', 'eprstmt', 'WSC', 'storycloze',
+   'MultiRC', 'RTE', 'chid', 'gsm8k', 'AX_g', 'bustm', 'afqmc', 'piqa', 'lcsts', 'strategyqa', 'Xsum', 'agieval',
+   'ocnli_fc', 'C3', 'tnews', 'race', 'triviaqa', 'CB', 'WiC', 'hellaswag', 'summedits', 'GaokaoBench',
+   'ARC_e', 'COPA', 'ARC_c', 'DRCD'
+   ```
+
+3. VLMEvalKit:
+
+   Primarily supports multimodal evaluation and currently **does not support** visualization of evaluation results.
+ ```text + 'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', + 'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', + 'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', + 'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', + 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', + 'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', 'TextVQA_VAL', 'DocVQA_VAL', + 'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', 'MathVision_MINI', + 'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', 'MTVQA_TEST', + 'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', 'VCR_EN_HARD_500', + 'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', 'VCR_ZH_EASY_ALL', + 'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', 'Video-MME', + 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench', 'MMBench_CN', + 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_CN_V11', 'MMBench_V11', + 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', 'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', + 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', + 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA', 'MLLMGuard_DS', 'BLINK' + ``` ## Environment Preparation @@ -47,7 +61,7 @@ For detailed information on the datasets, please visit: https://github.com/open- pip install ms-swift[eval] -U ``` -Or install from the source code: +Or install from source: ```shell git clone https://github.com/modelscope/ms-swift.git @@ -57,11 +71,25 @@ pip install -e '.[eval]' ## Evaluation -We support four types of evaluation: pure text evaluation, multimodal evaluation, URL evaluation, and custom dataset evaluation. +Supports four methods of evaluation: pure text evaluation, multimodal evaluation, URL evaluation, and custom dataset evaluation. + +**Basic Example** + +```shell +swift eval \ + --model Qwen/Qwen2.5-0.5B-Instruct \ + --eval_backend Native \ + --infer_backend pt \ + --eval_limit 10 \ + --eval_dataset gsm8k +``` +Where: +- eval_backend: options are Native, OpenCompass, VLMEvalKit +- infer_backend: options are pt, vllm, lmdeploy -For sample evaluations, please refer to [examples](https://github.com/modelscope/ms-swift/tree/main/examples/eval). +For a specific list of evaluation parameters, please refer to [here](./Command-line-parameters.md#evaluation-arguments). -The list of evaluation parameters can be found [here](Commend-line-parameters#评测参数). +More evaluation examples can be found in [examples](https://github.com/modelscope/ms-swift/tree/main/examples/eval). 
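+
+The same evaluation can also be launched programmatically. The snippet below is a minimal sketch built on the `eval_main`/`EvalArguments` interface used in the repository examples; the model, dataset and sample limit are illustrative only:
+
+```python
+from swift.llm import EvalArguments, eval_main
+
+# Evaluate a small chat model on gsm8k with the Native backend,
+# sampling only 10 examples as a quick smoke test.
+eval_main(
+    EvalArguments(
+        model='Qwen/Qwen2.5-0.5B-Instruct',
+        eval_dataset=['gsm8k'],
+        eval_backend='Native',
+        infer_backend='pt',
+        eval_limit=10))
+```
+
+As with the CLI, results are written under the directory given by `eval_output_dir` (default `eval_output`).
+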
## Custom Evaluation Sets diff --git a/examples/eval/eval_url/demo.py b/examples/eval/eval_url/demo.py index d893b513c3..259ad043c8 100644 --- a/examples/eval/eval_url/demo.py +++ b/examples/eval/eval_url/demo.py @@ -11,4 +11,4 @@ with run_deploy( DeployArguments(model='Qwen/Qwen2.5-1.5B-Instruct', verbose=False, log_interval=-1, infer_backend='vllm'), return_url=True) as url: - eval_main(EvalArguments(model='Qwen2.5-1.5B-Instruct', eval_url=url, eval_dataset=['ARC_c'])) + eval_main(EvalArguments(model='Qwen2.5-1.5B-Instruct', eval_url=url, eval_dataset=['arc'])) diff --git a/examples/eval/vlm/eval.sh b/examples/eval/vlm/eval.sh index f8a9acdd26..d59e32cbb3 100644 --- a/examples/eval/vlm/eval.sh +++ b/examples/eval/vlm/eval.sh @@ -4,4 +4,5 @@ swift eval \ --model Qwen/Qwen2-VL-2B-Instruct \ --infer_backend pt \ --eval_limit 100 \ - --eval_dataset realWorldQA + --eval_dataset realWorldQA \ + --eval_backend VLMEvalKit diff --git a/swift/llm/argument/eval_args.py b/swift/llm/argument/eval_args.py index 4ccd92d32b..029df55884 100644 --- a/swift/llm/argument/eval_args.py +++ b/swift/llm/argument/eval_args.py @@ -2,7 +2,7 @@ import datetime as dt import os from dataclasses import dataclass, field -from typing import List, Optional +from typing import List, Literal, Optional from swift.utils import get_logger from .base_args import to_abspath @@ -29,7 +29,7 @@ class EvalArguments(DeployArguments): eval_dataset: List[str] = field(default_factory=list) eval_limit: Optional[int] = None eval_output_dir: str = 'eval_output' - + eval_backend: Literal['Native', 'OpenCompass', 'VLMEvalKit'] = 'Native' local_dataset: bool = False temperature: Optional[float] = 0. @@ -53,38 +53,33 @@ def __post_init__(self): @staticmethod def list_eval_dataset(): + from evalscope.constants import EvalBackend + from evalscope.benchmarks.benchmark import BENCHMARK_MAPPINGS from evalscope.backend.opencompass import OpenCompassBackendManager from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager return { - 'opencompass': OpenCompassBackendManager.list_datasets(), - 'vlmeval': VLMEvalKitBackendManager.list_supported_datasets() + EvalBackend.NATIVE: list(BENCHMARK_MAPPINGS.keys()), + EvalBackend.OPEN_COMPASS: OpenCompassBackendManager.list_datasets(), + EvalBackend.VLM_EVAL_KIT: VLMEvalKitBackendManager.list_supported_datasets() } def _init_eval_dataset(self): if isinstance(self.eval_dataset, str): self.eval_dataset = [self.eval_dataset] - eval_dataset = self.list_eval_dataset() - from evalscope.backend.opencompass import OpenCompassBackendManager - from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager - self.opencompass_dataset = set(eval_dataset['opencompass']) - self.vlmeval_dataset = set(eval_dataset['vlmeval']) - eval_dataset_mapping = {dataset.lower(): dataset for dataset in self.opencompass_dataset | self.vlmeval_dataset} - self.eval_dataset_oc = [] - self.eval_dataset_vlm = [] + all_eval_dataset = self.list_eval_dataset() + dataset_mapping = {dataset.lower(): dataset for dataset in all_eval_dataset[self.eval_backend]} + valid_dataset = [] for dataset in self.eval_dataset: - dataset = eval_dataset_mapping.get(dataset.lower(), dataset) - if dataset in self.opencompass_dataset: - self.eval_dataset_oc.append(dataset) - elif dataset in self.vlmeval_dataset: - self.eval_dataset_vlm.append(dataset) - else: - raise ValueError(f'eval_dataset: {dataset} is not supported.\n' - f'opencompass_dataset: {OpenCompassBackendManager.list_datasets()}.\n\n' - f'vlmeval_dataset: 
{VLMEvalKitBackendManager.list_supported_datasets()}.') + if dataset.lower() not in dataset_mapping: + raise ValueError( + f'eval_dataset: {dataset} is not supported.\n' + f'eval_backend: {self.eval_backend} supported datasets: {all_eval_dataset[self.eval_backend]}') + valid_dataset.append(dataset_mapping[dataset.lower()]) + self.eval_dataset = valid_dataset - logger.info(f'opencompass dataset: {self.eval_dataset_oc}') - logger.info(f'vlmeval dataset: {self.eval_dataset_vlm}') + logger.info(f'eval_backend: {self.eval_backend}') + logger.info(f'eval_dataset: {self.eval_dataset}') def _init_result_path(self, folder_name: str) -> None: self.time = dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S.%f') diff --git a/swift/llm/eval/eval.py b/swift/llm/eval/eval.py index a118a1066c..ca63bb42c2 100644 --- a/swift/llm/eval/eval.py +++ b/swift/llm/eval/eval.py @@ -4,7 +4,8 @@ from contextlib import nullcontext from typing import List, Union -from evalscope.run import run_task +from evalscope.constants import EvalBackend, EvalType +from evalscope.run import TaskConfig, run_task from evalscope.summarizer import Summarizer from swift.utils import append_to_jsonl, get_logger @@ -27,20 +28,11 @@ def run(self): with deploy_context as base_url: base_url = args.eval_url or base_url url = os.path.join(base_url, 'chat/completions') - if args.eval_dataset_oc: - reports = self.run_task(args.eval_dataset_oc, 'opencompass', url) - result = {} - for report in reports: - if report[args.model_suffix] != '-': - result[report['dataset']] = {report['metric']: report[args.model_suffix]} - eval_report['opencompass'] = result - if args.eval_dataset_vlm: - reports = self.run_task(args.eval_dataset_vlm, 'vlmeval', url) - result = {} - for dataset, report in zip(args.eval_dataset_vlm, reports): - metric = next(iter(report)).rsplit('_')[-1] - result[dataset] = {metric: list(report.values())[0]} - eval_report['vlmeval'] = result + + task_cfg = self.get_task_cfg(args.eval_dataset, args.eval_backend, url) + result = self.get_task_result(task_cfg) + eval_report[args.eval_backend] = result + eval_report.update({ 'time': args.time, 'model': args.model, @@ -55,10 +47,26 @@ def run(self): logger.info(f'The eval result have been saved to result_jsonl: `{args.result_jsonl}`.') return eval_report - def run_task(self, dataset: List[str], eval_backend: str, url: str): - args = self.args - assert eval_backend in {'opencompass', 'vlmeval'} - if eval_backend == 'opencompass': + def get_task_result(self, task_cfg: TaskConfig): + run_task(task_cfg=task_cfg) + reports = Summarizer.get_report_from_cfg(task_cfg=task_cfg) + result = {} + if task_cfg.eval_backend == EvalBackend.OPEN_COMPASS: + for report in reports: + if report[self.args.model_suffix] != '-': + result[report['dataset']] = {report['metric']: report[self.args.model_suffix]} + elif task_cfg.eval_backend == EvalBackend.VLM_EVAL_KIT: + for report in reports: + metric = next(iter(report)).rsplit('_')[-1] + dataset = next(iter(report)).rsplit('_')[-2] + result[dataset] = {metric: list(report.values())[0]} + else: + result = reports + return result + + def get_task_cfg(self, dataset: List[str], eval_backend: str, url: str): + assert eval_backend in {EvalBackend.NATIVE, EvalBackend.OPEN_COMPASS, EvalBackend.VLM_EVAL_KIT} + if eval_backend == EvalBackend.OPEN_COMPASS: if self.args.local_dataset: if os.path.exists('data'): if not os.path.exists(os.path.join('data', 'CMB')): @@ -74,44 +82,55 @@ def run_task(self, dataset: List[str], eval_backend: str, url: str): os.symlink(os.path.join(local_dir, 
'data'), 'data') task_cfg = self.get_opencompass_task_cfg(dataset, url) - else: + elif eval_backend == EvalBackend.VLM_EVAL_KIT: task_cfg = self.get_vlmeval_task_cfg(dataset, url) - if args.eval_limit: - task_cfg['eval_config']['limit'] = args.eval_limit + else: + task_cfg = self.get_native_task_cfg(dataset, url) + return task_cfg - run_task(task_cfg=task_cfg) - return Summarizer.get_report_from_cfg(task_cfg=task_cfg) + def get_native_task_cfg(self, dataset: List[str], url: str): + args = self.args + work_dir = os.path.join(args.eval_output_dir, 'native') + return TaskConfig( + model=args.model_suffix, + eval_type=EvalType.SERVICE, + api_url=url, + api_key=args.api_key or 'EMPTY', + datasets=dataset, + work_dir=work_dir, + limit=args.eval_limit) def get_opencompass_task_cfg(self, dataset: List[str], url: str): args = self.args - return { - 'eval_backend': 'OpenCompass', - 'eval_config': { + work_dir = os.path.join(args.eval_output_dir, 'opencompass') + return TaskConfig( + eval_backend=EvalBackend.OPEN_COMPASS, + eval_config={ 'datasets': dataset, 'batch_size': args.eval_num_proc or 256, 'work_dir': - os.path.join(args.eval_output_dir, 'opencompass'), + work_dir, 'models': [{ 'path': args.model_suffix, 'openai_api_base': url, 'key': args.api_key or 'EMPTY', 'is_chat': args.use_chat_template - }] - } - } + }], + 'limit': + args.eval_limit + }, + work_dir=work_dir) def get_vlmeval_task_cfg(self, dataset: List[str], url: str): args = self.args - task_cfg = { - 'eval_backend': 'VLMEvalKit', - 'eval_config': { + work_dir = os.path.join(args.eval_output_dir, 'vlmeval') + return TaskConfig( + eval_backend=EvalBackend.VLM_EVAL_KIT, + eval_config={ 'data': dataset, - 'work_dir': - os.path.join(args.eval_output_dir, 'vlmeval', - dt.datetime.now().strftime('%Y%m%d-%H%M%S')), 'model': [{ 'type': args.model_suffix, 'name': 'CustomAPIModel', @@ -120,10 +139,10 @@ def get_vlmeval_task_cfg(self, dataset: List[str], url: str): }], 'nproc': args.eval_num_proc or 16, - } - } - task_cfg['work_dir'] = task_cfg['eval_config']['work_dir'] # compat evalscope 0.8.1 - return task_cfg + 'limit': + args.eval_limit + }, + work_dir=work_dir) def eval_main(args: Union[List[str], EvalArguments, None] = None): diff --git a/swift/ui/llm_eval/eval.py b/swift/ui/llm_eval/eval.py index 7694c33ec2..dbf08ab00b 100644 --- a/swift/ui/llm_eval/eval.py +++ b/swift/ui/llm_eval/eval.py @@ -14,14 +14,24 @@ class Eval(BaseUI): group = 'llm_eval' locale_dict = { + 'eval_backend': { + 'label': { + 'zh': '评测后端', + 'en': 'Eval backend' + }, + 'info': { + 'zh': '选择评测后端', + 'en': 'Select eval backend' + } + }, 'eval_dataset': { 'label': { 'zh': '评测数据集', 'en': 'Evaluation dataset' }, 'info': { - 'zh': '选择评测数据集,支持多选', - 'en': 'Select eval dataset, multiple datasets supported' + 'zh': '选择评测数据集,支持多选 (先选择评测后端)', + 'en': 'Select eval dataset, multiple datasets supported (select eval backend first)' } }, 'eval_limit': { @@ -88,46 +98,19 @@ class Eval(BaseUI): @classmethod def do_build_ui(cls, base_tab: Type['BaseUI']): try: - from evalscope.backend.opencompass import OpenCompassBackendManager - from evalscope.backend.vlm_eval_kit import VLMEvalKitBackendManager - eval_dataset_list = ( - OpenCompassBackendManager.list_datasets() + VLMEvalKitBackendManager.list_supported_datasets()) - logger.warn('If you encounter an error message👆🏻👆🏻👆🏻 of `.env` file, please ignore.') + from swift.llm.argument.eval_args import EvalArguments + eval_dataset_dict = EvalArguments.list_eval_dataset() + default_backend = EvalArguments.eval_backend except Exception as e: 
logger.warn(e) - logger.warn( - ('The error message 👆🏻👆🏻👆🏻above will have no bad effects, ' - 'only means evalscope is not installed, and default eval datasets will be listed in the web-ui.')) - eval_dataset_list = [ - 'AX_b', 'cmb', 'winogrande', 'mmlu', 'afqmc', 'COPA', 'commonsenseqa', 'CMRC', 'lcsts', 'nq', - 'ocnli_fc', 'math', 'mbpp', 'DRCD', 'TheoremQA', 'CB', 'ReCoRD', 'lambada', 'tnews', 'flores', - 'humaneval', 'AX_g', 'ceval', 'bbh', 'BoolQ', 'MultiRC', 'piqa', 'csl', 'ARC_c', 'agieval', 'cmnli', - 'strategyqa', 'gsm8k', 'summedits', 'eprstmt', 'WiC', 'cluewsc', 'Xsum', 'ocnli', 'triviaqa', - 'hellaswag', 'race', 'bustm', 'RTE', 'C3', 'GaokaoBench', 'storycloze', 'ARC_e', 'siqa', 'obqa', 'WSC', - 'chid', 'COCO_VAL', 'MME', 'HallusionBench', 'POPE', 'MMBench_DEV_EN', 'MMBench_TEST_EN', - 'MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench', 'MMBench_CN', 'MMBench_DEV_EN_V11', - 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_CN_V11', 'MMBench_V11', 'MMBench_CN_V11', - 'SEEDBench_IMG', 'SEEDBench2', 'SEEDBench2_Plus', 'ScienceQA_VAL', 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', - 'MMT-Bench_ALL', 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', 'AesBench_VAL', 'AesBench_TEST', 'CCBench', - 'AI2D_TEST', 'MMStar', 'RealWorldQA', 'MLLMGuard_DS', 'BLINK', 'OCRVQA_TEST', 'OCRVQA_TESTCORE', - 'TextVQA_VAL', 'DocVQA_VAL', 'DocVQA_TEST', 'InfoVQA_VAL', 'InfoVQA_TEST', 'ChartQA_TEST', 'MathVision', - 'MathVision_MINI', 'MMMU_DEV_VAL', 'MMMU_TEST', 'OCRBench', 'MathVista_MINI', 'LLaVABench', 'MMVet', - 'MTVQA_TEST', 'MMLongBench_DOC', 'VCR_EN_EASY_500', 'VCR_EN_EASY_100', 'VCR_EN_EASY_ALL', - 'VCR_EN_HARD_500', 'VCR_EN_HARD_100', 'VCR_EN_HARD_ALL', 'VCR_ZH_EASY_500', 'VCR_ZH_EASY_100', - 'VCR_ZH_EASY_ALL', 'VCR_ZH_HARD_500', 'VCR_ZH_HARD_100', 'VCR_ZH_HARD_ALL', 'MMDU', 'MMBench-Video', - 'Video-MME', 'MMBench_DEV_EN', 'MMBench_TEST_EN', 'MMBench_DEV_CN', 'MMBench_TEST_CN', 'MMBench', - 'MMBench_CN', 'MMBench_DEV_EN_V11', 'MMBench_TEST_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_CN_V11', - 'MMBench_V11', 'MMBench_CN_V11', 'SEEDBench_IMG', 'SEEDBench2', 'SEEDBench2_Plus', 'ScienceQA_VAL', - 'ScienceQA_TEST', 'MMT-Bench_ALL_MI', 'MMT-Bench_ALL', 'MMT-Bench_VAL_MI', 'MMT-Bench_VAL', - 'AesBench_VAL', 'AesBench_TEST', 'CCBench', 'AI2D_TEST', 'MMStar', 'RealWorldQA', 'MLLMGuard_DS', - 'BLINK' - ] + eval_dataset_dict = [] with gr.Row(): + gr.Dropdown(elem_id='eval_backend', choices=list(eval_dataset_dict.keys()), value=default_backend, scale=20) gr.Dropdown( elem_id='eval_dataset', is_list=True, - choices=eval_dataset_list, + choices=eval_dataset_dict[default_backend], multiselect=True, allow_custom_value=True, scale=20) @@ -138,3 +121,9 @@ def do_build_ui(cls, base_tab: Type['BaseUI']): gr.Textbox(elem_id='eval_output_dir', scale=20) gr.Textbox(elem_id='eval_url', scale=20) gr.Textbox(elem_id='api_key', scale=20) + + def update_eval_dataset(backend): + return gr.update(choices=eval_dataset_dict[backend]) + + cls.element('eval_backend').change(update_eval_dataset, [cls.element('eval_backend')], + [cls.element('eval_dataset')]) diff --git a/tests/eval/test_eval.py b/tests/eval/test_eval.py index 442fa92121..9161f60f63 100644 --- a/tests/eval/test_eval.py +++ b/tests/eval/test_eval.py @@ -2,21 +2,45 @@ os.environ['CUDA_VISIBLE_DEVICES'] = '0' +infer_backend = 'vllm' + + +def test_eval_native(): + from swift.llm import EvalArguments, eval_main + eval_main( + EvalArguments( + model='Qwen/Qwen2.5-0.5B-Instruct', + eval_dataset='arc', + infer_backend=infer_backend, + eval_backend='Native', + eval_limit=10)) + def 
test_eval_llm(): from swift.llm import EvalArguments, eval_main - eval_main(EvalArguments(model='Qwen/Qwen2-7B-Instruct', eval_dataset='arc_c', infer_backend='vllm')) + eval_main( + EvalArguments( + model='Qwen/Qwen2-7B-Instruct', + eval_dataset='arc_c', + infer_backend=infer_backend, + eval_backend='OpenCompass', + eval_limit=10)) def test_eval_mllm(): from swift.llm import EvalArguments, eval_main eval_main( - EvalArguments(model='Qwen/Qwen2-VL-7B-Instruct', eval_dataset=['realWorldQA', 'arc_c'], infer_backend='vllm')) + EvalArguments( + model='Qwen/Qwen2-VL-7B-Instruct', + eval_dataset=['realWorldQA'], + infer_backend=infer_backend, + eval_backend='VLMEvalKit', + eval_limit=10)) def test_eval_url(): from swift.llm import EvalArguments, eval_main, DeployArguments, run_deploy - deploy_args = DeployArguments(model='Qwen/Qwen2-VL-7B-Instruct', infer_backend='vllm', verbose=False) + deploy_args = DeployArguments(model='Qwen/Qwen2-VL-7B-Instruct', infer_backend=infer_backend, verbose=False) with run_deploy(deploy_args, return_url=True) as url: eval_main(EvalArguments(model='Qwen2-VL-7B-Instruct', eval_url=url, eval_dataset=['arc_c'])) @@ -25,4 +49,5 @@ def test_eval_url(): if __name__ == '__main__': # test_eval_llm() # test_eval_mllm() - test_eval_url() + # test_eval_url() + test_eval_native()
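+
+
+# A minimal usage sketch (assumes evalscope with its OpenCompass and VLMEvalKit
+# backends installed) of the reworked EvalArguments.list_eval_dataset(), which now
+# groups the supported dataset names by backend; both the CLI dataset validation and
+# the Web-UI dropdown rely on this mapping.
+def test_list_eval_dataset():
+    from swift.llm import EvalArguments
+    datasets_by_backend = EvalArguments.list_eval_dataset()
+    # One dataset list per backend: Native, OpenCompass and VLMEvalKit.
+    assert len(datasets_by_backend) == 3
+    for backend, names in datasets_by_backend.items():
+        print(backend, len(names), 'datasets')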