diff --git "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" index 65dee97901..708d70919e 100644 --- "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" +++ "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" @@ -107,6 +107,16 @@ AAAAA,BBBBB,CCCCC {"query": "AAAAA", "response": "BBBBB", "rejected_response": "CCCCC"} ``` +**CogAgent模型** + +```jsonl +{"query": "55555", "response": "66666", "image": "some-local-image-path"} +{"query": "eeeee", "response": "fffff", "history": [], "image": "some-http-image-path"} +{"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]], "image": "some-local-image-path"} +``` + +image字段支持本地图片文件和http可访问的image url两类。 + ### 注册数据集的方式 以下是一个**注册数据集**的案例. 完整的py文件可以查看[custom.py](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/custom.py), sh脚本可以查看[custom](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/custom). diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh index 0b642444db..093d672859 100644 --- a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh +++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh @@ -26,7 +26,7 @@ python llm_sft.py \ --eval_steps 100 \ --save_steps 100 \ --save_total_limit 2 \ - --logging_steps 10 + --logging_steps 10 \ --push_to_hub false \ --hub_model_id cogagent-chat-lora \ --hub_private_repo true \ diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 3c8769f209..8d2dc7c9c6 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -22,7 +22,7 @@ PreprocessFunc, RenameColumnsPreprocessor, SmartPreprocessor, TextGenerationPreprocessor) from .template import History -from .utils import download_dataset +from .utils import dataset_map, download_dataset def _remove_useless_columns(dataset: HfDataset) -> HfDataset: @@ -1120,7 +1120,21 @@ def load_dataset_from_local( 'for more information.') dataset = HfDataset.from_dict(df.to_dict(orient='list')) dataset_list.append(preprocess_func(dataset)) - return concatenate_datasets(dataset_list) + + dataset = concatenate_datasets(dataset_list) + + def load_image(row): + from PIL import Image + import requests + if not os.path.exists(row['image']): + row['image'] = requests.get(row['image'], stream=True).raw + row['image'] = Image.open(row['image']) + return row + + if 'image' in dataset.features and isinstance(dataset[0]['image'], str): + dataset = HfDataset.from_list( + dataset_map(dataset, load_image, num_proc=4).data) + return dataset def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]],