From 53b00762ad550090ea78ce0541f81a6cbd59a6ce Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 16 Jan 2024 12:15:30 +0800 Subject: [PATCH 1/8] load image --- swift/llm/utils/dataset.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 3c8769f209..33094a441d 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1120,7 +1120,16 @@ def load_dataset_from_local( 'for more information.') dataset = HfDataset.from_dict(df.to_dict(orient='list')) dataset_list.append(preprocess_func(dataset)) - return concatenate_datasets(dataset_list) + dataset = concatenate_datasets(dataset_list) + + def load_image(row): + if 'image' in row and isinstance(row['image'], str): + from PIL import Image + row['image'] = Image.open(row['image']) + return row + + dataset = dataset.map(load_image) + return dataset def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]], From f120445dfdf6bc424cb622c500bb3773053b49de Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 16 Jan 2024 14:52:10 +0800 Subject: [PATCH 2/8] fix image loading --- swift/llm/utils/dataset.py | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 33094a441d..70f8170bc7 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -22,7 +22,7 @@ PreprocessFunc, RenameColumnsPreprocessor, SmartPreprocessor, TextGenerationPreprocessor) from .template import History -from .utils import download_dataset +from .utils import dataset_map, download_dataset def _remove_useless_columns(dataset: HfDataset) -> HfDataset: @@ -1120,16 +1120,19 @@ def load_dataset_from_local( 'for more information.') dataset = HfDataset.from_dict(df.to_dict(orient='list')) dataset_list.append(preprocess_func(dataset)) - dataset = concatenate_datasets(dataset_list) - - def load_image(row): - if 'image' in row and isinstance(row['image'], str): - from PIL import Image - row['image'] = Image.open(row['image']) - return row - - dataset = dataset.map(load_image) - return dataset + dataset = concatenate_datasets(dataset_list) + + def load_image(row): + if 'image' in row and isinstance(row['image'], str): + from PIL import Image + import requests + if not os.path.exists(row['image']): + row['image'] = requests.get(row['image'], stream=True).raw + row['image'] = Image.open(row['image']) + return row + + dataset = HfDataset.from_list(dataset_map(dataset, load_image).data) + return dataset def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]], From a2b245fd3962f47f916b5ba0e367e33d972e8490 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 16 Jan 2024 14:59:43 +0800 Subject: [PATCH 3/8] update doc --- ...344\271\211\344\270\216\346\213\223\345\261\225.md" | 10 +++++++++- examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh | 2 +- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" index 65dee97901..974baaed0b 100644 --- "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" +++ "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" @@ -99,7 +99,7 @@ AAAAA,BBBBB,CCCCC {"messages": [{"role": "user", "content": "AAAAA"}, {"role": "assistant", "content": "BBBBB"}, {"role": "user", "content": "CCCCC"}, {"role": "assistant", "content": "DDDDD"}]} ``` -**强化学习(DPO)** +**强化学习(DPO** ```jsonl {"query": "11111", "response": "22222", "rejected_response": "33333"} @@ -107,6 +107,14 @@ AAAAA,BBBBB,CCCCC {"query": "AAAAA", "response": "BBBBB", "rejected_response": "CCCCC"} ``` +**CogAgent模型** + +```jsonl +{"query": "55555", "response": "66666", "image": "some local path"} +{"query": "eeeee", "response": "fffff", "history": [], "image": "some http path"} +{"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]], "image": "some local path"} +``` + ### 注册数据集的方式 以下是一个**注册数据集**的案例. 完整的py文件可以查看[custom.py](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/custom.py), sh脚本可以查看[custom](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/custom). diff --git a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh index 0b642444db..093d672859 100644 --- a/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh +++ b/examples/pytorch/llm/scripts/cogagent_chat/lora/sft.sh @@ -26,7 +26,7 @@ python llm_sft.py \ --eval_steps 100 \ --save_steps 100 \ --save_total_limit 2 \ - --logging_steps 10 + --logging_steps 10 \ --push_to_hub false \ --hub_model_id cogagent-chat-lora \ --hub_private_repo true \ From a1d6f6cb953d4e7386f28539b1c58771082d2984 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 16 Jan 2024 15:03:03 +0800 Subject: [PATCH 4/8] fix quote --- ...\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" index 974baaed0b..d9c01d6f3f 100644 --- "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" +++ "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" @@ -99,7 +99,7 @@ AAAAA,BBBBB,CCCCC {"messages": [{"role": "user", "content": "AAAAA"}, {"role": "assistant", "content": "BBBBB"}, {"role": "user", "content": "CCCCC"}, {"role": "assistant", "content": "DDDDD"}]} ``` -**强化学习(DPO** +**强化学习(DPO)** ```jsonl {"query": "11111", "response": "22222", "rejected_response": "33333"} From c69e8040ed97114f0a3b872484a7170e2d3e3630 Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 16 Jan 2024 15:05:54 +0800 Subject: [PATCH 5/8] fix --- swift/llm/utils/dataset.py | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 70f8170bc7..2ad9d45a3e 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1120,19 +1120,20 @@ def load_dataset_from_local( 'for more information.') dataset = HfDataset.from_dict(df.to_dict(orient='list')) dataset_list.append(preprocess_func(dataset)) - dataset = concatenate_datasets(dataset_list) - - def load_image(row): - if 'image' in row and isinstance(row['image'], str): - from PIL import Image - import requests - if not os.path.exists(row['image']): - row['image'] = requests.get(row['image'], stream=True).raw - row['image'] = Image.open(row['image']) - return row - - dataset = HfDataset.from_list(dataset_map(dataset, load_image).data) - return dataset + + dataset = concatenate_datasets(dataset_list) + + def load_image(row): + if 'image' in row and isinstance(row['image'], str): + from PIL import Image + import requests + if not os.path.exists(row['image']): + row['image'] = requests.get(row['image'], stream=True).raw + row['image'] = Image.open(row['image']) + return row + + dataset = HfDataset.from_list(dataset_map(dataset, load_image).data) + return dataset def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]], From 0650d56fc1371185526c6b06e887b89cd58d6bca Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 16 Jan 2024 15:08:44 +0800 Subject: [PATCH 6/8] update doc --- ...32\344\271\211\344\270\216\346\213\223\345\261\225.md" | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" index d9c01d6f3f..708d70919e 100644 --- "a/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" +++ "b/docs/source/LLM/\350\207\252\345\256\232\344\271\211\344\270\216\346\213\223\345\261\225.md" @@ -110,11 +110,13 @@ AAAAA,BBBBB,CCCCC **CogAgent模型** ```jsonl -{"query": "55555", "response": "66666", "image": "some local path"} -{"query": "eeeee", "response": "fffff", "history": [], "image": "some http path"} -{"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]], "image": "some local path"} +{"query": "55555", "response": "66666", "image": "some-local-image-path"} +{"query": "eeeee", "response": "fffff", "history": [], "image": "some-http-image-path"} +{"query": "EEEEE", "response": "FFFFF", "history": [["AAAAA", "BBBBB"], ["CCCCC", "DDDDD"]], "image": "some-local-image-path"} ``` +image字段支持本地图片文件和http可访问的image url两类。 + ### 注册数据集的方式 以下是一个**注册数据集**的案例. 完整的py文件可以查看[custom.py](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/custom.py), sh脚本可以查看[custom](https://github.com/modelscope/swift/tree/main/examples/pytorch/llm/scripts/custom). From 7a9a619ec4e0afea3de05b833b32affa2009a93b Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 16 Jan 2024 15:32:01 +0800 Subject: [PATCH 7/8] fix --- swift/llm/utils/dataset.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 2ad9d45a3e..5afe8a4819 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1124,15 +1124,16 @@ def load_dataset_from_local( dataset = concatenate_datasets(dataset_list) def load_image(row): - if 'image' in row and isinstance(row['image'], str): - from PIL import Image - import requests - if not os.path.exists(row['image']): - row['image'] = requests.get(row['image'], stream=True).raw - row['image'] = Image.open(row['image']) + from PIL import Image + import requests + if not os.path.exists(row['image']): + row['image'] = requests.get(row['image'], stream=True).raw + row['image'] = Image.open(row['image']) return row - dataset = HfDataset.from_list(dataset_map(dataset, load_image).data) + if 'image' in dataset.features and isinstance(dataset[0]['image'], str): + dataset = HfDataset.from_list( + dataset_map(dataset, load_image, num_proc=10).data) return dataset From 8a433a4e7f7d37973cf743824fbf34d878f4b85f Mon Sep 17 00:00:00 2001 From: "yuze.zyz" Date: Tue, 16 Jan 2024 15:34:28 +0800 Subject: [PATCH 8/8] fix proc --- swift/llm/utils/dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 5afe8a4819..8d2dc7c9c6 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1133,7 +1133,7 @@ def load_image(row): if 'image' in dataset.features and isinstance(dataset[0]['image'], str): dataset = HfDataset.from_list( - dataset_map(dataset, load_image, num_proc=10).data) + dataset_map(dataset, load_image, num_proc=4).data) return dataset