diff --git a/swift/llm/utils/dataset.py b/swift/llm/utils/dataset.py index 8d2dc7c9c6..321196a2ed 100644 --- a/swift/llm/utils/dataset.py +++ b/swift/llm/utils/dataset.py @@ -1121,20 +1121,7 @@ def load_dataset_from_local( dataset = HfDataset.from_dict(df.to_dict(orient='list')) dataset_list.append(preprocess_func(dataset)) - dataset = concatenate_datasets(dataset_list) - - def load_image(row): - from PIL import Image - import requests - if not os.path.exists(row['image']): - row['image'] = requests.get(row['image'], stream=True).raw - row['image'] = Image.open(row['image']) - return row - - if 'image' in dataset.features and isinstance(dataset[0]['image'], str): - dataset = HfDataset.from_list( - dataset_map(dataset, load_image, num_proc=4).data) - return dataset + return concatenate_datasets(dataset_list) def get_custom_dataset(_: str, train_subset_split_list: Union[str, List[str]], diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py index b8d9d066df..5e732fe9d4 100644 --- a/swift/llm/utils/template.py +++ b/swift/llm/utils/template.py @@ -1,4 +1,5 @@ # Copyright (c) Alibaba, Inc. and its affiliates. +import os from copy import deepcopy from typing import Any, Dict, List, Literal, Optional, Tuple, Union @@ -554,6 +555,13 @@ def build_conversation_input_ids( def encode(self, example: Dict[str, Any]) -> Dict[str, Optional[List[int]]]: + if 'image' in example and isinstance(example['image'], str): + from PIL import Image + import requests + if not os.path.exists(example['image']): + example['image'] = requests.get( + example['image'], stream=True).raw + example['image'] = Image.open(example['image']) return self.build_conversation_input_ids( self.tokenizer, query=example['query'],