From 2956474ec22d5fbd0210a58c9208d670b907c702 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 26 Dec 2024 13:47:06 +0800 Subject: [PATCH 1/5] fix alpaca --- swift/llm/dataset/dataset/llm.py | 16 +++++++++++-- swift/llm/dataset/preprocessor/core.py | 32 ++++++++++---------------- 2 files changed, 26 insertions(+), 22 deletions(-) diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index d0d5b002ca..eb06e14b23 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -12,14 +12,26 @@ def _concat_inst_inp_alpaca_zh(inst: str, inp: str) -> str: if inp.startswith('输入:'): inp = inp[3:] - return f'{inst}\n{inp}' + if inst and inp: + return f'{inst}\n{inp}' + else: + return inst or inp + + +class AlpacaZhPreprocessor(AlpacaPreprocessor): + + @classmethod + def concat_inst_input(cls, instruction, input_): + if input_ and input_.startswith('输入:'): + input_ = input_[3:] + return super().concat_inst_input(instruction, input_) register_dataset( DatasetMeta( ms_dataset_id='AI-ModelScope/alpaca-gpt4-data-zh', hf_dataset_id='llm-wizard/alpaca-gpt4-data-zh', - preprocess_func=AlpacaPreprocessor(concat_inst_input=_concat_inst_inp_alpaca_zh), + preprocess_func=AlpacaZhPreprocessor(), tags=['chat', 'general', '🔥'], )) diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py index 24c6fa73a6..825c0ba1f3 100644 --- a/swift/llm/dataset/preprocessor/core.py +++ b/swift/llm/dataset/preprocessor/core.py @@ -312,18 +312,18 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: class AlpacaPreprocessor(ResponsePreprocessor): - def __init__(self, - *, - concat_inst_input: Union[Callable[[str, str], str]] = '\n', - columns_mapping: Optional[Dict[str, str]] = None, - **kwargs) -> None: - """Alpaca format preprocessor - - Args: - concat_inst_input: The concat sep between instruction and input - """ + def __init__(self, *, columns_mapping: Optional[Dict[str, str]] = None, **kwargs) -> None: + """Alpaca format preprocessor""" super().__init__(columns_mapping=columns_mapping, **kwargs) - self.concat_inst_input = concat_inst_input + + @classmethod + def concat_inst_input(cls, instruction, input_): + if instruction and input_: + query = f'{instruction}\n{input_}' + else: + query = instruction or input_ + assert isinstance(query, str), f'query: {query}' + return query def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: instruction = row.pop('instruction', None) @@ -331,15 +331,7 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: output = row.pop('output', None) if output is not None: row['response'] = output - - if instruction is not None or input_ is not None: - instruction = instruction or '' - input_ = input_ or '' - if isinstance(self.concat_inst_input, str): - query = instruction + self.concat_inst_input + input_ - else: - query = self.concat_inst_input(instruction, input_) - row['query'] = query + row['query'] = self.concat_inst_input(instruction, input_) return super().preprocess(row) From 3998b7735b2a96dc15a5d26feaa1538a885644cb Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 26 Dec 2024 13:47:53 +0800 Subject: [PATCH 2/5] fix --- swift/llm/dataset/dataset/llm.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index eb06e14b23..e82e982b98 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -8,16 +8,6 @@ RowPreprocessor, TextGenerationPreprocessor) from ..register import DatasetMeta, SubsetDataset, register_dataset - -def _concat_inst_inp_alpaca_zh(inst: str, inp: str) -> str: - if inp.startswith('输入:'): - inp = inp[3:] - if inst and inp: - return f'{inst}\n{inp}' - else: - return inst or inp - - class AlpacaZhPreprocessor(AlpacaPreprocessor): @classmethod From eb53b6c36e16f00cd3dfc16771ea65a10220ad82 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 26 Dec 2024 13:49:08 +0800 Subject: [PATCH 3/5] fix --- swift/llm/dataset/dataset/llm.py | 1 + tests/general/test_dataset.py | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py index e82e982b98..56af8a47e6 100644 --- a/swift/llm/dataset/dataset/llm.py +++ b/swift/llm/dataset/dataset/llm.py @@ -8,6 +8,7 @@ RowPreprocessor, TextGenerationPreprocessor) from ..register import DatasetMeta, SubsetDataset, register_dataset + class AlpacaZhPreprocessor(AlpacaPreprocessor): @classmethod diff --git a/tests/general/test_dataset.py b/tests/general/test_dataset.py index cf4da83127..371401fbec 100644 --- a/tests/general/test_dataset.py +++ b/tests/general/test_dataset.py @@ -15,8 +15,11 @@ def test_sft(): # _test_dataset(['AI-ModelScope/Duet-v0.5']) # _test_dataset(['swift/SlimOrca', 'swift/cosmopedia-100k']) # _test_dataset(['OmniData/Zhihu-KOL-More-Than-100-Upvotes']) - _test_dataset(['OmniData/Zhihu-KOL']) - # _test_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000', 'AI-ModelScope/alpaca-gpt4-data-en#200']) + # _test_dataset(['OmniData/Zhihu-KOL']) + _test_dataset([ + 'AI-ModelScope/alpaca-gpt4-data-zh#1000', 'AI-ModelScope/alpaca-gpt4-data-en#1000', + 'AI-ModelScope/LongAlpaca-12k#1000' + ]) # _test_dataset(['swift/Infinity-Instruct:all']) # _test_dataset(['swift/sharegpt:all']) # _test_dataset(['AI-ModelScope/sharegpt_gpt4:all']) From c10033db07fbd973a8ea2a923344f94a6abe04ed Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 26 Dec 2024 14:18:27 +0800 Subject: [PATCH 4/5] fix --- swift/llm/__init__.py | 5 +++-- swift/llm/model/__init__.py | 2 +- swift/llm/model/utils.py | 5 +++-- tests/general/test_stream.py | 9 ++++----- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/swift/llm/__init__.py b/swift/llm/__init__.py index 1c50131738..2173ce93b2 100644 --- a/swift/llm/__init__.py +++ b/swift/llm/__init__.py @@ -18,7 +18,8 @@ from .model import (register_model, MODEL_MAPPING, ModelType, get_model_tokenizer, safe_snapshot_download, HfConfigFactory, ModelInfo, ModelMeta, ModelKeys, register_model_arch, MultiModelKeys, ModelArch, get_model_arch, MODEL_ARCH_MAPPING, get_model_info_meta, get_model_name, ModelGroup, - Model, get_model_tokenizer_with_flash_attn, get_model_tokenizer_multimodal, load_by_unsloth) + Model, get_model_tokenizer_with_flash_attn, get_model_tokenizer_multimodal, load_by_unsloth, + git_clone_github) from .dataset import (AlpacaPreprocessor, ResponsePreprocessor, MessagesPreprocessor, AutoPreprocessor, DATASET_MAPPING, MediaResource, register_dataset, register_dataset_info, EncodePreprocessor, LazyLLMDataset, ConstantLengthDataset, standard_keys, load_dataset, DATASET_TYPE, @@ -51,7 +52,7 @@ 'ModelInfo', 'ModelMeta', 'ModelKeys', 'register_model_arch', 'MultiModelKeys', 'ModelArch', 'MODEL_ARCH_MAPPING', 'get_model_arch', 'get_model_info_meta', 'get_model_name', 'register_model', 'ModelGroup', 'Model', 'get_model_tokenizer_with_flash_attn', 'get_model_tokenizer_multimodal', - 'load_by_unsloth' + 'load_by_unsloth', 'git_clone_github' ], 'dataset': [ 'AlpacaPreprocessor', 'ClsPreprocessor', 'ComposePreprocessor', 'MessagesPreprocessor', 'DATASET_MAPPING', diff --git a/swift/llm/model/__init__.py b/swift/llm/model/__init__.py index d0db4befba..754d715207 100644 --- a/swift/llm/model/__init__.py +++ b/swift/llm/model/__init__.py @@ -6,4 +6,4 @@ get_default_torch_dtype, get_model_info_meta, get_model_name, get_model_tokenizer, get_model_tokenizer_multimodal, get_model_tokenizer_with_flash_attn, get_model_with_value_head, load_by_unsloth, register_model) -from .utils import HfConfigFactory, ModelInfo, safe_snapshot_download +from .utils import HfConfigFactory, ModelInfo, git_clone_github, safe_snapshot_download diff --git a/swift/llm/model/utils.py b/swift/llm/model/utils.py index 7195b266e7..efacbc91fb 100644 --- a/swift/llm/model/utils.py +++ b/swift/llm/model/utils.py @@ -274,6 +274,8 @@ def git_clone_github(github_url: str, local_repo_name: Optional[str] = None, branch: Optional[str] = None, commit_hash: Optional[str] = None) -> str: + if github_url.endswith('.git'): + github_url = github_url[:-4] git_cache_dir = os.path.join(get_cache_dir(), '_github') os.makedirs(git_cache_dir, exist_ok=True) if local_repo_name is None: @@ -282,8 +284,7 @@ def git_clone_github(github_url: str, local_repo_path = os.path.join(git_cache_dir, local_repo_name) with safe_ddp_context(hash_id=local_repo_path): if not os.path.exists(local_repo_path): - if not github_url.endswith('.git'): - github_url = f'{github_url}.git' + github_url = f'{github_url}.git' command = ['git', '-C', git_cache_dir, 'clone', github_url, local_repo_name] command_str = f"git -C '{git_cache_dir}' clone '{github_url}' {local_repo_name}" if branch is not None: diff --git a/tests/general/test_stream.py b/tests/general/test_stream.py index 08828d12f3..ad20696227 100644 --- a/tests/general/test_stream.py +++ b/tests/general/test_stream.py @@ -3,11 +3,10 @@ def test_local_dataset(): # please use git clone - local_dataset = '/mnt/nas2/huangjintao.hjt/work/datasets/swift-sft-mixture:firefly#100' - dataset = load_dataset(datasets=[local_dataset], streaming=True)[0] - for i, x in enumerate(dataset): - pass - print(i, x) + from swift.llm import git_clone_github + model_dir = git_clone_github('https://www.modelscope.cn/datasets/swift/swift-sft-mixture.git') + dataset = load_dataset(datasets=[f'{model_dir}:firefly'], streaming=True)[0] + print(next(iter(dataset))) def test_hub_dataset(): From b99dcc72f9b3bb5604878eeba064af2c7eb5ea40 Mon Sep 17 00:00:00 2001 From: Jintao Huang Date: Thu, 26 Dec 2024 14:19:29 +0800 Subject: [PATCH 5/5] fix --- swift/llm/dataset/preprocessor/core.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py index 825c0ba1f3..c3d692f4c3 100644 --- a/swift/llm/dataset/preprocessor/core.py +++ b/swift/llm/dataset/preprocessor/core.py @@ -312,10 +312,6 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]: class AlpacaPreprocessor(ResponsePreprocessor): - def __init__(self, *, columns_mapping: Optional[Dict[str, str]] = None, **kwargs) -> None: - """Alpaca format preprocessor""" - super().__init__(columns_mapping=columns_mapping, **kwargs) - @classmethod def concat_inst_input(cls, instruction, input_): if instruction and input_: