From 2956474ec22d5fbd0210a58c9208d670b907c702 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 26 Dec 2024 13:47:06 +0800
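Subject: [PATCH 1/5] fix alpaca: make concat_inst_input an overridable classmethod and skip the newline when instruction or input is empty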

---
 swift/llm/dataset/dataset/llm.py       | 16 +++++++++++--
 swift/llm/dataset/preprocessor/core.py | 32 ++++++++++----------------
 2 files changed, 26 insertions(+), 22 deletions(-)

diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py
index d0d5b002ca..eb06e14b23 100644
--- a/swift/llm/dataset/dataset/llm.py
+++ b/swift/llm/dataset/dataset/llm.py
@@ -12,14 +12,26 @@
 def _concat_inst_inp_alpaca_zh(inst: str, inp: str) -> str:
     if inp.startswith('输入：'):
         inp = inp[3:]
-    return f'{inst}\n{inp}'
+    if inst and inp:
+        return f'{inst}\n{inp}'
+    else:
+        return inst or inp
+
+
+class AlpacaZhPreprocessor(AlpacaPreprocessor):
+
+    @classmethod
+    def concat_inst_input(cls, instruction, input_):
+        if input_ and input_.startswith('输入：'):
+            input_ = input_[3:]
+        return super().concat_inst_input(instruction, input_)
 
 
 register_dataset(
     DatasetMeta(
         ms_dataset_id='AI-ModelScope/alpaca-gpt4-data-zh',
         hf_dataset_id='llm-wizard/alpaca-gpt4-data-zh',
-        preprocess_func=AlpacaPreprocessor(concat_inst_input=_concat_inst_inp_alpaca_zh),
+        preprocess_func=AlpacaZhPreprocessor(),
         tags=['chat', 'general', '🔥'],
     ))
 
diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py
index 24c6fa73a6..825c0ba1f3 100644
--- a/swift/llm/dataset/preprocessor/core.py
+++ b/swift/llm/dataset/preprocessor/core.py
@@ -312,18 +312,18 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
 
 class AlpacaPreprocessor(ResponsePreprocessor):
 
-    def __init__(self,
-                 *,
-                 concat_inst_input: Union[Callable[[str, str], str]] = '\n',
-                 columns_mapping: Optional[Dict[str, str]] = None,
-                 **kwargs) -> None:
-        """Alpaca format preprocessor
-
-        Args:
-            concat_inst_input: The concat sep between instruction and input
-        """
+    def __init__(self, *, columns_mapping: Optional[Dict[str, str]] = None, **kwargs) -> None:
+        """Alpaca format preprocessor"""
         super().__init__(columns_mapping=columns_mapping, **kwargs)
-        self.concat_inst_input = concat_inst_input
+
+    @classmethod
+    def concat_inst_input(cls, instruction, input_):
+        if instruction and input_:
+            query = f'{instruction}\n{input_}'
+        else:
+            query = instruction or input_
+        assert isinstance(query, str), f'query: {query}'
+        return query
 
     def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         instruction = row.pop('instruction', None)
@@ -331,15 +331,7 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
         output = row.pop('output', None)
         if output is not None:
             row['response'] = output
-
-        if instruction is not None or input_ is not None:
-            instruction = instruction or ''
-            input_ = input_ or ''
-            if isinstance(self.concat_inst_input, str):
-                query = instruction + self.concat_inst_input + input_
-            else:
-                query = self.concat_inst_input(instruction, input_)
-            row['query'] = query
+        row['query'] = self.concat_inst_input(instruction, input_)
         return super().preprocess(row)
 
 

From 3998b7735b2a96dc15a5d26feaa1538a885644cb Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 26 Dec 2024 13:47:53 +0800
Subject: [PATCH 2/5] remove _concat_inst_inp_alpaca_zh helper superseded by AlpacaZhPreprocessor

---
 swift/llm/dataset/dataset/llm.py | 10 ----------
 1 file changed, 10 deletions(-)

diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py
index eb06e14b23..e82e982b98 100644
--- a/swift/llm/dataset/dataset/llm.py
+++ b/swift/llm/dataset/dataset/llm.py
@@ -8,16 +8,6 @@
                             RowPreprocessor, TextGenerationPreprocessor)
 from ..register import DatasetMeta, SubsetDataset, register_dataset
 
-
-def _concat_inst_inp_alpaca_zh(inst: str, inp: str) -> str:
-    if inp.startswith('输入：'):
-        inp = inp[3:]
-    if inst and inp:
-        return f'{inst}\n{inp}'
-    else:
-        return inst or inp
-
-
 class AlpacaZhPreprocessor(AlpacaPreprocessor):
 
     @classmethod

From eb53b6c36e16f00cd3dfc16771ea65a10220ad82 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 26 Dec 2024 13:49:08 +0800
Subject: [PATCH 3/5] restore blank line before AlpacaZhPreprocessor and test alpaca datasets in test_sft

---
 swift/llm/dataset/dataset/llm.py | 1 +
 tests/general/test_dataset.py    | 7 +++++--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/swift/llm/dataset/dataset/llm.py b/swift/llm/dataset/dataset/llm.py
index e82e982b98..56af8a47e6 100644
--- a/swift/llm/dataset/dataset/llm.py
+++ b/swift/llm/dataset/dataset/llm.py
@@ -8,6 +8,7 @@
                             RowPreprocessor, TextGenerationPreprocessor)
 from ..register import DatasetMeta, SubsetDataset, register_dataset
 
+
 class AlpacaZhPreprocessor(AlpacaPreprocessor):
 
     @classmethod
diff --git a/tests/general/test_dataset.py b/tests/general/test_dataset.py
index cf4da83127..371401fbec 100644
--- a/tests/general/test_dataset.py
+++ b/tests/general/test_dataset.py
@@ -15,8 +15,11 @@ def test_sft():
     # _test_dataset(['AI-ModelScope/Duet-v0.5'])
     # _test_dataset(['swift/SlimOrca', 'swift/cosmopedia-100k'])
     # _test_dataset(['OmniData/Zhihu-KOL-More-Than-100-Upvotes'])
-    _test_dataset(['OmniData/Zhihu-KOL'])
-    # _test_dataset(['AI-ModelScope/alpaca-gpt4-data-zh#1000', 'AI-ModelScope/alpaca-gpt4-data-en#200'])
+    # _test_dataset(['OmniData/Zhihu-KOL'])
+    _test_dataset([
+        'AI-ModelScope/alpaca-gpt4-data-zh#1000', 'AI-ModelScope/alpaca-gpt4-data-en#1000',
+        'AI-ModelScope/LongAlpaca-12k#1000'
+    ])
     # _test_dataset(['swift/Infinity-Instruct:all'])
     # _test_dataset(['swift/sharegpt:all'])
     # _test_dataset(['AI-ModelScope/sharegpt_gpt4:all'])

From c10033db07fbd973a8ea2a923344f94a6abe04ed Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 26 Dec 2024 14:18:27 +0800
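Subject: [PATCH 4/5] export git_clone_github, normalize trailing .git in the URL, and clone the dataset in test_local_dataset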

---
 swift/llm/__init__.py        | 5 +++--
 swift/llm/model/__init__.py  | 2 +-
 swift/llm/model/utils.py     | 5 +++--
 tests/general/test_stream.py | 9 ++++-----
 4 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/swift/llm/__init__.py b/swift/llm/__init__.py
index 1c50131738..2173ce93b2 100644
--- a/swift/llm/__init__.py
+++ b/swift/llm/__init__.py
@@ -18,7 +18,8 @@
     from .model import (register_model, MODEL_MAPPING, ModelType, get_model_tokenizer, safe_snapshot_download,
                         HfConfigFactory, ModelInfo, ModelMeta, ModelKeys, register_model_arch, MultiModelKeys,
                         ModelArch, get_model_arch, MODEL_ARCH_MAPPING, get_model_info_meta, get_model_name, ModelGroup,
-                        Model, get_model_tokenizer_with_flash_attn, get_model_tokenizer_multimodal, load_by_unsloth)
+                        Model, get_model_tokenizer_with_flash_attn, get_model_tokenizer_multimodal, load_by_unsloth,
+                        git_clone_github)
     from .dataset import (AlpacaPreprocessor, ResponsePreprocessor, MessagesPreprocessor, AutoPreprocessor,
                           DATASET_MAPPING, MediaResource, register_dataset, register_dataset_info, EncodePreprocessor,
                           LazyLLMDataset, ConstantLengthDataset, standard_keys, load_dataset, DATASET_TYPE,
@@ -51,7 +52,7 @@
             'ModelInfo', 'ModelMeta', 'ModelKeys', 'register_model_arch', 'MultiModelKeys', 'ModelArch',
             'MODEL_ARCH_MAPPING', 'get_model_arch', 'get_model_info_meta', 'get_model_name', 'register_model',
             'ModelGroup', 'Model', 'get_model_tokenizer_with_flash_attn', 'get_model_tokenizer_multimodal',
-            'load_by_unsloth'
+            'load_by_unsloth', 'git_clone_github'
         ],
         'dataset': [
             'AlpacaPreprocessor', 'ClsPreprocessor', 'ComposePreprocessor', 'MessagesPreprocessor', 'DATASET_MAPPING',
diff --git a/swift/llm/model/__init__.py b/swift/llm/model/__init__.py
index d0db4befba..754d715207 100644
--- a/swift/llm/model/__init__.py
+++ b/swift/llm/model/__init__.py
@@ -6,4 +6,4 @@
                        get_default_torch_dtype, get_model_info_meta, get_model_name, get_model_tokenizer,
                        get_model_tokenizer_multimodal, get_model_tokenizer_with_flash_attn, get_model_with_value_head,
                        load_by_unsloth, register_model)
-from .utils import HfConfigFactory, ModelInfo, safe_snapshot_download
+from .utils import HfConfigFactory, ModelInfo, git_clone_github, safe_snapshot_download
diff --git a/swift/llm/model/utils.py b/swift/llm/model/utils.py
index 7195b266e7..efacbc91fb 100644
--- a/swift/llm/model/utils.py
+++ b/swift/llm/model/utils.py
@@ -274,6 +274,8 @@ def git_clone_github(github_url: str,
                      local_repo_name: Optional[str] = None,
                      branch: Optional[str] = None,
                      commit_hash: Optional[str] = None) -> str:
+    if github_url.endswith('.git'):
+        github_url = github_url[:-4]
     git_cache_dir = os.path.join(get_cache_dir(), '_github')
     os.makedirs(git_cache_dir, exist_ok=True)
     if local_repo_name is None:
@@ -282,8 +284,7 @@ def git_clone_github(github_url: str,
     local_repo_path = os.path.join(git_cache_dir, local_repo_name)
     with safe_ddp_context(hash_id=local_repo_path):
         if not os.path.exists(local_repo_path):
-            if not github_url.endswith('.git'):
-                github_url = f'{github_url}.git'
+            github_url = f'{github_url}.git'
             command = ['git', '-C', git_cache_dir, 'clone', github_url, local_repo_name]
             command_str = f"git -C '{git_cache_dir}' clone '{github_url}' {local_repo_name}"
             if branch is not None:
diff --git a/tests/general/test_stream.py b/tests/general/test_stream.py
index 08828d12f3..ad20696227 100644
--- a/tests/general/test_stream.py
+++ b/tests/general/test_stream.py
@@ -3,11 +3,10 @@
 
 def test_local_dataset():
     # please use git clone
-    local_dataset = '/mnt/nas2/huangjintao.hjt/work/datasets/swift-sft-mixture:firefly#100'
-    dataset = load_dataset(datasets=[local_dataset], streaming=True)[0]
-    for i, x in enumerate(dataset):
-        pass
-    print(i, x)
+    from swift.llm import git_clone_github
+    model_dir = git_clone_github('https://www.modelscope.cn/datasets/swift/swift-sft-mixture.git')
+    dataset = load_dataset(datasets=[f'{model_dir}:firefly'], streaming=True)[0]
+    print(next(iter(dataset)))
 
 
 def test_hub_dataset():

From b99dcc72f9b3bb5604878eeba064af2c7eb5ea40 Mon Sep 17 00:00:00 2001
From: Jintao Huang <huangjintao.hjt@alibaba-inc.com>
Date: Thu, 26 Dec 2024 14:19:29 +0800
Subject: [PATCH 5/5] remove redundant AlpacaPreprocessor.__init__

---
 swift/llm/dataset/preprocessor/core.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/swift/llm/dataset/preprocessor/core.py b/swift/llm/dataset/preprocessor/core.py
index 825c0ba1f3..c3d692f4c3 100644
--- a/swift/llm/dataset/preprocessor/core.py
+++ b/swift/llm/dataset/preprocessor/core.py
@@ -312,10 +312,6 @@ def preprocess(self, row: Dict[str, Any]) -> Optional[Dict[str, Any]]:
 
 class AlpacaPreprocessor(ResponsePreprocessor):
 
-    def __init__(self, *, columns_mapping: Optional[Dict[str, str]] = None, **kwargs) -> None:
-        """Alpaca format preprocessor"""
-        super().__init__(columns_mapping=columns_mapping, **kwargs)
-
     @classmethod
     def concat_inst_input(cls, instruction, input_):
         if instruction and input_: