Merge dev branch #5022

Merged · 4 commits · Dec 20, 2023
modules/LoRA.py (4 additions, 1 deletion)

```diff
@@ -53,7 +53,10 @@ def add_lora_exllama(lora_names):
 
     lora_path = get_lora_path(lora_names[0])
     lora_config_path = lora_path / "adapter_config.json"
-    lora_adapter_path = lora_path / "adapter_model.bin"
+    for file_name in ["adapter_model.safetensors", "adapter_model.bin"]:
+        file_path = lora_path / file_name
+        if file_path.is_file():
+            lora_adapter_path = file_path
 
     logger.info("Applying the following LoRAs to {}: {}".format(shared.model_name, ', '.join([lora_names[0]])))
     if shared.model.__class__.__name__ == 'ExllamaModel':
```
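The new lookup checks for both adapter weight formats. Because each match reassigns the variable, `adapter_model.bin` takes precedence when both files are present. A standalone sketch of the same logic (the helper name is hypothetical, not part of this diff):

```python
from pathlib import Path

def find_adapter_file(lora_path: Path):
    # Hypothetical helper mirroring the loop above: later matches
    # overwrite earlier ones, so adapter_model.bin wins if both exist.
    adapter_path = None
    for file_name in ["adapter_model.safetensors", "adapter_model.bin"]:
        file_path = lora_path / file_name
        if file_path.is_file():
            adapter_path = file_path
    return adapter_path  # None when no adapter file is found
```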
modules/exllamav2.py (7 additions, 28 deletions)

```diff
@@ -1,4 +1,3 @@
-import random
 import traceback
 from pathlib import Path
 
@@ -10,7 +9,7 @@
     ExLlamaV2Config,
     ExLlamaV2Tokenizer
 )
-from exllamav2.generator import ExLlamaV2BaseGenerator, ExLlamaV2Sampler
+from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator
 
 from modules import shared
 from modules.logging_colors import logger
@@ -64,7 +63,7 @@ def from_pretrained(self, path_to_model):
         else:
             cache = ExLlamaV2Cache(model)
 
-        generator = ExLlamaV2BaseGenerator(model, cache, tokenizer)
+        generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
 
         result = self()
         result.model = model
@@ -115,41 +114,21 @@ def generate_with_streaming(self, prompt, state):
 
         ids = self.tokenizer.encode(prompt, add_bos=state['add_bos_token'], encode_special_tokens=True)
         ids = ids[:, -get_max_prompt_length(state):]
-        initial_len = ids.shape[-1]
 
         if state['auto_max_new_tokens']:
             max_new_tokens = state['truncation_length'] - ids.shape[-1]
         else:
             max_new_tokens = state['max_new_tokens']
 
-        # _gen_begin_base
-        self.cache.current_seq_len = 0
-        self.model.forward(ids[:, :-1], self.cache, input_mask=None, preprocess_only=True, loras=self.loras)
+        self.generator.begin_stream(ids, settings, loras=self.loras)
 
-        has_leading_space = False
         decoded_text = ''
         for i in range(max_new_tokens):
-            logits = self.model.forward(ids[:, -1:], self.cache, input_mask=None, loras=self.loras).float().cpu()
-            token, _, _ = ExLlamaV2Sampler.sample(logits, settings, ids, random.random(), self.tokenizer)
-            ids = torch.cat([ids, token], dim=1)
-
-            if i == 0 and self.tokenizer.tokenizer.id_to_piece(int(token)).startswith('▁'):
-                has_leading_space = True
-
-            decoded_text = self.tokenizer.decode(ids[:, initial_len:], decode_special_tokens=not state['skip_special_tokens'])[0]
-            if has_leading_space:
-                decoded_text = ' ' + decoded_text
-
-            # Check the partial unicode character
-            if chr(0xfffd) in decoded_text:
-                is_last = i == max_new_tokens - 1
-                is_stopping = token.item() == self.tokenizer.eos_token_id or shared.stop_everything
-                # If we are not at the end of the generation, we skip this token
-                if not (is_last or is_stopping):
-                    continue
-
-            if token.item() == self.tokenizer.eos_token_id or shared.stop_everything:
+            chunk, eos, _ = self.generator.stream()
+            if eos or shared.stop_everything:
                 break
 
+            decoded_text += chunk
             yield decoded_text
 
     def generate(self, prompt, state):
```
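This replaces the hand-rolled sampling loop (manual forward passes, the leading-space fix-up, and the partial-unicode buffering) with ExLlamaV2StreamingGenerator, which performs sampling and incremental decoding internally. A minimal sketch of the new pattern, assuming model, cache, tokenizer, and max_new_tokens are already set up as in from_pretrained() above:

```python
from exllamav2.generator import ExLlamaV2Sampler, ExLlamaV2StreamingGenerator

# model, cache and tokenizer are assumed to be loaded as in from_pretrained()
generator = ExLlamaV2StreamingGenerator(model, cache, tokenizer)
settings = ExLlamaV2Sampler.Settings()  # sampling parameters left at defaults here

ids = tokenizer.encode("Once upon a time", add_bos=True)
generator.begin_stream(ids, settings)

decoded_text = ''
for _ in range(max_new_tokens):
    # stream() returns (decoded text chunk, end-of-sequence flag, new token ids);
    # partial multi-byte characters are held back until they decode cleanly
    chunk, eos, _ = generator.stream()
    if eos:
        break

    decoded_text += chunk
```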
modules/models.py (2 additions, 6 deletions)

```diff
@@ -413,12 +413,8 @@ def ExLlamav2_HF_loader(model_name):
 
 
 def HQQ_loader(model_name):
-    try:
-        from hqq.core.quantize import HQQBackend, HQQLinear
-        from hqq.engine.hf import HQQModelForCausalLM
-    except ModuleNotFoundError:
-        logger.error("HQQ is not installed. You can install it with:\n\npip install hqq")
-        return None
+    from hqq.core.quantize import HQQBackend, HQQLinear
+    from hqq.engine.hf import HQQModelForCausalLM
 
     logger.info(f"Loading HQQ model with backend: {shared.args.hqq_backend}")
 
```
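With hqq now pinned in every requirements file (see below), the import guard is unnecessary; a missing package surfaces as an ordinary ImportError instead of a logged message. For reference, a minimal sketch of how these imports are typically used to load a pre-quantized checkpoint (the model path and backend choice are assumptions for illustration, not part of this diff):

```python
from hqq.core.quantize import HQQBackend, HQQLinear
from hqq.engine.hf import HQQModelForCausalLM

# Assumption: a local directory containing an HQQ-quantized model
model = HQQModelForCausalLM.from_quantized("models/my-hqq-model")
HQQLinear.set_backend(HQQBackend.PYTORCH)  # select the compute backend
```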
requirements.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```
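Pinning hqq in the base requirements, and identically in each platform variant below, is what makes the unconditional import in modules/models.py safe. A quick post-install sanity check, assuming an environment built from these files:

```python
from importlib.metadata import version

assert version("hqq") == "0.1.1.post1"
```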
requirements_amd.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```
requirements_amd_noavx2.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11; platform_system == "Windows" or python_version < "3.10" or python_version > "3.11" or platform_machine != "x86_64"
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```
requirements_apple_intel.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```
requirements_apple_silicon.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```
requirements_cpu_only.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```
requirements_cpu_only_noavx2.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```
requirements_noavx2.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11; platform_system != "Darwin" and platform_machine != "x86_64"
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```
requirements_nowheels.txt (1 addition)

```diff
@@ -4,6 +4,7 @@ datasets
 einops
 exllamav2==0.0.11
 gradio==3.50.*
+hqq==0.1.1.post1
 markdown
 numpy==1.24.*
 optimum==1.16.*
```