From aa39c0cdcc8c446bdee90241ad7085f305f0c1fd Mon Sep 17 00:00:00 2001 From: potassiummmm Date: Wed, 18 Dec 2024 17:54:23 +0800 Subject: [PATCH 1/2] fix version requirement of transformers pypi package and model list for codegen --- 3rdparty/llama.cpp | 2 +- setup_env.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/3rdparty/llama.cpp b/3rdparty/llama.cpp index 2ce860403..957b59d22 160000 --- a/3rdparty/llama.cpp +++ b/3rdparty/llama.cpp @@ -1 +1 @@ -Subproject commit 2ce86040364799c44a48bb5a8407351812045dc6 +Subproject commit 957b59d2207370cd5061dd1bb12d079aa267fbab diff --git a/setup_env.py b/setup_env.py index 844092934..fa92b5004 100644 --- a/setup_env.py +++ b/setup_env.py @@ -140,6 +140,9 @@ def setup_gguf(): def gen_code(): _, arch = system_info() + + llama3_f3_models = ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit", "Falcon3-1B-Instruct-1.58bit", "Falcon3-3B-Instruct-1.58bit", "Falcon3-7B-Instruct-1.58bit", "Falcon3-10B-Instruct-1.58bit"] + if arch == "arm64": if args.use_pretuned: pretuned_kernels = os.path.join("preset_kernels", get_model_name()) @@ -154,7 +157,7 @@ def gen_code(): shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini") if get_model_name() == "bitnet_b1_58-large": run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen") - elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]: + elif get_model_name() in llama3_f3_models: run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen") elif get_model_name() == "bitnet_b1_58-3B": run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen") @@ -170,7 +173,7 @@ def gen_code(): shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h") if get_model_name() == "bitnet_b1_58-large": run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen") - elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]: + elif get_model_name() in llama3_f3_models: run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen") elif get_model_name() == "bitnet_b1_58-3B": run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen") From 0a446952e18bf6ac9d2de624af7a229dcab931c8 Mon Sep 17 00:00:00 2001 From: potassiummmm Date: Wed, 18 Dec 2024 21:20:26 +0800 Subject: [PATCH 2/2] fix readme issue and -cnv option issue --- README.md | 5 +++-- run_inference.py | 3 ++- setup_env.py | 8 ++++---- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 4739214aa..8b6fce8a2 100644 --- a/README.md +++ b/README.md @@ -159,7 +159,7 @@ optional arguments: ### Basic usage ```bash # 
Run inference with the quantized model -python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -cnv "You are a helpful assistant" +python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv # Output: # Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary? @@ -167,7 +167,7 @@ python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gg ```
-usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE]
+usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE] [-cnv]
 
 Run inference
 
@@ -185,6 +185,7 @@ optional arguments:
                         Size of the prompt context
   -temp TEMPERATURE, --temperature TEMPERATURE
                         Temperature, a hyperparameter that controls the randomness of the generated text
+  -cnv, --conversation  Whether to enable chat mode or not (for instruct models)
 
### Benchmark

diff --git a/run_inference.py b/run_inference.py
index 75a724660..f3ab727b6 100644
--- a/run_inference.py
+++ b/run_inference.py
@@ -31,8 +31,9 @@ def run_inference():
         '-c', str(args.ctx_size),
         '--temp', str(args.temperature),
         "-b", "1",
-        "-cnv" if args.conversation else ""
     ]
+    if args.conversation:
+        command.append("-cnv")
     run_command(command)
 
 def signal_handler(sig, frame):
diff --git a/setup_env.py b/setup_env.py
index fa92b5004..4b380133a 100644
--- a/setup_env.py
+++ b/setup_env.py
@@ -27,19 +27,19 @@
         "model_name": "Falcon3-7B-1.58bit",
     },
     "tiiuae/Falcon3-10B-Instruct-1.58bit": {
-        "model_name": "Falcon3-10B-1.58bit",
+        "model_name": "Falcon3-10B-Instruct-1.58bit",
     },
     "tiiuae/Falcon3-10B-1.58bit": {
         "model_name": "Falcon3-10B-1.58bit",
     },
     "tiiuae/Falcon3-3B-Instruct-1.58bit": {
-        "model_name": "Falcon3-3B-1.58bit",
+        "model_name": "Falcon3-3B-Instruct-1.58bit",
     },
     "tiiuae/Falcon3-3B-1.58bit": {
         "model_name": "Falcon3-3B-1.58bit",
     },
     "tiiuae/Falcon3-1B-Instruct-1.58bit": {
-        "model_name": "Falcon3-1B-1.58bit",
+        "model_name": "Falcon3-1B-Instruct-1.58bit",
     },
 }
 
@@ -141,7 +141,7 @@ def setup_gguf():
 def gen_code():
     _, arch = system_info()
 
-    llama3_f3_models = ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit", "Falcon3-1B-Instruct-1.58bit", "Falcon3-3B-Instruct-1.58bit", "Falcon3-7B-Instruct-1.58bit", "Falcon3-10B-Instruct-1.58bit"]
+    llama3_f3_models = {model['model_name'] for model in SUPPORTED_HF_MODELS.values() if model['model_name'].startswith(("Falcon3", "Llama"))}
 
     if arch == "arm64":
         if args.use_pretuned:
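
Below is a minimal standalone sketch of the two patterns the second patch settles on: deriving the codegen model list from the `SUPPORTED_HF_MODELS` table instead of hardcoding it, and appending `-cnv` to the command only when conversation mode is requested (the replaced conditional expression otherwise left an empty-string element in the argv list passed to the llama.cpp binary). The model table here is an abbreviated, assumed subset of the real one in setup_env.py, and the binary path is illustrative only:

```python
# Standalone sketch; SUPPORTED_HF_MODELS is an assumed subset of the
# full table in setup_env.py, not the real thing.
SUPPORTED_HF_MODELS = {
    "HF1BitLLM/Llama3-8B-1.58-100B-tokens": {"model_name": "Llama3-8B-1.58-100B-tokens"},
    "tiiuae/Falcon3-7B-Instruct-1.58bit": {"model_name": "Falcon3-7B-Instruct-1.58bit"},
    "1bitLLM/bitnet_b1_58-large": {"model_name": "bitnet_b1_58-large"},
}

# Derive the shared-codegen model list from the table so newly supported
# Falcon3/Llama variants are picked up automatically.
llama3_f3_models = {
    entry["model_name"]
    for entry in SUPPORTED_HF_MODELS.values()
    if entry["model_name"].startswith(("Falcon3", "Llama"))
}
# BitNet models keep their own kernel configs and are excluded.
assert "bitnet_b1_58-large" not in llama3_f3_models

def build_command(conversation: bool) -> list[str]:
    # Append "-cnv" only when requested; a conditional expression that
    # falls back to "" would hand the binary a spurious empty argument.
    command = ["build/bin/llama-cli", "-b", "1"]  # illustrative binary path
    if conversation:
        command.append("-cnv")
    return command

print(build_command(conversation=True))   # [..., '-b', '1', '-cnv']
print(build_command(conversation=False))  # [..., '-b', '1']
```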