Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,15 +159,15 @@ optional arguments:
### Basic usage
```bash
# Run inference with the quantized model
python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -cnv "You are a helpful assistant"
python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv

# Output:
# Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
# Answer: Mary is in the garden.

```
<pre>
usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE]
usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE] [-cnv]

Run inference

Expand All @@ -185,6 +185,7 @@ optional arguments:
Size of the prompt context
-temp TEMPERATURE, --temperature TEMPERATURE
Temperature, a hyperparameter that controls the randomness of the generated text
  -cnv, --conversation  Whether to enable chat mode or not (for instruct models).
</pre>

### Benchmark
Expand Down
3 changes: 2 additions & 1 deletion run_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def run_inference():
'-c', str(args.ctx_size),
'--temp', str(args.temperature),
"-b", "1",
"-cnv" if args.conversation else ""
]
if args.conversation:
command.append("-cnv")
run_command(command)

def signal_handler(sig, frame):
Expand Down
13 changes: 8 additions & 5 deletions setup_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,19 @@
"model_name": "Falcon3-7B-1.58bit",
},
"tiiuae/Falcon3-10B-Instruct-1.58bit": {
"model_name": "Falcon3-10B-1.58bit",
"model_name": "Falcon3-10B-Instruct-1.58bit",
},
"tiiuae/Falcon3-10B-1.58bit": {
"model_name": "Falcon3-10B-1.58bit",
},
"tiiuae/Falcon3-3B-Instruct-1.58bit": {
"model_name": "Falcon3-3B-1.58bit",
"model_name": "Falcon3-3B-Instruct-1.58bit",
},
"tiiuae/Falcon3-3B-1.58bit": {
"model_name": "Falcon3-3B-1.58bit",
},
"tiiuae/Falcon3-1B-Instruct-1.58bit": {
"model_name": "Falcon3-1B-1.58bit",
"model_name": "Falcon3-1B-Instruct-1.58bit",
},
}

Expand Down Expand Up @@ -140,6 +140,9 @@ def setup_gguf():

def gen_code():
_, arch = system_info()

llama3_f3_models = set([model['model_name'] for model in SUPPORTED_HF_MODELS.values() if model['model_name'].startswith("Falcon3") or model['model_name'].startswith("Llama")])

if arch == "arm64":
if args.use_pretuned:
pretuned_kernels = os.path.join("preset_kernels", get_model_name())
Expand All @@ -154,7 +157,7 @@ def gen_code():
shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini")
if get_model_name() == "bitnet_b1_58-large":
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
elif get_model_name() in llama3_f3_models:
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
elif get_model_name() == "bitnet_b1_58-3B":
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
Expand All @@ -170,7 +173,7 @@ def gen_code():
shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
if get_model_name() == "bitnet_b1_58-large":
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
elif get_model_name() in llama3_f3_models:
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
elif get_model_name() == "bitnet_b1_58-3B":
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
Expand Down