Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,15 +159,15 @@ optional arguments:
### Basic usage
```bash
# Run inference with the quantized model
python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -cnv "You are a helpful assistant"
python run_inference.py -m models/Falcon3-7B-Instruct-1.58bit/ggml-model-i2_s.gguf -p "You are a helpful assistant" -cnv

# Output:
# Daniel went back to the the the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?
# Answer: Mary is in the garden.

```
<pre>
usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE]
usage: run_inference.py [-h] [-m MODEL] [-n N_PREDICT] -p PROMPT [-t THREADS] [-c CTX_SIZE] [-temp TEMPERATURE] [-cnv]

Run inference

Expand All @@ -185,6 +185,7 @@ optional arguments:
Size of the prompt context
-temp TEMPERATURE, --temperature TEMPERATURE
Temperature, a hyperparameter that controls the randomness of the generated text
  -cnv, --conversation  Whether to enable chat mode or not (for instruct models).
</pre>

### Benchmark
Expand Down
3 changes: 2 additions & 1 deletion run_inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ def run_inference():
'-c', str(args.ctx_size),
'--temp', str(args.temperature),
"-b", "1",
"-cnv" if args.conversation else ""
]
if args.conversation:
command.append("-cnv")
run_command(command)

def signal_handler(sig, frame):
Expand Down
13 changes: 8 additions & 5 deletions setup_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,19 @@
"model_name": "Falcon3-7B-1.58bit",
},
"tiiuae/Falcon3-10B-Instruct-1.58bit": {
"model_name": "Falcon3-10B-1.58bit",
"model_name": "Falcon3-10B-Instruct-1.58bit",
},
"tiiuae/Falcon3-10B-1.58bit": {
"model_name": "Falcon3-10B-1.58bit",
},
"tiiuae/Falcon3-3B-Instruct-1.58bit": {
"model_name": "Falcon3-3B-1.58bit",
"model_name": "Falcon3-3B-Instruct-1.58bit",
},
"tiiuae/Falcon3-3B-1.58bit": {
"model_name": "Falcon3-3B-1.58bit",
},
"tiiuae/Falcon3-1B-Instruct-1.58bit": {
"model_name": "Falcon3-1B-1.58bit",
"model_name": "Falcon3-1B-Instruct-1.58bit",
},
}

Expand Down Expand Up @@ -140,6 +140,9 @@ def setup_gguf():

def gen_code():
_, arch = system_info()

llama3_f3_models = set([model['model_name'] for model in SUPPORTED_HF_MODELS.values() if model['model_name'].startswith("Falcon3") or model['model_name'].startswith("Llama")])

if arch == "arm64":
if args.use_pretuned:
pretuned_kernels = os.path.join("preset_kernels", get_model_name())
Expand All @@ -154,7 +157,7 @@ def gen_code():
shutil.copyfile(os.path.join(pretuned_kernels, "kernel_config_tl2.ini"), "include/kernel_config.ini")
if get_model_name() == "bitnet_b1_58-large":
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "128,64,128", "--bm", "32,64,32"], log_step="codegen")
elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
elif get_model_name() in llama3_f3_models:
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "128,64,128,64", "--bm", "32,64,32,64"], log_step="codegen")
elif get_model_name() == "bitnet_b1_58-3B":
run_command([sys.executable, "utils/codegen_tl1.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "64,128,64", "--bm", "32,64,32"], log_step="codegen")
Expand All @@ -170,7 +173,7 @@ def gen_code():
shutil.copyfile(os.path.join(pretuned_kernels, "bitnet-lut-kernels-tl2.h"), "include/bitnet-lut-kernels.h")
if get_model_name() == "bitnet_b1_58-large":
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-large", "--BM", "256,128,256", "--BK", "96,192,96", "--bm", "32,32,32"], log_step="codegen")
elif get_model_name() in ["Llama3-8B-1.58-100B-tokens", "Falcon3-7B-1.58bit", "Falcon3-10B-1.58bit", "Falcon3-3B-1.58bit", "Falcon3-1B-1.58bit"]:
elif get_model_name() in llama3_f3_models:
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "Llama3-8B-1.58-100B-tokens", "--BM", "256,128,256,128", "--BK", "96,96,96,96", "--bm", "32,32,32,32"], log_step="codegen")
elif get_model_name() == "bitnet_b1_58-3B":
run_command([sys.executable, "utils/codegen_tl2.py", "--model", "bitnet_b1_58-3B", "--BM", "160,320,320", "--BK", "96,96,96", "--bm", "32,32,32"], log_step="codegen")
Expand Down