# AI agent prompt design for llama_cpp_canister

# Setup

## Verify we're in the Conda environment

In [1]:
import sys

# Print the path of the running Python interpreter; it should point inside the
# llama_cpp_canister Conda environment (compare with the cell output).
print(sys.executable)

/opt/miniconda3/envs/llama_cpp_canister/bin/python


## Import python packages

In [2]:
import os
import sys
import json
import base64
import io
from dotenv import load_dotenv
import requests
import pprint
from pathlib import Path
import subprocess
import jupyter_black
import textwrap
from calculate_sha256 import calculate_sha256

from run_llama_cpp import run_llama_cpp

# Activate the jupyter_black extension, which reformats code cells with black
# https://github.com/n8henrie/jupyter-black
# Loaded here, right after the imports, so it is set up before the cells below run.
jupyter_black.load()

## Before running this notebook, build regular llama.cpp

To build llama.cpp, follow the instructions in the README at:

https://github.com/ggml-org/llama.cpp

```bash
cd ../../  # parent directory of llama_cpp_canister, so the clone becomes its sibling
git clone git@github.com:ggml-org/llama.cpp.git ggml_org_llama_615212.cpp  
cd ggml_org_llama_615212.cpp
git checkout 615212
cmake -B build
cmake --build build --config Release -j 8
```

Then, define LLAMA_CLI_PATH as the location of `llama-cli`, relative to this notebook:

In [3]:
# llama.cpp git sha 615212 is used by the current version of llama_cpp_canister
LLAMA_CLI_PATH = "../../ggml_org_llama_615212.cpp/build/bin/llama-cli"

# llama.cpp git sha b841d0 was the previous version used by llama_cpp_canister
# LLAMA_CLI_PATH = "../../ggml_org_llama_b841d0.cpp/llama-cli"


# ####################################################################### #
# Select the MODEL_TYPE and MODEL (location is relative to this notebook) #
# ####################################################################### #

# Generation settings. The four active values (seed, num_tokens, temp,
# repeat_penalty) are passed positionally to run_llama_cpp() at the end of this
# cell and show up as --seed, -n, --temp and --repeat-penalty in the command
# echoed in the cell output.
seed = 42
num_tokens = 1024
# NOTE: temp is overridden to 0.5 in the DeepSeek-R1-Distill-Qwen prompt branch.
temp = 0.6
# The samplers below are disabled; their arguments are likewise commented out
# in the run_llama_cpp() call.
# top_k = 50
# top_p = 0.95
# min_p = 0.05
# tfs = 0.9
# typical = 0.9
# mirostat = 2
# mirostat_lr = 0.1
# mirostat_ent = 5.0
repeat_penalty = 1.1

# Notes:
#                                     <not quantized>|<         quantized                >
#  --cache-type-k has allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
#  --cache-type-v is not tested because that requires a GPU,
#                 which is not available right now in an Internet Computer canister


# ------------------------------------------------------------------------------------------
# 135 Million parameters

# https://huggingface.co/tensorblock/SmolLM2-135M-Instruct-GGUF
# MODEL_TYPE = "SmolLM2"
# MODEL="../models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q8_0.gguf"
# cache_type_k = "f16"

# ------------------------------------------------------------------------------------------
# 163 Million parameters

# https://huggingface.co/tensorblock/gpt2-GGUF (124M)
# MODEL_TYPE = "gpt2"
# MODEL = "../models/tensorblock/gpt2-GGUF/gpt2-Q8_0.gguf"
# cache_type_k = "f16"

# ------------------------------------------------------------------------------------------
# 630 Million parameters

# https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF
# MODEL_TYPE = "Qwen"
# MODEL = "../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q4_k_m.gguf"
# cache_type_k = "f16"

# This is our work-horse model !
# MODEL_TYPE picks the prompt template in the if/elif chain further down;
# MODEL is the GGUF file path, relative to this notebook.
MODEL_TYPE = "Qwen"
MODEL = "../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf"
cache_type_k = "q8_0"
# File details from https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/blob/main/qwen2.5-0.5b-instruct-q8_0.gguf
# MODEL_HF_SHA256 is compared against the hash of the local MODEL file below.
# It is the only checksum defined in this cell, so update it whenever a
# different MODEL is selected.
MODEL_HF_SHA256 = "ca59ca7f13d0e15a8cfa77bd17e65d24f6844b554a7b6c12e07a5f89ff76844e"
# Informational only; not referenced elsewhere in the visible part of this notebook.
MODEL_HF_FILESIZE = 676  # MB

# ------------------------------------------------------------------------------------------
# 1.24 Billion parameters

# https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF
# MODEL_TYPE = "Llama-3.2"
# MODEL = "../models/unsloth/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
# cache_type_k = "q5_0"

# ------------------------------------------------------------------------------------------
# 1.78 Billion parameters

# WORK-IN-PROGRESS...

# https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF
# MODEL_TYPE = "Qwen"
# MODEL = "../models/Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q4_k_m.gguf"
# cache_type_k = "q8_0"

# MODEL_TYPE = "Qwen"
# MODEL = "../models/Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
# cache_type_k = "q8_0"

# ------------------------------------------------------------------------------------------
# 1.78 Billion parameters

# https://huggingface.co/NexaAIDev/DeepSeek-R1-Distill-Qwen-1.5B-NexaQuant
# MODEL_TYPE = "DeepSeek-R1-Distill-Qwen"
# MODEL = "../models/NexaAIDev/DeepSeek-R1-Distill-Qwen-1.5B-NexaQuant/DeepSeek-R1-Distill-Qwen-1.5B-NexaQuant.gguf"
# cache_type_k = "f16"

# ------------------------------------------------------------------------------------------
# 1.78 Billion parameters

# https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF
# MODEL_TYPE = "DeepSeek-R1-Distill-Qwen"
# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf"
# cache_type_k = "q8_0"

# MODEL_TYPE = "DeepSeek-R1-Distill-Qwen"
# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K_L.gguf"
# cache_type_k = "q8_0"

# MODEL_TYPE = "DeepSeek-R1-Distill-Qwen"
# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf"
# cache_type_k = "q5_0"

# MODEL_TYPE = "DeepSeek-R1-Distill-Qwen"
# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
# cache_type_k = "f16"

# MODEL_TYPE = "DeepSeek-R1-Distill-Qwen"
# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q6_K.gguf"
# cache_type_k = "q8_0"

# MODEL_TYPE = "DeepSeek-R1-Distill-Qwen"
# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q8_0.gguf"
# cache_type_k = "f16"

# ------------------------------------------------------------------------------------------
# 3.0 Billion parameters

# NOT YET POSSIBLE TO RUN IN A CANISTER... WIP

# https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-GGUF
# MODEL_TYPE = "Llama-3.2"
# MODEL = "../models/unsloth/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q2_K.gguf"
# cache_type_k = "q4_0"


# ------------------------------------------------------------------------------------------
# Verify we're running the same model as the one from Hugging Face
# Calculate the SHA256 hash of the model file
# def calculate_sha256(file_path):
#     sha256_hash = hashlib.sha256()
#     with open(file_path, "rb") as f:
#         # Read and update hash string value in blocks of 4K
#         for byte_block in iter(lambda: f.read(4096), b""):
#             sha256_hash.update(byte_block)
#     return sha256_hash.hexdigest()

# Hash the local model file and report whether it equals the checksum recorded
# from Hugging Face. A mismatch is only reported; execution continues.
model_sha256 = calculate_sha256(MODEL)
if model_sha256 == MODEL_HF_SHA256:
    print(f"Model SHA256 match: {model_sha256} == {MODEL_HF_SHA256}")
else:
    print(f"Model SHA256 mismatch: {model_sha256} != {MODEL_HF_SHA256}")


# ------------------------------------------------------------------------------------------
# Questions to test the model

prompt = ""
# question = "give me a short introduction to LLMs."
# question = "What is the Internet Computer Protocol?"
# question = "What is a blockchain?"
# question = "What is the term for a blockchain that operates in parallel with other blockchains, allowing for cross-chain transactions?"
# question = "Who invented the telescope?"
# question = "Where does a butterfly emerge from?"
# question = "When is Bitcoin first released?"
#
# Math questions - the only one used for DeepSeek-R1-Distill-Qwen, which is trained for math only
# question = "What is 1+1?"
# answer to next question = 28.36
question = "What is 312 divided by 11?"

# Wrap the question in the prompt format used for the selected MODEL_TYPE.
if MODEL_TYPE == "gpt2":
    # Plain text prompt, no chat markers.
    prompt = f"{question}."
elif MODEL_TYPE in ("SmolLM2", "Qwen"):
    # Both model types use the identical <|im_start|>/<|im_end|> chat layout.
    prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
elif MODEL_TYPE == "Llama-3.2":
    system_prompt = ""
    prompt = f"<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
elif MODEL_TYPE == "DeepSeek-R1-Distill-Qwen":
    # From: https://deepinfra.com/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
    #
    # Usage Recommendations
    #
    # We recommend adhering to the following configurations when utilizing the DeepSeek-R1 series models,
    # including benchmarking, to achieve the expected performance:
    #
    # 1. Set the temperature within the range of 0.5-0.7 (0.6 is recommended) to prevent endless repetitions or incoherent outputs.
    # 2. Avoid adding a system prompt; all instructions should be contained within the user prompt.
    # 3. For mathematical problems, it is advisable to include a directive in your prompt such as: "Please reason step by step, and put your final answer within \boxed{}."
    # 4. When evaluating model performance, it is recommended to conduct multiple tests and average the results.
    #
    # Overrides the temperature set at the top of the cell for this model type only.
    temp = 0.5
    # prompt = f"<｜User｜>{question}<｜Assistant｜>"
    # prompt = (
    #     f"<｜User｜>"
    #     f"You are a helpful assistant. Think through the problem, but return **only** the final answer.\n"
    #     f"Strictly follow this format:\n\n"
    #     f"**Answer** [Final answer only, do not repeat the question or any part of the reasoning]\n\n"
    #     f"{question}"
    #     f"<｜Assistant｜>"
    # )

    # From https://huggingface.co/NexaAIDev/DeepSeek-R1-Distill-Qwen-1.5B-NexaQuant
    prompt = (
        "Provide step-by-step reasoning enclosed in <think> </think> tags, followed by the final answer enclosed in \\boxed{} tags.\n"
        f"{question}"
    )
    # Not so good...
    # prompt = (
    #     f"<｜User｜>"
    #     "Provide step-by-step reasoning enclosed in <think> </think> tags, followed by the final answer enclosed in \\boxed{} tags.\n"
    #     f"{question}\n"
    #     "<｜Assistant｜>"
    # )
    # Good
    # prompt = (
    #     f"You are a helpful assistant. Think through the problem, but return **only** the final answer.\n"
    #     f"Strictly follow this format:\n\n"
    #     f"**Answer** [Final answer only, do not repeat the question or any part of the reasoning]\n\n"
    #     f"{question}"
    # )
    # Bad
    # prompt = (
    #     "You are a helpful assistant. Think through the problem, but return **only** the final answer.\n"
    #     "Strictly follow this format:\n\n"
    #     "**Answer** [Final answer enclosed in \\boxed{} tags.]\n\n"
    #     f"{question}"
    # )

else:
    # Raise instead of calling exit(): inside a notebook kernel exit() requests a
    # kernel shutdown rather than aborting the cell, so the statements after this
    # chain could still run with an empty prompt.
    raise ValueError(f"Model type {MODEL_TYPE} not recognized")

# Echo the prompt wrapped to 80 columns. The wrapping is for display only; the
# unwrapped `prompt` is what is passed to run_llama_cpp() below.
print("\nprompt:\n", textwrap.fill(prompt, width=80))

# Run llama-cli on the selected model with the settings chosen above.
# All arguments are positional.
# NOTE(review): the commented-out sampler arguments mark where they would go in
# the call - confirm the order against run_llama_cpp's signature before
# re-enabling any of them.
run_llama_cpp(
    LLAMA_CLI_PATH,
    MODEL,
    prompt,
    num_tokens,
    seed,
    temp,
    # top_k,
    # top_p,
    # min_p,
    # tfs,
    # typical,
    # mirostat,
    # mirostat_lr,
    # mirostat_ent,
    repeat_penalty,
    cache_type_k,
)

Model SHA256 match: ca59ca7f13d0e15a8cfa77bd17e65d24f6844b554a7b6c12e07a5f89ff76844e == ca59ca7f13d0e15a8cfa77bd17e65d24f6844b554a7b6c12e07a5f89ff76844e

prompt:
 <|im_start|>system You are a helpful assistant.<|im_end|> <|im_start|>user What
is 312 divided by 11?<|im_end|> <|im_start|>assistant

Command:
 ../../ggml_org_llama_615212.cpp/build/bin/llama-cli -m ../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf --no-warmup -no-cnv -sp -n 1024 --seed 42 --temp 0.6 --repeat-penalty 1.1 --cache-type-k q8_0 -p '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nWhat is 312 divided by 11?<|im_end|>\n<|im_start|>assistant\n'


build: 4531 (6152129d) with Apple clang version 16.0.0 (clang-1600.0.26.6) for x86_64-apple-darwin24.3.0
main: llama backend init
main: load the model and apply lora adapter, if any
llama_model_load_from_file_impl: using device Metal (AMD Radeon Pro 5500M) - 4080 MiB free
llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from ../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = qwen2.5-0.5b-instruct
llama_model_loader: - kv   3:                            general.version str              = v0.1
llama_model_loader: - kv   4:          

<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is 312 divided by 11?<|im_end|>
<|im_start|>assistant
To solve the division problem \( \frac{312}{11} \), you can perform the operation directly:

\[ \frac{312}{11} = 28 \]

So, \( 312 \div 11 = 28 \).<|im_end|> [end of text]




llama_perf_sampler_print:    sampling time =     116.78 ms /    92 runs   (    1.27 ms per token,   787.82 tokens per second)
llama_perf_context_print:        load time =   24351.55 ms
llama_perf_context_print: prompt eval time =    2362.07 ms /    31 tokens (   76.20 ms per token,    13.12 tokens per second)
llama_perf_context_print:        eval time =    6988.14 ms /    60 runs   (  116.47 ms per token,     8.59 tokens per second)
llama_perf_context_print:       total time =   31482.59 ms /    91 tokens
ggml_metal_free: deallocating
