# AI agent prompt design for llama_cpp_canister

# Setup

## Verify we're in the Conda environment

In [None]:
import sys

# Show which Python interpreter this kernel is running; the path should
# point inside the Conda environment created for this project.
print(sys.executable)

## Import python packages

In [None]:
import os
import sys
import json
import base64
import io
from dotenv import load_dotenv
import requests
import pprint
from pathlib import Path
import subprocess
import jupyter_black
import textwrap

from run_llama_cpp import run_llama_cpp

# Activate the jupyter_black extension, which reformats code cells with black.
# Project page: https://github.com/n8henrie/jupyter-black
jupyter_black.load()

## Before running this notebook, build regular llama.cpp

To build llama.cpp, follow the instructions in the README at:

https://github.com/ggml-org/llama.cpp

```bash
cd ../../  # sibling directory to llama_cpp_canister
git clone git@github.com:ggml-org/llama.cpp.git ggml_org_llama_615212.cpp  
cd ggml_org_llama_615212.cpp
git checkout 615212
cmake -B build
cmake --build build --config Release -j 8
```

Then, define LLAMA_CLI_PATH as the location of `llama-cli`, relative to this notebook:

In [None]:
# llama.cpp git sha 615212 is used by the current version of llama_cpp_canister
LLAMA_CLI_PATH = "../../ggml_org_llama_615212.cpp/build/bin/llama-cli"

# llama.cpp git sha b841d0 was the previous version used by llama_cpp_canister
# LLAMA_CLI_PATH = "../../ggml_org_llama_b841d0.cpp/llama-cli"


# ####################################################################### #
# Select the MODEL_TYPE and MODEL (location is relative to this notebook) #
# ####################################################################### #

# Generation settings. The uncommented values are forwarded positionally to
# run_llama_cpp(...) at the bottom of this cell; the commented-out entries are
# additional sampler options that are currently not forwarded.
seed = 42
num_tokens = 1024
temp = 0.6
# top_k = 50
# top_p = 0.95
# min_p = 0.05
# tfs = 0.9
# typical = 0.9
# mirostat = 2
# mirostat_lr = 0.1
# mirostat_ent = 5.0
repeat_penalty = 1.1

# Notes:
#                                     <not quantized>|<         quantized                >
#  --cache-type-k has allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
#  --cache-type-v is not tested because that requires a GPU,
#                 which is not available right now in an Internet Computer canister


# ------------------------------------------------------------------------------------------
# 135 Million parameters

# https://huggingface.co/tensorblock/SmolLM2-135M-Instruct-GGUF
# MODEL_TYPE = "SmolLM2"
# MODEL="../models/tensorblock/SmolLM2-135M-Instruct-GGUF/SmolLM2-135M-Instruct-Q8_0.gguf"
# cache_type_k = "f16"

# ------------------------------------------------------------------------------------------
# 163 Million parameters

# https://huggingface.co/tensorblock/gpt2-GGUF (124M)
# MODEL_TYPE = "gpt2"
# MODEL = "../models/tensorblock/gpt2-GGUF/gpt2-Q8_0.gguf"
# cache_type_k = "f16"

# ------------------------------------------------------------------------------------------
# 630 Million parameters

# https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF
# MODEL_TYPE = "Qwen"
# MODEL = "../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q4_k_m.gguf"
# cache_type_k = "f16"

# ACTIVE SELECTION: the only uncommented MODEL_TYPE / MODEL / cache_type_k in
# this cell. MODEL_TYPE picks the prompt template further down; MODEL and
# cache_type_k are passed to run_llama_cpp(...).
MODEL_TYPE = "Qwen"
MODEL = "../models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf"
cache_type_k = "q8_0"


# ------------------------------------------------------------------------------------------
# 1.24 Billion parameters

# https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF
# MODEL_TYPE = "Llama-3.2"
# MODEL = "../models/unsloth/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
# cache_type_k = "q5_0"

# ------------------------------------------------------------------------------------------
# 1.78 Billion parameters

# WORK-IN-PROGRESS...

# https://huggingface.co/Qwen/Qwen2.5-1.5B-Instruct-GGUF
# MODEL_TYPE = "Qwen"
# MODEL = "../models/Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q4_k_m.gguf"
# cache_type_k = "q8_0"

# MODEL_TYPE = "Qwen"
# MODEL = "../models/Qwen/Qwen2.5-1.5B-Instruct-GGUF/qwen2.5-1.5b-instruct-q8_0.gguf"
# cache_type_k = "q8_0"

# ------------------------------------------------------------------------------------------
# 1.78 Billion parameters

# WORK-IN-PROGRESS...

# https://huggingface.co/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF
# MODEL_TYPE = "DeepSeek-R1-Distill-Qwen"
# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K.gguf"
# cache_type_k = "q5_0"

# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q2_K_L.gguf"
# cache_type_k = "q5_0"

# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q3_K_M.gguf"
# cache_type_k = "q5_0"

# MODEL = "../models/unsloth/DeepSeek-R1-Distill-Qwen-1.5B-GGUF/DeepSeek-R1-Distill-Qwen-1.5B-Q4_K_M.gguf"
# cache_type_k = "q5_0"

# ------------------------------------------------------------------------------------------
# MODEL_TYPE = "Llama-3.2"

# https://huggingface.co/unsloth/Llama-3.2-3B-Instruct-GGUF
# MODEL = "../models/unsloth/Llama-3.2-3B-Instruct-GGUF/Llama-3.2-3B-Instruct-Q2_K.gguf"

# Build the prompt for the selected MODEL_TYPE by wrapping `question` in that
# model family's template, then print it (wrapped at 80 columns) for review.
prompt = ""
question = "give me a short introduction to LLMs."
# question = "What is the Internet Computer Protocol?"
# question = "What is a blockchain?"
# question = "What is the term for a blockchain that operates in parallel with other blockchains, allowing for cross-chain transactions?"
# question = "Who invented the telescope?"
# question = "Where does a butterfly emerge from?"
# question = "What is 1+1?"
# question = "When is Bitcoin first released?"
if MODEL_TYPE == "gpt2":
    # No chat template: the question is sent as plain text.
    # NOTE(review): a "." is appended even when the question already ends with
    # punctuation - confirm this is intended.
    prompt = f"{question}."
elif MODEL_TYPE in ("SmolLM2", "Qwen"):
    # SmolLM2 and Qwen share the same <|im_start|>/<|im_end|> template.
    prompt = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
elif MODEL_TYPE == "Llama-3.2":
    system_prompt = ""
    prompt = f"<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{question}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
elif MODEL_TYPE == "DeepSeek-R1-Distill-Qwen":
    # From: https://deepinfra.com/deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
    #
    # Usage Recommendations
    #
    # We recommend adhering to the following configurations when utilizing the
    # DeepSeek-R1 series models, including benchmarking, to achieve the
    # expected performance:
    #
    # 1. Set the temperature within the range of 0.5-0.7 (0.6 is recommended)
    #    to prevent endless repetitions or incoherent outputs.
    # 2. Avoid adding a system prompt; all instructions should be contained
    #    within the user prompt.
    # 3. For mathematical problems, it is advisable to include a directive in
    #    your prompt such as: "Please reason step by step, and put your final
    #    answer within \boxed{}."
    # 4. When evaluating model performance, it is recommended to conduct
    #    multiple tests and average the results.
    #
    # NOTE(review): a "." is appended even when the question already ends with
    # punctuation - confirm this is intended.
    prompt = f"<｜User｜>{question}. <｜Assistant｜>"
else:
    print(f"Model type {MODEL_TYPE} not recognized")
    # Use sys.exit rather than the interactive-only exit() helper: exit() is
    # injected by the site module / IPython for shell use and, inside a Jupyter
    # kernel, does not treat its argument as an exit status. sys.exit(1) raises
    # SystemExit(1), which stops this cell before run_llama_cpp is reached.
    sys.exit(1)

print("\nprompt:\n", textwrap.fill(prompt, width=80))

# Run llama-cli on the prompt built above with the selected model and settings.
# NOTE(review): all arguments are positional, so their order must match the
# signature of run_llama_cpp (imported from run_llama_cpp.py, not shown here).
# The commented-out sampler options presumably map to further parameters of
# that function - confirm against its signature before re-enabling any of them.
run_llama_cpp(
    LLAMA_CLI_PATH,
    MODEL,
    prompt,
    num_tokens,
    seed,
    temp,
    # top_k,
    # top_p,
    # min_p,
    # tfs,
    # typical,
    # mirostat,
    # mirostat_lr,
    # mirostat_ent,
    repeat_penalty,
    cache_type_k,
)