# IConfucius prompt design for llama_cpp_canister

# Setup

## Verify we're in the Conda environment

In [10]:
import sys

print(sys.executable)

/opt/miniconda3/envs/IConfucius/bin/python


## Import python packages

In [11]:
import os
import sys
import json
import base64
import io

# from dotenv import load_dotenv
# import requests
import pprint
from pathlib import Path
import subprocess
import jupyter_black
import textwrap

import random

from run_llama_cpp import run_llama_cpp

# Activate the jupyter_black extension, which reformats code cells with black
# https://github.com/n8henrie/jupyter-black
jupyter_black.load()

In [None]:
# ################################################################ #
# First build regular llama.cpp, and then specify the path to it   #
#                                                                  #
# To build llama.cpp, follow the instructions in the README at:    #
# https://github.com/ggml-org/llama.cpp                            #
#                                                                  #
# Define where the llama-cli is located, relative to this notebook #
# ################################################################ #

# Using the latest version of llama.cpp
# LLAMA_CLI_PATH = "../../ggml_org_llama_latest.cpp/build/bin/llama-cli"

# lama.cpp git sha 615212 is used by current version of llama_cpp_canister
LLAMA_CLI_PATH = "../../ggml_org_llama_615212.cpp/build/bin/llama-cli"

# lama.cpp git sha b841d0 was previous version used by llama_cpp_canister
# LLAMA_CLI_PATH = "../../ggml_org_llama_b841d0.cpp/llama-cli"


# ####################################################################### #
# Select the MODEL_TYPE and MODEL (location is relative to this notebook) #
# ####################################################################### #

seed = random.randint(0, 10000000)
num_tokens = 1024
temp = 0.7
# top_k = 50
# top_p = 0.95
# min_p = 0.05
# tfs = 0.9
# typical = 0.9
# mirostat = 2
# mirostat_lr = 0.1
# mirostat_ent = 5.0
repeat_penalty = 1.1

# Notes:
#                                     <not quantized>|<         quantized                >
#  --cache-type-k has allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1
#  --cache-type-v is not tested because that requires a GPU,
#                 which is not available right now in an Internet Computer canister

# ------------------------------------------------------------------------------------------
# 163 Million parameters

# https://huggingface.co/tensorblock/gpt2-GGUF (124M)
# MODEL_TYPE = "gpt2"
# MODEL = "../llms/llama_cpp_canister/models/tensorblock/gpt2-GGUF/gpt2-Q8_0.gguf"
# cache_type_k = "f16"

# ------------------------------------------------------------------------------------------
# 630 Million parameters

# https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF
# MODEL_TYPE = "Qwen"
# MODEL = "../llms/llama_cpp_canister/models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q4_k_m.gguf"
# cache_type_k = "f16"

MODEL_TYPE = "Qwen"
MODEL = "../llms/llama_cpp_canister/models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf"
cache_type_k = "q8_0"


# ------------------------------------------------------------------------------------------
# 1.24 Billion parameters

# https://huggingface.co/unsloth/Llama-3.2-1B-Instruct-GGUF
# MODEL_TYPE = "Llama-3.2"
# MODEL = "../llms/llama_cpp_canister/models/unsloth/Llama-3.2-1B-Instruct-GGUF/Llama-3.2-1B-Instruct-Q4_K_M.gguf"
# cache_type_k = "q5_0"

# ------------------------------------------------------------------------------------------

prompt = ""
topic = "Crypto"
# topic = "Bitcoin"
system_prompt = "You are Confucius, the ancient philosopher. You finish quotes in a profound and compassionate manner."
user_prompt = f"Write a profound and thought proviking quote about {topic}. Provide only the quote, nothing else."

if MODEL_TYPE in ["SmolLM2", "Qwen"]:
    prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
elif MODEL_TYPE == "Llama-3.2":
    prompt = f"<|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n{user_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"
else:
    print(f"Model type {MODEL_TYPE} not recognized")
    exit(1)

print("\nprompt:\n", textwrap.fill(prompt, width=80))

run_llama_cpp(
    LLAMA_CLI_PATH,
    MODEL,
    prompt,
    num_tokens,
    seed,
    temp,
    # top_k,
    # top_p,
    # min_p,
    # tfs,
    # typical,
    # mirostat,
    # mirostat_lr,
    # mirostat_ent,
    repeat_penalty,
    cache_type_k,
)


prompt:
 <|im_start|>system You are Confucius, the ancient philosopher. You finish quotes
in a profound and compassionate manner.<|im_end|> <|im_start|>user Write a
profound and thought proviking quote about Crypto. Provide only the quote,
nothing else.<|im_end|> <|im_start|>assistant

Command:
 ../../ggml_org_llama_615212.cpp/build/bin/llama-cli -m ../llms/llama_cpp_canister/models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf --no-warmup -no-cnv -sp -n 1024 --seed 849021 --temp 0.8 --repeat-penalty 1.1 --cache-type-k q8_0 -p '<|im_start|>system\nYou are Confucius, the ancient philosopher. You finish quotes in a profound and compassionate manner.<|im_end|>\n<|im_start|>user\nWrite a profound and thought proviking quote about Crypto. Provide only the quote, nothing else.<|im_end|>\n<|im_start|>assistant\n'


build: 4531 (6152129d) with Apple clang version 16.0.0 (clang-1600.0.26.6) for x86_64-apple-darwin24.3.0
main: llama backend init
main: load the model and apply lora adapter, if any
llama_model_load_from_file_impl: using device Metal (AMD Radeon Pro 5500M) - 4080 MiB free
llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from ../llms/llama_cpp_canister/models/Qwen/Qwen2.5-0.5B-Instruct-GGUF/qwen2.5-0.5b-instruct-q8_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = qwen2
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = qwen2.5-0.5b-instruct
llama_model_loader: - kv   3:                            general.version str              = v0.1
llama_model_loa

<|im_start|>system
You are Confucius, the ancient philosopher. You finish quotes in a profound and compassionate manner.<|im_end|>
<|im_start|>user
Write a profound and thought proviking quote about Crypto. Provide only the quote, nothing else.<|im_end|>
<|im_start|>assistant
"Truly wise men must first understand that in the digital age, cryptography stands as a cornerstone of security and privacy."<|im_end|> [end of text]




llama_perf_sampler_print:    sampling time =      34.83 ms /    77 runs   (    0.45 ms per token,  2210.55 tokens per second)
llama_perf_context_print:        load time =    4371.85 ms
llama_perf_context_print: prompt eval time =    3774.99 ms /    52 tokens (   72.60 ms per token,    13.77 tokens per second)
llama_perf_context_print:        eval time =    2682.30 ms /    24 runs   (  111.76 ms per token,     8.95 tokens per second)
llama_perf_context_print:       total time =    7095.35 ms /    76 tokens
ggml_metal_free: deallocating
