In [4]:
from llama_cpp import Llama
import pprint
import re
import time

In [7]:
# Load the local GGUF model through llama-cpp-python.
llama_options = {
    "model_path": "./models/Llama-3-ELYZA-JP-8B-q4_k_m.gguf",
    "n_gpu_layers": -1,  # -1 offloads all layers to the GPU (per llama-cpp-python docs)
    "main_gpu": 0,       # Use the first GPU
    "n_ctx": 1024,       # Context size, in tokens
    "n_threads": 4,      # Number of threads used by llama.cpp
}
llm = Llama(**llama_options)

llama_model_loader: loaded meta data with 22 key-value pairs and 291 tensors from ./models/Llama-3-ELYZA-JP-8B-q4_k_m.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = Llama-3-8B-optimal-merged-stage2
llama_model_loader: - kv   2:                          llama.block_count u32              = 32
llama_model_loader: - kv   3:                       llama.context_length u32              = 8192
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336
llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32
llama_model_loader: - kv   7:              llam

In [6]:
def jp_llm(prompt, **kwargs):
    """
    Ask the module-level ``llm`` a question and return its cleaned-up answer.

    The question is prefixed with a fixed background instruction asking for a
    friendly, concise answer in simple Japanese. As a side effect, the cleaned
    answer and the elapsed wall-clock time are printed.

    Args:
        prompt: The question text, appended after the background instruction.
        **kwargs: Extra keyword arguments forwarded to the ``llm`` call
            (for example ``temperature`` or ``top_p``). Values given here
            override the defaults ``max_tokens=100`` and
            ``stop=["。", "！", "？"]``.

    Returns:
        The text of the first completion choice, stripped of surrounding
        whitespace and with every run of whitespace collapsed to one space.
    """
    # perf_counter is monotonic, so the measured interval cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()

    # Add background information in prompt engineering
    # Alternative persona (disabled):
    # background_prompt = "You are a little girl doing a tourist guide in Fukuoka, Japan. Provide a friendly and concise answer to the question, and always answer in simple Japanese. Here is the question:"
    background_prompt = "You are a Japanese language model. Provide a friendly and concise answer to the question, and always answer in simple Japanese. Here is the question:"
    prompt_with_background = f"{background_prompt} {prompt}"

    # Defaults come first so that caller-supplied kwargs take precedence.
    generation_params = {
        "max_tokens": 100,  # Limit the response length
        "stop": ["。", "！", "？"],  # End generation at the first sentence-ending mark
        **kwargs,
    }
    response_with_background = llm(prompt_with_background, **generation_params)

    # Collapse newlines and repeated whitespace into single spaces
    # (\s already matches \n, so no separate newline class is needed).
    raw_text = response_with_background['choices'][0]['text']
    cleaned_response = re.sub(r'\s+', ' ', raw_text.strip())
    pprint.pprint(cleaned_response)
    print(f"Response time: {time.perf_counter() - start_time:.2f} seconds")
    return cleaned_response


question = "福岡は良い場所ですか？"
repeat_count = 10
pause_seconds = 1

# Initial call before the repeated runs.
jp_llm(prompt=question)

# Ask the same question repeatedly to compare the answers and response times.
for i in range(repeat_count):
    jp_llm(prompt=question)
    # NOTE(review): the original comment justified this pause as avoiding
    # rate limiting, but the model is loaded locally in this notebook —
    # confirm whether the pause is still needed.
    time.sleep(pause_seconds)

llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =   10157.42 ms /    37 tokens (  274.52 ms per token,     3.64 tokens per second)
llama_perf_context_print:        eval time =    1715.54 ms /     9 runs   (  190.62 ms per token,     5.25 tokens per second)
llama_perf_context_print:       total time =   11883.87 ms /    46 tokens
Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval


'福岡はとても良い場所です'
Response time: 11.89 seconds


llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    7149.12 ms /    34 runs   (  210.27 ms per token,     4.76 tokens per second)
llama_perf_context_print:       total time =    7191.02 ms /    35 tokens


'福岡は、美味しい食べ物や、活気のある街、ビーチや山などの自然もあり、すごくいい場所です'
Response time: 7.19 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    4218.23 ms /    23 runs   (  183.40 ms per token,     5.45 tokens per second)
llama_perf_context_print:       total time =    4242.66 ms /    24 tokens


'福岡はとても住みやすく、食べ物も美味しくて、良い場所です'
Response time: 4.25 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    1964.74 ms /    12 runs   (  163.73 ms per token,     6.11 tokens per second)
llama_perf_context_print:       total time =    1976.85 ms /    13 tokens


'福岡は、九州の中心都市です'
Response time: 1.98 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    1608.43 ms /    10 runs   (  160.84 ms per token,     6.22 tokens per second)
llama_perf_context_print:       total time =    1617.98 ms /    11 tokens


'福岡はとても良い場所です'
Response time: 1.62 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    2566.07 ms /    16 runs   (  160.38 ms per token,     6.24 tokens per second)
llama_perf_context_print:       total time =    2581.67 ms /    17 tokens


'答え: はい、福岡はとても良い場所です'
Response time: 2.58 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    2419.49 ms /    13 runs   (  186.11 ms per token,     5.37 tokens per second)
llama_perf_context_print:       total time =    2432.03 ms /    14 tokens


'福岡はとても住みやすい都市です'
Response time: 2.44 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    2079.98 ms /    10 runs   (  208.00 ms per token,     4.81 tokens per second)
llama_perf_context_print:       total time =    2091.51 ms /    11 tokens


'福岡はとても良い場所です'
Response time: 2.10 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    1678.13 ms /    10 runs   (  167.81 ms per token,     5.96 tokens per second)
llama_perf_context_print:       total time =    1687.78 ms /    11 tokens


'福岡はとても良い場所です'
Response time: 1.69 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    5226.64 ms /    32 runs   (  163.33 ms per token,     6.12 tokens per second)
llama_perf_context_print:       total time =    5260.78 ms /    33 tokens


'福岡は、美味しい食事と、温泉、ビーチなど、色々な楽しみ方ができる良い場所です'
Response time: 5.26 seconds


Llama.generate: 36 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =   10157.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    1582.88 ms /    10 runs   (  158.29 ms per token,     6.32 tokens per second)
llama_perf_context_print:       total time =    1594.02 ms /    11 tokens


'福岡はとても良い場所です'
Response time: 1.60 seconds
