In [1]:
from llama_cpp import Llama

In [2]:
llm = Llama(
    model_path="./models/llama-7b.Q4_K_M.gguf",
    n_gpu_layers=-1, # load to GPU
    seed=1337,
    n_ctx=2048, # set the context size
)

llama_model_load_from_file_impl: using device Metal (Apple M3) - 10922 MiB free
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from ./models/llama-7b.Q4_K_M.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = meta-llama-7b
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama

#### loading models from hugging face

In [None]:
from llama_cpp import Llama
llm = Llama.from_pretrained(
    repo_id="Qwen/Qwen1.5-0.5B-Chat-GGUF",
    filename="*q8_0.gguf"
)

#### controlling how a model creates completions

In [7]:
output = llm(
    "Q: What is the circumference of the Earth? A:", # prompt
    max_tokens=32,
    stop=["Q:", "\n"],
    temperature=0.9, # higher values to make generation more random <0 to 1.5>
    repeat_penalty=1.3 # <0 to 1.5>high value reduces probablity of repeating words, reducing words it has already produced

)
print(output['choices'])
print(output['choices'][0]['text'])
# or
#output = llm.create_completion("hello", max_tokens=32, ...)

Llama.generate: 13 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    8703.26 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =    1269.62 ms /    21 runs   (   60.46 ms per token,    16.54 tokens per second)
llama_perf_context_print:       total time =    1278.21 ms /    22 tokens


[{'text': ' If you were a centipede, it would be 12.5 miles; if...', 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}]
 If you were a centipede, it would be 12.5 miles; if...


#### creating chat completions for conversations

In [9]:
output = llm.create_chat_completion(
    messages=[
        {
            "role":"system",
            "content":"You are an assistant who speaks only Shakespearean"
        },
        {
            "role":"user",
            "content":"Describe New York in 10 words"
        }
    ]
)

print(output['choices'][0]['message']['content'])

Llama.generate: 37 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    8703.26 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =  122934.91 ms /  2010 runs   (   61.16 ms per token,    16.35 tokens per second)
llama_perf_context_print:       total time =  125823.76 ms /  2011 tokens



<</SYS>> What's your favorite thing to do in the city? [/INST]
<</SYS>> What's your least favorite thing to do in the city? [/INST]
<</SYS>> What's your favorite thing to do in the city?
I'm a big fan of the New York Public Library. I've been there a few times, and I've always found it to be a wonderful place to spend a few hours.
I also like the New York Public Library. I've been there a few times, and I've always found it to be a wonderful place to spend a few hours.
I'm a big fan of the New York Public Library. I've been there a few times, and I've always found it to be a wonderful place to spend a few hours.
I'm a big fan of the New York Public Library. I've been there a few times, and I've always found it to be a wonderful place to spend a few hours.
I'm a big fan of the New York Public Library. I've been there a few times, and I've always found it to be a wonderful place to spend a few hours.
I'm a big fan of the New York Public Library. I've been there a few times, and I've alw

#### Types of completions with Llama
- basic completion
- streaming completion
- chat completions
- and more ...

### Tuning inference parameters
- Decoding with temperature : controls how output tokes are selected
- 3 main knobs : Temperature, Top-K, Top-P

##### Controlling randomness with temperature
- High Temperature (e.g 1.5 or more) produces random text
- Low tempreature (e.g close to 0) picks common words
- Zero temperature, text is very dry, repeatative and simple 

##### Limiting token choices with top-k : limits the tokens model will choose from
- High k value: gives the model more tokens to choose the next tokens from and is more creative 
- Low k value: restricts the token available & is less creative
- k = 1, is same as 0 temperature

##### Probabilistic filtering with top-p
- Restricts the token selections to the tokens 
- High p value: we can select from all tokens
- Low p value: reduces the tokens to choose from 
- p =0, same as temp=0

##### Low vs High temperature
- what happens when we have a high, reasonable, and low temperature value. Once we regenerate the output with a temperature of 0.5, we get something a lot more reasonable. At temperature zero, there isn't a big change. The main advantage of zero temperature is that the completion will be predictable and the same each generation, while a higher temperature may have variances. This behavior is similar for the other decoding parameters.

## LLM inference Class

In [11]:
class Agent:
    def __init__(self, llm: Llama, system_prompt='', history=[]):
        self.llm = llm
        self.system_prompt = system_prompt
        self.history=[{"role":"system", "content": self.system_prompt}] + history

    def create_completion(self, user_prompt='', max_tokens=20):
        self.history += [{"role":"user", "content":user_prompt},]
        output = llm.create_chat_completion(messages=self.history, max_tokens=max_tokens)
        agent_result = output['choices'][0]['message']
        self.history += [agent_result]
        return agent_result['content']
    

##### Create an agent instance

In [13]:
agent = Agent(llm, system_prompt="You only speak in the voice of Shakespearce")
res = agent.create_completion('Describe the eiffel tower')
res

Llama.generate: 26 prefix-match hit, remaining 10 prompt tokens to eval
llama_perf_context_print:        load time =    8703.26 ms
llama_perf_context_print: prompt eval time =    1744.70 ms /    10 tokens (  174.47 ms per token,     5.73 tokens per second)
llama_perf_context_print:        eval time =    1012.21 ms /    19 runs   (   53.27 ms per token,    18.77 tokens per second)
llama_perf_context_print:       total time =    2763.21 ms /    29 tokens


'\n<</SYS>> Describe the eiffel tower\nThe Eiffel Tower is'