In [4]:
import sys
import os
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer

# Put the parent of the current working directory on sys.path so that the
# `src` package imported just below can be resolved. This line has to run
# before the `src` imports.
# NOTE(review): assumes the notebook is started from a folder that sits
# directly under the repository root — confirm.
sys.path.append(os.path.dirname(os.path.abspath("./")))
from src.tools import model_tools
from src.tools.base import Tool

from warnings import filterwarnings

# Blanket filter: with no category given, every warning is ignored.
filterwarnings('ignore')

In [2]:
# Load the Functionary small v2.4 Q4_0 GGUF file through llama-cpp-python,
# pairing it with the Hugging Face tokenizer published in the same repo and
# the "functionary-v2" chat format.
functionary_repo = "meetkai/functionary-small-v2.4-GGUF"
functionary_tokenizer = LlamaHFTokenizer.from_pretrained(functionary_repo)

model = Llama.from_pretrained(
    repo_id=functionary_repo,
    filename="functionary-small-v2.4.Q4_0.gguf",
    chat_format="functionary-v2",
    tokenizer=functionary_tokenizer,
    n_gpu_layers=-1,  # presumably "offload all layers" (llama-cpp-python convention) — confirm
    n_threads=16,
    n_ctx=4096,
    verbose=False,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Wrap every name exported through model_tools.__all__ in a Tool, then
# derive the OpenAI-style schema for each wrapped tool.
tool_list = [
    Tool.from_function(function=getattr(model_tools, exported_name))
    for exported_name in model_tools.__all__
]

openai_tools = [wrapped_tool.to_openai_tool() for wrapped_tool in tool_list]

In [4]:
# Display the OpenAI-style tool schemas generated from model_tools.
openai_tools

[{'type': 'function',
  'function': {'name': 'multiply',
   'description': 'multiply(a: int, b: int) -> int\nMultiplies two integers and returns the result integer.\n\n    Args:\n        a (int): The first integer.\n        b (int): The second integer.\n\n    Returns:\n        int: The product of a and b.\n    ',
   'parameters': {'properties': {'a': {'title': 'A', 'type': 'integer'},
     'b': {'title': 'B', 'type': 'integer'}},
    'required': ['a', 'b'],
    'type': 'object'}}},
 {'type': 'function',
  'function': {'name': 'add',
   'description': 'add(a: int, b: int) -> int\nAdd two integers and returns the result integer.\n\n    Args:\n        a (int): The first integer.\n        b (int): The second integer.\n\n    Returns:\n        int: The sum of a and b\n    ',
   'parameters': {'properties': {'a': {'title': 'A', 'type': 'integer'},
     'b': {'title': 'B', 'type': 'integer'}},
    'required': ['a', 'b'],
    'type': 'object'}}},
 {'type': 'function',
  'function': {'name': 'su

In [5]:
# Single-turn request: only the user question is sent, the tool schemas are
# attached, and tool_choice="auto" leaves the decision to call a tool to the
# model.
#
# Reference template (not sent): a system prompt, plus the assistant/tool
# messages that would carry a tool call and its result back to the model.
#     {"role": "system", "content": "You are a helpful assistant"},
#     {
#         "role": "assistant",
#         "content": "",
#         "tool_calls": [
#             {
#                 "type": "function",
#                 "function": {"name": "add", "arguments": '{"a": 1, "b": 1}'},
#             }
#         ],
#     },
#     {"tool_call_id": "abdc", "role": "tool", "name": "add", "content": "2"}
chat_messages = [
    {"role": "user", "content": "What is 1 + 1"},
]

response = model.create_chat_completion(
    messages=chat_messages,
    tools=openai_tools,
    tool_choice="auto",
    max_tokens=2048,
)

# Open question: does this call work for both plain chat and tool use?

In [7]:
# Inspect the returned choices.
# NOTE(review): in the recorded output the tool call's function name is
# 'assistant\n<|recipient|> add' rather than a bare 'add', so the name would
# presumably need cleaning before it can be matched against a tool — confirm.
response['choices']

[{'index': 0,
  'logprobs': None,
  'message': {'role': 'assistant',
   'content': None,
   'tool_calls': [{'id': 'call_dPOpF6F9AK1EWHC2KG5L0dC4',
     'type': 'function',
     'function': {'name': 'assistant\n<|recipient|> add',
      'arguments': '{"a": 1, "b": 1}'}}]},
  'finish_reason': 'tool_calls'}]

In [8]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerBase

# Load the repo's tokenizer on its own, then check that the object it wraps
# is a regular `transformers` tokenizer (the cell's displayed value).
tokenizer = LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.4-GGUF")

isinstance(tokenizer.hf_tokenizer, PreTrainedTokenizerBase)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


True

In [12]:
# Two views of the same text: encode() prints token ids, tokenize() prints
# the token strings. In the recorded output encode() yields one extra
# leading id (1, which the tokenizer repr lists as "<s>").
sample_prompt = "This is your prompt"
hf_tok = tokenizer.hf_tokenizer
print(hf_tok.encode(sample_prompt))
print(hf_tok.tokenize(sample_prompt))

[1, 851, 349, 574, 11510]
['▁This', '▁is', '▁your', '▁prompt']


In [14]:
# Display the wrapped Hugging Face tokenizer's repr (vocab size, special
# tokens and the added-token table).
tokenizer.hf_tokenizer

LlamaTokenizerFast(name_or_path='meetkai/functionary-small-v2.4-GGUF', vocab_size=32000, model_max_length=8192, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '</s>', 'additional_special_tokens': ['<|from|>', '<|recipient|>', '<|content|>', '<|stop|>']}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32000: AddedToken("<|from|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|recipient|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|content|>", rstrip=Fa

## Test our `LlamaCPPModel` wrapper

In [1]:
import sys
import os

# Put the parent of the current working directory on sys.path so that the
# `src` package imported on the next line can be resolved.
sys.path.append(os.path.dirname(os.path.abspath("./")))
from src.models.llamacpp_model import LlamaCPPModel

# Load the same Functionary checkpoint, this time through the project's
# LlamaCPPModel wrapper. For comparison, the direct llama_cpp call made
# earlier in this notebook passed:
#     repo_id="meetkai/functionary-small-v2.4-GGUF",
#     filename="functionary-small-v2.4.Q4_0.gguf",
#     chat_format="functionary-v2",
#     tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.4-GGUF"),
#     n_gpu_layers=-1,
#     n_threads=16,
#     n_ctx=4096,
#     verbose=False,
# NOTE(review): `model` / `model_name` presumably correspond to
# `repo_id` / `filename` above (the values match) — confirm in LlamaCPPModel.
wrapper_settings = {
    "model": "meetkai/functionary-small-v2.4-GGUF",
    "model_name": "functionary-small-v2.4.Q4_0.gguf",
    "chat_format": "functionary-v2",
    "max_new_tokens": 1024,
    "context_window": 4096,
    "temperature": 0.01,
    "system_prompt": None,
    "verbose": True,  # presumably the source of the loader/timing logs in the output
}
model = LlamaCPPModel(**wrapper_settings)

  from .autonotebook import tqdm as notebook_tqdm
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
llama_model_loader: loaded meta data with 25 key-value pairs and 291 tensors from /home/ostrich/.cache/huggingface/hub/models--meetkai--functionary-small-v2.4-GGUF/snapshots/014f6b16865981c97b5c8f8d763e96960ed11371/./functionary-small-v2.4.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 32004
llama_model_loader: - kv   3:                       llama.context_length u32              = 32768
llama_model_loader: - kv   4:                     llama.embedding_len

In [7]:
# Ask a general-knowledge question while still passing the tools (the
# schemas visible earlier are arithmetic helpers such as multiply and add).
# NOTE(review): in the recorded run the model answered with a tool call
# named 'assistant\n<|recipient|> all' and the cell ended with
# KeyError: 'assistant\n<|recipient|> all'. The tool-name handling behind
# model.chat is not visible here and needs a follow-up.
user_question = "Who is the Prime Minister of India?"
response = model.chat(user_question, tools=tool_list)

Llama.generate: 556 prefix-match hit, remaining 12 prompt tokens to eval

llama_print_timings:        load time =     113.28 ms
llama_print_timings:      sample time =       0.12 ms /     6 runs   (    0.02 ms per token, 49586.78 tokens per second)
llama_print_timings: prompt eval time =      21.91 ms /    12 tokens (    1.83 ms per token,   547.70 tokens per second)
llama_print_timings:        eval time =      44.13 ms /     5 runs   (    8.83 ms per token,   113.30 tokens per second)
llama_print_timings:       total time =      80.32 ms /    17 tokens


Input: [{'role': 'user', 'content': 'Who is the Prime Minister of India?'}]
Failed to parse function body as JSON schema, falling back to default grammar
'NoneType' object has no attribute 'get'
root ::= object 
object ::= [{] ws object_11 [}] ws 
value ::= object | array | string | number | value_6 ws 
array ::= [[] ws array_15 []] ws 
string ::= ["] string_18 ["] ws 
number ::= number_19 number_39 number_57 ws 
value_6 ::= [t] [r] [u] [e] | [f] [a] [l] [s] [e] | [n] [u] [l] [l] 
ws ::= | [ ] | [<U+000A>] ws_77 
object_8 ::= string [:] ws value object_10 
object_9 ::= [,] ws string [:] ws value 
object_10 ::= object_9 object_10 | 
object_11 ::= object_8 | 
array_12 ::= value array_14 
array_13 ::= [,] ws value 
array_14 ::= array_13 array_14 | 
array_15 ::= array_12 | 
string_16 ::= [^"\<U+0000>-<U+001F>] | [\] string_17 
string_17 ::= ["\bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
string_18 ::= string_16 string_18 | 
number_19 ::= number_20 number_21 
number_20 ::=

Llama.generate: 571 prefix-match hit, remaining 4 prompt tokens to eval

llama_print_timings:        load time =     113.28 ms
llama_print_timings:      sample time =      31.49 ms /    21 runs   (    1.50 ms per token,   666.79 tokens per second)
llama_print_timings: prompt eval time =       9.31 ms /     4 tokens (    2.33 ms per token,   429.60 tokens per second)
llama_print_timings:        eval time =     172.65 ms /    20 runs   (    8.63 ms per token,   115.84 tokens per second)
llama_print_timings:       total time =     272.55 ms /    24 tokens
Llama.generate: 575 prefix-match hit, remaining 15 prompt tokens to eval

llama_print_timings:        load time =     113.28 ms
llama_print_timings:      sample time =       0.02 ms /     1 runs   (    0.02 ms per token, 55555.56 tokens per second)
llama_print_timings: prompt eval time =      12.83 ms /    15 tokens (    0.86 ms per token,  1169.50 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   ( 

[93m
[MODEL]: {'id': 'chatcmpl-e340f49e-6cd2-4886-98bf-9a410f47cd76', 'object': 'chat.completion', 'created': 1726715285, 'model': '/home/ostrich/.cache/huggingface/hub/models--meetkai--functionary-small-v2.4-GGUF/snapshots/014f6b16865981c97b5c8f8d763e96960ed11371/./functionary-small-v2.4.Q4_0.gguf', 'choices': [{'index': 0, 'logprobs': None, 'message': {'role': 'assistant', 'content': None, 'tool_calls': [{'id': 'call_O9YVfiRNgXMaFyyvuSQbpeRr', 'type': 'function', 'function': {'name': 'assistant\n<|recipient|> all', 'arguments': '{\n "name": "Narendra Modi"\n}'}}]}, 'finish_reason': 'tool_calls'}], 'usage': {'prompt_tokens': 590, 'completion_tokens': 27, 'total_tokens': 591}}
[0m


KeyError: 'assistant\n<|recipient|> all'