In [1]:
import sys
import os
from llama_cpp import Llama
from llama_cpp.llama_tokenizer import LlamaHFTokenizer
from llama_cpp.llama_chat_format import hf_autotokenizer_to_chat_completion_handler, Jinja2ChatFormatter

# Make the parent of the current working directory importable so that the
# `src.*` imports below resolve. The membership check keeps this cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
PROJECT_ROOT = os.path.dirname(os.path.abspath("./"))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from src.tools import model_tools
from src.tools.base import Tool

from warnings import filterwarnings
from tqdm.notebook import tqdm
# Suppress all Python warnings for the rest of the session to keep cell output
# readable. NOTE(review): this also hides deprecation warnings from the
# libraries used below — consider narrowing the filter.
filterwarnings('ignore')

In [2]:
# Sample single-turn conversation asking for the weather.
messages = [dict(role="user", content="whats the weather in Satara")]

# JSON-schema description of the only argument the weather function accepts.
location_param = {
    "type": "string",
    "description": "The city and state, e.g., San Francisco, CA",
}
weather_parameters = {
    "type": "object",
    "properties": {"location": location_param},
    "required": ["location"],
}
weather_function = {
    "name": "get_current_weather",
    "description": "Get the current weather",
    "parameters": weather_parameters,
}

# For functionary-7b-v2 the schema is passed as "tools" (each entry wraps the
# function under a "function" key); for functionary-7b-v1.4 it is passed as
# "functions" = [{"name": "get_current_weather", "description": ..., "parameters": ...}].
tools = [{"type": "function", "function": weather_function}]


In [3]:
# Alternative (disabled): chat completion handler built directly from a Hugging Face repo's tokenizer
# chat_handler = hf_autotokenizer_to_chat_completion_handler(
#     "meetkai/functionary-small-llama-3.1-GGUF"
# )

In [4]:
from transformers import AutoTokenizer
# Load the Hugging Face tokenizer that ships with the functionary GGUF repo.
tokenizer = AutoTokenizer.from_pretrained("meetkai/functionary-small-v2.4-GGUF")

# Reuse the tokenizer's own Jinja2 chat template (plus its BOS/EOS tokens) to
# build a llama.cpp chat completion handler.
template_formatter = Jinja2ChatFormatter(
    template=tokenizer.chat_template,
    eos_token=tokenizer.eos_token,
    bos_token=tokenizer.bos_token,
)
chat_handler = template_formatter.to_chat_handler()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# Print the raw Jinja2 chat template bundled with the tokenizer; print() is
# used so the embedded newlines render as line breaks.
print(tokenizer.chat_template)

{# version=v2 #}{%- macro append_new_param_info(param_declaration, comment_info, examples_info, depth) -%}
    {%- set offset = "" -%}
    {%- if depth >= 1 -%}
        {%- set offset = "    " * depth -%}
    {%- endif -%}
    {%- if comment_info != "<|NONE|>" -%}
        {{ "\n" + offset + comment_info }}
        {%- if examples_info | length > 0 -%}
            {# Append each example info #}
            {%- for example in examples_info -%}
                {{ "\n" + offset + "// " + example|string|replace("'", '"') }}
            {%- endfor -%}
        {%- endif -%}
    {%- endif -%}
    {{ "\n" + offset + param_declaration }}
{%- endmacro -%}

{%- macro convert_data_type(param_type) -%}
    {%- if param_type == "integer" or param_type == "float" -%}
        {{ "number" }}
    {%- else -%}
        {{ param_type }}
    {%- endif -%}
{%- endmacro -%}

{%- macro get_param_type(param) -%}
    {%- set param_type = "any" -%}

    {%- if "type" in param -%}
        {%- set raw_param_type = p

In [6]:
# Display the handler object to inspect the keyword arguments it accepts.
chat_handler

<function llama_cpp.llama_chat_format.chat_formatter_to_chat_completion_handler.<locals>.chat_completion_handler(*, llama: 'llama.Llama', messages: 'List[llama_types.ChatCompletionRequestMessage]', functions: 'Optional[List[llama_types.ChatCompletionFunction]]' = None, function_call: 'Optional[llama_types.ChatCompletionRequestFunctionCall]' = None, tools: 'Optional[List[llama_types.ChatCompletionTool]]' = None, tool_choice: 'Optional[llama_types.ChatCompletionToolChoiceOption]' = None, temperature: 'float' = 0.2, top_p: 'float' = 0.95, top_k: 'int' = 40, min_p: 'float' = 0.05, typical_p: 'float' = 1.0, stream: 'bool' = False, stop: 'Optional[Union[str, List[str]]]' = [], seed: 'Optional[int]' = None, response_format: 'Optional[llama_types.ChatCompletionRequestResponseFormat]' = None, max_tokens: 'Optional[int]' = None, presence_penalty: 'float' = 0.0, frequency_penalty: 'float' = 0.0, repeat_penalty: 'float' = 1.1, tfs_z: 'float' = 1.0, mirostat_mode: 'int' = 0, mirostat_tau: 'float' =

In [7]:
# Fetch the quantised functionary-small-v2.4 GGUF file from the Hugging Face
# Hub (or reuse the local cache) and load it through llama.cpp.
model = Llama.from_pretrained(
    repo_id="meetkai/functionary-small-v2.4-GGUF",
    filename="functionary-small-v2.4.Q4_0.gguf",
    # Alternative checkpoint (disabled):
    # repo_id="meetkai/functionary-small-llama-3.1-GGUF",
    # filename="functionary-small-llama-3.1.Q4_0.gguf",
    # Built-in chat format (disabled) instead of the template-based handler:
    # chat_format="functionary-v2",
    chat_handler=chat_handler,
    tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.4-GGUF"),
    # `response_format` was removed from this call: it is a per-request option
    # of create_chat_completion (e.g. response_format={"type": "json_object"}),
    # not a constructor option. Here it was swallowed by **kwargs and ignored.
    n_gpu_layers=-1,  # -1 offloads every layer to the GPU
    n_ctx=4096,  # context window used for this session
    verbose=True,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
llama_model_loader: loaded meta data with 25 key-value pairs and 291 tensors from /home/ostrich/.cache/huggingface/hub/models--meetkai--functionary-small-v2.4-GGUF/snapshots/014f6b16865981c97b5c8f8d763e96960ed11371/./functionary-small-v2.4.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 32004
llama_model_loader: - kv   3:                       llama.context_length u32              = 32768
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - 

In [8]:
# Wrap every callable exported through model_tools.__all__ in a Tool object.
tool_list = [
    Tool.from_function(function=getattr(model_tools, tool_name))
    for tool_name in model_tools.__all__
]

# Convert each wrapped tool into the OpenAI-style schema passed as `tools=`.
openai_tools = [wrapped_tool.to_openai_tool() for wrapped_tool in tool_list]

In [9]:
# Inspect the OpenAI-style tool schemas generated from the wrapped tools.
openai_tools

[{'type': 'function',
  'function': {'name': 'multiply',
   'description': 'multiply(a: int, b: int) -> int\nMultiplies two integers and returns the result integer.\n\n    Args:\n        a (int): The first integer.\n        b (int): The second integer.\n\n    Returns:\n        int: The product of a and b.\n    ',
   'parameters': {'properties': {'a': {'title': 'A', 'type': 'integer'},
     'b': {'title': 'B', 'type': 'integer'}},
    'required': ['a', 'b'],
    'type': 'object'}}},
 {'type': 'function',
  'function': {'name': 'add',
   'description': 'add(a: int, b: int) -> int\nAdd two integers and returns the result integer.\n\n    Args:\n        a (int): The first integer.\n        b (int): The second integer.\n\n    Returns:\n        int: The sum of a and b\n    ',
   'parameters': {'properties': {'a': {'title': 'A', 'type': 'integer'},
     'b': {'title': 'B', 'type': 'integer'}},
    'required': ['a', 'b'],
    'type': 'object'}}},
 {'type': 'function',
  'function': {'name': 'su

In [10]:
# Single-turn arithmetic question used to see whether the model picks a tool.
# NOTE(review): the prompt strings contain typos ("helpul", "What  1+ 1").
# They are kept verbatim because editing them changes what the model receives.
arithmetic_messages = [
    {"role": "system", "content": "You are a helpul assistant"},
    {"role": "user", "content": "What  1+ 1"},
    # To replay a finished tool round-trip, enable the two messages below:
    # {
    #     "role": "assistant",
    #     "content": "",
    #     "tool_calls": [
    #         {
    #             "type": "function",
    #             "function": {"name": "add", "arguments": '{"a": 1, "b": 1}'},
    #         }
    #     ],
    # },
    # {"tool_call_id": "abdc", "role": "tool", "name": "add", "content": "2"}
]

# Let the model decide on its own ("auto") whether to call one of the tools.
response = model.create_chat_completion(
    messages=arithmetic_messages,
    tools=openai_tools,
    tool_choice="auto",
    max_tokens=1024,
    stop=["<|stop|>"],
    # temperature=0.01
)

# Open question: does this setup work for both plain chat and tool use?



llama_print_timings:        load time =      97.57 ms
llama_print_timings:      sample time =       0.30 ms /    16 runs   (    0.02 ms per token, 54054.05 tokens per second)
llama_print_timings: prompt eval time =      97.36 ms /   480 tokens (    0.20 ms per token,  4930.41 tokens per second)
llama_print_timings:        eval time =     126.15 ms /    15 runs   (    8.41 ms per token,   118.90 tokens per second)
llama_print_timings:       total time =     256.85 ms /   495 tokens


In [11]:
# Inspect the returned choices.
# NOTE(review): in the saved output the tool call arrives as raw text inside
# message['content'] and there is no 'tool_calls' entry.
response['choices']

[{'index': 0,
  'message': {'role': 'assistant',
   'content': ' add\n<|content|> {"a": 1, "b": 1}'},
  'logprobs': None,
  'finish_reason': 'stop'}]

In [None]:
from tokenizers import Tokenizer
from transformers import PreTrainedTokenizerBase

# Load the llama.cpp tokenizer wrapper for the functionary repo.
# NOTE(review): this rebinds `tokenizer`, which an earlier cell bound to a
# transformers AutoTokenizer instance.
hf_repo_id = "meetkai/functionary-small-v2.4-GGUF"
tokenizer = LlamaHFTokenizer.from_pretrained(hf_repo_id)

# Check that the wrapped object is a transformers tokenizer; the boolean is
# the cell's displayed result.
wrapped_tokenizer = tokenizer.hf_tokenizer
isinstance(wrapped_tokenizer, PreTrainedTokenizerBase)


In [None]:
# Run one sample prompt through both tokenizer entry points and print each result.
sample_prompt = "This is your prompt"
print(tokenizer.hf_tokenizer.encode(sample_prompt))    # encoded form
print(tokenizer.hf_tokenizer.tokenize(sample_prompt))  # tokenization

In [None]:
# Display the tokenizer object wrapped by the LlamaHFTokenizer instance.
tokenizer.hf_tokenizer

## Test our model

In [4]:
import sys
import os

# Make the parent of the current working directory importable so that
# `src.models` resolves. The membership check keeps this cell idempotent:
# re-running it no longer appends a duplicate entry to sys.path.
PROJECT_ROOT = os.path.dirname(os.path.abspath("./"))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)
from src.models.llamacpp_model import LlamaCPPModel

# Reference only (left commented out): keyword arguments for loading the same
# checkpoint through llama_cpp.Llama directly.
#     repo_id="meetkai/functionary-small-v2.4-GGUF",
#     filename="functionary-small-v2.4.Q4_0.gguf",
#     chat_format="functionary-v2",
#     tokenizer=LlamaHFTokenizer.from_pretrained("meetkai/functionary-small-v2.4-GGUF"),
#     n_gpu_layers=-1,
#     n_threads=16,
#     n_ctx=4096,
#     verbose=False,

# Settings for the project's LlamaCPPModel wrapper, collected in one place so
# they can be inspected or tweaked before the model is built.
llama_cpp_settings = dict(
    model="meetkai/functionary-small-v2.4-GGUF",
    model_name="functionary-small-v2.4.Q4_0.gguf",
    chat_format="functionary-v2",
    max_new_tokens=1024,
    context_window=4096,
    temperature=0.01,
    system_prompt=None,
    stop=None,
    verbose=True,
    is_tool_use_model=True,
)
model = LlamaCPPModel(**llama_cpp_settings)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
llama_model_loader: loaded meta data with 25 key-value pairs and 291 tensors from /home/ostrich/.cache/huggingface/hub/models--meetkai--functionary-small-v2.4-GGUF/snapshots/014f6b16865981c97b5c8f8d763e96960ed11371/./functionary-small-v2.4.Q4_0.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = .
llama_model_loader: - kv   2:                           llama.vocab_size u32              = 32004
llama_model_loader: - kv   3:                       llama.context_length u32              = 32768
llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096
llama_model_loader: - 

In [5]:
# Ask the wrapper model a simple arithmetic question with the tools available.
# NOTE(review): the saved output below logs a different input
# ('Who is the Prime Minister of India?') and ends in
# KeyError: 'assistant\n<|recipient|> all', so it looks stale — re-run on a
# fresh kernel. `tool_list` is not defined in this section; presumably it comes
# from the earlier tool-building cell — confirm it is run first.
response = model.chat("What is 1 + 1", tools=tool_list)


llama_print_timings:        load time =      98.65 ms
llama_print_timings:      sample time =       0.12 ms /     6 runs   (    0.02 ms per token, 49586.78 tokens per second)
llama_print_timings: prompt eval time =     114.68 ms /   568 tokens (    0.20 ms per token,  4953.04 tokens per second)
llama_print_timings:        eval time =      45.59 ms /     5 runs   (    9.12 ms per token,   109.67 tokens per second)
llama_print_timings:       total time =     175.22 ms /   573 tokens


Input: [{'role': 'user', 'content': 'Who is the Prime Minister of India?'}]
Failed to parse function body as JSON schema, falling back to default grammar
'NoneType' object has no attribute 'get'
root ::= object 
object ::= [{] ws object_11 [}] ws 
value ::= object | array | string | number | value_6 ws 
array ::= [[] ws array_15 []] ws 
string ::= ["] string_18 ["] ws 
number ::= number_19 number_39 number_57 ws 
value_6 ::= [t] [r] [u] [e] | [f] [a] [l] [s] [e] | [n] [u] [l] [l] 
ws ::= | [ ] | [<U+000A>] ws_77 
object_8 ::= string [:] ws value object_10 
object_9 ::= [,] ws string [:] ws value 
object_10 ::= object_9 object_10 | 
object_11 ::= object_8 | 
array_12 ::= value array_14 
array_13 ::= [,] ws value 
array_14 ::= array_13 array_14 | 
array_15 ::= array_12 | 
string_16 ::= [^"\<U+0000>-<U+001F>] | [\] string_17 
string_17 ::= ["\bfnrt] | [u] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] 
string_18 ::= string_16 string_18 | 
number_19 ::= number_20 number_21 
number_20 ::=

Llama.generate: 571 prefix-match hit, remaining 4 prompt tokens to eval

llama_print_timings:        load time =      98.65 ms
llama_print_timings:      sample time =      37.06 ms /    21 runs   (    1.76 ms per token,   566.63 tokens per second)
llama_print_timings: prompt eval time =      13.29 ms /     4 tokens (    3.32 ms per token,   301.05 tokens per second)
llama_print_timings:        eval time =     168.54 ms /    20 runs   (    8.43 ms per token,   118.66 tokens per second)
llama_print_timings:       total time =     277.48 ms /    24 tokens
Llama.generate: 575 prefix-match hit, remaining 15 prompt tokens to eval

llama_print_timings:        load time =      98.65 ms
llama_print_timings:      sample time =       0.02 ms /     1 runs   (    0.02 ms per token, 52631.58 tokens per second)
llama_print_timings: prompt eval time =      12.73 ms /    15 tokens (    0.85 ms per token,  1178.50 tokens per second)
llama_print_timings:        eval time =       0.00 ms /     1 runs   ( 

[93m
[MODEL]: {'id': 'chatcmpl-37b73cda-32cd-4020-8b61-a0539a3ddfbe', 'object': 'chat.completion', 'created': 1726723799, 'model': '/home/ostrich/.cache/huggingface/hub/models--meetkai--functionary-small-v2.4-GGUF/snapshots/014f6b16865981c97b5c8f8d763e96960ed11371/./functionary-small-v2.4.Q4_0.gguf', 'choices': [{'index': 0, 'logprobs': None, 'message': {'role': 'assistant', 'content': None, 'tool_calls': [{'id': 'call_xmvFmfTA9ANbxrExx7Gio1ve', 'type': 'function', 'function': {'name': 'assistant\n<|recipient|> all', 'arguments': '{\n "name": "Narendra Modi"\n}'}}]}, 'finish_reason': 'tool_calls'}], 'usage': {'prompt_tokens': 590, 'completion_tokens': 27, 'total_tokens': 591}}
[0m


KeyError: 'assistant\n<|recipient|> all'