In [11]:
from unsloth import FastLanguageModel
import pickle
import torch
from datasets import load_dataset, Dataset

In [12]:
# Checkpoint to evaluate: the locally stored fine-tuned function-calling model.
model_name = "../../mani/models/finetuned_models/Llama-3.2-3B-Instruct-finetune-func-v1"

# Loader settings, shared by name with the base-model cell further down.
max_seq_length = 2048
dtype = None          # no explicit dtype requested; the choice is left to the loader
load_in_4bit = False  # load without 4-bit quantisation

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,  # trained model, either a local path or a Hugging Face id
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Last expression of the cell: its return value is echoed as the cell output.
FastLanguageModel.for_inference(model)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A30. Num GPUs = 1. Max memory: 23.498 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )

In [13]:
def create_inference_prompts_from_examples(examples, *, chat_tokenizer=None):
    """Build one chat-templated inference prompt per (query, tools) pair.

    Args:
        examples: Batch mapping with ``"query"`` and ``"tools"`` entries that
            are iterated in parallel (the shape ``Dataset.map(batched=True)``
            passes in).
        chat_tokenizer: Optional object exposing ``apply_chat_template``.
            When omitted, the notebook-level ``tokenizer`` is used, exactly as
            before. Passing it explicitly lets the same function serve another
            model's tokenizer without redefining the function.

    Returns:
        ``{"text": [...]}`` with one rendered prompt string per input pair,
        in input order. An empty batch yields ``{"text": []}``.
    """
    # Resolve lazily so the global is only touched when no override is given.
    active_tokenizer = tokenizer if chat_tokenizer is None else chat_tokenizer

    prompts = []

    for query, tools in zip(examples["query"], examples["tools"]):
        system_message = {
            "role": "system",
            "content": (
                "You are a helpful assistant with access to the following tools or function calls. "
                "Your task is to produce a sequence of tools or function calls necessary to generate a response to the user utterance. "
                "Use the following tools or function calls as required:\n"
                f"{tools}"
            )
        }
        user_message = {
            "role": "user",
            "content": query
        }

        convo = [system_message, user_message]
        # tokenize=False keeps the prompt as text; add_generation_prompt=True
        # appends the assistant header so generation starts at the reply.
        prompt = active_tokenizer.apply_chat_template(
            convo, tokenize=False, add_generation_prompt=True
        )
        prompts.append(prompt)

    return {"text": prompts}

In [14]:
# Read the held-out test records from disk.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load pickles that come from a trusted source.
with open("./data/xlam-function-calling-60k-updated-test_data.pkl", "rb") as pkl_file:
    test_data = pickle.load(pkl_file)

# Wrap the records in a Dataset, then add the chat-templated "text" column.
test_dataset = Dataset.from_list(test_data)
dataset_test = test_dataset.map(
    create_inference_prompts_from_examples,
    batched=True,
)

# Last expression of the cell: show the dataset summary.
dataset_test

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

Dataset({
    features: ['id', 'query', 'answers', 'tools', 'text'],
    num_rows: 600
})

In [15]:
# Spot-check one rendered prompt (row 10), selecting the row first and then its "text" field.
print(dataset_test[10]["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate a response to the user utterance. Use the following tools or function calls as required:
[{"name": "get_mempool_v2", "description": "Retrieve a list of transaction IDs currently in the mempool of the specified blockchain node, representing unconfirmed transactions not yet included in any block.", "parameters": {"blockchain": {"description": "The name of the blockchain.", "type": "str", "default": "bitcoin"}, "pagesize": {"description": "The number of transactions to return per call. Default and maximum is 1000.", "type": "int, optional", "default": 1000}, "page": {"description": "Specifies the page of returned transactions, starting from 1. If out of range, the closest possible page is returned. Default is 1.", "type": "int, optional", "default": 1}}}, {"name": "

In [16]:
# Pull a single example (row 270) out of the mapped test set for a spot check.
# NOTE(review): this rebinds `test_data`, which the data-loading cell used for
# the unpickled records; from here on it is one row, indexed by column name.
test_data = dataset_test[270]

# Show the chat-templated prompt stored in the row's "text" column.
print(test_data["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate a response to the user utterance. Use the following tools or function calls as required:
[{"name": "find_longest_word", "description": "Finds the longest word in a list of words.", "parameters": {"words": {"description": "A list of words.", "type": "List[str]"}}}, {"name": "is_valid_parentheses", "description": "Checks if a string contains valid parentheses.", "parameters": {"s": {"description": "The input string.", "type": "str"}}}, {"name": "get_range", "description": "Helper function to format the range string.", "parameters": {"start": {"description": "The start of the range.", "type": "int"}, "end": {"description": "The end of the range.", "type": "int"}}}]<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the range string for numbers from 10 to 20 

In [17]:
# Tokenize the templated prompt and move it to the model's device.
# add_special_tokens=False: the templated text already begins with
# <|begin_of_text|>, so the tokenizer must not add special tokens again.
inputs = tokenizer(
    test_data["text"],
    return_tensors="pt",
    add_special_tokens=False,
).to(model.device)

# Prompt length in tokens, used to strip the prompt from the generated sequence.
input_len = inputs["input_ids"].size(-1)

with torch.inference_mode():
    # do_sample=False: greedy decoding. Keep only the newly generated tokens
    # of the first (and only) sequence.
    generation = model.generate(**inputs, max_new_tokens=100, do_sample=False)[0][input_len:]

decoded = tokenizer.decode(generation, skip_special_tokens=True)
print(decoded)

[{"name": "get_range", "arguments": {"start": 10, "end": 20}}, {"name": "get_range", "arguments": {"start": 30, "end": 40}}]


In [18]:
# Echo the selected example's raw user query, for comparison with the generation above.
print("QUERY is :", test_data["query"])

QUERY is : What is the range string for numbers from 10 to 20 and from 30 to 40?


In [19]:
# Print the example's `answers` field (presumably the dataset's reference
# tool calls; verify against the data source) next to the model output.
print("Original Output :", test_data["answers"])

Original Output : [{"name": "get_range", "arguments": {"start": 10, "end": 20}}, {"name": "get_range", "arguments": {"start": 30, "end": 40}}]


### Without Fine-tuning

In [20]:
# Baseline for comparison: the base instruct checkpoint, loaded with the same
# settings as the fine-tuned model.
model_name = "../models/text/Llama-3.2-3B-Instruct/"

max_seq_length = 2048
dtype = None          # no explicit dtype requested; the choice is left to the loader
load_in_4bit = False  # load without 4-bit quantisation

# Bound to wf_* names so the fine-tuned `model` / `tokenizer` stay available.
wf_model, wf_tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,  # trained model, either a local path or a Hugging Face id
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

# Last expression of the cell: its return value is echoed as the cell output.
FastLanguageModel.for_inference(wf_model)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A30. Num GPUs = 1. Max memory: 23.498 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 3072, padding_idx=128004)
    (layers): ModuleList(
      (0-27): 28 x LlamaDecoderLayer(
        (self_attn): LlamaAttention(
          (q_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (k_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (v_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (up_proj): Linear(in_features=3072, out_features=8192, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((3072,), eps=1e-05)
      )

In [21]:
def create_inference_prompts_from_examples(examples, *, chat_tokenizer=None):
    """Render chat-templated inference prompts for a batch of examples.

    NOTE(review): this redefinition replaces the earlier function of the same
    name. In the cells visible here `dataset_test` is not re-mapped after this
    point, so the prompts used below still come from the earlier mapping.

    Args:
        examples: Batch mapping with ``"query"`` and ``"tools"`` entries that
            are iterated in parallel.
        chat_tokenizer: Optional object exposing ``apply_chat_template``.
            Defaults to the notebook-level ``wf_tokenizer`` (the base model's
            tokenizer), preserving the previous behaviour.

    Returns:
        ``{"text": [...]}`` with one rendered prompt string per input pair,
        in input order. An empty batch yields ``{"text": []}``.
    """
    # Resolve lazily so the global is only touched when no override is given.
    active_tokenizer = wf_tokenizer if chat_tokenizer is None else chat_tokenizer

    # Fixed system instructions; each example's tool list is appended to them.
    instructions = (
        "You are a helpful assistant with access to the following tools or function calls. "
        "Your task is to produce a sequence of tools or function calls necessary to generate a response to the user utterance. "
        "Use the following tools or function calls as required:\n"
    )

    prompts = [
        active_tokenizer.apply_chat_template(
            [
                {"role": "system", "content": f"{instructions}{tools}"},
                {"role": "user", "content": query},
            ],
            tokenize=False,              # keep the prompt as text
            add_generation_prompt=True,  # end with the assistant header
        )
        for query, tools in zip(examples["query"], examples["tools"])
    ]

    return {"text": prompts}

In [22]:
# Re-select the same example (row 270) for the base-model run. The prompt text
# is read from `dataset_test` as it was mapped earlier in the notebook.
test_data = dataset_test[270]

# Show the chat-templated prompt stored in the row's "text" column.
print(test_data["text"])

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are a helpful assistant with access to the following tools or function calls. Your task is to produce a sequence of tools or function calls necessary to generate a response to the user utterance. Use the following tools or function calls as required:
[{"name": "find_longest_word", "description": "Finds the longest word in a list of words.", "parameters": {"words": {"description": "A list of words.", "type": "List[str]"}}}, {"name": "is_valid_parentheses", "description": "Checks if a string contains valid parentheses.", "parameters": {"s": {"description": "The input string.", "type": "str"}}}, {"name": "get_range", "description": "Helper function to format the range string.", "parameters": {"start": {"description": "The start of the range.", "type": "int"}, "end": {"description": "The end of the range.", "type": "int"}}}]<|eot_id|><|start_header_id|>user<|end_header_id|>

What is the range string for numbers from 10 to 20 

In [23]:
# Tokenize the templated prompt with the base model's tokenizer and move it to
# that model's device. add_special_tokens=False: the templated text already
# begins with <|begin_of_text|>.
inputs = wf_tokenizer(
    test_data["text"],
    return_tensors="pt",
    add_special_tokens=False,
).to(wf_model.device)

# Prompt length in tokens, used to strip the prompt from the generated sequence.
input_len = inputs["input_ids"].size(-1)

with torch.inference_mode():
    # do_sample=False: greedy decoding. Keep only the newly generated tokens
    # of the first (and only) sequence.
    generation = wf_model.generate(**inputs, max_new_tokens=200, do_sample=False)[0][input_len:]

decoded = wf_tokenizer.decode(generation, skip_special_tokens=True)
print(decoded)

To generate the range string for numbers from 10 to 20 and from 30 to 40, you can use the following function call:

```
get_range(start=10, end=20)
get_range(start=30, end=40)
```

This will return the range string for each of the two intervals.
