In [None]:
import sys
import logging

import datasets
from datasets import load_dataset
import torch
import transformers
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
import os
import json
import wandb


In [None]:
#!pip install -U torch

In [None]:
# !pip install datasets transformers trl wandb

In [None]:
# Load the "glaive_func_calling" configuration of the
# NousResearch/hermes-function-calling-v1 dataset; later cells read ds['train'].
ds = load_dataset("NousResearch/hermes-function-calling-v1", "glaive_func_calling")

In [None]:
ds

In [None]:
from ast import literal_eval


def _parse_tools(raw_tools):
    """Decode one raw ``tools`` cell into Python objects.

    Returns ``None`` when the cell is missing (``None``) or the literal
    string ``'null'``; otherwise returns the decoded value.

    The text is parsed as JSON first. The previous approach rewrote every
    occurrence of ``true``/``false``/``null`` in the whole string before
    calling ``literal_eval``, which also altered those substrings when they
    appeared inside descriptions or names. That heuristic is kept only as a
    fallback for cells that are not valid JSON.
    """
    if raw_tools is None or raw_tools == 'null':
        return None
    try:
        # assumes the column holds JSON text (the true/false/null tokens the
        # original code patched suggest so) -- TODO confirm
        return json.loads(raw_tools)
    except ValueError:
        patched = raw_tools.replace('true', 'True')
        patched = patched.replace('false', 'False')
        patched = patched.replace('null', 'None')
        return literal_eval(patched)


# One entry per training row, aligned by position with ds['train'].
tools_list = [_parse_tools(raw_tools) for raw_tools in ds['train']['tools']]
        


In [None]:
tools_list[123]

In [None]:
pydantic_schema = """{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}"""
from tqdm import tqdm


def _build_system_prompt(available_tools):
    """Render the function-calling system prompt for one row.

    ``available_tools`` is interpolated as-is between the <tools> tags, and
    the module-level ``pydantic_schema`` string is interpolated below it.
    ``{{tool_call}}`` is an escaped brace pair, so the rendered text contains
    the literal placeholder ``{tool_call}``.
    """
    return f"""You are a function calling AI model.
You are provided with function signatures within <tools></tools> XML tags.
You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{available_tools}
</tools>
Use the following pydantic model json schema for each tool call you will make: 
{pydantic_schema}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{{tool_call}}
</tool_call>
"""


# Build one prompt per row from the already-parsed tools_list. This avoids
# re-reading ds['train']['tools'] on every iteration.
system_prompts = []
for i in tqdm(range(len(tools_list))):
    available_tools = tools_list[i]
    if available_tools is None:
        # Rows without tools previously reused whatever prompt the preceding
        # row produced (or raised NameError when the first row had no tools).
        # Render the template with an empty tool list instead.
        # NOTE(review): confirm an empty <tools> list is the desired prompt
        # for tool-less conversations.
        available_tools = []
    system_prompt = _build_system_prompt(available_tools)
    system_prompts.append(system_prompt)

In [None]:
# Reference sample of the tool-schema layout, kept as a template for writing
# new tool definitions later.
custom_tools_list = [
    {
        'type': 'function',
        'function': {
            'name': 'turn_on_lights',
            'description': 'Turn the lights on',
            # this tool takes no arguments
            'parameters': {},
        },
    },
    {
        'type': 'function',
        'function': {
            'name': 'get_news',
            'description': 'Get the latest news',
            'parameters': {
                'type': 'object',
                'properties': {
                    'category': {
                        'type': 'string',
                        'description': 'The category of news, e.g. sports, politics',
                    },
                },
                'required': ['category'],
            },
        },
    },
]

In [None]:
# Attach the generated prompts to the training split as a 'system_prompt' column.
# Drop any column left over from a previous run of this cell first, so that
# re-running the cell does not fail on an already-existing column name.
if 'system_prompt' in ds['train'].column_names:
    ds['train'] = ds['train'].remove_columns('system_prompt')
ds['train'] = ds['train'].add_column('system_prompt', system_prompts)


In [None]:
# Show the last training row to check the new column. The index is computed
# here rather than read from a loop variable left behind by another cell.
ds['train'][len(ds['train']) - 1]

In [None]:
# Index the row first, then the field, so only row 1122 is read rather than
# the whole 'conversations' column.
ds['train'][1122]['conversations']

# Some changes to make based on this to match LLM FT paradigm

# from and value should be replaced with role and content (chat format)

# replace human with user, gpt with assistant
# check to see if tool call can be a special token or not

# is the way the assistant follows up on a tool response what we want?

# are the list of tools available fixed for all system prompts? 
# can we possibly enhance and add more, and create synthetic data?

In [None]:

###################
# Tokenizer Loading
###################

# Tokenizer is loaded from the checkpoint named below.
checkpoint_path = "HuggingFaceTB/SmolLM-360M-instruct"
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# Sequence-length cap and padding side.
tokenizer.padding_side = 'right'
tokenizer.model_max_length = 2048

# Padding token: "<|endoftext|>" is a SmolLM-specific choice. The id is set
# explicitly from the same token string.
tokenizer.pad_token = "<|endoftext|>"
tokenizer.pad_token_id = tokenizer.convert_tokens_to_ids("<|endoftext|>")
# https://stackoverflow.com/questions/76446228/setting-padding-token-as-eos-token-when-using-datacollatorforlanguagemodeling-fr


In [None]:
tokenizer.eos_token_id, tokenizer.pad_token_id

In [None]:
tokenizer

In [None]:
##################
# Data Processing
##################
def clean_mesages(messages, updated_system_prompt):
    """Convert ``from``/``value`` turns into ``role``/``content`` chat messages.

    Args:
        messages: iterable of dicts, each with a ``'from'`` and a ``'value'`` key.
        updated_system_prompt: text that replaces the content of every turn
            whose ``'from'`` is ``'system'``.

    Returns:
        A new list of ``{'role': ..., 'content': ...}`` dicts in the same
        order. ``'human'`` becomes ``'user'``, ``'gpt'`` becomes
        ``'assistant'``, and any other role name is kept unchanged. The input
        dicts are not modified.
    """
    role_aliases = {'human': 'user', 'gpt': 'assistant'}

    def _convert(turn):
        source_role = turn['from']
        text = turn['value']
        if source_role == 'system':
            text = updated_system_prompt
        return {'role': role_aliases.get(source_role, source_role), 'content': text}

    return [_convert(turn) for turn in messages]

def apply_chat_template(
    example,
    tokenizer,
):
    """Render one dataset row into a single chat-formatted string.

    Reads ``example["system_prompt"]`` and ``example["conversations"]``,
    converts the turns with ``clean_mesages``, and renders them with
    ``tokenizer.apply_chat_template`` (untokenized, no generation prompt).
    Leading and trailing newlines are stripped, and the result is stored
    under ``example["text"]``.

    Returns:
        The same ``example`` mapping, with the ``"text"`` key set.
    """
    system_text = example["system_prompt"]
    raw_turns = example["conversations"]
    chat_messages = clean_mesages(raw_turns, system_text)

    rendered = tokenizer.apply_chat_template(
        chat_messages,
        tokenize=False,
        add_generation_prompt=False,
    )
    example["text"] = rendered.strip('\n')
    return example



train_dataset = ds['train']
column_names = [*train_dataset.features]

# Add the rendered "text" field to every row (10 worker processes), then
# shuffle with a fixed seed.
processed_train_dataset = (
    train_dataset
    .map(
        apply_chat_template,
        fn_kwargs={"tokenizer": tokenizer},
        num_proc=10,
        desc="Applying chat template to train_sft",
    )
    .shuffle(seed=42)
)

In [None]:
# Row-first indexing reads a single row rather than the whole 'text' column.
print(processed_train_dataset[0]['text'])

In [None]:
# Row-first indexing reads a single row rather than the whole 'text' column.
processed_train_dataset[11]['text']

# Perfect assistant example:

<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'create_todo', 'description': 'Create a new todo item', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the todo item'}, 'description': {'type': 'string', 'description': 'The description of the todo item'}}, 'required': ['title']}}}, {'type': 'function', 'function': {'name': 'create_invoice', 'description': 'Create a new invoice', 'parameters': {'type': 'object', 'properties': {'client': {'type': 'string', 'description': 'The name of the client'}, 'items': {'type': 'array', 'items': {'type': 'object', 'properties': {'description': {'type': 'string', 'description': 'The description of the item'}, 'quantity': {'type': 'integer', 'description': 'The quantity of the item'}, 'price': {'type': 'number', 'description': 'The price of the item'}}, 'required': ['description', 'quantity', 'price']}, 'description': 'The list of items in the invoice'}}, 'required': ['client', 'items']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{tool_call}
</tool_call><|im_end|>
<|im_start|>user
I need to create a new todo item.<|im_end|>
<|im_start|>assistant
Sure, I can help with that. Could you please provide me with the title and description of the todo item?<|im_end|>
<|im_start|>user
The title is "Grocery Shopping" and the description is "Buy fruits, vegetables, and bread".<|im_end|>
<|im_start|>assistant
<tool_call>
{'name': 'create_todo', 'arguments': {'title': 'Grocery Shopping', 'description': 'Buy fruits, vegetables, and bread'}}
</tool_call><|im_end|>
<|im_start|>tool
<tool_response>
{'status': 'success', 'message': "Todo item 'Grocery Shopping' has been created successfully."}
</tool_response><|im_end|>
<|im_start|>assistant
Your todo item "Grocery Shopping" has been created successfully. It includes "Buy fruits, vegetables, and bread".<|im_end|>


This todo-list creation flow is exactly the kind of task you'd want an assistant to help with.

In [None]:
# Now test how this works with the existing model

####################
# Base Model Loading
####################
# Load the model weights from the checkpoint named below.
checkpoint_path = "HuggingFaceTB/SmolLM-360M"
model_kwargs = {
    "use_cache": False,
    # NOTE(review): trust_remote_code allows repository-provided code to run
    # at load time -- confirm it is needed for this checkpoint.
    "trust_remote_code": True,
    # "attn_implementation": "flash_attention_2" is left disabled: it only
    # works on the latest GPUs and is probably not worth it in most cases.
    "torch_dtype": torch.bfloat16,
    "device_map": 'auto',
}
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


In [None]:
user_prompt = "I need to create a new todo item with the title 'Buy groceries'."
tool_calling_system_prompt = """<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'create_todo', 'description': 'Create a new todo item', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the todo item'}, 'description': {'type': 'string', 'description': 'The description of the todo item'}}, 'required': ['title']}}}, {'type': 'function', 'function': {'name': 'create_invoice', 'description': 'Create a new invoice', 'parameters': {'type': 'object', 'properties': {'client': {'type': 'string', 'description': 'The name of the client'}, 'items': {'type': 'array', 'items': {'type': 'object', 'properties': {'description': {'type': 'string', 'description': 'The description of the item'}, 'quantity': {'type': 'integer', 'description': 'The quantity of the item'}, 'price': {'type': 'number', 'description': 'The price of the item'}}, 'required': ['description', 'quantity', 'price']}, 'description': 'The list of items in the invoice'}}, 'required': ['client', 'items']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{tool_call}
</tool_call><|im_end|>"""

# User turn followed by the opening of the assistant turn, in the same
# <|im_start|>/<|im_end|> layout as the system prompt. The user text is
# inserted verbatim. The template previously appended an extra "." after it,
# so a prompt that already ended in punctuation became "..<|im_end|>".
tool_calling_user_prompt = f"""
<|im_start|>user
{user_prompt}<|im_end|>
<|im_start|>assistant
"""

# Full prompt = system block + user turn + open assistant turn.
tool_calling_prompt = tool_calling_system_prompt + tool_calling_user_prompt

print(tool_calling_prompt)

In [None]:
model.eval()

# Tokenize through tokenizer(...) so an attention mask is produced together
# with the ids, and pass both to generate().
encoded = tokenizer(tool_calling_prompt, return_tensors='pt').to(model.device)
input_ids = encoded['input_ids']
output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded['attention_mask'],
    max_new_tokens=256,
    do_sample=False,  # greedy decoding
    pad_token_id=tokenizer.eos_token_id,
)
# pad_token_id is a generation argument, not a decoding option, so it is no
# longer passed to decode().
output_text = tokenizer.decode(output[0], skip_special_tokens=False)
print(output_text)
# Keep only the first three <|im_end|>-terminated segments of the decoded text.
formatted_output_text = "<|im_end|>".join(output_text.split("<|im_end|>")[:3]) + "<|im_end|>"
#print(formatted_output_text)

In [None]:
# Module-level logger; its level is set further down in this cell.
logger = logging.getLogger(__name__)
# Start a Weights & Biases run under the given project name
# (the training config below sets report_to="wandb").
wandb.init(project="smollm-ft-function-calling")
###################
# Hyper-parameters
###################
# Keyword arguments forwarded to TrainingArguments.
# "gradient_accumulation_steps" used to appear twice in this literal (2, then
# 1). In a dict literal the last duplicate wins, so the effective value was
# always 1. It is now listed once with that value.
# NOTE(review): change it here if 2 was the intended setting.
training_config = {
    "do_eval": False,
    "learning_rate": 5.0e-05,
    "per_device_train_batch_size": 4,
    "gradient_accumulation_steps": 1,
    "log_level": "info",
    "logging_steps": 100,
    "logging_strategy": "steps",
    "lr_scheduler_type": "cosine",
    "num_train_epochs": 5,
    "max_steps": -1,
    "output_dir": "./checkpoint_dir",
    "overwrite_output_dir": True,
    "remove_unused_columns": True,
    "save_steps": 500,
    "save_total_limit": 1,
    "seed": 0,
    "gradient_checkpointing": True,
    "gradient_checkpointing_kwargs": {"use_reentrant": False},
    "warmup_ratio": 0.05,
    "report_to": "wandb",
}


train_conf = TrainingArguments(**training_config)

# Logging setup: send records to stdout with a timestamped format, then use
# the level reported by the training arguments for this notebook's logger and
# for the datasets / transformers loggers.
logging.basicConfig(
    handlers=[logging.StreamHandler(sys.stdout)],
    datefmt="%Y-%m-%d %H:%M:%S",
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
)
log_level = train_conf.get_process_log_level()
logger.setLevel(log_level)
for hf_logging in (datasets.utils.logging, transformers.utils.logging):
    hf_logging.set_verbosity(log_level)
transformers.utils.logging.enable_default_handler()
transformers.utils.logging.enable_explicit_format()

logger.info(f"Training/evaluation parameters {train_conf}")


In [None]:
###########
# Training
###########

# Put the model in training mode (an earlier cell switched it to eval mode).
model.train();
# Supervised fine-tuning on the rendered "text" field of the processed dataset.
# NOTE(review): max_seq_length / dataset_text_field / tokenizer are passed
# directly to SFTTrainer here; which keyword names are accepted depends on the
# installed trl version -- confirm against the version in use.
trainer = SFTTrainer(
    model=model,
    args=train_conf,
    train_dataset=processed_train_dataset,
    max_seq_length=2048,
    dataset_text_field="text",
    tokenizer=tokenizer,
    #packing=True,
)
# Run training, then log and persist the metrics and the trainer state.
train_result = trainer.train()
metrics = train_result.metrics
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()


In [None]:
# Load the model from the checkpoint

# find most recently created folder in checkpoint_dir and set as checkpoint path
# Locate the checkpoint-<step> directory with the highest step number.
# A plain lexicographic sort of os.listdir() is not reliable here: the output
# directory also receives files such as trainer_state.json (written by the
# training cell), which sort after "checkpoint-*", and "checkpoint-1000" sorts
# before "checkpoint-500".
from transformers.trainer_utils import get_last_checkpoint

checkpoint_path = get_last_checkpoint(train_conf.output_dir)
if checkpoint_path is None:
    raise FileNotFoundError(
        f"No checkpoint-<step> directory found in {train_conf.output_dir!r}"
    )
model_kwargs = dict(
    use_cache=False,
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map='auto',
)
# Reload the fine-tuned weights from that checkpoint directory.
model = AutoModelForCausalLM.from_pretrained(checkpoint_path, **model_kwargs)


In [None]:
user_prompt = "Who are you?"
tool_calling_system_prompt = """<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'create_todo', 'description': 'Create a new todo item', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the todo item'}, 'description': {'type': 'string', 'description': 'The description of the todo item'}}, 'required': ['title']}}}, {'type': 'function', 'function': {'name': 'create_invoice', 'description': 'Create a new invoice', 'parameters': {'type': 'object', 'properties': {'client': {'type': 'string', 'description': 'The name of the client'}, 'items': {'type': 'array', 'items': {'type': 'object', 'properties': {'description': {'type': 'string', 'description': 'The description of the item'}, 'quantity': {'type': 'integer', 'description': 'The quantity of the item'}, 'price': {'type': 'number', 'description': 'The price of the item'}}, 'required': ['description', 'quantity', 'price']}, 'description': 'The list of items in the invoice'}}, 'required': ['client', 'items']}}}] </tools>Use the following pydantic model json schema for each tool call you will make: {'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{tool_call}
</tool_call><|im_end|>"""

# User turn followed by the opening of the assistant turn, in the same
# <|im_start|>/<|im_end|> layout as the system prompt. The user text is
# inserted verbatim. The template previously appended an extra "." after it,
# turning a question such as "Who are you?" into "Who are you?.".
tool_calling_user_prompt = f"""
<|im_start|>user
{user_prompt}<|im_end|>
<|im_start|>assistant
"""

# Full prompt = system block + user turn + open assistant turn.
tool_calling_prompt = tool_calling_system_prompt + tool_calling_user_prompt

print(tool_calling_prompt)

In [None]:
model.eval()

# Tokenize through tokenizer(...) so an attention mask is produced together
# with the ids, and pass both to generate().
encoded = tokenizer(tool_calling_prompt, return_tensors='pt').to(model.device)
input_ids = encoded['input_ids']
output = model.generate(
    input_ids=input_ids,
    attention_mask=encoded['attention_mask'],
    max_new_tokens=256,
    do_sample=False,  # greedy decoding
    pad_token_id=tokenizer.eos_token_id,
)
# pad_token_id is a generation argument, not a decoding option, so it is no
# longer passed to decode().
output_text = tokenizer.decode(output[0], skip_special_tokens=False)
# Keep only the first three <|im_end|>-terminated segments of the decoded text.
formatted_output_text = "<|im_end|>".join(output_text.split("<|im_end|>")[:3]) + "<|im_end|>"
print(formatted_output_text)