# Training TinyLlama for Tool-calling

In [None]:
# Build requirements.txt line by line (the first echo truncates the file, the rest append),
# then install it with %pip. The three exact pins (datasets, fsspec, gcsfs) each carry a
# link to the upstream issue they relate to.
!echo "accelerate>=0.34.2" > requirements.txt
!echo "bitsandbytes>=0.44.0" >> requirements.txt
!echo "datasets==2.16.0" >> requirements.txt # https://github.com/huggingface/datasets/issues/6753
!echo "evaluate>=0.4.3" >> requirements.txt
!echo "fsspec==2023.10.0" >> requirements.txt # https://github.com/huggingface/datasets/issues/6753
!echo "gcsfs==2023.10.0" >> requirements.txt # https://github.com/huggingface/datasets/issues/6753
!echo "ipykernel>=6.29.5" >> requirements.txt
!echo "ipywidgets>=8.1.5" >> requirements.txt
!echo "jupyter>=1.1.1" >> requirements.txt
!echo "mlflow>=2.16.2" >> requirements.txt
!echo "openai>=1.50.2" >> requirements.txt
!echo "pandas>=2.2.3" >> requirements.txt
!echo "peft>=0.13.0" >> requirements.txt
!echo "scipy>=1.14.1" >> requirements.txt
!echo "torch>=2.4.1" >> requirements.txt
!echo "transformers>=4.45.1" >> requirements.txt
!echo "wheel>=0.44.0" >> requirements.txt

%pip install -r requirements.txt

In [None]:
# Print the version of `datasets` that this kernel imported (requirements.txt pins 2.16.0).
import datasets
print(datasets.__version__)

In [None]:
import gc
import os
import warnings

import pandas as pd
import torch
from datasets import Dataset, DatasetDict, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
)

In [None]:
# Hugging Face access token taken from the environment; None when HF_TOKEN is not set.
HF_TOKEN = os.environ.get("HF_TOKEN", None)
# Seed reused for the torch RNG setup and for both dataset splits.
SEED = 42

In [None]:
# CUDA caching-allocator setting for this process.
# NOTE(review): this runs after `import torch`; presumably the allocator reads the
# variable lazily on first CUDA use — confirm against the PyTorch CUDA memory notes.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Alternative settings kept for reference:
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:128'
# os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True,max_split_size_mb:128'

# Blanket filter: every warning category is suppressed from here on, including
# deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

def fix_torch_seed(seed=SEED):
    """Seed torch's default and CUDA generators and switch cuDNN to deterministic mode.

    Args:
        seed (int): Value handed to both seeding calls; defaults to the notebook's SEED.
    """
    # cuDNN flags: benchmarking off, deterministic on.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed):
        seed_fn(seed)


# Apply the default seed once for the whole notebook.
fix_torch_seed()

In [None]:
# Hand-written example thread in OpenAI-style chat format: one user question, an
# assistant turn issuing two tool calls, one tool result per call (matched by
# tool_call_id), and the final assistant answer. "tools" holds the schema offered to
# the model; note that each call's "arguments" value is a JSON-encoded string.
chat_threads = [
    {
        "documents": [],
        "messages": [
            {"role": "user", "content": "What's the weather like in San Francisco and New York?"},
            {"role": "assistant", "tool_calls": [
                {"id": "call_sf", "type": "function", "function": {"name": "get_current_weather", "arguments": '{"location": "San Francisco, USA", "format": "celsius"}'}},
                {"id": "call_ny", "type": "function", "function": {"name": "get_current_weather", "arguments": '{"location": "New York, USA", "format": "celsius"}'}}
            ]},
            {"role": "tool", "name": "get_current_weather", "tool_call_id": "call_sf", "content": "21.0"},
            {"role": "tool", "name": "get_current_weather", "tool_call_id": "call_ny", "content": "18.5"},
            {"role": "assistant", "content": "The current temperature in San Francisco is 21°C (70°F), while in New York it's 18.5°C (65°F)."}
        ],
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_current_weather",
                    "description": "Get the current weather",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and country, eg. San Francisco, USA"
                            },
                            "format": {"type": "string", "enum": ["celsius", "fahrenheit"]}
                        },
                        "required": ["location", "format"]
                    }
                }
            }
        ]
    },
]

# Download the drone tool-calling examples (JSON Lines) and keep only the two columns
# shared with the hand-written thread.
drone_training_dataset_df = pd.read_json(
    "https://raw.githubusercontent.com/openai/openai-cookbook/main/examples/data/drone_training.jsonl",
    lines=True,
)[['messages', 'tools']]

# Stack the hand-written thread on top of the drone rows. The drone frame has no
# 'documents' column, so those rows carry a missing value there after the concat.
chat_threads_df = pd.concat(
    [pd.DataFrame(chat_threads), drone_training_dataset_df], # TODO: https://github.com/ShishirPatil/gorilla/blob/main/berkeley-function-call-leaderboard/data/README.md
    ignore_index=True
)

# TODO: validate schema using the OpenAI library

# Two-stage split: hold out 20% first, then halve the held-out part, giving roughly
# 80% train / 10% validation / 10% test. Both splits use the same SEED.
train_test_ds = Dataset.from_pandas(chat_threads_df).train_test_split(seed=SEED, test_size=0.2)
test_validation_ds = train_test_ds['test'].train_test_split(seed=SEED, test_size=0.5)

chat_threads_ds = DatasetDict({
    'train': train_test_ds['train'],
    'test': test_validation_ds['test'],
    'validation': test_validation_ds['train']
})

# TODO: add references to where the examples in the dataset came from to the Hub README
# Best-effort upload: any failure is printed and the notebook keeps running. The
# training cell later reads this dataset back from the Hub by name, so a failed push
# means training uses whatever version is already published there.
try:
    chat_threads_ds.push_to_hub("mjschock/chat_threads", token=HF_TOKEN)

except Exception as e:
    print(e)

In [None]:
# https://docs.anthropic.com/en/docs/build-with-claude/tool-use
# https://github.com/abetlen/llama-cpp-python/blob/c032fc65b0873337ed39e5d63e15468a5d797646/llama_cpp/llama_chat_format.py#L3387
# https://github.com/Mozilla-Ocho/llamafile/blob/66a84d8aea2990895fc4f64786406fea64e79197/llama.cpp/server/server.cpp#L480 (need <|im_start|> b/c Mozilla)
# https://github.com/openai/openai-python/blob/120d225b91a8453e15240a49fb1c6794d8119326/chatml.md
# https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#prompt
# https://huggingface.co/blog/unified-tool-use
#
# ChatML-style Jinja template used for both training and inference:
#   * A default system message is injected when the conversation has none; the system
#     turn also lists the available tools and the JSON reply format.
#   * Assistant turns with a positive weight are wrapped in {% generation %} so that
#     `return_assistant_tokens_mask=True` can locate them.
#   * Tool-call arguments that are already JSON strings are emitted verbatim, so the
#     rendered turn contains a JSON object as the system prompt requires; running
#     `tojson` on such a string would produce an escaped string literal instead.
#     Non-string arguments still go through `tojson`.
#   * The trailing assistant header is emitted only when `add_generation_prompt` is
#     true. Training renders (add_generation_prompt=False) therefore end at the last
#     <|im_end|>, while inference prompts are unchanged.
chat_template = """{%- set system_message_present = messages | selectattr('role', 'equalto', 'system') | list -%}
{%- if not system_message_present -%}
    {%- set messages = [{ "content": "You are an AI assistant capable of using tools.", "role": "system" }] + messages -%}
{%- endif -%}

{% for message in messages %}
<|im_start|>{{ message.role }}
{% if message.role == 'system' %}
{{ message.content }}

You are aware of the following tools:
{
  "tools": [
  {% for tool in tools %}
    {
      "type": "{{ tool.type }}",
      "function": {
        "description": "{{ tool.function.description }}",
        "name": "{{ tool.function.name }}",
        "parameters": {{ tool.function.parameters | tojson }}
      }
    }{% if not loop.last %},{% endif %}

  {% endfor %}
  ]
}

CRITICAL: Always respond with ONLY a JSON object. Do not include any text outside the JSON. Use this exact structure:

{
  "response": "Your entire response goes here as a single string.",
  "tool_calls": [
    {
      "name": "tool_name",
      "arguments": {"param1": "value1", "param2": "value2"},
      "id": "call_id"
    }
  ]
}

Use "call_id" for single tool calls, "call_id_abc123" for multiple calls. If no tool calls are needed, use an empty array for "tool_calls".
<|im_end|>
{% elif message.role == 'user' %}
{{ message.content }}
<|im_end|>
{% elif message.role == 'assistant' %}
  {%- if message.weights | default(1) > 0 -%}
    {% generation %}
    {% if message.content %}{{ message.content }}{% endif %}
    {% if message.tool_calls %}
{
  "tool_calls": [
    {% for tool_call in message.tool_calls %}
    {
      "arguments": {% if tool_call.function.arguments is string %}{{ tool_call.function.arguments }}{% else %}{{ tool_call.function.arguments | tojson }}{% endif %},
      "id": "{{ tool_call.id }}",
      "name": "{{ tool_call.function.name }}"
    }{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
}
    {% endif %}
    {% endgeneration %}
  {%- else -%}
    {% if message.content %}{{ message.content }}{% endif %}
    {% if message.tool_calls %}
{
  "tool_calls": [
    {% for tool_call in message.tool_calls %}
    {
      "arguments": {% if tool_call.function.arguments is string %}{{ tool_call.function.arguments }}{% else %}{{ tool_call.function.arguments | tojson }}{% endif %},
      "id": "{{ tool_call.id }}",
      "name": "{{ tool_call.function.name }}"
    }{% if not loop.last %},{% endif %}
    {% endfor %}
  ]
}
    {% endif %}
  {%- endif -%}
<|im_end|>
{% elif message.role == 'tool' %}
Tool: {{ message.name }}
Result: {{ message.content }}
<|im_end|>
{% endif %}
{% endfor %}
{% if add_generation_prompt %}<|im_start|>assistant{% endif %}"""

# Keep a copy of the template on disk next to the notebook.
with open("chat_template.jinja2", "w", encoding="utf-8") as f:
    f.write(chat_template)

def load_model_and_tokenizer(pretrained_model_name_or_path):
    """
    Build a 4-bit NF4-quantized causal LM and its tokenizer from one checkpoint.

    The tokenizer reuses its EOS token for padding when it has no pad token, and gets
    the notebook-level ``chat_template`` installed.

    Args:
        pretrained_model_name_or_path (str): Model id or local path to load from.

    Returns:
        tuple: ``(model, tokenizer)``.
    """
    compute_dtype = torch.bfloat16

    # NF4 weights with double quantization; matmuls computed in bfloat16.
    nf4_settings = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=compute_dtype,
    )

    model = AutoModelForCausalLM.from_pretrained(
        pretrained_model_name_or_path,
        quantization_config=nf4_settings,
        torch_dtype=compute_dtype,
        device_map="auto",
    )

    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)
    needs_pad_token = tokenizer.pad_token_id is None
    if needs_pad_token:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.chat_template = chat_template

    return model, tokenizer

In [None]:
# Load model and tokenizer
# Input checkpoint is "<model_org_input>/<model_name_input>"; the fine-tuned output is
# later saved under "<model_org_output>/<model_name_output>" (input name plus "-ft").
model_name_input = "TinyLlama-1.1B-Chat-v1.0"
model_name_output = f"{model_name_input}-ft"
model_org_input = "TinyLlama"
model_org_output = "mjschock"
pretrained_model_name_or_path = f"{model_org_input}/{model_name_input}"
model, tokenizer = load_model_and_tokenizer(pretrained_model_name_or_path)

In [None]:
# Show the loaded model's repr as the cell output.
model

In [None]:
def get_formatted_chat(tokenizer):
    """Render a fixed weather question, with one tool schema, through the chat template.

    Args:
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        The value ``apply_chat_template`` produces when called with ``tokenize=False``
        and ``add_generation_prompt=True``.
    """
    location_param = {
        "type": "string",
        "description": "The city and country, eg. San Francisco, USA",
    }
    format_param = {"type": "string", "enum": ["celsius", "fahrenheit"]}

    weather_tool = {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather",
            "parameters": {
                "type": "object",
                "properties": {"location": location_param, "format": format_param},
                "required": ["location", "format"],
            },
        },
    }

    user_turn = {"role": "user", "content": "What's the weather like in Oakland and Atlanta?"}

    return tokenizer.apply_chat_template(
        conversation=[user_turn],
        tools=[weather_tool],
        documents=[],
        add_generation_prompt=True,
        tokenize=False,
    )

In [None]:
def generate_formatted_response(formatted_chat, model, tokenizer):
    """Generate a continuation of an already-formatted prompt and return only the new text.

    Args:
        formatted_chat (str): Prompt text; tokenized without adding special tokens.
        model: Object exposing ``device`` and ``generate``.
        tokenizer: Callable tokenizer that also exposes ``decode``.

    Returns:
        str: Decoded tokens produced after the prompt (up to 512), special tokens skipped.
    """
    encoded = tokenizer(formatted_chat, return_tensors="pt", add_special_tokens=False)

    # Move every encoded tensor to wherever the model lives.
    model_inputs = {}
    for name, tensor in encoded.items():
        model_inputs[name] = tensor.to(model.device)

    generated = model.generate(**model_inputs, max_new_tokens=512)

    # The output sequence starts with the prompt; slice it off before decoding.
    prompt_length = model_inputs["input_ids"].size(1)
    continuation_ids = generated[0][prompt_length:]
    return tokenizer.decode(continuation_ids, skip_special_tokens=True)

In [None]:
# Render the sample prompt once; the same string is reused for generation both before
# and after fine-tuning.
formatted_chat = get_formatted_chat(tokenizer)
print(formatted_chat)

In [None]:
# Baseline: response to the sample prompt from the model as loaded, before the LoRA
# fine-tuning further down.
print(generate_formatted_response(formatted_chat, model, tokenizer))

In [None]:
# raise Exception('Stop!')

In [None]:
def load_and_preprocess_data(dataset, tokenizer):
    """
    Tokenize chat threads and build assistant-only training labels.

    Every row is rendered and tokenized through ``tokenizer.apply_chat_template``.
    Positions flagged in the returned assistant mask keep their token id as the label;
    every other position gets the label -100.

    Args:
        dataset: A ``datasets.Dataset`` or ``datasets.DatasetDict`` whose rows have a
            'messages' field and, optionally, 'documents' and 'tools'.
        tokenizer: Tokenizer to use for preprocessing. Its chat template must support
            ``return_assistant_tokens_mask=True``.

    Returns:
        The result of ``dataset.map``: the input columns are removed and replaced by
        'attention_mask', 'input_ids' and 'labels'.
    """
    def preprocess_function(examples):
        # With batched=False, `examples` is a single row.
        conversation = examples['messages']
        documents = examples.get('documents', [])
        tools = examples.get('tools', [])

        # Render + tokenize in one step and ask for the assistant-token mask.
        tokenized_output = tokenizer.apply_chat_template(
            add_generation_prompt=False,
            conversation=conversation,
            documents=documents,
            max_length=4096,
            padding="longest",
            return_assistant_tokens_mask=True,
            return_dict=True,
            return_tensors="pt",
            tokenize=True,
            tools=tools,
            truncation=True,
        )

        # return_tensors="pt" adds a leading batch axis of size 1; drop it.
        input_ids = tokenized_output['input_ids'][0]
        attention_mask = tokenized_output['attention_mask'][0]

        # Flatten the mask so it lines up with the 1-D input_ids whether the tokenizer
        # returned it flat or with a leading batch axis.
        # NOTE(review): the flat-list form is what this notebook relies on; the batched
        # form is handled defensively — confirm against the installed transformers.
        assistant_masks = torch.tensor(tokenized_output['assistant_masks']).reshape(-1)

        # Keep the token id where the mask is 1, otherwise -100.
        labels = torch.where(assistant_masks == 1, input_ids, torch.tensor(-100))

        return {
            'attention_mask': attention_mask,
            'input_ids': input_ids,
            'labels': labels
        }

    # A DatasetDict reports column names per split (a dict); a Dataset reports a list.
    column_names = dataset.column_names
    if isinstance(column_names, dict):
        if "train" in column_names:
            column_names = column_names["train"]
        else:
            column_names = next(iter(column_names.values()))

    # TODO: use batched=True and figure out how to pass tools
    return dataset.map(preprocess_function, batched=False, num_proc=1, remove_columns=column_names)

def prepare_model_for_training(model):
    """
    Wrap a quantized model with LoRA adapters for fine-tuning.

    The model is first passed through ``prepare_model_for_kbit_training`` and then
    through ``get_peft_model`` with a rank-8 LoRA configuration.

    Args:
        model: The model to adapt.

    Returns:
        The object returned by ``get_peft_model``.
    """
    # Adapters go on the four attention projections only.
    attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]

    lora_settings = dict(
        r=8,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        target_modules=attention_projections,
        task_type="CAUSAL_LM",
    )

    kbit_ready_model = prepare_model_for_kbit_training(model)
    return get_peft_model(kbit_ready_model, LoraConfig(**lora_settings))

In [None]:
# Load and preprocess data
# The dataset is fetched from the Hub by name rather than taken from the in-memory
# `chat_threads_ds` built earlier in the notebook.
dataset = load_dataset("mjschock/chat_threads")
tokenized_dataset = load_and_preprocess_data(dataset, tokenizer)

# Prepare model for training
# `model` is rebound to the LoRA-wrapped model, so running this cell a second time
# would pass the already-wrapped model through the preparation again.
model = prepare_model_for_training(model)

# Print trainable parameters
model.print_trainable_parameters()
# total_params = sum(p.numel() for p in model.parameters())
# total_trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
# print(f"Total parameters: {total_params}")
# print(f"Total trainable parameters: {total_trainable_params}")

# Define training arguments, grouped by concern
training_args = TrainingArguments(
    # --- run identity and outputs ---
    run_name=f"{model_org_output}/{model_name_output}",
    output_dir="./checkpoints",
    logging_dir="./logs",
    report_to=["mlflow"],
    push_to_hub=False,
    # --- batch sizing and memory ---
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    auto_find_batch_size=True,
    bf16=True,
    dataloader_pin_memory=False,
    # --- optimisation schedule ---
    num_train_epochs=2,
    optim="adamw_hf",
    learning_rate=2e-5,
    warmup_steps=100,
    weight_decay=0.01,
    # --- evaluation, logging and checkpointing (eval and save share one cadence) ---
    eval_on_start=True,
    eval_strategy="steps",
    eval_steps=200,
    logging_steps=10,
    save_strategy="steps",
    save_steps=200,
    save_total_limit=1,
    load_best_model_at_end=True,
    # --- data handling: keep every column the preprocessing step produced ---
    remove_unused_columns=False,
)

# Data collator
# The tokenized rows already carry assistant-only `labels` (non-assistant positions are
# -100). DataCollatorForLanguageModeling(mlm=False) would rebuild `labels` from
# `input_ids` and discard that masking, so a collator that keeps the existing labels is
# used instead: DataCollatorForSeq2Seq pads `input_ids`/`attention_mask` through the
# tokenizer and pads `labels` with `label_pad_token_id`.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    padding=True,
    label_pad_token_id=-100,
)

# Initialize Trainer
# Evaluation runs on the 'validation' split; the 'test' split is not used in this cell.
trainer = Trainer(
    args=training_args,
    data_collator=data_collator,
    eval_dataset=tokenized_dataset["validation"],
    model=model,
    tokenizer=tokenizer,
    train_dataset=tokenized_dataset["train"],
)

# Fine-tune the model
trainer.train(
    # resume_from_checkpoint=True,
)

# Save the fine-tuned model and tokenizer
# Both are written to the local directory "<model_org_output>/<model_name_output>".
model.save_pretrained(f"{model_org_output}/{model_name_output}")
tokenizer.save_pretrained(f"{model_org_output}/{model_name_output}")

In [None]:
# del data_collator
# del model
# del tokenized_dataset
# del tokenizer
# del trainer
# gc.collect()
# gc.collect()

In [None]:
# Load model and tokenizer
# model, tokenizer = load_model_and_tokenizer(f"{model_org_output}/{model_name_output}")

# The model may still be in training mode after trainer.train(), which would leave the
# LoRA dropout active while sampling. Switch to eval mode so this output is comparable
# with the pre-training baseline generated from the same prompt.
model.eval()

print(generate_formatted_response(formatted_chat, model, tokenizer))