In [1]:
import json
import os
from typing import List

from datasets import load_dataset
import mlflow
from mlflow.entities.span import (
    SpanType,
)
from mlflow.types.llm import (
    ChatChoice,
    ChatChoiceLogProbs,
    ChatMessage,
    ChatParams,
    ChatResponse,
    FunctionToolCallArguments,
    FunctionToolDefinition,
    ParamProperty,
    ToolCall,
    ToolParamsSchema,
)
import torch
from transformers import (
    DataCollatorForLanguageModeling,
    TrainingArguments,
)
from trl import SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


In [2]:
max_seq_length = 4096  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
model_name_to_fourbit_model = {
    "meta-llama/Llama-3.2-1B-Instruct": "unsloth/llama-3.2-1b-instruct-bnb-4bit",
    "meta-llama/Llama-3.2-3B-Instruct": "unsloth/llama-3.2-3b-instruct-bnb-4bit",
    "meta-llama/Llama-3.2-11B-Vision-Instruct": "unsloth/llama-3.2-11b-vision-instruct-bnb-4bit",
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0": "unsloth/tinyllama-chat-bnb-4bit",
}

# model_name = "meta-llama/Llama-3.2-1B-Instruct"
# model_name = "meta-llama/Llama-3.2-3B-Instruct"
# model_name = "meta-llama/Llama-3.2-11B-Vision-Instruct"
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

print('model_name: ', model_name)
print('model.config._name_or_path: ', model.config._name_or_path)

assert model.config._name_or_path == model_name_to_fourbit_model.get(model_name), f"{model.config._name_or_path} != {model_name_to_fourbit_model.get(model_name)}"

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA GeForce GTX 1050 Ti. Max memory: 3.94 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 6.1. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/tinyllama-chat-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!


model_name:  TinyLlama/TinyLlama-1.1B-Chat-v1.0
model.config._name_or_path:  unsloth/tinyllama-chat-bnb-4bit


In [3]:
# Influenced by:
# https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models
# https://docs.anthropic.com/en/docs/build-with-claude/tool-use
# https://github.com/abetlen/llama-cpp-python/blob/7c4aead82d349469bbbe7d8c0f4678825873c039/llama_cpp/llama_chat_format.py#L3387
# https://github.com/Mozilla-Ocho/llamafile/blob/66a84d8aea2990895fc4f64786406fea64e79197/llama.cpp/server/server.cpp#L480 (need <|im_start|> b/c Mozilla)
# https://github.com/openai/openai-python/blob/120d225b91a8453e15240a49fb1c6794d8119326/chatml.md
# https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#prompt
# https://huggingface.co/blog/unified-tool-use
chat_template = (
    "{%- set system_message_present = messages | selectattr('role', 'equalto', 'system') | list -%}"
    "{%- if not system_message_present -%}"
    '{%- set messages = [{ "content": "You are an AI agent acting as a human assistant.", "role": "system" }] + messages -%}'
    "{%- endif -%}"
    "{%- for message in messages -%}"
    # "<|im_start|>{{ message.role }}{{ '\n' }}"
    "<|{{ message.role }}|>{{ '\n' }}"
    # System message
    "{%- if message.role == 'system' -%}"
    "{{ message.content }}"
    "{%- if tools and tools | length > 0 -%}"
    "{{ '\n\n' }}You are aware of the following tools in your environment:{{ '\n' }}"
    "{\n"
    "  \"tools\": [{{ '\n' }}"
    "{%- for tool in tools -%}"
    "{{ '    ' }}{\n"
    '      "function": {\n'
    '        "description": "{{ tool.function.description }}",{{ \'\n\' }}'
    '        "name": "{{ tool.function.name }}",{{ \'\n\' }}'
    "        \"parameters\": {{ tool.function.parameters | tojson }}{{ '\n' }}"
    # "        \"parameters\": {\n"
    # "{{ '        ' }}}\n"
    "      },{{ '\n' }}"
    '      "type": "{{ tool.type }}"{{ \'\n\' }}'
    "    }{%- if not loop.last -%},{%- endif -%}{{ '\n' }}"
    "{%- endfor -%}"
    "{{ '  ' }}]{{ '\n' }}"
    "}"
    "{{ '\n\n' }}If you would like to suggest one or more tool calls, please respond in the following format:{{ '\n' }}"
    "{\n"
    '  "finish_reason": "tool_calls",{{ \'\n\' }}'
    "  \"tool_calls\": [{{ '\n' }}"
    "{{ '    ' }}{\n"
    '      "arguments": "{\\"parameter_name\\": \\"parameter_value\\"}",{{ \'\n\' }}'
    '      "id": "call_id",{{ \'\n\' }}'
    '      "name": "tool_name"{{ \'\n\' }}'
    "    }{{ '\n' }}"
    "  ]{{ '\n' }}"
    "}"
    "{%- endif -%}"
    # "<|im_end|>{{ '\n' }}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    # User message
    "{%- if message.role == 'user' -%}"
    "{{ message.content }}"
    # "<|im_end|>{{ '\n' }}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    # Assistant message
    "{%- if message.role == 'assistant' -%}"
    "{% generation %}"
    ## Tool calls (Actions)
    "{%- if message.tool_calls and message.tool_calls | length > 0 -%}"
    "{\n"
    '  "finish_reason": "tool_calls",{{ \'\n\' }}'
    "  \"tool_calls\": [{{ '\n' }}"
    "{%- for tool_call in message.tool_calls -%}"
    "{{ '    ' }}{\n"
    "      \"arguments\": {{ tool_call.function.arguments | tojson }},{{ '\n' }}"
    '      "id": "{{ tool_call.id }}",{{ \'\n\' }}'
    '      "name": "{{ tool_call.function.name }}"{{ \'\n\' }}'
    "    }{%- if not loop.last -%},{%- endif -%}{{ '\n' }}"
    "{%- endfor -%}"
    "{{ '  ' }}]{{ '\n' }}"
    "}"
    "{%- else -%}"
    ## Regular message
    "{{ message.content }}"
    "{%- endif -%}"
    "{% endgeneration %}"
    # "<|im_end|>{{ '\n' }}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    ## Tool message (Observations)
    "{%- if message.role == 'tool' -%}"
    "{\n"
    "  \"content\": {{ message.content | tojson }},{{ '\n' }}"
    '  "name": "{{ message.name }}",{{ \'\n\' }}'
    '  "tool_call_id": "{{ message.tool_call_id }}"{{ \'\n\' }}'
    "}"
    # "<|im_end|>{{ '\n' }}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    "{%- endfor -%}"
    # "{%- if add_generation_prompt -%}<|im_start|>assistant\n{%- endif -%}"
    # f"{{{{ '{self.assistant}\n' }}}}"
    "{%- if add_generation_prompt -%}"
    + f"{{{{ '<|assistant|>\n' }}}}"
    + "{%- endif -%}"
)

with open("tokenizer.chat_template.initial.jinja", "w") as f:
    f.write(tokenizer.chat_template)

tokenizer = get_chat_template(
    tokenizer,
    chat_template=(
        chat_template,
        "eos_token"
    ),  # You must provide a template and EOS token
    # mapping={
    #     "role": "from",
    #     "content": "value",
    #     "user": "human",
    #     "assistant": "gpt",
    # },  # ShareGPT style
    map_eos_token=True,  # Maps <|im_end|> to </s> instead
)

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template="llama-3.1", # TODO: update to 3.2, see ollama_modefile here: https://ollama.com/library/llama3.2/blobs/966de95ca8a6
# )

with open("tokenizer.chat_template.jinja", "w") as f:
    f.write(tokenizer.chat_template)

if hasattr(tokenizer, "_ollama_modelfile") and tokenizer._ollama_modelfile:
    with open("tokenizer._ollama_modelfile.go", "w") as f:
        f.write(tokenizer._ollama_modelfile)

else:
    if os.path.exists("tokenizer._ollama_modelfile.go"):
        os.remove("tokenizer._ollama_modelfile.go")

In [4]:
# dataset = load_dataset("mlabonne/FineTome-100k", split="train")
# dataset = load_dataset("philschmid/guanaco-sharegpt-style", split = "train")
train_dataset = load_dataset("mjschock/chat_threads", split="train")
validation_dataset = load_dataset("mjschock/chat_threads", split="validation")
test_dataset = load_dataset("mjschock/chat_threads", split="test")

example = test_dataset[0]
example

{'documents': '[]',
 'has_parallel_tool_calls': True,
 'messages': '[{"role": "user", "content": "What\'s the weather like in San Francisco and New York?"}, {"role": "assistant", "tool_calls": [{"id": "call_0", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\\"location\\": \\"San Francisco, USA\\", \\"format\\": \\"celsius\\"}"}}, {"id": "call_1", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\\"location\\": \\"New York, USA\\", \\"format\\": \\"celsius\\"}"}}]}, {"role": "tool", "name": "get_current_weather", "tool_call_id": "call_0", "content": "21.0"}, {"role": "tool", "name": "get_current_weather", "tool_call_id": "call_1", "content": "18.5"}, {"role": "assistant", "content": "The current temperature in San Francisco is 21\\u00b0C (70\\u00b0F), while in New York it\'s 18.5\\u00b0C (65\\u00b0F)."}]',
 'tools': '[{"type": "function", "function": {"name": "get_current_weather", "description": "Get the current weather", "

In [5]:
messages = json.loads(example["messages"])[0:2]
messages

[{'role': 'user',
  'content': "What's the weather like in San Francisco and New York?"},
 {'role': 'assistant',
  'tool_calls': [{'id': 'call_0',
    'type': 'function',
    'function': {'name': 'get_current_weather',
     'arguments': '{"location": "San Francisco, USA", "format": "celsius"}'}},
   {'id': 'call_1',
    'type': 'function',
    'function': {'name': 'get_current_weather',
     'arguments': '{"location": "New York, USA", "format": "celsius"}'}}]}]

In [6]:
text = tokenizer.apply_chat_template(
    add_generation_prompt=True,
    conversation=messages,
    documents=json.loads(example["documents"]),
    tools=json.loads(example["tools"]),
    tokenize=False,
)

prompt = text.split("<|assistant|>\n")[0]
response = text.split("<|assistant|>\n")[1]

In [7]:
print("prompt:")
print(prompt)

prompt:
<|system|>
You are an AI agent acting as a human assistant.

You are aware of the following tools in your environment:
{
  "tools": [
    {
      "function": {
        "description": "Get the current weather",
        "name": "get_current_weather",
        "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and country, eg. San Francisco, USA"}, "format": {"type": "string", "enum": ["celsius", "fahrenheit"]}}, "required": ["location", "format"]}
      },
      "type": "function"
    }
  ]
}

If you would like to suggest one or more tool calls, please respond in the following format:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"parameter_name\": \"parameter_value\"}",
      "id": "call_id",
      "name": "tool_name"
    }
  ]
}</s>
<|user|>
What's the weather like in San Francisco and New York?</s>



In [8]:
print("response (ground truth):")
print(response)

response (ground truth):
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"location\": \"San Francisco, USA\", \"format\": \"celsius\"}",
      "id": "call_0",
      "name": "get_current_weather"
    },
    {
      "arguments": "{\"location\": \"New York, USA\", \"format\": \"celsius\"}",
      "id": "call_1",
      "name": "get_current_weather"
    }
  ]
}</s>



In [9]:
def load_and_preprocess_data(dataset, tokenizer):
    """
    Load and preprocess the dataset for training.

    Args:
        dataset: The dataset to preprocess
        tokenizer: Tokenizer to use for preprocessing.

    Returns:
        datasets.Dataset: Preprocessed dataset.
    """

    def preprocess_function(examples):
        # Extract the messages from the example
        conversation = examples["messages"]
        documents = examples.get("documents", [])
        tools = examples.get("tools", [])

        # Apply chat template to generate tokenized input and assistant mask
        tokenized_output = tokenizer.apply_chat_template(
            add_generation_prompt=False,
            conversation=json.loads(conversation),
            documents=json.loads(documents),
            max_length=4096,
            padding="longest",
            return_assistant_tokens_mask=True,
            return_dict=True,
            return_tensors="pt",
            tokenize=True,
            tools=json.loads(tools),
            truncation=True,  # TODO: verify we're not truncating anything in the datasets
        )

        # Extract the input IDs and assistant tokens mask
        input_ids = tokenized_output["input_ids"][0]
        assistant_masks = torch.tensor(tokenized_output["assistant_masks"])
        attention_mask = tokenized_output["attention_mask"][0]

        # Use the assistant mask to create labels
        labels = torch.where(assistant_masks == 1, input_ids, torch.tensor(-100))

        return {
            "attention_mask": attention_mask,
            "input_ids": input_ids,
            "labels": labels,
        }

    # Preprocess the dataset
    return dataset.map(
        preprocess_function,
        batched=False,
        num_proc=1,
        remove_columns=dataset.column_names,
    )  # TODO: use batched=True


tokenized_train_dataset = load_and_preprocess_data(
    train_dataset,
    tokenizer,
)

tokenized_validation_dataset = load_and_preprocess_data(
    validation_dataset,
    tokenizer,
)

In [10]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# example = dataset[5]

inputs = tokenizer.apply_chat_template(
    add_generation_prompt=True,
    conversation=messages[0:1], # just the user message
    documents=json.loads(example["documents"]),
    # conversation=json.loads(example["messages"])[0:-1],
    tools=json.loads(example["tools"]),
    return_tensors="pt",
    tokenize=True,
).to("cuda")

outputs = model.generate(
    # input_ids=inputs, max_new_tokens=64, use_cache=True, temperature=1.5, min_p=0.1
    input_ids=inputs,
    max_new_tokens=256,
    use_cache=True,
    temperature=0.0,
)
batch_decoded_outputs = tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [11]:
print('prompt:')
print(batch_decoded_outputs[0][0:len(tokenizer.decode(inputs[0]))])

prompt:
<|system|>
You are an AI agent acting as a human assistant.

You are aware of the following tools in your environment:
{
  "tools": [
    {
      "function": {
        "description": "Get the current weather",
        "name": "get_current_weather",
        "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and country, eg. San Francisco, USA"}, "format": {"type": "string", "enum": ["celsius", "fahrenheit"]}}, "required": ["location", "format"]}
      },
      "type": "function"
    }
  ]
}

If you would like to suggest one or more tool calls, please respond in the following format:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"parameter_name\": \"parameter_value\"}",
      "id": "call_id",
      "name": "tool_name"
    }
  ]
}</s> 
<|user|>
What's the weather like in San Francisco and New York?</s> 
<|assistant|>



In [12]:
print('response (predicted, before training):')
print(batch_decoded_outputs[0][len(tokenizer.decode(inputs[0])):])

response (predicted, before training):
I don't have access to real-time weather data, but according to the information available data, the weather in san francisco and new york are both hot and humid. San francisco has a maximum temperature of 329.4°c (984.7°f) and minimum of 25.4° (39.3°c) degrees, while new york has a maximum of 39.4° (41.3°c) and minimum 25.° (3.3°) degrees.</s>


In [13]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    # lora_alpha = 16,
    lora_alpha=32,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth: Offloading output_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)
Unsloth 2024.11.5 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


Unsloth: Training lm_head in mixed precision to save VRAM


In [14]:
trainer = SFTTrainer(
    # compute_metrics=compute_metrics,
    model=model,
    tokenizer=tokenizer,
    # train_dataset=dataset,
    train_dataset=tokenized_train_dataset,
    # dataset_text_field="text",
    max_seq_length=max_seq_length,
    # data_collator=DataCollatorForSeq2Seq(
    #     tokenizer=tokenizer
    # ),  # TODO: comment this out?
    data_collator=DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer),
    # dataset_num_proc=2,
    dataset_num_proc=1,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        # per_device_train_batch_size=2,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [15]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|system|>\nYou are an AI agent acting as a human assistant.\n\nYou are aware of the following tools in your environment:\n{\n  "tools": [\n    {\n      "function": {\n        "description": "Calculates the standard deviation of a list of numbers.",\n        "name": "calculate_standard_deviation",\n        "parameters": {"properties": {"numbers": {"type": "array", "description": "The list of numbers.", "items": {"type": "number"}}}, "type": "object", "required": ["numbers"]}\n      },\n      "type": "function"\n    },\n    {\n      "function": {\n        "description": "Calculates the n numbers of the Fibonacci.",\n        "name": "get_fibonacci_sequence",\n        "parameters": {"properties": {"n": {"type": "integer", "description": "The number of Fibonacci numbers to calculate."}}, "type": "object", "required": ["n"]}\n      },\n      "type": "function"\n    }\n  ]\n}\n\nIf you would like to suggest one or more tool calls, please respond in the following format:\n{\n  "finish_reason

In [16]:
space = tokenizer(" ", add_special_tokens=False).input_ids[0]
tokenizer.decode(
    [space if x == -100 else x for x in trainer.train_dataset[5]["labels"]]
)

'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           {\n  "finish_reason": "tool_calls",\n  "tool_calls": [\n    {\n      "arguments": "{\\"n\\": 10}",\n      "id": "call_0",\n 

In [17]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce GTX 1050 Ti. Max memory = 3.94 GB.
1.086 GB of memory reserved.


In [18]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 240 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 60
 "-____-"     Number of trainable parameters = 78,151,680


  0%|          | 0/60 [00:00<?, ?it/s]

{'loss': 1.9212, 'grad_norm': 8.343274116516113, 'learning_rate': 4e-05, 'epoch': 0.02}
{'loss': 1.9121, 'grad_norm': 5.440635681152344, 'learning_rate': 8e-05, 'epoch': 0.03}
{'loss': 1.741, 'grad_norm': 7.624862194061279, 'learning_rate': 0.00012, 'epoch': 0.05}
{'loss': 1.3817, 'grad_norm': 3.8515398502349854, 'learning_rate': 0.00016, 'epoch': 0.07}
{'loss': 1.1539, 'grad_norm': 2.4772722721099854, 'learning_rate': 0.0002, 'epoch': 0.08}
{'loss': 0.8734, 'grad_norm': 1.6178892850875854, 'learning_rate': 0.00019636363636363636, 'epoch': 0.1}
{'loss': 0.8139, 'grad_norm': 1.440341591835022, 'learning_rate': 0.00019272727272727274, 'epoch': 0.12}
{'loss': 0.6164, 'grad_norm': 1.3312119245529175, 'learning_rate': 0.0001890909090909091, 'epoch': 0.13}
{'loss': 0.687, 'grad_norm': 1.6745420694351196, 'learning_rate': 0.00018545454545454545, 'epoch': 0.15}
{'loss': 0.4965, 'grad_norm': 1.001059651374817, 'learning_rate': 0.00018181818181818183, 'epoch': 0.17}
{'loss': 0.4776, 'grad_norm':

In [19]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

3551.914 seconds used for training.
59.2 minutes used for training.
Peak reserved memory = 2.699 GB.
Peak reserved memory for training = 1.613 GB.
Peak reserved memory % of max memory = 68.503 %.
Peak reserved memory for training % of max memory = 40.939 %.


In [20]:
tokenizer.eos_token

'</s>'

In [21]:
def chat_completion_request(
    documents: list,
    messages: list,
    tools: list,
):
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    inputs = tokenizer.apply_chat_template(
        add_generation_prompt=True,
        conversation=messages,
        documents=documents,
        return_tensors="pt",
        tokenize=True,
        tools=tools,
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=256,
        use_cache=True,
        temperature=0.0,
    )

    batch_decoded_outputs = tokenizer.batch_decode(outputs)

    choices: List[ChatChoice] = []

    for i in range(len(batch_decoded_outputs)):
        prompt = batch_decoded_outputs[i][0 : len(tokenizer.decode(inputs[i]))]
        response = batch_decoded_outputs[i][len(tokenizer.decode(inputs[i])) :]

        response = response.replace(tokenizer.eos_token, "") # TODO: skip special tokens when decoding instead?

        print('response:')
        print(response)

        try:
            response = json.loads(response)

            finish_reason: str = response.get("finish_reason")

            tool_calls_json = response.get("tool_calls")
            tool_calls: List[ToolCall] = []

            # tool_calls: List[ToolCall] = ToolCall(
            #     function=FunctionToolDefinition(
            #         description=response.get("description"),
            #         name=response.get("name"),
            #         parameters=response.get("parameters"),
            #     ),
            # )

            for tool_call_json in tool_calls_json:
                print('tool_call_json:')
                print(tool_call_json)

                tool_call = ToolCall(
                    function=FunctionToolCallArguments(
                        arguments=tool_call_json.get("arguments"),
                        name=tool_call_json.get("name"),
                    ),
                    id=tool_call_json.get("id"),
                    # type=tool_call_json.get("type"),
                    type="function",
                )

                tool_calls.append(tool_call)

            message: ChatMessage = ChatMessage(
                # role: str,
                # content: str | None = None,
                # refusal: str | None = None,
                # name: str | None = None,
                # tool_calls: List[ToolCall] | None = None,
                # tool_call_id: str | None = None,
                role="assistant",
                tool_calls=tool_calls,
            )

        except json.JSONDecodeError:
            response = response

            finish_reason: str = "stop"
            message: ChatMessage = ChatMessage(
                role="assistant",
                content=response,
            )

        choices.append(
            ChatChoice(
                index=i,
                finish_reason=finish_reason,
                logprobs=None,
                message=message,
            )
        )

    # return batch_decoded_outputs
    return ChatResponse(
        choices=choices,
    )

# example = dataset[0]

documents = json.loads(example["documents"])
# conversation = json.loads(example["messages"])[0:-1]
tools = json.loads(example["tools"])

print('tools:')
print(tools)

# print('ground truth:')
# print(json.loads(example["messages"])[-1])

chat_response = chat_completion_request(
    documents=documents,
    # messages=conversation,
    messages=messages[0:1],
    tools=tools,
)

chat_response

tools:
[{'type': 'function', 'function': {'name': 'get_current_weather', 'description': 'Get the current weather', 'parameters': {'type': 'object', 'properties': {'location': {'type': 'string', 'description': 'The city and country, eg. San Francisco, USA'}, 'format': {'type': 'string', 'enum': ['celsius', 'fahrenheit']}}, 'required': ['location', 'format']}}}]
response:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"location\": \"'San Francisco, USA'\", \"format\": \"'celsius'\"}",
      "id": "call_0",
      "name": "get_current_weather"
    },
    {
      "arguments": "{\"location\": \"'New York, USA'\", \"format\": \"'fahrenheit'\"}",
      "id": "call_1",
      "name": "get_current_weather"
    }
  ]
}
tool_call_json:
{'arguments': '{"location": "\'San Francisco, USA\'", "format": "\'celsius\'"}', 'id': 'call_0', 'name': 'get_current_weather'}
tool_call_json:
{'arguments': '{"location": "\'New York, USA\'", "format": "\'fahrenheit\'"}', 'id': 'cal

ChatResponse(choices=[ChatChoice(index=0, message=ChatMessage(role='assistant', content=None, refusal=None, name=None, tool_calls=[ToolCall(function=FunctionToolCallArguments(name='get_current_weather', arguments='{"location": "\'San Francisco, USA\'", "format": "\'celsius\'"}'), id='call_0', type='function'), ToolCall(function=FunctionToolCallArguments(name='get_current_weather', arguments='{"location": "\'New York, USA\'", "format": "\'fahrenheit\'"}'), id='call_1', type='function')], tool_call_id=None), finish_reason='tool_calls', logprobs=None)], usage=None, id=None, model=None, object='chat.completion', created=1731646314, metadata=None)

In [22]:
chat_response.choices[0].message

ChatMessage(role='assistant', content=None, refusal=None, name=None, tool_calls=[ToolCall(function=FunctionToolCallArguments(name='get_current_weather', arguments='{"location": "\'San Francisco, USA\'", "format": "\'celsius\'"}'), id='call_0', type='function'), ToolCall(function=FunctionToolCallArguments(name='get_current_weather', arguments='{"location": "\'New York, USA\'", "format": "\'fahrenheit\'"}'), id='call_1', type='function')], tool_call_id=None)

In [23]:
# print('prompt:')
# print(batch_decoded_outputs[0][0:len(tokenizer.decode(inputs[0]))])

In [24]:
# print('response:')
# print(batch_decoded_outputs[0][len(tokenizer.decode(inputs[0])):])

In [25]:
# automatically trace OpenAI SDK calls
# mlflow.openai.autolog()

class WeatherAgent(mlflow.pyfunc.ChatModel):
    def __init__(self):
        # a sample tool definition. we use the `FunctionToolDefinition`
        # class to describe the name and expected params for the tool.
        # for this example, we're defining a simple tool that returns
        # the weather for a given city.
        weather_tool = FunctionToolDefinition(
            name="get_weather",
            description="Get weather information",
            parameters=ToolParamsSchema(
                {
                    "city": ParamProperty(
                        type="string",
                        description="City name to get weather information for",
                    ),
                }
            ),
            # make sure to call `to_tool_definition()` to convert the `FunctionToolDefinition`
            # to a `ToolDefinition` object. this step is necessary to normalize the data format,
            # as multiple types of tools (besides just functions) might be available in the future.
        ).to_tool_definition()

        # OpenAI expects tools to be provided as a list of dictionaries
        self.tools = [weather_tool.to_dict()]

    @mlflow.trace(span_type=SpanType.TOOL)
    def get_weather(self, city: str) -> str:
        # in a real-world scenario, the implementation might be more complex
        return f"It's sunny in {city}, with a temperature of 20C"

    # the core method that needs to be implemented. this function
    # will be called every time a user sends messages to our model
    @mlflow.trace(span_type=SpanType.AGENT)
    def predict(self, context, messages: list[ChatMessage], params: ChatParams):
        # instantiate the OpenAI client
        # client = OpenAI()

        # convert the messages to a format that the OpenAI API expects
        messages = [m.to_dict() for m in messages]

        # call the OpenAI API
        # response = client.chat.completions.create(
        #     model="gpt-4o-mini",
        #     messages=messages,
        #     # pass the tools in the request
        #     tools=self.tools,
        # )

        response = chat_completion_request(
            documents=[], # we don't need documents for this example
            messages=messages,
            tools=self.tools,
        )

        # if OpenAI returns a tool_calling response, then we call
        # our tool. otherwise, we just return the response as is
        tool_calls = response.choices[0].message.tool_calls

        if tool_calls:
            print("Received a tool call, calling the weather tool...")

            # for this example, we only provide the model with one tool,
            # so we can assume the tool call is for the weather tool. if
            # we had more, we'd need to check the name of the tool that
            # was called
            city = json.loads(tool_calls[0].function.arguments)["city"]
            tool_call_id = tool_calls[0].id

            # call the tool and construct a new chat message
            tool_response = ChatMessage(
                role="tool", content=self.get_weather(city), tool_call_id=tool_call_id
            ).to_dict()

            # send another request to the API, making sure to append
            # the assistant's tool call along with the tool response.
            messages.append(response.choices[0].message)
            messages.append(tool_response)

            # response = client.chat.completions.create(
            #     model="gpt-4o-mini",
            #     messages=messages,
            #     tools=self.tools,
            # )

            response = chat_completion_request(
                documents=[], # we don't need documents for this example
                messages=messages,
                tools=self.tools,
            )

        # return the result as a ChatResponse, as this
        # is the expected output of the predict method
        return ChatResponse.from_dict(response.to_dict())

In [26]:
# weather_example = test_dataset[24]
# weather_example = validation_dataset[123]
# weather_example

In [27]:
# messages = json.loads(weather_example["messages"])[0]
# tools = json.loads(weather_example["tools"])

In [None]:
# messages to use as input examples
# messages = [
#     {
#         "role": "system",
#         "content": "Please use the provided tools to answer user queries.",
#     },
#     {"role": "user", "content": "What's the weather in Singapore?"},
# ]

input_example = {
    "messages": messages[0:1],
}

# instantiate the model
agent = WeatherAgent()

# log the model
with mlflow.start_run():
    model_info = mlflow.pyfunc.log_model(
        artifact_path="weather-model",
        python_model=agent,
        input_example=input_example,
    )

    print("Successfully logged the model at the following URI: ", model_info.model_uri)

2024/11/14 20:51:55 INFO mlflow.pyfunc: Predicting on input example to validate output


response:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"city\": \"'San Francisco'\"}",
      "id": "call_0",
      "name": "get_weather"
    },
    {
      "arguments": "{\"city\": \"'New York'\"}",
      "id": "call_1",
      "name": "get_weather"
    }
  ]
}
tool_call_json:
{'arguments': '{"city": "\'San Francisco\'"}', 'id': 'call_0', 'name': 'get_weather'}
tool_call_json:
{'arguments': '{"city": "\'New York\'"}', 'id': 'call_1', 'name': 'get_weather'}
Received a tool call, calling the weather tool...
response:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"city\": \"'New York'\"}",
      "id": "call_1",
      "name": "get_weather"
    }
  ]
}
tool_call_json:
{'arguments': '{"city": "\'New York\'"}', 'id': 'call_1', 'name': 'get_weather'}


In [None]:
raise Exception("Stop here")

In [None]:
# model.save_pretrained("lora_model")
# tokenizer.save_pretrained("lora_model")

In [None]:
# # Merge to 16bit
# model.save_pretrained_merged(
#     "model_merged_16bit_vllm",
#     tokenizer,
#     save_method="merged_16bit",
# )

# # if False:
# #     model.push_to_hub_merged(
# #         "hf/model", tokenizer, save_method="merged_16bit", token=""
# #     )

# # # Merge to 4bit
# # # model.save_pretrained_merged(
# # #     "model_merged_4bit",
# # #     merged_4bit_forced=True,
# # #     save_method="merged_4bit",
# # #     tokenizer=tokenizer,
# # # )

# # if False:
# #     model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_4bit", token="")

# # # Just LoRA adapters
# # # model.save_pretrained_merged(
# # #     "model_lora",
# # #     tokenizer,
# # #     save_method="lora",
# # # )

# # if False:
# #     model.push_to_hub_merged("hf/model", tokenizer, save_method="lora", token="")

<a name="Ollama"></a>
### Ollama Support

[Unsloth](https://github.com/unslothai/unsloth) now allows you to automatically finetune and create a [Modelfile](https://github.com/ollama/ollama/blob/main/docs/modelfile.md), and export to [Ollama](https://ollama.com/)! This makes finetuning much easier and provides a seamless workflow from `Unsloth` to `Ollama`!

Let's first install `Ollama`!

In [None]:
# !curl -fsSL https://ollama.com/install.sh | sh

Next, we shall save the model to GGUF / llama.cpp

We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

We also support saving to multiple GGUF options in a list fashion! This can speed things up by 10 minutes or more if you want multiple export formats!

In [None]:
# Save to 8bit Q8_0
# model.save_pretrained_gguf(
#     # "model_q8_0",
#     "model",
#     tokenizer,
# )
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")

model.push_to_hub_gguf(
    "mjschock/TinyLlama-1.1B-Chat-v1.0-sft-chat_threads-11-11-2024",
    quantization_method="q4_k_m",
    token=hf_token,
    tokenizer=tokenizer,
)

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model",  # Change hf to your username!
        tokenizer,
        quantization_method=[
            "q4_k_m",
            "q8_0",
            "q5_k_m",
        ],
        token="",  # Get a token at https://huggingface.co/settings/tokens
    )

We use `subprocess` to start `Ollama` up in a non blocking fashion! In your own desktop, you can simply open up a new `terminal` and type `ollama serve`, but in Colab, we have to use this hack! Alternatively, use the model-unsloth.gguf file or model-unsloth-Q4_K_M.gguf file in llama.cpp or a UI based system like GPT4All.

In [None]:
# import subprocess

# subprocess.Popen(["ollama", "serve"])
# import time

# time.sleep(3)  # Wait for a few seconds for Ollama to load!

`Ollama` needs a `Modelfile`, which specifies the model's prompt format. Let's print Unsloth's auto generated one:

In [None]:
print(tokenizer._ollama_modelfile)

We now will create an `Ollama` model called `unsloth_model` using the `Modelfile` which we auto generated!

In [None]:
!ollama create unsloth_model -f ./model/Modelfile

And now we can do inference on it via `Ollama`!

You can also upload to `Ollama` and try the `Ollama` Desktop app by heading to https://www.ollama.com/

In [None]:
# !curl http://localhost:11434/api/chat -d '{ \
#     "model": "unsloth_model", \
#     "messages": [ \
#         { "role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8," } \
#     ] \
#     }'

In [None]:
example = dataset[5]

documents = json.loads(example["documents"])
conversation = json.loads(example["messages"])[0:-1]
tools = json.loads(example["tools"])

conversation

In [None]:
import openai

openai.api_key = 'ollama'
openai.base_url = "http://localhost:11434/v1/"

# messages = [
#     { "role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8," }
# ]

# tools = []

response = openai.chat.completions.create(
	# model="llama3.2:1b",
    # model="unsloth_model:latest",
    # model="hf.co/mjschock/TinyLlama-1.1B-Chat-v1.0-sft-chat_threads-11-10-24:latest",
    model="hf.co/mjschock/TinyLlama-1.1B-Chat-v1.0-sft-chat_threads-11-11-2024:latest",
	# messages=messages,
    messages=conversation,
    temperature=0.0,
	# tools=tools,
)

# print(response)
print(response.choices[0].message.content)