# Training TinyLlama for Tool-calling

In [1]:
import os

def setup_environment():
    cwd = os.getcwd()

    if cwd == '/content':
        print("Running on Google Colab")

        %pip install unsloth
        %pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

    elif cwd == '/kaggle/working':
        print("Running on Kaggle")

        %pip install pip3-autoremove
        %pip-autoremove torch torchvision torchaudio -y
        %pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121
        %pip install unsloth

    else:
        print("Running in local environment")

        req_file = find_requirements_txt()

        if req_file:
            print(f"Found requirements.txt at: {req_file}")

            cwd = os.path.dirname(req_file)

            %pip install -r {req_file}

        else:
            print("requirements.txt not found. Skipping installation.")

    print(f"Current working directory: {cwd}")

    return cwd

def find_requirements_txt():
    cwd = os.getcwd()

    while cwd != '/':
        req_file = os.path.join(cwd, 'requirements.txt')

        if os.path.exists(req_file):
            return req_file

        cwd = os.path.dirname(cwd)

    return None

cwd = setup_environment()

Running in local environment
Found requirements.txt at: /home/mjschock/Projects/training-tinyllama-for-tool-calling/requirements.txt


Ignoring appnope: markers 'python_version >= "3.10" and python_version < "4.0" and platform_system == "Darwin"' don't match your environment
Ignoring pywin32: markers 'python_version >= "3.10" and python_version < "4.0" and sys_platform == "win32"' don't match your environment
Ignoring pywinpty: markers 'python_version >= "3.10" and python_version < "4.0" and os_name == "nt"' don't match your environment
Ignoring waitress: markers 'python_version >= "3.10" and python_version < "4.0" and platform_system == "Windows"' don't match your environment












Note: you may need to restart the kernel to use updated packages.
Current working directory: /home/mjschock/Projects/training-tinyllama-for-tool-calling


In [2]:
import json
import os
from pathlib import Path
from pprint import pprint
from typing import Dict, List

from datasets import load_dataset
import evaluate
import mlflow
from mlflow.types.llm import (
    ChatChoice,
    ChatMessage,
    ChatResponse,
    FunctionToolCallArguments,
    ToolCall,
)
import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling
from transformers.trainer_utils import EvalPrediction
from trl import SFTConfig, SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
mlflow_experiment_name = "Training TinyLlama for Tool-calling"
# mlflow_tracking_uri = f"file://{project_root}/mlruns"
mlflow_tracking_uri = f"file://{cwd}/mlruns"

mlflow.set_tracking_uri(mlflow_tracking_uri)

assert mlflow.get_tracking_uri() == mlflow_tracking_uri, f"{mlflow.get_tracking_uri()} != {mlflow_tracking_uri}"

mlflow_experiment = mlflow.set_experiment(mlflow_experiment_name)

if mlflow.active_run():
    mlflow_run = mlflow.active_run()

    print(f"mlflow_run already exists: {mlflow_run.info.run_id}")

else:
    mlflow_run = mlflow.start_run(
        experiment_id=mlflow_experiment.experiment_id,
    )

    print(f"mlflow_run started: {mlflow_run.info.run_id}")

assert mlflow_run.info.experiment_id == mlflow_experiment.experiment_id, f"{mlflow_run.info.experiment_id} != {mlflow_experiment.experiment_id}"

print('mlflow_run.to_dictionary():')
pprint(mlflow_run.to_dictionary())

user_id = mlflow_run.info.user_id
print('user_id:', user_id)

mlflow_run started: dbb314a0b654499bbaefe1897a2e41dc
mlflow_run.to_dictionary():
{'data': {'metrics': {},
          'params': {},
          'tags': {'mlflow.runName': 'shivering-pug-339',
                   'mlflow.source.name': '/home/mjschock/Projects/training-tinyllama-for-tool-calling/.venv/lib/python3.10/site-packages/ipykernel_launcher.py',
                   'mlflow.source.type': 'LOCAL',
                   'mlflow.user': 'mjschock'}},
 'info': {'artifact_uri': 'file:///home/mjschock/Projects/training-tinyllama-for-tool-calling/mlruns/341257326214881697/dbb314a0b654499bbaefe1897a2e41dc/artifacts',
          'end_time': None,
          'experiment_id': '341257326214881697',
          'lifecycle_stage': 'active',
          'run_id': 'dbb314a0b654499bbaefe1897a2e41dc',
          'run_name': 'shivering-pug-339',
          'run_uuid': 'dbb314a0b654499bbaefe1897a2e41dc',
          'start_time': 1732905490273,
          'status': 'RUNNING',
          'user_id': 'mjschock'}}
user_id: mj

In [4]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = None
load_in_4bit = True
max_seq_length = 4096
pretrained_model_name = "TinyLlama-1.1B-Chat-v1.0"
pretrained_model_namespace = "TinyLlama"
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model_name = "meta-llama/Llama-3.2-1B-Instruct"
pretrained_model_name_or_path = f"{pretrained_model_namespace}/{pretrained_model_name}"

if not os.path.exists(os.path.join(cwd, f"data/06_models/{pretrained_model_namespace}/{pretrained_model_name}")):
    model = AutoModelForCausalLM.from_pretrained(pretrained_model_name_or_path)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path)

    model.save_pretrained(os.path.join(cwd, f"data/06_models/{pretrained_model_namespace}/{pretrained_model_name}"))
    tokenizer.save_pretrained(os.path.join(cwd, f"data/06_models/{pretrained_model_namespace}/{pretrained_model_name}"))

    del model
    del tokenizer

model, tokenizer = FastLanguageModel.from_pretrained(
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    max_seq_length=max_seq_length,
    # model_name=model_name,
    model_name=pretrained_model_name_or_path,
)

# Save a tokenizer without padding because it is only needed for training
# tokenizer_no_pad = AutoTokenizer.from_pretrained(model_name, add_bos_token=True) # https://mlflow.org/docs/latest/llms/transformers/tutorials/fine-tuning/transformers-peft.html#Save-the-PEFT-Model-to-MLflow

==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA GeForce GTX 1050 Ti. Max memory: 3.94 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 6.1. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/tinyllama-chat-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!


In [5]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: left


In [6]:
DEFAULT_SYSTEM_MESSAGE = {
    "content": "You are an AI agent acting as a human assistant.",
    "role": "system"
}

tools_template = """
{
  "tools": [
  {% for tool in tools %}
    {
      "function": {
        "description": "{{ tool.function.description }}",
        "name": "{{ tool.function.name }}",
        "parameters": {{ tool.function.parameters | tojson }}
      },
      "type": "{{ tool.type }}"
    }{% if not loop.last %},{% endif %}\n
  {% endfor %}
  ]
}

If you would like to suggest one or more tool calls, please respond in the following format:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\\"parameter_name\\": \\"parameter_value\\"}",
      "id": "call_id",
      "name": "tool_name"
    }
  ]
}
"""

tool_calls_template = """
{
  "finish_reason": "tool_calls",
  "tool_calls": [
  {% for tool_call in message.tool_calls %}
    {
      "arguments": {{ tool_call.function.arguments | tojson }},
      "id": "{{ tool_call.id }}",
      "name": "{{ tool_call.function.name }}"
    }{% if not loop.last %},{% endif %}\n
  {% endfor %}
  ]
}
"""

tool_response_template = """
{
  "content": {{ message.content | tojson }},
  "name": "{{ message.name }}",
  "tool_call_id": "{{ message.tool_call_id }}"
}
"""

# if model_name == "TinyLlama/TinyLlama-1.1B-Chat-v1.0":
if pretrained_model_name == "TinyLlama-1.1B-Chat-v1.0":
  start_header_id = "<|"
  end_header_id = "|>"

else:
  start_header_id = "<|start_header_id|>"
  end_header_id = "<|end_header_id|>"

role_header_template = start_header_id + "{{ message.role }}" + end_header_id + "{{ '\n' }}"
assistant_generation_role_header_template = start_header_id + "assistant" + end_header_id + "{{ '\n' }}"

# Influenced by:
# https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models
# https://docs.anthropic.com/en/docs/build-with-claude/tool-use
# https://github.com/abetlen/llama-cpp-python/blob/7c4aead82d349469bbbe7d8c0f4678825873c039/llama_cpp/llama_chat_format.py#L3387
# https://github.com/Mozilla-Ocho/llamafile/blob/66a84d8aea2990895fc4f64786406fea64e79197/llama.cpp/server/server.cpp#L480 (need <|im_start|> b/c Mozilla)
# https://github.com/openai/openai-python/blob/120d225b91a8453e15240a49fb1c6794d8119326/chatml.md
# https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#prompt
# https://huggingface.co/blog/unified-tool-use
chat_template = (
    # Configuration and defaults
    "{%- set config = namespace(has_system_message=false, has_tools=false) -%}"
    "{%- set system_messages = messages | selectattr('role', 'equalto', 'system') | list -%}"
    "{%- set config.has_system_message = system_messages | length > 0 -%}"
    "{%- set config.has_tools = tools is defined and tools | length > 0 -%}"

    # Ensure system message exists
    "{%- if not config.has_system_message -%}"
    f'{{%- set messages = [{{ "content": "{DEFAULT_SYSTEM_MESSAGE["content"]}", "role": "{DEFAULT_SYSTEM_MESSAGE["role"]}" }}] + messages -%}}'
    "{%- endif -%}"

    # Process messages
    "{%- for message in messages -%}"
    # "<|{{ message.role }}|>{{ '\n' }}" # "<|start_header_id|>{{ message.role }}<|end_header_id|>{{ '\n' }}"
    # f"{start_header_id}{{ message.role }}{end_header_id}{{ '\n' }}"
    # start_header_id + "{{ message.role }}" + end_header_id + "{{ '\n' }}"
    # TODO: add bos_token if first message?
    "{% if loop.first %}{{ bos_token }}{% endif %}"f"{role_header_template}"

    # System message handling
    "{%- if message.role == 'system' -%}"
    "{{ message.content }}"
    "{%- if config.has_tools -%}"
    "{{ '\n\n' }}You are aware of the following tools in your environment:"
    f"{tools_template}"
    "{%- endif -%}"
    "{{ eos_token }}{{ '\n' }}" # <|eot_id|>
    "{%- endif -%}"

    # User message handling
    "{%- if message.role == 'user' -%}"
    "{{ message.content }}{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"

    # Assistant message handling
    "{%- if message.role == 'assistant' -%}"
    "{% generation %}"
    "{%- if message.tool_calls | default(false) -%}"
    f"{tool_calls_template}"
    "{%- else -%}"
    "{{ message.content }}"
    "{%- endif -%}"
    "{% endgeneration %}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"

    # Tool message handling
    "{%- if message.role == 'tool' -%}"
    f"{tool_response_template}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    "{%- endfor -%}"

    # Generation prompt
    "{%- if add_generation_prompt -%}"
    # "<|assistant|>{{ '\n' }}" # <|start_header_id|>assistant<|end_header_id|>
    f"{assistant_generation_role_header_template}"
    "{%- endif -%}"
)

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template=(
#         chat_template,
#         "eos_token"
#     ),
#     map_eos_token=True,
# )

tokenizer.chat_template = chat_template

In [7]:
assert model.config.bos_token_id == tokenizer.bos_token_id, f"{model.config.bos_token_id} != {tokenizer.bos_token_id}"

try:
    assert model.config.eos_token_id == tokenizer.eos_token_id, f"{model.config.eos_token_id} != {tokenizer.eos_token_id}"

except AssertionError as e:
    print(e)
    model.config.eos_token_id = tokenizer.eos_token_id

assert model.config.pad_token_id == tokenizer.pad_token_id, f"{model.config.pad_token_id} != {tokenizer.pad_token_id}"

assert model.generation_config.bos_token_id == tokenizer.bos_token_id, f"{model.generation_config.bos_token_id} != {tokenizer.bos_token_id}"

try:
    assert model.generation_config.eos_token_id == tokenizer.eos_token_id, f"{model.generation_config.eos_token_id} != {tokenizer.eos_token_id}"

except AssertionError as e:
    print(e)
    model.generation_config.eos_token_id = tokenizer.eos_token_id

assert model.generation_config.pad_token_id == tokenizer.pad_token_id, f"{model.generation_config.pad_token_id} != {tokenizer.pad_token_id}"

In [8]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: left


In [9]:
model.save_pretrained_merged(
    # f"{project_root}/data/06_models/{user_id}/{pretrained_model_name}_unsloth_merged_16bit",
    f"{cwd}/data/06_models/{user_id}/{pretrained_model_name}_unsloth_merged_16bit",
    save_method="merged_16bit",
    tokenizer=tokenizer,
)

Unsloth: Merging 4bit and LoRA weights to 16bit...


Unsloth: Will use up to 0.0 out of 15.5 RAM for saving.


  0%|                                                                                                                                          | 0/22 [00:00<?, ?it/s]

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 252.21it/s]

Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...





Done.


In [10]:
# train_dataset = load_dataset("mjschock/chat_threads", split="train")
train_dataset = load_dataset(f"{user_id}/chat_threads", split="train")
# validation_dataset = load_dataset("mjschock/chat_threads", split="validation")
validation_dataset = load_dataset(f"{user_id}/chat_threads", split="validation")
# test_dataset = load_dataset("mjschock/chat_threads", split="test")
test_dataset = load_dataset(f"{user_id}/chat_threads", split="test")

test_example = test_dataset[0]
test_example

{'documents': '[]',
 'has_parallel_tool_calls': True,
 'messages': '[{"role": "user", "content": "What\'s the weather like in San Francisco and New York?"}, {"role": "assistant", "tool_calls": [{"id": "call_0", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\\"location\\": \\"San Francisco, USA\\", \\"format\\": \\"celsius\\"}"}}, {"id": "call_1", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\\"location\\": \\"New York, USA\\", \\"format\\": \\"celsius\\"}"}}]}, {"role": "tool", "name": "get_current_weather", "tool_call_id": "call_0", "content": "21.0"}, {"role": "tool", "name": "get_current_weather", "tool_call_id": "call_1", "content": "18.5"}, {"role": "assistant", "content": "The current temperature in San Francisco is 21\\u00b0C (70\\u00b0F), while in New York it\'s 18.5\\u00b0C (65\\u00b0F)."}]',
 'tools': '[{"type": "function", "function": {"name": "get_current_weather", "description": "Get the current weather", "

In [11]:
test_example_documents = json.loads(test_example["documents"])
test_example_messages = json.loads(test_example["messages"])
test_example_tools = json.loads(test_example["tools"])

In [12]:
print('test_example_messages:')
pprint(test_example_messages)

test_example_messages:
[{'content': "What's the weather like in San Francisco and New York?",
  'role': 'user'},
 {'role': 'assistant',
  'tool_calls': [{'function': {'arguments': '{"location": "San Francisco, '
                                            'USA", "format": "celsius"}',
                               'name': 'get_current_weather'},
                  'id': 'call_0',
                  'type': 'function'},
                 {'function': {'arguments': '{"location": "New York, USA", '
                                            '"format": "celsius"}',
                               'name': 'get_current_weather'},
                  'id': 'call_1',
                  'type': 'function'}]},
 {'content': '21.0',
  'name': 'get_current_weather',
  'role': 'tool',
  'tool_call_id': 'call_0'},
 {'content': '18.5',
  'name': 'get_current_weather',
  'role': 'tool',
  'tool_call_id': 'call_1'},
 {'content': 'The current temperature in San Francisco is 21°C (70°F), while '
             "

In [13]:
print('test_example_tools:')
pprint(test_example_tools)

test_example_tools:
[{'function': {'description': 'Get the current weather',
               'name': 'get_current_weather',
               'parameters': {'properties': {'format': {'enum': ['celsius',
                                                                 'fahrenheit'],
                                                        'type': 'string'},
                                             'location': {'description': 'The '
                                                                         'city '
                                                                         'and '
                                                                         'country, '
                                                                         'eg. '
                                                                         'San '
                                                                         'Francisco, '
                                                                         

In [14]:
prompt = tokenizer.apply_chat_template(
    add_generation_prompt=True,
    conversation=test_example_messages[0:1], # Only the user message, note that the system message will automatically be added
    documents=test_example_documents,
    tools=test_example_tools,
    tokenize=False,
)

prompt_and_response = tokenizer.apply_chat_template(
    add_generation_prompt=False,
    conversation=test_example_messages[0:2], # Only the user and first assistant message, note that the system message will automatically be added
    documents=test_example_documents,
    tools=test_example_tools,
    tokenize=False,
)

response = prompt_and_response.replace(prompt, "")

In [15]:
print('prompt:')
print(prompt)

prompt:
<s><|system|>
You are an AI agent acting as a human assistant.

You are aware of the following tools in your environment:
{
  "tools": [
    {
      "function": {
        "description": "Get the current weather",
        "name": "get_current_weather",
        "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and country, eg. San Francisco, USA"}, "format": {"type": "string", "enum": ["celsius", "fahrenheit"]}}, "required": ["location", "format"]}
      },
      "type": "function"
    }
  ]
}

If you would like to suggest one or more tool calls, please respond in the following format:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"parameter_name\": \"parameter_value\"}",
      "id": "call_id",
      "name": "tool_name"
    }
  ]
}</s>
<|user|>
What's the weather like in San Francisco and New York?</s>
<|assistant|>



In [16]:
print("response (ground truth):")
print(response)

response (ground truth):
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"location\": \"San Francisco, USA\", \"format\": \"celsius\"}",
      "id": "call_0",
      "name": "get_current_weather"
    },
    {
      "arguments": "{\"location\": \"New York, USA\", \"format\": \"celsius\"}",
      "id": "call_1",
      "name": "get_current_weather"
    }
  ]
}</s>



In [17]:
def load_and_preprocess_data(dataset, tokenizer):
    """
    Load and preprocess the dataset for training.

    Args:
        dataset: The dataset to preprocess
        tokenizer: Tokenizer to use for preprocessing.

    Returns:
        datasets.Dataset: Preprocessed dataset.
    """

    def preprocess_function(examples):
        # Extract the messages from the example
        conversation = examples["messages"]
        documents = examples.get("documents", [])
        tools = examples.get("tools", [])

        # Apply chat template to generate tokenized input and assistant mask
        tokenized_output = tokenizer.apply_chat_template(
            add_generation_prompt=False,
            conversation=json.loads(conversation),
            documents=json.loads(documents),
            max_length=max_seq_length,
            padding="longest",
            return_assistant_tokens_mask=True,
            return_dict=True,
            return_tensors="pt",
            tokenize=True,
            tools=json.loads(tools),
            truncation=True,  # TODO: verify we're not truncating anything in the datasets
        )

        # Extract the input IDs and assistant tokens mask
        input_ids = tokenized_output["input_ids"][0]
        assistant_masks = torch.tensor(tokenized_output["assistant_masks"])
        attention_mask = tokenized_output["attention_mask"][0]

        # Use the assistant mask to create labels
        labels = torch.where(assistant_masks == 1, input_ids, torch.tensor(-100))

        return {
            "attention_mask": attention_mask,
            "input_ids": input_ids,
            "labels": labels,
        }

    # Preprocess the dataset
    return dataset.map(
        preprocess_function,
        batched=False,
        num_proc=1,
        remove_columns=dataset.column_names,
    )  # TODO: use batched=True


tokenized_train_dataset = load_and_preprocess_data(
    train_dataset,
    tokenizer,
)

tokenized_validation_dataset = load_and_preprocess_data(
    validation_dataset,
    tokenizer,
)

tokenized_test_dataset = load_and_preprocess_data(
    test_dataset,
    tokenizer,
)

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [18]:
def generate_test_prediction():
    FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

    inputs = tokenizer.apply_chat_template(
        add_generation_prompt=True,
        conversation=test_example_messages[0:1], # Only the user message, note that the system message will automatically be added
        documents=test_example_documents,
        tools=test_example_tools,
        return_tensors="pt",
        tokenize=True,
    ).to(device)

    outputs = model.generate(
        do_sample=False,
        input_ids=inputs,
        max_new_tokens=256,
        use_cache=True,
        # temperature=0.0,
    )

    batch_decoded_outputs = tokenizer.batch_decode(outputs)

    prompt = batch_decoded_outputs[0][0:len(tokenizer.decode(inputs[0]))]
    response = batch_decoded_outputs[0][len(tokenizer.decode(inputs[0])):]

    return prompt, response

In [19]:
prompt, response = generate_test_prediction()

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [20]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: left


In [21]:
print('prompt:')
print(prompt)

prompt:
<s> <|system|>
You are an AI agent acting as a human assistant.

You are aware of the following tools in your environment:
{
  "tools": [
    {
      "function": {
        "description": "Get the current weather",
        "name": "get_current_weather",
        "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and country, eg. San Francisco, USA"}, "format": {"type": "string", "enum": ["celsius", "fahrenheit"]}}, "required": ["location", "format"]}
      },
      "type": "function"
    }
  ]
}

If you would like to suggest one or more tool calls, please respond in the following format:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"parameter_name\": \"parameter_value\"}",
      "id": "call_id",
      "name": "tool_name"
    }
  ]
}</s> 
<|user|>
What's the weather like in San Francisco and New York?</s> 
<|assistant|>



In [22]:
print('response (predicted, before training):')
print(response)

response (predicted, before training):
I don't have access to real-time weather data, but according to the information available data, the weather in san francisco and new york are both hot and humid. San francisco has a maximum temperature of 328.8°c (992.8f) and minimum of 25.8°c (439.2f, while new york has maximum of 39.8° (434.2f and minimum 2.8° (32f).</s>


In [23]:
model = FastLanguageModel.get_peft_model(
    bias="none",
    loftq_config=None,
    lora_alpha=32,
    lora_dropout=0,
    model=model,
    r=16,
    random_state=42,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    use_gradient_checkpointing="unsloth",
    use_rslora=False,
)

Unsloth: Offloading output_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)


Unsloth 2024.11.5 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


Unsloth: Training lm_head in mixed precision to save VRAM


In [24]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: right


In [25]:
metrics = evaluate.combine(["accuracy", "bleu", "meteor", "rouge"])
metrics_tracker = {}

def compute_metrics(eval_pred: EvalPrediction, compute_result: bool) -> Dict:
    assert isinstance(
        eval_pred, EvalPrediction
    ), f"Expected EvalPrediction, got {type(eval_pred)}"

    all_labels = eval_pred.label_ids
    all_preds = eval_pred.predictions
    is_last_step = compute_result

    all_labels[all_labels == -100] = tokenizer.pad_token_id
    references: List[str] = tokenizer.batch_decode(
        all_labels, skip_special_tokens=True
    )

    assert (
        all_preds.shape == all_labels.shape
    ), f"Expected predictions and labels to have the same shape, got {all_preds.shape} and {all_labels.shape}"

    predictions: List[str] = tokenizer.batch_decode(
        all_preds, skip_special_tokens=True
    )

    assert len(predictions) == len(
        references
    ), f"Expected predictions and references to have the same length, got {len(predictions)} and {len(references)}"

    eval_batch_metrics = metrics.compute(
        predictions=predictions,
        references=references,
    )

    computed_metrics = {}

    for key, value in eval_batch_metrics.items():
        if type(value) in [list, np.ndarray]:
            value = np.mean(value)

        metrics_tracker[key] = np.mean([metrics_tracker.get(key, 0.0), value])
        computed_metrics[key] = metrics_tracker[key]

        if is_last_step:
            metrics_tracker[key] = 0.0

    return computed_metrics

def preprocess_logits_for_metrics(
    logits: torch.Tensor, labels: torch.Tensor
) -> torch.Tensor:
    """
    Original Trainer may have a memory leak.
    This is a workaround to avoid storing too many tensors that are not needed.
    """
    pred_ids = torch.argmax(logits, dim=-1)

    return pred_ids

sft_trainer = SFTTrainer(
    args=SFTConfig(
        # auto_find_batch_size=True,
        # batch_eval_metrics=True,
        bf16=is_bfloat16_supported(),
        # eval_accumulation_steps=16,
        # eval_on_start=True,
        # eval_steps=1.0,
        # eval_strategy="epoch",
        fp16=not is_bfloat16_supported(),
        # gradient_accumulation_steps=16,
        gradient_accumulation_steps=4,
        # gradient_checkpointing="unsloth",
        # learning_rate=5e-05,
        learning_rate=2e-4,
        # load_best_model_at_end=True,
        logging_steps=1.0,
        # logging_strategy="steps",
        lr_scheduler_type="linear",
        # max_seq_length=max_seq_length,
        # max_steps=60,
        # max_steps=3,
        max_steps=1,
        # num_of_sequences=1,
        # num_train_epochs=3.0,
        # num_train_epochs=1.0,
        optim="adamw_8bit",
        # output_dir="outputs",
        # output_dir="data/06_models/model",
        # output_dir="data/06_models/model/checkpoints",
        # output_dir=os.path.join(project_root, "data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0_merged_16bit/checkpoints"),
        # output_dir=os.path.join(project_root, "data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0-tool-calling-sft/checkpoints"),
        # output_dir=f"{project_root}/data/06_models/{user_id}/{pretrained_model_name}_tool-calling-sft/checkpoints",
        output_dir=f"{cwd}/data/06_models/{user_id}/{pretrained_model_name}_tool-calling-sft/checkpoints",
        overwrite_output_dir=True,
        # packing=False,
        per_device_eval_batch_size=1,
        per_device_train_batch_size=1,
        # push_to_hub=False,
        report_to="mlflow",
        # save_steps=1.0,
        # save_strategy="epoch",
        # save_total_limit=1,
        seed=42,
        warmup_steps=5,
        weight_decay=0.01,
    ),
    compute_metrics=compute_metrics,
    data_collator=DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer),
    dataset_num_proc=1,
    eval_dataset=tokenized_validation_dataset,
    max_seq_length=max_seq_length,
    model=model,
    packing=False,  # Can make training 5x faster for short sequences.
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    tokenizer=tokenizer,
    train_dataset=tokenized_train_dataset,
)

# dpo_trainer = DPOTrainer()

[nltk_data] Downloading package wordnet to /home/mjschock/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/mjschock/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/mjschock/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


max_steps is given, it will override any value given in num_train_epochs


In [26]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: right


In [27]:
# Take a look at the first training example. When we decod the input_ids, we'll see the full chat history.

tokenizer.decode(sft_trainer.train_dataset[0]["input_ids"])

'<s> <|system|>\nYou are an intelligent AI that controls a drone. Given a command or request from the user,\ncall one of your functions to complete the request. If the request cannot be completed by your available functions, call the reject_request function.\nIf the request is ambiguous or unclear, reject the request.\n\nYou are aware of the following tools in your environment:\n{\n  "tools": [\n    {\n      "function": {\n        "description": "",\n        "name": "takeoff_drone",\n        "parameters": {"type": "object", "properties": {"altitude": {"type": "integer"}}, "required": ["altitude"]}\n      },\n      "type": "function"\n    },\n    {\n      "function": {\n        "description": "",\n        "name": "land_drone",\n        "parameters": {"type": "object", "properties": {"location": {"type": "string", "enum": ["current", "home_base", "custom"]}, "coordinates": {"type": "object"}}, "required": ["location"]}\n      },\n      "type": "function"\n    },\n    {\n      "function":

In [28]:
# Now let's take a look at the labels. The labels are the same as the input_ids, except that the assistant tokens are replaced with -100. This is because we want to predict the assistant tokens.

space = tokenizer(" ", add_special_tokens=False).input_ids[0]
tokenizer.decode(
    [space if x == -100 else x for x in sft_trainer.train_dataset[0]["labels"]]
)

'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [29]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)

print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce GTX 1050 Ti. Max memory = 3.94 GB.
2.592 GB of memory reserved.


In [30]:
# with mlflow.start_run(
    # experiment_id=experiment.experiment_id,
#     nested=True,
# ):
trainer_stats = sft_trainer.train(
    resume_from_checkpoint=False,
    trial=None,
)

print('trainer_stats:')
pprint(trainer_stats)

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 239 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 1
 "-____-"     Number of trainable parameters = 78,151,680


Step,Training Loss
1,1.7715


trainer_stats:
TrainOutput(global_step=1, training_loss=1.7715449333190918, metrics={'train_runtime': 32.8106, 'train_samples_per_second': 0.122, 'train_steps_per_second': 0.03, 'total_flos': 17350883414016.0, 'train_loss': 1.7715449333190918, 'epoch': 0.016736401673640166})


In [31]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

32.8106 seconds used for training.
0.55 minutes used for training.
Peak reserved memory = 2.592 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 65.787 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [32]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: right


In [33]:
prompt, response = generate_test_prediction()

In [34]:
print('prompt:')
print(prompt)

prompt:
<s> <|system|>
You are an AI agent acting as a human assistant.

You are aware of the following tools in your environment:
{
  "tools": [
    {
      "function": {
        "description": "Get the current weather",
        "name": "get_current_weather",
        "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and country, eg. San Francisco, USA"}, "format": {"type": "string", "enum": ["celsius", "fahrenheit"]}}, "required": ["location", "format"]}
      },
      "type": "function"
    }
  ]
}

If you would like to suggest one or more tool calls, please respond in the following format:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"parameter_name\": \"parameter_value\"}",
      "id": "call_id",
      "name": "tool_name"
    }
  ]
}</s> 
<|user|>
What's the weather like in San Francisco and New York?</s> 
<|assistant|>



In [35]:
print('response (predicted, after training):')
print(response)

response (predicted, after training):
I don't have access to real-time weather data, but according to the information available data, the weather in san francisco and new york are both hot and humid. San francisco has a maximum temperature of 328.8°c (992.8f) and minimum of 25.8°c (439.2f, while new york has maximum of 39.8° (434.2f and minimum 2.8° (32f).</s>


In [36]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: left


In [37]:
if True:
    # model.save_pretrained("lora_model")  # Local saving
    # model.save_pretrained("data/06_models/model/lora")  # Local saving
    # model.save_pretrained(os.path.join(project_root, "data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0-tool-calling-sft/lora"))  # Local saving
    # model.save_pretrained(f"{project_root}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/lora"), # Local saving
    model.save_pretrained(f"{cwd}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/lora") # Local saving
    # tokenizer.save_pretrained("lora_model")
    # tokenizer.save_pretrained("data/06_models/model/lora")
    # tokenizer.save_pretrained(os.path.join(project_root, "data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0-tool-calling-sft/lora"))
    # tokenizer.save_pretrained(f"{project_root}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/lora") # Local saving
    tokenizer.save_pretrained(f"{cwd}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/lora") # Local saving

else:
    # TODO: push to hub
    raise NotImplementedError("Pushing to hub is not implemented yet.")

In [38]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: left


In [39]:
if True:
    model.save_pretrained_merged(
        # os.path.join(project_root, "data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0-tool-calling-sft/merged_16bit"),
        # f"{project_root}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/unsloth_merged_16bit",
        f"{cwd}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/unsloth_merged_16bit",
        save_method="merged_16bit",
        tokenizer=tokenizer,
    )
    # model.save_pretrained_merged(
    #     os.path.join(project_root, "data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0-tool-calling-sft/merged_4bit"),
    #     save_method="merged_4bit",
    #     tokenizer=tokenizer,
    # )
    model.save_pretrained_merged(
        # f"{project_root}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/unsloth_lora",
        f"{cwd}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/unsloth_lora",
        save_method="lora",
        tokenizer=tokenizer,
    )

else:
    raise NotImplementedError("Pushing to hub is not implemented yet.")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 0.0 out of 15.5 RAM for saving.


  0%|                                                                                                                                          | 0/22 [00:00<?, ?it/s]

  9%|███████████▊                                                                                                                      | 2/22 [00:00<00:01, 18.67it/s]

 23%|█████████████████████████████▌                                                                                                    | 5/22 [00:00<00:00, 23.48it/s]

 36%|███████████████████████████████████████████████▎                                                                                  | 8/22 [00:00<00:00, 25.03it/s]

 50%|████████████████████████████████████████████████████████████████▌                                                                | 11/22 [00:00<00:00, 25.83it/s]

 64%|██████████████████████████████████████████████████████████████████████████████████                                               | 14/22 [00:00<00:00, 26.40it/s]

 77%|███████████████████████████████████████████████████████████████████████████████████████████████████▋                             | 17/22 [00:00<00:00, 26.92it/s]

 91%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████▎           | 20/22 [00:00<00:00, 27.28it/s]

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 22/22 [00:00<00:00, 26.24it/s]




Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


Done.


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model...

 Done.


In [40]:
print('tokenizer.pad_token:', tokenizer.pad_token)
print('tokenizer.padding_side:', tokenizer.padding_side)

tokenizer.pad_token: <unk>
tokenizer.padding_side: left


In [41]:
# code_path = f"{project_root}/src/training_tinyllama_for_tool_calling/services/agent.py"
code_path = f"{cwd}/src/training_tinyllama_for_tool_calling/services/agent.py"

os.makedirs(Path(code_path).parent, exist_ok=True)

In [42]:
%%writefile $code_path
import json
import os
from pprint import pprint
from typing import Dict, List

import evaluate
import mlflow
import numpy as np
import torch
from datasets import load_dataset
from mlflow.models import set_model
from mlflow.pyfunc import ChatModel
from mlflow.types.llm import (
    ChatChoice,
    ChatMessage,
    ChatParams,
    ChatResponse,
    FunctionToolCallArguments,
    ToolCall,
)
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
)
from transformers.trainer_utils import EvalPrediction
from trl import SFTConfig, SFTTrainer
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template

project_root = os.getcwd()

while not os.path.exists(os.path.join(project_root, "register_prefect_flow.py")):
    project_root = os.path.dirname(project_root)

print(f"Project root: {project_root}")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
dtype = None
load_in_4bit = True
max_seq_length = 4096
# model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# model_name = "meta-llama/Llama-3.2-1B-Instruct"
# model_name = "data/06_models/model/lora"
# model_name = "data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0_lora_sft"
# model_name = f"{project_root}/data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0-tool-calling-sft/lora"
# model_name = f"{project_root}/data/06_models/mjschock/TinyLlama-1.1B-Chat-v1.0-tool-calling-sft/lora"

class ModelClient:
    def __init__(self):
        user_id = "mjschock" # TODO: get this dynamically
        pretrained_model_name = "TinyLlama-1.1B-Chat-v1.0"
        model_name = f"{project_root}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/lora"
        # model_name = f"{project_root}/data/06_models/{user_id}/{pretrained_model_name}-tool-calling-sft/unsloth_lora" # TODO:Maybe this would work better for using the model rather than code path?

        model, tokenizer = FastLanguageModel.from_pretrained(
            dtype=dtype,
            load_in_4bit=load_in_4bit,
            max_seq_length=max_seq_length,
            model_name=model_name,
        )

        FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

        self.model = model
        self.tokenizer = tokenizer

    def chat_completion_request(
        self,
        documents: list,
        messages: list,
        tools: list,
    ):
        # FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

        inputs = self.tokenizer.apply_chat_template(
            add_generation_prompt=True,
            conversation=messages,
            documents=documents,
            return_tensors="pt",
            tokenize=True,
            tools=tools,
        ).to(device)

        outputs = self.model.generate(
            do_sample=False,
            input_ids=inputs,
            max_new_tokens=256,
            use_cache=True,
            # temperature=0.0,
        )

        batch_decoded_outputs = self.tokenizer.batch_decode(outputs)

        choices: List[ChatChoice] = []

        for i in range(len(batch_decoded_outputs)):
            response = batch_decoded_outputs[i][
                len(self.tokenizer.decode(inputs[i])) :
            ].replace(
                self.tokenizer.eos_token, ""
            )  # TODO: skip special tokens when decoding instead?

            try:
                response = json.loads(response)

                finish_reason: str = response.get("finish_reason")
                tool_calls_json = response.get("tool_calls")
                tool_calls: List[ToolCall] = []

                for tool_call_json in tool_calls_json:
                    tool_call = ToolCall(
                        function=FunctionToolCallArguments(
                            arguments=tool_call_json.get("arguments"),
                            name=tool_call_json.get("name"),
                        ),
                        id=tool_call_json.get("id"),
                        type="function",
                    )

                    tool_calls.append(tool_call)

                message: ChatMessage = ChatMessage(
                    role="assistant",
                    tool_calls=tool_calls,
                )

            except json.JSONDecodeError:
                finish_reason: str = "stop"
                message: ChatMessage = ChatMessage(
                    role="assistant",
                    content=response,
                )

            choices.append(
                ChatChoice(
                    index=i,
                    finish_reason=finish_reason,
                    logprobs=None,
                    message=message,
                )
            )

        return ChatResponse(
            choices=choices,
        )


class Agent(ChatModel):
    def __init__(self):
        # self.model_name = "llama3.2:1b"
        self.client = None

    def load_context(self, context):
        # self.model_name = "llama3.2:1b"
        # self.client = ollama.Client()
        print('=== load_context ===')
        print('context:', context)

        self.client = ModelClient()

    # the core method that needs to be implemented. this function
    # will be called every time a user sends messages to our model
    # @mlflow.trace(span_type=SpanType.AGENT)
    def predict(self, context, messages: list[ChatMessage], params: ChatParams):
        # instantiate the OpenAI client
        # client = OpenAI()

        # convert the messages to a format that the OpenAI API expects
        messages = [m.to_dict() for m in messages]

        print("params:")
        pprint(params.to_dict())

        tools = params.tools or []

        print("tools:")
        pprint(tools)

        tools = [t.to_dict() for t in tools]

        print("tools:")
        pprint(tools)

        # call the OpenAI API
        # response = client.chat.completions.create(
        #     model="gpt-4o-mini",
        #     messages=messages,
        #     # pass the tools in the request
        #     tools=self.tools,
        # )

        response = self.client.chat_completion_request(
            documents=[],  # we don't need documents for this example
            messages=messages,
            # tools=self.tools,
            # tools=[],
            tools=tools,
        )

        # return the result as a ChatResponse, as this
        # is the expected output of the predict method
        return ChatResponse.from_dict(response.to_dict())


set_model(Agent())

Overwriting /home/mjschock/Projects/training-tinyllama-for-tool-calling/src/training_tinyllama_for_tool_calling/services/agent.py


In [43]:
import gc

try:
    del model

except NameError:
    pass

try:
    del sft_trainer

except NameError:
    pass

try:
    del tokenizer

except NameError:
    pass

gc.collect()

34229

In [44]:
# mlflow.set_experiment("chatmodel-quickstart")
# code_path = "ollama_model.py"
# code_path = "model.py"
# code_path = f"{project_root}/src/training_tinyllama_for_tool_calling/services/agent.py"
# model = Model()

# with mlflow.start_run(
#     experiment_id=experiment.experiment_id,
#     nested=True,
# ):
model_info = mlflow.pyfunc.log_model(
    # "ollama_model",
    "model",
    python_model=code_path,
    # python_model=model, # AttributeError: Can't get attribute 'unsloth_push_to_hub' on <module 'unsloth.save'
    input_example={
        # "messages": [{"role": "user", "content": "Hello, how are you?"}]
        "messages": test_example_messages[0:1],
        "tools": test_example_tools,
    },
)

2024/11/29 10:40:16 INFO mlflow.pyfunc: Predicting on input example to validate output


Project root: /home/mjschock/Projects/training-tinyllama-for-tool-calling
=== load_context ===
context: <mlflow.pyfunc.model.PythonModelContext object at 0x768c23705de0>


==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA GeForce GTX 1050 Ti. Max memory: 3.94 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 6.1. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: unsloth/tinyllama-chat-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!


params:
{'n': 1,
 'stream': False,
 'temperature': 1.0,
 'tools': [{'function': {'description': 'Get the current weather',
                         'name': 'get_current_weather',
                         'parameters': {'properties': {'format': {'enum': ['celsius',
                                                                           'fahrenheit'],
                                                                  'type': 'string'},
                                                       'location': {'description': 'The '
                                                                                   'city '
                                                                                   'and '
                                                                                   'country, '
                                                                                   'eg. '
                                                                                   'San '
              

Downloading artifacts:   0%|          | 0/7 [00:00<?, ?it/s]

Project root: /home/mjschock/Projects/training-tinyllama-for-tool-calling
=== load_context ===
context: <mlflow.pyfunc.model.PythonModelContext object at 0x768c23705d50>
==((====))==  Unsloth 2024.11.5: Fast Llama patching. Transformers = 4.46.2.
   \\   /|    GPU: NVIDIA GeForce GTX 1050 Ti. Max memory: 3.94 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 6.1. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


params:
{'n': 1,
 'stream': False,
 'temperature': 1.0,
 'tools': [{'function': {'description': 'Get the current weather',
                         'name': 'get_current_weather',
                         'parameters': {'properties': {'format': {'enum': ['celsius',
                                                                           'fahrenheit'],
                                                                  'type': 'string'},
                                                       'location': {'description': 'The '
                                                                                   'city '
                                                                                   'and '
                                                                                   'country, '
                                                                                   'eg. '
                                                                                   'San '
              

In [45]:
model_info.model_uri

'runs:/dbb314a0b654499bbaefe1897a2e41dc/model'