In [1]:
import torch
from unsloth import FastLanguageModel

max_seq_length = 4096  # Choose any! We auto support RoPE Scaling internally!
dtype = (
    None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
)
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/tinyllama-bnb-4bit",
]  # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = "unsloth/tinyllama-bnb-4bit", # "unsloth/tinyllama" for 16bit loading
    model_name="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    # model_name=fourbit_models[0],
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.10.7: Fast Llama patching. Transformers = 4.44.2.
   \\   /|    GPU: NVIDIA GeForce GTX 1050 Ti. Max memory: 3.94 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.5.1+cu124. CUDA = 6.1. CUDA Toolkit = 12.4.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth


Unsloth: unsloth/tinyllama-chat-bnb-4bit can only handle sequence lengths of at most 2048.
But with kaiokendev's RoPE scaling of 2.0, it can be magically be extended to 4096!
Unsloth: We fixed a gradient accumulation bug, but it seems like you don't have the latest transformers version!
Please update transformers, TRL and unsloth via:
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


In [2]:
from datasets import load_dataset

# dataset = load_dataset("mlabonne/FineTome-100k", split="train")
# dataset = load_dataset("philschmid/guanaco-sharegpt-style", split = "train")
dataset = load_dataset("mjschock/chat_threads", split="train")

In [3]:
dataset[5]

{'documents': '[]',
 'messages': '[{"role": "user", "content": "I came across some slang terms that the younger folks in my office have been using, and I\'m feeling a bit out of the loop. Could you help me understand what they mean? I\'d like to know the definitions of \'Lit\', \'Savage\', and \'YOLO\' as they\'re defined on Urban Dictionary. Can you look these up for me?"}, {"role": "assistant", "tool_calls": [{"function": {"arguments": ["term=\'Lit\'"], "name": "find_term_on_urban_dictionary"}, "id": "call_0", "type": "function"}, {"function": {"arguments": ["term=\'Savage\'"], "name": "find_term_on_urban_dictionary"}, "id": "call_1", "type": "function"}, {"function": {"arguments": ["term=\'YOLO\'"], "name": "find_term_on_urban_dictionary"}, "id": "call_2", "type": "function"}]}]',
 'tools': '[{"type": "function", "function": {"name": "find_term_on_urban_dictionary", "description": "Finds the definition of a term on Urban Dictionary.", "parameters": {"type": "dict", "properties": {"t

In [4]:
from unsloth.chat_templates import get_chat_template

# Influenced by:
# https://cookbook.openai.com/examples/how_to_call_functions_with_chat_models
# https://docs.anthropic.com/en/docs/build-with-claude/tool-use
# https://github.com/abetlen/llama-cpp-python/blob/7c4aead82d349469bbbe7d8c0f4678825873c039/llama_cpp/llama_chat_format.py#L3387
# https://github.com/Mozilla-Ocho/llamafile/blob/66a84d8aea2990895fc4f64786406fea64e79197/llama.cpp/server/server.cpp#L480 (need <|im_start|> b/c Mozilla)
# https://github.com/openai/openai-python/blob/120d225b91a8453e15240a49fb1c6794d8119326/chatml.md
# https://gorilla.cs.berkeley.edu/blogs/8_berkeley_function_calling_leaderboard.html#prompt
# https://huggingface.co/blog/unified-tool-use
unsloth_template = (
    "{%- set system_message_present = messages | selectattr('role', 'equalto', 'system') | list -%}"
    "{%- if not system_message_present -%}"
    '{%- set messages = [{ "content": "You are an AI agent acting as a human assistant.", "role": "system" }] + messages -%}'
    "{%- endif -%}"
    "{%- for message in messages -%}"
    # "<|im_start|>{{ message.role }}{{ '\n' }}"
    "<|{{ message.role }}|>{{ '\n' }}"
    # System message
    "{%- if message.role == 'system' -%}"
    "{{ message.content }}"
    "{%- if tools and tools | length > 0 -%}"
    "{{ '\n\n' }}You are aware of the following tools in your environment:{{ '\n' }}"
    "{\n"
    "  \"tools\": [{{ '\n' }}"
    "{%- for tool in tools -%}"
    "{{ '    ' }}{\n"
    '      "function": {\n'
    '        "description": "{{ tool.function.description }}",{{ \'\n\' }}'
    '        "name": "{{ tool.function.name }}",{{ \'\n\' }}'
    "        \"parameters\": {{ tool.function.parameters | tojson }}{{ '\n' }}"
    # "        \"parameters\": {\n"
    # "{{ '        ' }}}\n"
    "      },{{ '\n' }}"
    '      "type": "{{ tool.type }}"{{ \'\n\' }}'
    "    }{%- if not loop.last -%},{%- endif -%}{{ '\n' }}"
    "{%- endfor -%}"
    "{{ '  ' }}]{{ '\n' }}"
    "}"
    "{{ '\n\n' }}If you would like to suggest one or more tool calls, please respond in the following format:{{ '\n' }}"
    "{\n"
    '  "finish_reason": "tool_calls",{{ \'\n\' }}'
    "  \"tool_calls\": [{{ '\n' }}"
    "{{ '    ' }}{\n"
    '      "arguments": "{\\"parameter_name\\": \\"parameter_value\\"}",{{ \'\n\' }}'
    '      "id": "call_id",{{ \'\n\' }}'
    '      "name": "tool_name"{{ \'\n\' }}'
    "    }{{ '\n' }}"
    "  ]{{ '\n' }}"
    "}"
    "{%- endif -%}"
    # "<|im_end|>{{ '\n' }}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    # User message
    "{%- if message.role == 'user' -%}"
    "{{ message.content }}"
    # "<|im_end|>{{ '\n' }}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    # Assistant message
    "{%- if message.role == 'assistant' -%}"
    "{% generation %}"
    ## Tool calls (Actions)
    "{%- if message.tool_calls and message.tool_calls | length > 0 -%}"
    "{\n"
    '  "finish_reason": "tool_calls",{{ \'\n\' }}'
    "  \"tool_calls\": [{{ '\n' }}"
    "{%- for tool_call in message.tool_calls -%}"
    "{{ '    ' }}{\n"
    "      \"arguments\": {{ tool_call.function.arguments | tojson }},{{ '\n' }}"
    '      "id": "{{ tool_call.id }}",{{ \'\n\' }}'
    '      "name": "{{ tool_call.function.name }}"{{ \'\n\' }}'
    "    }{%- if not loop.last -%},{%- endif -%}{{ '\n' }}"
    "{%- endfor -%}"
    "{{ '  ' }}]{{ '\n' }}"
    "}"
    "{%- else -%}"
    ## Regular message
    "{{ message.content }}"
    "{%- endif -%}"
    "{% endgeneration %}"
    # "<|im_end|>{{ '\n' }}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    ## Tool message (Observations)
    "{%- if message.role == 'tool' -%}"
    "{\n"
    "  \"content\": {{ message.content | tojson }},{{ '\n' }}"
    '  "name": "{{ message.name }}",{{ \'\n\' }}'
    '  "tool_call_id": "{{ message.tool_call_id }}"{{ \'\n\' }}'
    "}"
    # "<|im_end|>{{ '\n' }}"
    "{{ eos_token }}{{ '\n' }}"
    "{%- endif -%}"
    "{%- endfor -%}"
    # "{%- if add_generation_prompt -%}<|im_start|>assistant\n{%- endif -%}"
    # f"{{{{ '{self.assistant}\n' }}}}"
    "{%- if add_generation_prompt -%}"
    + f"{{{{ '<|assistant|>\n' }}}}"
    + "{%- endif -%}"
)
unsloth_eos_token = "eos_token"

tokenizer = get_chat_template(
    tokenizer,
    chat_template=(
        unsloth_template,
        unsloth_eos_token,
    ),  # You must provide a template and EOS token
    # mapping={
    #     "role": "from",
    #     "content": "value",
    #     "user": "human",
    #     "assistant": "gpt",
    # },  # ShareGPT style
    map_eos_token=True,  # Maps <|im_end|> to </s> instead
)

In [5]:
import json


def load_and_preprocess_data(dataset, tokenizer):
    """
    Load and preprocess the dataset for training.

    Args:
        dataset: The dataset to preprocess
        tokenizer: Tokenizer to use for preprocessing.

    Returns:
        datasets.Dataset: Preprocessed dataset.
    """

    def preprocess_function(examples):
        # Extract the messages from the example
        conversation = examples["messages"]
        documents = examples.get("documents", [])
        tools = examples.get("tools", [])

        # Apply chat template to generate tokenized input and assistant mask
        tokenized_output = tokenizer.apply_chat_template(
            add_generation_prompt=False,
            conversation=json.loads(conversation),
            documents=json.loads(documents),
            max_length=4096,
            padding="longest",
            return_assistant_tokens_mask=True,
            return_dict=True,
            return_tensors="pt",
            tokenize=True,
            tools=json.loads(tools),
            truncation=True,  # TODO: verify we're not truncating anything in the datasets
        )

        # Extract the input IDs and assistant tokens mask
        input_ids = tokenized_output["input_ids"][0]
        assistant_masks = torch.tensor(tokenized_output["assistant_masks"])
        attention_mask = tokenized_output["attention_mask"][0]

        # Use the assistant mask to create labels
        labels = torch.where(assistant_masks == 1, input_ids, torch.tensor(-100))

        return {
            "attention_mask": attention_mask,
            "input_ids": input_ids,
            "labels": labels,
        }

    # Preprocess the dataset
    return dataset.map(
        preprocess_function,
        batched=False,
        num_proc=1,
        remove_columns=dataset.column_names,
    )  # TODO: use batched=True


tokenized_train_dataset = load_and_preprocess_data(
    # chat_threads_train_ds, tokenizer
    dataset,
    tokenizer,
)

In [6]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

example = dataset[5]

inputs = tokenizer.apply_chat_template(
    add_generation_prompt=True,
    documents=json.loads(example["documents"]),
    conversation=json.loads(example["messages"])[0:-1],
    tools=json.loads(example["tools"]),
    return_tensors="pt",
    tokenize=True,
).to("cuda")

outputs = model.generate(
    # input_ids=inputs, max_new_tokens=64, use_cache=True, temperature=1.5, min_p=0.1
    input_ids=inputs,
    max_new_tokens=256,
    use_cache=True,
    temperature=0.0,
)
batch_decoded_outputs = tokenizer.batch_decode(outputs)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


In [7]:
print('prompt:')
print(batch_decoded_outputs[0][0:len(tokenizer.decode(inputs[0]))])

prompt:
<|system|>
You are an AI agent acting as a human assistant.

You are aware of the following tools in your environment:
{
  "tools": [
    {
      "function": {
        "description": "Finds the definition of a term on Urban Dictionary.",
        "name": "find_term_on_urban_dictionary",
        "parameters": {"type": "dict", "properties": {"term": {"type": "string", "description": "The term to find the definition of."}}, "required": ["term"]}
      },
      "type": "function"
    }
  ]
}

If you would like to suggest one or more tool calls, please respond in the following format:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"parameter_name\": \"parameter_value\"}",
      "id": "call_id",
      "name": "tool_name"
    }
  ]
}</s> 
<|user|>
I came across some slang terms that the younger folks in my office have been using, and I'm feeling a bit out of the loop. Could you help me understand what they mean? I'd like to know the definitions of 'Lit

In [8]:
print('response:')
print(batch_decoded_outputs[0][len(tokenizer.decode(inputs[0])):])

response:
Sure, I'd be happy to help you understand the definitions of the slang terms you mentioned in Urban Dictionary. Here are the definitions of the slang terms you mentioned:


1. "It" (pron) - a word that is used to emphasize the first syllable of a word. It is used to emphasize the first syll of word in a word.
2. "Sage" (on - a word that is used to emphasize the second syll of word in a. It emphasizes the second syll in a word.
3 "YLO" (on - word that emphasizes the last syll of a word. It emphasizes the last sy in word.


Here's the definition of each of these slang:

1. "It" (on" - a word that emphasizes the first syll of a. It emphasizes the first sy in a.
   Definition: It is a word that emphasizes the first syll of a. It emphasizes the first sy in.

2 "Sage (on" - word emphasizes the second sy of. It emphasizes the second.
 Definition: It emphasizes second.


In [9]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
        "lm_head",
    ],
    # lora_alpha = 16,
    lora_alpha=32,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

Unsloth: Offloading output_embeddings to disk to save VRAM


  offloaded_W = torch.load(filename, map_location = "cpu", mmap = True)
Unsloth 2024.10.7 patched 22 layers with 22 QKV layers, 22 O layers and 22 MLP layers.


Unsloth: Casting lm_head to float32


In [10]:
from trl import SFTTrainer
from transformers import (
    DataCollatorForLanguageModeling,
    TrainingArguments,
)
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    # compute_metrics=compute_metrics,
    model=model,
    tokenizer=tokenizer,
    # train_dataset=dataset,
    train_dataset=tokenized_train_dataset,
    # dataset_text_field="text",
    max_seq_length=max_seq_length,
    # data_collator=DataCollatorForSeq2Seq(
    #     tokenizer=tokenizer
    # ),  # TODO: comment this out?
    data_collator=DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer),
    # dataset_num_proc=2,
    dataset_num_proc=1,
    packing=False,  # Can make training 5x faster for short sequences.
    args=TrainingArguments(
        # per_device_train_batch_size=2,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps=60,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
    ),
)

max_steps is given, it will override any value given in num_train_epochs


In [11]:
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

'<|system|>\nYou are an AI agent acting as a human assistant.\n\nYou are aware of the following tools in your environment:\n{\n  "tools": [\n    {\n      "function": {\n        "description": "Finds the definition of a term on Urban Dictionary.",\n        "name": "find_term_on_urban_dictionary",\n        "parameters": {"type": "dict", "properties": {"term": {"type": "string", "description": "The term to find the definition of."}}, "required": ["term"]}\n      },\n      "type": "function"\n    }\n  ]\n}\n\nIf you would like to suggest one or more tool calls, please respond in the following format:\n{\n  "finish_reason": "tool_calls",\n  "tool_calls": [\n    {\n      "arguments": "{\\"parameter_name\\": \\"parameter_value\\"}",\n      "id": "call_id",\n      "name": "tool_name"\n    }\n  ]\n}</s> \n<|user|>\nI came across some slang terms that the younger folks in my office have been using, and I\'m feeling a bit out of the loop. Could you help me understand what they mean? I\'d like to 

In [12]:
space = tokenizer(" ", add_special_tokens=False).input_ids[0]
tokenizer.decode(
    [space if x == -100 else x for x in trainer.train_dataset[5]["labels"]]
)

'                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                   {\n  "finish_reason": "tool_calls",\n  "tool_calls": [\n    {\n      "arguments": ["term=\'Lit\'"],\n      "id": "call_0",\n      "name": "find_term_on_urban_dictionary"\n    },\n    {\n      "arguments": ["term=\'Savage\'"],\n      "id": "call_1",\n      "name": "find_term_on_urban_dictionary"\n    },\n    {\n      "argum

In [13]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA GeForce GTX 1050 Ti. Max memory = 3.94 GB.
1.086 GB of memory reserved.


In [14]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 216 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 60
 "-____-"     Number of trainable parameters = 78,151,680


**** Unsloth: Please use our fixed gradient_accumulation_steps by updating transformers, TRL and Unsloth!
`pip install --upgrade --no-cache-dir unsloth git+https://github.com/huggingface/transformers.git git+https://github.com/huggingface/trl.git`


  0%|          | 0/60 [00:00<?, ?it/s]

{'loss': 1.7846, 'grad_norm': 4.529193878173828, 'learning_rate': 4e-05, 'epoch': 0.02}
{'loss': 1.8872, 'grad_norm': 7.591363430023193, 'learning_rate': 8e-05, 'epoch': 0.04}
{'loss': 1.8242, 'grad_norm': 5.61696195602417, 'learning_rate': 0.00012, 'epoch': 0.06}
{'loss': 1.4991, 'grad_norm': 4.182041645050049, 'learning_rate': 0.00016, 'epoch': 0.07}
{'loss': 1.0372, 'grad_norm': 2.690903663635254, 'learning_rate': 0.0002, 'epoch': 0.09}
{'loss': 0.9344, 'grad_norm': 1.6501234769821167, 'learning_rate': 0.00019636363636363636, 'epoch': 0.11}
{'loss': 0.8307, 'grad_norm': 1.5004554986953735, 'learning_rate': 0.00019272727272727274, 'epoch': 0.13}
{'loss': 0.8921, 'grad_norm': 1.9034873247146606, 'learning_rate': 0.0001890909090909091, 'epoch': 0.15}
{'loss': 0.4416, 'grad_norm': 1.3987164497375488, 'learning_rate': 0.00018545454545454545, 'epoch': 0.17}
{'loss': 0.5868, 'grad_norm': 1.3992094993591309, 'learning_rate': 0.00018181818181818183, 'epoch': 0.19}
{'loss': 0.3802, 'grad_norm

In [15]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

3802.886 seconds used for training.
63.38 minutes used for training.
Peak reserved memory = 2.699 GB.
Peak reserved memory for training = 1.613 GB.
Peak reserved memory % of max memory = 68.503 %.
Peak reserved memory for training % of max memory = 40.939 %.


In [16]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

example = dataset[5]

inputs = tokenizer.apply_chat_template(
    add_generation_prompt=True,
    documents=json.loads(example["documents"]),
    conversation=json.loads(example["messages"])[0:-1],
    tools=json.loads(example["tools"]),
    return_tensors="pt",
    tokenize=True,
).to("cuda")

outputs = model.generate(
    # input_ids=inputs, max_new_tokens=64, use_cache=True, temperature=1.5, min_p=0.1
    input_ids=inputs,
    max_new_tokens=256,
    use_cache=True,
    temperature=0.0,
)
batch_decoded_outputs = tokenizer.batch_decode(outputs)

In [17]:
print('prompt:')
print(batch_decoded_outputs[0][0:len(tokenizer.decode(inputs[0]))])

prompt:
<|system|>
You are an AI agent acting as a human assistant.

You are aware of the following tools in your environment:
{
  "tools": [
    {
      "function": {
        "description": "Finds the definition of a term on Urban Dictionary.",
        "name": "find_term_on_urban_dictionary",
        "parameters": {"type": "dict", "properties": {"term": {"type": "string", "description": "The term to find the definition of."}}, "required": ["term"]}
      },
      "type": "function"
    }
  ]
}

If you would like to suggest one or more tool calls, please respond in the following format:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": "{\"parameter_name\": \"parameter_value\"}",
      "id": "call_id",
      "name": "tool_name"
    }
  ]
}</s> 
<|user|>
I came across some slang terms that the younger folks in my office have been using, and I'm feeling a bit out of the loop. Could you help me understand what they mean? I'd like to know the definitions of 'Lit

In [18]:
print('response:')
print(batch_decoded_outputs[0][len(tokenizer.decode(inputs[0])):])

response:
{
  "finish_reason": "tool_calls",
  "tool_calls": [
    {
      "arguments": ["term='Lit'"],
      "id": "call_0",
      "name": "find_term_on_urban_dictionary"
    },
    {
      "arguments": ["term='Savage'"],
      "id": "call_1",
      "name": "find_term_on_urban_dictionary"
    },
    {
      "arguments": ["term='YOLO'"],
      "id": "call_2",
      "name": "find_term_on_urban_dictionary"
    }
  ]
}</s>


In [None]:
# Merge to 16bit
if False:
    model.save_pretrained_merged(
        "model",
        tokenizer,
        save_method="merged_16bit",
    )
if False:
    model.push_to_hub_merged(
        "hf/model", tokenizer, save_method="merged_16bit", token=""
    )

# Merge to 4bit
if False:
    model.save_pretrained_merged(
        "model",
        tokenizer,
        save_method="merged_4bit",
    )
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="merged_4bit", token="")

# Just LoRA adapters
if False:
    model.save_pretrained_merged(
        "model",
        tokenizer,
        save_method="lora",
    )
if False:
    model.push_to_hub_merged("hf/model", tokenizer, save_method="lora", token="")

<a name="Ollama"></a>
### Ollama Support

[Unsloth](https://github.com/unslothai/unsloth) now allows you to automatically finetune and create a [Modelfile](https://github.com/ollama/ollama/blob/main/docs/modelfile.md), and export to [Ollama](https://ollama.com/)! This makes finetuning much easier and provides a seamless workflow from `Unsloth` to `Ollama`!

Let's first install `Ollama`!

In [None]:
!curl -fsSL https://ollama.com/install.sh | sh

Next, we shall save the model to GGUF / llama.cpp

We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

We also support saving to multiple GGUF options in a list fashion! This can speed things up by 10 minutes or more if you want multiple export formats!

In [None]:
# Save to 8bit Q8_0
if True:
    model.save_pretrained_gguf(
        "model",
        tokenizer,
    )
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, token="")

# Save to 16bit GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="f16")
if False:
    model.push_to_hub_gguf("hf/model", tokenizer, quantization_method="f16", token="")

# Save to q4_k_m GGUF
if False:
    model.save_pretrained_gguf("model", tokenizer, quantization_method="q4_k_m")
if False:
    model.push_to_hub_gguf(
        "hf/model", tokenizer, quantization_method="q4_k_m", token=""
    )

# Save to multiple GGUF options - much faster if you want multiple!
if False:
    model.push_to_hub_gguf(
        "hf/model",  # Change hf to your username!
        tokenizer,
        quantization_method=[
            "q4_k_m",
            "q8_0",
            "q5_k_m",
        ],
        token="",  # Get a token at https://huggingface.co/settings/tokens
    )

We use `subprocess` to start `Ollama` up in a non blocking fashion! In your own desktop, you can simply open up a new `terminal` and type `ollama serve`, but in Colab, we have to use this hack! Alternatively, use the model-unsloth.gguf file or model-unsloth-Q4_K_M.gguf file in llama.cpp or a UI based system like GPT4All.

In [None]:
import subprocess

subprocess.Popen(["ollama", "serve"])
import time

time.sleep(3)  # Wait for a few seconds for Ollama to load!

`Ollama` needs a `Modelfile`, which specifies the model's prompt format. Let's print Unsloth's auto generated one:

In [None]:
print(tokenizer._ollama_modelfile)

We now will create an `Ollama` model called `unsloth_model` using the `Modelfile` which we auto generated!

In [None]:
!ollama create unsloth_model -f ./model/Modelfile

And now we can do inference on it via `Ollama`!

You can also upload to `Ollama` and try the `Ollama` Desktop app by heading to https://www.ollama.com/

In [None]:
!curl http://localhost:11434/api/chat -d '{ \
    "model": "unsloth_model", \
    "messages": [ \
        { "role": "user", "content": "Continue the Fibonacci sequence: 1, 1, 2, 3, 5, 8," } \
    ] \
    }'

# ChatGPT interactive mode

### ⭐ To run the finetuned model like in a ChatGPT style interface, first click the **| >_ |** button.
![](https://raw.githubusercontent.com/unslothai/unsloth/nightly/images/Where_Terminal.png)

---
---
---

### ⭐ Then, type `ollama run unsloth_model`

![](https://raw.githubusercontent.com/unslothai/unsloth/nightly/images/Terminal_Type.png)

---
---
---
### ⭐ And you have a ChatGPT style assistant!

### Type any question you like and press `ENTER`. If you want to exit, hit `CTRL + D`
![](https://raw.githubusercontent.com/unslothai/unsloth/nightly/images/Assistant.png)

And we're done! If you have any questions on Unsloth, we have a [Discord](https://discord.gg/u54VK8m8tk) channel! If you find any bugs or want to keep updated with the latest LLM stuff, or need help, join projects etc, feel free to join our Discord!

Some other links:
1. Zephyr DPO 2x faster [free Colab](https://colab.research.google.com/drive/15vttTpzzVXv_tJwEk-hIcQ0S9FcEWvwP?usp=sharing)
2. Llama 7b 2x faster [free Colab](https://colab.research.google.com/drive/1lBzz5KeZJKXjvivbYvmGarix9Ao6Wxe5?usp=sharing)
3. TinyLlama 4x faster full Alpaca 52K in 1 hour [free Colab](https://colab.research.google.com/drive/1AZghoNBQaMDgWJpi4RbffGM1h6raLUj9?usp=sharing)
4. CodeLlama 34b 2x faster [A100 on Colab](https://colab.research.google.com/drive/1y7A0AxE3y8gdj4AVkl2aZX47Xu3P1wJT?usp=sharing)
5. Mistral 7b [free Kaggle version](https://www.kaggle.com/code/danielhanchen/kaggle-mistral-7b-unsloth-notebook)
6. We also did a [blog](https://huggingface.co/blog/unsloth-trl) with 🤗 HuggingFace, and we're in the TRL [docs](https://huggingface.co/docs/trl/main/en/sft_trainer#accelerate-fine-tuning-2x-using-unsloth)!
7. `ChatML` for ShareGPT datasets, [conversational notebook](https://colab.research.google.com/drive/1Aau3lgPzeZKQ-98h69CCu1UJcvIBLmy2?usp=sharing)
8. Text completions like novel writing [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing)
9. [**NEW**] We make Phi-3 Medium / Mini **2x faster**! See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)
10. [**NEW**] We make Gemma-2 9b / 27b **2x faster**! See our [Gemma-2 9b notebook](https://colab.research.google.com/drive/1vIrqH5uYDQwsJ4-OO3DErvuv4pBgVwk4?usp=sharing)
11. [**NEW**] To finetune and auto export to Ollama, try our [Ollama notebook](https://colab.research.google.com/drive/1WZDi7APtQ9VsvOrQSSC5DDtxq159j8iZ?usp=sharing)
12. [**NEW**] We make Mistral NeMo 12B 2x faster and fit in under 12GB of VRAM! [Mistral NeMo notebook](https://colab.research.google.com/drive/17d3U-CAIwzmbDRqbZ9NnpHxCkmXB6LZ0?usp=sharing)

<div class="align-center">
  <a href="https://github.com/unslothai/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/unsloth%20new%20logo.png" width="115"></a>
  <a href="https://discord.gg/u54VK8m8tk"><img src="https://github.com/unslothai/unsloth/raw/main/images/Discord.png" width="145"></a>
  <a href="https://ko-fi.com/unsloth"><img src="https://github.com/unslothai/unsloth/raw/main/images/Kofi button.png" width="145"></a></a> Support our work if you can! Thanks!
</div>