In [None]:
%%capture
# Install the released Unsloth package.
!pip install unsloth
# Also get the latest nightly Unsloth!
# The command below removes the package installed above and reinstalls it from the GitHub repository.
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

* We support Llama3.1
* We support 16bit LoRA or 4bit QLoRA. Both 2x faster.
* `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.

In [None]:
import os # Only used to read the Hugging Face token from the environment.

# --- Model loading ---
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# --- LoRA adapter ---
lora_r = 16 # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
lora_alpha = 16
lora_target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"]

# --- Training schedule ---
# True  -> the trainer is given max_steps.
# False -> the trainer is given max_steps = -1 and num_train_epochs decides the run length.
use_max_steps = True

num_train_epochs = 3
max_steps = 32 # this is one full pass of our test dataset
per_device_train_batch_size = 2
gradient_accumulation_steps = 16
warmup_steps = 16
learning_rate = 2e-4
lr_scheduler_type = "linear"
optim = "adamw_8bit"

# --- Model and dataset to use ---
MODEL_TO_TRAIN = "LlamaFinetuneBase/Meta-Llama-3.1-8B-Instruct"
DATASET_TO_USE = "LlamaFinetuneBase/Notebook-Test"

# --- Hugging Face upload target ---
# The token is taken from the HF_TOKEN environment variable when it is set, so a real
# token never has to be saved inside the notebook. Without that variable the placeholder
# below is used, exactly as before. Get a token at https://huggingface.co/settings/tokens
HF_TOKEN = os.environ.get("HF_TOKEN", "hf_huggingfaceAPIKey")
HUGGINGFACE_ORG = "LlamaFinetune" # Change to your username! Check out our Hugging Face page for models that we have finetuned already: 'https://huggingface.co/LlamaFinetune'
NEW_MODEL_NAME = "pick-a-name" # Enter any model name you would like!

# --- What to push after training ---
Push_Merge_16bit = False # Push the merged 16bit model to Hugging Face
Push_Lora = False # Push only the LoRA adapters to Hugging Face
Push_All_GGUF = True # Push the q4_k_m, q5_k_m and q8_0 GGUF files in one call (f16 is not in that list)

# Individual GGUF pushes. Leave these False while Push_All_GGUF is True, otherwise the
# same quantization is uploaded twice (once here and once by the combined push).
Push_GGUF_q4 = False # q4_k_m
Push_GGUF_q5 = False # q5_k_m
Push_GGUF_q8 = False # q8_0
Push_GGUF_fp16 = False # f16 - not covered by Push_All_GGUF, so enable it here if you want an f16 GGUF


In [None]:
from unsloth import FastLanguageModel
import torch

# Load the base model together with its tokenizer, using the values
# chosen in the configuration cell.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=MODEL_TO_TRAIN,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
# Wrap the loaded model with LoRA adapters; rank, alpha and target modules
# come from the configuration cell.
model = FastLanguageModel.get_peft_model(
    model,
    r=lora_r,
    target_modules=lora_target_modules,
    lora_alpha=lora_alpha,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

<a name="Data"></a>
### Data Prep
We now use the `Llama-3.1` format for conversation style finetunes. We use the [LlamaFinetuneBase/ClaudeStruct](https://huggingface.co/datasets/LlamaFinetuneBase/ClaudeStruct) dataset, which is in ShareGPT style, but we convert it to HuggingFace's normal multi-turn format `("role", "content")` instead of `("from", "value")`. Llama-3 renders multi-turn conversations like below:

```
<|begin_of_text|><|start_header_id|>user<|end_header_id|>

Hello!<|eot_id|><|start_header_id|>assistant<|end_header_id|>

Hey there! How are you?<|eot_id|><|start_header_id|>user<|end_header_id|>

I'm great thanks!<|eot_id|>
```

We use our `get_chat_template` function to get the correct chat template. We support `zephyr, chatml, mistral, llama, alpaca, vicuna, vicuna_old, phi3, llama3` and more.

In [None]:
from unsloth.chat_templates import get_chat_template

# Replace the tokenizer with one configured for the "llama-3.1" chat template.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")

def formatting_prompts_func(examples):
    """Render every conversation in a batch to a single training string.

    Args:
        examples: a batch mapping whose "conversations" entry is a list of
            conversations, each one passed as-is to the tokenizer's chat template.

    Returns:
        A dict with one key, "text", holding the rendered string for each
        conversation in the same order.
    """
    rendered = []
    for conversation in examples["conversations"]:
        # tokenize=False returns text rather than token ids; no generation
        # prompt is appended to the rendered conversation.
        rendered.append(
            tokenizer.apply_chat_template(
                conversation,
                tokenize=False,
                add_generation_prompt=False,
            )
        )
    return {"text": rendered}

from datasets import load_dataset

# Load the "train" split of the dataset named in the configuration cell.
dataset = load_dataset(DATASET_TO_USE, split="train")

We now use `standardize_sharegpt` to convert ShareGPT style datasets into HuggingFace's generic format. This changes the dataset from looking like:
```
{"from": "system", "value": "You are an assistant"}
{"from": "human", "value": "What is 2+2?"}
{"from": "gpt", "value": "It's 4."}
```
to
```
{"role": "system", "content": "You are an assistant"}
{"role": "user", "content": "What is 2+2?"}
{"role": "assistant", "content": "It's 4."}
```

In [None]:
from unsloth.chat_templates import standardize_sharegpt

# Convert the ShareGPT-style records first, then add the rendered "text"
# column by applying formatting_prompts_func batch by batch.
dataset = standardize_sharegpt(dataset).map(formatting_prompts_func, batched=True)

<a name="Train"></a>
### Train the model
Now let's use Huggingface TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). With `use_max_steps = True` we train for only `max_steps` steps (32 in the configuration cell) to speed things up; set `use_max_steps = False` to train for `num_train_epochs` epochs instead. We also support TRL's `DPOTrainer`!

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Optimisation settings; every tunable value comes from the configuration cell.
training_args = TrainingArguments(
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    warmup_steps=warmup_steps,
    num_train_epochs=num_train_epochs,
    # -1 is passed when use_max_steps is off, leaving the run length to num_train_epochs.
    max_steps=max_steps if use_max_steps else -1,
    learning_rate=learning_rate,
    # Exactly one of fp16 / bf16 is enabled, depending on bfloat16 support.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    optim=optim,
    lr_scheduler_type=lr_scheduler_type,
    logging_steps=1,
    weight_decay=0.01,
    seed=3407,
    output_dir="outputs",
)

# Supervised fine-tuning trainer over the "text" column of the prepared dataset.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=2,
    packing=False,  # Can make training 5x faster for short sequences.
    args=training_args,
)

We also use Unsloth's `train_on_responses_only` method to only train on the assistant outputs and ignore the loss on the user's inputs.

In [None]:
from unsloth.chat_templates import train_on_responses_only

# The two header strings mark where a user turn and an assistant turn begin
# in the rendered Llama-3.1 text.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
    response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
)

We verify masking is actually done:

In [None]:
# Decode the input token ids of training example 5 back into text for inspection.
tokenizer.decode(trainer.train_dataset[5]["input_ids"])

In [None]:
# Token id used as a stand-in for masked positions so the labels can be decoded.
space = tokenizer(" ", add_special_tokens=False).input_ids[0]
# Label positions equal to -100 are shown as spaces; the rest decode normally.
tokenizer.decode(
    [label if label != -100 else space for label in trainer.train_dataset[5]["labels"]]
)

We can see the System and Instruction prompts are successfully masked!

In [None]:
# Record the GPU name, its total memory and the memory already reserved
# before training starts; the post-training report subtracts this baseline.
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)  # bytes -> GB
max_memory = round(gpu_stats.total_memory / 1024**3, 3)  # bytes -> GB
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

In [None]:
# Run training; the returned object's `metrics` are read by the report cell below.
trainer_stats = trainer.train()

In [None]:
# Peak reserved GPU memory after training, in GB.
used_memory = round(torch.cuda.max_memory_reserved() / 1024**3, 3)
# Part of the peak attributed to training: peak minus the pre-training baseline.
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
# Both figures as a percentage of the GPU's total memory.
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)

# Training time, first in seconds and then in minutes.
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")

# Memory summary.
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

<a name="Inference"></a>
### Inference
Let's run the model! You can change the user message in `messages` to try your own prompt.

In [None]:
# used from above

# from unsloth.chat_templates import get_chat_template

# tokenizer = get_chat_template(
#     tokenizer,
#     chat_template = "llama-3.1",
# )

In [None]:
FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# A single-turn conversation to send to the model.
messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]

# Render the conversation with the chat template and tokenize it to a
# PyTorch tensor on the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

outputs = model.generate(
    input_ids=inputs,
    max_new_tokens=128,
    use_cache=True,
    temperature=0.1,
)
tokenizer.batch_decode(outputs)

The streaming example below uses `min_p = 0.1` and `temperature = 1.5` (the cell above uses `temperature = 0.1`). Read this [Tweet](https://x.com/menhguin/status/1826132708508213629) for more information on why.

 You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!

In [None]:
from transformers import TextStreamer

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

# A single-turn conversation to send to the model.
messages = [
    {"role": "user", "content": "Continue the fibonnaci sequence: 1, 1, 2, 3, 5, 8,"},
]

# Render the conversation with the chat template and tokenize it to a
# PyTorch tensor on the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

# Stream the generation through a TextStreamer (created with skip_prompt=True)
# instead of decoding at the end; the return value of generate is discarded.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids=inputs,
    streamer=text_streamer,
    max_new_tokens=128,
    use_cache=True,
    temperature=1.5,
    min_p=0.1,
)

<a name="Save"></a>
### Saving, loading finetuned models
To save the final model as LoRA adapters, either use Huggingface's `push_to_hub` for an online save or `save_pretrained` for a local save.

**[NOTE]** This ONLY saves the LoRA adapters, and not the full model. To save to 16bit or GGUF, scroll down!

In [None]:
# Each step is disabled by default; change a `False` to `True` to run it.

# Local save of the model and tokenizer under NEW_MODEL_NAME.
if False:
    model.save_pretrained(NEW_MODEL_NAME)
if False:
    tokenizer.save_pretrained(NEW_MODEL_NAME)

# Upload of the model and tokenizer to the Hugging Face Hub.
if False:
    model.push_to_hub(f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}", token=HF_TOKEN)
if False:
    tokenizer.push_to_hub(f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}", token=HF_TOKEN)

### Saving to float16 for VLLM

We also support saving to `float16` directly. Select `merged_16bit` for float16 or `merged_4bit` for int4. We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens.

In [None]:
# Local saves are disabled with `if False:`; the 16bit and LoRA pushes are
# controlled by Push_Merge_16bit and Push_Lora from the configuration cell.

# Merge to 16bit
if False:
    model.save_pretrained_merged(NEW_MODEL_NAME, tokenizer, save_method="merged_16bit")
if Push_Merge_16bit:
    model.push_to_hub_merged(
        f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}",
        tokenizer,
        save_method="merged_16bit",
        token=HF_TOKEN,
    )

# Merge to 4bit
if False:
    model.save_pretrained_merged(NEW_MODEL_NAME, tokenizer, save_method="merged_4bit")
if False:
    model.push_to_hub_merged(
        f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}",
        tokenizer,
        save_method="merged_4bit",
        token=HF_TOKEN,
    )

# Just LoRA adapters
if False:
    model.save_pretrained_merged(NEW_MODEL_NAME, tokenizer, save_method="lora")
if Push_Lora:
    model.push_to_hub_merged(
        f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}",
        tokenizer,
        save_method="lora",
        token=HF_TOKEN,
    )

### GGUF / llama.cpp Conversion
To save to `GGUF` / `llama.cpp`, we support it natively now! We clone `llama.cpp` and we default save it to `q8_0`. We allow all methods like `q4_k_m`. Use `save_pretrained_gguf` for local saving and `push_to_hub_gguf` for uploading to HF.

Some supported quant methods (full list on our [Wiki page](https://github.com/unslothai/unsloth/wiki#gguf-quantization-options)):
* `q8_0` - Fast conversion. High resource use, but generally acceptable.
* `q4_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q4_K.
* `q5_k_m` - Recommended. Uses Q6_K for half of the attention.wv and feed_forward.w2 tensors, else Q5_K.

In [None]:
# GGUF export. Local saves are disabled with `if False:`; uploads are controlled
# by the Push_GGUF_* and Push_All_GGUF flags from the configuration cell.

# Save to 8bit Q8_0
if False: model.save_pretrained_gguf(NEW_MODEL_NAME, tokenizer,)
if Push_GGUF_q8: model.push_to_hub_gguf(f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}", tokenizer, token = HF_TOKEN)

# Save to 16bit GGUF
if False: model.save_pretrained_gguf(NEW_MODEL_NAME, tokenizer, quantization_method = "f16")
if Push_GGUF_fp16: model.push_to_hub_gguf(f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}", tokenizer, quantization_method = "f16", token = HF_TOKEN)

# Save to q5_k_m GGUF
if False: model.save_pretrained_gguf(NEW_MODEL_NAME, tokenizer, quantization_method = "q5_k_m")
# Fixed: this push previously requested "q4_k_m", so enabling Push_GGUF_q5 uploaded a q4 file.
if Push_GGUF_q5: model.push_to_hub_gguf(f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}", tokenizer, quantization_method = "q5_k_m", token = HF_TOKEN)

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf(NEW_MODEL_NAME, tokenizer, quantization_method = "q4_k_m")
if Push_GGUF_q4: model.push_to_hub_gguf(f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}", tokenizer, quantization_method = "q4_k_m", token = HF_TOKEN)

# Save to multiple GGUF options - much faster if you want multiple!
# Note: this list holds q4_k_m, q8_0 and q5_k_m only; f16 is pushed solely via Push_GGUF_fp16.
if Push_All_GGUF:
    model.push_to_hub_gguf(
        f"{HUGGINGFACE_ORG}/{NEW_MODEL_NAME}",
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = HF_TOKEN,
    )

Now, use the `model-unsloth.gguf` file or `model-unsloth-Q4_K_M.gguf` file in `llama.cpp` or a UI based system like `GPT4All`.