In [2]:
# IMPORTANT: before running any code, set Colab runtime to be GPU-based (e.g., T4 GPU).
# Stable release from PyPI:
!pip install unsloth



In [3]:
from unsloth import FastLanguageModel
import torch

# Configuration
max_seq_length = 2048
dtype = None
load_in_4bit = True
model_name = "unsloth/Llama-3.2-1B-Instruct"

# Load both model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.8.6: Fast Llama patching. Transformers: 4.55.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.10G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "up_proj", "down_proj"
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",
    random_state=42,
    use_rslora=False,
    loftq_config=None
)

Not an error, but Unsloth cannot patch MLP layers with our manual autograd engine since either LoRA adapters
are not enabled or a bias term (like in Qwen) is used.
Unsloth 2025.8.6 patched 16 layers with 16 QKV layers, 16 O layers and 0 MLP layers.


In [5]:
# Importing our training set:
import json
from datasets import load_dataset, Dataset

SHAREGPT_DS = "rza.jsonl"

# Read each line as a JSON array
with open(SHAREGPT_DS, 'r') as f:
    conversations = [json.loads(line) for line in f]

# Create a dataset from the list of conversations
dataset = Dataset.from_dict({"conversations": conversations})


In [6]:
# Let's see all the different chat templates unsloth has ready-to-go:
from unsloth.chat_templates import CHAT_TEMPLATES
print(list(CHAT_TEMPLATES.keys()))

['unsloth', 'zephyr', 'chatml', 'mistral', 'llama', 'vicuna', 'vicuna_old', 'vicuna old', 'alpaca', 'gemma', 'gemma_chatml', 'gemma2', 'gemma2_chatml', 'llama-3', 'llama3', 'phi-3', 'phi-35', 'phi-3.5', 'llama-3.1', 'llama-31', 'llama-3.2', 'llama-3.3', 'llama-32', 'llama-33', 'qwen-2.5', 'qwen-25', 'qwen25', 'qwen2.5', 'phi-4', 'gemma-3', 'gemma3', 'qwen-3', 'qwen3', 'gemma-3n', 'gemma3n', 'gpt-oss', 'gptoss', 'qwen3-instruct', 'qwen3-thinking']


In [7]:
# Converting our training set to the right format:
from unsloth.chat_templates import get_chat_template, standardize_sharegpt

tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.2",      # for our model
    mapping = {"role" : "from",
               "content" : "value",
               "user" : "human",
               "assistant" : "gpt"}
)

def apply_template(examples):
  texts = [
      tokenizer.apply_chat_template(
          convo,
          tokenize=False,
          add_generation_prompt=False
      )
      for convo in examples["conversations"]
  ]
  return {"text": texts}

nds = dataset.map(apply_template, batched=True)

print("===== BEFORE =====")
print(nds[0]["conversations"])
print("===== AFTER =====")
print(nds[0]["text"])


Map:   0%|          | 0/1693 [00:00<?, ? examples/s]

===== BEFORE =====
[{'from': 'system', 'value': 'You are Wu-Tang Clan member rza. When a user prompts you with one of your lyrics, you deliver the next line.'}, {'from': 'human', 'value': 'yo, you may catch me in a pair of polo skipperys, matching cap'}, {'from': 'gpt', 'value': 'razor blades in my gums (bobby!)'}]
===== AFTER =====
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

You are Wu-Tang Clan member rza. When a user prompts you with one of your lyrics, you deliver the next line.<|eot_id|><|start_header_id|>human<|end_header_id|>

yo, you may catch me in a pair of polo skipperys, matching cap<|eot_id|><|start_header_id|>gpt<|end_header_id|>

razor blades in my gums (bobby!)<|eot_id|>


In [8]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=nds,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=1,
        max_steps=3100,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        output_dir="outputs",
        report_to="none",
    ),
)

Unsloth: Tokenizing ["text"]:   0%|          | 0/1693 [00:00<?, ? examples/s]

In [9]:
from unsloth.chat_templates import train_on_responses_only

trainer = train_on_responses_only(
    trainer,
    instruction_part="<|start_header_id|>human<|end_header_id|>",
    response_part="<|start_header_id|>gpt<|end_header_id|>",
)

stats = trainer.train()

Map (num_proc=2):   0%|          | 0/1693 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,693 | Num Epochs = 15 | Total steps = 3,100
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 8,650,752 of 1,244,465,152 (0.70% trained)


Step,Training Loss
1,5.3309
2,5.0124
3,3.8305
4,5.1864
5,5.1318
6,3.8625
7,3.1754
8,4.267
9,4.0964
10,4.6088


In [10]:
model = FastLanguageModel.for_inference(model)

inputs = tokenizer.apply_chat_template(
    [{"role": "user", "content": "As rza, complete the lyric: 'I'm the insider, the one runnin psyops'"}],
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to("cuda")

outputs = model.generate(input_ids=inputs, max_new_tokens=256)
print(tokenizer.batch_decode(outputs)[0])

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 July 2024

<|eot_id|><|start_header_id|>user<|end_header_id|>

As rza, complete the lyric: 'I'm the insider, the one runnin psyops'<|eot_id|><|start_header_id|>assistant<|end_header_id|>

i'm the one who's given in the notes<|eot_id|>


In [11]:
# Save
model.save_pretrained("/content/drive/MyDrive/my_llama3_rza_model")
tokenizer.save_pretrained("/content/drive/MyDrive/my_llama3_rza_model")

# Reload in 4-bit mode
from transformers import AutoTokenizer
from unsloth import FastLanguageModel

tokenizer = AutoTokenizer.from_pretrained("/content/drive/MyDrive/my_llama3_rza_model")
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/content/drive/MyDrive/my_llama3_rza_model",
    load_in_4bit=True,
    max_seq_length=2048,
)

==((====))==  Unsloth 2025.8.6: Fast Llama patching. Transformers: 4.55.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu128. CUDA: 7.5. CUDA Toolkit: 12.8. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.32.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [16]:
# Quick test
inputs = tokenizer("Describe Disney's Magic Kingdom as rza, using rhyming verse", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=300)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Describe Disney's Magic Kingdom as rza, using rhyming verse

yo, what you knowin? it's the king's best
waxin' like a phoenix, it's here to pass
the test of time, it's the real test
of the best when it comes to the best

now we get to the heart of the best
it's the magic of the land, it's the wonder of the best
it's the reason why i'm here, it's the reason why me
it's the reason why all these wizards come

to this place, it's the reason why all these wizards
come to this place, it's the reason why all these wizards
come to this place, it's the reason why all these wizards

me and my brothers, we build the magic
we build the magic, we build the magic

and it's been always, it's been always been
it's been always been, it's been always been
it's the reason why all these wizards come
it's the reason why all these wizards come
it's the reason why all these wizards are
it's the reason why all these wizards are
me and my brothers, we build the magic

and it's been always, it's been always been