In [9]:
# ===== Colab 1: Full Finetuning SmolLM2-135M (Final Version) =====

!pip install -q "unsloth>=2025.3.0" "unsloth_zoo" "trl>=0.9.6" "datasets" "accelerate" "transformers>=4.44.0"

from unsloth import FastLanguageModel
from datasets import Dataset
from trl import SFTTrainer, SFTConfig
import torch

# 1) Tiny toy dataset
dataset = Dataset.from_list([
    {
        "text": "### Instruction:\nSay hello\n\n### Response:\nHello there!"
    },
    {
        "text": "### Instruction:\nWhat is 2 + 2?\n\n### Response:\n4"
    },
])

# 2) Load SMALL model for full finetuning
#    Let Unsloth pick dtype (fp16 on T4) and do NOT use 4-bit.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name      = "unsloth/SmolLM2-135M-Instruct",  # public, no key
    max_seq_length  = 512,
    load_in_4bit    = False,       # full finetune
    full_finetuning = True,        # train all weights
    dtype           = None,        # let Unsloth decide
)

# 3) Training args — disable wandb
training_args = SFTConfig(
    output_dir                  = "demo_full_ft",
    num_train_epochs            = 1,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 2,
    learning_rate               = 2e-4,
    logging_steps               = 1,
    save_strategy               = "no",
    report_to                   = "none",   # <--- no wandb
)

trainer = SFTTrainer(
    model              = model,
    tokenizer          = tokenizer,
    train_dataset      = dataset,
    dataset_text_field = "text",
    args               = training_args,
)

trainer.train()

# 4) Inference demo - IMPORTANT: do NOT call FastLanguageModel.for_inference
model.eval()

prompt = "### Instruction:\nSay hello\n\n### Response:\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens = 20,
        do_sample      = True,
        temperature    = 0.7,
        use_cache      = False,   # <--- avoids dtype mismatch in KV cache
    )

print(tokenizer.decode(outputs[0], skip_special_tokens=True))


==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.2.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Unsloth: Float16 full finetuning uses more memory since we upcast weights to float32.


num_proc must be <= 2. Reducing num_proc to 2 for dataset of size 2.


Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/2 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 2 | Num Epochs = 1 | Total steps = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 2 x 1) = 2
 "-____-"     Trainable parameters = 134,515,584 of 134,515,584 (100.00% trained)


Step,Training Loss
1,3.3858


### Instruction:
Say hello

### Response:

Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello! Hello
