# Load model with Unsloth patching

In [1]:
from unsloth import FastLanguageModel

model, tok = FastLanguageModel.from_pretrained(
    model_name="deepseek-ai/deepseek-llm-7b-base",
    max_seq_length=1024,
    load_in_4bit=True,
)
print("Loaded model in 4-bit ✅")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
INFO 08-14 17:05:43 [__init__.py:244] Automatically detected platform cuda.
==((====))==  Unsloth 2025.7.11: Fast Llama patching. Transformers: 4.53.2. vLLM: 0.9.2.
   \\   /|    NVIDIA RTX A4000. Num GPUs = 1. Max memory: 15.724 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.7.0+cu126. CUDA: 8.6. CUDA Toolkit: 12.6. Triton: 3.3.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = True]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

deepseek-ai/deepseek-llm-7b-base does not have a padding token! Will use pad_token = <|PAD_TOKEN|>.
Loaded model in 4-bit ✅


# Apply LoRa adapter

In [2]:
peft_model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    use_gradient_checkpointing=True,
)
print("Loaded peft model ✅")


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.05.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.7.11 patched 30 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


# Load the dataset

In [14]:
from datasets import load_dataset
from unsloth import FastLanguageModel
from trl import SFTTrainer

# Load preformatted dataset
dataset = load_dataset("mlabonne/guanaco-llama2-1k")["train"].select(range(200))

print("Loaded dataset ✅")

Loaded dataset ✅


# Train using SFTTrainer

In [18]:
trainer = SFTTrainer(
    model = peft_model,
    tokenizer = tok,
    train_dataset = dataset,
    formatting_func=lambda x: tokenizer(x["text"], truncation=True, max_length=1024)["input_ids"],
    max_seq_length=1024,
    args = {
        "output_dir": "deepseek-lora-alpaca",
        "per_device_train_batch_size": 2,
        "gradient_accumulation_steps": 4,
        "num_train_epochs": 1,
        "logging_steps": 10,
        "save_steps": 50,
        "save_total_limit": 2,
        "fp16": True,
        "bf16": False,
        "remove_unused_columns": False,
        "report_to": "none",
    },
)
trainer.train()
print("Training complete ✅")

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/200 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 3 | Total steps = 75
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 2
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 2 x 1) = 8
 "-____-"     Trainable parameters = 3,932,160 of 6,914,297,856 (0.06% trained)


Step,Training Loss
1,1.6507
2,1.4475
3,1.3198
4,1.262
5,1.5864
6,1.2446
7,1.7399
8,1.5039
9,1.7134
10,1.5609


Training complete ✅


# Save training results

In [20]:
trainer.model.save_pretrained("my_lora/")
tok.save_pretrained("my_lora/")

print("Training results saved ✅")

('my_lora/tokenizer_config.json',
 'my_lora/special_tokens_map.json',
 'my_lora/tokenizer.json')