In [None]:
import os

# Optional: route Hugging Face Hub traffic through a mirror. This is set before
# huggingface_hub is imported so the library sees it when it loads.
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from getpass import getpass
from huggingface_hub import login

# Never paste a real access token into the notebook: it ends up in the saved
# file and in version control. Read it from the HF_TOKEN environment variable,
# and fall back to an interactive (non-echoing) prompt when it is not set.
hf_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")
login(hf_token)

In [None]:
# Server-specific GPU settings (this run was trained on a server). If you are
# training on a local machine, comment out this whole statement.
os.environ.update({
    "CUDA_VISIBLE_DEVICES": "2",  # expose only GPU index 2 to this process
    "NCCL_P2P_DISABLE": "1",      # turn off NCCL peer-to-peer transport
    "NCCL_IB_DISABLE": "1",       # turn off NCCL InfiniBand transport
})


In [None]:
from unsloth import FastLanguageModel
import torch
from datasets import load_dataset

In [None]:
# Load the base checkpoint and its tokenizer through Unsloth.
BASE_MODEL_NAME = "unsloth/Qwen3-14B"
MAX_SEQ_LENGTH = 2048

load_kwargs = dict(
    model_name=BASE_MODEL_NAME,
    max_seq_length=MAX_SEQ_LENGTH,
    load_in_4bit=True,       # request 4-bit loading
    load_in_8bit=False,      # 8-bit loading is not used
    full_finetuning=False,   # full fine-tuning is off; adapters are attached separately
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

In [5]:
# Attach LoRA adapters to the attention and MLP projection layers.
# NOTE(review): this rebinds `model`, so re-running the cell passes the
# already-wrapped model back in; reload the base model first if you need to
# re-run it.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

lora_settings = {
    "r": 32,
    "target_modules": attention_projections + mlp_projections,
    "lora_alpha": 32,
    "lora_dropout": 0,
    "bias": "none",
    "use_gradient_checkpointing": "unsloth",
    "random_state": 3407,
    "use_rslora": False,
    "loftq_config": None,
}
model = FastLanguageModel.get_peft_model(model, **lora_settings)

Unsloth 2025.8.4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [None]:
# Path to the JSON training file. It differs per user, so remember to set it.
DATA_FILE = "XXX"
dataset = load_dataset("json", data_files=DATA_FILE, split="train")
def format_conversation(example):
    """Turn one instruction/output record into a two-turn chat.

    Args:
        example: Mapping that provides "instruction" and "output" entries.

    Returns:
        A dict with a single "conversations" key whose value is a list holding
        the user turn (the instruction) followed by the assistant turn (the
        output).
    """
    user_turn = {"role": "user", "content": example["instruction"]}
    assistant_turn = {"role": "assistant", "content": example["output"]}
    return {"conversations": [user_turn, assistant_turn]}

# Add a "conversations" column: one user/assistant exchange per row.
dataset = dataset.map(format_conversation)

# Render every conversation into a single string with the tokenizer's chat
# template. The column is materialised as a plain list first: recent `datasets`
# releases return a lazy Column object from `dataset["..."]`, which
# `apply_chat_template` does not treat as a batch of conversations.
dataset_text = tokenizer.apply_chat_template(
    list(dataset["conversations"]),
    tokenize=False,
)

# Store the rendered strings in a "text" column alongside the existing ones.
dataset = dataset.add_column("text", dataset_text)

In [7]:
from trl import SFTTrainer, SFTConfig

# Optimisation settings, kept apart from the trainer wiring below.
sft_config = SFTConfig(
    dataset_text_field="text",        # train on the rendered "text" column
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,    # 16 x 4 = 64 examples per optimizer step
    warmup_steps=5,
    max_steps=150,
    learning_rate=2e-4,
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    report_to="none",
)

# Supervised fine-tuning trainer over the prepared dataset; no eval set is used.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    eval_dataset=None,
    args=sft_config,
)


Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [8]:
# Run the fine-tuning loop; the value returned by `train()` is kept in
# `trainer_stats` for later inspection.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 10,068 | Num Epochs = 1 | Total steps = 150
O^O/ \_/ \    Batch size per device = 16 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (16 x 4 x 1) = 64
 "-____-"     Trainable parameters = 128,450,560 of 14,896,757,760 (0.86% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.6403
2,2.7389
3,2.5088
4,2.6183
5,2.4807
6,2.3203
7,2.1935
8,2.0514
9,1.9432
10,1.6576


In [None]:
# Save the model and the tokenizer side by side in one output directory.
# NOTE(review): for a PEFT-wrapped model this presumably writes the adapter
# weights rather than a merged checkpoint; confirm before deploying.
output_dir = "qwen3-14b-150"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

('qwen3-14b-150/tokenizer_config.json',
 'qwen3-14b-150/special_tokens_map.json',
 'qwen3-14b-150/chat_template.jinja',
 'qwen3-14b-150/vocab.json',
 'qwen3-14b-150/merges.txt',
 'qwen3-14b-150/added_tokens.json',
 'qwen3-14b-150/tokenizer.json')

In [None]:
from transformers import TextStreamer

# Single-turn prompt used as a quick check of the fine-tuned model.
messages = [{"role": "user", "content": "你在干啥"}]

# Render the chat to a prompt string, with the generation prompt appended.
text = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=False,
)

# Tokenize the prompt and move the tensors to the GPU.
encoded_prompt = tokenizer(text, return_tensors="pt").to("cuda")

# Print tokens as they are produced, skipping the echoed prompt.
token_streamer = TextStreamer(tokenizer, skip_prompt=True)

_ = model.generate(
    **encoded_prompt,
    max_new_tokens=1500,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    streamer=token_streamer,
)