In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
# model_name = "Qwen/Qwen3-4B-Instruct-2507"
# model_name = "Qwen/Qwen3-4B" 
model_name = "Qwen/Qwen3-0.6B"

# load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
  model_name,
  torch_dtype="auto",
  device_map="auto"
)

model

`torch_dtype` is deprecated! Use `dtype` instead!


Qwen3ForCausalLM(
  (model): Qwen3Model(
    (embed_tokens): Embedding(151936, 1024)
    (layers): ModuleList(
      (0-27): 28 x Qwen3DecoderLayer(
        (self_attn): Qwen3Attention(
          (q_proj): Linear(in_features=1024, out_features=2048, bias=False)
          (k_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (v_proj): Linear(in_features=1024, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=1024, bias=False)
          (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
          (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
        )
        (mlp): Qwen3MLP(
          (gate_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (up_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (down_proj): Linear(in_features=3072, out_features=1024, bias=False)
          (act_fn): SiLUActivation()
        )
        (input_layernorm): Qwen3RMSNorm((1024,), eps=1e-06)
        (post_attention_layer

In [3]:
def tokenize(tokenizer, prompt, thinking = False):
  messages = [
    {"role": "user", "content": prompt}
  ]
  text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    enable_thinking=thinking
  )
  return tokenizer([text], return_tensors="pt").to(model.device)

def generate(model, tokenizer, prompt, thinking = False):
  tokens = tokenize(tokenizer, prompt, thinking)
  generated_ids = model.generate(
    **tokens,
    max_new_tokens=32768
  )
  output_ids = generated_ids[0][len(tokens.input_ids[0]):].tolist() 

  # parsing thinking content
  try:
    # rindex finding 151668 (</think>)
    index = len(output_ids) - output_ids[::-1].index(151668)
  except ValueError:
    index = 0
  
  return {
    "thinking_content" : tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n"),
    "content" : tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n")
  }

prompt = "Give me a short introduction to large language model."

content = generate(model, tokenizer, prompt, False)
{
  "prompt" :prompt,
  "output" : content["content"]
}

{'prompt': 'Give me a short introduction to large language model.',
 'output': 'A large language model (LLM) is a type of artificial intelligence model designed to understand and generate human language. These models are trained on vast amounts of text data to understand and respond to human queries in natural and conversational ways.'}

In [4]:
# https://huggingface.co/docs/peft/main/en/developer_guides/quantization
#  QLoRA is a method that quantizes a model to 4-bits and then trains it with LoRA. This method allows you to finetune a 65B parameter model on a single 48GB GPU
# https://huggingface.co/papers/2305.14314

from transformers import BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

model = AutoModelForCausalLM.from_pretrained(
  model_name,
  torch_dtype="auto",
  device_map="auto",
  quantization_config = (bnb_config := BitsAndBytesConfig( # quantitazation (q in qlora) part
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
  ))
)
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
  r=8, # rank (capacity knob), Higher r = more capacity to change behavior; Lower r = cheaper, but weaker
  lora_alpha=32, # scaling factor: Too small → LoRA has no effect, Too large → unstable / overfitting
  target_modules=[
    "q_proj", "k_proj", "v_proj", "o_proj", # self attention decoder layers
    # "gate_proj", "up_proj", "down_proj"   # mlp decoder layers
  ], 
  lora_dropout=0.05,
  bias="none",
  task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen3ForCausalLM(
      (model): Qwen3Model(
        (embed_tokens): Embedding(151936, 1024)
        (layers): ModuleList(
          (0-27): 28 x Qwen3DecoderLayer(
            (self_attn): Qwen3Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1024, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=8, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=8, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): lora.

In [5]:
model.print_trainable_parameters()

trainable params: 2,293,760 || all params: 598,343,680 || trainable%: 0.3834


In [None]:
# train
from transformers import Trainer, TrainingArguments

# use exampole dataset
from datasets import load_dataset
dataset = load_dataset("FutureMa/DramaBench")

args = TrainingArguments(
  output_dir = "../out/",
  per_device_train_batch_size=1,
  gradient_accumulation_steps=16,
  learning_rate=2e-4,
  num_train_epochs=3,
  weight_decay=0.01,
  fp16=True,
  logging_steps=10,
  save_steps=500,
  save_total_limit=2,
  remove_unused_columns=False,
  push_to_hub=False,
  report_to="none"
)

dataset = dataset.map(lambda x: {"text": x["context"] + x["continuation"] + tokenizer.eos_token},)
def tokenize_fn(examples):
    tokens = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=1024
    )
    tokens["labels"] = tokens["input_ids"].copy()
    return tokens

tokenized_dataset = dataset.map(
    tokenize_fn,
    batched=True,
    remove_columns=dataset["train"].column_names  # remove old columns
)
trainer = Trainer(
  model=model,
  train_dataset=tokenized_dataset["train"],
  args=args,
)
trainer.train()

Map:   0%|          | 0/600 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.
  return fn(*args, **kwargs)


Step,Training Loss
10,3.5683
20,2.2752
30,2.1589
40,2.1654
50,2.1869
60,2.1061
70,2.0023
80,2.0949
90,2.0757
100,2.0055


TrainOutput(global_step=114, training_loss=2.243082314206843, metrics={'train_runtime': 2838.6127, 'train_samples_per_second': 0.634, 'train_steps_per_second': 0.04, 'total_flos': 4896584839987200.0, 'train_loss': 2.243082314206843, 'epoch': 3.0})

In [None]:
model.eval()
# test with same prompt
content = generate(model, tokenizer, prompt, False)
{
  "prompt" : prompt,
  "output" : content["content"]
}

{'prompt': 'Give me a short introduction to large language model.',
 'output': 'A large language model is a type of artificial intelligence system designed to understand and generate text in natural and complex scenarios. It is often used to assist in writing, translation, or content creation. The model can comprehend human language, produce coherent and meaningful responses, and support various tasks.'}