### Installation

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

### Unsloth

In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 8192
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/DeepSeek-R1-0528-Qwen3-8B-unsloth-bnb-4bit",
    max_seq_length = max_seq_length,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}


==((====))==  Unsloth 2025.7.5: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}
Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/472 [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

We now add LoRA adapters so we only need to update 1 to 10% of all parameters!

In [None]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,           # Choose any number > 0! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,  # Best to choose alpha = rank or rank*2
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = True,   # We support rank stabilized LoRA
    loftq_config = None,  # And LoftQ
)

## data train alqac

In [7]:
!gdown 13WRuCCpUFPSqwQ3csIe7-UcoSfVcHjpU

Downloading...
From: https://drive.google.com/uc?id=13WRuCCpUFPSqwQ3csIe7-UcoSfVcHjpU
To: /kaggle/working/data_train_legal_qa_new.json
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5.83M/5.83M [00:00<00:00, 128MB/s]


In [9]:
import json
file_path = 'data_train_legal_qa_new.json'
with open(file_path, 'r', encoding='utf-8') as f:
    questions = json.load(f)

In [10]:
from datasets import Dataset
train_dataset = Dataset.from_list(questions)
# valid_dataset = Dataset.from_list(validation_questions)

In [11]:
# Chuy·ªÉn ƒë·ªïi ƒë·ªãnh d·∫°ng dataset sang 'messages' n·∫øu b·∫°n mu·ªën fine-tune v·ªõi chat_template
# ƒêi·ªÅu n√†y r·∫•t h·ªØu √≠ch cho c√°c m√¥ h√¨nh nh∆∞ Qwen, DeepSeek-R1-Distill-Qwen2.5-7B
def format_to_chat_template(example):
    messages = [
        {"role": "system", "content": example["system_prompt"]},
        {"role": "user", "content": example["prompt"]},
        {"role": "assistant", "content": example["answer_think"]},
    ]
    # Unsloth s·∫Ω t·ª± ƒë·ªông √°p d·ª•ng tokenizer.apply_chat_template khi b·∫°n hu·∫•n luy·ªán
    # Tuy nhi√™n, n·∫øu b·∫°n mu·ªën ki·ªÉm tra tr∆∞·ªõc, b·∫°n c√≥ th·ªÉ g·ªçi tokenizer.apply_chat_template(messages, tokenize=False)
    return {"messages": messages}

# √Åp d·ª•ng h√†m chuy·ªÉn ƒë·ªïi cho to√†n b·ªô dataset
train_dataset = train_dataset.map(format_to_chat_template, remove_columns=['question_id', 'text', 'relevant_articles', 'system_prompt', 'prompt', 'answer', 'answer_think', 'question_type'])
# valid_dataset = valid_dataset.map(format_to_chat_template, remove_columns=['question_id', 'text', 'relevant_articles', 'system_prompt', 'prompt', 'answer', 'answer_think', 'question_type'])

Map:   0%|          | 0/728 [00:00<?, ? examples/s]

In [12]:
print("\nM·ªôt m·∫´u d·ªØ li·ªáu sau khi ƒë∆∞·ª£c ƒë·ªãnh d·∫°ng l·∫°i cho chat_template:")
print(train_dataset[0])
print("\nC√°c c·ªôt trong dataset sau khi x·ª≠ l√Ω:")
print(train_dataset.column_names)
print(len(train_dataset))


M·ªôt m·∫´u d·ªØ li·ªáu sau khi ƒë∆∞·ª£c ƒë·ªãnh d·∫°ng l·∫°i cho chat_template:
{'messages': [{'content': 'B·∫°n l√† m·ªôt tr·ª£ l√Ω ph√°p l√Ω Ti·∫øng Vi·ªát, c√≥ nhi·ªám v·ª• tr·∫£ l·ªùi c√°c c√¢u h·ªèi ng·∫Øn g·ªçn, ch√≠nh x√°c, d·ª±a tr√™n n·ªôi dung ƒëi·ªÅu lu·∫≠t ƒë∆∞·ª£c cung c·∫•p. Ch·ªâ tr·∫£ l·ªùi b·∫±ng Ti·∫øng Vi·ªát.\n\nB·∫°n h√£y tr·∫£ l·ªùi theo ƒë·ªãnh d·∫°ng sau:\n<think>\n[Suy nghƒ©, ph√¢n t√≠ch c·ªßa b·∫°n]\n</think>\n[C√¢u tr·∫£ l·ªùi c·ªßa b·∫°n]\n', 'role': 'system'}, {'content': "D·ª±a v√†o b·ªëi c·∫£nh b√™n d∆∞·ªõi, h√£y ph√¢n t√≠ch k·ªπ tr∆∞·ªõc khi tr·∫£ l·ªùi c√¢u h·ªèi.\n\nLo·∫°i c√¢u h·ªèi: ƒê√∫ng/Sai (format c·ªßa K·∫øt lu·∫≠n cu·ªëi c√πng sau khi suy lu·∫≠n l√† 1 trong 2 k·∫øt lu·∫≠n: 'ƒê√∫ng', 'Sai'. Kh√¥ng ƒë∆∞·ª£c gi·∫£i th√≠ch g√¨ th√™m.)\n\nC√¢u h·ªèi: Ng∆∞·ªùi nghi·ªán ma t√∫y t·ª´ ƒë·ªß 18 tu·ªïi tr·ªü l√™n b·ªã √°p d·ª•ng bi·ªán ph√°p x·ª≠ l√Ω h√†nh ch√≠nh ƒë∆∞a v√†o c∆° s·ªü cai nghi·ªán b·∫Øt bu·ªôc theo quy ƒë·ªãnh c·ªßa Lu·∫≠t X·ª≠ l√Ω vi ph·

## format data

In [None]:
from jinja2 import Template

chat_template = """{%- if not add_generation_prompt is defined -%}
{%- set add_generation_prompt = false -%}
{%- endif -%}
{%- set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt="", is_first_sp=true, is_last_user=false) -%}
{%- for message in messages -%}
{%- if message["role"] == "system" -%}
{%- if ns.is_first_sp -%}
{%- set ns.system_prompt = ns.system_prompt + message["content"] -%}
{%- set ns.is_first_sp = false -%}
{%- else -%}
{%- set ns.system_prompt = ns.system_prompt + "\\n\\n" + message["content"] -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{{- bos_token -}}
{{- ns.system_prompt -}}
{%- for message in messages -%}
{%- set content = message["content"] -%}
{%- if message["role"] == "user" -%}
{%- set ns.is_tool = false -%}
{%- set ns.is_first = false -%}
{%- set ns.is_last_user = true -%}
{{- "<ÔΩúUserÔΩú>" + content + "<ÔΩúAssistantÔΩú>" -}}
{%- endif -%}
{%- if message["role"] == "assistant" and message["tool_calls"] is defined and message["tool_calls"] is not none -%}
{%- set ns.is_last_user = false -%}
{%- if ns.is_tool -%}
{{- "<ÔΩútool‚ñÅoutputs‚ñÅendÔΩú>" -}}
{%- endif -%}
{%- set ns.is_first = false -%}
{%- set ns.is_tool = false -%}
{%- set ns.is_output_first = true -%}
{%- for tool in message["tool_calls"] -%}
{%- if not ns.is_first -%}
{%- if content is none -%}
{{- "<ÔΩútool‚ñÅcalls‚ñÅbeginÔΩú><ÔΩútool‚ñÅcall‚ñÅbeginÔΩú>" + tool["type"] + "<ÔΩútool‚ñÅsepÔΩú>" + tool["function"]["name"] + "\\n```json\\n" + tool["function"]["arguments"] + "\\n```<ÔΩútool‚ñÅcall‚ñÅendÔΩú>" -}}
{%- else -%}
{{- content + "<ÔΩútool‚ñÅcalls‚ñÅbeginÔΩú><ÔΩútool‚ñÅcall‚ñÅbeginÔΩú>" + tool["type"] + "<ÔΩútool‚ñÅsepÔΩú>" + tool["function"]["name"] + "\\n```json\\n" + tool["function"]["arguments"] + "\\n```<ÔΩútool‚ñÅcall‚ñÅendÔΩú>" -}}
{%- endif -%}
{%- set ns.is_first = true -%}
{%- else -%}
{{- "\\n<ÔΩútool‚ñÅcall‚ñÅbeginÔΩú>" + tool["type"] + "<ÔΩútool‚ñÅsepÔΩú>" + tool["function"]["name"] + "\\n```json\\n" + tool["function"]["arguments"] + "\\n```<ÔΩútool‚ñÅcall‚ñÅendÔΩú>" -}}
{%- endif -%}
{%- endfor -%}
{{- "<ÔΩútool‚ñÅcalls‚ñÅendÔΩú><ÔΩúend‚ñÅof‚ñÅsentenceÔΩú>" -}}
{%- endif -%}
{%- if message["role"] == "assistant" and (message["tool_calls"] is not defined or message["tool_calls"] is none) -%}
{%- set ns.is_last_user = false -%}
{%- if ns.is_tool -%}
{{- "<ÔΩútool‚ñÅoutputs‚ñÅendÔΩú>" + content + "<ÔΩúend‚ñÅof‚ñÅsentenceÔΩú>" -}}
{%- set ns.is_tool = false -%}
{%- else -%}
{{- content + "<ÔΩúend‚ñÅof‚ñÅsentenceÔΩú>" -}}
{%- endif -%}
{%- endif -%}
{%- if message["role"] == "tool" -%}
{%- set ns.is_last_user = false -%}
{%- set ns.is_tool = true -%}
{%- if ns.is_output_first -%}
{{- "<ÔΩútool‚ñÅoutputs‚ñÅbeginÔΩú><ÔΩútool‚ñÅoutput‚ñÅbeginÔΩú>" + content + "<ÔΩútool‚ñÅoutput‚ñÅendÔΩú>" -}}
{%- set ns.is_output_first = false -%}
{%- else -%}
{{- "\\n<ÔΩútool‚ñÅoutput‚ñÅbeginÔΩú>" + content + "<ÔΩútool‚ñÅoutput‚ñÅendÔΩú>" -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- if ns.is_tool -%}
{{- "<ÔΩútool‚ñÅoutputs‚ñÅendÔΩú>" -}}
{%- endif -%}
{#- if add_generation_prompt and not ns.is_last_user and not ns.is_tool #}
{%- if add_generation_prompt and not ns.is_tool %}
{{- "<ÔΩúAssistantÔΩú>" -}}
{%- endif -%}"""
tokenizer.chat_template = chat_template
# chat_template = tokenizer.chat_template

# Load the template
chat_template = Template(chat_template)

In [None]:
import json
combined_dataset = train_dataset.map(
    lambda x: {
        "text": chat_template.render(
                messages=x["messages"],
                add_generation_prompt=False,)
    }
)

# combined_dataset = combined_dataset.remove_columns("messages")

In [None]:
combined_dataset[0]['text']

In [None]:
max_text_length = 0
longest_text = ""

if 'text' in combined_dataset.column_names:
    for item in combined_dataset['text']:
        if isinstance(item, str): # ƒê·∫£m b·∫£o ph·∫ßn t·ª≠ l√† chu·ªói ƒë·ªÉ tr√°nh l·ªói
            current_length = len(item)
            if current_length > 25000:
                print(current_length)
            if current_length > max_text_length:
            # if current_length == 25202:
                max_text_length = current_length
                longest_text = item
else:
    print("C·ªôt 'text' kh√¥ng t·ªìn t·∫°i trong combined_dataset.")

print(f"ƒê·ªô d√†i t·ªëi ƒëa c·ªßa c·ªôt 'text' trong combined_dataset (Hugging Face Dataset): {max_text_length} k√Ω t·ª±")
# print(f"ƒêo·∫°n vƒÉn b·∫£n d√†i nh·∫•t l√†:\n---\n{longest_text}\n---")

# Tokenize ƒëo·∫°n vƒÉn b·∫£n d√†i nh·∫•t
if longest_text: # ƒê·∫£m b·∫£o c√≥ ƒëo·∫°n vƒÉn b·∫£n d√†i nh·∫•t ƒë·ªÉ tokenize
    # tokenizer.encode_plus ho·∫∑c tokenizer() s·∫Ω tr·∫£ v·ªÅ dictionary c√≥ 'input_ids'
    # B·∫°n c√≥ th·ªÉ d√πng tokenizer.encode() ƒë·ªÉ ch·ªâ l·∫•y list of token ids
    tokenized_output = tokenizer.encode(longest_text)
    num_tokens = len(tokenized_output)
    print(f"S·ªë l∆∞·ª£ng token c·ªßa ƒëo·∫°n vƒÉn b·∫£n d√†i nh·∫•t: {num_tokens}")
else:
    print("Kh√¥ng t√¨m th·∫•y ƒëo·∫°n vƒÉn b·∫£n n√†o ƒë·ªÉ tokenize.")

## Train the model

In [None]:
from trl import SFTTrainer, SFTConfig
# save_steps = 314
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = combined_dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 1,
        gradient_accumulation_steps = 1, # Use GA to mimic batch size!
        warmup_steps = 5,
        num_train_epochs = 14, # Set this for 1 full training run.
        # max_steps = 30,
        
        # --- C·∫•u h√¨nh logging v√† saving theo b∆∞·ªõc ---
        logging_strategy = "steps", # Ghi log theo s·ªë b∆∞·ªõc
        logging_steps = 20,          # Ghi log sau m·ªói 1 b∆∞·ªõc

        # save_strategy = "steps",    # L∆∞u checkpoint theo s·ªë b∆∞·ªõc
        # save_steps = save_steps,            # V√≠ d·ª•: L∆∞u sau m·ªói 50 b∆∞·ªõc. B·∫°n c√≥ th·ªÉ ƒëi·ªÅu 
        save_strategy = "epoch",

        save_total_limit = 15,       # Gi·ªØ t·ªëi ƒëa 5 checkpoint.
        output_dir = "./results",   # Th√™m th∆∞ m·ª•c ƒë·∫ßu ra ƒë·ªÉ l∆∞u checkpoint v√† log
        
        # ƒê·ªÉ ƒë·∫£m b·∫£o floating point ch√≠nh x√°c n·∫øu GPU h·ªó tr·ª£
        fp16 = not torch.cuda.is_bf16_supported(),
        bf16 = torch.cuda.is_bf16_supported(),
        
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

In [None]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

Let's train the model! To resume a training run, set `trainer.train(resume_from_checkpoint = True)`

In [None]:
trainer_stats = trainer.train()

In [None]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

## Save model

In [None]:
hf_token = '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'
username = 'lmq1909'
your_model_repo = 'Qwen3-8B-LQA-14e-full'
model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
model.push_to_hub_merged(f"{username}/{your_model_repo}", tokenizer, save_method = "merged_16bit", token = hf_token)

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/deepseek-r1-0528-qwen3-8b...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|‚ñà‚ñà‚ñå       | 1/4 [00:51<02:34, 51.60s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 2/4 [01:31<01:30, 45.00s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 3/4 [02:17<00:45, 45.35s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [02:35<00:00, 38.98s/it]


Unsloth: Saving to lmq1909/Qwen3-8B-LQA-3e-full will fail, but using a temp folder works! Switching to a temp folder then uploading!


  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub
Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Downloading safetensors index for unsloth/deepseek-r1-0528-qwen3-8b...


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Unsloth: Merging weights into 16bit:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  25%|‚ñà‚ñà‚ñå       | 1/4 [01:29<04:27, 89.17s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 2/4 [02:45<02:42, 81.49s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 3/4 [04:08<01:22, 82.31s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

  0%|          | 0/1 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.58G [00:00<?, ?B/s]

Unsloth: Merging weights into 16bit: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4/4 [04:31<00:00, 67.95s/it]


## Inference

In [7]:
import os
from unsloth import FastLanguageModel
import torch

# T√¨m checkpoint l·ªõn nh·∫•t trong ./results
def get_latest_checkpoint(results_dir="./results"):
    checkpoints = []
    for name in os.listdir(results_dir):
        if name.startswith("checkpoint-"):
            try:
                step = int(name.split("-")[1])
                checkpoints.append((step, name))
            except ValueError:
                continue
    if not checkpoints:
        raise ValueError("‚ùå Kh√¥ng t√¨m th·∫•y checkpoint trong th∆∞ m·ª•c.")
    latest = max(checkpoints)[1]
    return os.path.join(results_dir, latest)

# L·∫•y checkpoint m·ªõi nh·∫•t
results_dir="./results"
# results_dir="/kaggle/input/alqac-qwen-14e-full/results"
checkpoint_path = get_latest_checkpoint(results_dir)
checkpoint_path

'/kaggle/input/alqac-qwen-14e-full/results/checkpoint-5096'

In [2]:
from unsloth import FastLanguageModel
import torch

# max_seq_length = 10000 # set cao h∆°n khi train?
max_seq_length = 8192 
model, tokenizer = FastLanguageModel.from_pretrained(
    # model_name = checkpoint_path,
    model_name = '/kaggle/input/alqac-qwen-14e-full/results/checkpoint-1092',
    max_seq_length = max_seq_length,   # Context length - can be longer, but uses more memory
    load_in_4bit = True,     # 4bit uses much less memory
    load_in_8bit = False,    # A bit more accurate, uses 2x memory
    full_finetuning = False, # We have full finetuning now!
    # token = "hf_...",      # use one if using gated models
)

ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.


2025-07-21 09:05:00.700784: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753088700.899668      36 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753088700.966406      36 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


ü¶• Unsloth Zoo will now patch everything to make training faster!


Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}


==((====))==  Unsloth 2025.7.5: Fast Qwen3 patching. Transformers: 4.52.4.
   \\   /|    Tesla T4. Num GPUs = 2. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}
Unrecognized keys in `rope_scaling` for 'rope_type'='yarn': {'attn_factor'}


model.safetensors.index.json: 0.00B [00:00, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

Unsloth 2025.7.5 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


In [13]:
train_dataset[-1]['messages']

[{'content': 'B·∫°n l√† m·ªôt tr·ª£ l√Ω ph√°p l√Ω Ti·∫øng Vi·ªát, c√≥ nhi·ªám v·ª• tr·∫£ l·ªùi c√°c c√¢u h·ªèi ng·∫Øn g·ªçn, ch√≠nh x√°c, d·ª±a tr√™n n·ªôi dung ƒëi·ªÅu lu·∫≠t ƒë∆∞·ª£c cung c·∫•p. Ch·ªâ tr·∫£ l·ªùi b·∫±ng Ti·∫øng Vi·ªát.\n\nB·∫°n h√£y tr·∫£ l·ªùi theo ƒë·ªãnh d·∫°ng sau:\n<think>\n[Suy nghƒ©, ph√¢n t√≠ch c·ªßa b·∫°n]\n</think>\n[C√¢u tr·∫£ l·ªùi c·ªßa b·∫°n]\n',
  'role': 'system'},
 {'content': 'D·ª±a v√†o b·ªëi c·∫£nh b√™n d∆∞·ªõi, h√£y ph√¢n t√≠ch k·ªπ tr∆∞·ªõc khi tr·∫£ l·ªùi c√¢u h·ªèi.\n\nLo·∫°i c√¢u h·ªèi: T·ª± lu·∫≠n\n\nC√¢u h·ªèi: C∆° quan n√†o c√≥ tr√°ch nhi·ªám l√† c∆° quan gi√∫p Ch√≠nh ph·ªß th·ª±c hi·ªán qu·∫£n l√Ω nh√† n∆∞·ªõc v·ªÅ ƒëi·ªán ·∫£nh?\n\nB·ªëi c·∫£nh: \nTr√°ch nhi·ªám qu·∫£n l√Ω nh√† n∆∞·ªõc v·ªÅ ƒëi·ªán ·∫£nh c·ªßa Ch√≠nh ph·ªß, B·ªô VƒÉn h√≥a, Th·ªÉ thao v√† Du l·ªãch\n\n1. Ch√≠nh ph·ªß th·ªëng nh·∫•t qu·∫£n l√Ω nh√† n∆∞·ªõc v·ªÅ ƒëi·ªán ·∫£nh.\n\n2. B·ªô VƒÉn h√≥a, Th·ªÉ thao v√† Du l·ªãch l√† c∆° quan gi√∫p Ch√≠nh ph·ªß th·ª±c hi·ªán qu·∫£

In [14]:
import time
start = time.time()
# prompt = 'D·ª±a v√†o b·ªëi c·∫£nh b√™n d∆∞·ªõi, h√£y ph√¢n t√≠ch k·ªπ tr∆∞·ªõc khi tr·∫£ l·ªùi c√¢u h·ªèi.\n\nC√¢u h·ªèi: Vi·ªác s·ª≠ d·ª•ng t√†i s·∫£n c√¥ng c·ªßa c∆° quan ƒë·ªÉ quy√™n g√≥p t·ª´ thi·ªán c√≥ tr√°i quy ƒë·ªãnh c·ªßa ph√°p lu·∫≠t kh√¥ng?\n\nB·ªëi c·∫£nh: 1. C∆° quan, t·ªï ch·ª©c, ƒë∆°n v·ªã, ng∆∞·ªùi c√≥ ch·ª©c v·ª•, quy·ªÅn h·∫°n ch·ªâ ƒë∆∞·ª£c s·ª≠ d·ª•ng t√†i ch√≠nh c√¥ng, t√†i s·∫£n c√¥ng ƒë·ªÉ l√†m qu√† t·∫∑ng v√¨ m·ª•c ƒë√≠ch t·ª´ thi·ªán, ƒë·ªëi ngo·∫°i v√† th·ª±c hi·ªán ch·∫ø ƒë·ªô, ch√≠nh s√°ch theo quy ƒë·ªãnh c·ªßa ph√°p lu·∫≠t.\n2. Vi·ªác t·∫∑ng qu√† ph·∫£i th·ª±c hi·ªán ƒë√∫ng ch·∫ø ƒë·ªô, ƒë·ªãnh m·ª©c, ti√™u chu·∫©n, ƒë·ªëi t∆∞·ª£ng theo quy ƒë·ªãnh c·ªßa ph√°p lu·∫≠t; c∆° quan, ƒë∆°n v·ªã t·∫∑ng qu√† ph·∫£i h·∫°ch to√°n k·∫ø to√°n v√† th·ª±c hi·ªán c√¥ng khai trong c∆° quan, ƒë∆°n v·ªã m√¨nh theo ƒë√∫ng quy ƒë·ªãnh c·ªßa ph√°p lu·∫≠t.\n\nH√£y sinh ph·∫ßn suy lu·∫≠n chi ti·∫øt theo m·∫´u b√™n d∆∞·ªõi trong th·∫ª <think>...</think>. B·∫°n c·∫ßn vi·∫øt ƒë·∫ßy ƒë·ªß c√°c b∆∞·ªõc ph√¢n t√≠ch, d·∫´n ch·ª©ng v√† suy lu·∫≠n tr∆∞·ªõc khi ƒë∆∞a ra c√¢u tr·∫£ l·ªùi ng·∫Øn g·ªçn.\nKh√¥ng ƒë∆∞·ª£c tr·∫£ l·ªùi ngay m√† ph·∫£i suy lu·∫≠n ƒë·∫ßy ƒë·ªß tr∆∞·ªõc trong <think>.\n\n<think>\n1. Ph√¢n t√≠ch c√¢u h·ªèi: [tr√¨nh b√†y ng·∫Øn g·ªçn n·ªôi dung v√† √Ω ƒë·ªãnh c·ªßa c√¢u h·ªèi]\n2. D·∫´n ch·ª©ng t·ª´ b·ªëi c·∫£nh:\n   - H√£y t√°ch t·ª´ng ƒëo·∫°n d√†i trong b·ªëi c·∫£nh th√†nh nhi·ªÅu √Ω nh·ªè r√µ r√†ng.\n   - M·ªói √Ω n√™n n√™u r√µ n·ªôi dung ph√°p l√Ω, vi·∫øt ng·∫Øn g·ªçn d·ªÖ hi·ªÉu.\n   - V√≠ d·ª•:\n       - [√Ω 1 t·ª´ ƒëo·∫°n lu·∫≠t A]\n       - [√Ω 2 t·ª´ ƒëo·∫°n lu·∫≠t A]\n       - [√Ω 3 t·ª´ ƒëo·∫°n lu·∫≠t B]\n   - Ghi r√µ ƒëo·∫°n n√†o c√≥ li√™n quan ƒë·∫øn c√¢u h·ªèi.\n3. Suy lu·∫≠n step-by-step:\n   a) [b∆∞·ªõc suy lu·∫≠n 1 d·ª±a tr√™n d·∫´n ch·ª©ng ·ªü tr√™n]\n   b) [b∆∞·ªõc suy lu·∫≠n 2 ti·∫øp theo]\n   ‚Ä¶\n4. K·∫øt lu·∫≠n: [t√≥m t·∫Øt c√¢u tr·∫£ l·ªùi cu·ªëi c√πng d·ª±a tr√™n suy lu·∫≠n]\n</think>\n\n[ƒê√°p √°n cu·ªëi c√πng sau khi suy lu·∫≠n]'
prompt = [msg['content'] for msg in train_dataset[-1]['messages'] if msg['role'] == 'user'][0]
# prompt += "\n<think>\n"
messages = [{'role': 'system',
  'content': 'B·∫°n l√† m·ªôt tr·ª£ l√Ω ph√°p l√Ω, c√≥ nhi·ªám v·ª• tr·∫£ l·ªùi c√°c c√¢u h·ªèi ng·∫Øn g·ªçn, ch√≠nh x√°c, d·ª±a tr√™n n·ªôi dung ƒëi·ªÅu lu·∫≠t ƒë∆∞·ª£c cung c·∫•p.'},
  {'role': 'user',
  'content': prompt}
          ]
# print(messages)

text = tokenizer.apply_chat_template(
    messages,
    tokenize = False,
    add_generation_prompt = True, # Must add for generation
    enable_thinking = True, # kh√¥ng c·∫ßn trong m√¥ h√¨nh n√†y
)
max_input_tokens = len(tokenizer(prompt)["input_ids"])
max_new_tokens = max_seq_length - max_input_tokens
print(max_new_tokens)

# from transformers import TextStreamer
# _ = model.generate(
#     **tokenizer(text, return_tensors = "pt").to("cuda"),
#     max_new_tokens = max_new_tokens,
#     temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
#     streamer = TextStreamer(tokenizer, skip_prompt = True),
# )

generated_output = model.generate(
    **tokenizer(text, return_tensors = "pt").to("cuda"),
    max_new_tokens = max_new_tokens,
    temperature = 0.6, top_p = 0.95, top_k = 20, # For thinking
)

# Gi·∫£i m√£ token ID th√†nh chu·ªói vƒÉn b·∫£n
decoded_output = tokenizer.decode(generated_output[0], skip_special_tokens=True)
print(decoded_output.split('<ÔΩúAssistantÔΩú>')[-1])

end = time.time()
print(end-start)

7516
<think>

1. **Ph√¢n t√≠ch c√¢u h·ªèi**:  
   C√¢u h·ªèi x√°c ƒë·ªãnh c∆° quan gi√∫p Ch√≠nh ph·ªß th·ª±c hi·ªán qu·∫£n l√Ω nh√† n∆∞·ªõc v·ªÅ ƒëi·ªán ·∫£nh.

2. **D·∫´n ch·ª©ng t·ª´ b·ªëi c·∫£nh**:  
   - Ch√≠nh ph·ªß th·ªëng nh·∫•t qu·∫£n l√Ω nh√† n∆∞·ªõc v·ªÅ ƒëi·ªán ·∫£nh.  
   - B·ªô VƒÉn h√≥a, Th·ªÉ thao v√† Du l·ªãch l√† c∆° quan gi√∫p Ch√≠nh ph·ªß th·ª±c hi·ªán qu·∫£n l√Ω nh√† n∆∞·ªõc v·ªÅ ƒëi·ªán ·∫£nh.  
   - B·ªô c√≥ tr√°ch nhi·ªám ban h√†nh ch√≠nh s√°ch, vƒÉn b·∫£n quy ph·∫°m ph√°p lu·∫≠t v·ªÅ ƒëi·ªán ·∫£nh, chi·∫øn l∆∞·ª£c, k·∫ø ho·∫°ch ph√°t tri·ªÉn ƒëi·ªán ·∫£nh.  
   - B·ªô ch·ªãu tr√°ch nhi·ªám th√¥ng tin, tuy√™n truy·ªÅn, ph·ªï bi·∫øn, gi√°o d·ª•c ph√°p lu·∫≠t v·ªÅ ƒëi·ªán ·∫£nh.  
   - B·ªô c√≥ tr√°ch nhi·ªám x√¢y d·ª±ng ti√™u chu·∫©n qu·ªëc gia, quy chu·∫©n k·ªπ thu·∫≠t, h·ªá th·ªëng ch·ªâ ti√™u th·ªëng k√™, c∆° s·ªü d·ªØ li·ªáu ng√†nh ƒëi·ªán ·∫£nh.  
   - B·ªô ƒë·∫£m nh·∫≠n vi·ªác c·∫•p, thu h·ªìi gi·∫•y ph√©p, d·ª´ng ph·ªï bi·∫øn phim theo th·∫©m quy·ªÅn.  
   