In [1]:
# pip installs
!pip install -q -U datasets peft bitsandbytes transformers trl accelerate sentencepiece

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m506.8/506.8 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m504.9/504.9 kB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.0/12.0 MB[0m [31m80.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m423.1/423.1 kB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m375.8/375.8 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m51.6 MB/s[0m eta [36m0:00:0

In [2]:
# imports
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login
import os, wandb
import re
import math
from tqdm import tqdm
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, set_seed, BitsAndBytesConfig
from datasets import load_dataset, Dataset, DatasetDict
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import transformers.utils.hub
import transformers.utils.generic
import transformers.tokenization_utils_base

2025-10-30 20:11:55.905840: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1761855116.123760      37 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1761855116.183310      37 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# set check point tracker
# Path to store the checkpoint tracking information
def create_checkpoint_tracker():
    checkpoint_file = "checkpoint_tracker.py"

    with open(checkpoint_file, "w") as f:
        f.write("""
def get_latest_step():
    try:
        with open("latest_step.txt", "r") as f:
            return int(f.read().strip())
    except:
        return 0

def save_latest_step(step):
    with open("latest_step.txt", "w") as f:
        f.write(str(step))
""")

create_checkpoint_tracker()
from checkpoint_tracker import get_latest_step, save_latest_step

In [5]:
# resume training function
def train_or_resume(
    base_model_name,
    hf_model_name,
    train_dataset,
    lora_config,
    steps_per_session=500,
    max_total_steps=1000,
    batch_size=1,
    grad_accum_steps=16,
    save_steps=100
):
    """
    Train a model or resume training from the latest checkpoint on Hugging Face.

    Args:
        base_model_name: Original model to fine-tune or 'resume' to continue training
        hf_model_name: HF repo name to save model to (username/model-name)
        train_dataset: Dataset to train on
        lora_config: LoRA configuration
        steps_per_session: How many steps to train in this session
        max_total_steps: Maximum number of steps to train overall
        batch_size: Batch size for training
        grad_accum_steps: Gradient accumulation steps
        save_steps: How often to save checkpoints
    """
    # Get the latest step we've trained to
    latest_step = get_latest_step()

    # Check if we've already reached the max steps
    if latest_step >= max_total_steps:
        print(f"Training already completed! Reached {latest_step}/{max_total_steps} steps")
        return

    # Calculate how many steps to train in this session
    steps_this_session = min(steps_per_session, max_total_steps - latest_step)
    print(f"Training for {steps_this_session} steps (total progress: {latest_step}/{max_total_steps})")

    # Set up tokenizer
    tokenizer = AutoTokenizer.from_pretrained(
    base_model_name,
    trust_remote_code=True,
    revision="main",
    local_files_only=False
    )
    tokenizer.chat_template = None
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    # Configure quantization
    quant_config = transformers.BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_quant_type="nf4"
    )

    # Check if we need to resume training
    try:
        if latest_step > 0:
            print(f"Resuming from checkpoint at step {latest_step}")
            # Load from Hugging Face
            base_model = AutoModelForCausalLM.from_pretrained(
                hf_model_name,
                quantization_config=quant_config,
                device_map="auto",
                trust_remote_code=True,
                revision="main",
                local_files_only=False
            )
        else:
            print("Starting training from base model")
            # Start fresh
            base_model = AutoModelForCausalLM.from_pretrained(
                base_model_name,
                quantization_config=quant_config,
                device_map="auto",
                trust_remote_code=True,
                revision="main",
                local_files_only=False
            )
    except Exception as e:
        print(f"Error loading model, starting fresh: {e}")
        base_model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            quantization_config=quant_config,
            device_map="auto",
        )

    # Configure training parameters
    train_params = SFTConfig(
        output_dir=f"./checkpoints",
        num_train_epochs=1,
        max_steps=steps_this_session,
        per_device_train_batch_size=batch_size,
        gradient_accumulation_steps=grad_accum_steps,
        optim="paged_adamw_32bit",
        save_steps=save_steps,
        logging_steps=20,
        learning_rate=2e-4,
        weight_decay=0.001,
        fp16=False,
        bf16=True,
        max_grad_norm=0.3,
        warmup_ratio=0.03,
        group_by_length=True,
        lr_scheduler_type="cosine",
        push_to_hub=True,
        hub_model_id=hf_model_name,
        hub_private_repo=True
    )

    # Create trainer
    trainer = SFTTrainer(
        model=base_model,
        train_dataset=train_dataset,
        peft_config=lora_config,
        args=train_params,
    )

    # Train the model
    trainer.train()

    # Push to Hugging Face Hub
    trainer.model.push_to_hub(hf_model_name, private=True)

    # Update and save the latest step count
    save_latest_step(latest_step + steps_this_session)

    print(f"Completed training session ({latest_step + steps_this_session}/{max_total_steps} steps)")
    print(f"Model saved to HuggingFace: {hf_model_name}")

    return latest_step + steps_this_session

In [8]:

# Model and repository names
BASE_MODEL = "meta-llama/Meta-Llama-3.1-8B"
HF_USER = "calmm-m"
PROJECT_NAME = "summarization"
HUB_MODEL_NAME = f"{HF_USER}/{PROJECT_NAME}"

user_secrets = UserSecretsClient()
# Login to Hugging Face
hf_token = user_secrets.get_secret("HF_TOKEN")
login(token=hf_token, add_to_git_credential=True)

# Log in to Weights & Biases
os.environ["WANDB_API_KEY"] = user_secrets.get_secret("WANDB_API_KEY")
wandb.login()

# Configure Weights & Biases to record against our project
os.environ["WANDB_PROJECT"] = PROJECT_NAME
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
os.environ["WANDB_WATCH"] = "gradients"

# Load your dataset
from datasets import load_dataset
dataset = load_dataset(f"{HF_USER}/summarization")
# Check the dataset keys to find the correct train dataset key
print(dataset.keys())
train_data = dataset['train'].shuffle(seed=123).select(range(min(3000, len(dataset['train']))))

# LoRA configuration
lora_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
)

def formatting_func(example):
    # Trích xuất dữ liệu, lưu ý: `completion` là bản gốc, `prompt` là tóm tắt
    # Chúng ta hoán đổi chúng để `prompt` (yêu cầu) là văn bản gốc
    # và `completion` (đáp án) là tóm tắt
    original_text = example['completion'][0]
    summary_text = example['prompt'][0]

    # Định dạng dữ liệu thành một chuỗi duy nhất theo cấu trúc Instruction-based
    # Dấu '###' giúp mô hình phân biệt giữa yêu cầu và câu trả lời
    instruction_text = f"Tóm tắt đoạn văn bản sau:\n\n{original_text}\n\n### Tóm tắt:\n{summary_text}"
    return {"text": instruction_text}

Token has not been saved to git credential helper.


[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
dict_keys(['train'])


In [None]:
# Train or resume
current_step = train_or_resume(
    base_model_name=BASE_MODEL,
    hf_model_name=HUB_MODEL_NAME,
    train_dataset=train_data,
    lora_config=lora_parameters,
    steps_per_session=60,
    max_total_steps=200,
    batch_size=1,
    grad_accum_steps=8,
    save_steps=60
)

print(f"Current training progress: {current_step}/200 steps")