In [None]:
from transformers import AutoModelForMaskedLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch
import wandb
# Start the Weights & Biases run for this notebook (project and run name are fixed here).
wandb.init(
    project="bert-japanese",
    name="qlora-mlm",
)

In [None]:
# Checkpoint identifier; reused by the model-loading cell further down.
model_name = "cl-tohoku/bert-base-japanese"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load `unlabeled.jsonl` through the "json" loader and expose it as the "train" split.
data_files = {"train": "unlabeled.jsonl"}
dataset = load_dataset("json", data_files=data_files)


In [None]:
def tokenize(example):
    """Encode the ``"text"`` field of ``example`` with the notebook-level ``tokenizer``.

    Truncation is enabled and every sequence is padded to ``max_length=128``.
    The function is applied through ``Dataset.map(..., batched=True)``, so
    ``example["text"]`` is whatever ``map`` hands over for one batch.

    Returns:
        The tokenizer's output for ``example["text"]``.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["text"], **encode_options)

In [None]:
# Tokenize the train split in batches and drop the raw "text" column from the result.
train_split = dataset["train"]
tokenized_dataset = train_split.map(
    tokenize,
    batched=True,
    remove_columns=["text"],
)

In [None]:
# Load the masked-LM checkpoint with 4-bit weights. The quantization settings
# are passed as a `BitsAndBytesConfig` via `quantization_config`; handing
# `load_in_4bit=True` straight to `from_pretrained` is the deprecated spelling
# of the same request.
model = AutoModelForMaskedLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    device_map="auto",
)

# Prepare the quantized model for training before the LoRA adapters are attached.
model = prepare_model_for_kbit_training(model)

lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    # Module names are model-dependent; other architectures use names such as
    # `dense`, `q_proj`, `v_proj`, ...
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    # `task_type` is intentionally left unset. This checkpoint is a masked LM,
    # and PEFT's task types do not include a masked-LM entry, so labelling it
    # "CAUSAL_LM" would wrap it in the causal-LM PEFT class and record the
    # wrong task in the saved adapter config. Without a task type,
    # `get_peft_model` returns the generic `PeftModel` wrapper.
)

model = get_peft_model(model, lora_config)

In [None]:
# Batch collator for the masked-LM objective (`mlm=True`), with a masking
# probability of 0.15.
mlm_collator_options = {
    "tokenizer": tokenizer,
    "mlm": True,
    "mlm_probability": 0.15,
}
data_collator = DataCollatorForLanguageModeling(**mlm_collator_options)

In [None]:
# NOTE(review): the first cell already calls `wandb.init(..., name="qlora-mlm")`,
# which differs from `run_name` below -- confirm which name is meant to show up
# in Weights & Biases.
training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir="./qlora-bert-japanese",
    save_steps=500,
    save_total_limit=2,
    # Schedule: 3 epochs, per-device batch of 8 with 4 gradient-accumulation steps.
    num_train_epochs=3,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=4,
    fp16=True,
    # Logging: every 10 steps, reported to Weights & Biases.
    logging_steps=10,
    report_to="wandb",  # enable WandB reporting
    run_name="qlora-bert-japanese-mlm",  # run name passed to the reporting integration
)

In [None]:
# Assemble the Trainer from the objects built in the cells above; only a
# training dataset is supplied (no evaluation dataset).
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)


In [None]:
# Run training.
trainer.train()

# Save the model and the tokenizer side by side in one directory.
export_dir = "qlora-bert-japanese-mlm"
trainer.save_model(export_dir)
tokenizer.save_pretrained(export_dir)