In [None]:
# 1. Install required libraries
!pip install -q transformers datasets==2.12.0 peft accelerate bitsandbytes


In [None]:
# 2. Import libraries
import torch
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig


In [None]:
# Colab-only helper for moving files between the browser and the runtime.
from google.colab import files

# Prompt for a file upload. A later cell reads /content/train_filtered.jsonl and
# /content/valid_filtered.jsonl, so those are presumably the files to pick here
# (assumes uploads land in /content — TODO confirm).
uploaded = files.upload()

In [None]:
# Base checkpoint to fine-tune.
model_id = "EleutherAI/polyglot-ko-5.8b"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Padding during tokenization/collation needs a pad token; fall back to EOS
# only when the checkpoint does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Pass 4-bit settings through an explicit BitsAndBytesConfig: handing
# `load_in_4bit=True` directly to from_pretrained is deprecated.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    # Match the fp16 dtype used for loading and training so the 4-bit layers
    # do not compute in the (slower) float32 default.
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="auto",
    torch_dtype=torch.float16,
)
# Prepare the quantized model for k-bit (here 4-bit) training before LoRA is attached.
model = prepare_model_for_kbit_training(model)

In [None]:
from peft import LoraConfig

# LoRA adapter settings for causal-LM fine-tuning.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    # Module names that receive LoRA adapters.
    target_modules=["query_key_value"],
    # Adapter rank, scaling factor and dropout.
    r=8,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the prepared model with the adapters; `model` now refers to the wrapped model.
model = get_peft_model(model, lora_config)

In [None]:
import json
from datasets import Dataset, DatasetDict


def load_jsonl(filepath):
    """Read a UTF-8 JSON Lines file into a list of parsed records.

    Args:
        filepath: Path to a file holding one JSON document per line.

    Returns:
        A list with one parsed object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(filepath, encoding="utf-8") as f:
        # Skip blank/whitespace-only lines (e.g. a trailing empty line), which
        # would otherwise make json.loads raise on an empty document.
        return [json.loads(line) for line in f if line.strip()]

# Parse the raw records of each split from the JSONL files.
train_data = load_jsonl("/content/train_filtered.jsonl")
valid_data = load_jsonl("/content/valid_filtered.jsonl")

# Turn each list of records into a Dataset (train first, then validation).
train_dataset, valid_dataset = map(Dataset.from_list, (train_data, valid_data))

# Bundle both splits under the names used by the later cells.
dataset = DatasetDict({"train": train_dataset, "validation": valid_dataset})


In [None]:
def tokenize(sample):
    """Tokenize the ``text`` field of a batch, truncating to 1024 tokens.

    No padding is applied here: the DataCollatorForLanguageModeling used by the
    Trainer pads each batch to its own longest sequence, so padding every
    sample to 1024 tokens up front only adds pad tokens to compute over.

    Args:
        sample: A batch from ``dataset.map(..., batched=True)`` with a ``text`` key.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(sample["text"], truncation=True, max_length=1024)

# Tokenize every split of the DatasetDict in batches.
tokenized_dataset = dataset.map(tokenize, batched=True)

In [None]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

# Batch collator for language modelling; mlm=False turns off masked-LM masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    # Checkpoint location and retention.
    output_dir="./polyglot-lora",
    save_steps=1000,
    save_total_limit=2,
    # Optimisation settings.
    num_train_epochs=3,
    learning_rate=2e-4,
    per_device_train_batch_size=4,
    fp16=True,
    # Step-based evaluation and logging.
    eval_strategy="steps",
    eval_steps=500,
    per_device_eval_batch_size=4,
    logging_steps=100,
    report_to="none",
)

# NOTE(review): newer transformers releases appear to deprecate the `tokenizer`
# argument of Trainer in favour of `processing_class` — confirm against the
# installed version before changing it.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    tokenizer=tokenizer,
)

In [None]:
# Run fine-tuning with the Trainer built in the previous cell; the call's
# return value is shown as this cell's output.
trainer.train()

In [None]:
# Save the fine-tuned model (the object returned by get_peft_model) and the
# tokenizer into the same directory so they can be archived together.
model.save_pretrained("/content/polyglot-ko-lora")
tokenizer.save_pretrained("/content/polyglot-ko-lora")

In [None]:
from google.colab import files
!zip -r polyglot-ko-lora.zip polyglot-ko-lora
files.download("polyglot-ko-lora.zip")