In [None]:

import pandas as pd
from datasets import Dataset

# Read the excuse corpus from CSV and wrap it as a Hugging Face Dataset,
# which is what the tokenization / training cells below consume.
df = pd.read_csv("Excuses.csv")
dataset = Dataset.from_pandas(df)

In [None]:
!pip install -U transformers


Collecting transformers
  Downloading transformers-4.52.4-py3-none-any.whl.metadata (38 kB)
Downloading transformers-4.52.4-py3-none-any.whl (10.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.5/10.5 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.52.3
    Uninstalling transformers-4.52.3:
      Successfully uninstalled transformers-4.52.3
Successfully installed transformers-4.52.4


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

model_name = "gpt2"

# Tokenizer: reuse the EOS token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Base causal-LM; its config is told which id is used for padding.
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

# Apply LoRA: wrap the base model with low-rank adapters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=4,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)
model = get_peft_model(model, lora_config)


def format_prompt(example, max_length=256):
    """Turn one 5W1H row into a fixed-length causal-LM training example.

    The row is rendered as a ``[SCENARIO]`` header followed by the six
    Who/What/When/Where/Why/How fields and an ``[EXCUSE]`` marker; the
    target excuse (``example["text"]``) plus an EOS token is appended.

    Args:
        example: Mapping with the keys ``who``, ``what``, ``when``,
            ``where``, ``why``, ``how`` and ``text``.
        max_length: Length every example is padded / truncated to.

    Returns:
        The tokenizer output for the full text with an added ``labels``
        list of length ``max_length``. Labels are ``-100`` (ignored by the
        loss) on the prompt and on padding, and the token id elsewhere.

    Relies on the module-level ``tokenizer``.
    """
    prompt = (
        f"[SCENARIO]\n"
        f"Who: {example['who']}\n"
        f"What: {example['what']}\n"
        f"When: {example['when']}\n"
        f"Where: {example['where']}\n"
        f"Why: {example['why']}\n"
        f"How: {example['how']}\n"

        f"[EXCUSE]\n"
    )
    # Explicit EOS so the model sees a single end-of-excuse target. It is a
    # real (attended) token, so it survives the padding mask below.
    full = prompt + example["text"] + tokenizer.eos_token
    tokenized = tokenizer(full, padding="max_length", max_length=max_length, truncation=True)

    # Number of leading positions that belong to the prompt.
    prompt_len = len(tokenizer(prompt, truncation=True, max_length=max_length)["input_ids"])

    # Supervise only the excuse tokens. pad_token == eos_token here, so
    # padding must be identified through attention_mask rather than by id;
    # otherwise the loss is dominated by predicting padding.
    labels = [
        token_id if (position >= prompt_len and attended == 1) else -100
        for position, (token_id, attended) in enumerate(
            zip(tokenized["input_ids"], tokenized["attention_mask"])
        )
    ]
    tokenized["labels"] = labels
    return tokenized

# Tokenize every row. The raw text columns are dropped so the training set
# contains only what format_prompt returns (input ids, mask, labels).
tokenized = dataset.map(format_prompt, remove_columns=dataset.column_names)

# Trainer setup
import torch
from transformers import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir="./gpt2-lora-w5h1",
    per_device_train_batch_size=2,
    num_train_epochs=5,
    logging_steps=1,
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none",
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU instead
    # of failing when the arguments are validated.
    fp16=torch.cuda.is_available(),
    # PeftModel hides the base model's forward signature, so Trainer cannot
    # infer the label column on its own; name it explicitly.
    label_names=["labels"],
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized,
    # `tokenizer=` is deprecated on Trainer; `processing_class` replaces it.
    processing_class=tokenizer,
)

trainer.train()

# Persist the LoRA adapter weights and the tokenizer side by side.
model.save_pretrained("./gpt2-lora-w5h1_2")
tokenizer.save_pretrained("./gpt2-lora-w5h1_2")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Map:   0%|          | 0/500 [00:00<?, ? examples/s]

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
1,8.6257
2,7.7788
3,7.744
4,7.9981
5,7.6475
6,8.0487
7,8.0632
8,7.7412
9,8.1154
10,8.1849


Step,Training Loss
1,8.6257
2,7.7788
3,7.744
4,7.9981
5,7.6475
6,8.0487
7,8.0632
8,7.7412
9,8.1154
10,8.1849


('./gpt2-lora-w5h1_2/tokenizer_config.json',
 './gpt2-lora-w5h1_2/special_tokens_map.json',
 './gpt2-lora-w5h1_2/vocab.json',
 './gpt2-lora-w5h1_2/merges.txt',
 './gpt2-lora-w5h1_2/added_tokens.json',
 './gpt2-lora-w5h1_2/tokenizer.json')

In [None]:
import os

from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

# GPT-2 base
base = AutoModelForCausalLM.from_pretrained("gpt2")

# Load the tokenizer + LoRA weights from the Drive path.
model_path = "/content/drive/MyDrive/gpt2-lora-w5h1_2"
if not os.path.isdir(model_path):
    # Drive is only mounted and written to in later cells, so on a fresh
    # top-to-bottom run this directory does not exist yet. Fall back to the
    # local copy saved at the end of the training cell.
    model_path = "./gpt2-lora-w5h1_2"

tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token
base.config.pad_token_id = tokenizer.pad_token_id

model = PeftModel.from_pretrained(base, model_path)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
# Inference only: make sure dropout (including LoRA dropout) is disabled.
model.eval()



# 5W1H prompt. It must use exactly the template format_prompt trained on
# ("[SCENARIO]" header, six fields, "[EXCUSE]" marker); otherwise the
# fine-tuned adapter is queried with a format it never saw.
prompt = (
    "[SCENARIO]\n"
    "Who: my professor\n"
    "What: a final exam\n"
    "When: yesterday morning\n"
    "Where: at the university\n"
    "Why: I was sick\n"
    "How: Would it be possible to reschedule?\n"

    "[EXCUSE]\n"
)

# Tokenize
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate
output = model.generate(
    **inputs,
    max_new_tokens=100,     # budget for generated tokens only; max_length would also count the prompt
    do_sample=True,         # sampling mode (adds randomness)
    temperature=0.9,        # controls creativity
    top_p=0.95,
    num_return_sequences=1,
    pad_token_id=tokenizer.eos_token_id,  # avoids the missing-pad-token warning
)

# Decode
print(tokenizer.decode(output[0], skip_special_tokens=True))

# (author's note, translated) Hmm, the result feels kind of vague;;

Who: my professor
What: a final exam
When: yesterday morning
Where: at the university
Why: I was sick
How: Would it be possible to reschedule?
Excuse: I was sick yesterday morning at the university. Would it be possible to reschedule?


In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem so the cells that read or
# write under /content/drive/MyDrive can reach it.
drive.mount(mountpoint='/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Example: save the LoRA-adapted model and the tokenizer to Google Drive.
# Both go to the same directory, so it can be loaded back as a single path.
drive_save_dir = "/content/drive/MyDrive/gpt2-lora-w5h1_2"
model.save_pretrained(drive_save_dir)
tokenizer.save_pretrained(drive_save_dir)


('/content/drive/MyDrive/gpt2-lora-w5h1_2/tokenizer_config.json',
 '/content/drive/MyDrive/gpt2-lora-w5h1_2/special_tokens_map.json',
 '/content/drive/MyDrive/gpt2-lora-w5h1_2/vocab.json',
 '/content/drive/MyDrive/gpt2-lora-w5h1_2/merges.txt',
 '/content/drive/MyDrive/gpt2-lora-w5h1_2/added_tokens.json',
 '/content/drive/MyDrive/gpt2-lora-w5h1_2/tokenizer.json')