In [1]:
!pip install --upgrade pip

Collecting pip
  Downloading pip-25.0.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m28.6 MB/s[0m eta [36m0:00:00[0m [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.3.1
    Uninstalling pip-23.3.1:
      Successfully uninstalled pip-23.3.1
Successfully installed pip-25.0.1
[0m

In [2]:
!pip install pandas 
!pip install -q U datasets==3.2.0
!pip install -q U transformers==4.47.0
!pip install -U trl peft bitsandbytes accelerate

Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.1/13.1 MB[0m [31m215.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pytz-2025.1-py2.py3-none-any.whl (507 kB)
Downloading tzdata-2025.1-py2.py3-none-any.whl (346 kB)
Installing collected packages: pytz, tzdata, pandas
Successfully installed pandas-2.2.3 pytz-2025.1 tzdata-2025.1
[0mCollecting trl
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.3-py3-none-manyl

# 라이브러리 로드

In [3]:
import os
import gc
import torch
import pandas as pd
from tqdm.auto import tqdm
from datasets import Dataset
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    TrainingArguments,
    TrainerCallback,
    pipeline,
)
from trl import SFTTrainer, SFTConfig

In [4]:
!pip list | grep -E 'torch|pandas|tqdm|accelerate|datasets|peft|transformers|trl|bitsandbytes'

accelerate                        1.4.0
bitsandbytes                      0.45.3
datasets                          3.2.0
pandas                            2.2.3
peft                              0.14.0
torch                             2.1.0+cu118
torchaudio                        2.1.0+cu118
torchvision                       0.16.0+cu118
tqdm                              4.67.1
transformers                      4.47.0
trl                               0.15.2


# 학습 진행

In [2]:
# === 4bit 양자화 설정 ===
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

# === 모델 병렬 로드 ===
model_id = 'rtzr/ko-gemma-2-9b-it'
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config = bnb_config, device_map='auto')
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'

# === LoRA 구성 ===
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules = [
    "q_proj", "v_proj", "k_proj", "o_proj", 
    "gate_proj", "down_proj", "up_proj"
    ],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

def load_data(train_path, test_path):
    train_df = pd.read_csv(train_path, encoding='utf-8-sig')
    test_df = pd.read_csv(test_path, encoding='utf-8-sig')
    return train_df, test_df

# 학습을 위한 prompt를 생성
def generate_prompts(examples):
    prompt_list=[]
    for input, output in zip(examples["input"], examples["output"]):
        prompt_list.append(
            f"""<bos><start_of_turn>user
            Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning.
            Input: {input}
            <end_of_turn>
            <start_of_turn>model
            {output}<end_of_turn><eos>"""
        )
    return prompt_list

train_path = 'train.csv'
test_path = 'test.csv'

train, test = load_data(train_path, test_path)
dataset = Dataset.from_pandas(train)
train_test_split = dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = train_test_split['train']
eval_dataset = train_test_split['test']

model = get_peft_model(model, lora_config)

# 모델을 훈련 모드로 설정
model.train()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# === 학습 설정 ===
training_args = SFTConfig(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=16,
    optim="paged_adamw_32bit",
    eval_strategy="steps",
    eval_steps=100, # 모델의 평가 주기
    logging_dir="./logs",
    logging_steps=100,
    warmup_steps=10, # 학습률 스케줄링
    logging_strategy="steps",
    learning_rate=2e-4,
    group_by_length=True, 
    fp16=True,
    max_seq_length=512,
)

# === 트레이너 구성 ===
trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=lora_config,
    args=training_args,
    formatting_func=generate_prompts,
)

# === 콜백 추가 및 학습 실행 ===
trainer.train()

# === 모델 저장 ===
trainer.model.save_pretrained("lora_adapter_9b")

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Applying formatting function to train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Converting train dataset to ChatML:   0%|          | 0/10136 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/10136 [00:00<?, ? examples/s]

Applying formatting function to eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

Converting eval dataset to ChatML:   0%|          | 0/1127 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/1127 [00:00<?, ? examples/s]

Step,Training Loss,Validation Loss
100,34.8963,2.082212
200,27.1436,1.696712
300,24.6844,1.579264
400,24.4502,1.46387
500,22.3202,1.429295
600,21.8884,1.399289
700,20.7688,1.394777
800,19.6952,1.379235
900,19.563,1.374995


Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
The 'batch_size' argument of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'max_batch_size' argument instead.
The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer.tokenizer is now deprecated. You should use Trainer.processing_class instead.
Trainer

# 모델 병합 후 저장

In [3]:
BASE_MODEL = "rtzr/ko-gemma-2-9b-it"
ADAPTER_MODEL = "lora_adapter_9b"

model = AutoModelForCausalLM.from_pretrained(BASE_MODEL, device_map='auto', torch_dtype=torch.float16)
model = PeftModel.from_pretrained(model, ADAPTER_MODEL, device_map='auto', torch_dtype=torch.float16)
model.save_pretrained('gemma2_9b_finetuning')

Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Some parameters are on the meta device because they were offloaded to the cpu.


# 추론 진행

In [2]:
def load_data(train_path, test_path):
    train_df = pd.read_csv(train_path, encoding='utf-8-sig')
    test_df = pd.read_csv(test_path, encoding='utf-8-sig')
    return train_df, test_df

train_path = 'train.csv'
test_path = 'test.csv'

train, test = load_data(train_path, test_path)

# === 모델 & 토크나이저 로드 ===
BASE_MODEL = "rtzr/ko-gemma-2-9b-it"
FINETUNE_MODEL = "gemma2_9b_finetuning"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
model = AutoModelForCausalLM.from_pretrained(FINETUNE_MODEL, device_map='auto')
model.eval()

# === 추론 파이프라인 설정 ===
text_gen_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

# === 추론 실행 ===
restored_reviews = []

def generate_inference_prompt(query):
    prompt = f"""<bos><start_of_turn>user
            Your task is to transform the given obfuscated Korean review into a clear, correct, and natural-sounding Korean review that reflects its original meaning.
            Input: {query}
            <end_of_turn>
            <start_of_turn>model
            """
    return prompt
    
for index, row in tqdm(test.iterrows(), total=len(test)):
    query = row['input']
    prompt = generate_inference_prompt(query)

    generated = text_gen_pipeline(
        prompt,
        num_return_sequences=1,
        temperature=0.2,
        top_p=0.9,
        max_new_tokens=len(query),
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id
    )
    
    generated_text = generated[0]['generated_text']
    output_start = generated_text.find("<start_of_turn>model")
    
    if output_start != -1:
        output_text = generated_text[output_start + len("<start_of_turn>model"):].strip()
    else:
        output_text = generated_text.strip()
        
    restored_reviews.append(output_text)


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

Device set to use cuda:0


  0%|          | 0/1689 [00:00<?, ?it/s]

The 'batch_size' attribute of HybridCache is deprecated and will be removed in v4.49. Use the more precisely named 'self.max_batch_size' attribute instead.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [3]:
# === 결과 저장 ===
submission_path = 'sample_submission.csv'
submission = pd.read_csv(submission_path, encoding='utf-8-sig')
submission['output'] = restored_reviews
submission['output'] = submission['output'].apply(lambda x: x.split("<end_of_turn>")[0])
submission.to_csv('./submission.csv', index=False, encoding='utf-8-sig')

In [4]:
submission

Unnamed: 0,ID,output
0,TEST_0000,너무너무 만족스러운 호텔이에요. 부산에 오면 꼭 추천하고 싶은 곳이에요. 최고입니다...
1,TEST_0001,"프론트가 없고, 조식도 없으며, 일반 입주민들이 사이트임에 있어 호텔처럼 관리가 잘..."
2,TEST_0002,진짜 불친절해요. 살면서 머물렀던 호텔 중에 최악이었습니다. 직원인지 사장인지 체크...
3,TEST_0003,뷰 맛집~~ 그런데 방음이 미흡하네요. 층간 소음과 발코니가 이중창이 아니라서 밤에...
4,TEST_0004,방 상태는 진짜 폐허 직전인데 전망은 좋아요. 보일러가 아주 찬찬하게 돌아서 추웠어...
...,...,...
1684,TEST_1684,"일반실 5만원이고, 프리미엄 6만원짜리는 스타일러스, 안마기도 있고, PC도 있지만..."
1685,TEST_1685,"일단 방은 이쁘고, 배달 음식은 꿀지였어서 (외인지는 이미지..) 룸서비스랑 조식으..."
1686,TEST_1686,"엔만하면 다시 안 올 예정입니다. 겁을 웨핑이 있습니다. 싫고. 이건 뭐, 관리할 ..."
1687,TEST_1687,도착 후 급게 기대하지 않았는데 직원분들이 친절하시고 2박 머무는 동안 매일 객실도...
