In [None]:
# ==== 0) 기본 셋업 & 버전 확인 ====================================================
!pip -q install -U bitsandbytes trl rouge_score

import os, random, numpy as np, torch, logging, gc, re, pandas as pd
from datasets import load_dataset, Dataset
from typing import Any, List, Dict
from dataclasses import dataclass, field

from transformers import (
    AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainerCallback
)
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Global runtime setup: INFO-level timestamped logging, tokenizers parallelism
# switched off through its environment variable, "high" float32 matmul precision.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
os.environ["TOKENIZERS_PARALLELISM"] = "false"
torch.set_float32_matmul_precision("high")

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with one shared value.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Report the library / hardware versions this run executes on.
print("PyTorch:", torch.__version__)
if torch.cuda.is_available():
    print("CUDA:", torch.version.cuda, "| GPU:", torch.cuda.get_device_name(0))

PyTorch: 2.8.0+cu126
CUDA: 12.6 | GPU: NVIDIA A100-SXM4-40GB


In [None]:
# ==== 1) 경로/하이퍼파라미터 ======================================================
# --- Model and storage locations ---------------------------------------------
BASE_MODEL_ID = "LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct"
PATH_DATA_SFT_JSON = "/content/drive/MyDrive/sejong_dataset.json"  # <-- verify the actual path
BASE_OUTPUT_DIR = "/content/drive/MyDrive/data/test/experiments_lora"

# --- LoRA adapter hyperparameters ---------------------------------------------
LORA_R = 1
LORA_ALPHA = 2
LORA_DROPOUT = 0.1
LORA_TARGET_MODULES = ["q_proj", "k_proj", "v_proj", "o_proj"]

# --- Optimisation / data hyperparameters --------------------------------------
TRAIN_DATA_RATIO = 0.9
LEARNING_RATE = 3e-4
NUM_TRAIN_EPOCHS = 10
MAX_SEQ_LENGTH = 512
BATCH_SIZE = 4

# The experiment name encodes the tunable LoRA settings and the learning rate,
# so every configuration writes into its own sub-directory of BASE_OUTPUT_DIR.
experiment_name = "_".join(
    [f"r{LORA_R}", f"a{LORA_ALPHA}", f"do{LORA_DROPOUT}", f"lr{LEARNING_RATE}"]
)
OUTPUT_DIR = os.path.join(BASE_OUTPUT_DIR, experiment_name)
ADAPTER_PATH = OUTPUT_DIR + "/final_adapter"
MERGED_MODEL_PATH = OUTPUT_DIR + "/sft_tuned_model"

os.makedirs(OUTPUT_DIR, exist_ok=True)
print("Output:", OUTPUT_DIR)

Output: /content/drive/MyDrive/data/test/experiments_lora/r1_a2_do0.1_lr0.0003


In [None]:
# ==== 2) 데이터 로딩 & 포맷 ("### 응답:\\n" 기준) ====================================
# Fail fast when the SFT JSON file is missing, then load it as one "train" split.
assert os.path.exists(PATH_DATA_SFT_JSON), f"데이터 파일이 없습니다: {PATH_DATA_SFT_JSON}"
dataset_sft_raw = load_dataset("json", split="train", data_files=PATH_DATA_SFT_JSON)

# Hold out the (1 - TRAIN_DATA_RATIO) share of rows; the split is seeded with SEED.
split_dataset = dataset_sft_raw.train_test_split(seed=SEED, test_size=1-TRAIN_DATA_RATIO)
train_raw, test_raw = split_dataset["train"], split_dataset["test"]

def _clean_row(ex):
    """Normalize the three SFT text fields of one record to stripped strings.

    For each of ``instruction``, ``input`` and ``output`` the value is replaced
    by its whitespace-stripped string form. A missing key or a ``None`` value
    becomes the empty string. Other falsy values (``0``, ``False``) are kept as
    their string form instead of being erased, so a numeric answer of ``0``
    survives cleaning.

    Args:
        ex: Mapping for one dataset row; it is updated in place.

    Returns:
        The same mapping, with the three fields normalized.
    """
    for k in ("instruction", "input", "output"):
        value = ex.get(k)
        # Only None / missing means "no value"; `value or ""` would also drop 0.
        ex[k] = "" if value is None else str(value).strip()
    return ex

# Apply the same field normalization to the train and the test split.
train_raw, test_raw = (part.map(_clean_row) for part in (train_raw, test_raw))

def format_sft(example):
    """Render one record into the single-string SFT training layout.

    The result is the instruction section, an optional input section (emitted
    only when ``input`` is non-empty) and the response section, separated by
    blank lines.

    Args:
        example: Mapping with ``instruction``, ``input`` and ``output`` keys.

    Returns:
        ``{"text": <formatted string>}``.
    """
    instruction = example["instruction"]
    extra_input = example["input"]
    answer = example["output"]

    sections = [f"### 지시문:\n{instruction}"]
    if extra_input:
        sections.append(f"### 입력:\n{extra_input}")
    sections.append(f"### 응답:\n{answer}")
    return {"text": "\n\n".join(sections)}

# Replace every original column with the single formatted "text" column.
sft_train_dataset, sft_test_dataset = (
    part.map(format_sft, remove_columns=part.column_names)
    for part in (train_raw, test_raw)
)

print(f"train: {len(sft_train_dataset)}, test: {len(sft_test_dataset)}")

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2502 [00:00<?, ? examples/s]

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

Map:   0%|          | 0/2502 [00:00<?, ? examples/s]

Map:   0%|          | 0/278 [00:00<?, ? examples/s]

train: 2502, test: 278


In [None]:
# ==== 3) 평가 콜백 (left padding + bool mask) =======================================
class EvaluateAtEpochEndCallback(TrainerCallback):
    """Trainer callback that scores greedy generations on held-out rows at epoch end.

    On every ``eval_every_n_epochs``-th epoch a seeded random subset of
    ``test_dataset`` (at most ``max_eval_samples`` rows) is turned into prompts,
    answered with greedy decoding, and compared with the reference answer that
    follows the ``"### 응답:\\n"`` marker in each row's ``text``. One summary
    dict per evaluated epoch is appended to ``self.results`` with the keys
    ``Epoch``, ``Accuracy``, ``Token F1``, ``ROUGE-L`` and ``BLEU-4``.

    NOTE(review): prompts are wrapped with the tokenizer's chat template, while
    the ``text`` rows themselves use the plain "### 지시문:" layout. Confirm that
    evaluating with a different prompt layout than the rows contain is intended.

    Args:
        test_dataset: Dataset whose rows have a ``text`` field in the SFT layout.
        tokenizer: Tokenizer used for prompt building, encoding and decoding.
            If it has no pad token but has an EOS token, EOS is set as the pad token.
        max_input_len: Truncation length applied to encoded prompts.
        max_new_tokens: Generation budget per prompt.
        batch_size: Number of prompts generated per ``generate`` call.
        max_eval_samples: Cap on evaluated rows per epoch (falsy = use all rows).
        eval_every_n_epochs: Evaluate only on epochs divisible by this value
            (falsy = evaluate on every epoch).
        compute_rouge_bleu: When true, also compute ROUGE-L and BLEU-4;
            otherwise those two metrics are reported as NaN.
        seed: Base seed for the per-epoch subsample (``seed + epoch``).
    """

    _RESPONSE_MARKER = "### 응답:\n"

    def __init__(
        self,
        test_dataset: Dataset,
        tokenizer,
        max_input_len=512,
        max_new_tokens=128,
        batch_size=16,
        max_eval_samples=64,          # cap on evaluated samples per epoch
        eval_every_n_epochs=2,        # evaluate only every N epochs
        compute_rouge_bleu=False,     # skip the heavier metrics by default
        seed=42,
    ):
        super().__init__()
        self.test_dataset = test_dataset
        self.tokenizer = tokenizer
        self.results = []
        self.max_input_len = max_input_len
        self.max_new_tokens = max_new_tokens
        self.batch_size = batch_size
        self.max_eval_samples = max_eval_samples
        self.eval_every_n_epochs = eval_every_n_epochs
        self.compute_rouge_bleu = compute_rouge_bleu
        self.seed = seed

        # Raw strings take single backslashes: r"\\s+" would match a literal
        # backslash followed by "s", and an unescaped "]" would end the
        # punctuation class early.
        self._punc_regex = re.compile(r"""["'`.,!?;:()\[\]{}<>~\-_=+/\\|@#$%^&]""")
        self._multi_space = re.compile(r"\s+")
        self._rouge = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=False)
        self._smooth = SmoothingFunction().method1

        if self.tokenizer.pad_token_id is None and self.tokenizer.eos_token_id is not None:
            self.tokenizer.pad_token = self.tokenizer.eos_token

    def _normalize(self, s: str) -> str:
        """Lower-case, replace punctuation with spaces and collapse whitespace."""
        return self._multi_space.sub(" ", self._punc_regex.sub(" ", str(s).strip().lower())).strip()

    def _build_prompt(self, row_text: str) -> str:
        """Build the generation prompt from everything before '### 응답:' in a row."""
        cut = row_text.split("### 응답:")[0].strip()
        content = cut.replace("### 지시문:\n", "").replace("### 입력:\n", "\n\n")
        msgs = [{"role": "user", "content": content}]
        return self.tokenizer.apply_chat_template(msgs, tokenize=False, add_generation_prompt=True)

    def _generate_answers(self, model, prompts):
        """Greedy-decode one answer per prompt using left-padded batches."""
        tok = self.tokenizer
        # Prefer the device the model actually lives on; fall back to the old heuristic.
        device = getattr(model, "device", None)
        if device is None:
            device = "cuda" if torch.cuda.is_available() else "cpu"

        answers = []
        previous_side = tok.padding_side
        tok.padding_side = "left"
        try:
            for start in range(0, len(prompts), self.batch_size):
                enc = tok(
                    prompts[start:start + self.batch_size],
                    return_tensors="pt",
                    padding=True,
                    truncation=True,
                    max_length=self.max_input_len,
                )
                enc["attention_mask"] = enc["attention_mask"].to(torch.bool)
                enc = {k: v.to(device) for k, v in enc.items()}
                gen = model.generate(
                    **enc,
                    max_new_tokens=self.max_new_tokens,
                    do_sample=False,
                    pad_token_id=tok.pad_token_id,
                    eos_token_id=tok.eos_token_id,
                    use_cache=True,
                )
                # With left padding every row's prompt ends at the padded input
                # width, so new tokens start there for all rows. Slicing at the
                # per-row count of non-pad tokens would leak the prompt tail
                # into the answer of every padded row.
                prompt_width = enc["input_ids"].shape[1]
                for row in gen:
                    answers.append(tok.decode(row[prompt_width:], skip_special_tokens=True).strip())
        finally:
            tok.padding_side = previous_side
        return answers

    def _score_pair(self, ref: str, hyp: str):
        """Return (accuracy, token F1, ROUGE-L, BLEU-4) for one normalized pair."""
        from collections import Counter  # local import: not among the notebook's top-level imports

        ref_tok, hyp_tok = ref.split(), hyp.split()
        acc = 1.0 if ref == hyp else 0.0

        # Multiset overlap, so repeated tokens are counted consistently with the
        # list-length denominators used for precision and recall.
        common = sum((Counter(ref_tok) & Counter(hyp_tok)).values())
        prec = common / len(hyp_tok) if hyp_tok else 0.0
        rec = common / len(ref_tok) if ref_tok else 0.0
        f1 = 2 * prec * rec / (prec + rec) if (prec + rec) > 0 else 0.0

        if self.compute_rouge_bleu:
            rougeL = float(self._rouge.score(ref, hyp)["rougeL"].fmeasure)
            if ref_tok and hyp_tok:
                bleu4 = float(sentence_bleu([ref_tok], hyp_tok, smoothing_function=self._smooth))
            else:
                bleu4 = 0.0
        else:
            rougeL = float("nan")
            bleu4 = float("nan")
        return acc, f1, rougeL, bleu4

    @staticmethod
    def _nan_safe_mean(values) -> float:
        """Mean over the non-NaN entries; NaN (without a warning) when none exist."""
        arr = np.asarray(values, dtype=float)
        valid = arr[~np.isnan(arr)]
        return float(valid.mean()) if valid.size else float("nan")

    @torch.no_grad()
    def on_epoch_end(self, args, state, control, **kwargs):
        """Run the generation-based evaluation and record one summary row."""
        model = kwargs["model"]
        # round() guards against an epoch counter that lands just below an integer.
        epoch = int(round(state.epoch))

        # Evaluation frequency control (a falsy interval means "every epoch").
        if self.eval_every_n_epochs and (epoch % self.eval_every_n_epochs) != 0:
            logging.info(f"--- Epoch {epoch} 평가 스킵 (every {self.eval_every_n_epochs} epochs) ---")
            return control

        # Evaluate only a seeded random subset; the subset changes with the epoch.
        ds = self.test_dataset
        if self.max_eval_samples and self.max_eval_samples < len(ds):
            rng = np.random.RandomState(self.seed + epoch)
            idx = rng.choice(len(ds), size=self.max_eval_samples, replace=False)
            ds = ds.select(idx.tolist())

        logging.info(f"--- Epoch {epoch} 평가 시작 (samples={len(ds)}) ---")

        texts = [row["text"] for row in ds]
        was_training = model.training
        model.eval()
        try:
            prompts = [self._build_prompt(text) for text in texts]
            answers = self._generate_answers(model, prompts)
        finally:
            # Restore train mode even when prompt building or generation fails.
            if was_training:
                model.train()

        scores = {'acc': [], 'f1': [], 'rougeL': [], 'bleu4': []}
        for i, text in enumerate(texts):
            # partition() yields an empty reference instead of raising when the
            # exact marker (including its newline) is absent.
            _, marker, tail = text.partition(self._RESPONSE_MARKER)
            ref = self._normalize(tail if marker else "")
            hyp = self._normalize(answers[i] if i < len(answers) else "")
            acc, f1, rougeL, bleu4 = self._score_pair(ref, hyp)
            scores['acc'].append(acc)
            scores['f1'].append(f1)
            scores['rougeL'].append(rougeL)
            scores['bleu4'].append(bleu4)

        self.results.append({
            "Epoch": epoch,
            "Accuracy": self._nan_safe_mean(scores['acc']),
            "Token F1": self._nan_safe_mean(scores['f1']),
            "ROUGE-L":  self._nan_safe_mean(scores['rougeL']),
            "BLEU-4":   self._nan_safe_mean(scores['bleu4']),
        })
        logging.info(f"--- Epoch {epoch} 평가 완료 (samples={len(ds)}) ---")
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
        return control

In [None]:
# ==== 4) Completion-only Collator (응답 이후만 라벨) ================================
@dataclass
class CompletionOnlyCollator:
    tokenizer: Any = field(repr=False)
    response_template: str = "### 응답:\n"
    pad_to_multiple_of: int | None = 8

    def __post_init__(self):
        self.template_ids: List[int] = self.tokenizer.encode(self.response_template, add_special_tokens=False)

    def _find_subseq(self, seq: List[int], subseq: List[int]) -> int:
        if not subseq: return -1
        Ls, Lt = len(seq), len(subseq)
        for i in range(0, Ls-Lt+1):
            if seq[i:i+Lt] == subseq: return i
        return -1

    def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, torch.Tensor]:
        if "input_ids" not in features[0]:
            texts = [f["text"] for f in features]
            batch = self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt", return_attention_mask=True)
        else:
            input_ids = [torch.tensor(f["input_ids"], dtype=torch.long) for f in features]
            if "attention_mask" in features[0]:
                attention = [torch.tensor(f["attention_mask"], dtype=torch.bool) for f in features]
            else:
                attention = [torch.ones(len(ids), dtype=torch.bool) for ids in input_ids]
            batch = self.tokenizer.pad(
                {"input_ids": input_ids, "attention_mask": attention},
                padding=True, pad_to_multiple_of=self.pad_to_multiple_of, return_tensors="pt"
            )

        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"].to(torch.bool)
        labels = input_ids.clone()
        pad_id = self.tokenizer.pad_token_id
        if pad_id is not None:
            labels[input_ids == pad_id] = -100

        tmpl = self.template_ids
        for i in range(input_ids.size(0)):
            seq = input_ids[i].tolist()
            start = self._find_subseq(seq, tmpl)
            if start != -1:
                cut = start + len(tmpl)
                labels[i, :cut] = -100
            else:
                labels[i, :] = -100  # 템플릿 없으면 무시

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


In [None]:
# 1) bnb 환경변수 정리 (잘못된 힌트 제거)
import os
os.environ.pop("BNB_CUDA_VERSION", None)  # ← 이전에 "126" 강제했던 값 제거

# 2) bnb 최신 설치/갱신 (캐시 없이 깔끔히)
!pip -q install -U --no-cache-dir bitsandbytes

# 3) 로드 확인 + CUDA 핸들 점검
import bitsandbytes as bnb
print("bitsandbytes:", bnb.__version__)
try:
    from bitsandbytes.cuda_setup.main import get_compute_capabilities, get_cuda_lib_handle
    print("SM capability:", get_compute_capabilities())
    print("CUDA lib handle:", get_cuda_lib_handle())   # None가 아니어야 정상
except Exception as e:
    print("[!] bnb cuda setup warn:", e)


bitsandbytes: 0.47.0
[!] bnb cuda setup warn: No module named 'bitsandbytes.cuda_setup'


In [None]:
# ==== 5) 모델/토크나이저 로드 (bnb 연산 float32 + SDPA math/eager) =================
# 4-bit NF4 quantization; the compute dtype is float32 so that internal math
# runs in a single precision throughout.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float32,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Tokenizer: reuse EOS as the pad token when none is defined; right-pad for training.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tokenizer.pad_token_id is None:
    if tokenizer.eos_token_id is not None:
        tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Quantized base model, loaded with float32 as the non-quantized dtype.
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float32,
    device_map="auto",
    quantization_config=bnb_config,
)
model.config.use_cache = False

# Turn off the flash and memory-efficient SDPA kernels and keep the math kernel.
torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)

# Best-effort request for eager attention on the config object.
# NOTE(review): this assignment happens after the model is built; confirm it
# actually changes the attention implementation in use.
try:
    model.config.attn_implementation = "eager"
except Exception:
    pass

ImportError: Using `bitsandbytes` 4-bit quantization requires the latest version of bitsandbytes: `pip install -U bitsandbytes`

In [None]:
# ==== 6) LoRA settings (injected by SFTTrainer only; do not inject twice) ===========
# Rank, scaling, dropout and target modules all come from the hyperparameter cell.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=LORA_TARGET_MODULES,
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
)

# ==== 7) Tokenization (pre-processing) & collator ===================================
def tok_fn(batch):
    """Tokenize a batch of formatted SFT texts, ending each with the EOS token.

    The rows are tokenized here and handed to the trainer as token ids, so
    nothing else in this cell adds an end-of-sequence marker. Without one the
    model gets no stop signal after an answer. EOS is appended to the text
    before truncation, so an answer cut off at ``MAX_SEQ_LENGTH`` is not
    marked as finished.

    Args:
        batch: Batched mapping with a ``text`` list.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``), unpadded
        and truncated to ``MAX_SEQ_LENGTH``.
    """
    eos = tokenizer.eos_token or ""
    texts = [
        text if (not eos or text.endswith(eos)) else text + eos
        for text in batch["text"]
    ]
    return tokenizer(texts, truncation=True, max_length=MAX_SEQ_LENGTH, padding=False, return_attention_mask=True)

# Pre-tokenize the training split; padding and label masking happen in the collator.
tokenized_train = sft_train_dataset.map(tok_fn, batched=True, remove_columns=sft_train_dataset.column_names)
collator = CompletionOnlyCollator(tokenizer=tokenizer, response_template="### 응답:\n")

# ==== 8) SFT configuration + evaluation callback ====================================
# All training arguments are given in one place, including the checkpoint
# retention limit and the non-reentrant gradient-checkpointing option.
sft_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    report_to="none",
    # schedule
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # logging / checkpoints
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=2,
    # precision: mixed precision is disabled so everything runs in float32
    fp16=False,
    bf16=False,
    # memory
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
)

# Generation-based evaluation: short generations, large batches, a capped
# sample count, every second epoch, without the heavier ROUGE/BLEU metrics.
evaluation_callback = EvaluateAtEpochEndCallback(
    tokenizer=tokenizer,
    test_dataset=sft_test_dataset,
    max_input_len=MAX_SEQ_LENGTH,
    max_new_tokens=128,
    batch_size=16,
    max_eval_samples=64,
    eval_every_n_epochs=2,
    compute_rouge_bleu=False,
)


# Build the trainer. The tokenizer loaded earlier is passed as
# `processing_class` so the trainer does not load its own copy from the hub.
# The separate load is presumably what triggers the interactive
# "Do you wish to run the custom code?" prompt recorded in this cell's output,
# which blocks an unattended Run All.
trainer = SFTTrainer(
    model=model,
    args=sft_args,
    train_dataset=tokenized_train,
    processing_class=tokenizer,
    peft_config=peft_config,
    data_collator=collator,
    callbacks=[evaluation_callback],
)

logging.info(f"--- QA 데이터 {len(sft_train_dataset)}개로 모델 학습 시작 ---")
trainer.train()
logging.info("--- 모델 학습 완료 ---")

# Save the trained adapter together with the tokenizer files.
trainer.model.save_pretrained(ADAPTER_PATH)
tokenizer.save_pretrained(ADAPTER_PATH)
print(f"\n✅ 어댑터 저장 완료 → {ADAPTER_PATH}")

Map:   0%|          | 0/2502 [00:00<?, ? examples/s]

The repository LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct contains custom code which must be executed to correctly load the model. You can inspect the repository content at https://hf.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct .
 You can inspect the repository content at https://hf.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Truncating train dataset:   0%|          | 0/2502 [00:00<?, ? examples/s]

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.7871
20,2.897
30,2.825
40,2.7638
50,2.72
60,2.5738
70,2.4526
80,2.5066
90,2.3615
100,2.2554



✅ 어댑터 저장 완료 → /content/drive/MyDrive/data/test/experiments_lora/r1_a1_do0.05_lr0.0003/final_adapter


In [None]:
# Mount Google Drive at /content/drive; the dataset and output paths configured
# in this notebook point below that directory.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, os

# Destination directory for the merged (base + adapter) checkpoint.
MERGED_MODEL_PATH = OUTPUT_DIR + "/sft_tuned_model"
os.makedirs(MERGED_MODEL_PATH, exist_ok=True)

# 1) Load the base model. float16 is used to save memory: smaller files on disk
#    and faster inference (bfloat16 is also fine on an A100).
base = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_ID,
    trust_remote_code=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
tok = AutoTokenizer.from_pretrained(BASE_MODEL_ID, trust_remote_code=True)
if tok.pad_token_id is None:
    if tok.eos_token_id is not None:
        tok.pad_token = tok.eos_token

# 2) Attach the saved LoRA adapter and merge it into the base weights
#    (merge_and_unload is the step that performs the merge).
merged = PeftModel.from_pretrained(base, ADAPTER_PATH).merge_and_unload()

# 3) Write the merged weights (safetensors format) and the tokenizer files.
merged.save_pretrained(MERGED_MODEL_PATH, safe_serialization=True)
tok.save_pretrained(MERGED_MODEL_PATH)

print("✅ 병합 모델 저장:", MERGED_MODEL_PATH)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

configuration_exaone.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct:
- configuration_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`torch_dtype` is deprecated! Use `dtype` instead!


modeling_exaone.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/LGAI-EXAONE/EXAONE-3.5-2.4B-Instruct:
- modeling_exaone.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.65G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/134 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/563 [00:00<?, ?B/s]

✅ 병합 모델 저장: /content/drive/MyDrive/data/test/experiments_lora/r1_a1_do0.05_lr0.0003/sft_tuned_model
