In [1]:
%pip install llmcompressor
%pip install -U llmcompressor transformers

Collecting transformers
  Using cached transformers-5.1.0-py3-none-any.whl.metadata (31 kB)


In [None]:
import os
import torch
import shutil
from pathlib import Path

from google.colab import drive
drive.mount('/content/drive')

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.smoothquant import SmoothQuantModifier
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.modifiers.quantization import GPTQModifier


In [None]:
from decimal import Decimal, ROUND_HALF_UP

MODEL_ID = "/content/drive/MyDrive/LGAimers/base_model"

DATASET_ID = "LGAI-EXAONE/MANTA-1M"
DATASET_SPLIT = "train"

NUM_CALIBRATION_SAMPLES = 512
MAX_SEQUENCE_LENGTH = 512

# SmoothQuant
SMOOTHING_STRENGTH = 0.30

# 0.35 -> 35 형태로 변환
sq_tag = int((Decimal(str(SMOOTHING_STRENGTH)) * 100).quantize(Decimal("1"), rounding=ROUND_HALF_UP))

OUT_DIR = f"/content/drive/MyDrive/LGAimers/sqmodel{sq_tag}"

# Quantization
TARGETS = ["Linear"]
IGNORE  = ["embed_tokens", "lm_head"]
SCHEME = {"W8A8": TARGETS}


In [4]:
print("[INFO] 모델 로드 중...")

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
)

print("[INFO] 모델/토크나이저 로드 완료")

[INFO] 모델 로드 중...


`torch_dtype` is deprecated! Use `dtype` instead!


[INFO] 모델/토크나이저 로드 완료


In [None]:
def collect_modules(model):
    tracked_keys = [
        "q_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "input_layernorm",
        "post_attention_layernorm",
    ]
    mod_index = {key: [] for key in tracked_keys}

    for name, _ in model.named_modules():
        for key in tracked_keys:
            if name.endswith(key):
                mod_index[key].append(name)

    return mod_index


def build_exaone_sq_mappings(mod_index):
    module_counts = {k: len(v) for k, v in mod_index.items()}
    module_samples = {k: v[:5] for k, v in mod_index.items()}

    has_input_ln = module_counts["input_layernorm"] > 0
    has_post_attn_ln = module_counts["post_attention_layernorm"] > 0
    has_gate = module_counts["gate_proj"] > 0
    has_up = module_counts["up_proj"] > 0

    mappings = []
    mode = "disabled"

    if has_post_attn_ln and has_gate and has_up:
        # llmcompressor 호환 포맷: [ [balance_layers(list), smooth_layer(str)] ]
        mappings = [
            [["re:.*gate_proj", "re:.*up_proj"], "re:.*post_attention_layernorm"]
        ]
        mode = "mlp_only"

    reason_parts = []
    if not has_input_ln:
        reason_parts.append("input_layernorm not found -> qkv smoothing disabled")
    if not has_post_attn_ln:
        reason_parts.append("post_attention_layernorm not found")
    if not has_gate or not has_up:
        reason_parts.append("gate_proj/up_proj pair incomplete")
    if not reason_parts:
        reason_parts.append("EXAONE policy: qkv smoothing disabled, mlp-only smoothing enabled")

    diag = {
        "module_counts": module_counts,
        "module_samples": module_samples,
        "reason": "; ".join(reason_parts),
    }

    return mappings, mode, diag


def normalize_sq_mappings(mappings):
    normalized = []
    for m in mappings:
        if isinstance(m, dict):
            normalized.append([m["balance_layers"], m["smooth_layers"]])
        else:
            normalized.append(m)
    return normalized


module_index = collect_modules(model)
SQ_MAPPINGS, SQ_MODE, SQ_DIAG = build_exaone_sq_mappings(module_index)
SQ_MAPPINGS = normalize_sq_mappings(SQ_MAPPINGS)
SQ_ENABLED = len(SQ_MAPPINGS) > 0

print("[SQ] EXAONE mapping diagnostics")
for key in ["q_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "input_layernorm", "post_attention_layernorm"]:
    print(f"[SQ] {key}: count={SQ_DIAG['module_counts'][key]}, sample={SQ_DIAG['module_samples'][key]}")

print(f"[SQ] mode={SQ_MODE}, enabled={SQ_ENABLED}, mappings={len(SQ_MAPPINGS)}")
print(f"[SQ] reason={SQ_DIAG['reason']}")
if SQ_ENABLED:
    print(f"[SQ] mappings={SQ_MAPPINGS}")


In [6]:
print("[INFO] 캘리브레이션 데이터 로드 중...")

ds = load_dataset(
    DATASET_ID,
    split=DATASET_SPLIT,
)
ds = ds.shuffle(seed=42).select(range(min(NUM_CALIBRATION_SAMPLES, len(ds))))

def preprocess(example):
    return {
        "text": tokenizer.apply_chat_template(
            example["conversations"],
            add_generation_prompt=True,
            tokenize=False)
    }

ds = ds.map(preprocess)

print("[INFO] 데이터 전처리 완료")

[INFO] 캘리브레이션 데이터 로드 중...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


README.md: 0.00B [00:00, ?B/s]

data/train.parquet:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000000 [00:00<?, ? examples/s]

Map:   0%|          | 0/512 [00:00<?, ? examples/s]

[INFO] 데이터 전처리 완료


In [None]:
scheme_name = next(iter(SCHEME)) if isinstance(SCHEME, dict) else SCHEME
scheme_targets = SCHEME[scheme_name] if isinstance(SCHEME, dict) else TARGETS

def build_gptq_modifier():
    return GPTQModifier(
        scheme=scheme_name,
        targets=scheme_targets,
        ignore=IGNORE,
    )

print(
    f"[INFO] SmoothQuant + {scheme_name} 시작 (strength={SMOOTHING_STRENGTH}, "
    f"samples={NUM_CALIBRATION_SAMPLES}, max_len={MAX_SEQUENCE_LENGTH}, sq_mode={SQ_MODE})..."
)

applied_mode = "gptq_only_fallback"

sq_mappings = normalize_sq_mappings(SQ_MAPPINGS)
if SQ_ENABLED:
    print(f"[SQ] normalized mappings={sq_mappings}")

try:
    if SQ_ENABLED:
        recipe = [
            SmoothQuantModifier(
                smoothing_strength=SMOOTHING_STRENGTH,
                mappings=sq_mappings,
            ),
            build_gptq_modifier(),
        ]
        oneshot(
            model=model,
            dataset=ds,
            recipe=recipe,
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        )
        applied_mode = f"smoothquant_{SQ_MODE}+gptq"
    else:
        print("[WARN] SmoothQuant 비활성 상태라 GPTQ-only로 실행합니다.")
        print(f"[WARN] reason: {SQ_DIAG['reason']}")
        recipe = [build_gptq_modifier()]
        oneshot(
            model=model,
            dataset=ds,
            recipe=recipe,
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        )
except (ValueError, RuntimeError) as e:
    print(f"[WARN] SmoothQuant 경로 실패: {type(e).__name__}: {e}")
    print("[WARN] GPTQ-only 폴백으로 재실행합니다.")
    recipe = [build_gptq_modifier()]
    oneshot(
        model=model,
        dataset=ds,
        recipe=recipe,
        max_seq_length=MAX_SEQUENCE_LENGTH,
        num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    )

print(f"[INFO] SmoothQuant + {scheme_name} 완료 (applied: {applied_mode})")


In [None]:
os.makedirs(OUT_DIR, exist_ok=True)

model.save_pretrained(OUT_DIR, save_compressed=True)
tokenizer.save_pretrained(OUT_DIR)

print(f"[INFO] 모델 저장 완료: {OUT_DIR}")


In [None]:
zip_name = f"/content/drive/MyDrive/LGAimers/submit/sqmodel{sq_tag}"
zip_path = Path(zip_name)

zip_path.parent.mkdir(parents=True, exist_ok=True)  # 경로 없으면 생성

from tempfile import TemporaryDirectory
with TemporaryDirectory() as tmpdir:
    tmp_root = Path(tmpdir)
    model_dir = tmp_root / "model"
    shutil.copytree(OUT_DIR, model_dir)
    shutil.make_archive(str(zip_path), "zip", root_dir=tmp_root, base_dir="model")

print(f"[INFO] 생성 완료: {zip_name}.zip")
