In [1]:
%pip install llmcompressor
%pip install -U llmcompressor transformers

Collecting llmcompressor
  Downloading llmcompressor-0.9.0.1-py3-none-any.whl.metadata (12 kB)
Collecting loguru<=0.7.3,>=0.7.2 (from llmcompressor)
  Downloading loguru-0.7.3-py3-none-any.whl.metadata (22 kB)
Collecting tqdm<=4.67.1,>=4.66.3 (from llmcompressor)
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<=4.57.3,>=4.54.0 (from llmcompressor)
  Downloading transformers-4.57.3-py3-none-any.whl.metadata (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-ml-py<=13.590.44,>=12.560.30 (from llmcompressor)
  Downloading nvidia_ml_py-13.590.44-py3-none-any.whl.metadata (9.8 kB)
Collecting compressed-tensors==0.13.0 (from llmcompressor)
  Downloading compressed_tensors-0.13.0-py3-none-any.whl.metadata (7.0 kB)
Collecting huggingface_hub>=0.21

In [2]:
import os
import json
import re
import hashlib
import torch
import shutil
from pathlib import Path
from datetime import datetime, timezone

from google.colab import drive
drive.mount('/content/drive')

from datasets import Dataset, concatenate_datasets, load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import GPTQModifier


Mounted at /content/drive


In [3]:
# Local checkpoint directory (on the mounted Drive) passed to `from_pretrained`
# for both the tokenizer and the model.
MODEL_ID = "/content/drive/MyDrive/LGAimers/base_model"

# Total number of calibration conversations; must equal the sum of the
# per-source `n_samples` below (asserted further down) and is passed to `oneshot`.
NUM_CALIBRATION_SAMPLES = 1024
# Passed to `oneshot` as `max_seq_length`.
MAX_SEQUENCE_LENGTH = 512
# Seed used for every dataset shuffle so the calibration mix is reproducible.
CALIBRATION_SEED = 42

# One spec per calibration source:
#   id        - dataset identifier given to `load_dataset`
#   split     - split given to `load_dataset`
#   n_samples - number of normalized conversations to take from this source
#   format    - selects the normalizer in `normalize_example`
#               ("manta", "instruction" or "oig_human_bot")
#   priority  - load order (ascending); earlier sources claim conversation
#               hashes first, so later sources skip any duplicates
CALIB_DATASETS = [
    {
        "id": "LGAI-EXAONE/MANTA-1M",
        "split": "train",
        "n_samples": 512,
        "format": "manta",
        "priority": 1,
    },
    {
        "id": "nlpai-lab/kullm-v2",
        "split": "train",
        "n_samples": 256,
        "format": "instruction",
        "priority": 2,
    },
    {
        "id": "heegyu/OIG-small-chip2-ko",
        "split": "train",
        "n_samples": 192,
        "format": "oig_human_bot",
        "priority": 3,
    },
    {
        "id": "beomi/KoAlpaca-v1.1a",
        "split": "train",
        "n_samples": 64,
        "format": "instruction",
        "priority": 4,
    },
]

# Dataset ids that must never appear in CALIB_DATASETS (asserted below).
CALIBRATION_BENCHMARK_EXCLUDE = {
    "LGAI-EXAONE/KoMT-Bench",
    "LGAI-EXAONE/KMMLU-Redux",
    "LGAI-EXAONE/KMMLU-Pro",
}
# Substitute source used by `build_calibration_dataset` when loading
# "beomi/KoAlpaca-v1.1a" raises.
KOALPACA_FALLBACK_ID = "nlpai-lab/kullm-v2"

# Message (Korean): the CALIB_DATASETS sample total must match NUM_CALIBRATION_SAMPLES.
assert sum(spec["n_samples"] for spec in CALIB_DATASETS) == NUM_CALIBRATION_SAMPLES, (
    "CALIB_DATASETS 샘플 합계와 NUM_CALIBRATION_SAMPLES가 일치해야 합니다."
)
# Message (Korean): evaluation benchmark datasets must be excluded from calibration.
assert not any(spec["id"] in CALIBRATION_BENCHMARK_EXCLUDE for spec in CALIB_DATASETS), (
    "평가용 벤치마크 데이터셋은 calibration에서 제외해야 합니다."
)

# Quantization schemes tried in this order by the quantization cell; the
# fallback is only attempted when the primary attempt raises.
PRIMARY_SCHEME = "W4A16"
FALLBACK_SCHEME = "W8A8"

# Module-name patterns handed to GPTQModifier as `targets`. The "re:" prefix
# marks a regular expression (`collect_target_modules` strips it before matching).
ATTN_TARGETS = [
    "re:.*self_attn\\.q_proj",
    "re:.*self_attn\\.k_proj",
    "re:.*self_attn\\.v_proj",
    "re:.*self_attn\\.o_proj",
]
# Handed to GPTQModifier as `ignore`.
IGNORE = ["embed_tokens", "lm_head"]

# NOTE(review): not referenced in the cells visible here; the directory name
# hard-codes "w4a16" even though FALLBACK_SCHEME may end up being the applied
# scheme — confirm how downstream cells use it.
OUT_DIR = "/content/drive/MyDrive/LGAimers/sq_w4a16_attn_calmix1024"


In [4]:
print("[INFO] 모델 로드 중...")

# Tokenizer and model are both read from the same local checkpoint directory.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_ID,
    trust_remote_code=True,
)

# `dtype` is the current spelling of this keyword: the installed transformers
# release emits "`torch_dtype` is deprecated! Use `dtype` instead!" otherwise.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,
)

print("[INFO] 모델/토크나이저 로드 완료")

[INFO] 모델 로드 중...


`torch_dtype` is deprecated! Use `dtype` instead!


[INFO] 모델/토크나이저 로드 완료


In [5]:
import re


def collect_target_modules(model, target_patterns, expected_total=120):
    """Resolve target patterns against the model's module names and validate them.

    Args:
        model: Object exposing ``named_modules()`` that yields ``(name, module)`` pairs.
        target_patterns: Iterable of pattern strings. A pattern starting with
            ``"re:"`` is treated as a regular expression (full match against the
            module name); any other pattern must equal the module name exactly.
        expected_total: Required number of unique matched modules. Defaults to
            120 (30 layers x 4 projections); pass ``None`` to skip the count check.

    Returns:
        Sorted list of the unique module names matched by any pattern.

    Raises:
        AssertionError: If a pattern matches nothing, if any matched name
            contains ``".mlp."``, if a ``model.layers.<n>`` layer does not have
            exactly the four attention projections, or if the total count
            differs from ``expected_total``.
    """
    all_module_names = [name for name, _ in model.named_modules()]

    matched_by_pattern = {}
    for pattern in target_patterns:
        if pattern.startswith("re:"):
            regex = pattern[len("re:"):]
        else:
            regex = re.escape(pattern)
        compiled = re.compile(regex)
        matches = [name for name in all_module_names if compiled.fullmatch(name)]
        matched_by_pattern[pattern] = matches
        print(f"[TARGET] pattern={pattern} matches={len(matches)} sample={matches[:3]}")

    unmatched_patterns = [p for p, m in matched_by_pattern.items() if not m]
    if unmatched_patterns:
        raise AssertionError(f"No module matched for patterns: {unmatched_patterns}")

    unique_matches = sorted({m for matches in matched_by_pattern.values() for m in matches})
    mlp_overlap = [name for name in unique_matches if ".mlp." in name]
    if mlp_overlap:
        raise AssertionError(f"MLP modules were matched unexpectedly: {mlp_overlap[:10]}")

    expected_suffixes = {
        "self_attn.q_proj",
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.o_proj",
    }
    per_layer = {}
    not_layer_scoped = []
    for name in unique_matches:
        parts = name.split(".")
        if len(parts) < 5 or parts[0] != "model" or parts[1] != "layers" or not parts[2].isdigit():
            # Names outside the "model.layers.<n>.*" layout cannot be grouped
            # by layer; remember them so the omission is reported below.
            not_layer_scoped.append(name)
            continue
        layer_id = int(parts[2])
        suffix = ".".join(parts[3:])
        per_layer.setdefault(layer_id, set()).add(suffix)

    if not_layer_scoped:
        print(
            f"[TARGET] WARNING: {len(not_layer_scoped)} matched modules are outside "
            f"'model.layers.<n>.*' and were not validated per layer: sample={not_layer_scoped[:3]}"
        )

    missing_per_layer = {
        layer_id: sorted(expected_suffixes - suffixes)
        for layer_id, suffixes in per_layer.items()
        if suffixes != expected_suffixes
    }
    if missing_per_layer:
        raise AssertionError(f"Incomplete attention targets per layer: {missing_per_layer}")

    if expected_total is not None and len(unique_matches) != expected_total:
        n_proj = len(expected_suffixes)
        n_layers, remainder = divmod(expected_total, n_proj)
        # Only spell out the layer breakdown when it is exact.
        detail = f" ({n_layers} layers x {n_proj} projections)" if remainder == 0 else ""
        raise AssertionError(
            f"Expected {expected_total} attention modules{detail}, got {len(unique_matches)}"
        )

    layer_ids = sorted(per_layer)
    print(f"[TARGET] total_unique_matches={len(unique_matches)}")
    if layer_ids:
        print(f"[TARGET] layer_count={len(layer_ids)}, layer_range=({layer_ids[0]}, {layer_ids[-1]})")
    else:
        # Avoid indexing an empty list when no match followed the layer layout.
        print("[TARGET] layer_count=0, layer_range=None")

    return unique_matches


# Validate ATTN_TARGETS against the loaded model before quantizing; the call
# raises AssertionError when the patterns do not resolve to the expected
# attention projections, otherwise the sorted matched names are kept.
ATTN_MATCHED_MODULES = collect_target_modules(model, ATTN_TARGETS)
print("[INFO] attention-only target validation complete")


[TARGET] pattern=re:.*self_attn\.q_proj matches=30 sample=['model.layers.0.self_attn.q_proj', 'model.layers.1.self_attn.q_proj', 'model.layers.2.self_attn.q_proj']
[TARGET] pattern=re:.*self_attn\.k_proj matches=30 sample=['model.layers.0.self_attn.k_proj', 'model.layers.1.self_attn.k_proj', 'model.layers.2.self_attn.k_proj']
[TARGET] pattern=re:.*self_attn\.v_proj matches=30 sample=['model.layers.0.self_attn.v_proj', 'model.layers.1.self_attn.v_proj', 'model.layers.2.self_attn.v_proj']
[TARGET] pattern=re:.*self_attn\.o_proj matches=30 sample=['model.layers.0.self_attn.o_proj', 'model.layers.1.self_attn.o_proj', 'model.layers.2.self_attn.o_proj']
[TARGET] total_unique_matches=120
[TARGET] layer_count=30, layer_range=(0, 29)
[INFO] attention-only target validation complete


In [7]:
# Progress marker for the calibration-data cell (Korean: "Loading calibration data...").
print("[INFO] 캘리브레이션 데이터 로드 중...")

def _clean_text(value):
    if value is None:
        return ""
    return str(value).strip()

def _normalize_manta_conversations(conversations):
    """Clean a list of role/content turns into the canonical conversation shape.

    Non-dict entries, turns whose role is not system/user/assistant, and turns
    with empty content are discarded. Roles are lower-cased and both fields are
    whitespace-trimmed.

    Returns:
        The cleaned list of ``{"role", "content"}`` dicts, or ``None`` when the
        input is not a list or the cleaned turns lack a user or an assistant turn.
    """
    if not isinstance(conversations, list):
        return None

    allowed_roles = {"system", "user", "assistant"}
    turns = []
    for raw_turn in conversations:
        if isinstance(raw_turn, dict):
            speaker = _clean_text(raw_turn.get("role")).lower()
            message = _clean_text(raw_turn.get("content"))
            if message and speaker in allowed_roles:
                turns.append({"role": speaker, "content": message})

    # A usable sample needs at least one turn from each side of the dialogue.
    roles_present = {turn["role"] for turn in turns}
    if "user" in roles_present and "assistant" in roles_present:
        return turns
    return None

def _normalize_instruction_turn(example):
    """Turn an instruction/input/output row into one user/assistant exchange.

    The user message is the instruction followed by the input, separated by a
    blank line; whichever of the two is empty is simply left out.

    Returns:
        A two-turn conversation list, or ``None`` when there is no user text
        or no output text.
    """
    instruction = _clean_text(example.get("instruction"))
    input_text = _clean_text(example.get("input"))
    output_text = _clean_text(example.get("output"))

    # Joining only the non-empty pieces covers all four presence combinations.
    prompt = "\n\n".join(piece for piece in (instruction, input_text) if piece)

    if prompt and output_text:
        return [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": output_text},
        ]
    return None

def _normalize_oig_pair(example):
    """Convert an OIG-style row into a single user/assistant exchange.

    Column-based rows are preferred: ``user_translated``/``chip2_translated``
    are used when non-empty, otherwise ``user``/``chip2``. When those do not
    yield both sides, the first ``<human> ... <bot> ...`` pair found in the
    ``text`` column is parsed instead.

    Returns:
        A two-turn conversation list, or ``None`` when neither path produces
        both a user and an assistant message.
    """
    # heegyu/OIG-small-chip2-ko is based on user/chip2 or *_translated columns.
    user_content = _clean_text(example.get("user_translated")) or _clean_text(example.get("user"))
    assistant_content = _clean_text(example.get("chip2_translated")) or _clean_text(example.get("chip2"))
    if user_content and assistant_content:
        return [
            {"role": "user", "content": user_content},
            {"role": "assistant", "content": assistant_content},
        ]

    # Compatibility with older/derived sets: parse <human>/<bot> tags in a text column.
    raw = _clean_text(example.get("text"))
    if not raw:
        return None

    # `:?` accepts the "<human>: ... <bot>: ..." spelling so the separator colon
    # is consumed with the tag instead of leaking into the captured content.
    pattern = re.compile(r"<human>:?\s*(.*?)\s*<bot>:?\s*(.*?)(?=<human>|$)", re.IGNORECASE | re.DOTALL)
    match = pattern.search(raw)
    if not match:
        return None

    user_content = _clean_text(match.group(1))
    assistant_content = _clean_text(match.group(2))
    if not user_content or not assistant_content:
        return None

    return [
        {"role": "user", "content": user_content},
        {"role": "assistant", "content": assistant_content},
    ]

def normalize_example(example, source_format):
    """Normalize one raw dataset row according to its source format.

    Args:
        example: Raw row (mapping) from the source dataset.
        source_format: One of ``"manta"``, ``"instruction"`` or ``"oig_human_bot"``.

    Returns:
        ``{"conversations": [...]}`` on success, or ``None`` when the row
        cannot be normalized.

    Raises:
        ValueError: If ``source_format`` is not one of the supported formats.
    """
    # Lambdas defer the helper lookups until the matching format is actually used.
    normalizers = {
        "manta": lambda row: _normalize_manta_conversations(row.get("conversations")),
        "instruction": lambda row: _normalize_instruction_turn(row),
        "oig_human_bot": lambda row: _normalize_oig_pair(row),
    }

    normalizer = normalizers.get(source_format)
    if normalizer is None:
        raise ValueError(f"지원하지 않는 source format: {source_format}")

    turns = normalizer(example)
    return None if turns is None else {"conversations": turns}

def _conversation_hash(record):
    payload = json.dumps(record["conversations"], ensure_ascii=False, sort_keys=True)
    return hashlib.sha1(payload.encode("utf-8")).hexdigest()

def _load_single_spec(spec, seed, global_seen_hashes):
    """Load one source dataset and pick ``n_samples`` unique normalized conversations.

    The split is shuffled with ``seed`` and scanned in order. Rows that fail
    normalization are counted as dropped; rows whose conversation hash is
    already in ``global_seen_hashes`` are counted as duplicates. Scanning stops
    as soon as the target count is reached.

    Args:
        spec: Mapping with ``id``, ``split``, ``n_samples`` and ``format`` keys.
        seed: Shuffle seed.
        global_seen_hashes: Set of conversation hashes shared across sources;
            updated in place with every selected conversation.

    Returns:
        ``(dataset, stats)`` where ``dataset`` holds the selected records and
        ``stats`` summarizes the scan for this source.

    Raises:
        RuntimeError: If the source is exhausted before the target is reached.
    """
    dataset_id, split = spec["id"], spec["split"]
    target_n, source_format = spec["n_samples"], spec["format"]

    shuffled = load_dataset(dataset_id, split=split).shuffle(seed=seed)
    dataset_columns = list(shuffled.features.keys())

    selected = []
    scanned = dropped = duplicate = 0

    for scanned, example in enumerate(shuffled, start=1):
        normalized = normalize_example(example, source_format)
        if normalized is None:
            dropped += 1
        else:
            digest = _conversation_hash(normalized)
            if digest in global_seen_hashes:
                duplicate += 1
            else:
                global_seen_hashes.add(digest)
                selected.append(normalized)
                if len(selected) >= target_n:
                    break

    selected_n = len(selected)
    if selected_n < target_n:
        raise RuntimeError(
            f"{dataset_id}에서 목표 샘플({target_n})을 채우지 못했습니다. "
            f"selected={selected_n}, scanned={scanned}, dropped={dropped}, duplicate={duplicate}, "
            f"columns={dataset_columns}"
        )

    # Key order is kept stable because the stats dict is printed verbatim.
    stats = dict(
        id=dataset_id,
        split=split,
        format=source_format,
        target_n=target_n,
        selected_n=selected_n,
        scanned_n=scanned,
        dropped_n=dropped,
        duplicate_n=duplicate,
        columns=dataset_columns,
    )
    return Dataset.from_list(selected), stats

def build_calibration_dataset(specs, seed):
    """Assemble the mixed calibration dataset from all source specs.

    Sources are loaded in ascending ``priority`` and share one set of
    conversation hashes, so a later source never repeats a conversation taken
    from an earlier one. If loading "beomi/KoAlpaca-v1.1a" raises, the same
    quota is drawn from ``KOALPACA_FALLBACK_ID`` instead; failures of any other
    source propagate. The per-source subsets are concatenated and shuffled.

    Args:
        specs: Non-empty list of source specs.
        seed: Seed used for every shuffle.

    Returns:
        ``(mixed_dataset, stats_list)`` with one stats dict per source.

    Raises:
        ValueError: If ``specs`` is empty.
        RuntimeError: If the final size differs from ``NUM_CALIBRATION_SAMPLES``
            or a source selected too few samples.
    """
    if not specs:
        raise ValueError("CALIB_DATASETS가 비어 있습니다.")

    seen_hashes = set()
    subsets = []
    all_stats = []

    for spec in sorted(specs, key=lambda item: item["priority"]):
        try:
            subset, stats = _load_single_spec(spec, seed=seed, global_seen_hashes=seen_hashes)
        except Exception as exc:
            # Only the KoAlpaca source has a designated substitute.
            if spec["id"] != "beomi/KoAlpaca-v1.1a":
                raise
            print(f"[WARN] {spec['id']} 로드 실패 -> {KOALPACA_FALLBACK_ID}로 대체: {type(exc).__name__}: {exc}")
            fallback_spec = {
                **spec,
                "id": KOALPACA_FALLBACK_ID,
                "split": "train",
                "format": "instruction",
            }
            subset, stats = _load_single_spec(fallback_spec, seed=seed, global_seen_hashes=seen_hashes)
            stats["fallback_from"] = spec["id"]
        subsets.append(subset)
        all_stats.append(stats)

    mixed = concatenate_datasets(subsets).shuffle(seed=seed)

    actual_n = len(mixed)
    if actual_n != NUM_CALIBRATION_SAMPLES:
        raise RuntimeError(
            f"최종 calibration 샘플 수가 다릅니다: expected={NUM_CALIBRATION_SAMPLES}, actual={actual_n}"
        )

    for stats in all_stats:
        minimum_ok = min(5, stats["target_n"])
        if stats["selected_n"] < minimum_ok:
            raise RuntimeError(f"정규화 성공 샘플이 부족합니다: {stats}")

    return mixed, all_stats

def _contains_korean(text):
    return any("가" <= ch <= "힣" for ch in text)

def _compute_ko_char_ratio(records):
    total_chars = 0
    ko_chars = 0
    for record in records:
        for turn in record["conversations"]:
            content = turn["content"]
            total_chars += len(content)
            ko_chars += sum(1 for ch in content if "가" <= ch <= "힣")
    if total_chars == 0:
        return 0.0
    return ko_chars / total_chars

# Build the mixed calibration set; one stats dict per source is returned alongside it.
ds, calibration_source_stats = build_calibration_dataset(CALIB_DATASETS, seed=CALIBRATION_SEED)

# Materialize the rows once so both ratio computations read the same list.
records = list(ds)
# Share of all turn characters that fall in the "가".."힣" range.
ko_char_ratio = _compute_ko_char_ratio(records)
# Share of records containing at least one such character. ensure_ascii=False
# keeps Korean text unescaped so the character-range check can see it.
ko_record_ratio = (
    sum(1 for item in records if _contains_korean(json.dumps(item["conversations"], ensure_ascii=False)))
    / max(len(records), 1)
)

# Report (messages are Korean): mixed sample count, Korean character ratio,
# ratio of samples containing Korean, then the per-source statistics.
print(f"[INFO] 혼합 캘리브레이션 샘플 수: {len(records)}")
print(f"[INFO] 한글 문자 비율: {ko_char_ratio:.4f}")
print(f"[INFO] 한글 포함 샘플 비율: {ko_record_ratio:.4f}")
print("[INFO] 소스별 통계:")
for stats in calibration_source_stats:
    print(f"  - {stats}")

def preprocess(example):
    """Render one record's conversation into a single ``text`` string.

    Uses the module-level ``tokenizer``'s chat template without tokenizing.

    Returns:
        ``{"text": rendered_string}``.
    """
    # NOTE(review): add_generation_prompt=True is requested even though every
    # normalized conversation already contains an assistant turn — confirm this
    # is the intended shape for calibration text.
    rendered = tokenizer.apply_chat_template(
        example["conversations"],
        add_generation_prompt=True,
        tokenize=False,
    )
    return {"text": rendered}

# Add the chat-template-rendered "text" column to every calibration row.
ds = ds.map(preprocess)

# Korean: "Data preprocessing complete".
print("[INFO] 데이터 전처리 완료")

[INFO] 캘리브레이션 데이터 로드 중...


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001-21df739eb88d71(…):   0%|          | 0.00/12.9M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/21155 [00:00<?, ? examples/s]

[INFO] 혼합 캘리브레이션 샘플 수: 1024
[INFO] 한글 문자 비율: 0.0666
[INFO] 한글 포함 샘플 비율: 0.4980
[INFO] 소스별 통계:
  - {'id': 'LGAI-EXAONE/MANTA-1M', 'split': 'train', 'format': 'manta', 'target_n': 512, 'selected_n': 512, 'scanned_n': 512, 'dropped_n': 0, 'duplicate_n': 0, 'columns': ['id', 'conversations', 'complexity_label']}
  - {'id': 'nlpai-lab/kullm-v2', 'split': 'train', 'format': 'instruction', 'target_n': 256, 'selected_n': 256, 'scanned_n': 257, 'dropped_n': 1, 'duplicate_n': 0, 'columns': ['id', 'instruction', 'input', 'output']}
  - {'id': 'heegyu/OIG-small-chip2-ko', 'split': 'train', 'format': 'oig_human_bot', 'target_n': 192, 'selected_n': 192, 'scanned_n': 192, 'dropped_n': 0, 'duplicate_n': 0, 'columns': ['user', 'chip2', 'index', 'user_translated', 'chip2_translated']}
  - {'id': 'beomi/KoAlpaca-v1.1a', 'split': 'train', 'format': 'instruction', 'target_n': 64, 'selected_n': 64, 'scanned_n': 64, 'dropped_n': 0, 'duplicate_n': 0, 'columns': ['instruction', 'output', 'url']}


Map:   0%|          | 0/1024 [00:00<?, ? examples/s]

[INFO] 데이터 전처리 완료


In [8]:
def build_gptq_modifier(scheme_name):
    """Create a GPTQModifier for ``scheme_name`` using ATTN_TARGETS and IGNORE."""
    modifier_kwargs = {
        "scheme": scheme_name,
        "targets": ATTN_TARGETS,
        "ignore": IGNORE,
    }
    return GPTQModifier(**modifier_kwargs)


# Schemes are attempted in order; the fallback is added only when it is set
# and differs from the primary scheme.
candidate_schemes = [PRIMARY_SCHEME]
if FALLBACK_SCHEME and FALLBACK_SCHEME not in candidate_schemes:
    candidate_schemes.append(FALLBACK_SCHEME)

applied_scheme = None
applied_mode = None
last_error = None

for idx, scheme_name in enumerate(candidate_schemes, start=1):
    if idx > 1:
        # `oneshot` works on the `model` object it is given, so an attempt that
        # raised part-way may have left some modules already quantized. Start
        # the retry from a clean copy of the base checkpoint instead of
        # stacking a second scheme on top of a half-processed model.
        print(f"[INFO] reloading base model before retry (scheme={scheme_name})")
        model = AutoModelForCausalLM.from_pretrained(
            MODEL_ID,
            dtype=torch.bfloat16,
        )

    print(
        f"[INFO] GPTQ(attention-only) start ({idx}/{len(candidate_schemes)}, "
        f"scheme={scheme_name}, samples={NUM_CALIBRATION_SAMPLES}, max_len={MAX_SEQUENCE_LENGTH})"
    )

    try:
        recipe = [build_gptq_modifier(scheme_name)]
        oneshot(
            model=model,
            dataset=ds,
            recipe=recipe,
            max_seq_length=MAX_SEQUENCE_LENGTH,
            num_calibration_samples=NUM_CALIBRATION_SAMPLES,
        )
        applied_scheme = scheme_name
        applied_mode = f"gptq_attn_only_{scheme_name.lower()}"
        print(f"[INFO] GPTQ(attention-only) success (scheme={scheme_name})")
        break
    except (ValueError, RuntimeError) as e:
        # Remember the failure and move on to the next candidate scheme.
        last_error = e
        print(f"[WARN] GPTQ(attention-only) failed (scheme={scheme_name}): {type(e).__name__}: {e}")

if applied_scheme is None:
    # Chain the last failure so its traceback is not lost.
    raise RuntimeError(
        f"All quantization schemes failed: {candidate_schemes}. Last error: {last_error}"
    ) from last_error

print(f"[INFO] quantization complete (applied_mode={applied_mode}, applied_scheme={applied_scheme})")


[INFO] GPTQ(attention-only) start (1/2, scheme=W4A16, samples=1024, max_len=512)


Tokenizing:   0%|          | 0/1024 [00:00<?, ? examples/s]

2026-02-13T18:30:50.224497+0000 | reset | INFO - Compression lifecycle reset
2026-02-13T18:30:50.227398+0000 | from_modifiers | INFO - Creating recipe from modifiers
2026-02-13T18:30:50.266046+0000 | initialize | INFO - Compression lifecycle initialized for 1 modifiers
2026-02-13T18:30:50.266996+0000 | IndependentPipeline | INFO - Inferred `SequentialPipeline` for `GPTQModifier`


Preparing cache: 100%|██████████| 1024/1024 [00:00<00:00, 1201.20it/s]
(1/31): Calibrating: 100%|██████████| 1024/1024 [00:06<00:00, 159.93it/s]

2026-02-13T18:31:04.371620+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.q_proj using 1024 samples





2026-02-13T18:31:06.160079+0000 | compress | METRIC - time 1.79s
2026-02-13T18:31:06.160967+0000 | compress | METRIC - error 0.84
2026-02-13T18:31:06.162266+0000 | compress | METRIC - GPU 0 | usage: 2.39% | total memory: 85 GB
2026-02-13T18:31:06.162881+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:31:06.163976+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.k_proj using 1024 samples
2026-02-13T18:31:07.345169+0000 | compress | METRIC - time 1.18s
2026-02-13T18:31:07.346274+0000 | compress | METRIC - error 0.24
2026-02-13T18:31:07.347031+0000 | compress | METRIC - GPU 0 | usage: 2.39% | total memory: 85 GB
2026-02-13T18:31:07.347584+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:31:07.348904+0000 | compress_modules | INFO - Quantizing model.layers.0.self_attn.v_proj using 1024 samples
2026-02-13T18:31:08.536577+0000 | compress | METRIC - time 1.19s
2026-02-13T18:31:08.537842+0000 | compress | METRIC - err

(1/31): Propagating: 100%|██████████| 1024/1024 [00:09<00:00, 102.60it/s]
(2/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 299.87it/s]

2026-02-13T18:31:23.465771+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.q_proj using 1024 samples





2026-02-13T18:31:24.657222+0000 | compress | METRIC - time 1.19s
2026-02-13T18:31:24.658372+0000 | compress | METRIC - error 3.98
2026-02-13T18:31:24.659421+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:31:24.660177+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:31:24.661447+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.k_proj using 1024 samples
2026-02-13T18:31:25.839330+0000 | compress | METRIC - time 1.18s
2026-02-13T18:31:25.840508+0000 | compress | METRIC - error 1.14
2026-02-13T18:31:25.841415+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:31:25.841928+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:31:25.842892+0000 | compress_modules | INFO - Quantizing model.layers.1.self_attn.v_proj using 1024 samples
2026-02-13T18:31:27.045874+0000 | compress | METRIC - time 1.20s
2026-02-13T18:31:27.047154+0000 | compress | METRIC - err

(2/31): Propagating: 100%|██████████| 1024/1024 [00:04<00:00, 250.13it/s]
(3/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 297.06it/s]

2026-02-13T18:31:35.865422+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.q_proj using 1024 samples





2026-02-13T18:31:37.070686+0000 | compress | METRIC - time 1.20s
2026-02-13T18:31:37.072222+0000 | compress | METRIC - error 10.89
2026-02-13T18:31:37.072891+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:31:37.073458+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:31:37.074425+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.k_proj using 1024 samples
2026-02-13T18:31:38.257655+0000 | compress | METRIC - time 1.18s
2026-02-13T18:31:38.259075+0000 | compress | METRIC - error 3.08
2026-02-13T18:31:38.260093+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:31:38.260930+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:31:38.261824+0000 | compress_modules | INFO - Quantizing model.layers.2.self_attn.v_proj using 1024 samples
2026-02-13T18:31:39.451058+0000 | compress | METRIC - time 1.19s
2026-02-13T18:31:39.452511+0000 | compress | METRIC - er

(3/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 317.33it/s]
(4/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 297.06it/s]

2026-02-13T18:31:47.367097+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.q_proj using 1024 samples





2026-02-13T18:31:48.584173+0000 | compress | METRIC - time 1.21s
2026-02-13T18:31:48.585148+0000 | compress | METRIC - error 21.66
2026-02-13T18:31:48.586002+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:31:48.586538+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:31:48.587484+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.k_proj using 1024 samples
2026-02-13T18:31:49.758568+0000 | compress | METRIC - time 1.17s
2026-02-13T18:31:49.760156+0000 | compress | METRIC - error 6.15
2026-02-13T18:31:49.761274+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:31:49.761876+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:31:49.762926+0000 | compress_modules | INFO - Quantizing model.layers.3.self_attn.v_proj using 1024 samples
2026-02-13T18:31:50.951674+0000 | compress | METRIC - time 1.19s
2026-02-13T18:31:50.953250+0000 | compress | METRIC - er

(4/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 315.97it/s]
(5/31): Calibrating: 100%|██████████| 1024/1024 [00:04<00:00, 245.40it/s]

2026-02-13T18:31:59.634097+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.q_proj using 1024 samples





2026-02-13T18:32:00.846181+0000 | compress | METRIC - time 1.21s
2026-02-13T18:32:00.847127+0000 | compress | METRIC - error 41.24
2026-02-13T18:32:00.848068+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:00.848779+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:32:00.849779+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.k_proj using 1024 samples
2026-02-13T18:32:02.038431+0000 | compress | METRIC - time 1.19s
2026-02-13T18:32:02.040032+0000 | compress | METRIC - error 11.47
2026-02-13T18:32:02.041087+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:02.041742+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:32:02.042848+0000 | compress_modules | INFO - Quantizing model.layers.4.self_attn.v_proj using 1024 samples
2026-02-13T18:32:03.228021+0000 | compress | METRIC - time 1.18s
2026-02-13T18:32:03.229665+0000 | compress | METRIC - e

(5/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 313.26it/s]
(6/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 297.42it/s]

2026-02-13T18:32:11.204097+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.q_proj using 1024 samples





2026-02-13T18:32:12.417636+0000 | compress | METRIC - time 1.21s
2026-02-13T18:32:12.419425+0000 | compress | METRIC - error 66.28
2026-02-13T18:32:12.420310+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:12.420944+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:32:12.422003+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.k_proj using 1024 samples
2026-02-13T18:32:13.614943+0000 | compress | METRIC - time 1.19s
2026-02-13T18:32:13.616750+0000 | compress | METRIC - error 19.55
2026-02-13T18:32:13.617708+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:13.618384+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:32:13.619393+0000 | compress_modules | INFO - Quantizing model.layers.5.self_attn.v_proj using 1024 samples
2026-02-13T18:32:14.809124+0000 | compress | METRIC - time 1.19s
2026-02-13T18:32:14.810851+0000 | compress | METRIC - e

(6/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 311.84it/s]
(7/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 296.44it/s]

2026-02-13T18:32:22.793824+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.q_proj using 1024 samples





2026-02-13T18:32:23.996541+0000 | compress | METRIC - time 1.20s
2026-02-13T18:32:23.998401+0000 | compress | METRIC - error 99.12
2026-02-13T18:32:23.999212+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:23.999822+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:32:24.000907+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.k_proj using 1024 samples
2026-02-13T18:32:25.197582+0000 | compress | METRIC - time 1.20s
2026-02-13T18:32:25.199401+0000 | compress | METRIC - error 27.36
2026-02-13T18:32:25.200163+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:25.200756+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:32:25.201783+0000 | compress_modules | INFO - Quantizing model.layers.6.self_attn.v_proj using 1024 samples
2026-02-13T18:32:26.398016+0000 | compress | METRIC - time 1.20s
2026-02-13T18:32:26.399861+0000 | compress | METRIC - e

(7/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 314.12it/s]
(8/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 296.80it/s]

2026-02-13T18:32:34.350893+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.q_proj using 1024 samples





2026-02-13T18:32:35.542562+0000 | compress | METRIC - time 1.19s
2026-02-13T18:32:35.544486+0000 | compress | METRIC - error 150.90
2026-02-13T18:32:35.545263+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:35.545801+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:32:35.546687+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.k_proj using 1024 samples
2026-02-13T18:32:36.705302+0000 | compress | METRIC - time 1.16s
2026-02-13T18:32:36.707111+0000 | compress | METRIC - error 42.49
2026-02-13T18:32:36.708012+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:36.708696+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:32:36.709838+0000 | compress_modules | INFO - Quantizing model.layers.7.self_attn.v_proj using 1024 samples
2026-02-13T18:32:37.895301+0000 | compress | METRIC - time 1.18s
2026-02-13T18:32:37.897151+0000 | compress | METRIC - 

(8/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 311.66it/s]
(9/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 297.25it/s]

2026-02-13T18:32:45.879002+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.q_proj using 1024 samples





2026-02-13T18:32:47.078951+0000 | compress | METRIC - time 1.20s
2026-02-13T18:32:47.080806+0000 | compress | METRIC - error 166.53
2026-02-13T18:32:47.081656+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:47.082266+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:32:47.083395+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.k_proj using 1024 samples
2026-02-13T18:32:48.262435+0000 | compress | METRIC - time 1.18s
2026-02-13T18:32:48.264323+0000 | compress | METRIC - error 47.65
2026-02-13T18:32:48.265171+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:48.265858+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:32:48.266934+0000 | compress_modules | INFO - Quantizing model.layers.8.self_attn.v_proj using 1024 samples
2026-02-13T18:32:49.460058+0000 | compress | METRIC - time 1.19s
2026-02-13T18:32:49.461908+0000 | compress | METRIC - 

(9/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 310.25it/s]
(10/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 298.10it/s]

2026-02-13T18:32:57.444666+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.q_proj using 1024 samples





2026-02-13T18:32:58.632130+0000 | compress | METRIC - time 1.18s
2026-02-13T18:32:58.634002+0000 | compress | METRIC - error 224.07
2026-02-13T18:32:58.634952+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:58.635950+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:32:58.637081+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.k_proj using 1024 samples
2026-02-13T18:32:59.796333+0000 | compress | METRIC - time 1.16s
2026-02-13T18:32:59.798264+0000 | compress | METRIC - error 66.40
2026-02-13T18:32:59.799275+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:32:59.800166+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:32:59.801096+0000 | compress_modules | INFO - Quantizing model.layers.9.self_attn.v_proj using 1024 samples
2026-02-13T18:33:00.993031+0000 | compress | METRIC - time 1.19s
2026-02-13T18:33:00.994892+0000 | compress | METRIC - 

(10/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 317.51it/s]
(11/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 266.40it/s]

2026-02-13T18:33:09.293183+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.q_proj using 1024 samples





2026-02-13T18:33:10.498300+0000 | compress | METRIC - time 1.20s
2026-02-13T18:33:10.500253+0000 | compress | METRIC - error 244.42
2026-02-13T18:33:10.500993+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:10.501631+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:33:10.502947+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.k_proj using 1024 samples
2026-02-13T18:33:11.682395+0000 | compress | METRIC - time 1.18s
2026-02-13T18:33:11.684353+0000 | compress | METRIC - error 66.12
2026-02-13T18:33:11.685202+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:11.685857+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:33:11.686924+0000 | compress_modules | INFO - Quantizing model.layers.10.self_attn.v_proj using 1024 samples
2026-02-13T18:33:12.875343+0000 | compress | METRIC - time 1.19s
2026-02-13T18:33:12.877286+0000 | compress | METRIC 

(11/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 315.01it/s]
(12/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 295.11it/s]

2026-02-13T18:33:20.842161+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.q_proj using 1024 samples





2026-02-13T18:33:22.042353+0000 | compress | METRIC - time 1.20s
2026-02-13T18:33:22.044410+0000 | compress | METRIC - error 267.66
2026-02-13T18:33:22.045329+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:22.045879+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:33:22.046981+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.k_proj using 1024 samples
2026-02-13T18:33:23.222808+0000 | compress | METRIC - time 1.18s
2026-02-13T18:33:23.224855+0000 | compress | METRIC - error 75.95
2026-02-13T18:33:23.225792+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:23.226361+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:33:23.227336+0000 | compress_modules | INFO - Quantizing model.layers.11.self_attn.v_proj using 1024 samples
2026-02-13T18:33:24.422783+0000 | compress | METRIC - time 1.19s
2026-02-13T18:33:24.424773+0000 | compress | METRIC 

(12/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 311.87it/s]
(13/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 299.42it/s]

2026-02-13T18:33:32.395441+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.q_proj using 1024 samples





2026-02-13T18:33:33.606025+0000 | compress | METRIC - time 1.21s
2026-02-13T18:33:33.607950+0000 | compress | METRIC - error 298.17
2026-02-13T18:33:33.608790+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:33.609283+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:33:33.610311+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.k_proj using 1024 samples
2026-02-13T18:33:34.793831+0000 | compress | METRIC - time 1.18s
2026-02-13T18:33:34.795760+0000 | compress | METRIC - error 81.96
2026-02-13T18:33:34.796835+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:34.797546+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:33:34.798694+0000 | compress_modules | INFO - Quantizing model.layers.12.self_attn.v_proj using 1024 samples
2026-02-13T18:33:35.987026+0000 | compress | METRIC - time 1.19s
2026-02-13T18:33:35.988970+0000 | compress | METRIC 

(13/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 314.07it/s]
(14/31): Calibrating: 100%|██████████| 1024/1024 [00:04<00:00, 233.53it/s]

2026-02-13T18:33:44.887000+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.q_proj using 1024 samples





2026-02-13T18:33:46.098390+0000 | compress | METRIC - time 1.21s
2026-02-13T18:33:46.100399+0000 | compress | METRIC - error 334.42
2026-02-13T18:33:46.101193+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:46.101682+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:33:46.102791+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.k_proj using 1024 samples
2026-02-13T18:33:47.283023+0000 | compress | METRIC - time 1.18s
2026-02-13T18:33:47.284984+0000 | compress | METRIC - error 94.26
2026-02-13T18:33:47.285867+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:47.286432+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:33:47.287414+0000 | compress_modules | INFO - Quantizing model.layers.13.self_attn.v_proj using 1024 samples
2026-02-13T18:33:48.468659+0000 | compress | METRIC - time 1.18s
2026-02-13T18:33:48.470664+0000 | compress | METRIC 

(14/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 313.47it/s]
(15/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 292.50it/s]

2026-02-13T18:33:56.486254+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.q_proj using 1024 samples





2026-02-13T18:33:57.676118+0000 | compress | METRIC - time 1.19s
2026-02-13T18:33:57.678541+0000 | compress | METRIC - error 365.60
2026-02-13T18:33:57.679445+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:57.680076+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:33:57.681246+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.k_proj using 1024 samples
2026-02-13T18:33:58.884269+0000 | compress | METRIC - time 1.20s
2026-02-13T18:33:58.885569+0000 | compress | METRIC - error 111.05
2026-02-13T18:33:58.886345+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:33:58.886823+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:33:58.887706+0000 | compress_modules | INFO - Quantizing model.layers.14.self_attn.v_proj using 1024 samples
2026-02-13T18:34:00.080679+0000 | compress | METRIC - time 1.19s
2026-02-13T18:34:00.082814+0000 | compress | METRIC

(15/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 315.04it/s]
(16/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 298.19it/s]

2026-02-13T18:34:08.014792+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.q_proj using 1024 samples





2026-02-13T18:34:09.227788+0000 | compress | METRIC - time 1.21s
2026-02-13T18:34:09.229751+0000 | compress | METRIC - error 373.40
2026-02-13T18:34:09.230571+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:09.231166+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:34:09.232123+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.k_proj using 1024 samples
2026-02-13T18:34:10.410651+0000 | compress | METRIC - time 1.18s
2026-02-13T18:34:10.412668+0000 | compress | METRIC - error 106.04
2026-02-13T18:34:10.413487+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:10.413926+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:34:10.415074+0000 | compress_modules | INFO - Quantizing model.layers.15.self_attn.v_proj using 1024 samples
2026-02-13T18:34:11.610496+0000 | compress | METRIC - time 1.19s
2026-02-13T18:34:11.612508+0000 | compress | METRIC

(16/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 315.75it/s]
(17/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 293.69it/s]

2026-02-13T18:34:19.595372+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.q_proj using 1024 samples





2026-02-13T18:34:20.792199+0000 | compress | METRIC - time 1.19s
2026-02-13T18:34:20.794173+0000 | compress | METRIC - error 441.48
2026-02-13T18:34:20.795082+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:20.795701+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:34:20.796449+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.k_proj using 1024 samples
2026-02-13T18:34:21.997208+0000 | compress | METRIC - time 1.20s
2026-02-13T18:34:21.999202+0000 | compress | METRIC - error 116.54
2026-02-13T18:34:22.000042+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:22.000599+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:34:22.001507+0000 | compress_modules | INFO - Quantizing model.layers.16.self_attn.v_proj using 1024 samples
2026-02-13T18:34:23.191590+0000 | compress | METRIC - time 1.19s
2026-02-13T18:34:23.193591+0000 | compress | METRIC

(17/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 309.65it/s]
(18/31): Calibrating: 100%|██████████| 1024/1024 [00:04<00:00, 234.85it/s]

2026-02-13T18:34:32.098280+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.q_proj using 1024 samples





2026-02-13T18:34:33.306398+0000 | compress | METRIC - time 1.20s
2026-02-13T18:34:33.308465+0000 | compress | METRIC - error 448.93
2026-02-13T18:34:33.309249+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:33.309798+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:34:33.310760+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.k_proj using 1024 samples
2026-02-13T18:34:34.490062+0000 | compress | METRIC - time 1.18s
2026-02-13T18:34:34.492200+0000 | compress | METRIC - error 122.78
2026-02-13T18:34:34.493097+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:34.493786+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:34:34.494809+0000 | compress_modules | INFO - Quantizing model.layers.17.self_attn.v_proj using 1024 samples
2026-02-13T18:34:35.666678+0000 | compress | METRIC - time 1.17s
2026-02-13T18:34:35.668642+0000 | compress | METRIC

(18/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 312.00it/s]
(19/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 296.59it/s]

2026-02-13T18:34:43.651766+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.q_proj using 1024 samples





2026-02-13T18:34:44.852302+0000 | compress | METRIC - time 1.20s
2026-02-13T18:34:44.854306+0000 | compress | METRIC - error 491.02
2026-02-13T18:34:44.855078+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:44.855584+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:34:44.856454+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.k_proj using 1024 samples
2026-02-13T18:34:46.031190+0000 | compress | METRIC - time 1.17s
2026-02-13T18:34:46.033254+0000 | compress | METRIC - error 140.95
2026-02-13T18:34:46.034090+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:46.034813+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:34:46.035691+0000 | compress_modules | INFO - Quantizing model.layers.18.self_attn.v_proj using 1024 samples
2026-02-13T18:34:47.216486+0000 | compress | METRIC - time 1.18s
2026-02-13T18:34:47.218521+0000 | compress | METRIC

(19/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 313.45it/s]
(20/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 295.32it/s]

2026-02-13T18:34:55.210210+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.q_proj using 1024 samples





2026-02-13T18:34:56.441746+0000 | compress | METRIC - time 1.23s
2026-02-13T18:34:56.443784+0000 | compress | METRIC - error 492.01
2026-02-13T18:34:56.444637+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:56.445132+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:34:56.446278+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.k_proj using 1024 samples
2026-02-13T18:34:57.632673+0000 | compress | METRIC - time 1.19s
2026-02-13T18:34:57.634671+0000 | compress | METRIC - error 141.50
2026-02-13T18:34:57.635593+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:34:57.636196+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:34:57.637355+0000 | compress_modules | INFO - Quantizing model.layers.19.self_attn.v_proj using 1024 samples
2026-02-13T18:34:58.818555+0000 | compress | METRIC - time 1.18s
2026-02-13T18:34:58.820673+0000 | compress | METRIC

(20/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 313.78it/s]
(21/31): Calibrating: 100%|██████████| 1024/1024 [00:04<00:00, 235.99it/s]

2026-02-13T18:35:07.696204+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.q_proj using 1024 samples





2026-02-13T18:35:08.932251+0000 | compress | METRIC - time 1.23s
2026-02-13T18:35:08.934195+0000 | compress | METRIC - error 577.14
2026-02-13T18:35:08.935054+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:08.935641+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:35:08.936826+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.k_proj using 1024 samples
2026-02-13T18:35:10.121083+0000 | compress | METRIC - time 1.18s
2026-02-13T18:35:10.122981+0000 | compress | METRIC - error 155.42
2026-02-13T18:35:10.123769+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:10.124319+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:35:10.125507+0000 | compress_modules | INFO - Quantizing model.layers.20.self_attn.v_proj using 1024 samples
2026-02-13T18:35:11.315671+0000 | compress | METRIC - time 1.19s
2026-02-13T18:35:11.317662+0000 | compress | METRIC

(21/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 313.16it/s]
(22/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 295.19it/s]

2026-02-13T18:35:19.304988+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.q_proj using 1024 samples





2026-02-13T18:35:20.536271+0000 | compress | METRIC - time 1.23s
2026-02-13T18:35:20.538295+0000 | compress | METRIC - error 659.29
2026-02-13T18:35:20.539130+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:20.539715+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:35:20.540762+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.k_proj using 1024 samples
2026-02-13T18:35:21.717433+0000 | compress | METRIC - time 1.18s
2026-02-13T18:35:21.719465+0000 | compress | METRIC - error 178.43
2026-02-13T18:35:21.720376+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:21.720840+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:35:21.721784+0000 | compress_modules | INFO - Quantizing model.layers.21.self_attn.v_proj using 1024 samples
2026-02-13T18:35:22.910732+0000 | compress | METRIC - time 1.19s
2026-02-13T18:35:22.912731+0000 | compress | METRIC

(22/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 312.27it/s]
(23/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 296.36it/s]

2026-02-13T18:35:30.900735+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.q_proj using 1024 samples





2026-02-13T18:35:32.115868+0000 | compress | METRIC - time 1.21s
2026-02-13T18:35:32.117844+0000 | compress | METRIC - error 712.07
2026-02-13T18:35:32.118727+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:32.119332+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:35:32.120386+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.k_proj using 1024 samples
2026-02-13T18:35:33.324369+0000 | compress | METRIC - time 1.20s
2026-02-13T18:35:33.326421+0000 | compress | METRIC - error 203.20
2026-02-13T18:35:33.327315+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:33.327860+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:35:33.328795+0000 | compress_modules | INFO - Quantizing model.layers.22.self_attn.v_proj using 1024 samples
2026-02-13T18:35:34.522458+0000 | compress | METRIC - time 1.19s
2026-02-13T18:35:34.524634+0000 | compress | METRIC

(23/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 310.78it/s]
(24/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 296.19it/s]

2026-02-13T18:35:42.538091+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.q_proj using 1024 samples





2026-02-13T18:35:43.743796+0000 | compress | METRIC - time 1.20s
2026-02-13T18:35:43.745765+0000 | compress | METRIC - error 792.33
2026-02-13T18:35:43.746628+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:43.747154+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:35:43.748075+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.k_proj using 1024 samples
2026-02-13T18:35:44.937935+0000 | compress | METRIC - time 1.19s
2026-02-13T18:35:44.939874+0000 | compress | METRIC - error 237.50
2026-02-13T18:35:44.940669+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:44.941296+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:35:44.942238+0000 | compress_modules | INFO - Quantizing model.layers.23.self_attn.v_proj using 1024 samples
2026-02-13T18:35:46.124277+0000 | compress | METRIC - time 1.18s
2026-02-13T18:35:46.126251+0000 | compress | METRIC

(24/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 310.76it/s]
(25/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 266.92it/s]

2026-02-13T18:35:54.494890+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.q_proj using 1024 samples





2026-02-13T18:35:55.697035+0000 | compress | METRIC - time 1.20s
2026-02-13T18:35:55.698933+0000 | compress | METRIC - error 1145.97
2026-02-13T18:35:55.699796+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:55.700336+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:35:55.701438+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.k_proj using 1024 samples
2026-02-13T18:35:56.931038+0000 | compress | METRIC - time 1.23s
2026-02-13T18:35:56.933036+0000 | compress | METRIC - error 308.74
2026-02-13T18:35:56.933964+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:35:56.934555+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:35:56.935630+0000 | compress_modules | INFO - Quantizing model.layers.24.self_attn.v_proj using 1024 samples
2026-02-13T18:35:58.125072+0000 | compress | METRIC - time 1.19s
2026-02-13T18:35:58.127040+0000 | compress | METRI

(25/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 314.72it/s]
(26/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 299.79it/s]

2026-02-13T18:36:06.053283+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.q_proj using 1024 samples





2026-02-13T18:36:07.256581+0000 | compress | METRIC - time 1.20s
2026-02-13T18:36:07.258612+0000 | compress | METRIC - error 1324.48
2026-02-13T18:36:07.259416+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:07.260053+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:36:07.261349+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.k_proj using 1024 samples
2026-02-13T18:36:08.483202+0000 | compress | METRIC - time 1.22s
2026-02-13T18:36:08.485158+0000 | compress | METRIC - error 339.36
2026-02-13T18:36:08.486166+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:08.486800+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:36:08.488079+0000 | compress_modules | INFO - Quantizing model.layers.25.self_attn.v_proj using 1024 samples
2026-02-13T18:36:09.685930+0000 | compress | METRIC - time 1.20s
2026-02-13T18:36:09.687877+0000 | compress | METRI

(26/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 313.75it/s]
(27/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 301.84it/s]

2026-02-13T18:36:17.576053+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.q_proj using 1024 samples





2026-02-13T18:36:18.795288+0000 | compress | METRIC - time 1.21s
2026-02-13T18:36:18.797202+0000 | compress | METRIC - error 1571.71
2026-02-13T18:36:18.798012+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:18.798603+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:36:18.799825+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.k_proj using 1024 samples
2026-02-13T18:36:19.994497+0000 | compress | METRIC - time 1.19s
2026-02-13T18:36:19.996483+0000 | compress | METRIC - error 431.58
2026-02-13T18:36:19.997277+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:19.997897+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:36:19.999360+0000 | compress_modules | INFO - Quantizing model.layers.26.self_attn.v_proj using 1024 samples
2026-02-13T18:36:21.189925+0000 | compress | METRIC - time 1.19s
2026-02-13T18:36:21.191815+0000 | compress | METRI

(27/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 309.70it/s]
(28/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 300.98it/s]

2026-02-13T18:36:29.158365+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.q_proj using 1024 samples





2026-02-13T18:36:30.362250+0000 | compress | METRIC - time 1.20s
2026-02-13T18:36:30.364148+0000 | compress | METRIC - error 2363.99
2026-02-13T18:36:30.365007+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:30.365564+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:36:30.366501+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.k_proj using 1024 samples
2026-02-13T18:36:31.544925+0000 | compress | METRIC - time 1.18s
2026-02-13T18:36:31.546064+0000 | compress | METRIC - error 617.77
2026-02-13T18:36:31.546957+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:31.547575+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:36:31.548595+0000 | compress_modules | INFO - Quantizing model.layers.27.self_attn.v_proj using 1024 samples
2026-02-13T18:36:32.741996+0000 | compress | METRIC - time 1.19s
2026-02-13T18:36:32.743970+0000 | compress | METRI

(28/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 313.04it/s]
(29/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 257.00it/s]

2026-02-13T18:36:41.251525+0000 | compress_modules | INFO - Quantizing model.layers.28.self_attn.q_proj using 1024 samples





2026-02-13T18:36:42.455607+0000 | compress | METRIC - time 1.20s
2026-02-13T18:36:42.457553+0000 | compress | METRIC - error 2703.70
2026-02-13T18:36:42.458297+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:42.458820+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:36:42.459725+0000 | compress_modules | INFO - Quantizing model.layers.28.self_attn.k_proj using 1024 samples
2026-02-13T18:36:43.652813+0000 | compress | METRIC - time 1.19s
2026-02-13T18:36:43.654854+0000 | compress | METRIC - error 704.62
2026-02-13T18:36:43.656057+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:43.656825+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:36:43.658043+0000 | compress_modules | INFO - Quantizing model.layers.28.self_attn.v_proj using 1024 samples
2026-02-13T18:36:44.843911+0000 | compress | METRIC - time 1.19s
2026-02-13T18:36:44.845838+0000 | compress | METRI

(29/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 315.39it/s]
(30/31): Calibrating: 100%|██████████| 1024/1024 [00:03<00:00, 297.49it/s]

2026-02-13T18:36:52.784920+0000 | compress_modules | INFO - Quantizing model.layers.29.self_attn.q_proj using 1024 samples





2026-02-13T18:36:53.999366+0000 | compress | METRIC - time 1.21s
2026-02-13T18:36:54.001504+0000 | compress | METRIC - error 2680.24
2026-02-13T18:36:54.002671+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:54.003314+0000 | compress | METRIC - Compressed module size: 8.486912 MB
2026-02-13T18:36:54.004413+0000 | compress_modules | INFO - Quantizing model.layers.29.self_attn.k_proj using 1024 samples
2026-02-13T18:36:55.199883+0000 | compress | METRIC - time 1.19s
2026-02-13T18:36:55.201902+0000 | compress | METRIC - error 766.38
2026-02-13T18:36:55.202997+0000 | compress | METRIC - GPU 0 | usage: 2.40% | total memory: 85 GB
2026-02-13T18:36:55.203641+0000 | compress | METRIC - Compressed module size: 2.121728 MB
2026-02-13T18:36:55.204630+0000 | compress_modules | INFO - Quantizing model.layers.29.self_attn.v_proj using 1024 samples
2026-02-13T18:36:56.401971+0000 | compress | METRIC - time 1.20s
2026-02-13T18:36:56.403832+0000 | compress | METRI

(30/31): Propagating: 100%|██████████| 1024/1024 [00:03<00:00, 306.96it/s]
(31/31): Calibrating: 100%|██████████| 1024/1024 [00:00<00:00, 1049.23it/s]
(31/31): Propagating: 100%|██████████| 1024/1024 [00:00<00:00, 1070.00it/s]

2026-02-13T18:37:02.920003+0000 | finalize | INFO - Compression lifecycle finalized for 1 modifiers





[INFO] GPTQ(attention-only) success (scheme=W4A16)
[INFO] quantization complete (applied_mode=gptq_attn_only_w4a16, applied_scheme=W4A16)


In [9]:
# Persist the quantized model, its tokenizer, and a JSON record of the
# settings used for this run, all under OUT_DIR.
Path(OUT_DIR).mkdir(parents=True, exist_ok=True)

model.save_pretrained(OUT_DIR, save_compressed=True)
tokenizer.save_pretrained(OUT_DIR)

# Keyword order below is the key order written to quant_recipe.json.
# The timestamp is taken here, i.e. after the model and tokenizer were saved.
quant_recipe = dict(
    timestamp_utc=datetime.now(timezone.utc).isoformat(),
    model_id=MODEL_ID,
    calibration_sources=CALIB_DATASETS,
    calibration_seed=CALIBRATION_SEED,
    calibration_benchmark_exclude=sorted(CALIBRATION_BENCHMARK_EXCLUDE),
    calibration_source_stats=calibration_source_stats,
    num_calibration_samples=NUM_CALIBRATION_SAMPLES,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    primary_scheme=PRIMARY_SCHEME,
    fallback_scheme=FALLBACK_SCHEME,
    applied_scheme=applied_scheme,
    applied_mode=applied_mode,
    targets=ATTN_TARGETS,
    ignore=IGNORE,
    out_dir=OUT_DIR,
)

recipe_path = Path(OUT_DIR, "quant_recipe.json")
# Serialize before opening the file so a serialization error cannot leave
# a truncated recipe on disk.
recipe_json = json.dumps(quant_recipe, indent=2)
with recipe_path.open("w", encoding="utf-8") as recipe_file:
    recipe_file.write(recipe_json)

print(f"[INFO] quant recipe saved: {recipe_path}")
print(f"[INFO] 모델 저장 완료: {OUT_DIR}")


2026-02-13T18:37:03.335362+0000 | get_model_compressor | INFO - skip_sparsity_compression_stats set to True. Skipping sparsity compression statistic calculations. No sparsity compressor will be applied.


Compressing model: 120it [00:00, 160.40it/s]


[INFO] quant recipe saved: /content/drive/MyDrive/LGAimers/sq_w4a16_attn_calmix1024/quant_recipe.json
[INFO] 모델 저장 완료: /content/drive/MyDrive/LGAimers/sq_w4a16_attn_calmix1024


In [10]:
# Package the saved model directory (OUT_DIR) as a submission archive whose
# single top-level folder is "model/".
import zipfile

zip_name = f"/content/drive/MyDrive/LGAimers/submit/{Path(OUT_DIR).name}"
zip_path = Path(zip_name)  # archive base name, without the ".zip" suffix
# Append the suffix as text: Path.with_suffix() would clobber part of a
# directory name that itself contains a dot.
archive_path = Path(f"{zip_name}.zip")

zip_path.parent.mkdir(parents=True, exist_ok=True)  # create the directory if it is missing

source_dir = Path(OUT_DIR)
if not source_dir.is_dir():
    raise FileNotFoundError(f"model directory not found: {source_dir}")

# Stream entries straight from OUT_DIR into the archive under a "model/"
# prefix. This avoids first copying the whole model into a temporary
# directory just to rename its top-level folder, which doubled the I/O and
# required scratch space equal to the model size.
with zipfile.ZipFile(archive_path, "w", compression=zipfile.ZIP_DEFLATED) as archive:
    # Explicit directory entries (the root and any sub-directories) are
    # written too, so empty folders are preserved in the archive.
    archive.write(source_dir, arcname="model")
    for entry in sorted(source_dir.rglob("*")):
        relative_name = entry.relative_to(source_dir).as_posix()
        archive.write(entry, arcname=f"model/{relative_name}")

print(f"[INFO] 생성 완료: {zip_name}.zip")


[INFO] 생성 완료: /content/drive/MyDrive/LGAimers/submit/sq_w4a16_attn_calmix1024.zip
