# **Meta-Llama-3.1-8B-Instruct 모델 GPTQ 양자화하기**

# 1. 라이브러리 다운로드

In [1]:
import os

# Export TOKENIZERS_PARALLELISM="false" for this process before any tokenizer
# is loaded further down in the notebook.
# NOTE(review): presumably this silences the Hugging Face tokenizers
# fork/parallelism warning — confirm.
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [2]:
!pip install -q -U peft accelerate optimum
!pip install datasets==2.15.0
!pip install auto-gptq --extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/
!pip install huggingface_hub
!pip install langchain
!pip install transformers[torch] -U
!pip install torch --extra-index-url https://download.pytorch.org/whl/cu118


[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: python -m pip install --upgrade pip
Collecting datasets==2.15.0
  Downloading datasets-2.15.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets==2.15.0)
  Downloading dill-0.3.7-py3-none-any.whl.metadata (9.9 kB)
Collecting fsspec<=2023.10.0,>=2023.1.0 (from fsspec[http]<=2023.10.0,>=2023.1.0->datasets==2.15.0)
  Downloading fsspec-2023.10.0-py3-none-any.whl.metadata (6.8 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
Collecting multiprocess (from datasets==2.15.0)
  Downloading multiprocess-0.70.15-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch

# Ask PyTorch whether CUDA is usable, then report the answer.
cuda_ready = torch.cuda.is_available()
print("Is CUDA available:", cuda_ready)

Is CUDA available: True


# 2. 지원되는 데이터셋을 전달하여 모델 양자화하기
- 'wikitext2' 데이터셋 사용
- 추후 재정정보 데이터셋을 전처리한 후 사용 고려
- 4비트 정밀도로 양자화 (지원 : 2, 3, 4, 8)

> **전처리 시 고려사항**
1. PyPDF2, pdfplumber, pdfminer 같은 라이브러리로 pdf에서 텍스트 추출
2. 불필요한 공백, 페이지 번호, 제목 등 필요하지 않은 정보 제거
3. 순서가 맞지 않는 문장 재구성, 정제
4. 데이터셋 형태로 저장(JSON, CSV, TXT 등)

In [5]:
import logging

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GPTQConfig

# Basic logging setup: INFO and above, formatted as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
root_log = logging.getLogger()

# Model ID: a Llama 3.1-based checkpoint fine-tuned for Korean (per the ID).
model_id = "sh2orc/Llama-3.1-Korean-8B-Instruct"

# GPTQ quantization settings, collected in one place and unpacked below.
gptq_settings = {
    "bits": 4,
    # Commonly used group size.
    "group_size": 128,
    # Default calibration dataset for quantization; consider replacing it
    # later with the provided dataset once it has been preprocessed.
    "dataset": "wikitext2",
    "desc_act": False,
}
quantization_config = GPTQConfig(**gptq_settings)

# Load the tokenizer.
root_log.info(f"{model_id} 모델을 위한 토크나이저 로딩 중...")
tokenizer = AutoTokenizer.from_pretrained(model_id)
root_log.info("토크나이저 로딩 완료")

# Load the model; passing the GPTQ config makes loading also quantize it.
root_log.info(f"{model_id} 모델 로딩 및 양자화 설정 적용 중: {quantization_config}")
load_options = {
    "quantization_config": quantization_config,
    # Let the library place the model on the available device(s) automatically.
    "device_map": "auto",
    # Use float16 to save memory.
    "torch_dtype": torch.float16,
}
quant_model = AutoModelForCausalLM.from_pretrained(model_id, **load_options)
root_log.info("모델 로딩 및 양자화 완료")

2024-08-13 12:13:29,834 - INFO - sh2orc/Llama-3.1-Korean-8B-Instruct 모델을 위한 토크나이저 로딩 중...
2024-08-13 12:13:30,320 - INFO - 토크나이저 로딩 완료
2024-08-13 12:13:30,321 - INFO - sh2orc/Llama-3.1-Korean-8B-Instruct 모델 로딩 및 양자화 설정 적용 중: GPTQConfig(quant_method=<QuantizationMethod.GPTQ: 'gptq'>)
2024-08-13 12:13:30,476 - INFO - We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Quantizing model.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]

2024-08-13 12:14:33,346 - INFO - Start quantizing block model.layers 1/32
2024-08-13 12:14:33,347 - INFO - Module to quantize [['self_attn.q_proj'], ['self_attn.k_proj'], ['self_attn.v_proj'], ['self_attn.o_proj'], ['mlp.gate_proj'], ['mlp.up_proj'], ['mlp.down_proj']]


Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:14:35,435 - INFO - Quantizing self_attn.q_proj in block 1/32...
2024-08-13 12:14:37,746 - INFO - duration: 2.3090169429779053
2024-08-13 12:14:37,749 - INFO - avg loss: 42.028236389160156
2024-08-13 12:14:39,831 - INFO - Quantizing self_attn.k_proj in block 1/32...
2024-08-13 12:14:42,120 - INFO - duration: 2.286870002746582
2024-08-13 12:14:42,123 - INFO - avg loss: 24.211139678955078
2024-08-13 12:14:44,212 - INFO - Quantizing self_attn.v_proj in block 1/32...
2024-08-13 12:14:46,512 - INFO - duration: 2.2977356910705566
2024-08-13 12:14:46,514 - INFO - avg loss: 0.974865198135376
2024-08-13 12:14:48,608 - INFO - Quantizing self_attn.o_proj in block 1/32...
2024-08-13 12:14:50,941 - INFO - duration: 2.330991268157959
2024-08-13 12:14:50,944 - INFO - avg loss: 0.020795434713363647
2024-08-13 12:14:53,036 - INFO - Quantizing mlp.gate_proj in block 1/32...
2024-08-13 12:14:55,345 - INFO - duration: 2.306201457977295
2024-08-13 12:14:55,348 - INFO - avg loss: 20.37120628356

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:15:24,454 - INFO - Quantizing self_attn.q_proj in block 2/32...
2024-08-13 12:15:26,804 - INFO - duration: 2.3476974964141846
2024-08-13 12:15:26,806 - INFO - avg loss: 50.9440803527832
2024-08-13 12:15:28,919 - INFO - Quantizing self_attn.k_proj in block 2/32...
2024-08-13 12:15:31,252 - INFO - duration: 2.330869674682617
2024-08-13 12:15:31,254 - INFO - avg loss: 29.199434280395508
2024-08-13 12:15:33,374 - INFO - Quantizing self_attn.v_proj in block 2/32...
2024-08-13 12:15:35,710 - INFO - duration: 2.3346173763275146
2024-08-13 12:15:35,713 - INFO - avg loss: 2.16469407081604
2024-08-13 12:15:37,834 - INFO - Quantizing self_attn.o_proj in block 2/32...
2024-08-13 12:15:40,195 - INFO - duration: 2.35874605178833
2024-08-13 12:15:40,198 - INFO - avg loss: 0.11542366445064545
2024-08-13 12:15:42,312 - INFO - Quantizing mlp.gate_proj in block 2/32...
2024-08-13 12:15:44,683 - INFO - duration: 2.3697667121887207
2024-08-13 12:15:44,687 - INFO - avg loss: 35.947383880615234

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:16:13,770 - INFO - Quantizing self_attn.q_proj in block 3/32...
2024-08-13 12:16:16,107 - INFO - duration: 2.3349392414093018
2024-08-13 12:16:16,110 - INFO - avg loss: 231.9821014404297
2024-08-13 12:16:18,242 - INFO - Quantizing self_attn.k_proj in block 3/32...
2024-08-13 12:16:20,576 - INFO - duration: 2.3318052291870117
2024-08-13 12:16:20,579 - INFO - avg loss: 142.6824188232422
2024-08-13 12:16:22,716 - INFO - Quantizing self_attn.v_proj in block 3/32...
2024-08-13 12:16:25,045 - INFO - duration: 2.326442003250122
2024-08-13 12:16:25,048 - INFO - avg loss: 7.545915603637695
2024-08-13 12:16:27,167 - INFO - Quantizing self_attn.o_proj in block 3/32...
2024-08-13 12:16:29,510 - INFO - duration: 2.3411972522735596
2024-08-13 12:16:29,513 - INFO - avg loss: 0.08965198695659637
2024-08-13 12:16:31,640 - INFO - Quantizing mlp.gate_proj in block 3/32...
2024-08-13 12:16:33,997 - INFO - duration: 2.354746103286743
2024-08-13 12:16:33,999 - INFO - avg loss: 67.1592330932617

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:17:03,113 - INFO - Quantizing self_attn.q_proj in block 4/32...
2024-08-13 12:17:05,459 - INFO - duration: 2.343071460723877
2024-08-13 12:17:05,461 - INFO - avg loss: 166.1986541748047
2024-08-13 12:17:07,605 - INFO - Quantizing self_attn.k_proj in block 4/32...
2024-08-13 12:17:09,935 - INFO - duration: 2.3274848461151123
2024-08-13 12:17:09,938 - INFO - avg loss: 91.67578125
2024-08-13 12:17:12,076 - INFO - Quantizing self_attn.v_proj in block 4/32...
2024-08-13 12:17:14,404 - INFO - duration: 2.3262999057769775
2024-08-13 12:17:14,407 - INFO - avg loss: 8.517759323120117
2024-08-13 12:17:16,526 - INFO - Quantizing self_attn.o_proj in block 4/32...
2024-08-13 12:17:18,873 - INFO - duration: 2.3448173999786377
2024-08-13 12:17:18,875 - INFO - avg loss: 0.21951833367347717
2024-08-13 12:17:20,994 - INFO - Quantizing mlp.gate_proj in block 4/32...
2024-08-13 12:17:23,353 - INFO - duration: 2.3560845851898193
2024-08-13 12:17:23,356 - INFO - avg loss: 106.84910583496094
20

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:17:52,470 - INFO - Quantizing self_attn.q_proj in block 5/32...
2024-08-13 12:17:54,812 - INFO - duration: 2.339782476425171
2024-08-13 12:17:54,814 - INFO - avg loss: 157.26239013671875
2024-08-13 12:17:56,958 - INFO - Quantizing self_attn.k_proj in block 5/32...
2024-08-13 12:17:59,278 - INFO - duration: 2.317671537399292
2024-08-13 12:17:59,280 - INFO - avg loss: 90.3774185180664
2024-08-13 12:18:01,423 - INFO - Quantizing self_attn.v_proj in block 5/32...
2024-08-13 12:18:03,746 - INFO - duration: 2.3212344646453857
2024-08-13 12:18:03,749 - INFO - avg loss: 9.081046104431152
2024-08-13 12:18:05,887 - INFO - Quantizing self_attn.o_proj in block 5/32...
2024-08-13 12:18:08,241 - INFO - duration: 2.3525967597961426
2024-08-13 12:18:08,245 - INFO - avg loss: 0.3407130837440491
2024-08-13 12:18:10,374 - INFO - Quantizing mlp.gate_proj in block 5/32...
2024-08-13 12:18:12,731 - INFO - duration: 2.35528826713562
2024-08-13 12:18:12,735 - INFO - avg loss: 149.834716796875
20

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:18:41,820 - INFO - Quantizing self_attn.q_proj in block 6/32...
2024-08-13 12:18:44,174 - INFO - duration: 2.3526692390441895
2024-08-13 12:18:44,177 - INFO - avg loss: 231.38519287109375
2024-08-13 12:18:46,320 - INFO - Quantizing self_attn.k_proj in block 6/32...
2024-08-13 12:18:48,656 - INFO - duration: 2.333840847015381
2024-08-13 12:18:48,659 - INFO - avg loss: 142.35379028320312
2024-08-13 12:18:50,803 - INFO - Quantizing self_attn.v_proj in block 6/32...
2024-08-13 12:18:53,134 - INFO - duration: 2.3284802436828613
2024-08-13 12:18:53,136 - INFO - avg loss: 9.285213470458984
2024-08-13 12:18:55,256 - INFO - Quantizing self_attn.o_proj in block 6/32...
2024-08-13 12:18:57,594 - INFO - duration: 2.3353450298309326
2024-08-13 12:18:57,596 - INFO - avg loss: 0.45885875821113586
2024-08-13 12:18:59,721 - INFO - Quantizing mlp.gate_proj in block 6/32...
2024-08-13 12:19:02,097 - INFO - duration: 2.374053478240967
2024-08-13 12:19:02,099 - INFO - avg loss: 173.0822448730

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:19:31,312 - INFO - Quantizing self_attn.q_proj in block 7/32...
2024-08-13 12:19:33,648 - INFO - duration: 2.3343465328216553
2024-08-13 12:19:33,651 - INFO - avg loss: 193.93023681640625
2024-08-13 12:19:35,800 - INFO - Quantizing self_attn.k_proj in block 7/32...
2024-08-13 12:19:38,128 - INFO - duration: 2.3265209197998047
2024-08-13 12:19:38,131 - INFO - avg loss: 107.45542907714844
2024-08-13 12:19:40,278 - INFO - Quantizing self_attn.v_proj in block 7/32...
2024-08-13 12:19:42,630 - INFO - duration: 2.3505964279174805
2024-08-13 12:19:42,633 - INFO - avg loss: 9.435697555541992
2024-08-13 12:19:44,769 - INFO - Quantizing self_attn.o_proj in block 7/32...
2024-08-13 12:19:47,196 - INFO - duration: 2.4254062175750732
2024-08-13 12:19:47,200 - INFO - avg loss: 0.7809548377990723
2024-08-13 12:19:49,322 - INFO - Quantizing mlp.gate_proj in block 7/32...
2024-08-13 12:19:51,666 - INFO - duration: 2.3422746658325195
2024-08-13 12:19:51,670 - INFO - avg loss: 188.687255859

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:20:20,817 - INFO - Quantizing self_attn.q_proj in block 8/32...
2024-08-13 12:20:23,170 - INFO - duration: 2.3501670360565186
2024-08-13 12:20:23,173 - INFO - avg loss: 179.76084899902344
2024-08-13 12:20:25,326 - INFO - Quantizing self_attn.k_proj in block 8/32...
2024-08-13 12:20:27,647 - INFO - duration: 2.318657398223877
2024-08-13 12:20:27,650 - INFO - avg loss: 106.5436782836914
2024-08-13 12:20:29,799 - INFO - Quantizing self_attn.v_proj in block 8/32...
2024-08-13 12:20:32,125 - INFO - duration: 2.3233718872070312
2024-08-13 12:20:32,128 - INFO - avg loss: 9.293523788452148
2024-08-13 12:20:34,273 - INFO - Quantizing self_attn.o_proj in block 8/32...
2024-08-13 12:20:36,597 - INFO - duration: 2.321821928024292
2024-08-13 12:20:36,600 - INFO - avg loss: 1.161210060119629
2024-08-13 12:20:38,722 - INFO - Quantizing mlp.gate_proj in block 8/32...
2024-08-13 12:20:41,091 - INFO - duration: 2.3668031692504883
2024-08-13 12:20:41,095 - INFO - avg loss: 190.3667907714843

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:21:10,271 - INFO - Quantizing self_attn.q_proj in block 9/32...
2024-08-13 12:21:12,673 - INFO - duration: 2.399696111679077
2024-08-13 12:21:12,676 - INFO - avg loss: 236.83633422851562
2024-08-13 12:21:14,812 - INFO - Quantizing self_attn.k_proj in block 9/32...
2024-08-13 12:21:17,141 - INFO - duration: 2.3270413875579834
2024-08-13 12:21:17,144 - INFO - avg loss: 145.73944091796875
2024-08-13 12:21:19,294 - INFO - Quantizing self_attn.v_proj in block 9/32...
2024-08-13 12:21:21,627 - INFO - duration: 2.331228494644165
2024-08-13 12:21:21,630 - INFO - avg loss: 13.035892486572266
2024-08-13 12:21:23,766 - INFO - Quantizing self_attn.o_proj in block 9/32...
2024-08-13 12:21:26,069 - INFO - duration: 2.3012495040893555
2024-08-13 12:21:26,072 - INFO - avg loss: 1.6786541938781738
2024-08-13 12:21:28,193 - INFO - Quantizing mlp.gate_proj in block 9/32...
2024-08-13 12:21:30,585 - INFO - duration: 2.390007257461548
2024-08-13 12:21:30,589 - INFO - avg loss: 203.66647338867

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:21:59,724 - INFO - Quantizing self_attn.q_proj in block 10/32...
2024-08-13 12:22:02,062 - INFO - duration: 2.335982322692871
2024-08-13 12:22:02,065 - INFO - avg loss: 228.21009826660156
2024-08-13 12:22:04,214 - INFO - Quantizing self_attn.k_proj in block 10/32...
2024-08-13 12:22:06,552 - INFO - duration: 2.3359363079071045
2024-08-13 12:22:06,556 - INFO - avg loss: 135.29937744140625
2024-08-13 12:22:08,688 - INFO - Quantizing self_attn.v_proj in block 10/32...
2024-08-13 12:22:11,021 - INFO - duration: 2.330453872680664
2024-08-13 12:22:11,024 - INFO - avg loss: 17.683921813964844
2024-08-13 12:22:13,159 - INFO - Quantizing self_attn.o_proj in block 10/32...
2024-08-13 12:22:15,513 - INFO - duration: 2.3517191410064697
2024-08-13 12:22:15,516 - INFO - avg loss: 1.7565605640411377
2024-08-13 12:22:17,645 - INFO - Quantizing mlp.gate_proj in block 10/32...
2024-08-13 12:22:20,009 - INFO - duration: 2.3620030879974365
2024-08-13 12:22:20,013 - INFO - avg loss: 215.92918

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:22:49,142 - INFO - Quantizing self_attn.q_proj in block 11/32...
2024-08-13 12:22:51,494 - INFO - duration: 2.3497464656829834
2024-08-13 12:22:51,496 - INFO - avg loss: 286.0300598144531
2024-08-13 12:22:53,643 - INFO - Quantizing self_attn.k_proj in block 11/32...
2024-08-13 12:22:55,987 - INFO - duration: 2.3419744968414307
2024-08-13 12:22:55,989 - INFO - avg loss: 179.50184631347656
2024-08-13 12:22:58,133 - INFO - Quantizing self_attn.v_proj in block 11/32...
2024-08-13 12:23:00,463 - INFO - duration: 2.328028917312622
2024-08-13 12:23:00,465 - INFO - avg loss: 15.12006950378418
2024-08-13 12:23:02,586 - INFO - Quantizing self_attn.o_proj in block 11/32...
2024-08-13 12:23:04,934 - INFO - duration: 2.345777750015259
2024-08-13 12:23:04,936 - INFO - avg loss: 1.9192898273468018
2024-08-13 12:23:07,055 - INFO - Quantizing mlp.gate_proj in block 11/32...
2024-08-13 12:23:09,438 - INFO - duration: 2.381199598312378
2024-08-13 12:23:09,440 - INFO - avg loss: 208.05307006

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:23:38,560 - INFO - Quantizing self_attn.q_proj in block 12/32...
2024-08-13 12:23:40,907 - INFO - duration: 2.3442063331604004
2024-08-13 12:23:40,908 - INFO - avg loss: 237.93594360351562
2024-08-13 12:23:43,058 - INFO - Quantizing self_attn.k_proj in block 12/32...
2024-08-13 12:23:45,391 - INFO - duration: 2.331113815307617
2024-08-13 12:23:45,394 - INFO - avg loss: 152.92620849609375
2024-08-13 12:23:47,533 - INFO - Quantizing self_attn.v_proj in block 12/32...
2024-08-13 12:23:49,860 - INFO - duration: 2.324761152267456
2024-08-13 12:23:49,863 - INFO - avg loss: 14.317408561706543
2024-08-13 12:23:51,984 - INFO - Quantizing self_attn.o_proj in block 12/32...
2024-08-13 12:23:54,327 - INFO - duration: 2.34072208404541
2024-08-13 12:23:54,329 - INFO - avg loss: 2.1274776458740234
2024-08-13 12:23:56,448 - INFO - Quantizing mlp.gate_proj in block 12/32...
2024-08-13 12:23:58,791 - INFO - duration: 2.3402297496795654
2024-08-13 12:23:58,795 - INFO - avg loss: 219.1680297

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:24:27,952 - INFO - Quantizing self_attn.q_proj in block 13/32...
2024-08-13 12:24:30,326 - INFO - duration: 2.370281457901001
2024-08-13 12:24:30,329 - INFO - avg loss: 202.4832763671875
2024-08-13 12:24:32,480 - INFO - Quantizing self_attn.k_proj in block 13/32...
2024-08-13 12:24:34,807 - INFO - duration: 2.324845314025879
2024-08-13 12:24:34,810 - INFO - avg loss: 113.98382568359375
2024-08-13 12:24:36,952 - INFO - Quantizing self_attn.v_proj in block 13/32...
2024-08-13 12:24:39,273 - INFO - duration: 2.3193159103393555
2024-08-13 12:24:39,276 - INFO - avg loss: 16.267288208007812
2024-08-13 12:24:41,402 - INFO - Quantizing self_attn.o_proj in block 13/32...
2024-08-13 12:24:43,754 - INFO - duration: 2.3502700328826904
2024-08-13 12:24:43,757 - INFO - avg loss: 2.5945496559143066
2024-08-13 12:24:45,885 - INFO - Quantizing mlp.gate_proj in block 13/32...
2024-08-13 12:24:48,256 - INFO - duration: 2.368833303451538
2024-08-13 12:24:48,258 - INFO - avg loss: 221.3030395

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:25:17,348 - INFO - Quantizing self_attn.q_proj in block 14/32...
2024-08-13 12:25:19,683 - INFO - duration: 2.3324007987976074
2024-08-13 12:25:19,686 - INFO - avg loss: 278.56903076171875
2024-08-13 12:25:21,830 - INFO - Quantizing self_attn.k_proj in block 14/32...
2024-08-13 12:25:24,154 - INFO - duration: 2.321295738220215
2024-08-13 12:25:24,157 - INFO - avg loss: 184.80929565429688
2024-08-13 12:25:26,298 - INFO - Quantizing self_attn.v_proj in block 14/32...
2024-08-13 12:25:28,611 - INFO - duration: 2.311035394668579
2024-08-13 12:25:28,613 - INFO - avg loss: 19.082439422607422
2024-08-13 12:25:30,736 - INFO - Quantizing self_attn.o_proj in block 14/32...
2024-08-13 12:25:33,075 - INFO - duration: 2.336509943008423
2024-08-13 12:25:33,077 - INFO - avg loss: 3.1353139877319336
2024-08-13 12:25:35,207 - INFO - Quantizing mlp.gate_proj in block 14/32...
2024-08-13 12:25:37,584 - INFO - duration: 2.3750834465026855
2024-08-13 12:25:37,588 - INFO - avg loss: 234.252624

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:26:06,679 - INFO - Quantizing self_attn.q_proj in block 15/32...
2024-08-13 12:26:08,980 - INFO - duration: 2.2989895343780518
2024-08-13 12:26:08,983 - INFO - avg loss: 294.7220458984375
2024-08-13 12:26:11,127 - INFO - Quantizing self_attn.k_proj in block 15/32...
2024-08-13 12:26:13,412 - INFO - duration: 2.283842086791992
2024-08-13 12:26:13,415 - INFO - avg loss: 204.1912841796875
2024-08-13 12:26:15,562 - INFO - Quantizing self_attn.v_proj in block 15/32...
2024-08-13 12:26:17,909 - INFO - duration: 2.345252752304077
2024-08-13 12:26:17,911 - INFO - avg loss: 20.576343536376953
2024-08-13 12:26:20,041 - INFO - Quantizing self_attn.o_proj in block 15/32...
2024-08-13 12:26:22,370 - INFO - duration: 2.3264260292053223
2024-08-13 12:26:22,372 - INFO - avg loss: 3.579751968383789
2024-08-13 12:26:24,492 - INFO - Quantizing mlp.gate_proj in block 15/32...
2024-08-13 12:26:26,839 - INFO - duration: 2.3456342220306396
2024-08-13 12:26:26,841 - INFO - avg loss: 278.03125
20

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:26:55,929 - INFO - Quantizing self_attn.q_proj in block 16/32...
2024-08-13 12:26:58,263 - INFO - duration: 2.331707715988159
2024-08-13 12:26:58,265 - INFO - avg loss: 322.79547119140625
2024-08-13 12:27:00,417 - INFO - Quantizing self_attn.k_proj in block 16/32...
2024-08-13 12:27:02,727 - INFO - duration: 2.3087220191955566
2024-08-13 12:27:02,730 - INFO - avg loss: 171.013916015625
2024-08-13 12:27:04,866 - INFO - Quantizing self_attn.v_proj in block 16/32...
2024-08-13 12:27:07,157 - INFO - duration: 2.2881412506103516
2024-08-13 12:27:07,159 - INFO - avg loss: 24.086652755737305
2024-08-13 12:27:09,282 - INFO - Quantizing self_attn.o_proj in block 16/32...
2024-08-13 12:27:11,610 - INFO - duration: 2.3259952068328857
2024-08-13 12:27:11,612 - INFO - avg loss: 3.0098648071289062
2024-08-13 12:27:13,732 - INFO - Quantizing mlp.gate_proj in block 16/32...
2024-08-13 12:27:16,091 - INFO - duration: 2.3575053215026855
2024-08-13 12:27:16,093 - INFO - avg loss: 303.301818

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:27:45,225 - INFO - Quantizing self_attn.q_proj in block 17/32...
2024-08-13 12:27:47,563 - INFO - duration: 2.3359267711639404
2024-08-13 12:27:47,565 - INFO - avg loss: 310.843017578125
2024-08-13 12:27:49,710 - INFO - Quantizing self_attn.k_proj in block 17/32...
2024-08-13 12:27:52,021 - INFO - duration: 2.3085434436798096
2024-08-13 12:27:52,023 - INFO - avg loss: 180.99252319335938
2024-08-13 12:27:54,153 - INFO - Quantizing self_attn.v_proj in block 17/32...
2024-08-13 12:27:56,472 - INFO - duration: 2.3167459964752197
2024-08-13 12:27:56,475 - INFO - avg loss: 22.79961585998535
2024-08-13 12:27:58,598 - INFO - Quantizing self_attn.o_proj in block 17/32...
2024-08-13 12:28:00,954 - INFO - duration: 2.3542187213897705
2024-08-13 12:28:00,957 - INFO - avg loss: 2.755053758621216
2024-08-13 12:28:03,081 - INFO - Quantizing mlp.gate_proj in block 17/32...
2024-08-13 12:28:05,465 - INFO - duration: 2.3818747997283936
2024-08-13 12:28:05,467 - INFO - avg loss: 326.6124267

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:28:34,621 - INFO - Quantizing self_attn.q_proj in block 18/32...
2024-08-13 12:28:36,968 - INFO - duration: 2.3453216552734375
2024-08-13 12:28:36,971 - INFO - avg loss: 304.5554504394531
2024-08-13 12:28:39,111 - INFO - Quantizing self_attn.k_proj in block 18/32...
2024-08-13 12:28:41,421 - INFO - duration: 2.307816982269287
2024-08-13 12:28:41,424 - INFO - avg loss: 184.3427734375
2024-08-13 12:28:43,564 - INFO - Quantizing self_attn.v_proj in block 18/32...
2024-08-13 12:28:45,876 - INFO - duration: 2.309361696243286
2024-08-13 12:28:45,880 - INFO - avg loss: 23.755367279052734
2024-08-13 12:28:48,014 - INFO - Quantizing self_attn.o_proj in block 18/32...
2024-08-13 12:28:50,368 - INFO - duration: 2.3512353897094727
2024-08-13 12:28:50,371 - INFO - avg loss: 2.197904586791992
2024-08-13 12:28:52,505 - INFO - Quantizing mlp.gate_proj in block 18/32...
2024-08-13 12:28:54,881 - INFO - duration: 2.373922824859619
2024-08-13 12:28:54,885 - INFO - avg loss: 343.887512207031

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:29:24,047 - INFO - Quantizing self_attn.q_proj in block 19/32...
2024-08-13 12:29:26,378 - INFO - duration: 2.3285956382751465
2024-08-13 12:29:26,382 - INFO - avg loss: 300.4314880371094
2024-08-13 12:29:28,532 - INFO - Quantizing self_attn.k_proj in block 19/32...
2024-08-13 12:29:30,862 - INFO - duration: 2.3284213542938232
2024-08-13 12:29:30,866 - INFO - avg loss: 195.93148803710938
2024-08-13 12:29:33,009 - INFO - Quantizing self_attn.v_proj in block 19/32...
2024-08-13 12:29:35,309 - INFO - duration: 2.297760248184204
2024-08-13 12:29:35,312 - INFO - avg loss: 23.605676651000977
2024-08-13 12:29:37,436 - INFO - Quantizing self_attn.o_proj in block 19/32...
2024-08-13 12:29:39,746 - INFO - duration: 2.3075056076049805
2024-08-13 12:29:39,749 - INFO - avg loss: 1.412214994430542
2024-08-13 12:29:41,878 - INFO - Quantizing mlp.gate_proj in block 19/32...
2024-08-13 12:29:44,231 - INFO - duration: 2.3501784801483154
2024-08-13 12:29:44,235 - INFO - avg loss: 354.933410

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:30:13,447 - INFO - Quantizing self_attn.q_proj in block 20/32...
2024-08-13 12:30:15,810 - INFO - duration: 2.360743284225464
2024-08-13 12:30:15,814 - INFO - avg loss: 301.09063720703125
2024-08-13 12:30:17,956 - INFO - Quantizing self_attn.k_proj in block 20/32...
2024-08-13 12:30:20,253 - INFO - duration: 2.2947027683258057
2024-08-13 12:30:20,255 - INFO - avg loss: 176.5435791015625
2024-08-13 12:30:22,403 - INFO - Quantizing self_attn.v_proj in block 20/32...
2024-08-13 12:30:24,695 - INFO - duration: 2.2898306846618652
2024-08-13 12:30:24,698 - INFO - avg loss: 24.614946365356445
2024-08-13 12:30:26,821 - INFO - Quantizing self_attn.o_proj in block 20/32...
2024-08-13 12:30:29,167 - INFO - duration: 2.343763589859009
2024-08-13 12:30:29,170 - INFO - avg loss: 1.3274917602539062
2024-08-13 12:30:31,289 - INFO - Quantizing mlp.gate_proj in block 20/32...
2024-08-13 12:30:33,652 - INFO - duration: 2.360658884048462
2024-08-13 12:30:33,655 - INFO - avg loss: 375.0755004

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:31:02,797 - INFO - Quantizing self_attn.q_proj in block 21/32...
2024-08-13 12:31:05,176 - INFO - duration: 2.377578020095825
2024-08-13 12:31:05,179 - INFO - avg loss: 316.66668701171875
2024-08-13 12:31:07,330 - INFO - Quantizing self_attn.k_proj in block 21/32...
2024-08-13 12:31:09,661 - INFO - duration: 2.3286612033843994
2024-08-13 12:31:09,664 - INFO - avg loss: 199.58489990234375
2024-08-13 12:31:11,805 - INFO - Quantizing self_attn.v_proj in block 21/32...
2024-08-13 12:31:14,139 - INFO - duration: 2.331873893737793
2024-08-13 12:31:14,142 - INFO - avg loss: 28.500093460083008
2024-08-13 12:31:16,277 - INFO - Quantizing self_attn.o_proj in block 21/32...
2024-08-13 12:31:18,637 - INFO - duration: 2.358736038208008
2024-08-13 12:31:18,640 - INFO - avg loss: 1.4085948467254639
2024-08-13 12:31:20,759 - INFO - Quantizing mlp.gate_proj in block 21/32...
2024-08-13 12:31:23,141 - INFO - duration: 2.379887104034424
2024-08-13 12:31:23,144 - INFO - avg loss: 398.2726745

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:31:52,347 - INFO - Quantizing self_attn.q_proj in block 22/32...
2024-08-13 12:31:54,703 - INFO - duration: 2.3530707359313965
2024-08-13 12:31:54,707 - INFO - avg loss: 301.56903076171875
2024-08-13 12:31:56,857 - INFO - Quantizing self_attn.k_proj in block 22/32...
2024-08-13 12:31:59,171 - INFO - duration: 2.311819553375244
2024-08-13 12:31:59,174 - INFO - avg loss: 185.151611328125
2024-08-13 12:32:01,322 - INFO - Quantizing self_attn.v_proj in block 22/32...
2024-08-13 12:32:03,654 - INFO - duration: 2.3288512229919434
2024-08-13 12:32:03,657 - INFO - avg loss: 29.882423400878906
2024-08-13 12:32:05,780 - INFO - Quantizing self_attn.o_proj in block 22/32...
2024-08-13 12:32:08,131 - INFO - duration: 2.348987340927124
2024-08-13 12:32:08,134 - INFO - avg loss: 2.290593147277832
2024-08-13 12:32:10,265 - INFO - Quantizing mlp.gate_proj in block 22/32...
2024-08-13 12:32:12,633 - INFO - duration: 2.3657114505767822
2024-08-13 12:32:12,637 - INFO - avg loss: 426.74325561

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:32:41,801 - INFO - Quantizing self_attn.q_proj in block 23/32...
2024-08-13 12:32:44,135 - INFO - duration: 2.3319053649902344
2024-08-13 12:32:44,138 - INFO - avg loss: 313.28851318359375
2024-08-13 12:32:46,291 - INFO - Quantizing self_attn.k_proj in block 23/32...
2024-08-13 12:32:48,602 - INFO - duration: 2.309458017349243
2024-08-13 12:32:48,605 - INFO - avg loss: 203.236083984375
2024-08-13 12:32:50,742 - INFO - Quantizing self_attn.v_proj in block 23/32...
2024-08-13 12:32:53,060 - INFO - duration: 2.3160927295684814
2024-08-13 12:32:53,062 - INFO - avg loss: 35.67998504638672
2024-08-13 12:32:55,191 - INFO - Quantizing self_attn.o_proj in block 23/32...
2024-08-13 12:32:57,532 - INFO - duration: 2.338890790939331
2024-08-13 12:32:57,535 - INFO - avg loss: 1.8207887411117554
2024-08-13 12:32:59,658 - INFO - Quantizing mlp.gate_proj in block 23/32...
2024-08-13 12:33:02,016 - INFO - duration: 2.3562369346618652
2024-08-13 12:33:02,019 - INFO - avg loss: 437.91470336

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:33:31,180 - INFO - Quantizing self_attn.q_proj in block 24/32...
2024-08-13 12:33:33,504 - INFO - duration: 2.322471857070923
2024-08-13 12:33:33,506 - INFO - avg loss: 311.2845764160156
2024-08-13 12:33:35,652 - INFO - Quantizing self_attn.k_proj in block 24/32...
2024-08-13 12:33:37,968 - INFO - duration: 2.314829111099243
2024-08-13 12:33:37,971 - INFO - avg loss: 197.7870635986328
2024-08-13 12:33:40,119 - INFO - Quantizing self_attn.v_proj in block 24/32...
2024-08-13 12:33:42,435 - INFO - duration: 2.3134641647338867
2024-08-13 12:33:42,437 - INFO - avg loss: 37.16040802001953
2024-08-13 12:33:44,588 - INFO - Quantizing self_attn.o_proj in block 24/32...
2024-08-13 12:33:46,919 - INFO - duration: 2.3291563987731934
2024-08-13 12:33:46,921 - INFO - avg loss: 1.5235705375671387
2024-08-13 12:33:49,040 - INFO - Quantizing mlp.gate_proj in block 24/32...
2024-08-13 12:33:51,399 - INFO - duration: 2.357175350189209
2024-08-13 12:33:51,401 - INFO - avg loss: 459.598449707

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:34:20,526 - INFO - Quantizing self_attn.q_proj in block 25/32...
2024-08-13 12:34:22,833 - INFO - duration: 2.304760694503784
2024-08-13 12:34:22,835 - INFO - avg loss: 313.7362060546875
2024-08-13 12:34:24,962 - INFO - Quantizing self_attn.k_proj in block 25/32...
2024-08-13 12:34:27,274 - INFO - duration: 2.310629367828369
2024-08-13 12:34:27,276 - INFO - avg loss: 193.94436645507812
2024-08-13 12:34:29,415 - INFO - Quantizing self_attn.v_proj in block 25/32...
2024-08-13 12:34:31,730 - INFO - duration: 2.313189744949341
2024-08-13 12:34:31,733 - INFO - avg loss: 48.12168884277344
2024-08-13 12:34:33,866 - INFO - Quantizing self_attn.o_proj in block 25/32...
2024-08-13 12:34:36,196 - INFO - duration: 2.326904773712158
2024-08-13 12:34:36,200 - INFO - avg loss: 1.993638277053833
2024-08-13 12:34:38,350 - INFO - Quantizing mlp.gate_proj in block 25/32...
2024-08-13 12:34:40,733 - INFO - duration: 2.380890369415283
2024-08-13 12:34:40,737 - INFO - avg loss: 490.99429321289

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:35:09,923 - INFO - Quantizing self_attn.q_proj in block 26/32...
2024-08-13 12:35:12,270 - INFO - duration: 2.3452484607696533
2024-08-13 12:35:12,274 - INFO - avg loss: 312.1860046386719
2024-08-13 12:35:14,414 - INFO - Quantizing self_attn.k_proj in block 26/32...
2024-08-13 12:35:16,726 - INFO - duration: 2.30904221534729
2024-08-13 12:35:16,728 - INFO - avg loss: 184.77662658691406
2024-08-13 12:35:18,869 - INFO - Quantizing self_attn.v_proj in block 26/32...
2024-08-13 12:35:21,172 - INFO - duration: 2.3002469539642334
2024-08-13 12:35:21,174 - INFO - avg loss: 49.20032501220703
2024-08-13 12:35:23,313 - INFO - Quantizing self_attn.o_proj in block 26/32...
2024-08-13 12:35:25,641 - INFO - duration: 2.3265018463134766
2024-08-13 12:35:25,644 - INFO - avg loss: 2.391836643218994
2024-08-13 12:35:27,763 - INFO - Quantizing mlp.gate_proj in block 26/32...
2024-08-13 12:35:30,117 - INFO - duration: 2.351135015487671
2024-08-13 12:35:30,119 - INFO - avg loss: 526.730102539

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:35:59,327 - INFO - Quantizing self_attn.q_proj in block 27/32...
2024-08-13 12:36:01,675 - INFO - duration: 2.346006393432617
2024-08-13 12:36:01,677 - INFO - avg loss: 310.6707763671875
2024-08-13 12:36:03,819 - INFO - Quantizing self_attn.k_proj in block 27/32...
2024-08-13 12:36:06,129 - INFO - duration: 2.3086390495300293
2024-08-13 12:36:06,132 - INFO - avg loss: 202.22393798828125
2024-08-13 12:36:08,286 - INFO - Quantizing self_attn.v_proj in block 27/32...
2024-08-13 12:36:10,592 - INFO - duration: 2.3044519424438477
2024-08-13 12:36:10,595 - INFO - avg loss: 49.568199157714844
2024-08-13 12:36:12,722 - INFO - Quantizing self_attn.o_proj in block 27/32...
2024-08-13 12:36:15,073 - INFO - duration: 2.3488504886627197
2024-08-13 12:36:15,076 - INFO - avg loss: 3.5938525199890137
2024-08-13 12:36:17,204 - INFO - Quantizing mlp.gate_proj in block 27/32...
2024-08-13 12:36:19,571 - INFO - duration: 2.365440607070923
2024-08-13 12:36:19,574 - INFO - avg loss: 572.355590

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:36:48,727 - INFO - Quantizing self_attn.q_proj in block 28/32...
2024-08-13 12:36:51,043 - INFO - duration: 2.314298391342163
2024-08-13 12:36:51,045 - INFO - avg loss: 318.90228271484375
2024-08-13 12:36:53,185 - INFO - Quantizing self_attn.k_proj in block 28/32...
2024-08-13 12:36:55,502 - INFO - duration: 2.3144707679748535
2024-08-13 12:36:55,504 - INFO - avg loss: 215.25485229492188
2024-08-13 12:36:57,653 - INFO - Quantizing self_attn.v_proj in block 28/32...
2024-08-13 12:36:59,972 - INFO - duration: 2.3167223930358887
2024-08-13 12:36:59,975 - INFO - avg loss: 67.3883056640625
2024-08-13 12:37:02,104 - INFO - Quantizing self_attn.o_proj in block 28/32...
2024-08-13 12:37:04,441 - INFO - duration: 2.334873676300049
2024-08-13 12:37:04,443 - INFO - avg loss: 4.698245048522949
2024-08-13 12:37:06,569 - INFO - Quantizing mlp.gate_proj in block 28/32...
2024-08-13 12:37:08,936 - INFO - duration: 2.365147113800049
2024-08-13 12:37:08,939 - INFO - avg loss: 622.420288085

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:37:38,141 - INFO - Quantizing self_attn.q_proj in block 29/32...
2024-08-13 12:37:40,480 - INFO - duration: 2.3364717960357666
2024-08-13 12:37:40,482 - INFO - avg loss: 291.7882995605469
2024-08-13 12:37:42,633 - INFO - Quantizing self_attn.k_proj in block 29/32...
2024-08-13 12:37:44,959 - INFO - duration: 2.323512077331543
2024-08-13 12:37:44,961 - INFO - avg loss: 173.3155517578125
2024-08-13 12:37:47,107 - INFO - Quantizing self_attn.v_proj in block 29/32...
2024-08-13 12:37:49,430 - INFO - duration: 2.3210716247558594
2024-08-13 12:37:49,432 - INFO - avg loss: 61.027339935302734
2024-08-13 12:37:51,582 - INFO - Quantizing self_attn.o_proj in block 29/32...
2024-08-13 12:37:53,931 - INFO - duration: 2.3472683429718018
2024-08-13 12:37:53,934 - INFO - avg loss: 7.215464115142822
2024-08-13 12:37:56,066 - INFO - Quantizing mlp.gate_proj in block 29/32...
2024-08-13 12:37:58,445 - INFO - duration: 2.3762571811676025
2024-08-13 12:37:58,448 - INFO - avg loss: 678.4863891

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:38:27,683 - INFO - Quantizing self_attn.q_proj in block 30/32...
2024-08-13 12:38:30,030 - INFO - duration: 2.345587730407715
2024-08-13 12:38:30,033 - INFO - avg loss: 316.3336181640625
2024-08-13 12:38:32,169 - INFO - Quantizing self_attn.k_proj in block 30/32...
2024-08-13 12:38:34,502 - INFO - duration: 2.3308141231536865
2024-08-13 12:38:34,505 - INFO - avg loss: 183.02401733398438
2024-08-13 12:38:36,650 - INFO - Quantizing self_attn.v_proj in block 30/32...
2024-08-13 12:38:38,980 - INFO - duration: 2.3278539180755615
2024-08-13 12:38:38,983 - INFO - avg loss: 71.41181945800781
2024-08-13 12:38:41,112 - INFO - Quantizing self_attn.o_proj in block 30/32...
2024-08-13 12:38:43,454 - INFO - duration: 2.3405373096466064
2024-08-13 12:38:43,456 - INFO - avg loss: 8.477123260498047
2024-08-13 12:38:45,580 - INFO - Quantizing mlp.gate_proj in block 30/32...
2024-08-13 12:38:47,935 - INFO - duration: 2.3525571823120117
2024-08-13 12:38:47,937 - INFO - avg loss: 700.9455566

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:39:17,045 - INFO - Quantizing self_attn.q_proj in block 31/32...
2024-08-13 12:39:19,405 - INFO - duration: 2.3572022914886475
2024-08-13 12:39:19,408 - INFO - avg loss: 293.40142822265625
2024-08-13 12:39:21,555 - INFO - Quantizing self_attn.k_proj in block 31/32...
2024-08-13 12:39:23,878 - INFO - duration: 2.3209688663482666
2024-08-13 12:39:23,880 - INFO - avg loss: 194.94924926757812
2024-08-13 12:39:26,029 - INFO - Quantizing self_attn.v_proj in block 31/32...
2024-08-13 12:39:28,347 - INFO - duration: 2.316722869873047
2024-08-13 12:39:28,350 - INFO - avg loss: 103.46127319335938
2024-08-13 12:39:30,481 - INFO - Quantizing self_attn.o_proj in block 31/32...
2024-08-13 12:39:32,830 - INFO - duration: 2.346766948699951
2024-08-13 12:39:32,833 - INFO - avg loss: 13.251123428344727
2024-08-13 12:39:34,954 - INFO - Quantizing mlp.gate_proj in block 31/32...
2024-08-13 12:39:37,371 - INFO - duration: 2.414619207382202
2024-08-13 12:39:37,376 - INFO - avg loss: 764.360839

Quantizing layers inside the block:   0%|          | 0/7 [00:00<?, ?it/s]

2024-08-13 12:40:06,520 - INFO - Quantizing self_attn.q_proj in block 32/32...
2024-08-13 12:40:08,871 - INFO - duration: 2.3486011028289795
2024-08-13 12:40:08,873 - INFO - avg loss: 232.31521606445312
2024-08-13 12:40:11,013 - INFO - Quantizing self_attn.k_proj in block 32/32...
2024-08-13 12:40:13,346 - INFO - duration: 2.3308284282684326
2024-08-13 12:40:13,349 - INFO - avg loss: 128.8953399658203
2024-08-13 12:40:15,496 - INFO - Quantizing self_attn.v_proj in block 32/32...
2024-08-13 12:40:17,825 - INFO - duration: 2.326690912246704
2024-08-13 12:40:17,827 - INFO - avg loss: 55.518653869628906
2024-08-13 12:40:19,955 - INFO - Quantizing self_attn.o_proj in block 32/32...
2024-08-13 12:40:22,332 - INFO - duration: 2.375446081161499
2024-08-13 12:40:22,334 - INFO - avg loss: 29.380741119384766
2024-08-13 12:40:24,459 - INFO - Quantizing mlp.gate_proj in block 32/32...
2024-08-13 12:40:26,841 - INFO - duration: 2.379107713699341
2024-08-13 12:40:26,843 - INFO - avg loss: 700.1906738

KeyboardInterrupt: 

선형 레이어의 속성을 확인하면 모델이 올바르게 양자화되었는지 알 수 있습니다. 양자화된 레이어에는 qweight 및 qzeros 속성이 포함되어 있어야 하며, 이 속성들의 dtype은 torch.int32여야 합니다.

In [None]:
# Inspect the first attention query projection of the quantized model.
# Llama models expose their transformer blocks at `model.layers`; the
# `model.decoder.layers` path only exists on OPT-style architectures and
# raises AttributeError here.
quant_model.model.layers[0].self_attn.q_proj.__dict__

이제 양자화된 모델이 잘 동작하는지 확인하기 위해 추론을 수행해 보겠습니다. (transformers와 동일한 API를 사용합니다.)

In [None]:
# Smoke-test the quantized model with a short prompt.
tokenizer = AutoTokenizer.from_pretrained(model_id)

text = "안녕"
# NOTE(review): `.to(0)` assumes the model sits on CUDA device 0 - confirm
# against how `quant_model` was loaded.
inputs = tokenizer(text, return_tensors="pt").to(0)
out = quant_model.generate(**inputs)

# Decode once and reuse the result; the original logged `result_text`
# without ever assigning it, which raised NameError.
result_text = tokenizer.decode(out[0], skip_special_tokens=True)
print(result_text)
logging.info(f"생성된 텍스트: {result_text}")

# 3. 양자화된 모델 허깅페이스에 업로드하기

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub (only needs to be run once, the first time).
notebook_login()

In [None]:
# Publish the quantized weights first, then the tokenizer, to the same
# Hugging Face Hub repository.
for uploadable in (quant_model, tokenizer):
    uploadable.push_to_hub("ormor/Llama-3.1-Korean-8B-Instruct-GPTQ-4bit")

# 4. 허깅페이스에 업로드한 모델 가져오기

In [None]:
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
from langchain import HuggingFacePipeline

def setup_llm_pipeline(
    model_id: str = "ormor/Llama-3.1-Korean-8B-Instruct-GPTQ-4bit",
    max_new_tokens: int = 450,
):
    """Build a LangChain LLM wrapper around a Hub-hosted causal LM.

    Args:
        model_id: Hugging Face Hub repository from which both the tokenizer
            and the model are loaded. Defaults to the quantized checkpoint
            uploaded earlier in this notebook.
        max_new_tokens: Value forwarded to the text-generation pipeline's
            ``max_new_tokens`` argument.

    Returns:
        A ``HuggingFacePipeline`` wrapping a ``text-generation`` pipeline
        configured with ``return_full_text=False``.
    """
    # Load the tokenizer and turn off its default system prompt flag.
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False

    # Load the model. No quantization config is passed here; presumably it is
    # read from the checkpoint's own configuration - TODO confirm.
    # NOTE(review): trust_remote_code=True lets code shipped in the Hub repo
    # run locally; confirm it is actually required for this checkpoint.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
    )

    print(f"#### [ model ] ####\n{model}\n###################")

    # Build the transformers pipeline and wrap it for LangChain.
    text_generation_pipeline = pipeline(
        model=model,
        tokenizer=tokenizer,
        task="text-generation",
        return_full_text=False,
        max_new_tokens=max_new_tokens,
    )

    return HuggingFacePipeline(pipeline=text_generation_pipeline)

파이프라인 설정 및 테스트

In [None]:
# Build the LangChain pipeline.
hf_pipeline = setup_llm_pipeline()

# Text generation example. Use `invoke`, the Runnable entry point; calling
# the LLM object directly goes through the deprecated `__call__` path.
text = "안녕"
result = hf_pipeline.invoke(text)
print(result)