## 下载模型和数据集

In [None]:
import os
import subprocess

# 设置 HF_ENDPOINT 环境变量
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

def download_model(model_name):
    try:
        subprocess.run(['huggingface-cli', 'download', '--resume-download', model_name,'--local-dir',model_name], check=True)
        print(f"Model '{model_name}' downloaded successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Error downloading model '{model_name}': {e}")

def download_dataset(dataset_name):
    try:
        subprocess.run(['huggingface-cli', 'download', '--resume-download','--repo-type','dataset',dataset_name,"--local-dir",dataset_name], check=True)
        print(f"Dataset '{dataset_name}' downloaded successfully.")
    except subprocess.CalledProcessError as e:
        print(f"Error downloading dataset '{dataset_name}': {e}")

In [None]:
download_model('facebook/opt-6.7b')

## 加载分词器，配置量化超参数，加载模型

In [7]:
from transformers import AutoModelForCausalLM, GPTQConfig, AutoTokenizer
import torch
model_id = "/mnt/data/opt-6.7b"

In [8]:
config=GPTQConfig(
    bits=4,
    group_size=128,
    dataset=["Hello! How can I assist you today?"],
    desc_act=False
    
)


In [9]:
tokenizer=AutoTokenizer.from_pretrained(model_id,trust_remote_code=True)

In [10]:
quant_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=config, device_map="auto")

Loading checkpoint shards: 100%|██████████| 2/2 [01:11<00:00, 35.95s/it]
Quantizing model.decoder.layers blocks :   0%|          | 0/32 [00:00<?, ?it/s]
Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s][A
Quantizing layers inside the block:  17%|█▋        | 1/6 [00:01<00:09,  1.90s/it][A
Quantizing layers inside the block:  33%|███▎      | 2/6 [00:03<00:07,  1.90s/it][A
Quantizing layers inside the block:  50%|█████     | 3/6 [00:05<00:05,  1.90s/it][A
Quantizing layers inside the block:  67%|██████▋   | 4/6 [00:07<00:03,  1.90s/it][A
Quantizing layers inside the block:  83%|████████▎ | 5/6 [00:09<00:01,  1.92s/it][A
Quantizing model.decoder.layers blocks :   0%|          | 0/32 [00:09<?, ?it/s]  [A


OutOfMemoryError: CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacty of 15.78 GiB of which 931.75 MiB is free. Process 13700 has 14.87 GiB memory in use. Of the allocated memory 13.54 GiB is allocated by PyTorch, and 199.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

## 检查模型

In [None]:
quant_model.model.decoder.layers[0].self_attn.q_proj.__dict__

# 显存不够了，我就直接改用125m

In [11]:
model_id = "/mnt/data/opt-125m"
config=GPTQConfig(
    bits=4,
    group_size=128,
    dataset=["Hello! How can I assist you today?"],
    desc_act=False
    
)
quant_model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=config, device_map="auto")

Quantizing model.decoder.layers blocks :   0%|          | 0/12 [00:00<?, ?it/s]
Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s][A
Quantizing layers inside the block:  17%|█▋        | 1/6 [00:00<00:01,  2.78it/s][A
Quantizing layers inside the block:  33%|███▎      | 2/6 [00:00<00:01,  2.77it/s][A
Quantizing layers inside the block:  50%|█████     | 3/6 [00:01<00:01,  2.79it/s][A
Quantizing layers inside the block:  67%|██████▋   | 4/6 [00:01<00:00,  2.79it/s][A
Quantizing layers inside the block:  83%|████████▎ | 5/6 [00:01<00:00,  2.79it/s][A
Quantizing layers inside the block: 100%|██████████| 6/6 [00:03<00:00,  1.38it/s][A
Quantizing model.decoder.layers blocks :   8%|▊         | 1/12 [00:03<00:35,  3.24s/it]
Quantizing layers inside the block:   0%|          | 0/6 [00:00<?, ?it/s][A
Quantizing layers inside the block:  17%|█▋        | 1/6 [00:00<00:01,  2.80it/s][A
Quantizing layers inside the block:  33%|███▎      | 2/6 [00:00<00:01,  2.79it/s][

## 调用模型

In [13]:
text="hello,can you introduce yourself?"
input=tokenizer(text,return_tensors='pt').to(0)


In [14]:
input

{'input_ids': tensor([[    2, 42891,     6,  7424,    47,  6581,  2512,   116]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [15]:
output=quant_model.generate(**input,max_new_tokens=20)
print(tokenizer.decode(output[0],skip_special_tokens=True))

hello,can you introduce yourself?
I am am am am am am am am am am am am am am am am am am


### 有一点问题