In [30]:
from transformers import AutoTokenizer, GPTQConfig, BertForSequenceClassification, BertTokenizer
from datasets import load_from_disk
from tqdm import tqdm
import torch

In [31]:
# Directory that receives the fp32 copy of the base classifier; it is also
# the checkpoint that gets quantized further down.
model_id = "./7bert"

# Directory that receives the GPTQ-quantized model.
qModelID = "./7qbert"

In [32]:
# Load the './5bert' checkpoint (presumably stored in half precision, going by
# the variable name -- confirm) and cast its parameters to float32.
fp16_model = BertForSequenceClassification.from_pretrained('./5bert')

# nn.Module.to() converts the module in place and returns the same object, so
# `fp32_model` and `fp16_model` are two names for one (now float32) model.
fp32_model = fp16_model.to(torch.float32)

# Persist the float32 weights; this directory is the input to quantization.
fp32_model.save_pretrained(model_id)

In [33]:
# Tokenizer and dataset used to build the GPTQ calibration set.
tokenizer = BertTokenizer.from_pretrained('./data/bert/bert_tokenizer')

# Shuffle with a fixed seed so the calibration examples -- and therefore the
# quantized weights -- are the same on every Restart & Run All.
dataset = load_from_disk('./data/bert/imdb_dataset').shuffle(seed=42)

In [34]:
# Reload the checkpoint saved under `model_id` as a two-label sequence
# classifier; the next cell inspects its parameter dtype.
model = BertForSequenceClassification.from_pretrained(
    model_id,
    num_labels=2,
)

In [None]:
# Inspect the dtype of the model's first parameter.
first_param_dtype = next(model.parameters()).dtype
print(f"The model's first parameter dtype is: {first_param_dtype}")

# Report whether the model is FP32, FP16, or something else.
precision_messages = {
    torch.float32: "The model is in FP32.",
    torch.float16: "The model is in FP16.",
}
print(precision_messages.get(first_param_dtype, "The model is in another precision format."))

In [None]:
# Collect calibration texts for GPTQ: walk the first 1000 training rows and
# keep those whose tokenized length is below 512 input ids.
# NOTE(review): presumably this drops reviews that hit the tokenizer's
# truncation limit -- confirm the tokenizer's max length is 512.
inputdata = [
    sample['text']
    for sample in (dataset['train'][row] for row in tqdm(range(1000)))
    if len(tokenizer(sample['text'], padding=True, truncation=True)['input_ids']) < 512
]

In [37]:
# 4-bit GPTQ settings: calibrate on the texts gathered in `inputdata`,
# tokenize them with `tokenizer`, and quantize the blocks found under
# 'bert.encoder.layer'. The exllama kernels are switched off.
gptq_config = GPTQConfig(
    bits=4,
    dataset=inputdata,
    tokenizer=tokenizer,
    block_name_to_quantize='bert.encoder.layer',
    use_exllama=False,
)

In [None]:
# Load the fp32 checkpoint with the GPTQ configuration attached; per the
# Transformers GPTQ guide, passing `quantization_config` triggers the
# quantization during loading.
quantized_model = BertForSequenceClassification.from_pretrained(
    model_id,
    quantization_config=gptq_config,
    low_cpu_mem_usage=False,
)

In [39]:
# Move the quantized model to the CPU, then write it to `qModelID`.
quantized_model = quantized_model.to("cpu")
quantized_model.save_pretrained(qModelID)

In [53]:
# Tokenizer and dataset for the accuracy evaluation.
tokenizer = BertTokenizer.from_pretrained('./data/bert/bert_tokenizer')
dataset = load_from_disk('./data/bert/imdb_dataset')

# shuffle() returns a new, shuffled object and leaves the original untouched,
# so the result has to be assigned. Without the assignment the subset below
# would be the first 3000 stored rows; if the split is stored in some order
# (e.g. grouped by label) that sample would not be representative.
# A fixed seed keeps the evaluation subset identical across runs.
dataset = dataset.shuffle(seed=42)

# Evaluate on 3000 examples of the shuffled test split.
data = dataset['test'].select(range(3000))

In [54]:
def testAccuracy(model, data, device):
    """Return the classification accuracy of ``model`` on ``data`` in percent.

    Examples are tokenized with the notebook-level ``tokenizer`` and scored
    one at a time. A prediction is correct when the argmax of the logits
    equals the example's ``'label'``.

    Args:
        model: Model whose forward output exposes ``.logits``. It is moved to
            ``device`` and put in eval mode.
        data: Indexable collection supporting ``len()`` whose items provide
            ``'text'`` and ``'label'`` entries.
        device: Device for the model and the tokenized inputs (e.g. ``'cuda'``).

    Returns:
        Accuracy as a float between 0 and 100.

    Raises:
        ValueError: If ``data`` is empty (accuracy would be undefined).
    """
    num_examples = len(data)
    if num_examples == 0:
        raise ValueError("data must contain at least one example")

    model = model.to(device)
    model.eval()

    num_wrong = 0
    # Evaluation only needs forward passes; disabling autograd avoids building
    # a graph and holding activations for every example.
    with torch.no_grad():
        for idx in tqdm(range(num_examples)):
            example = data[idx]
            encoded = tokenizer(
                example['text'], return_tensors="pt", padding=True, truncation=True
            ).to(device)
            logits = model(**encoded).logits
            if int(torch.argmax(logits)) != example['label']:
                num_wrong += 1
    return 100 - num_wrong / num_examples * 100

In [None]:
# Point at the quantized checkpoint to evaluate and load it.
# NOTE(review): this overrides the earlier qModelID ("./7qbert"), which is
# where the model quantized above was saved -- confirm "./5qbert" is the
# checkpoint that is meant to be evaluated here.
qModelID = "./5qbert"
model = BertForSequenceClassification.from_pretrained(
    qModelID,
    low_cpu_mem_usage=False,
)

In [None]:
# Accuracy (in percent) of the loaded model on the evaluation subset, on GPU.
testAccuracy(model, data, device='cuda')