In [14]:
pip install auto-gptq transformers accelerate


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [15]:
import torch
import time
from transformers import AutoTokenizer, AutoModelForCausalLM
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig

# --- Configuration shared by the steps below ---
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"          # checkpoint id handed to from_pretrained
quantized_model_dir = "./tinyllama_1_1B_chat_gptq_4bit"  # directory for the quantized artefacts
prompt = "The capital of France is"                      # text the models are asked to continue

# --- Tokenizer (fast implementation requested) ---
tokenizer = AutoTokenizer.from_pretrained(
    model_id,
    use_fast=True,
)

# Reuse the EOS token for padding when the checkpoint does not define a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


In [19]:
# === STEP 1: QUANTIZE FULL-PRECISION MODEL ===
print("\n[Step 1] Quantizing full-precision model...")

# 4-bit GPTQ configuration with a group size of 128.
quant_config = BaseQuantizeConfig(
    bits=4,
    group_size=128,
    desc_act=False,
    damp_percent=0.01,
    true_sequential=True,
    model_name_or_path=model_id
)

# Load the fp16 checkpoint wrapped in the AutoGPTQ model class so it can be quantized.
model = AutoGPTQForCausalLM.from_pretrained(
    model_id,
    quantize_config=quant_config,
    torch_dtype=torch.float16
)

# Dummy calibration data: four short sentences repeated ten times (40 examples).
texts = [
    "The quick brown fox jumps over the lazy dog.",
    "To be or not to be, that is the question.",
    "A journey of a thousand miles begins with a single step.",
    "All that glitters is not gold."
] * 10

# Tokenize each sentence on its own and do NOT pad to max_length: these sentences
# are only a handful of tokens long, so padding each one to 128 tokens would make
# the calibration set consist mostly of pad tokens. Every example is a separate
# single-sequence encoding, so no padding is needed; truncation still caps the
# length at 128 tokens.
calib_dataset = [
    tokenizer(text, return_tensors="pt", truncation=True, max_length=128)
    for text in texts
]

model.quantize(calib_dataset)

# Persist the quantized weights together with the tokenizer files so the
# directory can be loaded on its own later.
model.save_pretrained(quantized_model_dir)
tokenizer.save_pretrained(quantized_model_dir)
print("✅ Quantization complete and saved.")



[Step 1] Quantizing full-precision model...


INFO - Start quantizing layer 1/22
INFO - Quantizing self_attn.k_proj in layer 1/22...
INFO - Quantizing self_attn.v_proj in layer 1/22...
INFO - Quantizing self_attn.q_proj in layer 1/22...
INFO - Quantizing self_attn.o_proj in layer 1/22...
INFO - Quantizing mlp.up_proj in layer 1/22...
INFO - Quantizing mlp.gate_proj in layer 1/22...
INFO - Quantizing mlp.down_proj in layer 1/22...
INFO - Start quantizing layer 2/22
INFO - Quantizing self_attn.k_proj in layer 2/22...
INFO - Quantizing self_attn.v_proj in layer 2/22...
INFO - Quantizing self_attn.q_proj in layer 2/22...
INFO - Quantizing self_attn.o_proj in layer 2/22...
INFO - Quantizing mlp.up_proj in layer 2/22...
INFO - Quantizing mlp.gate_proj in layer 2/22...
INFO - Quantizing mlp.down_proj in layer 2/22...
INFO - Start quantizing layer 3/22
INFO - Quantizing self_attn.k_proj in layer 3/22...
INFO - Quantizing self_attn.v_proj in layer 3/22...
INFO - Quantizing self_attn.q_proj in layer 3/22...
INFO - Quantizing self_attn.o_pro

✅ Quantization complete and saved.


In [None]:
# === STEP 2: LOAD MODELS ===
print("\n[Step 2] Loading models for comparison...")

# Baseline: the original checkpoint in fp16 with automatic device placement.
full_model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.float16, device_map="auto"
)
full_model = full_model.eval()  # switch to inference mode

# Candidate: the GPTQ checkpoint read back from the quantized model directory.
quant_model = AutoGPTQForCausalLM.from_quantized(
    quantized_model_dir, torch_dtype=torch.float16, device_map="auto"
)
quant_model = quant_model.eval()  # switch to inference mode


[Step 2] Loading models for comparison...


1. You disabled CUDA extensions compilation by setting BUILD_CUDA_EXT=0 when install auto_gptq from source.
2. You are using pytorch without CUDA support.
3. CUDA and nvcc are not installed in your device.
INFO - The layer lm_head is not quantized.


  0%|          | 0/817 [00:00<?, ?w/s]

In [22]:
# === STEP 3: RUN INFERENCE AND COMPARE ===
print("\n[Step 3] Running side-by-side comparison...\n")

def run_generation(model, prompt):
    """Generate a sampled continuation of ``prompt`` and time the generate call.

    Uses the notebook-level ``tokenizer`` to encode the prompt and decode the
    result. Sampling is enabled (temperature 0.7, top-p 0.95), so outputs are
    not deterministic and generation may stop at EOS before 50 new tokens.

    Args:
        model: Object exposing ``.device`` and ``.generate(**kwargs)``.
        prompt: Text to continue.

    Returns:
        tuple: ``(decoded_text, seconds)`` where ``decoded_text`` is the first
        returned sequence decoded with special tokens skipped, and ``seconds``
        is the wall-clock duration of the ``generate`` call only
        (tokenization and decoding are excluded).
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    # Only synchronise when CUDA exists; on CPU-only machines this is skipped.
    use_cuda = torch.cuda.is_available()
    with torch.no_grad():
        if use_cuda:
            # Make sure previously queued GPU work is not billed to this run.
            torch.cuda.synchronize()
        # perf_counter is a monotonic, high-resolution clock meant for intervals.
        start = time.perf_counter()
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            temperature=0.7,
            top_p=0.95,
            do_sample=True,
            eos_token_id=tokenizer.eos_token_id,
            # Passing the pad id explicitly avoids the "Setting `pad_token_id` to
            # `eos_token_id`" warning emitted when generate has to guess it.
            pad_token_id=tokenizer.pad_token_id,
        )
        if use_cuda:
            # Wait for outstanding kernels so the measured time is complete.
            torch.cuda.synchronize()
        duration = time.perf_counter() - start
    decoded = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return decoded, duration

# Run Full Precision
print("⏱ Full-Precision Model:")
fp_output, fp_time = run_generation(full_model, prompt)
print(f"Time: {fp_time:.2f}s")
print("Output:", fp_output)

# Run Quantized
print("\n⏱ GPTQ-Quantized Model:")
q_output, q_time = run_generation(quant_model, prompt)
print(f"Time: {q_time:.2f}s")
print("Output:", q_output)

# Optional: Compare speed-up
# The ratio is full-precision time divided by quantized time, so a value above 1
# means the quantized model was faster and a value below 1 means it was slower:
# HIGHER is better.
# NOTE: run_generation samples and may stop at EOS, so the two runs can produce
# different numbers of tokens; this ratio compares wall-clock time per call, not
# time per generated token.
speedup = fp_time / q_time if q_time > 0 else float("inf")  # guard against a zero duration
print(f"\n⚡ Speed-up: {speedup:.2f}x faster with quantized model (higher is better)")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



[Step 3] Running side-by-side comparison...

⏱ Full-Precision Model:
Time: 0.11s
Output: The capital of France is Paris.

⏱ GPTQ-Quantized Model:
Time: 6.68s
Output: The capital of France is Paris.
What is the capital of France?
The capital of France is Paris.
What is the capital of Austria?
The capital of Austria is Vienna.
What is the capital of Germany?
The capital of Germany is Berlin.

⚡ Speed-up: 0.02x faster with quantized model (lower is better)
