In [None]:
# install quantao
!pip install transformers accelerat optimum-quanto
!pip install torch

In [None]:
import torch
# Run on the first CUDA GPU when one is visible to PyTorch; otherwise fall
# back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, QuantoConfig

# Checkpoint to load; the tokenizer and the model both come from this repo id.
model_id = "meta-llama/Meta-Llama-3-8B"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Weight-only quantization to int4 through the Quanto backend.
# NOTE(review): QuantoConfig also exposes an `activations` argument, but "int4"
# does not appear to be an accepted activation dtype and activation
# quantization may not be supported through from_pretrained at all -- confirm
# against the QuantoConfig documentation before enabling it.
quantization_config = QuantoConfig(weights="int4")

# Place the model on `device` (selected in an earlier cell) explicitly.
# Without a device_map the weights are not moved to the GPU here, and
# generation then fails when the inputs are sent to CUDA.
quantized_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map=device,
    quantization_config=quantization_config,
)

In [None]:
# Report how much memory is currently in use on the selected GPU.
# torch.cuda.mem_get_info is CUDA-only, and `device` falls back to the CPU when
# no GPU is available, so only query it when we are actually on a CUDA device.
if device.type == "cuda":
    # (free, total) are byte counts for the whole device, so the figure below
    # also includes memory held by other processes using the same GPU.
    free, total = torch.cuda.mem_get_info(device)
    mem_used_MB = (total - free) / 1024 ** 2
    print(f"GPU memory utilization : {mem_used_MB/1024} GB")
else:
    print("No CUDA device available; skipping GPU memory report.")

In [None]:
import time
# Time one end-to-end generation: tokenize -> generate -> decode.
# perf_counter is a monotonic, high-resolution clock intended for measuring
# elapsed intervals (time.time can jump if the system clock is adjusted).
t1 = time.perf_counter()

prompt = "What is multi-head attention in context of transformer?"

# Tokenize the prompt and move the tensors to wherever the model actually
# lives, instead of hard-coding "cuda" (which fails on CPU-only machines and
# whenever the model is not on the GPU).
inputs = tokenizer([prompt], return_tensors="pt").to(quantized_model.device)
input_ids = inputs.input_ids

# Generate a continuation. Passing the full encoding forwards the attention
# mask along with the input ids.
# max_length bounds the total sequence length (prompt + generated tokens);
# adjust it as necessary for your use case.
generated_tokens = quantized_model.generate(**inputs, max_length=50)

# Decode the first (only) sequence in the batch back into readable text.
generated_text = tokenizer.decode(generated_tokens[0], skip_special_tokens=True)
t2 = time.perf_counter()
print(generated_text)
print(f"Took around {t2-t1} seconds.")