<a href="https://colab.research.google.com/github/peremartra/LLMOptCost/blob/main/11/10_Quantization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [17]:
!pip install -q transformers
!pip install -q accelerate #==0.29.3
!pip install -q -U bitsandbytes

In [18]:
!pip install -q huggingface_hub

In [19]:
import gc

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [20]:
# Optimization goal passed to `load_models` and appended to the saved model name:
# "memory" selects the 4-bit configuration, "inference" the 8-bit one.
OPT2 = "memory"

In [21]:
# Checkpoint to quantize, and the name used for the quantized copy
# (local save folder and Hub repository).
base_model_name = "bigscience/bloomz-560m"
quantized_model_name = "bloomz-560m-quantized_" + OPT2
# Alternative checkpoint (disabled); uncomment both lines to switch models.
#base_model_name = "google/gemma-2-2b"
#quantized_model_name = "gemma-2-2b-quantized"

In [22]:
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
    # The empty-string key maps the whole model onto that single device.
    device_map = {"": device}
else:
    device = "cpu"
    device_map = {}

In [23]:
def clean_gpu_memory(model):
  """
  Release this function's reference to `model` and flush PyTorch's CUDA cache.

  Only the local reference is dropped here. The caller must also delete or
  rebind its own variable, otherwise the object stays alive and its memory
  cannot be returned.

  Args:
      model: Object whose local reference should be discarded.

  Returns:
      None.
  """
  del model
  # Collect unreachable objects first so their tensors go back to the allocator
  # before the cache is emptied.
  gc.collect()
  if torch.cuda.is_available():
    torch.cuda.empty_cache()

# Quantizing the model.

In [24]:
def load_models(base_model_name, quantized_model_name, goal="memory"):
    """
    Load the base model and a bitsandbytes-quantized copy of it, then report sizes.

    Args:
        base_model_name: Identifier passed to ``from_pretrained`` for the tokenizer
            and for both model loads.
        quantized_model_name: Not used inside this function; kept so existing
            callers that pass it keep working.
        goal: ``"memory"`` selects 4-bit NF4 quantization with double quantization;
            ``"inference"`` selects 8-bit quantization. Matched case-insensitively.

    Returns:
        A ``(tokenizer, base_model, quantized_model)`` tuple.

    Raises:
        ValueError: If ``goal`` is neither ``"memory"`` nor ``"inference"``.

    Side effects:
        Prints both memory footprints in MB and the percentage of memory saved.
    """
    # Validate before any expensive download/load so a typo fails fast instead of
    # silently falling through to the 8-bit configuration.
    normalized_goal = goal.strip().lower() if isinstance(goal, str) else goal
    if normalized_goal not in ("memory", "inference"):
        raise ValueError(
            f"Unknown goal {goal!r}; expected 'memory' or 'inference'."
        )

    # Half precision on GPU, full precision otherwise.
    runtime_dtype = torch.float16 if torch.cuda.is_available() else torch.float32

    # Load the tokenizer (both models use the same tokenizer)
    tokenizer = AutoTokenizer.from_pretrained(base_model_name)

    # Load the base model (non-quantized)
    base_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map=device_map,
    )

    # Define BitsAndBytes configuration based on the goal
    if normalized_goal == "memory":
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,  # Optimize for memory using 4-bit quantization
            bnb_4bit_use_double_quant=True,  # Save memory but slower on inference
            bnb_4bit_quant_type="nf4",  # Use nf4 quantization for better memory usage
            bnb_4bit_compute_dtype=torch.float16  # Depending on GPU we can change for bfloat16.
        )
    else:  # normalized_goal == "inference"
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,  # Optimize for inference using 8-bit quantization
            # NOTE(review): this is a 4-bit option; it presumably has no effect on
            # the 8-bit path and is kept only to leave the saved config unchanged.
            bnb_4bit_compute_dtype=runtime_dtype
        )

    # Load the quantized model using the defined BitsAndBytes configuration
    quantized_model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        device_map=device_map,  # Same notebook-level placement as the base model
        quantization_config=bnb_config,  # Apply the chosen quantization config
        torch_dtype=runtime_dtype
    )

    # Get the memory footprint for both models
    base_model_size = base_model.get_memory_footprint() / (1024 ** 2)  # Convert to MB
    quantized_model_size = quantized_model.get_memory_footprint() / (1024 ** 2)  # Convert to MB

    # Calculate the percentage of memory saved
    size_saved = ((base_model_size - quantized_model_size) / base_model_size) * 100

    # Print model sizes and percentage saved
    print(f"Base Model Memory Footprint: {base_model_size:.2f} MB")
    print(f"Quantized Model Memory Footprint: {quantized_model_size:.2f} MB")
    print(f"Memory Saved: {size_saved:.2f}%")

    return tokenizer, base_model, quantized_model

In [25]:
# `model` is the full-precision baseline; `quantized_model` is its quantized copy.
tokenizer, model, quantized_model = load_models(
    base_model_name=base_model_name,
    quantized_model_name=quantized_model_name,
    goal=OPT2,
)

Base Model Memory Footprint: 2133.23 MB
Quantized Model Memory Footprint: 634.62 MB
Memory Saved: 70.25%


Results with `goal="inference"` (8-bit quantization) on bigscience/bloomz-560m:

* Base Model Memory Footprint: 2133.23 MB
* Quantized Model Memory Footprint: 778.62 MB
* Memory Saved: 63.50%

Results with `goal="memory"` (4-bit quantization) on bigscience/bloomz-560m:

* Base Model Memory Footprint: 2133.23 MB
* Quantized Model Memory Footprint: 634.62 MB
* Memory Saved: 70.25%

In [26]:
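# Alternative kept for reference (disabled): PyTorch dynamic quantization of the Linear layers to int8.
#quantized_model = torch.quantization.quantize_dynamic(
#   model, {torch.nn.Linear}, dtype=torch.qint8
#)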

# Save the model

In [27]:
# Save the quantized model
quantized_model.save_pretrained(
    quantized_model_name,
)
tokenizer.save_pretrained(quantized_model_name)

('bloomz-560m-quantized_memory/tokenizer_config.json',
 'bloomz-560m-quantized_memory/special_tokens_map.json',
 'bloomz-560m-quantized_memory/tokenizer.json')

## Upload to Hugging Face.

In [28]:
# Both uploads go to the same public repository and reuse the same options.
hub_options = dict(private=False, use_temp_dir=False)
quantized_model.push_to_hub(quantized_model_name, **hub_options)
tokenizer.push_to_hub(quantized_model_name, **hub_options)

model.safetensors:   0%|          | 0.00/670M [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/oopere/bloomz-560m-quantized_memory/commit/62c9d925633b70efc057c9dd516d6d4f42a0b798', commit_message='Upload tokenizer', commit_description='', oid='62c9d925633b70efc057c9dd516d6d4f42a0b798', pr_url=None, pr_revision=None, pr_num=None)

In [29]:
# The helper can only drop its own argument reference. The notebook-level name
# must be released too, otherwise the full-precision weights stay alive and
# no GPU memory is actually returned.
clean_gpu_memory(model)
del model
torch.cuda.empty_cache()

In [30]:
# Reload the quantized model
quantized_model = AutoModelForCausalLM.from_pretrained(quantized_model_name,
                                                       device_map=device_map)
#quantized_model.to(device)

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


In [31]:
# Footprint of the reloaded model in bytes (divide by 1024 ** 2 for MB).
quantized_model.get_memory_footprint()

665444352

In [32]:
# Inference
input_text = "Tell me a joke"
input_ids = tokenizer(input_text, return_tensors="pt").to(device)

output = quantized_model.generate(**input_ids)
print(tokenizer.decode(output[0], skip_special_tokens=True))



Tell me a joke about a man who is a man
