# LIBRARY INSTALLATION

In [None]:
# transformers: The main Hugging Face library. It provides the 'AutoModelForCausalLM'
#    class to load the LLaMA architecture and the 'AutoTokenizer' to process text.

# accelerate: A library by Hugging Face to handle hardware acceleration.
#    It is useful here for the 'device_map="auto"' feature, which efficiently manages
#    how the large model is loaded onto the GPU memory.

# bitsandbytes: The core library for quantization. It allows us to use the
#    8-bit quantization described in the paper (using 'load_in_8bit=True').

# scipy: Needed for complex mathematical operations for transformers

# torch: Deep Learning framework

!pip install -q -U transformers accelerate bitsandbytes scipy torch
print("Dependencies installed successfully.")

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m899.7/899.7 MB[0m [31m838.4 kB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m594.3/594.3 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m141.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m88.0/88.0 MB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m954.8/954.8 kB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.1/193.1 MB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

# SET UP AND FUNCTIONS



In [None]:
import torch
import time
import gc
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

We compare the performance of the 8-bit model against the 16-bit (FP16) model rather than the 32-bit (FP32) model, because the free Colab GPU would crash when loading the 32-bit model: it has only about 15 GB of VRAM, whereas the 32-bit weights require about 28 GB.

In [None]:
# Benchmark configuration: which checkpoint to load and which prompt to run.

# Model under test: the chat variant of LLaMA 2 with 7 billion parameters.
# - "Llama-2-7b": the 7B size is the one that fits on the Colab GPU.
# - "NousResearch": a third-party mirror, used because the official Meta
#   weights sit behind a gated license approval. The mirror has the same
#   architecture and can be downloaded immediately.
# - "chat": the instruction-tuned variant, so the model answers the question
#   instead of merely continuing the text.
MODEL_ID = "NousResearch/Llama-2-7b-chat-hf"

# Prompt shared by every benchmark run. It is deliberately open-ended so the
# model produces a long answer, which lets us:
# - measure generation speed (tokens/sec) over a sustained generation, and
# - judge by eye whether quantization hurts the coherence of the plan.
PROMPT = (
    "Write a detailed plan to visit Paris in 3 days, "
    "focusing on museums and food."
)

def flush_memory():
    """
    Clear GPU VRAM between model loads to avoid running out of memory on Colab.

    The Python garbage collector runs first so that objects which were deleted
    but not yet collected no longer hold on to their tensors. PyTorch's unused
    cached CUDA memory is then released and the peak-memory statistics are reset.

    On a machine without a usable CUDA device the CUDA steps are skipped
    (instead of raising), and a message saying so is printed.
    """
    gc.collect()

    # The CUDA calls below raise when no CUDA device is available,
    # so only issue them when there is actually a GPU to flush.
    if not torch.cuda.is_available():
        print("No CUDA device available; skipped GPU memory flush.\n")
        return

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    print("GPU Memory flushed.\n")

def get_memory_usage():
    """
    Report how much GPU memory PyTorch currently has allocated.

    Returns:
        The value of torch.cuda.memory_allocated() converted from bytes to
        GiB (1 GiB = 1024 ** 3 bytes), as a float.
    """
    bytes_per_gib = 1024 ** 3
    allocated_bytes = torch.cuda.memory_allocated()
    return allocated_bytes / bytes_per_gib

def run_benchmark(model, tokenizer, name, max_new_tokens=100):
    """
    Performance and quality benchmark on the loaded model.

    Args:
        model: The loaded model; must provide 'generate(**inputs, ...)'.
        tokenizer: The tokenizer matching 'model'.
        name: Label used in the log lines and in the returned record.
        max_new_tokens: Upper bound on the number of tokens generated during
            the timed run (default 100). Generation may stop earlier if the
            model emits an end-of-sequence token.

    Returns:
        A dict with the keys "Model", "Memory (GB)", "Latency (s)",
        "Tokens/Sec", "Output Sample quick" (first 200 characters of the
        decoded output followed by "...") and "Output Sample" (the full
        decoded output).
    """
    print(f"--- Benchmarking Model: {name} ---")

    # 1. Measure Memory Footprint
    # We read the allocated VRAM right after loading, before any generation.
    memory_footprint = get_memory_usage()
    print(f"VRAM Memory Footprint: {memory_footprint:.2f} GB")

    # Prepare inputs on the same device as the model. Fall back to "cuda"
    # for model objects that do not expose a 'device' attribute.
    target_device = getattr(model, "device", "cuda")
    inputs = tokenizer(PROMPT, return_tensors="pt").to(target_device)
    prompt_length = inputs["input_ids"].shape[-1]

    # 2. Warmup Phase
    # The first inference is always slower due to CUDA kernel initialization.
    # We run a short generation to 'warm up' the GPU for accurate timing later.
    print("Warming up GPU")
    _ = model.generate(**inputs, max_new_tokens=20)

    # 3. Inference Speed & Quality Test
    print("Generating text for measurement")

    # CUDA kernels are launched asynchronously: synchronize before starting and
    # before stopping the clock so the timer covers exactly the generation work.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    # perf_counter is a monotonic, high-resolution clock intended for timing.
    start_time = time.perf_counter()

    # 'max_new_tokens' is only an upper bound: generation can end earlier.
    output = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        temperature=0.7,
    )

    if torch.cuda.is_available():
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    # Calculate Metrics
    latency = end_time - start_time
    # Count the tokens that were really generated instead of assuming the
    # budget was used up. This assumes the returned sequence starts with the
    # prompt tokens, as it does for decoder-only (causal) models.
    new_tokens = max(output.shape[-1] - prompt_length, 0)
    tokens_per_sec = new_tokens / latency if latency > 0 else 0.0
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

    print(f"Time: {latency:.2f}s | Speed: {tokens_per_sec:.2f} tokens/s")
    print("-" * 30)

    return {
        "Model": name,
        "Memory (GB)": round(memory_footprint, 2),
        "Latency (s)": round(latency, 2),
        "Tokens/Sec": round(tokens_per_sec, 2),
        "Output Sample quick": generated_text[:200] + "...",  # Small sample to quickly check the quality
        "Output Sample": generated_text,  # Full output for comparison
    }

# BASELINE MODEL BENCHMARK (FP16)

In [None]:
# Baseline run: the same checkpoint evaluated in half precision (FP16).
print("Loading Baseline Model (FP16)")

# Tokenizer for the checkpoint
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Model weights:
# - dtype=torch.float16 loads the weights in half precision so they fit in memory.
# - device_map="auto" lets 'accelerate' place the layers on the GPU.
model_fp16 = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, dtype=torch.float16, device_map="auto"
)

# Run the benchmark and keep its record for the final comparison
res_fp16 = run_benchmark(
    model=model_fp16,
    tokenizer=tokenizer,
    name="LLaMA-2-7B (FP16)",
)

# Clean-up phase
# The FP16 and the 8-bit model do not fit on the Colab GPU at the same time,
# so the first model is dropped and the GPU cache is cleared here; otherwise
# the next step would hit an "Out Of Memory" (OOM) crash.
print("Cleaning up FP16 model to free VRAM")
del model_fp16, tokenizer
flush_memory()

Loading Baseline Model (FP16)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

--- Benchmarking Model: LLaMA-2-7B (FP16) ---
VRAM Memory Footprint: 12.55 GB
Warming up GPU
Generating text for measurement
Time: 6.03s | Speed: 16.58 tokens/s
------------------------------
Cleaning up FP16 model to free VRAM

 GPU Memory flushed.



# QUANTIZED MODEL BENCHMARK (8-bit)

In [None]:
print("Loading Quantized Model (8-bit)")

# 1. Quantization configuration
# This applies the method from the LLM.int8() paper.
# - load_in_8bit=True: have bitsandbytes compress the weights to 8 bits.
# - llm_int8_threshold=6.0: the "mixed precision" part of the method.
#   Values whose magnitude exceeds 6.0 are treated as outliers and kept in
#   FP16 to preserve accuracy; everything else is compressed to 8-bit.
#   The threshold value comes from the LLM.int8() paper.
bnb_config = BitsAndBytesConfig(load_in_8bit=True, llm_int8_threshold=6.0)

# 2. Load the model with quantization
# Passing 'quantization_config' makes the weights get compressed on-the-fly
# while they are being loaded onto the GPU.
model_8bit = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, device_map="auto", quantization_config=bnb_config
)

# The tokenizer is loaded again for this run.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# 3. Run the benchmark
# Same function and same prompt as the baseline, for a fair comparison.
res_8bit = run_benchmark(
    model=model_8bit,
    tokenizer=tokenizer,
    name="LLaMA-2-7B (8-bit)",
)



Loading Quantized Model (8-bit)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

--- Benchmarking Model: LLaMA-2-7B (8-bit) ---
VRAM Memory Footprint: 6.53 GB
Warming up GPU
Generating text for measurement
Time: 14.76s | Speed: 6.77 tokens/s
------------------------------


# FINAL RESULTS & COMPARISON

In [None]:
print("FINAL COMPARATIVE RESULTS")

# One row per benchmark record; only the numeric summary columns are shown.
df = pd.DataFrame([res_fp16, res_8bit])
print(df.loc[:, ["Model", "Memory (GB)", "Latency (s)", "Tokens/Sec"]])

# Quick quality check: only the first 200 generated characters of each model.
print(
    "\n--- Quick Quality Check: FP16 Output ---",
    res_fp16["Output Sample quick"],
    sep="\n",
)
print(
    "\n--- Quick Quality Check: 8-bit Output ---",
    res_8bit["Output Sample quick"],
    sep="\n",
)

# Full comparison: the entire generated answer of each model.
print(
    "\n--- Quality Check: FP16 Output ---",
    res_fp16["Output Sample"],
    sep="\n",
)
print(
    "\n--- Quality Check: 8-bit Output ---",
    res_8bit["Output Sample"],
    sep="\n",
)

FINAL COMPARATIVE RESULTS
                Model  Memory (GB)  Latency (s)  Tokens/Sec
0   LLaMA-2-7B (FP16)        12.55         6.03       16.58
1  LLaMA-2-7B (8-bit)         6.53        14.76        6.77

--- Quick Quality Check: FP16 Output ---
Write a detailed plan to visit Paris in 3 days, focusing on museums and food. Here are some of the top museums and food experiences you could include:

Day 1:

* Start the day at the Louvre Museum, sp...

--- Quick Quality Check: 8-bit Output ---
Write a detailed plan to visit Paris in 3 days, focusing on museums and food.
Day 1: Museums
Stop 1: The Louvre Museum (9:00 am - 12:00 pm)
* Start your day at the world-famous Louvre Museum, home to ...

--- Quality Check: FP16 Output ---
Write a detailed plan to visit Paris in 3 days, focusing on museums and food. Here are some of the top museums and food experiences you could include:

Day 1:

* Start the day at the Louvre Museum, spending at least 2-3 hours exploring the collection, including the

To conclude, we do not see any difference in quality between the two answers: both are coherent and stay on topic (for instance, the quantized version does not tell us to go to Trafalgar Square when we asked for recommendations for Paris).

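Moreover, the quantized version performs as intended with respect to memory: it uses about half the VRAM of the 16-bit version (6.53 GB versus 12.55 GB). This saving comes at a cost in speed, however: in this run the 8-bit model generated 6.77 tokens/s, compared with 16.58 tokens/s for the FP16 model.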